summary | refs | log | tree | commit | diff | stats
path: root/libmpcodecs/native
diff options
context:
space:
mode:
author arpi <arpi@b3059339-0415-0410-9bf9-f77b7e298cf2> 2002-04-13 18:03:02 +0000
committer arpi <arpi@b3059339-0415-0410-9bf9-f77b7e298cf2> 2002-04-13 18:03:02 +0000
commit 93c499d0432d3b67b50bb8a70a4cb0b76ff3fb33 (patch)
tree d65a40d89e2519ba98a9f8fe7e84a37725525208 /libmpcodecs/native
parent b48576a528cb746bc9d77769e667684edc069899 (diff)
download mpv-93c499d0432d3b67b50bb8a70a4cb0b76ff3fb33.tar.bz2
mpv-93c499d0432d3b67b50bb8a70a4cb0b76ff3fb33.tar.xz
moved to libmpcodecs/native/
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@5603 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'libmpcodecs/native')
-rw-r--r-- libmpcodecs/native/RTjpegN.c 3800
-rw-r--r-- libmpcodecs/native/RTjpegN.h 58
-rw-r--r-- libmpcodecs/native/alaw.h 71
-rw-r--r-- libmpcodecs/native/cinepak.c 886
-rw-r--r-- libmpcodecs/native/cyuv.c 135
-rw-r--r-- libmpcodecs/native/fli.c 379
-rw-r--r-- libmpcodecs/native/lzoconf.h 386
-rw-r--r-- libmpcodecs/native/minilzo.c 2848
-rw-r--r-- libmpcodecs/native/minilzo.h 97
-rw-r--r-- libmpcodecs/native/msvidc.c 393
-rw-r--r-- libmpcodecs/native/nuppelvideo.c 123
-rw-r--r-- libmpcodecs/native/qtrle.c 125
-rw-r--r-- libmpcodecs/native/qtrpza.c 264
-rw-r--r-- libmpcodecs/native/qtsmc.c 513
-rw-r--r-- libmpcodecs/native/roqav.c 670
-rw-r--r-- libmpcodecs/native/roqav.h 12
16 files changed, 10760 insertions, 0 deletions
diff --git a/libmpcodecs/native/RTjpegN.c b/libmpcodecs/native/RTjpegN.c
new file mode 100644
index 0000000000..4b7bb5b454
--- /dev/null
+++ b/libmpcodecs/native/RTjpegN.c
@@ -0,0 +1,3800 @@
+/*
+ RTjpeg (C) Justin Schoeman 1998 (justin@suntiger.ee.up.ac.za)
+
+ With modifications by:
+ (c) 1998, 1999 by Joerg Walter <trouble@moes.pmnet.uni-oldenburg.de>
+ and
+ (c) 1999 by Wim Taymans <wim.taymans@tvd.be>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config.h"
+#ifdef HAVE_MMX
+#define MMX
+#endif
+
+#include "RTjpegN.h"
+
+#ifdef MMX
+#include "mmx.h"
+#endif
+
+/* Debug switch: when defined, RTjpeg_b2s()/RTjpeg_s2b() dump each coefficient
+ * block to stdout. Left disabled. */
+//#define SHOWBLOCK 1
+/* Selects the bit-packed RTjpeg_b2s()/RTjpeg_s2b() pair below; without it the
+ * byte-per-coefficient / zero-run pair in the #else branch is compiled. */
+#define BETTERCOMPRESSION 1
+
+/*
+ * Zig-zag scan order for an 8x8 block: RTjpeg_ZZ[k] is the index (0..63) of the
+ * k-th coefficient visited. Each source row below is one anti-diagonal of the
+ * block; note that the walk starts downward (8 comes before 1).
+ */
+static const unsigned char RTjpeg_ZZ[64]={
+0,
+8, 1,
+2, 9, 16,
+24, 17, 10, 3,
+4, 11, 18, 25, 32,
+40, 33, 26, 19, 12, 5,
+6, 13, 20, 27, 34, 41, 48,
+56, 49, 42, 35, 28, 21, 14, 7,
+15, 22, 29, 36, 43, 50, 57,
+58, 51, 44, 37, 30, 23,
+31, 38, 45, 52, 59,
+60, 53, 46, 39,
+47, 54, 61,
+62, 55,
+63 };
+
+/*
+ * Per-coefficient scale factors in 32.32 fixed point (entry 0 is 2^32, i.e.
+ * 1.0). RTjpeg_dct_init() divides each quantiser, shifted left by 32, by the
+ * matching entry. The table is symmetric (entry [r][c] == entry [c][r]).
+ * NOTE(review): going by the name these are presumably the AAN DCT scale
+ * factors - confirm against the original RTjpeg sources.
+ */
+static const __u64 RTjpeg_aan_tab[64]={
+4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
+5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL,
+5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL,
+5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL,
+4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
+3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, 2651326208ULL, 1826357504ULL, 931136000ULL,
+2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, 1826357504ULL, 1258030336ULL, 641204288ULL,
+1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL, 931136000ULL, 641204288ULL, 326894240ULL,
+};
+
+/* File-scope codec state. */
+
+#ifndef MMX
+/* Intermediate workspace of the C forward DCT (row pass writes it, column
+ * pass reads it). */
+static __s32 RTjpeg_ws[64+31];
+#endif
+/* Raw backing storage: room for one 64-entry 16-bit array, four 64-entry
+ * 32-bit arrays and 32 spare bytes.
+ * NOTE(review): presumably the pointers below are carved out of this buffer
+ * by an init routine outside this chunk - confirm. */
+__u8 RTjpeg_alldata[2*64+4*64+4*64+4*64+4*64+32];
+
+static __s16 *block; // rh
+static __s16 *RTjpeg_block;
+/* Luminance / chrominance quantiser tables; rescaled in place by
+ * RTjpeg_dct_init() and, in the MMX build, repacked by RTjpeg_quant_init(). */
+static __s32 *RTjpeg_lqt;
+static __s32 *RTjpeg_cqt;
+/* NOTE(review): presumably the matching dequantiser tables handed to
+ * RTjpeg_s2b() as qtbl - confirm against the callers. */
+static __u32 *RTjpeg_liqt;
+static __u32 *RTjpeg_ciqt;
+
+static unsigned char RTjpeg_lb8;
+static unsigned char RTjpeg_cb8;
+static int RTjpeg_width, RTjpeg_height;
+static int RTjpeg_Ywidth, RTjpeg_Cwidth;
+static int RTjpeg_Ysize, RTjpeg_Csize;
+
+static __s16 *RTjpeg_old=NULL;
+
+/* The mask type depends on the build: a 64-bit MMX value or a plain 16-bit
+ * integer. */
+#ifdef MMX
+mmx_t RTjpeg_lmask;
+mmx_t RTjpeg_cmask;
+#else
+__u16 RTjpeg_lmask;
+__u16 RTjpeg_cmask;
+#endif
+int RTjpeg_mtest=0;
+
+/*
+ * Base luminance quantisation table, 8x8 in row-major order.
+ * NOTE(review): the values look like the example luminance table from the
+ * JPEG standard (ITU-T T.81, Annex K) - confirm.
+ */
+static const unsigned char RTjpeg_lum_quant_tbl[64] = {
+ 16, 11, 10, 16, 24, 40, 51, 61,
+ 12, 12, 14, 19, 26, 58, 60, 55,
+ 14, 13, 16, 24, 40, 57, 69, 56,
+ 14, 17, 22, 29, 51, 87, 80, 62,
+ 18, 22, 37, 56, 68, 109, 103, 77,
+ 24, 35, 55, 64, 81, 104, 113, 92,
+ 49, 64, 78, 87, 103, 121, 120, 101,
+ 72, 92, 95, 98, 112, 100, 103, 99
+ };
+
+/*
+ * Base chrominance quantisation table, 8x8 in row-major order.
+ * NOTE(review): the values look like the example chrominance table from the
+ * JPEG standard (ITU-T T.81, Annex K) - confirm.
+ */
+static const unsigned char RTjpeg_chrom_quant_tbl[64] = {
+ 17, 18, 24, 47, 99, 99, 99, 99,
+ 18, 21, 26, 66, 99, 99, 99, 99,
+ 24, 26, 56, 99, 99, 99, 99, 99,
+ 47, 66, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99
+ };
+
+#ifdef BETTERCOMPRESSION
+
+/*--------------------------------------------------*/
+/* better encoding, but needs a lot more cpu time */
+/* seems to be more effective than old method +lzo */
+/* with this encoding lzo isn't efficient anymore */
+/* there is still more potential for better */
+/* encoding but that would need even more cputime */
+/* anyway your mileage may vary */
+/* */
+/* written by Martin BIELY and Roman HOCHLEITNER */
+/*--------------------------------------------------*/
+
+/* +++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Block to Stream (encoding) */
+/* */
+
+/*
+ * RTjpeg_b2s - pack one quantised 8x8 coefficient block into a byte stream.
+ *
+ * data : 64 coefficients, addressed through the zig-zag table RTjpeg_ZZ.
+ * strm : output buffer; the caller must provide room for the worst case.
+ * bt8  : unused by this variant (kept for interface compatibility).
+ *
+ * Stream layout:
+ *  byte 0         : coefficient at zig-zag position 0, clamped to 0..254.
+ *  byte 1, b7..b2 : zig-zag index of the last non-zero coefficient (0 when
+ *                   all remaining coefficients are zero; the stream is then
+ *                   exactly 2 bytes long).
+ *  The coefficients from that index down to 1 follow, first as 2-bit codes
+ *  (00 = 0, 01 = +1, 11 = -1, 10 = switch to nibbles), then as signed 4-bit
+ *  nibbles in -7..7 (0x8 = switch to bytes), then as signed bytes clamped to
+ *  -128..127. The coefficient that triggers a switch is emitted again in the
+ *  new mode. Within a byte, fields are filled from the top bits downward,
+ *  except that the very first 2-bit code shares byte 1 (bits 1..0).
+ *
+ * Returns the number of bytes written to strm.
+ */
+int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8)
+{
+ /* Packed bit fields are stored as raw unsigned bytes. Writing through this
+    pointer replaces the former "(__u8)strm[x] = ..." stores, which assigned
+    to a cast expression (not an lvalue in ISO C). */
+ __u8 *out = (__u8 *)strm;
+ register int ci, co=1;
+ register __s16 ZZvalue;
+ register unsigned char bitten;
+ register unsigned char bitoff;
+
+#ifdef SHOWBLOCK
+
+ int ii;
+ for (ii=0; ii < 64; ii++) {
+  fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]);
+ }
+ fprintf(stdout, "\n\n");
+
+#endif
+
+ /* first byte always written: clamp to 0..254 */
+ ZZvalue = data[RTjpeg_ZZ[0]];
+ if (ZZvalue > 254)
+  out[0] = 254;
+ else if (ZZvalue < 0)
+  out[0] = 0;
+ else
+  out[0] = (__u8)ZZvalue;
+
+ /* find the last non-zero coefficient in zig-zag order */
+ ci=63;
+ while (data[RTjpeg_ZZ[ci]]==0 && ci>0) ci--;
+
+ bitten = ((unsigned char)ci) << 2;
+
+ if (ci==0) {
+  out[1]= bitten;
+  co = 2;
+  return (int)co;
+ }
+
+ /* bitoff=0 because the high 6 bits hold the last non-zero position */
+ bitoff = 0;
+ co = 1;
+
+ /* ---- 2-bit phase ---- */
+ for(; ci>0; ci--) {
+
+  ZZvalue = data[RTjpeg_ZZ[ci]];
+
+  switch(ZZvalue) {
+  case 0:
+   break;
+  case 1:
+   bitten |= (0x01<<bitoff);
+   break;
+  case -1:
+   bitten |= (0x03<<bitoff);
+   break;
+  default:
+   /* value does not fit in 2 bits: emit the escape code */
+   bitten |= (0x02<<bitoff);
+   goto pack_nibbles;
+  }
+
+  if( bitoff == 0 ) {
+   /* byte is full: flush it and restart at the top bit pair */
+   out[co]= bitten;
+   bitten = 0;
+   bitoff = 8;
+   co++;
+  }
+  bitoff-=2;
+
+ }
+
+ /* ci is 0 here; flush a partially filled byte */
+ if(bitoff != 6) {
+
+  out[co]= bitten;
+  co++;
+
+ }
+ goto done;
+
+pack_nibbles:
+ /* ci cannot be 0 here */
+ /* align bitoff to a nibble boundary */
+
+ switch(bitoff){
+ case 4:
+ case 6:
+  /* escape sits in the high nibble: the low nibble is still free */
+  bitoff = 0;
+  break;
+ case 2:
+ case 0:
+  /* escape sits in the low nibble: flush and start a new byte */
+  out[co]= bitten;
+  bitoff = 4;
+  co++;
+  bitten = 0; /* clear half nibble values in bitten */
+  break;
+ default:
+  break;
+ }
+
+ /* ---- 4-bit phase ---- */
+ for(; ci>0; ci--) {
+
+  ZZvalue = data[RTjpeg_ZZ[ci]];
+
+  if( (ZZvalue > 7) || (ZZvalue < -7) ) {
+   /* value does not fit in a nibble: emit the escape code */
+   bitten |= (0x08<<bitoff);
+   goto pack_bytes;
+  }
+
+  bitten |= (ZZvalue&0xf)<<bitoff;
+
+  if( bitoff == 0 ) {
+   out[co]= bitten;
+   bitten = 0;
+   bitoff = 8;
+   co++;
+  }
+  bitoff-=4;
+ }
+
+ /* ci is 0 here; flush a pending high nibble */
+ if( bitoff == 0 ) {
+  out[co]= bitten;
+  co++;
+ }
+ goto done;
+
+pack_bytes:
+
+ /* flush the byte that carries the escape nibble */
+ out[co]= bitten;
+ co++;
+
+ /* ---- byte phase: one clamped signed byte per coefficient ---- */
+ for(; ci>0; ci--) {
+
+  ZZvalue = data[RTjpeg_ZZ[ci]];
+
+  if(ZZvalue>0)
+  {
+   strm[co++]=(__s8)((ZZvalue>127)?127:ZZvalue);
+  }
+  else
+  {
+   strm[co++]=(__s8)((ZZvalue<-128)?-128:ZZvalue);
+  }
+
+ }
+
+
+done:
+#ifdef SHOWBLOCK
+{
+int i;
+fprintf(stdout, "\nco = '%d'\n", co);
+ for (i=0; i < co+2; i++) {
+ fprintf(stdout, "%d ", strm[i]);
+ }
+fprintf(stdout, "\n\n");
+}
+#endif
+
+ return (int)co;
+}
+
+/* +++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Stream to Block (decoding) */
+/* */
+
+/*
+ * RTjpeg_s2b - unpack one block from the bit-packed stream written by the
+ * matching RTjpeg_b2s() and dequantise it.
+ *
+ * data : receives 64 coefficients, addressed through RTjpeg_ZZ.
+ * strm : packed input (byte 0 = first coefficient, byte 1 bits 7..2 = index
+ *        of the last non-zero coefficient, then 2-bit codes, 4-bit nibbles
+ *        and finally whole bytes, walking from that index down to 1).
+ * bt8  : unused by this variant (kept for interface compatibility).
+ * qtbl : per-coefficient multipliers, indexed like data.
+ *
+ * Returns the number of stream bytes consumed.
+ */
+int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl)
+{
+ int ci;
+ register int co;
+ register int i;
+ register unsigned char bitten;
+ register unsigned char bitoff;
+
+ /* first byte always read */
+ i=RTjpeg_ZZ[0];
+ data[i]=((__u8)strm[0])*qtbl[i];
+
+ /* we start at the end: everything above the stored index is zero */
+
+ bitten = ((unsigned char)strm[1]) >> 2;
+ co = 63;
+ for(; co > bitten; co--) {
+
+  data[RTjpeg_ZZ[co]] = 0;
+
+ }
+
+ if (co==0) {
+  ci = 2;
+  goto done;
+ }
+
+ /* the first 2-bit code lives in the low 2 bits of the second byte */
+ ci=1;
+ bitoff = 0;
+
+ /* ---- 2-bit phase ---- */
+ for(; co>0; co--) {
+
+  bitten = ((unsigned char)strm[ci]) >> bitoff;
+  bitten &= 0x03;
+
+  i=RTjpeg_ZZ[co];
+
+  switch( bitten ) {
+  case 0x03:
+   data[i]= -qtbl[i];
+   break;
+  case 0x02:
+   /* escape: this and all following coefficients are nibbles */
+   goto unpack_nibbles;
+  case 0x01:
+   data[i]= qtbl[i];
+   break;
+  case 0x00:
+   data[i]= 0;
+   break;
+  default:
+   /* unreachable (bitten is masked to 2 bits); a label must be
+      followed by a statement, hence the explicit break */
+   break;
+  }
+
+  if( bitoff == 0 ) {
+   bitoff = 8;
+   ci++;
+  }
+  bitoff -= 2;
+ }
+ /* co is 0 now */
+ /* data is written properly */
+
+ /* if bitoff!=6 then ci is the index, but should be the byte count, so we increment by 1 */
+ if (bitoff!=6) ci++;
+
+ goto done;
+
+
+unpack_nibbles:
+ /* align bitoff to a nibble boundary */
+ switch(bitoff){
+ case 4:
+ case 6:
+  bitoff = 0;
+  break;
+ case 2:
+ case 0:
+  /* we have to read from the next byte */
+  ci++;
+  bitoff = 4;
+  break;
+ default:
+  break;
+ }
+
+ /* ---- 4-bit phase ---- */
+ for(; co>0; co--) {
+
+  bitten = ((unsigned char)strm[ci]) >> bitoff;
+  bitten &= 0x0f;
+
+  i=RTjpeg_ZZ[co];
+
+  if( bitten == 0x08 ) {
+   /* escape: this and all following coefficients are bytes */
+   goto unpack_bytes;
+  }
+
+  /* the compiler cannot do sign extension for signed nibbles */
+  if( bitten & 0x08 ) {
+   bitten |= 0xf0;
+  }
+  /* the unsigned char bitten now is a valid signed char */
+
+  data[i]=((signed char)bitten)*qtbl[i];
+
+  if( bitoff == 0 ) {
+   bitoff = 8;
+   ci++;
+  }
+  bitoff -= 4;
+ }
+ /* co is 0 */
+
+ /* if bitoff!=4 then ci is the index, but should be the byte count, so we increment by 1 */
+ if (bitoff!=4) ci++;
+
+ goto done;
+
+unpack_bytes:
+ /* skip the byte that carried the escape nibble */
+ ci++;
+
+ /* ---- byte phase ---- */
+ for(; co>0; co--) {
+  i=RTjpeg_ZZ[co];
+  data[i]=strm[ci++]*qtbl[i];
+ }
+
+ /* ci now is the count, because it points to next element => no incrementing */
+
+done:
+
+#ifdef SHOWBLOCK
+fprintf(stdout, "\nci = '%d'\n", ci);
+ for (i=0; i < 64; i++) {
+ fprintf(stdout, "%d ", data[RTjpeg_ZZ[i]]);
+ }
+fprintf(stdout, "\n\n");
+#endif
+
+ return ci;
+}
+
+#else
+
+/*
+ * RTjpeg_b2s - byte-oriented block packer (used when BETTERCOMPRESSION is
+ * not defined).
+ *
+ * data : 64 coefficients, addressed through the zig-zag table RTjpeg_ZZ.
+ * strm : output buffer.
+ * bt8  : zig-zag positions 1..bt8 are stored as full signed bytes.
+ *
+ * Layout: byte 0 is the first coefficient clamped to 0..254; positions
+ * 1..bt8 are clamped to -128..127; the remaining positions are clamped to
+ * -64..63, and a run of r consecutive zeros is stored as the single byte
+ * 63+r.
+ *
+ * Returns the number of bytes written to strm.
+ */
+int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8)
+{
+ register int ci, co=1, tmp;
+ register __s16 ZZvalue;
+
+#ifdef SHOWBLOCK
+
+ int ii;
+ for (ii=0; ii < 64; ii++) {
+  fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]);
+ }
+ fprintf(stdout, "\n\n");
+
+#endif
+
+ /* Store through an unsigned view of the buffer; the former
+    "(__u8)strm[0] = ..." assigned to a cast expression, which is not an
+    lvalue in ISO C. */
+ ZZvalue = data[RTjpeg_ZZ[0]];
+ if (ZZvalue > 254)
+  ((__u8 *)strm)[0] = 254;
+ else if (ZZvalue < 0)
+  ((__u8 *)strm)[0] = 0;
+ else
+  ((__u8 *)strm)[0] = (__u8)ZZvalue;
+
+ /* full-range part */
+ for(ci=1; ci<=bt8; ci++)
+ {
+  ZZvalue = data[RTjpeg_ZZ[ci]];
+
+  if(ZZvalue>0)
+  {
+   strm[co++]=(__s8)((ZZvalue>127)?127:ZZvalue);
+  }
+  else
+  {
+   strm[co++]=(__s8)((ZZvalue<-128)?-128:ZZvalue);
+  }
+ }
+
+ /* reduced-range part with zero-run compression */
+ for(; ci<64; ci++)
+ {
+  ZZvalue = data[RTjpeg_ZZ[ci]];
+
+  if(ZZvalue>0)
+  {
+   strm[co++]=(__s8)((ZZvalue>63)?63:ZZvalue);
+  }
+  else if(ZZvalue<0)
+  {
+   strm[co++]=(__s8)((ZZvalue<-64)?-64:ZZvalue);
+  }
+  else /* compress zeros */
+  {
+   tmp=ci;
+   do
+   {
+    ci++;
+   }
+   while((ci<64)&&(data[RTjpeg_ZZ[ci]]==0));
+
+   /* run length is ci-tmp (at least 1), stored as 64..126 */
+   strm[co++]=(__s8)(63+(ci-tmp));
+   ci--; /* the for loop increments again */
+  }
+ }
+ return (int)co;
+}
+
+/*
+ * RTjpeg_s2b - byte-oriented block unpacker (used when BETTERCOMPRESSION is
+ * not defined); inverse of the matching RTjpeg_b2s().
+ *
+ * data : receives 64 coefficients, addressed through RTjpeg_ZZ.
+ * strm : packed input.
+ * bt8  : number of leading positions (after position 0) stored as plain bytes.
+ * qtbl : per-coefficient multipliers, indexed like data.
+ *
+ * Returns the number of stream bytes consumed.
+ */
+int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl)
+{
+ int ci=1, co=1, tmp;
+ register int i;
+
+ i=RTjpeg_ZZ[0];
+ data[i]=((__u8)strm[0])*qtbl[i];
+
+ for(co=1; co<=bt8; co++)
+ {
+  i=RTjpeg_ZZ[co];
+  data[i]=strm[ci++]*qtbl[i];
+ }
+
+ for(; co<64; co++)
+ {
+  if(strm[ci]>63)
+  {
+   /* a byte above 63 encodes a run of (byte-63) zeros */
+   tmp=co+strm[ci]-63;
+   /* A damaged stream may announce a run that extends past the block;
+      clamp it so RTjpeg_ZZ and data are never indexed beyond 63. */
+   if(tmp>64) tmp=64;
+   for(; co<tmp; co++)data[RTjpeg_ZZ[co]]=0;
+   co--; /* the outer loop increments again */
+  } else
+  {
+   i=RTjpeg_ZZ[co];
+   data[i]=strm[ci]*qtbl[i];
+  }
+  ci++;
+ }
+ return (int)ci;
+}
+#endif
+
+#if defined(MMX)
+/*
+ * RTjpeg_quant_init (MMX build) - repack both quantiser tables in place from
+ * 64 32-bit entries to 64 consecutive 16-bit entries at the start of the same
+ * storage. Entries are converted in ascending order, so each 32-bit source
+ * value is read before the narrower writes reach it.
+ */
+void RTjpeg_quant_init(void)
+{
+ __s16 *packed;
+ int n;
+
+ /* luminance table */
+ packed=(__s16 *)RTjpeg_lqt;
+ n=0;
+ while(n<64)
+ {
+  packed[n]=(__s16)RTjpeg_lqt[n];
+  n++;
+ }
+
+ /* chrominance table */
+ packed=(__s16 *)RTjpeg_cqt;
+ n=0;
+ while(n<64)
+ {
+  packed[n]=(__s16)RTjpeg_cqt[n];
+  n++;
+ }
+}
+
+/* Four 16-bit lanes holding 1 and four lanes holding 32767 (0x7fff); the MMX
+ * RTjpeg_quant() loads them into mm6 and mm7. */
+static mmx_t RTjpeg_ones=(mmx_t)(long long)0x0001000100010001LL;
+static mmx_t RTjpeg_half=(mmx_t)(long long)0x7fff7fff7fff7fffLL;
+
+/*
+ * RTjpeg_quant (MMX build) - quantise a 64-coefficient block in place.
+ *
+ * block : 64 16-bit coefficients, overwritten with the result.
+ * qtbl  : quantiser table; read here as packed 16-bit values, four per
+ *         64-bit load (see RTjpeg_quant_init(), which repacks the tables).
+ *
+ * Each of the 16 iterations handles four coefficients. Per the inline
+ * comments, every lane computes (32767 + b*q) and is then shifted right
+ * arithmetically by 16 before the results are packed back to 16 bits.
+ * NOTE(review): exact instruction semantics come from the mmx.h macros,
+ * which are not visible in this chunk.
+ */
+void RTjpeg_quant(__s16 *block, __s32 *qtbl)
+{
+ int i;
+ mmx_t *bl, *ql;
+
+ ql=(mmx_t *)qtbl;
+ bl=(mmx_t *)block;
+
+ /* constants stay in mm6/mm7 for the whole loop */
+ movq_m2r(RTjpeg_ones, mm6);
+ movq_m2r(RTjpeg_half, mm7);
+
+ for(i=16; i; i--)
+ {
+ movq_m2r(*(ql++), mm0); /* quant vals (4) */
+ movq_m2r(*bl, mm2); /* block vals (4) */
+ movq_r2r(mm0, mm1);
+ movq_r2r(mm2, mm3);
+
+ /* interleave the quantisers with 1 and the coefficients with 32767 */
+ punpcklwd_r2r(mm6, mm0); /* 1 qb 1 qa */
+ punpckhwd_r2r(mm6, mm1); /* 1 qd 1 qc */
+
+ punpcklwd_r2r(mm7, mm2); /* 32767 bb 32767 ba */
+ punpckhwd_r2r(mm7, mm3); /* 32767 bd 32767 bc */
+
+ pmaddwd_r2r(mm2, mm0); /* 32767+bb*qb 32767+ba*qa */
+ pmaddwd_r2r(mm3, mm1); /* 32767+bd*qd 32767+bc*qc */
+
+ psrad_i2r(16, mm0);
+ psrad_i2r(16, mm1);
+
+ /* back to four 16-bit results */
+ packssdw_r2r(mm1, mm0);
+
+ movq_r2m(mm0, *(bl++));
+
+ }
+}
+#else
+/*
+ * RTjpeg_quant_init (C build) - intentionally empty: the C RTjpeg_quant()
+ * reads the 32-bit tables directly, so there is nothing to repack.
+ */
+void RTjpeg_quant_init(void)
+{
+}
+
+/*
+ * RTjpeg_quant (C build) - quantise a 64-coefficient block in place: each
+ * coefficient is multiplied by its table entry, 32767 is added, and the sum
+ * is shifted right by 16.
+ *
+ * block : 64 coefficients, overwritten with the result.
+ * qtbl  : 64 multipliers, one per coefficient.
+ */
+void RTjpeg_quant(__s16 *block, __s32 *qtbl)
+{
+ __s16 *coef=block;
+ __s16 *stop=block+64;
+ __s32 *q=qtbl;
+
+ while(coef!=stop)
+ {
+  __s32 scaled=(*coef)*(*q)+32767;
+
+  *coef=(__s16)(scaled>>16);
+  coef++;
+  q++;
+ }
+}
+#endif
+
+/*
+ * Perform the forward DCT on one block of samples.
+ */
+#ifdef MMX
+/* DCT multipliers replicated in four 16-bit lanes, scaled by 2^14:
+ * 0x2D41 ~ 0.7071, 0x187E ~ 0.3827, 0x22A3 ~ 0.5412, 0x539F ~ 1.3066.
+ * The MMX DCT shifts its operand left by 2 before the high-word multiply,
+ * which together gives a multiplication by the unscaled constant. */
+static mmx_t RTjpeg_C4 =(mmx_t)(long long)0x2D412D412D412D41LL;
+static mmx_t RTjpeg_C6 =(mmx_t)(long long)0x187E187E187E187ELL;
+static mmx_t RTjpeg_C2mC6=(mmx_t)(long long)0x22A322A322A322A3LL;
+static mmx_t RTjpeg_C2pC6=(mmx_t)(long long)0x539F539F539F539FLL;
+static mmx_t RTjpeg_zero =(mmx_t)(long long)0x0000000000000000LL;
+
+#else
+
+/* The same multipliers for the C DCT, scaled by 2^8 (e.g. 181/256 ~ 0.7071). */
+#define FIX_0_382683433 ((__s32) 98) /* FIX(0.382683433) */
+#define FIX_0_541196100 ((__s32) 139) /* FIX(0.541196100) */
+#define FIX_0_707106781 ((__s32) 181) /* FIX(0.707106781) */
+#define FIX_1_306562965 ((__s32) 334) /* FIX(1.306562965) */
+
+/* Rounding right shifts: despite the names, DESCALE10 drops 8 fraction bits
+ * and DESCALE20 drops 16. */
+#define DESCALE10(x) (__s16)( ((x)+128) >> 8)
+#define DESCALE20(x) (__s16)(((x)+32768) >> 16)
+/* Plain product of a variable and a fixed-point constant, as __s32. */
+#define D_MULTIPLY(var,const) ((__s32) ((var) * (const)))
+#endif
+
+/*
+ * RTjpeg_dct_init - rescale both quantiser tables in place: every entry is
+ * shifted left by 32 bits and divided by the matching RTjpeg_aan_tab entry.
+ */
+void RTjpeg_dct_init(void)
+{
+ int idx=0;
+
+ while(idx<64)
+ {
+  const __u64 scale=RTjpeg_aan_tab[idx];
+
+  RTjpeg_lqt[idx]=(((__u64)RTjpeg_lqt[idx]<<32)/scale);
+  RTjpeg_cqt[idx]=(((__u64)RTjpeg_cqt[idx]<<32)/scale);
+  idx++;
+ }
+}
+
+void RTjpeg_dctY(__u8 *idata, __s16 *odata, int rskip)
+{
+#ifndef MMX
+ __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __s32 tmp10, tmp11, tmp12, tmp13;
+ __s32 z1, z2, z3, z4, z5, z11, z13;
+ __u8 *idataptr;
+ __s16 *odataptr;
+ __s32 *wsptr;
+ int ctr;
+
+ idataptr = idata;
+ wsptr = RTjpeg_ws;
+ for (ctr = 7; ctr >= 0; ctr--) {
+ tmp0 = idataptr[0] + idataptr[7];
+ tmp7 = idataptr[0] - idataptr[7];
+ tmp1 = idataptr[1] + idataptr[6];
+ tmp6 = idataptr[1] - idataptr[6];
+ tmp2 = idataptr[2] + idataptr[5];
+ tmp5 = idataptr[2] - idataptr[5];
+ tmp3 = idataptr[3] + idataptr[4];
+ tmp4 = idataptr[3] - idataptr[4];
+
+ tmp10 = (tmp0 + tmp3); /* phase 2 */
+ tmp13 = tmp0 - tmp3;
+ tmp11 = (tmp1 + tmp2);
+ tmp12 = tmp1 - tmp2;
+
+ wsptr[0] = (tmp10 + tmp11)<<8; /* phase 3 */
+ wsptr[4] = (tmp10 - tmp11)<<8;
+
+ z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
+ wsptr[2] = (tmp13<<8) + z1; /* phase 5 */
+ wsptr[6] = (tmp13<<8) - z1;
+
+ tmp10 = tmp4 + tmp5; /* phase 2 */
+ tmp11 = tmp5 + tmp6;
+ tmp12 = tmp6 + tmp7;
+
+ z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
+ z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
+ z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
+ z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
+
+ z11 = (tmp7<<8) + z3; /* phase 5 */
+ z13 = (tmp7<<8) - z3;
+
+ wsptr[5] = z13 + z2; /* phase 6 */
+ wsptr[3] = z13 - z2;
+ wsptr[1] = z11 + z4;
+ wsptr[7] = z11 - z4;
+
+ idataptr += rskip<<3; /* advance pointer to next row */
+ wsptr += 8;
+ }
+
+ wsptr = RTjpeg_ws;
+ odataptr=odata;
+ for (ctr = 7; ctr >= 0; ctr--) {
+ tmp0 = wsptr[0] + wsptr[56];
+ tmp7 = wsptr[0] - wsptr[56];
+ tmp1 = wsptr[8] + wsptr[48];
+ tmp6 = wsptr[8] - wsptr[48];
+ tmp2 = wsptr[16] + wsptr[40];
+ tmp5 = wsptr[16] - wsptr[40];
+ tmp3 = wsptr[24] + wsptr[32];
+ tmp4 = wsptr[24] - wsptr[32];
+
+ tmp10 = tmp0 + tmp3; /* phase 2 */
+ tmp13 = tmp0 - tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp12 = tmp1 - tmp2;
+
+ odataptr[0] = DESCALE10(tmp10 + tmp11); /* phase 3 */
+ odataptr[32] = DESCALE10(tmp10 - tmp11);
+
+ z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
+ odataptr[16] = DESCALE20((tmp13<<8) + z1); /* phase 5 */
+ odataptr[48] = DESCALE20((tmp13<<8) - z1);
+
+ tmp10 = tmp4 + tmp5; /* phase 2 */
+ tmp11 = tmp5 + tmp6;
+ tmp12 = tmp6 + tmp7;
+
+ z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
+ z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
+ z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
+ z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
+
+ z11 = (tmp7<<8) + z3; /* phase 5 */
+ z13 = (tmp7<<8) - z3;
+
+ odataptr[40] = DESCALE20(z13 + z2); /* phase 6 */
+ odataptr[24] = DESCALE20(z13 - z2);
+ odataptr[8] = DESCALE20(z11 + z4);
+ odataptr[56] = DESCALE20(z11 - z4);
+
+ odataptr++; /* advance pointer to next column */
+ wsptr++;
+ }
+#else
+ volatile mmx_t tmp6, tmp7;
+ register mmx_t *dataptr = (mmx_t *)odata;
+ mmx_t *idata2 = (mmx_t *)idata;
+
+ // first copy the input 8 bit to the destination 16 bits
+
+ movq_m2r(RTjpeg_zero, mm2);
+
+
+ movq_m2r(*idata2, mm0);
+ movq_r2r(mm0, mm1);
+
+ punpcklbw_r2r(mm2, mm0);
+ movq_r2m(mm0, *(dataptr));
+
+ punpckhbw_r2r(mm2, mm1);
+ movq_r2m(mm1, *(dataptr+1));
+
+ idata2 += rskip;
+
+ movq_m2r(*idata2, mm0);
+ movq_r2r(mm0, mm1);
+
+ punpcklbw_r2r(mm2, mm0);
+ movq_r2m(mm0, *(dataptr+2));
+
+ punpckhbw_r2r(mm2, mm1);
+ movq_r2m(mm1, *(dataptr+3));
+
+ idata2 += rskip;
+
+ movq_m2r(*idata2, mm0);
+ movq_r2r(mm0, mm1);
+
+ punpcklbw_r2r(mm2, mm0);
+ movq_r2m(mm0, *(dataptr+4));
+
+ punpckhbw_r2r(mm2, mm1);
+ movq_r2m(mm1, *(dataptr+5));
+
+ idata2 += rskip;
+
+ movq_m2r(*idata2, mm0);
+ movq_r2r(mm0, mm1);
+
+ punpcklbw_r2r(mm2, mm0);
+ movq_r2m(mm0, *(dataptr+6));
+
+ punpckhbw_r2r(mm2, mm1);
+ movq_r2m(mm1, *(dataptr+7));
+
+ idata2 += rskip;
+
+ movq_m2r(*idata2, mm0);
+ movq_r2r(mm0, mm1);
+
+ punpcklbw_r2r(mm2, mm0);
+ movq_r2m(mm0, *(dataptr+8));
+
+ punpckhbw_r2r(mm2, mm1);
+ movq_r2m(mm1, *(dataptr+9));
+
+ idata2 += rskip;
+
+ movq_m2r(*idata2, mm0);
+ movq_r2r(mm0, mm1);
+
+ punpcklbw_r2r(mm2, mm0);
+ movq_r2m(mm0, *(dataptr+10));
+
+ punpckhbw_r2r(mm2, mm1);
+ movq_r2m(mm1, *(dataptr+11));
+
+ idata2 += rskip;
+
+ movq_m2r(*idata2, mm0);
+ movq_r2r(mm0, mm1);
+
+ punpcklbw_r2r(mm2, mm0);
+ movq_r2m(mm0, *(dataptr+12));
+
+ punpckhbw_r2r(mm2, mm1);
+ movq_r2m(mm1, *(dataptr+13));
+
+ idata2 += rskip;
+
+ movq_m2r(*idata2, mm0);
+ movq_r2r(mm0, mm1);
+
+ punpcklbw_r2r(mm2, mm0);
+ movq_r2m(mm0, *(dataptr+14));
+
+ punpckhbw_r2r(mm2, mm1);
+ movq_r2m(mm1, *(dataptr+15));
+
+/* Start Transpose to do calculations on rows */
+
+ movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into m5
+
+ movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2
+ movq_r2r(mm7, mm5);
+
+ punpcklwd_m2r(*(dataptr+11), mm7); // m11:m01|m10:m00 - interleave first and second lines
+ movq_r2r(mm6, mm2);
+
+ punpcklwd_m2r(*(dataptr+15), mm6); // m31:m21|m30:m20 - interleave third and fourth lines
+ movq_r2r(mm7, mm1);
+
+ movq_m2r(*(dataptr+11), mm3); // m13:m13|m11:m10 - second line
+ punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1
+
+ movq_m2r(*(dataptr+15), mm0); // m13:m13|m11:m10 - fourth line
+ punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2
+
+ movq_r2m(mm7,*(dataptr+9)); // write result 1
+ punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines
+
+ movq_r2m(mm1,*(dataptr+11)); // write result 2
+ punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines
+
+ movq_r2r(mm5, mm1);
+ punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3
+
+ movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4
+ punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4
+
+ movq_r2m(mm5,*(dataptr+13)); // write result 3
+
+ // last 4x4 done
+
+ movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4
+
+ movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line
+ movq_r2r(mm0, mm6);
+
+ punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines
+ movq_r2r(mm2, mm7);
+
+ punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines
+ movq_r2r(mm0, mm4);
+
+ //
+ movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line
+ punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result
+
+ movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line
+ punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result
+
+ punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines
+ movq_r2r(mm1, mm2); // copy first line
+
+ punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines
+ movq_r2r(mm6, mm5); // copy first intermediate result
+
+ movq_r2m(mm0, *(dataptr+8)); // write result 1
+ punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result
+
+ punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines
+ movq_r2r(mm3, mm0); // copy third line
+
+ punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines
+
+ movq_r2m(mm4, *(dataptr+10)); // write result 2 out
+ punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result
+
+ punpcklwd_m2r(*(dataptr+14), mm3); // n31:n21|n30:n20 - interleave third and fourth lines
+ movq_r2r(mm1, mm4);
+
+ movq_r2m(mm6, *(dataptr+12)); // write result 3 out
+ punpckldq_r2r(mm3, mm1); // n30:n20|n10:n00 - produce first result
+
+ punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines
+ movq_r2r(mm2, mm6);
+
+ movq_r2m(mm5, *(dataptr+14)); // write result 4 out
+ punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result
+
+ movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block)
+ punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result
+
+ movq_r2m(mm4, *(dataptr+3)); // write result 6 out
+ punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result
+
+ movq_r2m(mm2, *(dataptr+5)); // write result 7 out
+
+ movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4
+
+ movq_r2m(mm6, *(dataptr+7)); // write result 8 out
+
+
+// Do first 4x4 quadrant, which is used in the beginning of the DCT:
+
+ movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line
+ movq_r2r(mm0, mm2);
+
+ punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines
+ movq_r2r(mm7, mm4);
+
+ punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines
+ movq_r2r(mm0, mm1);
+
+ movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line
+ punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1
+
+ movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line
+ punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2
+
+ movq_r2r(mm0, mm7); // write result 1
+ punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines
+
+ psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */
+ movq_r2r(mm1, mm6); // write result 2
+
+ paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */
+ punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines
+
+ paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */
+ movq_r2r(mm2, mm3); // copy first intermediate result
+
+ psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */
+ punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3
+
+ movq_r2m(mm7, tmp7);
+ movq_r2r(mm2, mm5); // write result 3
+
+ movq_r2m(mm6, tmp6);
+ punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4
+
+ paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+5 /* Stage 1 */
+ movq_r2r(mm3, mm4); // write result 4
+
+/************************************************************************************************
+ End of Transpose
+************************************************************************************************/
+
+
+ paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/
+ movq_r2r(mm0, mm7);
+
+ psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/
+ movq_r2r(mm1, mm6);
+
+ paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */
+ psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */
+
+ psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */
+ paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */
+
+ psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/
+ paddw_r2r(mm7, mm6); // tmp12 + tmp13
+
+ /* stage 3 */
+
+ movq_m2r(tmp6, mm2);
+ movq_r2r(mm0, mm3);
+
+ psllw_i2r(2, mm6); // m8 * 2^2
+ paddw_r2r(mm1, mm0);
+
+ pmulhw_m2r(RTjpeg_C4, mm6); // z1
+ psubw_r2r(mm1, mm3);
+
+ movq_r2m(mm0, *dataptr);
+ movq_r2r(mm7, mm0);
+
+ /* Odd part */
+ movq_r2m(mm3, *(dataptr+8));
+ paddw_r2r(mm5, mm4); // tmp10
+
+ movq_m2r(tmp7, mm3);
+ paddw_r2r(mm6, mm0); // tmp32
+
+ paddw_r2r(mm2, mm5); // tmp11
+ psubw_r2r(mm6, mm7); // tmp33
+
+ movq_r2m(mm0, *(dataptr+4));
+ paddw_r2r(mm3, mm2); // tmp12
+
+ /* stage 4 */
+
+ movq_r2m(mm7, *(dataptr+12));
+ movq_r2r(mm4, mm1); // copy of tmp10
+
+ psubw_r2r(mm2, mm1); // tmp10 - tmp12
+ psllw_i2r(2, mm4); // m8 * 2^2
+
+ movq_m2r(RTjpeg_C2mC6, mm0);
+ psllw_i2r(2, mm1);
+
+ pmulhw_m2r(RTjpeg_C6, mm1); // z5
+ psllw_i2r(2, mm2);
+
+ pmulhw_r2r(mm0, mm4); // z5
+
+ /* stage 5 */
+
+ pmulhw_m2r(RTjpeg_C2pC6, mm2);
+ psllw_i2r(2, mm5);
+
+ pmulhw_m2r(RTjpeg_C4, mm5); // z3
+ movq_r2r(mm3, mm0); // copy tmp7
+
+ movq_m2r(*(dataptr+1), mm7);
+ paddw_r2r(mm1, mm4); // z2
+
+ paddw_r2r(mm1, mm2); // z4
+
+ paddw_r2r(mm5, mm0); // z11
+ psubw_r2r(mm5, mm3); // z13
+
+ /* stage 6 */
+
+ movq_r2r(mm3, mm5); // copy z13
+ psubw_r2r(mm4, mm3); // y3=z13 - z2
+
+ paddw_r2r(mm4, mm5); // y5=z13 + z2
+ movq_r2r(mm0, mm6); // copy z11