diff options
author | michael <michael@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2001-11-24 22:16:29 +0000 |
---|---|---|
committer | michael <michael@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2001-11-24 22:16:29 +0000 |
commit | 09c4c292778d7d1fe8604058b2d8009a9fcbb296 (patch) | |
tree | ef1755f400dcff57a7ce70121216655beffd6f11 /postproc | |
parent | 44711c5f07776de55fedfbf679f56a035a157b58 (diff) | |
download | mpv-09c4c292778d7d1fe8604058b2d8009a9fcbb296.tar.bz2 mpv-09c4c292778d7d1fe8604058b2d8009a9fcbb296.tar.xz |
runtime cpu detection
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@3100 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'postproc')
-rw-r--r-- | postproc/postprocess.c | 3211 | ||||
-rw-r--r-- | postproc/postprocess_template.c | 828 |
2 files changed, 174 insertions, 3865 deletions
diff --git a/postproc/postprocess.c b/postproc/postprocess.c index a2e9174b70..e54bcf05e7 100644 --- a/postproc/postprocess.c +++ b/postproc/postprocess.c @@ -62,6 +62,8 @@ optimize c versions try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks smart blur commandline option for the deblock / dering thresholds +put fastmemcpy back +dont use #ifdef ARCH_X86 for the asm stuff ... cross compilers? (note cpudetect uses ARCH_X86) ... */ @@ -78,43 +80,25 @@ commandline option for the deblock / dering thresholds //#undef HAVE_MMX2 //#define HAVE_3DNOW //#undef HAVE_MMX +//#undef ARCH_X86 //#define DEBUG_BRIGHTNESS -#include "../libvo/fastmemcpy.h" +//#include "../libvo/fastmemcpy.h" #include "postprocess.h" +#include "../cpudetect.h" #define MIN(a,b) ((a) > (b) ? (b) : (a)) #define MAX(a,b) ((a) < (b) ? (b) : (a)) #define ABS(a) ((a) > 0 ? (a) : (-(a))) #define SIGN(a) ((a) > 0 ? 1 : -1) -#ifdef HAVE_MMX2 -#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" -#elif defined (HAVE_3DNOW) -#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" -#endif - -#ifdef HAVE_MMX2 -#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t" -#elif defined (HAVE_MMX) -#define PMINUB(b,a,t) \ - "movq " #a ", " #t " \n\t"\ - "psubusb " #b ", " #t " \n\t"\ - "psubb " #t ", " #a " \n\t" -#endif - -#ifdef HAVE_MMX2 -#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t" -#elif defined (HAVE_MMX) -#define PMAXUB(a,b) \ - "psubusb " #a ", " #b " \n\t"\ - "paddb " #a ", " #b " \n\t" -#endif - - #define GET_MODE_BUFFER_SIZE 500 #define OPTIONS_ARRAY_SIZE 10 -#ifdef HAVE_MMX +#ifdef ARCH_X86 +#define CAN_COMPILE_X86_ASM +#endif + +#ifdef CAN_COMPILE_X86_ASM static volatile uint64_t __attribute__((aligned(8))) packedYOffset= 0x0000000000000000LL; static volatile uint64_t __attribute__((aligned(8))) packedYScale= 0x0100010001000100LL; static uint64_t __attribute__((aligned(8))) w05= 0x0005000500050005LL; @@ -157,7 +141,6 @@ static uint32_t __attribute__((aligned(4))) maxTmpNoise[4]; #else static uint64_t packedYOffset= 0x0000000000000000LL; static uint64_t packedYScale= 0x0100010001000100LL; -static uint8_t tempBlocks[8*16*2]; //used for the horizontal code #endif int hFlatnessThreshold= 56 - 16; @@ -196,7 +179,7 @@ static char *replaceTable[]= NULL //End Marker }; -#ifdef HAVE_MMX +#ifdef CAN_COMPILE_X86_ASM static inline void unusedVariableWarningFixer() { if( @@ -220,7 +203,7 @@ static inline long long rdtsc() } #endif -#ifdef HAVE_MMX2 +#ifdef CAN_COMPILE_X86_ASM static inline void prefetchnta(void *p) { asm volatile( "prefetchnta (%0)\n\t" @@ -250,1229 +233,7 @@ static inline void prefetcht2(void *p) } #endif -//FIXME? |255-0| = 1 (shouldnt be a problem ...) -/** - * Check if the middle 8x8 Block in the given 8x16 block is flat - */ -static inline int isVertDC(uint8_t src[], int stride){ - int numEq= 0; -#ifndef HAVE_MMX - int y; -#endif - src+= stride*4; // src points to begin of the 8x8 Block -#ifdef HAVE_MMX -asm volatile( - "leal (%1, %2), %%eax \n\t" - "leal (%%eax, %2, 4), %%ebx \n\t" -// 0 1 2 3 4 5 6 7 8 9 -// %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2 - "movq b7E, %%mm7 \n\t" // mm7 = 0x7F - "movq b7C, %%mm6 \n\t" // mm6 = 0x7D - "movq (%1), %%mm0 \n\t" - "movq (%%eax), %%mm1 \n\t" - "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece - "paddb %%mm7, %%mm0 \n\t" - "pcmpgtb %%mm6, %%mm0 \n\t" - - "movq (%%eax,%2), %%mm2 \n\t" - "psubb %%mm2, %%mm1 \n\t" - "paddb %%mm7, %%mm1 \n\t" - "pcmpgtb %%mm6, %%mm1 \n\t" - "paddb %%mm1, %%mm0 \n\t" - - "movq (%%eax, %2, 2), %%mm1 \n\t" - "psubb %%mm1, %%mm2 \n\t" - "paddb %%mm7, %%mm2 \n\t" - "pcmpgtb %%mm6, %%mm2 \n\t" - "paddb %%mm2, %%mm0 \n\t" - - "movq (%1, %2, 4), %%mm2 \n\t" - "psubb %%mm2, %%mm1 \n\t" - "paddb %%mm7, %%mm1 \n\t" - "pcmpgtb %%mm6, %%mm1 \n\t" - "paddb %%mm1, %%mm0 \n\t" - - "movq (%%ebx), %%mm1 \n\t" - "psubb %%mm1, %%mm2 \n\t" - "paddb %%mm7, %%mm2 \n\t" - "pcmpgtb %%mm6, %%mm2 \n\t" - "paddb %%mm2, %%mm0 \n\t" - - "movq (%%ebx, %2), %%mm2 \n\t" - "psubb %%mm2, %%mm1 \n\t" - "paddb %%mm7, %%mm1 \n\t" - "pcmpgtb %%mm6, %%mm1 \n\t" - "paddb %%mm1, %%mm0 \n\t" - - "movq (%%ebx, %2, 2), %%mm1 \n\t" - "psubb %%mm1, %%mm2 \n\t" - "paddb %%mm7, %%mm2 \n\t" - "pcmpgtb %%mm6, %%mm2 \n\t" - "paddb %%mm2, %%mm0 \n\t" - - " \n\t" -#ifdef HAVE_MMX2 - "pxor %%mm7, %%mm7 \n\t" - "psadbw %%mm7, %%mm0 \n\t" -#else - "movq %%mm0, %%mm1 \n\t" - "psrlw $8, %%mm0 \n\t" - "paddb %%mm1, %%mm0 \n\t" - "movq %%mm0, %%mm1 \n\t" - "psrlq $16, %%mm0 \n\t" - "paddb %%mm1, %%mm0 \n\t" - "movq %%mm0, %%mm1 \n\t" - "psrlq $32, %%mm0 \n\t" - "paddb %%mm1, %%mm0 \n\t" -#endif - "movd %%mm0, %0 \n\t" - : "=r" (numEq) - : "r" (src), "r" (stride) - : "%ebx" - ); - numEq= (-numEq) &0xFF; - -#else - for(y=0; y<BLOCK_SIZE-1; y++) - { - if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++; - if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++; - if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++; - if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++; - if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++; - if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++; - if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++; - if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++; - src+= stride; - } -#endif -/* if(abs(numEq - asmEq) > 0) - { - printf("\nasm:%d c:%d\n", asmEq, numEq); - for(int y=0; y<8; y++) - { - for(int x=0; x<8; x++) - { - printf("%d ", temp[x + y*stride]); - } - printf("\n"); - } - } -*/ -// for(int i=0; i<numEq/8; i++) src[i]=255; - return (numEq > vFlatnessThreshold) ? 1 : 0; -} - -static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP) -{ -#ifdef HAVE_MMX - int isOk; - src+= stride*3; - asm volatile( -// "int $3 \n\t" - "movq (%1, %2), %%mm0 \n\t" - "movq (%1, %2, 8), %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "psubusb %%mm1, %%mm0 \n\t" - "psubusb %%mm2, %%mm1 \n\t" - "por %%mm1, %%mm0 \n\t" // ABS Diff - - "movq pQPb, %%mm7 \n\t" // QP,..., QP - "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP - "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0 - "pcmpeqd b00, %%mm0 \n\t" - "psrlq $16, %%mm0 \n\t" - "pcmpeqd bFF, %%mm0 \n\t" -// "movd %%mm0, (%1, %2, 4)\n\t" - "movd %%mm0, %0 \n\t" - : "=r" (isOk) - : "r" (src), "r" (stride) - ); - return isOk; -#else - - int isOk2= 1; - int x; - src+= stride*3; - for(x=0; x<BLOCK_SIZE; x++) - { - if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0; - } -/* if(isOk && !isOk2 || !isOk && isOk2) - { - printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP); - for(int y=0; y<9; y++) - { - for(int x=0; x<8; x++) - { - printf("%d ", src[x + y*stride]); - } - printf("\n"); - } - } */ - - return isOk2; -#endif - -} - -/** - * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) - * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 - */ -static inline void doVertLowPass(uint8_t *src, int stride, int QP) -{ -#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) - src+= stride*3; - asm volatile( //"movv %0 %1 %2\n\t" - "movq pQPb, %%mm0 \n\t" // QP,..., QP - - "movq (%0), %%mm6 \n\t" - "movq (%0, %1), %%mm5 \n\t" - "movq %%mm5, %%mm1 \n\t" - "movq %%mm6, %%mm2 \n\t" - "psubusb %%mm6, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "por %%mm5, %%mm2 \n\t" // ABS Diff of lines - "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 - "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF - - "pand %%mm2, %%mm6 \n\t" - "pandn %%mm1, %%mm2 \n\t" - "por %%mm2, %%mm6 \n\t"// First Line to Filter - - "movq (%0, %1, 8), %%mm5 \n\t" - "leal (%0, %1, 4), %%eax \n\t" - "leal (%0, %1, 8), %%ebx \n\t" - "subl %1, %%ebx \n\t" - "addl %1, %0 \n\t" // %0 points to line 1 not 0 - "movq (%0, %1, 8), %%mm7 \n\t" - "movq %%mm5, %%mm1 \n\t" - "movq %%mm7, %%mm2 \n\t" - "psubusb %%mm7, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "por %%mm5, %%mm2 \n\t" // ABS Diff of lines - "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 - "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF - - "pand %%mm2, %%mm7 \n\t" - "pandn %%mm1, %%mm2 \n\t" - "por %%mm2, %%mm7 \n\t" // First Line to Filter - - - // 1 2 3 4 5 6 7 8 - // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1 - // 6 4 2 2 1 1 - // 6 4 4 2 - // 6 8 2 - - "movq (%0, %1), %%mm0 \n\t" // 1 - "movq %%mm0, %%mm1 \n\t" // 1 - PAVGB(%%mm6, %%mm0) //1 1 /2 - PAVGB(%%mm6, %%mm0) //3 1 /4 - - "movq (%0, %1, 4), %%mm2 \n\t" // 1 - "movq %%mm2, %%mm5 \n\t" // 1 - PAVGB((%%eax), %%mm2) // 11 /2 - PAVGB((%0, %1, 2), %%mm2) // 211 /4 - "movq %%mm2, %%mm3 \n\t" // 211 /4 - "movq (%0), %%mm4 \n\t" // 1 - PAVGB(%%mm4, %%mm3) // 4 211 /8 - PAVGB(%%mm0, %%mm3) //642211 /16 - "movq %%mm3, (%0) \n\t" // X - // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 - "movq %%mm1, %%mm0 \n\t" // 1 - PAVGB(%%mm6, %%mm0) //1 1 /2 - "movq %%mm4, %%mm3 \n\t" // 1 - PAVGB((%0,%1,2), %%mm3) // 1 1 /2 - PAVGB((%%eax,%1,2), %%mm5) // 11 /2 - PAVGB((%%eax), %%mm5) // 211 /4 - PAVGB(%%mm5, %%mm3) // 2 2211 /8 - PAVGB(%%mm0, %%mm3) //4242211 /16 - "movq %%mm3, (%0,%1) \n\t" // X - // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 - PAVGB(%%mm4, %%mm6) //11 /2 - "movq (%%ebx), %%mm0 \n\t" // 1 - PAVGB((%%eax, %1, 2), %%mm0) // 11/2 - "movq %%mm0, %%mm3 \n\t" // 11/2 - PAVGB(%%mm1, %%mm0) // 2 11/4 - PAVGB(%%mm6, %%mm0) //222 11/8 - PAVGB(%%mm2, %%mm0) //22242211/16 - "movq (%0, %1, 2), %%mm2 \n\t" // 1 - "movq %%mm0, (%0, %1, 2) \n\t" // X - // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 - "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 - PAVGB((%%ebx), %%mm0) // 11 /2 - PAVGB(%%mm0, %%mm6) //11 11 /4 - PAVGB(%%mm1, %%mm4) // 11 /2 - PAVGB(%%mm2, %%mm1) // 11 /2 - PAVGB(%%mm1, %%mm6) //1122 11 /8 - PAVGB(%%mm5, %%mm6) //112242211 /16 - "movq (%%eax), %%mm5 \n\t" // 1 - "movq %%mm6, (%%eax) \n\t" // X - // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 - "movq (%%eax, %1, 4), %%mm6 \n\t" // 1 - PAVGB(%%mm7, %%mm6) // 11 /2 - PAVGB(%%mm4, %%mm6) // 11 11 /4 - PAVGB(%%mm3, %%mm6) // 11 2211 /8 - PAVGB(%%mm5, %%mm2) // 11 /2 - "movq (%0, %1, 4), %%mm4 \n\t" // 1 - PAVGB(%%mm4, %%mm2) // 112 /4 - PAVGB(%%mm2, %%mm6) // 112242211 /16 - "movq %%mm6, (%0, %1, 4) \n\t" // X - // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 - PAVGB(%%mm7, %%mm1) // 11 2 /4 - PAVGB(%%mm4, %%mm5) // 11 /2 - PAVGB(%%mm5, %%mm0) // 11 11 /4 - "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 - PAVGB(%%mm6, %%mm1) // 11 4 2 /8 - PAVGB(%%mm0, %%mm1) // 11224222 /16 - "movq %%mm1, (%%eax, %1, 2) \n\t" // X - // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 - PAVGB((%%ebx), %%mm2) // 112 4 /8 - "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 - PAVGB(%%mm0, %%mm6) // 1 1 /2 - PAVGB(%%mm7, %%mm6) // 1 12 /4 - PAVGB(%%mm2, %%mm6) // 1122424 /4 - "movq %%mm6, (%%ebx) \n\t" // X - // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 - PAVGB(%%mm7, %%mm5) // 11 2 /4 - PAVGB(%%mm7, %%mm5) // 11 6 /8 - - PAVGB(%%mm3, %%mm0) // 112 /4 - PAVGB(%%mm0, %%mm5) // 112246 /16 - "movq %%mm5, (%%eax, %1, 4) \n\t" // X - "subl %1, %0 \n\t" - - : - : "r" (src), "r" (stride) - : "%eax", "%ebx" - ); -#else - const int l1= stride; - const int l2= stride + l1; - const int l3= stride + l2; - const int l4= stride + l3; - const int l5= stride + l4; - const int l6= stride + l5; - const int l7= stride + l6; - const int l8= stride + l7; - const int l9= stride + l8; - int x; - src+= stride*3; - for(x=0; x<BLOCK_SIZE; x++) - { - const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1]; - const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8]; - - int sums[9]; - sums[0] = first + src[l1]; - sums[1] = src[l1] + src[l2]; - sums[2] = src[l2] + src[l3]; - sums[3] = src[l3] + src[l4]; - sums[4] = src[l4] + src[l5]; - sums[5] = src[l5] + src[l6]; - sums[6] = src[l6] + src[l7]; - sums[7] = src[l7] + src[l8]; - sums[8] = src[l8] + last; - - src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; - src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4; - src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4; - src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4; - src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4; - src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4; - src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4; - src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4; - - src++; - } - -#endif -} - -/** - * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar - * values are correctly clipped (MMX2) - * values are wraparound (C) - * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient - 0 8 16 24 - x = 8 - x/2 = 4 - x/8 = 1 - 1 12 12 23 - */ -static inline void vertRK1Filter(uint8_t *src, int stride, int QP) -{ -#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) - src+= stride*3; -// FIXME rounding - asm volatile( - "pxor %%mm7, %%mm7 \n\t" // 0 - "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE - "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%ebx \n\t" -// 0 1 2 3 4 5 6 7 8 9 -// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 - "movq pQPb, %%mm0 \n\t" // QP,..., QP - "movq %%mm0, %%mm1 \n\t" // QP,..., QP - "paddusb b02, %%mm0 \n\t" - "psrlw $2, %%mm0 \n\t" - "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4 - "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... - "movq (%0, %1, 4), %%mm2 \n\t" // line 4 - "movq (%%ebx), %%mm3 \n\t" // line 5 - "movq %%mm2, %%mm4 \n\t" // line 4 - "pcmpeqb %%mm5, %%mm5 \n\t" // -1 - "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 - PAVGB(%%mm3, %%mm5) - "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 - "psubusb %%mm3, %%mm4 \n\t" - "psubusb %%mm2, %%mm3 \n\t" - "por %%mm3, %%mm4 \n\t" // |l4 - l5| - "psubusb %%mm0, %%mm4 \n\t" - "pcmpeqb %%mm7, %%mm4 \n\t" - "pand %%mm4, %%mm5 \n\t" // d/2 - -// "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80 - "paddb %%mm5, %%mm2 \n\t" -// "psubb %%mm6, %%mm2 \n\t" - "movq %%mm2, (%0,%1, 4) \n\t" - - "movq (%%ebx), %%mm2 \n\t" -// "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80 - "psubb %%mm5, %%mm2 \n\t" -// "psubb %%mm6, %%mm2 \n\t" - "movq %%mm2, (%%ebx) \n\t" - - "paddb %%mm6, %%mm5 \n\t" - "psrlw $2, %%mm5 \n\t" - "pand b3F, %%mm5 \n\t" - "psubb b20, %%mm5 \n\t" // (l5-l4)/8 - - "movq (%%eax, %1, 2), %%mm2 \n\t" - "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 - "paddsb %%mm5, %%mm2 \n\t" - "psubb %%mm6, %%mm2 \n\t" - "movq %%mm2, (%%eax, %1, 2) \n\t" - - "movq (%%ebx, %1), %%mm2 \n\t" - "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80 - "psubsb %%mm5, %%mm2 \n\t" - "psubb %%mm6, %%mm2 \n\t" - "movq %%mm2, (%%ebx, %1) \n\t" - - : - : "r" (src), "r" (stride) - : "%eax", "%ebx" - ); -#else - const int l1= stride; - const int l2= stride + l1; - const int l3= stride + l2; - const int l4= stride + l3; - const int l5= stride + l4; - const int l6= stride + l5; -// const int l7= stride + l6; -// const int l8= stride + l7; -// const int l9= stride + l8; - int x; - const int QP15= QP + (QP>>2); - src+= stride*3; - for(x=0; x<BLOCK_SIZE; x++) - { - const int v = (src[x+l5] - src[x+l4]); - if(ABS(v) < QP15) - { - src[x+l3] +=v>>3; - src[x+l4] +=v>>1; - src[x+l5] -=v>>1; - src[x+l6] -=v>>3; - - } - } - -#endif -} - -/** - * Experimental Filter 1 - * will not damage linear gradients - * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter - * can only smooth blocks at the expected locations (it cant smooth them if they did move) - * MMX2 version does correct clipping C version doesnt - */ -static inline void vertX1Filter(uint8_t *src, int stride, int QP) -{ -#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) - src+= stride*3; - - asm volatile( - "pxor %%mm7, %%mm7 \n\t" // 0 -// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE - "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%ebx \n\t" -// 0 1 2 3 4 5 6 7 8 9 -// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 - "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 - "movq (%0, %1, 4), %%mm1 \n\t" // line 4 - "movq %%mm1, %%mm2 \n\t" // line 4 - "psubusb %%mm0, %%mm1 \n\t" - "psubusb %%mm2, %%mm0 \n\t" - "por %%mm1, %%mm0 \n\t" // |l2 - l3| - "movq (%%ebx), %%mm3 \n\t" // line 5 - "movq (%%ebx, %1), %%mm4 \n\t" // line 6 - "movq %%mm3, %%mm5 \n\t" // line 5 - "psubusb %%mm4, %%mm3 \n\t" - "psubusb %%mm5, %%mm4 \n\t" - "por %%mm4, %%mm3 \n\t" // |l5 - l6| - PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2 - "movq %%mm2, %%mm1 \n\t" // line 4 - "psubusb %%mm5, %%mm2 \n\t" - "movq %%mm2, %%mm4 \n\t" - "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 - "psubusb %%mm1, %%mm5 \n\t" - "por %%mm5, %%mm4 \n\t" // |l4 - l5| - "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) - "movq %%mm4, %%mm3 \n\t" // d - "psubusb pQPb, %%mm4 \n\t" - "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 - "psubusb b01, %%mm3 \n\t" - "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 - - PAVGB(%%mm7, %%mm3) // d/2 - "movq %%mm3, %%mm1 \n\t" // d/2 - PAVGB(%%mm7, %%mm3) // d/4 - PAVGB(%%mm1, %%mm3) // 3*d/8 - - "movq (%0, %1, 4), %%mm0 \n\t" // line 4 - "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 - "psubusb %%mm3, %%mm0 \n\t" - "pxor %%mm2, %%mm0 \n\t" - "movq %%mm0, (%0, %1, 4) \n\t" // line 4 - - "movq (%%ebx), %%mm0 \n\t" // line 5 - "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 - "paddusb %%mm3, %%mm0 \n\t" - "pxor %%mm2, %%mm0 \n\t" - "movq %%mm0, (%%ebx) \n\t" // line 5 - - PAVGB(%%mm7, %%mm1) // d/4 - - "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 - "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 - "psubusb %%mm1, %%mm0 \n\t" - "pxor %%mm2, %%mm0 \n\t" - "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 - - "movq (%%ebx, %1), %%mm0 \n\t" // line 6 - "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 - "paddusb %%mm1, %%mm0 \n\t" - "pxor %%mm2, %%mm0 \n\t" - "movq %%mm0, (%%ebx, %1) \n\t" // line 6 - - PAVGB(%%mm7, %%mm1) // d/8 - - "movq (%%eax, %1), %%mm0 \n\t" // line 2 - "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 - "psubusb %%mm1, %%mm0 \n\t" - "pxor %%mm2, %%mm0 \n\t" - "movq %%mm0, (%%eax, %1) \n\t" // line 2 - - "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7 - "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 - "paddusb %%mm1, %%mm0 \n\t" - "pxor %%mm2, %%mm0 \n\t" - "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7 - - : - : "r" (src), "r" (stride) - : "%eax", "%ebx" - ); -#else - - const int l1= stride; - const int l2= stride + l1; - const int l3= stride + l2; - const int l4= stride + l3; - const int l5= stride + l4; - const int l6= stride + l5; - const int l7= stride + l6; -// const int l8= stride + l7; -// const int l9= stride + l8; - int x; - - src+= stride*3; - for(x=0; x<BLOCK_SIZE; x++) - { - int a= src[l3] - src[l4]; - int b= src[l4] - src[l5]; - int c= src[l5] - src[l6]; - - int d= ABS(b) - ((ABS(a) + ABS(c))>>1); - d= MAX(d, 0); - - if(d < QP) - { - int v = d * SIGN(-b); - - src[l2] +=v>>3; - src[l3] +=v>>2; - src[l4] +=(3*v)>>3; - src[l5] -=(3*v)>>3; - src[l6] -=v>>2; - src[l7] -=v>>3; - - } - src++; - } - /* - const int l1= stride; - const int l2= stride + l1; - const int l3= stride + l2; - const int l4= stride + l3; - const int l5= stride + l4; - const int l6= stride + l5; - const int l7= stride + l6; - const int l8= stride + l7; - const int l9= stride + l8; - for(int x=0; x<BLOCK_SIZE; x++) - { - int v2= src[l2]; - int v3= src[l3]; - int v4= src[l4]; - int v5= src[l5]; - int v6= src[l6]; - int v7= src[l7]; - - if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 ) - { - src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16; - src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16; - src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16; - src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16; - } - src++; - } -*/ -#endif -} - -/** - * Experimental Filter 1 (Horizontal) - * will not damage linear gradients - * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter - * can only smooth blocks at the expected locations (it cant smooth them if they did move) - * MMX2 version does correct clipping C version doesnt - * not identical with the vertical one - */ -static inline void horizX1Filter(uint8_t *src, int stride, int QP) -{ - int y; -//FIXME (has little in common with the mmx2 version) - for(y=0; y<BLOCK_SIZE; y++) - { - int a= src[1] - src[2]; - int b= src[3] - src[4]; - int c= src[5] - src[6]; - - int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); - - if(d < QP) - { - int v = d * SIGN(-b); - - src[1] +=v/8; - src[2] +=v/4; - src[3] +=3*v/8; - src[4] -=3*v/8; - src[5] -=v/4; - src[6] -=v/8; - - } - src+=stride; - } -} - - -static inline void doVertDefFilter(uint8_t src[], int stride, int QP) -{ -#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) -/* - uint8_t tmp[16]; - const int l1= stride; - const int l2= stride + l1; - const int l3= stride + l2; - const int l4= (int)tmp - (int)src - stride*3; - const int l5= (int)tmp - (int)src - stride*3 + 8; - const int l6= stride*3 + l3; - const int l7= stride + l6; - const int l8= stride + l7; - - memcpy(tmp, src+stride*7, 8); - memcpy(tmp+8, src+stride*8, 8); -*/ - src+= stride*4; - asm volatile( - -#if 0 //sligtly more accurate and slightly slower - "pxor %%mm7, %%mm7 \n\t" // 0 - "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%ebx \n\t" -// 0 1 2 3 4 5 6 7 -// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 -// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 - - - "movq (%0, %1, 2), %%mm0 \n\t" // l2 - "movq (%0), %%mm1 \n\t" // l0 - "movq %%mm0, %%mm2 \n\t" // l2 - PAVGB(%%mm7, %%mm0) // ~l2/2 - PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 - PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 - - "movq (%%eax), %%mm1 \n\t" // l1 - "movq (%%eax, %1, 2), %%mm3 \n\t" // l3 - "movq %%mm1, %%mm4 \n\t" // l1 - PAVGB(%%mm7, %%mm1) // ~l1/2 - PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 - PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 - - "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 - "psubusb %%mm1, %%mm0 \n\t" - "psubusb %%mm4, %%mm1 \n\t" - "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8 -// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0 - - "movq (%0, %1, 4), %%mm0 \n\t" // l4 - "movq %%mm0, %%mm4 \n\t" // l4 - PAVGB(%%mm7, %%mm0) // ~l4/2 - PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 - PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 - - "movq (%%ebx), %%mm2 \n\t" // l5 - "movq %%mm3, %%mm5 \n\t" // l3 - PAVGB(%%mm7, %%mm3) // ~l3/2 - PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 - PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 - - "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 - "psubusb %%mm3, %%mm0 \n\t" - "psubusb %%mm6, %%mm3 \n\t" - "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 - "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) -// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 - - "movq (%%ebx, %1), %%mm6 \n\t" // l6 - "movq %%mm6, %%mm5 \n\t" // l6 - PAVGB(%%mm7, %%mm6) // ~l6/2 - PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 - PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 - - "movq (%%ebx, %1, 2), %%mm5 \n\t" // l7 - "movq %%mm2, %%mm4 \n\t" // l5 - PAVGB(%%mm7, %%mm2) // ~l5/2 - PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 - PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 - - "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 - "psubusb %%mm2, %%mm6 \n\t" - "psubusb %%mm4, %%mm2 \n\t" - "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 -// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 - - - PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 - "movq pQPb, %%mm4 \n\t" // QP //FIXME QP+1 ? - "paddusb b01, %%mm4 \n\t" - "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP - "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 - "pand %%mm4, %%mm3 \n\t" - - "movq %%mm3, %%mm1 \n\t" -// "psubusb b01, %%mm3 \n\t" - PAVGB(%%mm7, %%mm3) - PAVGB(%%mm7, %%mm3) - "paddusb %%mm1, %%mm3 \n\t" -// "paddusb b01, %%mm3 \n\t" - - "movq (%%eax, %1, 2), %%mm6 \n\t" //l3 - "movq (%0, %1, 4), %%mm5 \n\t" //l4 - "movq (%0, %1, 4), %%mm4 \n\t" //l4 - "psubusb %%mm6, %%mm5 \n\t" - "psubusb %%mm4, %%mm6 \n\t" - "por %%mm6, %%mm5 \n\t" // |l3-l4| - "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) - "pxor %%mm6, %%mm0 \n\t" - "pand %%mm0, %%mm3 \n\t" - PMINUB(%%mm5, %%mm3, %%mm0) - - "psubusb b01, %%mm3 \n\t" - PAVGB(%%mm7, %%mm3) - - "movq (%%eax, %1, 2), %%mm0 \n\t" - "movq (%0, %1, 4), %%mm2 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "psubb %%mm3, %%mm0 \n\t" - "paddb %%mm3, %%mm2 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "movq %%mm0, (%%eax, %1, 2) \n\t" - "movq %%mm2, (%0, %1, 4) \n\t" -#endif - - "leal (%0, %1), %%eax \n\t" - "pcmpeqb %%mm6, %%mm6 \n\t" // -1 -// 0 1 2 3 4 5 6 7 -// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 -// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 - - - "movq (%%eax, %1, 2), %%mm1 \n\t" // l3 - "movq (%0, %1, 4), %%mm0 \n\t" // l4 - "pxor %%mm6, %%mm1 \n\t" // -l3-1 - PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 -// mm1=-l3-1, mm0=128-q - - "movq (%%eax, %1, 4), %%mm2 \n\t" // l5 - "movq (%%eax, %1), %%mm3 \n\t" // l2 - "pxor %%mm6, %%mm2 \n\t" // -l5-1 - "movq %%mm2, %%mm5 \n\t" // -l5-1 - "movq b80, %%mm4 \n\t" // 128 - "leal (%%eax, %1, 4), %%ebx \n\t" - PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 - PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 - PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 - PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 -// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 - - "movq (%%eax), %%mm2 \n\t" // l1 - "pxor %%mm6, %%mm2 \n\t" // -l1-1 - PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 - PAVGB((%0), %%mm1) // (l0-l3+256)/2 - "movq b80, %%mm3 \n\t" // 128 - PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 - PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 - PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 -// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 - - PAVGB((%%ebx, %1), %%mm5) // (l6-l5+256)/2 - "movq (%%ebx, %1, 2), %%mm1 \n\t" // l7 - "pxor %%mm6, %%mm1 \n\t" // -l7-1 - PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 - "movq b80, %%mm2 \n\t" // 128 - PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 - PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 - PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 -// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 - - "movq b00, %%mm1 \n\t" // 0 - "movq b00, %%mm5 \n\t" // 0 - "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 - "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 - PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| - PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| - PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 - -// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 - - "movq b00, %%mm7 \n\t" // 0 - "movq pQPb, %%mm2 \n\t" // QP - PAVGB(%%mm6, %%mm2) // 128 + QP/2 - "psubb %%mm6, %%mm2 \n\t" - - "movq %%mm4, %%mm1 \n\t" - "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) - "pxor %%mm1, %%mm4 \n\t" - "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16 - "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 - "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 -// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 - - "movq %%mm4, %%mm3 \n\t" // d - "psubusb b01, %%mm4 \n\t" - PAVGB(%%mm7, %%mm4) // d/32 - PAVGB(%%mm7, %%mm4) // (d + 32)/64 - "paddb %%mm3, %%mm4 \n\t" // 5d/64 - "pand %%mm2, %%mm4 \n\t" - - "movq b80, %%mm5 \n\t" // 128 - "psubb %%mm0, %%mm5 \n\t" // q - "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding - "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) - "pxor %%mm7, %%mm5 \n\t" - - PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64) - "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) - - "pand %%mm7, %%mm4 \n\t" - "movq (%%eax, %1, 2), %%mm0 \n\t" - "movq (%0, %1, 4), %%mm2 \n\t" - "pxor %%mm1, %%mm0 \n\t" - "pxor %%mm1, %%mm2 \n\t" - "paddb %%mm4, %%mm0 \n\t" - "psubb %%mm4, %%mm2 \n\t" - "pxor %%mm1, %%mm0 \n\t" - "pxor %%mm1, %%mm2 \n\t" - "movq %%mm0, (%%eax, %1, 2) \n\t" - "movq %%mm2, (%0, %1, 4) \n\t" - - : - : "r" (src), "r" (stride) - : "%eax", "%ebx" - ); - -/* - { - int x; - src-= stride; - for(x=0; x<BLOCK_SIZE; x++) - { - const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); - if(ABS(middleEnergy)< 8*QP) - { - const int q=(src[l4] - src[l5])/2; - const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); - const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); - - int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); - d= MAX(d, 0); - - d= (5*d + 32) >> 6; - d*= SIGN(-middleEnergy); - - if(q>0) - { - d= d<0 ? 0 : d; - d= d>q ? q : d; - } - else - { - d= d>0 ? 0 : d; - d= d<q ? q : d; - } - - src[l4]-= d; - src[l5]+= d; - } - src++; - } -src-=8; - for(x=0; x<8; x++) - { - int y; - for(y=4; y<6; y++) - { - int d= src[x+y*stride] - tmp[x+(y-4)*8]; - int ad= ABS(d); - static int max=0; - static int sum=0; - static int num=0; - static int bias=0; - - if(max<ad) max=ad; - sum+= ad>3 ? 1 : 0; - if(ad>3) - { - src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; - } - if(y==4) bias+=d; - num++; - if(num%1000000 == 0) - { - printf(" %d %d %d %d\n", num, sum, max, bias); - } - } - } -} -*/ -#elif defined (HAVE_MMX) - src+= stride*4; - - asm volatile( - "pxor %%mm7, %%mm7 \n\t" - "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%ebx \n\t" -// 0 1 2 3 4 5 6 7 -// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 -// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 - - "movq (%0), %%mm0 \n\t" - "movq %%mm0, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 - "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 - - "movq (%%eax), %%mm2 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 - "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 - - "movq (%%eax, %1), %%mm4 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 - "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 - - "paddw %%m |