diff options
Diffstat (limited to 'postproc/postprocess_template.c')
-rw-r--r-- | postproc/postprocess_template.c | 954 |
1 files changed, 431 insertions, 523 deletions
diff --git a/postproc/postprocess_template.c b/postproc/postprocess_template.c index dbb6dd5f42..6cd35b0752 100644 --- a/postproc/postprocess_template.c +++ b/postproc/postprocess_template.c @@ -45,23 +45,19 @@ //FIXME? |255-0| = 1 (shouldnt be a problem ...) +#ifdef HAVE_MMX /** * Check if the middle 8x8 Block in the given 8x16 block is flat */ -static inline int RENAME(isVertDC)(uint8_t src[], int stride){ +static inline int RENAME(isVertDC)(uint8_t src[], int stride, PPContext *c){ int numEq= 0; -#ifndef HAVE_MMX - int y; -#endif src+= stride*4; // src points to begin of the 8x8 Block -#ifdef HAVE_MMX asm volatile( "leal (%1, %2), %%eax \n\t" - "leal (%%eax, %2, 4), %%ebx \n\t" // 0 1 2 3 4 5 6 7 8 9 -// %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2 - "movq "MANGLE(mmxDCOffset)", %%mm7 \n\t" // mm7 = 0x7F - "movq "MANGLE(mmxDCThreshold)", %%mm6 \n\t" // mm6 = 0x7D +// %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 + "movq %3, %%mm7 \n\t" // mm7 = 0x7F + "movq %4, %%mm6 \n\t" // mm6 = 0x7D "movq (%1), %%mm0 \n\t" "movq (%%eax), %%mm1 \n\t" "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece @@ -79,6 +75,8 @@ asm volatile( "paddb %%mm7, %%mm2 \n\t" "pcmpgtb %%mm6, %%mm2 \n\t" "paddb %%mm2, %%mm0 \n\t" + + "leal (%%eax, %2, 4), %%eax \n\t" "movq (%1, %2, 4), %%mm2 \n\t" "psubb %%mm2, %%mm1 \n\t" @@ -86,19 +84,19 @@ asm volatile( "pcmpgtb %%mm6, %%mm1 \n\t" "paddb %%mm1, %%mm0 \n\t" - "movq (%%ebx), %%mm1 \n\t" + "movq (%%eax), %%mm1 \n\t" "psubb %%mm1, %%mm2 \n\t" "paddb %%mm7, %%mm2 \n\t" "pcmpgtb %%mm6, %%mm2 \n\t" "paddb %%mm2, %%mm0 \n\t" - "movq (%%ebx, %2), %%mm2 \n\t" + "movq (%%eax, %2), %%mm2 \n\t" "psubb %%mm2, %%mm1 \n\t" "paddb %%mm7, %%mm1 \n\t" "pcmpgtb %%mm6, %%mm1 \n\t" "paddb %%mm1, %%mm0 \n\t" - "movq (%%ebx, %2, 2), %%mm1 \n\t" + "movq (%%eax, %2, 2), %%mm1 \n\t" "psubb %%mm1, %%mm2 \n\t" "paddb %%mm7, %%mm2 \n\t" "pcmpgtb %%mm6, %%mm2 \n\t" @@ -121,49 +119,20 @@ asm volatile( #endif "movd %%mm0, %0 \n\t" : "=r" (numEq) - : "r" (src), "r" (stride) - : "%eax", "%ebx" + : "r" (src), "r" (stride), "m" (c->mmxDcOffset), "m" (c->mmxDcThreshold) + : "%eax" ); numEq= (-numEq) &0xFF; - -#else - for(y=0; y<BLOCK_SIZE-1; y++) - { - if(((src[0] - src[0+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; - if(((src[1] - src[1+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; - if(((src[2] - src[2+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; - if(((src[3] - src[3+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; - if(((src[4] - src[4+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; - if(((src[5] - src[5+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; - if(((src[6] - src[6+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; - if(((src[7] - src[7+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; - src+= stride; - } -#endif -/* if(abs(numEq - asmEq) > 0) - { - printf("\nasm:%d c:%d\n", asmEq, numEq); - for(int y=0; y<8; y++) - { - for(int x=0; x<8; x++) - { - printf("%d ", temp[x + y*stride]); - } - printf("\n"); - } - } -*/ -// for(int i=0; i<numEq/8; i++) src[i]=255; - return (numEq > vFlatnessThreshold) ? 1 : 0; + return numEq > c->ppMode.flatnessThreshold; } +#endif -static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, int QP) +static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, PPContext *c) { #ifdef HAVE_MMX int isOk; src+= stride*3; asm volatile( -// "int $3 \n\t" "movq (%1, %2), %%mm0 \n\t" "movq (%1, %2, 8), %%mm1 \n\t" "movq %%mm0, %%mm2 \n\t" @@ -171,55 +140,39 @@ static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, int QP) "psubusb %%mm2, %%mm1 \n\t" "por %%mm1, %%mm0 \n\t" // ABS Diff - "movq "MANGLE(pQPb)", %%mm7 \n\t" // QP,..., QP + "movq %3, %%mm7 \n\t" // QP,..., QP "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0 - "pcmpeqd "MANGLE(b00)", %%mm0 \n\t" - "psrlq $16, %%mm0 \n\t" - "pcmpeqd "MANGLE(bFF)", %%mm0 \n\t" -// "movd %%mm0, (%1, %2, 4)\n\t" + "packssdw %%mm0, %%mm0 \n\t" "movd %%mm0, %0 \n\t" : "=r" (isOk) - : "r" (src), "r" (stride) + : "r" (src), "r" (stride), "m" (c->pQPb) ); - return isOk; + return isOk==0; #else - - int isOk2= 1; int x; + const int QP= c->QP; src+= stride*3; for(x=0; x<BLOCK_SIZE; x++) { - if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0; + if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0; } -/* if(isOk && !isOk2 || !isOk && isOk2) - { - printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP); - for(int y=0; y<9; y++) - { - for(int x=0; x<8; x++) - { - printf("%d ", src[x + y*stride]); - } - printf("\n"); - } - } */ - return isOk2; + return 1; #endif - } /** * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 */ -static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP) +static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) { #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) src+= stride*3; asm volatile( //"movv %0 %1 %2\n\t" - "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP + "movq %2, %%mm0 \n\t" // QP,..., QP + "pxor %%mm4, %%mm4 \n\t" "movq (%0), %%mm6 \n\t" "movq (%0, %1), %%mm5 \n\t" @@ -229,7 +182,7 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP) "psubusb %%mm1, %%mm2 \n\t" "por %%mm5, %%mm2 \n\t" // ABS Diff of lines "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 - "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // diff <= QP -> FF + "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF "pand %%mm2, %%mm6 \n\t" "pandn %%mm1, %%mm2 \n\t" @@ -237,8 +190,8 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP) "movq (%0, %1, 8), %%mm5 \n\t" "leal (%0, %1, 4), %%eax \n\t" - "leal (%0, %1, 8), %%ebx \n\t" - "subl %1, %%ebx \n\t" + "leal (%0, %1, 8), %%ecx \n\t" + "subl %1, %%ecx \n\t" "addl %1, %0 \n\t" // %0 points to line 1 not 0 "movq (%0, %1, 8), %%mm7 \n\t" "movq %%mm5, %%mm1 \n\t" @@ -247,7 +200,7 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP) "psubusb %%mm1, %%mm2 \n\t" "por %%mm5, %%mm2 \n\t" // ABS Diff of lines "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 - "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // diff <= QP -> FF + "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF "pand %%mm2, %%mm7 \n\t" "pandn %%mm1, %%mm2 \n\t" @@ -255,7 +208,7 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP) // 1 2 3 4 5 6 7 8 - // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1 + // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1 // 6 4 2 2 1 1 // 6 4 4 2 // 6 8 2 @@ -286,7 +239,7 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP) "movq %%mm3, (%0,%1) \n\t" // X // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 PAVGB(%%mm4, %%mm6) //11 /2 - "movq (%%ebx), %%mm0 \n\t" // 1 + "movq (%%ecx), %%mm0 \n\t" // 1 PAVGB((%%eax, %1, 2), %%mm0) // 11/2 "movq %%mm0, %%mm3 \n\t" // 11/2 PAVGB(%%mm1, %%mm0) // 2 11/4 @@ -296,7 +249,7 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP) "movq %%mm0, (%0, %1, 2) \n\t" // X // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 - PAVGB((%%ebx), %%mm0) // 11 /2 + PAVGB((%%ecx), %%mm0) // 11 /2 PAVGB(%%mm0, %%mm6) //11 11 /4 PAVGB(%%mm1, %%mm4) // 11 /2 PAVGB(%%mm2, %%mm1) // 11 /2 @@ -323,12 +276,12 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP) PAVGB(%%mm0, %%mm1) // 11224222 /16 "movq %%mm1, (%%eax, %1, 2) \n\t" // X // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 - PAVGB((%%ebx), %%mm2) // 112 4 /8 + PAVGB((%%ecx), %%mm2) // 112 4 /8 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 PAVGB(%%mm0, %%mm6) // 1 1 /2 PAVGB(%%mm7, %%mm6) // 1 12 /4 PAVGB(%%mm2, %%mm6) // 1122424 /4 - "movq %%mm6, (%%ebx) \n\t" // X + "movq %%mm6, (%%ecx) \n\t" // X // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 PAVGB(%%mm7, %%mm5) // 11 2 /4 PAVGB(%%mm7, %%mm5) // 11 6 /8 @@ -339,8 +292,8 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP) "subl %1, %0 \n\t" : - : "r" (src), "r" (stride) - : "%eax", "%ebx" + : "r" (src), "r" (stride), "m" (c->pQPb) + : "%eax", "%ecx" ); #else const int l1= stride; @@ -356,8 +309,8 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP) src+= stride*3; for(x=0; x<BLOCK_SIZE; x++) { - const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1]; - const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8]; + const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1]; + const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8]; int sums[9]; sums[0] = first + src[l1]; @@ -381,10 +334,10 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP) src++; } - #endif } +#if 0 /** * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar * values are correctly clipped (MMX2) @@ -405,9 +358,9 @@ static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) "pxor %%mm7, %%mm7 \n\t" // 0 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%ebx \n\t" + "leal (%%eax, %1, 4), %%ecx \n\t" // 0 1 2 3 4 5 6 7 8 9 -// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 +// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP "movq %%mm0, %%mm1 \n\t" // QP,..., QP "paddusb "MANGLE(b02)", %%mm0 \n\t" @@ -415,7 +368,7 @@ static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... "movq (%0, %1, 4), %%mm2 \n\t" // line 4 - "movq (%%ebx), %%mm3 \n\t" // line 5 + "movq (%%ecx), %%mm3 \n\t" // line 5 "movq %%mm2, %%mm4 \n\t" // line 4 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 @@ -433,11 +386,11 @@ static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) // "psubb %%mm6, %%mm2 \n\t" "movq %%mm2, (%0,%1, 4) \n\t" - "movq (%%ebx), %%mm2 \n\t" + "movq (%%ecx), %%mm2 \n\t" // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80 "psubb %%mm5, %%mm2 \n\t" // "psubb %%mm6, %%mm2 \n\t" - "movq %%mm2, (%%ebx) \n\t" + "movq %%mm2, (%%ecx) \n\t" "paddb %%mm6, %%mm5 \n\t" "psrlw $2, %%mm5 \n\t" @@ -450,15 +403,15 @@ static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) "psubb %%mm6, %%mm2 \n\t" "movq %%mm2, (%%eax, %1, 2) \n\t" - "movq (%%ebx, %1), %%mm2 \n\t" + "movq (%%ecx, %1), %%mm2 \n\t" "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80 "psubsb %%mm5, %%mm2 \n\t" "psubb %%mm6, %%mm2 \n\t" - "movq %%mm2, (%%ebx, %1) \n\t" + "movq %%mm2, (%%ecx, %1) \n\t" : : "r" (src), "r" (stride) - : "%eax", "%ebx" + : "%eax", "%ecx" ); #else const int l1= stride; @@ -488,6 +441,7 @@ static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) #endif } +#endif /** * Experimental Filter 1 @@ -496,7 +450,7 @@ static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) * can only smooth blocks at the expected locations (it cant smooth them if they did move) * MMX2 version does correct clipping C version doesnt */ -static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, int QP) +static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) { #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) src+= stride*3; @@ -504,17 +458,17 @@ static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, int QP) asm volatile( "pxor %%mm7, %%mm7 \n\t" // 0 "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%ebx \n\t" + "leal (%%eax, %1, 4), %%ecx \n\t" // 0 1 2 3 4 5 6 7 8 9 -// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 +// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 "movq (%0, %1, 4), %%mm1 \n\t" // line 4 "movq %%mm1, %%mm2 \n\t" // line 4 "psubusb %%mm0, %%mm1 \n\t" "psubusb %%mm2, %%mm0 \n\t" "por %%mm1, %%mm0 \n\t" // |l2 - l3| - "movq (%%ebx), %%mm3 \n\t" // line 5 - "movq (%%ebx, %1), %%mm4 \n\t" // line 6 + "movq (%%ecx), %%mm3 \n\t" // line 5 + "movq (%%ecx, %1), %%mm4 \n\t" // line 6 "movq %%mm3, %%mm5 \n\t" // line 5 "psubusb %%mm4, %%mm3 \n\t" "psubusb %%mm5, %%mm4 \n\t" @@ -528,7 +482,7 @@ static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, int QP) "por %%mm5, %%mm4 \n\t" // |l4 - l5| "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) "movq %%mm4, %%mm3 \n\t" // d - "movq "MANGLE(pQPb)", %%mm0 \n\t" + "movq %2, %%mm0 \n\t" "paddusb %%mm0, %%mm0 \n\t" "psubusb %%mm0, %%mm4 \n\t" "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 @@ -546,11 +500,11 @@ static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, int QP) "pxor %%mm2, %%mm0 \n\t" "movq %%mm0, (%0, %1, 4) \n\t" // line 4 - "movq (%%ebx), %%mm0 \n\t" // line 5 + "movq (%%ecx), %%mm0 \n\t" // line 5 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 "paddusb %%mm3, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t" - "movq %%mm0, (%%ebx) \n\t" // line 5 + "movq %%mm0, (%%ecx) \n\t" // line 5 PAVGB(%%mm7, %%mm1) // d/4 @@ -560,11 +514,11 @@ static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, int QP) "pxor %%mm2, %%mm0 \n\t" "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 - "movq (%%ebx, %1), %%mm0 \n\t" // line 6 + "movq (%%ecx, %1), %%mm0 \n\t" // line 6 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 "paddusb %%mm1, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t" - "movq %%mm0, (%%ebx, %1) \n\t" // line 6 + "movq %%mm0, (%%ecx, %1) \n\t" // line 6 PAVGB(%%mm7, %%mm1) // d/8 @@ -574,15 +528,15 @@ static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, int QP) "pxor %%mm2, %%mm0 \n\t" "movq %%mm0, (%%eax, %1) \n\t" // line 2 - "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7 + "movq (%%ecx, %1, 2), %%mm0 \n\t" // line 7 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 "paddusb %%mm1, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t" - "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7 + "movq %%mm0, (%%ecx, %1, 2) \n\t" // line 7 : - : "r" (src), "r" (stride) - : "%eax", "%ebx" + : "r" (src), "r" (stride), "m" (co->pQPb) + : "%eax", "%ecx" ); #else @@ -607,7 +561,7 @@ static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, int QP) int d= ABS(b) - ((ABS(a) + ABS(c))>>1); d= MAX(d, 0); - if(d < QP*2) + if(d < co->QP*2) { int v = d * SIGN(-b); @@ -621,39 +575,10 @@ static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, int QP) } src++; } - /* - const int l1= stride; - const int l2= stride + l1; - const int l3= stride + l2; - const int l4= stride + l3; - const int l5= stride + l4; - const int l6= stride + l5; - const int l7= stride + l6; - const int l8= stride + l7; - const int l9= stride + l8; - for(int x=0; x<BLOCK_SIZE; x++) - { - int v2= src[l2]; - int v3= src[l3]; - int v4= src[l4]; - int v5= src[l5]; - int v6= src[l6]; - int v7= src[l7]; - - if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 ) - { - src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16; - src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16; - src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16; - src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16; - } - src++; - } -*/ #endif } -static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, int QP) +static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) { #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) /* @@ -676,10 +601,10 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, int QP) #if 0 //sligtly more accurate and slightly slower "pxor %%mm7, %%mm7 \n\t" // 0 "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%ebx \n\t" + "leal (%%eax, %1, 4), %%ecx \n\t" // 0 1 2 3 4 5 6 7 -// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 -// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 +// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 +// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 "movq (%0, %1, 2), %%mm0 \n\t" // l2 @@ -708,7 +633,7 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, int QP) PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 - "movq (%%ebx), %%mm2 \n\t" // l5 + "movq (%%ecx), %%mm2 \n\t" // l5 "movq %%mm3, %%mm5 \n\t" // l3 PAVGB(%%mm7, %%mm3) // ~l3/2 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 @@ -721,13 +646,13 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, int QP) "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 - "movq (%%ebx, %1), %%mm6 \n\t" // l6 + "movq (%%ecx, %1), %%mm6 \n\t" // l6 "movq %%mm6, %%mm5 \n\t" // l6 PAVGB(%%mm7, %%mm6) // ~l6/2 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 - "movq (%%ebx, %1, 2), %%mm5 \n\t" // l7 + "movq (%%ecx, %1, 2), %%mm5 \n\t" // l7 "movq %%mm2, %%mm4 \n\t" // l5 PAVGB(%%mm7, %%mm2) // ~l5/2 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 @@ -741,7 +666,7 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, int QP) PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 - "movq "MANGLE(pQPb)", %%mm4 \n\t" // QP //FIXME QP+1 ? + "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ? "paddusb "MANGLE(b01)", %%mm4 \n\t" "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 @@ -783,8 +708,8 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, int QP) "leal (%0, %1), %%eax \n\t" "pcmpeqb %%mm6, %%mm6 \n\t" // -1 // 0 1 2 3 4 5 6 7 -// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 -// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 +// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 +// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 "movq (%%eax, %1, 2), %%mm1 \n\t" // l3 @@ -798,7 +723,7 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, int QP) "pxor %%mm6, %%mm2 \n\t" // -l5-1 "movq %%mm2, %%mm5 \n\t" // -l5-1 "movq "MANGLE(b80)", %%mm4 \n\t" // 128 - "leal (%%eax, %1, 4), %%ebx \n\t" + "leal (%%eax, %1, 4), %%ecx \n\t" PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 @@ -815,8 +740,8 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, int QP) PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 - PAVGB((%%ebx, %1), %%mm5) // (l6-l5+256)/2 - "movq (%%ebx, %1, 2), %%mm1 \n\t" // l7 + PAVGB((%%ecx, %1), %%mm5) // (l6-l5+256)/2 + "movq (%%ecx, %1, 2), %%mm1 \n\t" // l7 "pxor %%mm6, %%mm1 \n\t" // -l7-1 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 "movq "MANGLE(b80)", %%mm2 \n\t" // 128 @@ -836,7 +761,7 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, int QP) // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 "movq "MANGLE(b00)", %%mm7 \n\t" // 0 - "movq "MANGLE(pQPb)", %%mm2 \n\t" // QP + "movq %2, %%mm2 \n\t" // QP PAVGB(%%mm6, %%mm2) // 128 + QP/2 "psubb %%mm6, %%mm2 \n\t" @@ -877,8 +802,8 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, int QP) "movq %%mm2, (%0, %1, 4) \n\t" : - : "r" (src), "r" (stride) - : "%eax", "%ebx" + : "r" (src), "r" (stride), "m" (c->pQPb) + : "%eax", "%ecx" ); /* @@ -951,10 +876,12 @@ src-=8; asm volatile( "pxor %%mm7, %%mm7 \n\t" "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%ebx \n\t" + "leal (%%eax, %1, 4), %%edx \n\t" + "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars + "andl $0xFFFFFFF8, %%ecx \n\t" // align // 0 1 2 3 4 5 6 7 -// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 -// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 +// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1 +// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 "movq (%0), %%mm0 \n\t" "movq %%mm0, %%mm1 \n\t" @@ -992,8 +919,8 @@ src-=8; "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 - "movq %%mm0, "MANGLE(temp0)" \n\t" // 2L0 - 5L1 + 5L2 - 2L3 - "movq %%mm1, "MANGLE(temp1)" \n\t" // 2H0 - 5H1 + 5H2 - 2H3 + "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 + "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 "movq (%0, %1, 4), %%mm0 \n\t" "movq %%mm0, %%mm1 \n\t" @@ -1002,8 +929,8 @@ src-=8; "psubw %%mm0, %%mm2 \n\t" // L3 - L4 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 - "movq %%mm2, "MANGLE(temp2)" \n\t" // L3 - L4 - "movq %%mm3, "MANGLE(temp3)" \n\t" // H3 - H4 + "movq %%mm2, 16(%%ecx) \n\t" // L3 - L4 + "movq %%mm3, 24(%%ecx) \n\t" // H3 - H4 "paddw %%mm4, %%mm4 \n\t" // 2L2 "paddw %%mm5, %%mm5 \n\t" // 2H2 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 @@ -1014,7 +941,7 @@ src-=8; "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 //50 opcodes so far - "movq (%%ebx), %%mm2 \n\t" + "movq (%%edx), %%mm2 \n\t" "movq %%mm2, %%mm3 \n\t" "punpcklbw %%mm7, %%mm2 \n\t" // L5 "punpckhbw %%mm7, %%mm3 \n\t" // H5 @@ -1023,10 +950,10 @@ src-=8; "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 - "movq (%%ebx, %1), %%mm6 \n\t" + "movq (%%edx, %1), %%mm6 \n\t" "punpcklbw %%mm7, %%mm6 \n\t" // L6 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 - "movq (%%ebx, %1), %%mm6 \n\t" + "movq (%%edx, %1), %%mm6 \n\t" "punpckhbw %%mm7, %%mm6 \n\t" // H6 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 @@ -1040,7 +967,7 @@ src-=8; "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - "movq (%%ebx, %1, 2), %%mm2 \n\t" + "movq (%%edx, %1, 2), %%mm2 \n\t" "movq %%mm2, %%mm3 \n\t" "punpcklbw %%mm7, %%mm2 \n\t" // L7 "punpckhbw %%mm7, %%mm3 \n\t" // H7 @@ -1050,8 +977,8 @@ src-=8; "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 - "movq "MANGLE(temp0)", %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 - "movq "MANGLE(temp1)", %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 + "movq (%%ecx), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 + "movq 8(%%ecx), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 #ifdef HAVE_MMX2 "movq %%mm7, %%mm6 \n\t" // 0 @@ -1106,8 +1033,6 @@ src-=8; "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| // 100 opcodes "movd %2, %%mm2 \n\t" // QP - "punpcklwd %%mm2, %%mm2 \n\t" - "punpcklwd %%mm2, %%mm2 \n\t" "psllw $3, %%mm2 \n\t" // 8QP "movq %%mm2, %%mm3 \n\t" // 8QP "pcmpgtw %%mm4, %%mm2 \n\t" @@ -1129,18 +1054,8 @@ src-=8; "psrlw $6, %%mm4 \n\t" "psrlw $6, %%mm5 \n\t" -/* - "movq w06, %%mm2 \n\t" // 6 - "paddw %%mm2, %%mm4 \n\t" - "paddw %%mm2, %%mm5 \n\t" - "movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16 -//FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120 - "pmulhw %%mm2, %%mm4 \n\t" // hd/13 - "pmulhw %%mm2, %%mm5 \n\t" // ld/13 -*/ - - "movq "MANGLE(temp2)", %%mm0 \n\t" // L3 - L4 - "movq "MANGLE(temp3)", %%mm1 \n\t" // H3 - H4 + "movq 16(%%ecx), %%mm0 \n\t" // L3 - L4 + "movq 24(%%ecx), %%mm1 \n\t" // H3 - H4 "pxor %%mm2, %%mm2 \n\t" "pxor %%mm3, %%mm3 \n\t" @@ -1183,8 +1098,8 @@ src-=8; "movq %%mm0, (%0, %1, 4) \n\t" : - : "r" (src), "r" (stride), "r" (QP) - : "%eax", "%ebx" + : "r" (src), "r" (stride), "m" (c->pQPb) + : "%eax", "%edx", "%ecx" ); #else const int l1= stride; @@ -1201,7 +1116,7 @@ src-=8; for(x=0; x<BLOCK_SIZE; x++) { const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); - if(ABS(middleEnergy) < 8*QP) + if(ABS(middleEnergy) < 8*c->QP) { const int q=(src[l4] - src[l5])/2; const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); @@ -1232,21 +1147,25 @@ src-=8; #endif } -static inline void RENAME(dering)(uint8_t src[], int stride, int QP) +static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) { #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) asm volatile( - "movq "MANGLE(pQPb)", %%mm0 \n\t" - "paddusb %%mm0, %%mm0 \n\t" - "movq %%mm0, "MANGLE(pQPb2)" \n\t" + "pxor %%mm6, %%mm6 \n\t" + "pcmpeqb %%mm7, %%mm7 \n\t" + "movq %2, %%mm0 \n\t" + "punpcklbw %%mm6, %%mm0 \n\t" + "psrlw $1, %%mm0 \n\t" + "psubw %%mm7, %%mm0 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "movq %%mm0, %3 \n\t" "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%ebx \n\t" + "leal (%%eax, %1, 4), %%edx \n\t" + // 0 1 2 3 4 5 6 7 8 9 -// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 +// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 - "pcmpeqb %%mm7, %%mm7 \n\t" - "pxor %%mm6, %%mm6 \n\t" #undef FIND_MIN_MAX #ifdef HAVE_MMX2 #define FIND_MIN_MAX(addr)\ @@ -1267,9 +1186,9 @@ FIND_MIN_MAX((%%eax)) FIND_MIN_MAX((%%eax, %1)) FIND_MIN_MAX((%%eax, %1, 2)) FIND_MIN_MAX((%0, %1, 4)) -FIND_MIN_MAX((%%ebx)) -FIND_MIN_MAX((%%ebx, %1)) -FIND_MIN_MAX((%%ebx, %1, 2)) +FIND_MIN_MAX((%%edx)) +FIND_MIN_MAX((%%edx, %1)) +FIND_MIN_MAX((%%edx, %1, 2)) FIND_MIN_MAX((%0, %1, 8)) "movq %%mm7, %%mm4 \n\t" @@ -1322,11 +1241,13 @@ FIND_MIN_MAX((%0, %1, 8)) "movd %%mm6, %%ecx \n\t" "cmpb "MANGLE(deringThreshold)", %%cl \n\t" " jb 1f \n\t" + "leal -24(%%esp), %%ecx \n\t" + "andl $0xFFFFFFF8, %%ecx \n\t" PAVGB(%%mm0, %%mm7) // a=(max + min)/2 "punpcklbw %%mm7, %%mm7 \n\t" "punpcklbw %%mm7, %%mm7 \n\t" "punpcklbw %%mm7, %%mm7 \n\t" - "movq %%mm7, "MANGLE(temp0)" \n\t" + "movq %%mm7, (%%ecx) \n\t" "movq (%0), %%mm0 \n\t" // L10 "movq %%mm0, %%mm1 \n\t" // L10 @@ -1390,8 +1311,8 @@ FIND_MIN_MAX((%0, %1, 8)) PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ PAVGB(lx, pplx) \ - "movq " #lx ", "MANGLE(temp1)" \n\t"\ - "movq "MANGLE(temp0)", " #lx " \n\t"\ + "movq " #lx ", 8(%%ecx) \n\t"\ + "movq (%%ecx), " #lx " \n\t"\ "psubusb " #lx ", " #t1 " \n\t"\ "psubusb " #lx ", " #t0 " \n\t"\ "psubusb " #lx ", " #sx " \n\t"\ @@ -1405,8 +1326,8 @@ FIND_MIN_MAX((%0, %1, 8)) PAVGB(plx, pplx) /* filtered */\ "movq " #dst ", " #t0 " \n\t" /* dst */\ "movq " #t0 ", " #t1 " \n\t" /* dst */\ - "psubusb "MANGLE(pQPb2)", " #t0 " \n\t"\ - "paddusb "MANGLE(pQPb2)", " #t1 " \n\t"\ + "psubusb %3, " #t0 " \n\t"\ + "paddusb %3, " #t1 " \n\t"\ PMAXUB(t0, pplx)\ PMINUB(t1, pplx, t0)\ "paddb " #sx ", " #ppsx " \n\t"\ @@ -1418,7 +1339,7 @@ FIND_MIN_MAX((%0, %1, 8)) "pandn " #dst ", " #ppsx " \n\t"\ "por " #pplx ", " #ppsx " \n\t"\ "movq " #ppsx ", " #dst " \n\t"\ - "movq "MANGLE(temp1)", " #lx " \n\t" + "movq 8(%%ecx), " #lx " \n\t" /* 0000000 @@ -1439,15 +1360,15 @@ FIND_MIN_MAX((%0, %1, 8)) DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) -DERING_CORE((%0, %1, 4),(%%ebx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) -DERING_CORE((%%ebx),(%%ebx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) -DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) -DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) -DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) +DERING_CORE((%0, %1, 4),(%%edx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) +DERING_CORE((%%edx),(%%edx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) +DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) +DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) +DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) "1: \n\t" - : : "r" (src), "r" (stride), "r" (QP) - : "%eax", "%ebx", "%ecx" + : : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2) + : "%eax", "%edx", "%ecx" ); #else int y; @@ -1456,6 +1377,7 @@ DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm int avg; uint8_t *p; int s[10]; + const int QP2= c->QP/2 + 1; for(y=1; y<9; y++) { @@ -1468,30 +1390,41 @@ DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm if(*p < min) min= *p; } } - avg= (min + max + 1)/2; + avg= (min + max + 1)>>1; if(max - min <deringThreshold) return; for(y=0; y<10; y++) { - int x; int t = 0; - p= src + stride*y; - for(x=0; x<10; x++) - { - if(*p > avg) t |= (1<<x); - p++; - } + + if(src[stride*y + 0] > avg) t+= 1; + if(src[stride*y + 1] > avg) t+= 2; + if(src[stride*y + 2] > avg) t+= 4; + if(src[stride*y + 3] > avg) t+= 8; + if(src[stride*y + 4] > avg) t+= 16; + if(src[stride*y + 5] > avg) t+= 32; + if(src[stride*y + 6] > avg) t+= 64; + if(src[stride*y + 7] > avg) t+= 128; + if(src[stride*y + 8] > avg) t+= 256; + if(src[stride*y + 9] > avg) t+= 512; + t |= (~t)<<16; t &= (t<<1) & (t>>1); s[y] = t; } - + for(y=1; y<9; y++) { - int x; int t = s[y-1] & s[y] & s[y+1]; t|= t>>16; + s[y-1]= t; + } + + for(y=1; y<9; y++) + { + int x; + int t = s[y-1]; p= src + stride*y; for(x=1; x<9; x++) @@ -1544,8 +1477,8 @@ DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm } } #endif - if (*p + 2*QP < f) *p= *p + 2*QP; - else if(*p - 2*QP > f) *p= *p - 2*QP; + if (*p + QP2 < f) *p= *p + QP2; + else if(*p - QP2 > f) *p= *p - QP2; else *p=f; } } @@ -1582,9 +1515,9 @@ static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int strid src+= 4*stride; asm volatile( "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%ebx \n\t" + "leal (%%eax, %1, 4), %%ecx \n\t" // 0 1 2 3 4 5 6 7 8 9 -// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 +// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 "movq (%0), %%mm0 \n\t" "movq (%%eax, %1), %%mm1 \n\t" @@ -1593,15 +1526,15 @@ static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int strid "movq (%0, %1, 4), %%mm0 \n\t" PAVGB(%%mm0, %%mm1) "movq %%mm1, (%%eax, %1, 2) \n\t" - "movq (%%ebx, %1), %%mm1 \n\t" + "movq (%%ecx, %1), %%mm1 \n\t" PAVGB(%%mm1, %%mm0) - "movq %%mm0, (%%ebx) \n\t" + "movq %%mm0, (%%ecx) \n\t" "movq (%0, %1, 8), %%mm0 \n\t" PAVGB(%%mm0, %%mm1) - "movq %%mm1, (%%ebx, %1, 2) \n\t" + "movq %%mm1, (%%ecx, %1, 2) \n\t" : : "r" (src), "r" (stride) - : "%eax", "%ebx" + : "%eax", "%ecx" ); #else int x; @@ -1631,12 +1564,12 @@ static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride src+= stride*3; asm volatile( "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%ebx \n\t" - "leal (%%ebx, %1, 4), %%ecx \n\t" + "leal (%%eax, %1, 4), %%edx \n\t" + "leal (%%edx, %1, 4), %%ecx \n\t" "addl %1, %%ecx \n\t" "pxor %%mm7, %%mm7 \n\t" // 0 1 2 3 4 5 6 7 8 9 10 -// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 ecx +// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx #define DEINT_CUBIC(a,b,c,d,e)\ "movq " #a ", %%mm0 \n\t"\ @@ -1660,13 +1593,13 @@ static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride "packuswb %%mm3, %%mm1 \n\t"\ "movq %%mm1, " #c " \n\t" -DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1)) -DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8)) -DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx)) -DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2)) +DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1)) +DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8)) +DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx)) +DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2)) : : "r" (src), "r" (stride) - : "%eax", "%ebx", "ecx" + : "%eax", "%edx", "ecx" ); #else int x; @@ -1687,6 +1620,85 @@ DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2)) * will be called for every 8x8 block and can read & write from line 4-15 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too * lines 4-12 will be read into the deblocking filter and should be deinterlaced + * this filter will read lines 4-13 and write 5-11 + * no cliping in C version + */ +static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) +{ +#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) + src+= stride*4; + asm volatile( + "leal (%0, %1), %%eax \n\t" + "leal (%%eax, %1, 4), %%edx \n\t" + "pxor %%mm7, %%mm7 \n\t" + "movq (%2), %%mm0 \n\t" +// 0 1 2 3 4 5 6 7 8 9 10 +// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx + +#define DEINT_FF(a,b,c,d)\ + "movq " #a ", %%mm1 \n\t"\ + "movq " #b ", %%mm2 \n\t"\ + "movq " #c ", %%mm3 \n\t"\ + "movq " #d ", %%mm4 \n\t"\ + PAVGB(%%mm3, %%mm1) |