summaryrefslogtreecommitdiffstats
path: root/postproc
diff options
context:
space:
mode:
authormichael <michael@b3059339-0415-0410-9bf9-f77b7e298cf2>2001-11-24 22:16:29 +0000
committermichael <michael@b3059339-0415-0410-9bf9-f77b7e298cf2>2001-11-24 22:16:29 +0000
commit09c4c292778d7d1fe8604058b2d8009a9fcbb296 (patch)
treeef1755f400dcff57a7ce70121216655beffd6f11 /postproc
parent44711c5f07776de55fedfbf679f56a035a157b58 (diff)
downloadmpv-09c4c292778d7d1fe8604058b2d8009a9fcbb296.tar.bz2
mpv-09c4c292778d7d1fe8604058b2d8009a9fcbb296.tar.xz
runtime cpu detection
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@3100 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'postproc')
-rw-r--r--postproc/postprocess.c3211
-rw-r--r--postproc/postprocess_template.c828
2 files changed, 174 insertions, 3865 deletions
diff --git a/postproc/postprocess.c b/postproc/postprocess.c
index a2e9174b70..e54bcf05e7 100644
--- a/postproc/postprocess.c
+++ b/postproc/postprocess.c
@@ -62,6 +62,8 @@ optimize c versions
try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
smart blur
commandline option for the deblock / dering thresholds
+put fastmemcpy back
+dont use #ifdef ARCH_X86 for the asm stuff ... cross compilers? (note cpudetect uses ARCH_X86)
...
*/
@@ -78,43 +80,25 @@ commandline option for the deblock / dering thresholds
//#undef HAVE_MMX2
//#define HAVE_3DNOW
//#undef HAVE_MMX
+//#undef ARCH_X86
//#define DEBUG_BRIGHTNESS
-#include "../libvo/fastmemcpy.h"
+//#include "../libvo/fastmemcpy.h"
#include "postprocess.h"
+#include "../cpudetect.h"
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define SIGN(a) ((a) > 0 ? 1 : -1)
-#ifdef HAVE_MMX2
-#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
-#elif defined (HAVE_3DNOW)
-#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
-#endif
-
-#ifdef HAVE_MMX2
-#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
-#elif defined (HAVE_MMX)
-#define PMINUB(b,a,t) \
- "movq " #a ", " #t " \n\t"\
- "psubusb " #b ", " #t " \n\t"\
- "psubb " #t ", " #a " \n\t"
-#endif
-
-#ifdef HAVE_MMX2
-#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
-#elif defined (HAVE_MMX)
-#define PMAXUB(a,b) \
- "psubusb " #a ", " #b " \n\t"\
- "paddb " #a ", " #b " \n\t"
-#endif
-
-
#define GET_MODE_BUFFER_SIZE 500
#define OPTIONS_ARRAY_SIZE 10
-#ifdef HAVE_MMX
+#ifdef ARCH_X86
+#define CAN_COMPILE_X86_ASM
+#endif
+
+#ifdef CAN_COMPILE_X86_ASM
static volatile uint64_t __attribute__((aligned(8))) packedYOffset= 0x0000000000000000LL;
static volatile uint64_t __attribute__((aligned(8))) packedYScale= 0x0100010001000100LL;
static uint64_t __attribute__((aligned(8))) w05= 0x0005000500050005LL;
@@ -157,7 +141,6 @@ static uint32_t __attribute__((aligned(4))) maxTmpNoise[4];
#else
static uint64_t packedYOffset= 0x0000000000000000LL;
static uint64_t packedYScale= 0x0100010001000100LL;
-static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
#endif
int hFlatnessThreshold= 56 - 16;
@@ -196,7 +179,7 @@ static char *replaceTable[]=
NULL //End Marker
};
-#ifdef HAVE_MMX
+#ifdef CAN_COMPILE_X86_ASM
static inline void unusedVariableWarningFixer()
{
if(
@@ -220,7 +203,7 @@ static inline long long rdtsc()
}
#endif
-#ifdef HAVE_MMX2
+#ifdef CAN_COMPILE_X86_ASM
static inline void prefetchnta(void *p)
{
asm volatile( "prefetchnta (%0)\n\t"
@@ -250,1229 +233,7 @@ static inline void prefetcht2(void *p)
}
#endif
-//FIXME? |255-0| = 1 (shouldnt be a problem ...)
-/**
- * Check if the middle 8x8 Block in the given 8x16 block is flat
- */
-static inline int isVertDC(uint8_t src[], int stride){
- int numEq= 0;
-#ifndef HAVE_MMX
- int y;
-#endif
- src+= stride*4; // src points to begin of the 8x8 Block
-#ifdef HAVE_MMX
-asm volatile(
- "leal (%1, %2), %%eax \n\t"
- "leal (%%eax, %2, 4), %%ebx \n\t"
-// 0 1 2 3 4 5 6 7 8 9
-// %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2
- "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
- "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
- "movq (%1), %%mm0 \n\t"
- "movq (%%eax), %%mm1 \n\t"
- "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece
- "paddb %%mm7, %%mm0 \n\t"
- "pcmpgtb %%mm6, %%mm0 \n\t"
-
- "movq (%%eax,%2), %%mm2 \n\t"
- "psubb %%mm2, %%mm1 \n\t"
- "paddb %%mm7, %%mm1 \n\t"
- "pcmpgtb %%mm6, %%mm1 \n\t"
- "paddb %%mm1, %%mm0 \n\t"
-
- "movq (%%eax, %2, 2), %%mm1 \n\t"
- "psubb %%mm1, %%mm2 \n\t"
- "paddb %%mm7, %%mm2 \n\t"
- "pcmpgtb %%mm6, %%mm2 \n\t"
- "paddb %%mm2, %%mm0 \n\t"
-
- "movq (%1, %2, 4), %%mm2 \n\t"
- "psubb %%mm2, %%mm1 \n\t"
- "paddb %%mm7, %%mm1 \n\t"
- "pcmpgtb %%mm6, %%mm1 \n\t"
- "paddb %%mm1, %%mm0 \n\t"
-
- "movq (%%ebx), %%mm1 \n\t"
- "psubb %%mm1, %%mm2 \n\t"
- "paddb %%mm7, %%mm2 \n\t"
- "pcmpgtb %%mm6, %%mm2 \n\t"
- "paddb %%mm2, %%mm0 \n\t"
-
- "movq (%%ebx, %2), %%mm2 \n\t"
- "psubb %%mm2, %%mm1 \n\t"
- "paddb %%mm7, %%mm1 \n\t"
- "pcmpgtb %%mm6, %%mm1 \n\t"
- "paddb %%mm1, %%mm0 \n\t"
-
- "movq (%%ebx, %2, 2), %%mm1 \n\t"
- "psubb %%mm1, %%mm2 \n\t"
- "paddb %%mm7, %%mm2 \n\t"
- "pcmpgtb %%mm6, %%mm2 \n\t"
- "paddb %%mm2, %%mm0 \n\t"
-
- " \n\t"
-#ifdef HAVE_MMX2
- "pxor %%mm7, %%mm7 \n\t"
- "psadbw %%mm7, %%mm0 \n\t"
-#else
- "movq %%mm0, %%mm1 \n\t"
- "psrlw $8, %%mm0 \n\t"
- "paddb %%mm1, %%mm0 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "psrlq $16, %%mm0 \n\t"
- "paddb %%mm1, %%mm0 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "psrlq $32, %%mm0 \n\t"
- "paddb %%mm1, %%mm0 \n\t"
-#endif
- "movd %%mm0, %0 \n\t"
- : "=r" (numEq)
- : "r" (src), "r" (stride)
- : "%ebx"
- );
- numEq= (-numEq) &0xFF;
-
-#else
- for(y=0; y<BLOCK_SIZE-1; y++)
- {
- if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
- if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
- if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
- if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
- if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
- if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
- if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
- if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
- src+= stride;
- }
-#endif
-/* if(abs(numEq - asmEq) > 0)
- {
- printf("\nasm:%d c:%d\n", asmEq, numEq);
- for(int y=0; y<8; y++)
- {
- for(int x=0; x<8; x++)
- {
- printf("%d ", temp[x + y*stride]);
- }
- printf("\n");
- }
- }
-*/
-// for(int i=0; i<numEq/8; i++) src[i]=255;
- return (numEq > vFlatnessThreshold) ? 1 : 0;
-}
-
-static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
-{
-#ifdef HAVE_MMX
- int isOk;
- src+= stride*3;
- asm volatile(
-// "int $3 \n\t"
- "movq (%1, %2), %%mm0 \n\t"
- "movq (%1, %2, 8), %%mm1 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "psubusb %%mm1, %%mm0 \n\t"
- "psubusb %%mm2, %%mm1 \n\t"
- "por %%mm1, %%mm0 \n\t" // ABS Diff
-
- "movq pQPb, %%mm7 \n\t" // QP,..., QP
- "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
- "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
- "pcmpeqd b00, %%mm0 \n\t"
- "psrlq $16, %%mm0 \n\t"
- "pcmpeqd bFF, %%mm0 \n\t"
-// "movd %%mm0, (%1, %2, 4)\n\t"
- "movd %%mm0, %0 \n\t"
- : "=r" (isOk)
- : "r" (src), "r" (stride)
- );
- return isOk;
-#else
-
- int isOk2= 1;
- int x;
- src+= stride*3;
- for(x=0; x<BLOCK_SIZE; x++)
- {
- if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
- }
-/* if(isOk && !isOk2 || !isOk && isOk2)
- {
- printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
- for(int y=0; y<9; y++)
- {
- for(int x=0; x<8; x++)
- {
- printf("%d ", src[x + y*stride]);
- }
- printf("\n");
- }
- } */
-
- return isOk2;
-#endif
-
-}
-
-/**
- * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
- * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
- */
-static inline void doVertLowPass(uint8_t *src, int stride, int QP)
-{
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
- src+= stride*3;
- asm volatile( //"movv %0 %1 %2\n\t"
- "movq pQPb, %%mm0 \n\t" // QP,..., QP
-
- "movq (%0), %%mm6 \n\t"
- "movq (%0, %1), %%mm5 \n\t"
- "movq %%mm5, %%mm1 \n\t"
- "movq %%mm6, %%mm2 \n\t"
- "psubusb %%mm6, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
- "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
- "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
-
- "pand %%mm2, %%mm6 \n\t"
- "pandn %%mm1, %%mm2 \n\t"
- "por %%mm2, %%mm6 \n\t"// First Line to Filter
-
- "movq (%0, %1, 8), %%mm5 \n\t"
- "leal (%0, %1, 4), %%eax \n\t"
- "leal (%0, %1, 8), %%ebx \n\t"
- "subl %1, %%ebx \n\t"
- "addl %1, %0 \n\t" // %0 points to line 1 not 0
- "movq (%0, %1, 8), %%mm7 \n\t"
- "movq %%mm5, %%mm1 \n\t"
- "movq %%mm7, %%mm2 \n\t"
- "psubusb %%mm7, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
- "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
- "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
-
- "pand %%mm2, %%mm7 \n\t"
- "pandn %%mm1, %%mm2 \n\t"
- "por %%mm2, %%mm7 \n\t" // First Line to Filter
-
-
- // 1 2 3 4 5 6 7 8
- // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
- // 6 4 2 2 1 1
- // 6 4 4 2
- // 6 8 2
-
- "movq (%0, %1), %%mm0 \n\t" // 1
- "movq %%mm0, %%mm1 \n\t" // 1
- PAVGB(%%mm6, %%mm0) //1 1 /2
- PAVGB(%%mm6, %%mm0) //3 1 /4
-
- "movq (%0, %1, 4), %%mm2 \n\t" // 1
- "movq %%mm2, %%mm5 \n\t" // 1
- PAVGB((%%eax), %%mm2) // 11 /2
- PAVGB((%0, %1, 2), %%mm2) // 211 /4
- "movq %%mm2, %%mm3 \n\t" // 211 /4
- "movq (%0), %%mm4 \n\t" // 1
- PAVGB(%%mm4, %%mm3) // 4 211 /8
- PAVGB(%%mm0, %%mm3) //642211 /16
- "movq %%mm3, (%0) \n\t" // X
- // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
- "movq %%mm1, %%mm0 \n\t" // 1
- PAVGB(%%mm6, %%mm0) //1 1 /2
- "movq %%mm4, %%mm3 \n\t" // 1
- PAVGB((%0,%1,2), %%mm3) // 1 1 /2
- PAVGB((%%eax,%1,2), %%mm5) // 11 /2
- PAVGB((%%eax), %%mm5) // 211 /4
- PAVGB(%%mm5, %%mm3) // 2 2211 /8
- PAVGB(%%mm0, %%mm3) //4242211 /16
- "movq %%mm3, (%0,%1) \n\t" // X
- // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
- PAVGB(%%mm4, %%mm6) //11 /2
- "movq (%%ebx), %%mm0 \n\t" // 1
- PAVGB((%%eax, %1, 2), %%mm0) // 11/2
- "movq %%mm0, %%mm3 \n\t" // 11/2
- PAVGB(%%mm1, %%mm0) // 2 11/4
- PAVGB(%%mm6, %%mm0) //222 11/8
- PAVGB(%%mm2, %%mm0) //22242211/16
- "movq (%0, %1, 2), %%mm2 \n\t" // 1
- "movq %%mm0, (%0, %1, 2) \n\t" // X
- // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
- "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
- PAVGB((%%ebx), %%mm0) // 11 /2
- PAVGB(%%mm0, %%mm6) //11 11 /4
- PAVGB(%%mm1, %%mm4) // 11 /2
- PAVGB(%%mm2, %%mm1) // 11 /2
- PAVGB(%%mm1, %%mm6) //1122 11 /8
- PAVGB(%%mm5, %%mm6) //112242211 /16
- "movq (%%eax), %%mm5 \n\t" // 1
- "movq %%mm6, (%%eax) \n\t" // X
- // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
- "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
- PAVGB(%%mm7, %%mm6) // 11 /2
- PAVGB(%%mm4, %%mm6) // 11 11 /4
- PAVGB(%%mm3, %%mm6) // 11 2211 /8
- PAVGB(%%mm5, %%mm2) // 11 /2
- "movq (%0, %1, 4), %%mm4 \n\t" // 1
- PAVGB(%%mm4, %%mm2) // 112 /4
- PAVGB(%%mm2, %%mm6) // 112242211 /16
- "movq %%mm6, (%0, %1, 4) \n\t" // X
- // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
- PAVGB(%%mm7, %%mm1) // 11 2 /4
- PAVGB(%%mm4, %%mm5) // 11 /2
- PAVGB(%%mm5, %%mm0) // 11 11 /4
- "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
- PAVGB(%%mm6, %%mm1) // 11 4 2 /8
- PAVGB(%%mm0, %%mm1) // 11224222 /16
- "movq %%mm1, (%%eax, %1, 2) \n\t" // X
- // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
- PAVGB((%%ebx), %%mm2) // 112 4 /8
- "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
- PAVGB(%%mm0, %%mm6) // 1 1 /2
- PAVGB(%%mm7, %%mm6) // 1 12 /4
- PAVGB(%%mm2, %%mm6) // 1122424 /4
- "movq %%mm6, (%%ebx) \n\t" // X
- // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
- PAVGB(%%mm7, %%mm5) // 11 2 /4
- PAVGB(%%mm7, %%mm5) // 11 6 /8
-
- PAVGB(%%mm3, %%mm0) // 112 /4
- PAVGB(%%mm0, %%mm5) // 112246 /16
- "movq %%mm5, (%%eax, %1, 4) \n\t" // X
- "subl %1, %0 \n\t"
-
- :
- : "r" (src), "r" (stride)
- : "%eax", "%ebx"
- );
-#else
- const int l1= stride;
- const int l2= stride + l1;
- const int l3= stride + l2;
- const int l4= stride + l3;
- const int l5= stride + l4;
- const int l6= stride + l5;
- const int l7= stride + l6;
- const int l8= stride + l7;
- const int l9= stride + l8;
- int x;
- src+= stride*3;
- for(x=0; x<BLOCK_SIZE; x++)
- {
- const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
- const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
-
- int sums[9];
- sums[0] = first + src[l1];
- sums[1] = src[l1] + src[l2];
- sums[2] = src[l2] + src[l3];
- sums[3] = src[l3] + src[l4];
- sums[4] = src[l4] + src[l5];
- sums[5] = src[l5] + src[l6];
- sums[6] = src[l6] + src[l7];
- sums[7] = src[l7] + src[l8];
- sums[8] = src[l8] + last;
-
- src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
- src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
- src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
- src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
- src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
- src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
- src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
- src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
-
- src++;
- }
-
-#endif
-}
-
-/**
- * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
- * values are correctly clipped (MMX2)
- * values are wraparound (C)
- * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient
- 0 8 16 24
- x = 8
- x/2 = 4
- x/8 = 1
- 1 12 12 23
- */
-static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
-{
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
- src+= stride*3;
-// FIXME rounding
- asm volatile(
- "pxor %%mm7, %%mm7 \n\t" // 0
- "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
- "leal (%0, %1), %%eax \n\t"
- "leal (%%eax, %1, 4), %%ebx \n\t"
-// 0 1 2 3 4 5 6 7 8 9
-// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
- "movq pQPb, %%mm0 \n\t" // QP,..., QP
- "movq %%mm0, %%mm1 \n\t" // QP,..., QP
- "paddusb b02, %%mm0 \n\t"
- "psrlw $2, %%mm0 \n\t"
- "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
- "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
- "movq (%0, %1, 4), %%mm2 \n\t" // line 4
- "movq (%%ebx), %%mm3 \n\t" // line 5
- "movq %%mm2, %%mm4 \n\t" // line 4
- "pcmpeqb %%mm5, %%mm5 \n\t" // -1
- "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
- PAVGB(%%mm3, %%mm5)
- "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
- "psubusb %%mm3, %%mm4 \n\t"
- "psubusb %%mm2, %%mm3 \n\t"
- "por %%mm3, %%mm4 \n\t" // |l4 - l5|
- "psubusb %%mm0, %%mm4 \n\t"
- "pcmpeqb %%mm7, %%mm4 \n\t"
- "pand %%mm4, %%mm5 \n\t" // d/2
-
-// "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
- "paddb %%mm5, %%mm2 \n\t"
-// "psubb %%mm6, %%mm2 \n\t"
- "movq %%mm2, (%0,%1, 4) \n\t"
-
- "movq (%%ebx), %%mm2 \n\t"
-// "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
- "psubb %%mm5, %%mm2 \n\t"
-// "psubb %%mm6, %%mm2 \n\t"
- "movq %%mm2, (%%ebx) \n\t"
-
- "paddb %%mm6, %%mm5 \n\t"
- "psrlw $2, %%mm5 \n\t"
- "pand b3F, %%mm5 \n\t"
- "psubb b20, %%mm5 \n\t" // (l5-l4)/8
-
- "movq (%%eax, %1, 2), %%mm2 \n\t"
- "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
- "paddsb %%mm5, %%mm2 \n\t"
- "psubb %%mm6, %%mm2 \n\t"
- "movq %%mm2, (%%eax, %1, 2) \n\t"
-
- "movq (%%ebx, %1), %%mm2 \n\t"
- "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
- "psubsb %%mm5, %%mm2 \n\t"
- "psubb %%mm6, %%mm2 \n\t"
- "movq %%mm2, (%%ebx, %1) \n\t"
-
- :
- : "r" (src), "r" (stride)
- : "%eax", "%ebx"
- );
-#else
- const int l1= stride;
- const int l2= stride + l1;
- const int l3= stride + l2;
- const int l4= stride + l3;
- const int l5= stride + l4;
- const int l6= stride + l5;
-// const int l7= stride + l6;
-// const int l8= stride + l7;
-// const int l9= stride + l8;
- int x;
- const int QP15= QP + (QP>>2);
- src+= stride*3;
- for(x=0; x<BLOCK_SIZE; x++)
- {
- const int v = (src[x+l5] - src[x+l4]);
- if(ABS(v) < QP15)
- {
- src[x+l3] +=v>>3;
- src[x+l4] +=v>>1;
- src[x+l5] -=v>>1;
- src[x+l6] -=v>>3;
-
- }
- }
-
-#endif
-}
-
-/**
- * Experimental Filter 1
- * will not damage linear gradients
- * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
- * can only smooth blocks at the expected locations (it cant smooth them if they did move)
- * MMX2 version does correct clipping C version doesnt
- */
-static inline void vertX1Filter(uint8_t *src, int stride, int QP)
-{
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
- src+= stride*3;
-
- asm volatile(
- "pxor %%mm7, %%mm7 \n\t" // 0
-// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
- "leal (%0, %1), %%eax \n\t"
- "leal (%%eax, %1, 4), %%ebx \n\t"
-// 0 1 2 3 4 5 6 7 8 9
-// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
- "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
- "movq (%0, %1, 4), %%mm1 \n\t" // line 4
- "movq %%mm1, %%mm2 \n\t" // line 4
- "psubusb %%mm0, %%mm1 \n\t"
- "psubusb %%mm2, %%mm0 \n\t"
- "por %%mm1, %%mm0 \n\t" // |l2 - l3|
- "movq (%%ebx), %%mm3 \n\t" // line 5
- "movq (%%ebx, %1), %%mm4 \n\t" // line 6
- "movq %%mm3, %%mm5 \n\t" // line 5
- "psubusb %%mm4, %%mm3 \n\t"
- "psubusb %%mm5, %%mm4 \n\t"
- "por %%mm4, %%mm3 \n\t" // |l5 - l6|
- PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
- "movq %%mm2, %%mm1 \n\t" // line 4
- "psubusb %%mm5, %%mm2 \n\t"
- "movq %%mm2, %%mm4 \n\t"
- "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
- "psubusb %%mm1, %%mm5 \n\t"
- "por %%mm5, %%mm4 \n\t" // |l4 - l5|
- "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
- "movq %%mm4, %%mm3 \n\t" // d
- "psubusb pQPb, %%mm4 \n\t"
- "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
- "psubusb b01, %%mm3 \n\t"
- "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
-
- PAVGB(%%mm7, %%mm3) // d/2
- "movq %%mm3, %%mm1 \n\t" // d/2
- PAVGB(%%mm7, %%mm3) // d/4
- PAVGB(%%mm1, %%mm3) // 3*d/8
-
- "movq (%0, %1, 4), %%mm0 \n\t" // line 4
- "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
- "psubusb %%mm3, %%mm0 \n\t"
- "pxor %%mm2, %%mm0 \n\t"
- "movq %%mm0, (%0, %1, 4) \n\t" // line 4
-
- "movq (%%ebx), %%mm0 \n\t" // line 5
- "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
- "paddusb %%mm3, %%mm0 \n\t"
- "pxor %%mm2, %%mm0 \n\t"
- "movq %%mm0, (%%ebx) \n\t" // line 5
-
- PAVGB(%%mm7, %%mm1) // d/4
-
- "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
- "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
- "psubusb %%mm1, %%mm0 \n\t"
- "pxor %%mm2, %%mm0 \n\t"
- "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
-
- "movq (%%ebx, %1), %%mm0 \n\t" // line 6
- "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
- "paddusb %%mm1, %%mm0 \n\t"
- "pxor %%mm2, %%mm0 \n\t"
- "movq %%mm0, (%%ebx, %1) \n\t" // line 6
-
- PAVGB(%%mm7, %%mm1) // d/8
-
- "movq (%%eax, %1), %%mm0 \n\t" // line 2
- "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
- "psubusb %%mm1, %%mm0 \n\t"
- "pxor %%mm2, %%mm0 \n\t"
- "movq %%mm0, (%%eax, %1) \n\t" // line 2
-
- "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
- "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
- "paddusb %%mm1, %%mm0 \n\t"
- "pxor %%mm2, %%mm0 \n\t"
- "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
-
- :
- : "r" (src), "r" (stride)
- : "%eax", "%ebx"
- );
-#else
-
- const int l1= stride;
- const int l2= stride + l1;
- const int l3= stride + l2;
- const int l4= stride + l3;
- const int l5= stride + l4;
- const int l6= stride + l5;
- const int l7= stride + l6;
-// const int l8= stride + l7;
-// const int l9= stride + l8;
- int x;
-
- src+= stride*3;
- for(x=0; x<BLOCK_SIZE; x++)
- {
- int a= src[l3] - src[l4];
- int b= src[l4] - src[l5];
- int c= src[l5] - src[l6];
-
- int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
- d= MAX(d, 0);
-
- if(d < QP)
- {
- int v = d * SIGN(-b);
-
- src[l2] +=v>>3;
- src[l3] +=v>>2;
- src[l4] +=(3*v)>>3;
- src[l5] -=(3*v)>>3;
- src[l6] -=v>>2;
- src[l7] -=v>>3;
-
- }
- src++;
- }
- /*
- const int l1= stride;
- const int l2= stride + l1;
- const int l3= stride + l2;
- const int l4= stride + l3;
- const int l5= stride + l4;
- const int l6= stride + l5;
- const int l7= stride + l6;
- const int l8= stride + l7;
- const int l9= stride + l8;
- for(int x=0; x<BLOCK_SIZE; x++)
- {
- int v2= src[l2];
- int v3= src[l3];
- int v4= src[l4];
- int v5= src[l5];
- int v6= src[l6];
- int v7= src[l7];
-
- if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
- {
- src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16;
- src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16;
- src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
- src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
- }
- src++;
- }
-*/
-#endif
-}
-
-/**
- * Experimental Filter 1 (Horizontal)
- * will not damage linear gradients
- * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
- * can only smooth blocks at the expected locations (it cant smooth them if they did move)
- * MMX2 version does correct clipping C version doesnt
- * not identical with the vertical one
- */
-static inline void horizX1Filter(uint8_t *src, int stride, int QP)
-{
- int y;
-//FIXME (has little in common with the mmx2 version)
- for(y=0; y<BLOCK_SIZE; y++)
- {
- int a= src[1] - src[2];
- int b= src[3] - src[4];
- int c= src[5] - src[6];
-
- int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
-
- if(d < QP)
- {
- int v = d * SIGN(-b);
-
- src[1] +=v/8;
- src[2] +=v/4;
- src[3] +=3*v/8;
- src[4] -=3*v/8;
- src[5] -=v/4;
- src[6] -=v/8;
-
- }
- src+=stride;
- }
-}
-
-
-static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
-{
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
-/*
- uint8_t tmp[16];
- const int l1= stride;
- const int l2= stride + l1;
- const int l3= stride + l2;
- const int l4= (int)tmp - (int)src - stride*3;
- const int l5= (int)tmp - (int)src - stride*3 + 8;
- const int l6= stride*3 + l3;
- const int l7= stride + l6;
- const int l8= stride + l7;
-
- memcpy(tmp, src+stride*7, 8);
- memcpy(tmp+8, src+stride*8, 8);
-*/
- src+= stride*4;
- asm volatile(
-
-#if 0 //sligtly more accurate and slightly slower
- "pxor %%mm7, %%mm7 \n\t" // 0
- "leal (%0, %1), %%eax \n\t"
- "leal (%%eax, %1, 4), %%ebx \n\t"
-// 0 1 2 3 4 5 6 7
-// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
-// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
-
-
- "movq (%0, %1, 2), %%mm0 \n\t" // l2
- "movq (%0), %%mm1 \n\t" // l0
- "movq %%mm0, %%mm2 \n\t" // l2
- PAVGB(%%mm7, %%mm0) // ~l2/2
- PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
- PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
-
- "movq (%%eax), %%mm1 \n\t" // l1
- "movq (%%eax, %1, 2), %%mm3 \n\t" // l3
- "movq %%mm1, %%mm4 \n\t" // l1
- PAVGB(%%mm7, %%mm1) // ~l1/2
- PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
- PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
-
- "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
- "psubusb %%mm1, %%mm0 \n\t"
- "psubusb %%mm4, %%mm1 \n\t"
- "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
-// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
-
- "movq (%0, %1, 4), %%mm0 \n\t" // l4
- "movq %%mm0, %%mm4 \n\t" // l4
- PAVGB(%%mm7, %%mm0) // ~l4/2
- PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
- PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
-
- "movq (%%ebx), %%mm2 \n\t" // l5
- "movq %%mm3, %%mm5 \n\t" // l3
- PAVGB(%%mm7, %%mm3) // ~l3/2
- PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
- PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
-
- "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
- "psubusb %%mm3, %%mm0 \n\t"
- "psubusb %%mm6, %%mm3 \n\t"
- "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
- "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
-// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
-
- "movq (%%ebx, %1), %%mm6 \n\t" // l6
- "movq %%mm6, %%mm5 \n\t" // l6
- PAVGB(%%mm7, %%mm6) // ~l6/2
- PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
- PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
-
- "movq (%%ebx, %1, 2), %%mm5 \n\t" // l7
- "movq %%mm2, %%mm4 \n\t" // l5
- PAVGB(%%mm7, %%mm2) // ~l5/2
- PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
- PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
-
- "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
- "psubusb %%mm2, %%mm6 \n\t"
- "psubusb %%mm4, %%mm2 \n\t"
- "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
-// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
-
-
- PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
- "movq pQPb, %%mm4 \n\t" // QP //FIXME QP+1 ?
- "paddusb b01, %%mm4 \n\t"
- "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
- "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
- "pand %%mm4, %%mm3 \n\t"
-
- "movq %%mm3, %%mm1 \n\t"
-// "psubusb b01, %%mm3 \n\t"
- PAVGB(%%mm7, %%mm3)
- PAVGB(%%mm7, %%mm3)
- "paddusb %%mm1, %%mm3 \n\t"
-// "paddusb b01, %%mm3 \n\t"
-
- "movq (%%eax, %1, 2), %%mm6 \n\t" //l3
- "movq (%0, %1, 4), %%mm5 \n\t" //l4
- "movq (%0, %1, 4), %%mm4 \n\t" //l4
- "psubusb %%mm6, %%mm5 \n\t"
- "psubusb %%mm4, %%mm6 \n\t"
- "por %%mm6, %%mm5 \n\t" // |l3-l4|
- "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
- "pxor %%mm6, %%mm0 \n\t"
- "pand %%mm0, %%mm3 \n\t"
- PMINUB(%%mm5, %%mm3, %%mm0)
-
- "psubusb b01, %%mm3 \n\t"
- PAVGB(%%mm7, %%mm3)
-
- "movq (%%eax, %1, 2), %%mm0 \n\t"
- "movq (%0, %1, 4), %%mm2 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm2 \n\t"
- "psubb %%mm3, %%mm0 \n\t"
- "paddb %%mm3, %%mm2 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm2 \n\t"
- "movq %%mm0, (%%eax, %1, 2) \n\t"
- "movq %%mm2, (%0, %1, 4) \n\t"
-#endif
-
- "leal (%0, %1), %%eax \n\t"
- "pcmpeqb %%mm6, %%mm6 \n\t" // -1
-// 0 1 2 3 4 5 6 7
-// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
-// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
-
-
- "movq (%%eax, %1, 2), %%mm1 \n\t" // l3
- "movq (%0, %1, 4), %%mm0 \n\t" // l4
- "pxor %%mm6, %%mm1 \n\t" // -l3-1
- PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
-// mm1=-l3-1, mm0=128-q
-
- "movq (%%eax, %1, 4), %%mm2 \n\t" // l5
- "movq (%%eax, %1), %%mm3 \n\t" // l2
- "pxor %%mm6, %%mm2 \n\t" // -l5-1
- "movq %%mm2, %%mm5 \n\t" // -l5-1
- "movq b80, %%mm4 \n\t" // 128
- "leal (%%eax, %1, 4), %%ebx \n\t"
- PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
- PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
- PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
- PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
-// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
-
- "movq (%%eax), %%mm2 \n\t" // l1
- "pxor %%mm6, %%mm2 \n\t" // -l1-1
- PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
- PAVGB((%0), %%mm1) // (l0-l3+256)/2
- "movq b80, %%mm3 \n\t" // 128
- PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
- PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
- PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
-// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
-
- PAVGB((%%ebx, %1), %%mm5) // (l6-l5+256)/2
- "movq (%%ebx, %1, 2), %%mm1 \n\t" // l7
- "pxor %%mm6, %%mm1 \n\t" // -l7-1
- PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
- "movq b80, %%mm2 \n\t" // 128
- PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
- PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
- PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
-// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
-
- "movq b00, %%mm1 \n\t" // 0
- "movq b00, %%mm5 \n\t" // 0
- "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
- "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
- PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
- PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
- PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
-
-// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
-
- "movq b00, %%mm7 \n\t" // 0
- "movq pQPb, %%mm2 \n\t" // QP
- PAVGB(%%mm6, %%mm2) // 128 + QP/2
- "psubb %%mm6, %%mm2 \n\t"
-
- "movq %%mm4, %%mm1 \n\t"
- "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
- "pxor %%mm1, %%mm4 \n\t"
- "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
- "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
- "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
-// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
-
- "movq %%mm4, %%mm3 \n\t" // d
- "psubusb b01, %%mm4 \n\t"
- PAVGB(%%mm7, %%mm4) // d/32
- PAVGB(%%mm7, %%mm4) // (d + 32)/64
- "paddb %%mm3, %%mm4 \n\t" // 5d/64
- "pand %%mm2, %%mm4 \n\t"
-
- "movq b80, %%mm5 \n\t" // 128
- "psubb %%mm0, %%mm5 \n\t" // q
- "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
- "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
- "pxor %%mm7, %%mm5 \n\t"
-
- PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
- "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
-
- "pand %%mm7, %%mm4 \n\t"
- "movq (%%eax, %1, 2), %%mm0 \n\t"
- "movq (%0, %1, 4), %%mm2 \n\t"
- "pxor %%mm1, %%mm0 \n\t"
- "pxor %%mm1, %%mm2 \n\t"
- "paddb %%mm4, %%mm0 \n\t"
- "psubb %%mm4, %%mm2 \n\t"
- "pxor %%mm1, %%mm0 \n\t"
- "pxor %%mm1, %%mm2 \n\t"
- "movq %%mm0, (%%eax, %1, 2) \n\t"
- "movq %%mm2, (%0, %1, 4) \n\t"
-
- :
- : "r" (src), "r" (stride)
- : "%eax", "%ebx"
- );
-
-/*
- {
- int x;
- src-= stride;
- for(x=0; x<BLOCK_SIZE; x++)
- {
- const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
- if(ABS(middleEnergy)< 8*QP)
- {
- const int q=(src[l4] - src[l5])/2;
- const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
- const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
-
- int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
- d= MAX(d, 0);
-
- d= (5*d + 32) >> 6;
- d*= SIGN(-middleEnergy);
-
- if(q>0)
- {
- d= d<0 ? 0 : d;
- d= d>q ? q : d;
- }
- else
- {
- d= d>0 ? 0 : d;
- d= d<q ? q : d;
- }
-
- src[l4]-= d;
- src[l5]+= d;
- }
- src++;
- }
-src-=8;
- for(x=0; x<8; x++)
- {
- int y;
- for(y=4; y<6; y++)
- {
- int d= src[x+y*stride] - tmp[x+(y-4)*8];
- int ad= ABS(d);
- static int max=0;
- static int sum=0;
- static int num=0;
- static int bias=0;
-
- if(max<ad) max=ad;
- sum+= ad>3 ? 1 : 0;
- if(ad>3)
- {
- src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
- }
- if(y==4) bias+=d;
- num++;
- if(num%1000000 == 0)
- {
- printf(" %d %d %d %d\n", num, sum, max, bias);
- }
- }
- }
-}
-*/
-#elif defined (HAVE_MMX)
- src+= stride*4;
-
- asm volatile(
- "pxor %%mm7, %%mm7 \n\t"
- "leal (%0, %1), %%eax \n\t"
- "leal (%%eax, %1, 4), %%ebx \n\t"
-// 0 1 2 3 4 5 6 7
-// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
-// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
-
- "movq (%0), %%mm0 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
- "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
-
- "movq (%%eax), %%mm2 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
- "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
-
- "movq (%%eax, %1), %%mm4 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
- "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
-
- "paddw %%mm0, %%mm0 \n\t" // 2L0
- "paddw %%mm1, %%mm1 \n\t" // 2H0
- "psubw %%mm4, %%mm2 \n\t" // L1 - L2
- "psubw %%mm5, %%mm3 \n\t" // H1 - H2
- "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
- "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
-