diff options
author | michael <michael@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2001-11-24 01:38:30 +0000 |
---|---|---|
committer | michael <michael@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2001-11-24 01:38:30 +0000 |
commit | 39595a9f2ebe3daf21063824d29174716c11788a (patch) | |
tree | 311361625f3725d1e171ed94fe7d37e5fd41b777 /postproc | |
parent | f67f9bdcbebd8011c5ee832ca3af6c94154c26dd (diff) | |
download | mpv-39595a9f2ebe3daf21063824d29174716c11788a.tar.bz2 mpv-39595a9f2ebe3daf21063824d29174716c11788a.tar.xz |
faster dering
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@3094 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'postproc')
-rw-r--r-- | postproc/postprocess.c | 176 | ||||
-rw-r--r-- | postproc/postprocess_template.c | 176 |
2 files changed, 238 insertions, 114 deletions
diff --git a/postproc/postprocess.c b/postproc/postprocess.c index d590b01a46..d0ae70b81e 100644 --- a/postproc/postprocess.c +++ b/postproc/postprocess.c @@ -47,7 +47,6 @@ c = checked against the other implementations (-vo md5) /* TODO: -verify that everything workes as it should (how?) reduce the time wasted on the mem transfer implement everything in C at least (done at the moment but ...) unroll stuff if instructions depend too much on the prior one @@ -62,7 +61,8 @@ border remover optimize c versions try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks smart blur -commandline option for the deblock thresholds +commandline option for the deblock / dering thresholds +memcpy chrominance if no chroma filtering is done ... */ @@ -162,6 +162,7 @@ static uint8_t tempBlocks[8*16*2]; //used for the horizontal code int hFlatnessThreshold= 56 - 16; int vFlatnessThreshold= 56 - 16; +int deringThreshold= 20; //amount of "black" u r willing to loose to get a brightness corrected picture double maxClippedThreshold= 0.01; @@ -310,28 +311,26 @@ asm volatile( "paddb %%mm2, %%mm0 \n\t" " \n\t" +#ifdef HAVE_MMX2 + "pxor %%mm7, %%mm7 \n\t" + "psadbw %%mm7, %%mm0 \n\t" +#else "movq %%mm0, %%mm1 \n\t" "psrlw $8, %%mm0 \n\t" "paddb %%mm1, %%mm0 \n\t" -#ifdef HAVE_MMX2 - "pshufw $0xF9, %%mm0, %%mm1 \n\t" - "paddb %%mm1, %%mm0 \n\t" - "pshufw $0xFE, %%mm0, %%mm1 \n\t" -#else "movq %%mm0, %%mm1 \n\t" "psrlq $16, %%mm0 \n\t" "paddb %%mm1, %%mm0 \n\t" "movq %%mm0, %%mm1 \n\t" "psrlq $32, %%mm0 \n\t" -#endif "paddb %%mm1, %%mm0 \n\t" +#endif "movd %%mm0, %0 \n\t" : "=r" (numEq) : "r" (src), "r" (stride) - : "%eax", "%ebx" + : "%ebx" ); - - numEq= (256 - numEq) &0xFF; + numEq= (-numEq) &0xFF; #else for(y=0; y<BLOCK_SIZE-1; y++) @@ -1591,21 +1590,21 @@ static inline void dering(uint8_t src[], int stride, int QP) // 0 1 2 3 4 5 6 7 8 9 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 - "pcmpeqb %%mm6, %%mm6 \n\t" - "pxor %%mm7, %%mm7 \n\t" + "pcmpeqb %%mm7, %%mm7 \n\t" + "pxor %%mm6, %%mm6 \n\t" #ifdef HAVE_MMX2 #define FIND_MIN_MAX(addr)\ "movq " #addr ", %%mm0 \n\t"\ - "pminub %%mm0, %%mm6 \n\t"\ - "pmaxub %%mm0, %%mm7 \n\t" + "pminub %%mm0, %%mm7 \n\t"\ + "pmaxub %%mm0, %%mm6 \n\t" #else #define FIND_MIN_MAX(addr)\ "movq " #addr ", %%mm0 \n\t"\ - "movq %%mm6, %%mm1 \n\t"\ - "psubusb %%mm0, %%mm7 \n\t"\ - "paddb %%mm0, %%mm7 \n\t"\ + "movq %%mm7, %%mm1 \n\t"\ + "psubusb %%mm0, %%mm6 \n\t"\ + "paddb %%mm0, %%mm6 \n\t"\ "psubusb %%mm0, %%mm1 \n\t"\ - "psubb %%mm1, %%mm6 \n\t" + "psubb %%mm1, %%mm7 \n\t" #endif FIND_MIN_MAX((%%eax)) @@ -1617,52 +1616,57 @@ FIND_MIN_MAX((%%ebx, %1)) FIND_MIN_MAX((%%ebx, %1, 2)) FIND_MIN_MAX((%0, %1, 8)) - "movq %%mm6, %%mm4 \n\t" - "psrlq $8, %%mm6 \n\t" -#ifdef HAVE_MMX2 - "pminub %%mm4, %%mm6 \n\t" // min of pixels - "pshufw $0xF9, %%mm6, %%mm4 \n\t" - "pminub %%mm4, %%mm6 \n\t" // min of pixels - "pshufw $0xFE, %%mm6, %%mm4 \n\t" - "pminub %%mm4, %%mm6 \n\t" -#else - "movq %%mm6, %%mm1 \n\t" - "psubusb %%mm4, %%mm1 \n\t" - "psubb %%mm1, %%mm6 \n\t" - "movq %%mm6, %%mm4 \n\t" - "psrlq $16, %%mm6 \n\t" - "movq %%mm6, %%mm1 \n\t" - "psubusb %%mm4, %%mm1 \n\t" - "psubb %%mm1, %%mm6 \n\t" - "movq %%mm6, %%mm4 \n\t" - "psrlq $32, %%mm6 \n\t" - "movq %%mm6, %%mm1 \n\t" - "psubusb %%mm4, %%mm1 \n\t" - "psubb %%mm1, %%mm6 \n\t" -#endif - - "movq %%mm7, %%mm4 \n\t" "psrlq $8, %%mm7 \n\t" #ifdef HAVE_MMX2 - "pmaxub %%mm4, %%mm7 \n\t" // max of pixels + "pminub %%mm4, %%mm7 \n\t" // min of pixels "pshufw $0xF9, %%mm7, %%mm4 \n\t" - "pmaxub %%mm4, %%mm7 \n\t" + "pminub %%mm4, %%mm7 \n\t" // min of pixels "pshufw $0xFE, %%mm7, %%mm4 \n\t" - "pmaxub %%mm4, %%mm7 \n\t" + "pminub %%mm4, %%mm7 \n\t" #else - "psubusb %%mm4, %%mm7 \n\t" - "paddb %%mm4, %%mm7 \n\t" + "movq %%mm7, %%mm1 \n\t" + "psubusb %%mm4, %%mm1 \n\t" + "psubb %%mm1, %%mm7 \n\t" "movq %%mm7, %%mm4 \n\t" "psrlq $16, %%mm7 \n\t" - "psubusb %%mm4, %%mm7 \n\t" - "paddb %%mm4, %%mm7 \n\t" + "movq %%mm7, %%mm1 \n\t" + "psubusb %%mm4, %%mm1 \n\t" + "psubb %%mm1, %%mm7 \n\t" "movq %%mm7, %%mm4 \n\t" "psrlq $32, %%mm7 \n\t" - "psubusb %%mm4, %%mm7 \n\t" - "paddb %%mm4, %%mm7 \n\t" + "movq %%mm7, %%mm1 \n\t" + "psubusb %%mm4, %%mm1 \n\t" + "psubb %%mm1, %%mm7 \n\t" +#endif + + + "movq %%mm6, %%mm4 \n\t" + "psrlq $8, %%mm6 \n\t" +#ifdef HAVE_MMX2 + "pmaxub %%mm4, %%mm6 \n\t" // max of pixels + "pshufw $0xF9, %%mm6, %%mm4 \n\t" + "pmaxub %%mm4, %%mm6 \n\t" + "pshufw $0xFE, %%mm6, %%mm4 \n\t" + "pmaxub %%mm4, %%mm6 \n\t" +#else + "psubusb %%mm4, %%mm6 \n\t" + "paddb %%mm4, %%mm6 \n\t" + "movq %%mm6, %%mm4 \n\t" + "psrlq $16, %%mm6 \n\t" + "psubusb %%mm4, %%mm6 \n\t" + "paddb %%mm4, %%mm6 \n\t" + "movq %%mm6, %%mm4 \n\t" + "psrlq $32, %%mm6 \n\t" + "psubusb %%mm4, %%mm6 \n\t" + "paddb %%mm4, %%mm6 \n\t" #endif - PAVGB(%%mm6, %%mm7) // a=(max + min)/2 + "movq %%mm6, %%mm0 \n\t" // max + "psubb %%mm7, %%mm6 \n\t" // max - min + "movd %%mm6, %%ecx \n\t" + "cmpb deringThreshold, %%cl \n\t" + " jb 1f \n\t" + PAVGB(%%mm0, %%mm7) // a=(max + min)/2 "punpcklbw %%mm7, %%mm7 \n\t" "punpcklbw %%mm7, %%mm7 \n\t" "punpcklbw %%mm7, %%mm7 \n\t" @@ -1785,9 +1789,9 @@ DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) - + "1: \n\t" : : "r" (src), "r" (stride), "r" (QP) - : "%eax", "%ebx" + : "%eax", "%ebx", "%ecx" ); #else int y; @@ -1810,6 +1814,8 @@ DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm } avg= (min + max + 1)/2; + if(max - min <deringThreshold) return; + for(y=0; y<10; y++) { int x; @@ -1842,13 +1848,69 @@ DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); f= (f + 8)>>4; +#ifdef DEBUG_DERING_THRESHOLD + asm volatile("emms\n\t":); + { + static long long numPixels=0; + if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; +// if((max-min)<20 || (max-min)*QP<200) +// if((max-min)*QP < 500) +// if(max-min<QP/2) + if(max-min < 20) + { + static int numSkiped=0; + static int errorSum=0; + static int worstQP=0; + static int worstRange=0; + static int worstDiff=0; + int diff= (f - *p); + int absDiff= ABS(diff); + int error= diff*diff; + + if(x==1 || x==8 || y==1 || y==8) continue; + + numSkiped++; + if(absDiff > worstDiff) + { + worstDiff= absDiff; + worstQP= QP; + worstRange= max-min; + } + errorSum+= error; + + if(1024LL*1024LL*1024LL % numSkiped == 0) + { + printf( "sum:%1.3f, skip:%d, wQP:%d, " + "wRange:%d, wDiff:%d, relSkip:%1.3f\n", + (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, + worstDiff, (float)numSkiped/numPixels); + } + } + } +#endif if (*p + 2*QP < f) *p= *p + 2*QP; else if(*p - 2*QP > f) *p= *p - 2*QP; else *p=f; } } } - +#ifdef DEBUG_DERING_THRESHOLD + if(max-min < 20) + { + for(y=1; y<9; y++) + { + int x; + int t = 0; + p= src + stride*y; + for(x=1; x<9; x++) + { + p++; + *p = MIN(*p + 20, 255); + } + } +// src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; + } +#endif #endif } diff --git a/postproc/postprocess_template.c b/postproc/postprocess_template.c index d590b01a46..d0ae70b81e 100644 --- a/postproc/postprocess_template.c +++ b/postproc/postprocess_template.c @@ -47,7 +47,6 @@ c = checked against the other implementations (-vo md5) /* TODO: -verify that everything workes as it should (how?) reduce the time wasted on the mem transfer implement everything in C at least (done at the moment but ...) unroll stuff if instructions depend too much on the prior one @@ -62,7 +61,8 @@ border remover optimize c versions try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks smart blur -commandline option for the deblock thresholds +commandline option for the deblock / dering thresholds +memcpy chrominance if no chroma filtering is done ... */ @@ -162,6 +162,7 @@ static uint8_t tempBlocks[8*16*2]; //used for the horizontal code int hFlatnessThreshold= 56 - 16; int vFlatnessThreshold= 56 - 16; +int deringThreshold= 20; //amount of "black" u r willing to loose to get a brightness corrected picture double maxClippedThreshold= 0.01; @@ -310,28 +311,26 @@ asm volatile( "paddb %%mm2, %%mm0 \n\t" " \n\t" +#ifdef HAVE_MMX2 + "pxor %%mm7, %%mm7 \n\t" + "psadbw %%mm7, %%mm0 \n\t" +#else "movq %%mm0, %%mm1 \n\t" "psrlw $8, %%mm0 \n\t" "paddb %%mm1, %%mm0 \n\t" -#ifdef HAVE_MMX2 - "pshufw $0xF9, %%mm0, %%mm1 \n\t" - "paddb %%mm1, %%mm0 \n\t" - "pshufw $0xFE, %%mm0, %%mm1 \n\t" -#else "movq %%mm0, %%mm1 \n\t" "psrlq $16, %%mm0 \n\t" "paddb %%mm1, %%mm0 \n\t" "movq %%mm0, %%mm1 \n\t" "psrlq $32, %%mm0 \n\t" -#endif "paddb %%mm1, %%mm0 \n\t" +#endif "movd %%mm0, %0 \n\t" : "=r" (numEq) : "r" (src), "r" (stride) - : "%eax", "%ebx" + : "%ebx" ); - - numEq= (256 - numEq) &0xFF; + numEq= (-numEq) &0xFF; #else for(y=0; y<BLOCK_SIZE-1; y++) @@ -1591,21 +1590,21 @@ static inline void dering(uint8_t src[], int stride, int QP) // 0 1 2 3 4 5 6 7 8 9 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 - "pcmpeqb %%mm6, %%mm6 \n\t" - "pxor %%mm7, %%mm7 \n\t" + "pcmpeqb %%mm7, %%mm7 \n\t" + "pxor %%mm6, %%mm6 \n\t" #ifdef HAVE_MMX2 #define FIND_MIN_MAX(addr)\ "movq " #addr ", %%mm0 \n\t"\ - "pminub %%mm0, %%mm6 \n\t"\ - "pmaxub %%mm0, %%mm7 \n\t" + "pminub %%mm0, %%mm7 \n\t"\ + "pmaxub %%mm0, %%mm6 \n\t" #else #define FIND_MIN_MAX(addr)\ "movq " #addr ", %%mm0 \n\t"\ - "movq %%mm6, %%mm1 \n\t"\ - "psubusb %%mm0, %%mm7 \n\t"\ - "paddb %%mm0, %%mm7 \n\t"\ + "movq %%mm7, %%mm1 \n\t"\ + "psubusb %%mm0, %%mm6 \n\t"\ + "paddb %%mm0, %%mm6 \n\t"\ "psubusb %%mm0, %%mm1 \n\t"\ - "psubb %%mm1, %%mm6 \n\t" + "psubb %%mm1, %%mm7 \n\t" #endif FIND_MIN_MAX((%%eax)) @@ -1617,52 +1616,57 @@ FIND_MIN_MAX((%%ebx, %1)) FIND_MIN_MAX((%%ebx, %1, 2)) FIND_MIN_MAX((%0, %1, 8)) - "movq %%mm6, %%mm4 \n\t" - "psrlq $8, %%mm6 \n\t" -#ifdef HAVE_MMX2 - "pminub %%mm4, %%mm6 \n\t" // min of pixels - "pshufw $0xF9, %%mm6, %%mm4 \n\t" - "pminub %%mm4, %%mm6 \n\t" // min of pixels - "pshufw $0xFE, %%mm6, %%mm4 \n\t" - "pminub %%mm4, %%mm6 \n\t" -#else - "movq %%mm6, %%mm1 \n\t" - "psubusb %%mm4, %%mm1 \n\t" - "psubb %%mm1, %%mm6 \n\t" - "movq %%mm6, %%mm4 \n\t" - "psrlq $16, %%mm6 \n\t" - "movq %%mm6, %%mm1 \n\t" - "psubusb %%mm4, %%mm1 \n\t" - "psubb %%mm1, %%mm6 \n\t" - "movq %%mm6, %%mm4 \n\t" - "psrlq $32, %%mm6 \n\t" - "movq %%mm6, %%mm1 \n\t" - "psubusb %%mm4, %%mm1 \n\t" - "psubb %%mm1, %%mm6 \n\t" -#endif - - "movq %%mm7, %%mm4 \n\t" "psrlq $8, %%mm7 \n\t" #ifdef HAVE_MMX2 - "pmaxub %%mm4, %%mm7 \n\t" // max of pixels + "pminub %%mm4, %%mm7 \n\t" // min of pixels "pshufw $0xF9, %%mm7, %%mm4 \n\t" - "pmaxub %%mm4, %%mm7 \n\t" + "pminub %%mm4, %%mm7 \n\t" // min of pixels "pshufw $0xFE, %%mm7, %%mm4 \n\t" - "pmaxub %%mm4, %%mm7 \n\t" + "pminub %%mm4, %%mm7 \n\t" #else - "psubusb %%mm4, %%mm7 \n\t" - "paddb %%mm4, %%mm7 \n\t" + "movq %%mm7, %%mm1 \n\t" + "psubusb %%mm4, %%mm1 \n\t" + "psubb %%mm1, %%mm7 \n\t" "movq %%mm7, %%mm4 \n\t" "psrlq $16, %%mm7 \n\t" - "psubusb %%mm4, %%mm7 \n\t" - "paddb %%mm4, %%mm7 \n\t" + "movq %%mm7, %%mm1 \n\t" + "psubusb %%mm4, %%mm1 \n\t" + "psubb %%mm1, %%mm7 \n\t" "movq %%mm7, %%mm4 \n\t" "psrlq $32, %%mm7 \n\t" - "psubusb %%mm4, %%mm7 \n\t" - "paddb %%mm4, %%mm7 \n\t" + "movq %%mm7, %%mm1 \n\t" + "psubusb %%mm4, %%mm1 \n\t" + "psubb %%mm1, %%mm7 \n\t" +#endif + + + "movq %%mm6, %%mm4 \n\t" + "psrlq $8, %%mm6 \n\t" +#ifdef HAVE_MMX2 + "pmaxub %%mm4, %%mm6 \n\t" // max of pixels + "pshufw $0xF9, %%mm6, %%mm4 \n\t" + "pmaxub %%mm4, %%mm6 \n\t" + "pshufw $0xFE, %%mm6, %%mm4 \n\t" + "pmaxub %%mm4, %%mm6 \n\t" +#else + "psubusb %%mm4, %%mm6 \n\t" + "paddb %%mm4, %%mm6 \n\t" + "movq %%mm6, %%mm4 \n\t" + "psrlq $16, %%mm6 \n\t" + "psubusb %%mm4, %%mm6 \n\t" + "paddb %%mm4, %%mm6 \n\t" + "movq %%mm6, %%mm4 \n\t" + "psrlq $32, %%mm6 \n\t" + "psubusb %%mm4, %%mm6 \n\t" + "paddb %%mm4, %%mm6 \n\t" #endif - PAVGB(%%mm6, %%mm7) // a=(max + min)/2 + "movq %%mm6, %%mm0 \n\t" // max + "psubb %%mm7, %%mm6 \n\t" // max - min + "movd %%mm6, %%ecx \n\t" + "cmpb deringThreshold, %%cl \n\t" + " jb 1f \n\t" + PAVGB(%%mm0, %%mm7) // a=(max + min)/2 "punpcklbw %%mm7, %%mm7 \n\t" "punpcklbw %%mm7, %%mm7 \n\t" "punpcklbw %%mm7, %%mm7 \n\t" @@ -1785,9 +1789,9 @@ DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) - + "1: \n\t" : : "r" (src), "r" (stride), "r" (QP) - : "%eax", "%ebx" + : "%eax", "%ebx", "%ecx" ); #else int y; @@ -1810,6 +1814,8 @@ DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm } avg= (min + max + 1)/2; + if(max - min <deringThreshold) return; + for(y=0; y<10; y++) { int x; @@ -1842,13 +1848,69 @@ DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); f= (f + 8)>>4; +#ifdef DEBUG_DERING_THRESHOLD + asm volatile("emms\n\t":); + { + static long long numPixels=0; + if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; +// if((max-min)<20 || (max-min)*QP<200) +// if((max-min)*QP < 500) +// if(max-min<QP/2) + if(max-min < 20) + { + static int numSkiped=0; + static int errorSum=0; + static int worstQP=0; + static int worstRange=0; + static int worstDiff=0; + int diff= (f - *p); + int absDiff= ABS(diff); + int error= diff*diff; + + if(x==1 || x==8 || y==1 || y==8) continue; + + numSkiped++; + if(absDiff > worstDiff) + { + worstDiff= absDiff; + worstQP= QP; + worstRange= max-min; + } + errorSum+= error; + + if(1024LL*1024LL*1024LL % numSkiped == 0) + { + printf( "sum:%1.3f, skip:%d, wQP:%d, " + "wRange:%d, wDiff:%d, relSkip:%1.3f\n", + (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, + worstDiff, (float)numSkiped/numPixels); + } + } + } +#endif if (*p + 2*QP < f) *p= *p + 2*QP; else if(*p - 2*QP > f) *p= *p - 2*QP; else *p=f; } } } - +#ifdef DEBUG_DERING_THRESHOLD + if(max-min < 20) + { + for(y=1; y<9; y++) + { + int x; + int t = 0; + p= src + stride*y; + for(x=1; x<9; x++) + { + p++; + *p = MIN(*p + 20, 255); + } + } +// src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; + } +#endif #endif } |