summary refs log tree commit diff stats
path: root/postproc
diff options
context:
space:
mode:
author michael <michael@b3059339-0415-0410-9bf9-f77b7e298cf2> 2001-10-23 15:55:54 +0000
committer michael <michael@b3059339-0415-0410-9bf9-f77b7e298cf2> 2001-10-23 15:55:54 +0000
commit 77f285907e14573f9f6ac8df3fe1589050a67852 (patch)
tree 03fb0a88c3883c8f9a56c9efc8d94c4386bf104b /postproc
parent ffa3a86b12cfce81c64ca6ff0c6bc7f10f0ecc99 (diff)
download mpv-77f285907e14573f9f6ac8df3fe1589050a67852.tar.bz2
mpv-77f285907e14573f9f6ac8df3fe1589050a67852.tar.xz
more speed
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@2429 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'postproc')
-rw-r--r-- postproc/postprocess.c 122
-rw-r--r-- postproc/postprocess_template.c 122
2 files changed, 118 insertions, 126 deletions
diff --git a/postproc/postprocess.c b/postproc/postprocess.c
index 5ed24779bd..ac5aa24cf7 100644
--- a/postproc/postprocess.c
+++ b/postproc/postprocess.c
@@ -60,6 +60,7 @@ compare the quality & speed of all filters
split this huge file
fix warnings (unused vars, ...)
noise reduction filters
+write an exact implementation of the horizontal deblocking filter
...
Notes:
@@ -1450,7 +1451,10 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
{
#ifdef HAVE_MMX
asm volatile(
- "pushl %0 \n\t"
+ "leal (%0, %1), %%ecx \n\t"
+ "leal (%%ecx, %1, 4), %%ebx \n\t"
+// 0 1 2 3 4 5 6 7 8 9
+// %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
"pxor %%mm7, %%mm7 \n\t"
"movq bm00001000, %%mm6 \n\t"
"movd %2, %%mm5 \n\t" // QP
@@ -1464,10 +1468,10 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
//FIXME? "unroll by 2" and mix
#ifdef HAVE_MMX2
-#define HDF(i) \
- "movq " #i "(%%eax), %%mm0 \n\t"\
- "movq %%mm0, %%mm1 \n\t"\
- "movq %%mm0, %%mm2 \n\t"\
+#define HDF(src, dst) \
+ "movq " #src "(%%eax), %%mm0 \n\t"\
+ "movq " #src "(%%eax), %%mm1 \n\t"\
+ "movq " #src "(%%eax), %%mm2 \n\t"\
"psrlq $8, %%mm1 \n\t"\
"psubusb %%mm1, %%mm2 \n\t"\
"psubusb %%mm0, %%mm1 \n\t"\
@@ -1486,12 +1490,12 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
"psubb %%mm1, %%mm0 \n\t"\
"psllq $8, %%mm1 \n\t"\
"paddb %%mm1, %%mm0 \n\t"\
- "movd %%mm0, (%0) \n\t"\
+ "movd %%mm0, " #dst" \n\t"\
"psrlq $32, %%mm0 \n\t"\
- "movd %%mm0, 4(%0) \n\t"
+ "movd %%mm0, 4" #dst" \n\t"
#else
-#define HDF(i)\
- "movq " #i "(%%eax), %%mm0 \n\t"\
+#define HDF(src, dst)\
+ "movq " #src "(%%eax), %%mm0 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm0, %%mm2 \n\t"\
"psrlq $8, %%mm1 \n\t"\
@@ -1515,29 +1519,21 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
"psubb %%mm1, %%mm0 \n\t"\
"psllq $8, %%mm1 \n\t"\
"paddb %%mm1, %%mm0 \n\t"\
- "movd %%mm0, (%0) \n\t"\
+ "movd %%mm0, " #dst " \n\t"\
"psrlq $32, %%mm0 \n\t"\
- "movd %%mm0, 4(%0) \n\t"
+ "movd %%mm0, 4" #dst " \n\t"
#endif
- HDF(0)
- "addl %1, %0 \n\t"
- HDF(8)
- "addl %1, %0 \n\t"
- HDF(16)
- "addl %1, %0 \n\t"
- HDF(24)
- "addl %1, %0 \n\t"
- HDF(32)
- "addl %1, %0 \n\t"
- HDF(40)
- "addl %1, %0 \n\t"
- HDF(48)
- "addl %1, %0 \n\t"
- HDF(56)
- "popl %0 \n\t"
+ HDF(0,(%0))
+ HDF(8,(%%ecx))
+ HDF(16,(%%ecx, %1))
+ HDF(24,(%%ecx, %1, 2))
+ HDF(32,(%0, %1, 4))
+ HDF(40,(%%ebx))
+ HDF(48,(%%ebx, %1))
+ HDF(56,(%%ebx, %1, 2))
:
: "r" (dst), "r" (stride), "r" (QP)
- : "%eax"
+ : "%eax", "%ebx", "%ecx"
);
#else
uint8_t *src= tempBlock;
@@ -1597,8 +1593,11 @@ static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
{
//return;
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
- asm volatile( //"movv %0 %1 %2\n\t"
- "pushl %0\n\t"
+ asm volatile(
+ "leal (%0, %1), %%ecx \n\t"
+ "leal (%%ecx, %1, 4), %%ebx \n\t"
+// 0 1 2 3 4 5 6 7 8 9
+// %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
"pxor %%mm7, %%mm7 \n\t"
"leal tempBlock, %%eax \n\t"
/*
@@ -1714,20 +1713,20 @@ Implemented Exact 7-Tap
#endif
/* uses the 7-Tap Filter: 1112111 */
-#define NEW_HLP(i)\
- "movq " #i "(%%eax), %%mm0 \n\t"\
- "movq %%mm0, %%mm1 \n\t"\
- "movq %%mm0, %%mm2 \n\t"\
- "movd -4(%0), %%mm3 \n\t" /*0001000*/\
- "movd 8(%0), %%mm4 \n\t" /*0001000*/\
+#define NEW_HLP(src, dst)\
+ "movq " #src "(%%eax), %%mm1 \n\t"\
+ "movq " #src "(%%eax), %%mm2 \n\t"\
"psllq $8, %%mm1 \n\t"\
"psrlq $8, %%mm2 \n\t"\
+ "movd -4" #dst ", %%mm3 \n\t" /*0001000*/\
+ "movd 8" #dst ", %%mm4 \n\t" /*0001000*/\
"psrlq $24, %%mm3 \n\t"\
"psllq $56, %%mm4 \n\t"\
"por %%mm3, %%mm1 \n\t"\
"por %%mm4, %%mm2 \n\t"\
"movq %%mm1, %%mm5 \n\t"\
PAVGB(%%mm2, %%mm1)\
+ "movq " #src "(%%eax), %%mm0 \n\t"\
PAVGB(%%mm1, %%mm0)\
"psllq $8, %%mm5 \n\t"\
"psrlq $8, %%mm2 \n\t"\
@@ -1742,9 +1741,9 @@ Implemented Exact 7-Tap
PAVGB(%%mm2, %%mm1)\
PAVGB(%%mm1, %%mm5)\
PAVGB(%%mm5, %%mm0)\
- "movd %%mm0, (%0) \n\t"\
+ "movd %%mm0, " #dst " \n\t"\
"psrlq $32, %%mm0 \n\t"\
- "movd %%mm0, 4(%0) \n\t"
+ "movd %%mm0, 4" #dst " \n\t"
/* uses the 9-Tap Filter: 112242211 */
#define NEW_HLP2(i)\
@@ -1786,28 +1785,20 @@ Implemented Exact 7-Tap
"psrlq $32, %%mm0 \n\t"\
"movd %%mm0, 4(%0) \n\t"
-#define HLP(i) NEW_HLP(i)
-
- HLP(0)
- "addl %1, %0 \n\t"
- HLP(8)
- "addl %1, %0 \n\t"
- HLP(16)
- "addl %1, %0 \n\t"
- HLP(24)
- "addl %1, %0 \n\t"
- HLP(32)
- "addl %1, %0 \n\t"
- HLP(40)
- "addl %1, %0 \n\t"
- HLP(48)
- "addl %1, %0 \n\t"
- HLP(56)
+#define HLP(src, dst) NEW_HLP(src, dst)
+
+ HLP(0, (%0))
+ HLP(8, (%%ecx))
+ HLP(16, (%%ecx, %1))
+ HLP(24, (%%ecx, %1, 2))
+ HLP(32, (%0, %1, 4))
+ HLP(40, (%%ebx))
+ HLP(48, (%%ebx, %1))
+ HLP(56, (%%ebx, %1, 2))
- "popl %0\n\t"
:
: "r" (dst), "r" (stride)
- : "%eax", "%ebx"
+ : "%eax", "%ebx", "%ecx"
);
#else
@@ -2743,10 +2734,17 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
for(x=0; x<width; x+=BLOCK_SIZE)
{
const int stride= dstStride;
- int QP= isColor ?
- QPs[(y>>3)*QPStride + (x>>3)]:
- QPs[(y>>4)*QPStride + (x>>4)];
- if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8;
+ int QP;
+ if(isColor)
+ {
+ QP=QPs[(y>>3)*QPStride + (x>>3)];
+ }
+ else
+ {
+ QP= QPs[(y>>4)*QPStride + (x>>4)];
+ if(mode & LEVEL_FIX) QP= (QP* (packedYScale &0xFFFF))>>8;
+ yHistogram[ srcBlock[srcStride*5] ]++;
+ }
#ifdef HAVE_MMX
asm volatile(
"movd %0, %%mm7 \n\t"
@@ -2776,8 +2774,6 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
*/
#endif
- if(!isColor) yHistogram[ srcBlock[srcStride*5] ]++;
-
#ifdef PP_FUNNY_STRIDE
//can we mess with a 8x16 block, if not use a temp buffer, yes again
if(x+7 >= width)
diff --git a/postproc/postprocess_template.c b/postproc/postprocess_template.c
index 5ed24779bd..ac5aa24cf7 100644
--- a/postproc/postprocess_template.c
+++ b/postproc/postprocess_template.c
@@ -60,6 +60,7 @@ compare the quality & speed of all filters
split this huge file
fix warnings (unused vars, ...)
noise reduction filters
+write an exact implementation of the horizontal deblocking filter
...
Notes:
@@ -1450,7 +1451,10 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
{
#ifdef HAVE_MMX
asm volatile(
- "pushl %0 \n\t"
+ "leal (%0, %1), %%ecx \n\t"
+ "leal (%%ecx, %1, 4), %%ebx \n\t"
+// 0 1 2 3 4 5 6 7 8 9
+// %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
"pxor %%mm7, %%mm7 \n\t"
"movq bm00001000, %%mm6 \n\t"
"movd %2, %%mm5 \n\t" // QP
@@ -1464,10 +1468,10 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
//FIXME? "unroll by 2" and mix
#ifdef HAVE_MMX2
-#define HDF(i) \
- "movq " #i "(%%eax), %%mm0 \n\t"\
- "movq %%mm0, %%mm1 \n\t"\
- "movq %%mm0, %%mm2 \n\t"\
+#define HDF(src, dst) \
+ "movq " #src "(%%eax), %%mm0 \n\t"\
+ "movq " #src "(%%eax), %%mm1 \n\t"\
+ "movq " #src "(%%eax), %%mm2 \n\t"\
"psrlq $8, %%mm1 \n\t"\
"psubusb %%mm1, %%mm2 \n\t"\
"psubusb %%mm0, %%mm1 \n\t"\
@@ -1486,12 +1490,12 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
"psubb %%mm1, %%mm0 \n\t"\
"psllq $8, %%mm1 \n\t"\
"paddb %%mm1, %%mm0 \n\t"\
- "movd %%mm0, (%0) \n\t"\
+ "movd %%mm0, " #dst" \n\t"\
"psrlq $32, %%mm0 \n\t"\
- "movd %%mm0, 4(%0) \n\t"
+ "movd %%mm0, 4" #dst" \n\t"
#else
-#define HDF(i)\
- "movq " #i "(%%eax), %%mm0 \n\t"\
+#define HDF(src, dst)\
+ "movq " #src "(%%eax), %%mm0 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm0, %%mm2 \n\t"\
"psrlq $8, %%mm1 \n\t"\
@@ -1515,29 +1519,21 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
"psubb %%mm1, %%mm0 \n\t"\
"psllq $8, %%mm1 \n\t"\
"paddb %%mm1, %%mm0 \n\t"\
- "movd %%mm0, (%0) \n\t"\
+ "movd %%mm0, " #dst " \n\t"\
"psrlq $32, %%mm0 \n\t"\
- "movd %%mm0, 4(%0) \n\t"
+ "movd %%mm0, 4" #dst " \n\t"
#endif
- HDF(0)
- "addl %1, %0 \n\t"
- HDF(8)
- "addl %1, %0 \n\t"
- HDF(16)
- "addl %1, %0 \n\t"
- HDF(24)
- "addl %1, %0 \n\t"
- HDF(32)
- "addl %1, %0 \n\t"
- HDF(40)
- "addl %1, %0 \n\t"
- HDF(48)
- "addl %1, %0 \n\t"
- HDF(56)
- "popl %0 \n\t"
+ HDF(0,(%0))
+ HDF(8,(%%ecx))
+ HDF(16,(%%ecx, %1))
+ HDF(24,(%%ecx, %1, 2))
+ HDF(32,(%0, %1, 4))
+ HDF(40,(%%ebx))
+ HDF(48,(%%ebx, %1))
+ HDF(56,(%%ebx, %1, 2))
:
: "r" (dst), "r" (stride), "r" (QP)
- : "%eax"
+ : "%eax", "%ebx", "%ecx"
);
#else
uint8_t *src= tempBlock;
@@ -1597,8 +1593,11 @@ static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
{
//return;
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
- asm volatile( //"movv %0 %1 %2\n\t"
- "pushl %0\n\t"
+ asm volatile(
+ "leal (%0, %1), %%ecx \n\t"
+ "leal (%%ecx, %1, 4), %%ebx \n\t"
+// 0 1 2 3 4 5 6 7 8 9
+// %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
"pxor %%mm7, %%mm7 \n\t"
"leal tempBlock, %%eax \n\t"
/*
@@ -1714,20 +1713,20 @@ Implemented Exact 7-Tap
#endif
/* uses the 7-Tap Filter: 1112111 */
-#define NEW_HLP(i)\
- "movq " #i "(%%eax), %%mm0 \n\t"\
- "movq %%mm0, %%mm1 \n\t"\
- "movq %%mm0, %%mm2 \n\t"\
- "movd -4(%0), %%mm3 \n\t" /*0001000*/\
- "movd 8(%0), %%mm4 \n\t" /*0001000*/\
+#define NEW_HLP(src, dst)\
+ "movq " #src "(%%eax), %%mm1 \n\t"\
+ "movq " #src "(%%eax), %%mm2 \n\t"\
"psllq $8, %%mm1 \n\t"\
"psrlq $8, %%mm2 \n\t"\
+ "movd -4" #dst ", %%mm3 \n\t" /*0001000*/\
+ "movd 8" #dst ", %%mm4 \n\t" /*0001000*/\
"psrlq $24, %%mm3 \n\t"\
"psllq $56, %%mm4 \n\t"\
"por %%mm3, %%mm1 \n\t"\
"por %%mm4, %%mm2 \n\t"\
"movq %%mm1, %%mm5 \n\t"\
PAVGB(%%mm2, %%mm1)\
+ "movq " #src "(%%eax), %%mm0 \n\t"\
PAVGB(%%mm1, %%mm0)\
"psllq $8, %%mm5 \n\t"\
"psrlq $8, %%mm2 \n\t"\
@@ -1742,9 +1741,9 @@ Implemented Exact 7-Tap
PAVGB(%%mm2, %%mm1)\
PAVGB(%%mm1, %%mm5)\
PAVGB(%%mm5, %%mm0)\
- "movd %%mm0, (%0) \n\t"\
+ "movd %%mm0, " #dst " \n\t"\
"psrlq $32, %%mm0 \n\t"\
- "movd %%mm0, 4(%0) \n\t"
+ "movd %%mm0, 4" #dst " \n\t"
/* uses the 9-Tap Filter: 112242211 */
#define NEW_HLP2(i)\
@@ -1786,28 +1785,20 @@ Implemented Exact 7-Tap
"psrlq $32, %%mm0 \n\t"\
"movd %%mm0, 4(%0) \n\t"
-#define HLP(i) NEW_HLP(i)
-
- HLP(0)
- "addl %1, %0 \n\t"
- HLP(8)
- "addl %1, %0 \n\t"
- HLP(16)
- "addl %1, %0 \n\t"
- HLP(24)
- "addl %1, %0 \n\t"
- HLP(32)
- "addl %1, %0 \n\t"
- HLP(40)
- "addl %1, %0 \n\t"
- HLP(48)
- "addl %1, %0 \n\t"
- HLP(56)
+#define HLP(src, dst) NEW_HLP(src, dst)
+
+ HLP(0, (%0))
+ HLP(8, (%%ecx))
+ HLP(16, (%%ecx, %1))
+ HLP(24, (%%ecx, %1, 2))
+ HLP(32, (%0, %1, 4))
+ HLP(40, (%%ebx))
+ HLP(48, (%%ebx, %1))
+ HLP(56, (%%ebx, %1, 2))
- "popl %0\n\t"
:
: "r" (dst), "r" (stride)
- : "%eax", "%ebx"
+ : "%eax", "%ebx", "%ecx"
);
#else
@@ -2743,10 +2734,17 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
for(x=0; x<width; x+=BLOCK_SIZE)
{
const int stride= dstStride;
- int QP= isColor ?
- QPs[(y>>3)*QPStride + (x>>3)]:
- QPs[(y>>4)*QPStride + (x>>4)];
- if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8;
+ int QP;
+ if(isColor)
+ {
+ QP=QPs[(y>>3)*QPStride + (x>>3)];
+ }
+ else
+ {
+ QP= QPs[(y>>4)*QPStride + (x>>4)];
+ if(mode & LEVEL_FIX) QP= (QP* (packedYScale &0xFFFF))>>8;
+ yHistogram[ srcBlock[srcStride*5] ]++;
+ }
#ifdef HAVE_MMX
asm volatile(
"movd %0, %%mm7 \n\t"
@@ -2776,8 +2774,6 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
*/
#endif
- if(!isColor) yHistogram[ srcBlock[srcStride*5] ]++;
-
#ifdef PP_FUNNY_STRIDE
//can we mess with a 8x16 block, if not use a temp buffer, yes again
if(x+7 >= width)