summaryrefslogtreecommitdiffstats
path: root/libao2/fir.h
diff options
context:
space:
mode:
authoranders <anders@b3059339-0415-0410-9bf9-f77b7e298cf2>2002-02-16 13:08:14 +0000
committeranders <anders@b3059339-0415-0410-9bf9-f77b7e298cf2>2002-02-16 13:08:14 +0000
commit1de3804595626b0d262e1bbd90089f995d17ec0b (patch)
tree0e12ee60a8eda73e3f5c041f68142b4dbab29489 /libao2/fir.h
parent6292bf2273f9c9d58b364f78a5ebf19e4036e0b1 (diff)
downloadmpv-1de3804595626b0d262e1bbd90089f995d17ec0b.tar.bz2
mpv-1de3804595626b0d262e1bbd90089f995d17ec0b.tar.xz
MMX part rewritten and 16 tap filter added for better sound quality
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@4726 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'libao2/fir.h')
-rw-r--r--libao2/fir.h164
1 files changed, 58 insertions, 106 deletions
diff --git a/libao2/fir.h b/libao2/fir.h
index 8690824784..6c2d5e646c 100644
--- a/libao2/fir.h
+++ b/libao2/fir.h
@@ -11,123 +11,75 @@
#ifndef __FIR_H__
#define __FIR_H__
-/* 4, 8 and 16 tap FIR filters implemented using SSE instructions
- int16_t* x Input data
- int16_t* y Output value
- int16_t* w Filter weights
-
- C function
- for(int i = 0 ; i < L ; i++)
- *y += w[i]*x[i];
-*/
+/* Fixed-point 16-bit FIR filter. The filter is implemented
+both in C and MMX assembly. The filter consists of one macro
+UPDATE_QUE and one inline function firn. The macro can be used for
+adding new data to the circular buffer used by the filter firn.
+Limitations: max length of n = 16*4 and n must be a multiple of 4 (pad
+the filter with zeros for other lengths). Sometimes it works with filters
+longer than 4*16 (the problem is overshoot and the accumulated energy
+in the filter taps). */
-#ifdef HAVE_SSE
+#ifdef HAVE_MMX
+inline int32_t firn(int16_t* x, int16_t* w, int16_t n) // MMX firn: sums x[i]*w[i] over the taps, 4 taps per pass; each 2-product partial sum is shifted right by 16 before it is accumulated
+{
+ register int32_t y; // Output: written only by the movd in the epilogue
+ // Prologue (basic asm without operands, hence the single '%' on register names)
+ asm volatile(" pxor %mm1, %mm1;\n" ); // Zero mm1, the two-lane 32-bit accumulator yt
+ // Main loop: one pass per 4 taps, n/4 passes (a remainder of n%4 taps is not processed); %0 is w and %1 is x. NOTE(review): no clobbers or memory operands are declared, and mm1 is expected to survive between the separate asm statements - confirm the compiler honours that
+ while((n-=4)>=0){
+ asm volatile(
+ " movq (%1), %%mm0;\n" // Load four 16-bit samples x(n:n+3)
+ " pmaddwd (%0), %%mm0;\n" // Multiply by w(n:n+3) as signed words and add adjacent products: yt(n:n+1) = two 32-bit sums
+ " psrld $16, %%mm0;\n" // yt(n:n+1)=yt(n:n+1)>>16 per 32-bit lane; psrld is a logical (zero-filling) shift - NOTE(review): confirm this is intended for negative sums
+ " paddd %%mm0, %%mm1;\n" // Accumulate both lanes into mm1: yt(n:n+1)=yt(n-2:n-1)+yt(n:n+1)
+ :: "r" (w), "r" (x));
+ w+=4; x+=4;
+ }
+ // Epilogue: add the high 32-bit lane of mm1 to its low lane, copy the low lane to y, then emms to leave MMX state
+ asm volatile(
+ " movq %%mm1, %%mm0;\n"
+ " punpckhdq %%mm1, %%mm0;\n"
+ " paddd %%mm0, %%mm1;\n" // Low lane of mm1 now holds yt(n)+yt(n+1)
+ " movd %%mm1, %0 ;\n" // y = low 32 bits of mm1
+ " emms ;\n"
+ : "=&r" (y));
+ return y;
+}
-// This block should be MMX only compatible, but it isn't...
-#ifdef L4
-#define LOAD_QUE(x) \
- __asm __volatile("movq %0, %%mm2\n\t" \
- : \
- :"m"((x)[0]) \
- :"memory");
-#define SAVE_QUE(x) \
- __asm __volatile("movq %%mm2, %0\n\t" \
- "emms \n\t" \
- :"=m"(x[0]) \
- : \
- :"memory");
-#define UPDATE_QUE(in) \
- __asm __volatile("psllq $16, %%mm2\n\t" \
- "pinsrw $0, %0,%%mm2\n\t" \
- : \
- :"m" ((in)[0]) \
- :"memory");
-#define FIR(x,w,y) \
- __asm __volatile("movq %%mm2, %%mm0\n\t" \
- "pmaddwd %1, %%mm0\n\t" \
- "movq %%mm0, %%mm1\n\t" \
- "psrlq $32, %%mm1\n\t" \
- "paddd %%mm0, %%mm1\n\t" \
- "movd %%mm1, %%esi\n\t" \
- "shrl $16, %%esi\n\t" \
- "movw %%si, %0\n\t" \
- : "=m" ((y)[0]) \
- : "m" ((w)[0]) \
- : "memory", "%esi");
-#endif /* L4 */
+#else /* HAVE_MMX */
-// It is possible to make the 8 bit filter a lot faster by using the
-// 128 bit registers, feel free to optimize.
-#ifdef L8
-#define LOAD_QUE(x) \
- __asm __volatile("movq %0, %%mm5\n\t" \
- "movq %1, %%mm4\n\t" \
- : \
- :"m"((x)[0]), \
- "m"((x)[4]) \
- :"memory");
-#define SAVE_QUE(x) \
- __asm __volatile("movq %%mm5, %0\n\t" \
- "movq %%mm4, %1\n\t" \
- "emms \n\t" \
- :"=m"((x)[0]), \
- "=m"((x)[4]) \
- : \
- :"memory");
-
-// Below operation could replace line 2 to 5 in macro below but can
-// not cause of compiler bug ???
-// "pextrw $3, %%mm5,%%eax\n\t"
-#define UPDATE_QUE(in) \
- __asm __volatile("psllq $16, %%mm4\n\t" \
- "movq %%mm5, %%mm0\n\t" \
- "psrlq $48, %%mm0\n\t" \
- "movd %%mm0, %%eax\n\t" \
- "pinsrw $0, %%eax,%%mm4\n\t" \
- "psllq $16, %%mm5\n\t" \
- "pinsrw $0, %0,%%mm5\n\t" \
- : \
- :"m" ((in)[0]) \
- :"memory", "%eax");
-#define FIR(x,w,y) \
- __asm __volatile("movq %%mm5, %%mm0\n\t" \
- "pmaddwd %1, %%mm0\n\t" \
- "movq %%mm4, %%mm1\n\t" \
- "pmaddwd %2, %%mm1\n\t" \
- "paddd %%mm1, %%mm0\n\t" \
- "movq %%mm0, %%mm1\n\t" \
- "psrlq $32, %%mm1\n\t" \
- "paddd %%mm0, %%mm1\n\t" \
- "movd %%mm1, %%esi\n\t" \
- "shrl $16, %%esi\n\t" \
- "movw %%si, %0\n\t" \
- : "=m" ((y)[0]) \
- : "m" ((w)[0]), \
- "m" ((w)[4]) \
- : "memory", "%esi");
-#endif /* L8 */
+// Same thing as above but in C: returns the sum over groups of 4 taps of (w[i]*x[i] summed over the group) >> 16
+inline int32_t firn(int16_t* x, int16_t* w, int16_t n) // NOTE(review): plain 'inline' without 'static' in a header - linkage depends on the C dialect (gnu89 vs C99); confirm against the build flags
+{
+ register int32_t y=0;
+ while((n-=4) >=0) // n is decremented before use, so groups run from the highest index down; n < 4 returns 0 and n%4 low-end taps are never visited
+ y+=w[n]*x[n]+w[n+1]*x[n+1]+w[n+2]*x[n+2]+w[n+3]*x[n+3] >> 16; // '+' binds tighter than '>>': the whole 4-product sum is shifted, then added to y
+ return y;
+}
-#else /* HAVE_SSE */
+#endif /* HAVE_MMX */
-#define LOAD_QUE(x)
-#define SAVE_QUE(x)
-#define UPDATE_QUE(inm) \
- xi=(--xi)&(L-1); \
- x[xi]=x[xi+L]=*(inm);
+// Macro to add new data to circular queue: steps index xid back by one, wrapped with &(L-1) (assumes L is a power of two - TODO confirm), then stores *(ind) at both xq[xid] and xq[xid+L]. NOTE(review): expands to two statements and modifies xid twice within one expression.
+#define UPDATE_QUE(ind,xq,xid) \
+ xid=(--xid)&(L-1); \
+ xq[xid]=xq[xid+L]=*(ind);
-#ifdef L4
-#define FIR(x,w,y) \
- y[0]=(w[0]*x[0]+w[1]*x[1]+w[2]*x[2]+w[3]*x[3]) >> 16;
-#else
+#ifdef L8
+#ifdef HAVE_MMX
+#define FIR(x,w,y) *y=(int16_t)firn(x,w,8); // 8-tap FIR: stores firn(x,w,8), cast to int16_t, through y; the trailing ';' is part of the expansion
+#else /* HAVE_MMX */
+// Unrolled loop to speed up execution: two 4-tap partial sums, each shifted right by 16 and stored in an int16_t, are added and written to y[0]
#define FIR(x,w,y){ \
int16_t a = (w[0]*x[0]+w[1]*x[1]+w[2]*x[2]+w[3]*x[3]) >> 16; \
int16_t b = (w[4]*x[4]+w[5]*x[5]+w[6]*x[6]+w[7]*x[7]) >> 16; \
y[0] = a+b; \
}
-#endif /* L4 */
+#endif /* HAVE_MMX */
+#endif /* L8 */
-#endif /* HAVE_SSE */
+#ifdef L16
+#define FIR(x,w,y) *y=(int16_t)firn(x,w,16); // 16-tap FIR: stores firn(x,w,16), cast to int16_t, through y. NOTE(review): FIR is redefined if L8 and L16 are both defined
+#endif /* L16 */
#endif /* __FIR_H__ */
-
-