MMX part rewritten and 16 tap filter added for better sound quality

git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@4726 b3059339-0415-0410-9bf9-f77b7e298cf2
author: anders <anders@b3059339-0415-0410-9bf9-f77b7e298cf2> 2002-02-16 13:08:14 +0000
committer: anders <anders@b3059339-0415-0410-9bf9-f77b7e298cf2> 2002-02-16 13:08:14 +0000
commit: 1de3804595626b0d262e1bbd90089f995d17ec0b (patch)
tree: 0e12ee60a8eda73e3f5c041f68142b4dbab29489 /libao2/fir.h
parent: 6292bf2273f9c9d58b364f78a5ebf19e4036e0b1 (diff)
download: mpv-1de3804595626b0d262e1bbd90089f995d17ec0b.tar.bz2
mpv-1de3804595626b0d262e1bbd90089f995d17ec0b.tar.xz
1 files changed, 58 insertions, 106 deletions
diff --git a/libao2/fir.h b/libao2/fir.h
index 8690824784..6c2d5e646c 100644
--- a/libao2/fir.h
+++ b/libao2/fir.h
@@ -11,123 +11,75 @@
 #ifndef __FIR_H__
 #define __FIR_H__
 
-/* 4, 8 and 16 tap FIR filters implemented using SSE instructions 
-   int16_t* x Input data
-   int16_t* y Output value
-   int16_t* w Filter weights 
-   
-   C function
-   for(int i = 0 ; i < L ; i++)
-     *y += w[i]*x[i];
-*/
+/* Fixed-point 16-bit FIR filter. The filter is implemented
+both in C and MMX assembly. The filter consists of one macro
+UPDATE_QUE and one inline function firn. The macro can be used for
+adding new data to the circular buffer used by the filter firn.
+Limitations: max length of n = 16*4 and n must be a multiple of 4 (pad
+the filter with zeros for other lengths). Sometimes it works with filters
+longer than 4*16 (the problem is overshoot and the accumulated energy
+in the filter taps). */
 
-#ifdef HAVE_SSE
+#ifdef HAVE_MMX
+inline int32_t firn(int16_t* x, int16_t* w, int16_t n)
+{
+  register int32_t y; // Output
+  // Prologue
+  asm volatile(" pxor %mm1, %mm1;\n" ); // Clear buffer yt
+  // Main loop
+  while((n-=4)>=0){
+    asm volatile(
+	" movq 		(%1),	%%mm0;\n"  // Load x(n:n+4)
+	" pmaddwd	(%0),	%%mm0;\n"  // yt(n:n+1)=sum(x(n:n+4).*w(n:n+4))
+	" psrld	      	$16,	%%mm0;\n"  // yt(n:n+1)=yt(n:n+1)>>16
+	" paddd	 	%%mm0,	%%mm1;\n"  // yt(n:n+1)=yt(n-2:n-1)+yt(n:n+1)
+	:: "r" (w), "r" (x));
+    w+=4; x+=4;
+  }
+  // Epilogue
+  asm volatile(
+	" movq        	%%mm1, 	%%mm0;\n"  
+	" punpckhdq   	%%mm1, 	%%mm0;\n"  
+	" paddd       	%%mm0, 	%%mm1;\n"  //yt(n)=yt(n)+yt(n+1)
+	" movd        	%%mm1, 	%0   ;\n"  //y=yt
+	" emms                       ;\n"
+	: "=&r" (y));
+  return y;
+}
 
-// This block should be MMX only compatible, but it isn't...
-#ifdef L4
-#define LOAD_QUE(x) \
-        __asm __volatile("movq %0, %%mm2\n\t" \
-                         :                    \
-                         :"m"((x)[0])         \
-                         :"memory");
-#define SAVE_QUE(x) \
-        __asm __volatile("movq %%mm2, %0\n\t" \
-                         "emms          \n\t" \
-                         :"=m"(x[0])          \
-                         :                    \
-                         :"memory");
-#define UPDATE_QUE(in) \
-        __asm __volatile("psllq   $16,   %%mm2\n\t"    \
-                         "pinsrw  $0,    %0,%%mm2\n\t" \
-                          :                            \
-                          :"m" ((in)[0])               \
-                          :"memory");                  
-#define FIR(x,w,y) \
-        __asm __volatile("movq	  %%mm2, %%mm0\n\t" \
-                         "pmaddwd %1,    %%mm0\n\t" \
-                         "movq    %%mm0, %%mm1\n\t" \
-                         "psrlq   $32, 	 %%mm1\n\t" \
-                         "paddd   %%mm0, %%mm1\n\t" \
-                         "movd    %%mm1, %%esi\n\t" \
-                         "shrl    $16,   %%esi\n\t" \
-                         "movw    %%si,  %0\n\t"    \
-			 : "=m" ((y)[0])            \
-			 : "m" ((w)[0])             \
-			 : "memory", "%esi"); 
-#endif /* L4 */
+#else /* HAVE_MMX */
 
-// It is possible to make the 8 bit filter a lot faster by using the
-// 128 bit registers, feel free to optimize.
-#ifdef L8
-#define LOAD_QUE(x) \
-        __asm __volatile("movq %0, %%mm5\n\t" \
-                         "movq %1, %%mm4\n\t" \
-                         :                    \
-                         :"m"((x)[0]),        \
-                          "m"((x)[4])         \
-                         :"memory");
-#define SAVE_QUE(x) \
-        __asm __volatile("movq %%mm5, %0\n\t" \
-                         "movq %%mm4, %1\n\t" \
-                         "emms          \n\t" \
-                         :"=m"((x)[0]),       \
-                          "=m"((x)[4])        \
-                         :                    \
-                         :"memory");
-
-// Below operation could replace line 2 to 5 in macro below but can
-// not cause of compiler bug ???
-// "pextrw $3, %%mm5,%%eax\n\t"
-#define UPDATE_QUE(in) \
-        __asm __volatile("psllq    $16,   %%mm4\n\t"        \
-                         "movq	   %%mm5, %%mm0\n\t" 	    \
-                         "psrlq    $48,   %%mm0\n\t"        \
-                         "movd     %%mm0, %%eax\n\t"        \
-			 "pinsrw   $0,    %%eax,%%mm4\n\t"  \
-                         "psllq    $16,   %%mm5\n\t"        \
-                         "pinsrw   $0,    %0,%%mm5\n\t"     \
-                          :                                 \
-                          :"m" ((in)[0])                    \
-                          :"memory", "%eax");                  
-#define FIR(x,w,y) \
-        __asm __volatile("movq	  %%mm5, %%mm0\n\t" \
-                         "pmaddwd %1,    %%mm0\n\t" \
-                         "movq	  %%mm4, %%mm1\n\t" \
-                         "pmaddwd %2,    %%mm1\n\t" \
-                         "paddd   %%mm1, %%mm0\n\t" \
-                         "movq    %%mm0, %%mm1\n\t" \
-                         "psrlq   $32, 	 %%mm1\n\t" \
-                         "paddd   %%mm0, %%mm1\n\t" \
-                         "movd    %%mm1, %%esi\n\t" \
-                         "shrl    $16,   %%esi\n\t" \
-                         "movw    %%si,  %0\n\t"    \
-			 : "=m" ((y)[0])            \
-			 : "m" ((w)[0]),            \
-			   "m" ((w)[4])             \
-			 : "memory", "%esi"); 
-#endif /* L8 */
+// Same thing as above but in C
+inline int32_t firn(int16_t* x, int16_t* w, int16_t n)
+{
+  register int32_t y=0;
+  while((n-=4) >=0)
+    y+=w[n]*x[n]+w[n+1]*x[n+1]+w[n+2]*x[n+2]+w[n+3]*x[n+3] >> 16;
+  return y;
+}
 
-#else /* HAVE_SSE */
+#endif /* HAVE_MMX */
 
-#define LOAD_QUE(x)
-#define SAVE_QUE(x)
-#define UPDATE_QUE(inm) \
-  xi=(--xi)&(L-1);     \
-  x[xi]=x[xi+L]=*(inm);
+// Macro to add new data to circular queue
+#define UPDATE_QUE(ind,xq,xid) \
+  xid=(--xid)&(L-1);      \
+  xq[xid]=xq[xid+L]=*(ind);
 
-#ifdef L4
-#define FIR(x,w,y) \
-        y[0]=(w[0]*x[0]+w[1]*x[1]+w[2]*x[2]+w[3]*x[3]) >> 16;
-#else
+#ifdef L8
+#ifdef HAVE_MMX
+#define FIR(x,w,y) *y=(int16_t)firn(x,w,8);
+#else /* HAVE_MMX */
+// Unrolled loop to speed up execution 
 #define FIR(x,w,y){ \
   int16_t a = (w[0]*x[0]+w[1]*x[1]+w[2]*x[2]+w[3]*x[3]) >> 16; \
   int16_t b = (w[4]*x[4]+w[5]*x[5]+w[6]*x[6]+w[7]*x[7]) >> 16; \
   y[0]      = a+b; \
 }
-#endif /* L4 */
+#endif /* HAVE_MMX */
+#endif /* L8 */
 
-#endif /* HAVE_SSE */
+#ifdef L16
+#define FIR(x,w,y) *y=(int16_t)firn(x,w,16);
+#endif /* L16 */
 
 #endif /* __FIR_H__ */
-
-
author	anders <anders@b3059339-0415-0410-9bf9-f77b7e298cf2>	2002-02-16 13:08:14 +0000
committer	anders <anders@b3059339-0415-0410-9bf9-f77b7e298cf2>	2002-02-16 13:08:14 +0000
commit	1de3804595626b0d262e1bbd90089f995d17ec0b (patch)
tree	0e12ee60a8eda73e3f5c041f68142b4dbab29489 /libao2/fir.h
parent	6292bf2273f9c9d58b364f78a5ebf19e4036e0b1 (diff)
download	mpv-1de3804595626b0d262e1bbd90089f995d17ec0b.tar.bz2 mpv-1de3804595626b0d262e1bbd90089f995d17ec0b.tar.xz