summary refs log tree commit diff stats
path: root/liba52/srfftp_3dnow.h
diff options
context:
space:
mode:
Diffstat (limited to 'liba52/srfftp_3dnow.h')
-rw-r--r-- liba52/srfftp_3dnow.h 65
1 files changed, 59 insertions, 6 deletions
diff --git a/liba52/srfftp_3dnow.h b/liba52/srfftp_3dnow.h
index 8444fdee21..a89f13ae76 100644
--- a/liba52/srfftp_3dnow.h
+++ b/liba52/srfftp_3dnow.h
@@ -68,11 +68,18 @@ static complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188,
"psrlq $32, "##mm_base"\n\t"\
"punpckldq "##mm_hlp","##mm_base"\n\t"
#endif
+#ifdef HAVE_3DNOWEX/* extended 3DNow! build: use the native pfnacc instruction */
+#define PFNACC_MM(mm_base,mm_hlp) "pfnacc "##mm_base","##mm_base"\n\t"/* asm text: mm_base.low = mm_base.low - mm_base.high; mm_hlp is unused in this variant */
+#else
+#define PFNACC_MM(mm_base,mm_hlp)/* fallback: same low-half result using mm_hlp as scratch. NOTE(review): '##' between string literals does not form a valid token in ISO C; plain adjacency would concatenate - confirm the compilers this must build with */\
+ "movq "##mm_base","##mm_hlp"\n\t"/* mm_hlp = copy of mm_base */\
+ "psrlq $32,"##mm_hlp"\n\t"/* shift the high dword of mm_hlp down into its low dword */\
+ "punpckldq "##mm_hlp","##mm_hlp"\n\t"/* replicate that dword into both halves of mm_hlp */\
+ "pfsub "##mm_hlp","##mm_base"\n\t"/* mm_base.low = low - high; mm_base.high = high - high, so only the low half is meaningful */
+#endif
#define TRANSZERO_3DNOW(A0,A4,A8,A12) \
{ \
- __asm__ __volatile__("femms":::"memory");\
- TRANS_FILL_MM6_MM7_3DNOW()\
__asm__ __volatile__(\
"movq %4, %%mm0\n\t" /* mm0 = wTB[0]*/\
"movq %5, %%mm1\n\t" /* mm1 = wTB[k*2]*/ \
@@ -98,13 +105,10 @@ static complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188,
:"=m"(A0), "=m"(A8), "=m"(A4), "=m"(A12)\
:"m"(wTB[0]), "m"(wTB[k*2]), "0"(A0), "2"(A4)\
:"memory");\
- __asm__ __volatile__("femms":::"memory");\
}
#define TRANSHALF_16_3DNOW(A2,A6,A10,A14)\
{\
- __asm__ __volatile__("femms":::"memory");\
- TRANS_FILL_MM6_MM7_3DNOW()\
__asm__ __volatile__(\
"movq %4, %%mm0\n\t"/*u.re = wTB[2].im + wTB[2].re;*/\
"movq %%mm0, %%mm1\n\t"\
@@ -142,7 +146,56 @@ static complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188,
:"=m"(A2), "=m"(A10), "=m"(A6), "=m"(A14)\
:"m"(wTB[2]), "m"(wTB[6]), "0"(A2), "2"(A6), "m"(HSQRT2_3DNOW)\
:"memory");\
- __asm__ __volatile__("femms":::"memory");\
+}
+
+#define TRANS_3DNOW(A1,A5,A9,A13,WT,WB,D,D3)/* Two asm statements: the first combines WT,D and WB,D3 into mm4 (u) and mm5 (v); the second rewrites A1 = A1+u, A9 = A1-u, A5 = A5-v, A13 = A5+v. mm6 and mm7 are read but never loaded here, so they must be preloaded by the code that expands this macro */\
+{ \
+ __asm__ __volatile__(\
+ "movq %1, %%mm4\n\t"/* mm4 = D */\
+ "movq %%mm4, %%mm5\n\t"/* mm5 = D */\
+ "punpckldq %%mm4, %%mm4\n\t"/*mm4 = D.re | D.re */\
+ "punpckhdq %%mm5, %%mm5\n\t"/*mm5 = D.im | D.im */\
+ "movq %0, %%mm0\n\t"/* mm0 = WT */\
+ "pfmul %%mm0, %%mm4\n\t"/* mm4 = D.re * WT, element-wise (first partial product for u) */\
+ "pfmul %%mm0, %%mm5\n\t"/* mm5 = D.im * WT, element-wise (second partial product for u) */\
+ PSWAP_MM("%%mm5","%%mm3")/* presumably swaps the two halves of mm5 with mm3 as scratch - PSWAP_MM is defined above this macro, confirm there */\
+ "pfmul %%mm7, %%mm5\n\t"/* scale by the preloaded mm7 constant (value not visible in this macro) */\
+ "pfadd %%mm5, %%mm4\n\t"/* mm4 = u*/\
+ "movq %3, %%mm1\n\t"/* mm1 = D3 */\
+ "movq %2, %%mm0\n\t"/* mm0 = WB */\
+ PSWAP_MM("%%mm1","%%mm3")/* mm1 = D3 with halves presumably swapped; mm3 is scratch */\
+ "movq %%mm0, %%mm2\n\t"/* mm2 = WB */\
+ "pfmul %%mm1, %%mm0\n\t"/* mm0 = a*/\
+ "pfmul %3, %%mm2\n\t"/* mm2 = v*/\
+ PFNACC_MM("%%mm2","%%mm3")/* mm2.low = mm2.low - mm2.high */\
+ "pfacc %%mm0, %%mm0\n\t"/* mm0 = mm0.low + mm0.high in both halves */\
+ "punpckldq %%mm0,%%mm2\n\t"/*mm2 = v.re | a.re*/\
+ "movq %%mm2, %%mm3\n\t"/* mm3 = copy of mm2 */\
+ "pfmul %%mm7, %%mm3\n\t"/* mm3 *= preloaded mm7 */\
+ "movq %%mm4, %%mm5\n\t"/* mm5 = copy of u */\
+ "pfmul %%mm6, %%mm5\n\t"/* mm5 *= preloaded mm6 */\
+ "pfadd %%mm3, %%mm5\n\t"/* mm5 += mm3 */\
+ PSWAP_MM("%%mm5","%%mm3")/* mm5 = v*/\
+ "pfadd %%mm2, %%mm4\n\t"/* mm4 += mm2; mm4 and mm5 are left live for the next asm statement */\
+ :/* no outputs: results stay in mm4 and mm5 */\
+ :"m"(WT), "m"(D), "m"(WB), "m"(D3)/* %0=WT, %1=D, %2=WB, %3=D3 */\
+ :"memory");/* only "memory" is clobbered; no MMX register is declared to the compiler */\
+ __asm__ __volatile__(\
+ "movq %4, %%mm0\n\t"/* a1 = A1*/\
+ "movq %%mm0, %%mm1\n\t"/* mm1 = copy of A1 */\
+ "pfadd %%mm4, %%mm0\n\t"/*A1 = a1 + u*/\
+ "pfsub %%mm4, %%mm1\n\t"/*A9 = a1 - u*/\
+ "movq %%mm0, %0\n\t"/* store new A1 */\
+ "movq %%mm1, %1\n\t"/* store A9 */\
+ "movq %5, %%mm2\n\t"/* a1 = A5*/\
+ "movq %%mm2, %%mm3\n\t"/* mm3 = copy of A5 */\
+ "pfsub %%mm5, %%mm2\n\t"/*A5 = a1 - v*/\
+ "pfadd %%mm5, %%mm3\n\t"/*A13 = a1 + v*/\
+ "movq %%mm2, %2\n\t"/* store new A5 */\
+ "movq %%mm3, %3"/* store A13 */\
+ :"=m"(A1), "=m"(A9), "=m"(A5), "=m"(A13)/* %0=A1, %1=A9, %2=A5, %3=A13 */\
+ :"0"(A1), "2"(A5), "m"(u), "m"(v)/* %4=A1, %5=A5; u and v (%6, %7) are never referenced in the template and are not macro parameters, so they must exist in the expanding scope */\
+ :"memory");\
}
#endif