From fabb1b271a3f55cb5d0a3b2dbefc0da910ecd70c Mon Sep 17 00:00:00 2001 From: diego Date: Sat, 12 Apr 2008 22:42:00 +0000 Subject: Backport SSE2-optimized IDCT routines from upstream libmpeg2. Thanks to Alexander Strange for finding and fixing some bugs. git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@26425 b3059339-0415-0410-9bf9-f77b7e298cf2 --- libmpcodecs/vd_libmpeg2.c | 2 + libmpeg2/idct.c | 6 +- libmpeg2/idct_mmx.c | 499 +++++++++++++++++++++++++++++++++++++++++++++- libmpeg2/mmx.h | 24 +++ libmpeg2/mpeg2_internal.h | 3 + 5 files changed, 530 insertions(+), 4 deletions(-) diff --git a/libmpcodecs/vd_libmpeg2.c b/libmpcodecs/vd_libmpeg2.c index 4dbb807750..c8ca0e92b6 100644 --- a/libmpcodecs/vd_libmpeg2.c +++ b/libmpcodecs/vd_libmpeg2.c @@ -74,6 +74,8 @@ static int init(sh_video_t *sh){ accel |= MPEG2_ACCEL_X86_MMXEXT; if(gCpuCaps.has3DNow) accel |= MPEG2_ACCEL_X86_3DNOW; + if(gCpuCaps.hasSSE2) + accel |= MPEG2_ACCEL_X86_SSE2; if(gCpuCaps.hasAltiVec) accel |= MPEG2_ACCEL_PPC_ALTIVEC; #ifdef HAVE_VIS diff --git a/libmpeg2/idct.c b/libmpeg2/idct.c index 5fb4b508af..530e10c9b0 100644 --- a/libmpeg2/idct.c +++ b/libmpeg2/idct.c @@ -240,7 +240,11 @@ static void mpeg2_idct_add_c (const int last, int16_t * block, void mpeg2_idct_init (uint32_t accel) { #ifdef ARCH_X86 - if (accel & MPEG2_ACCEL_X86_MMXEXT) { + if (accel & MPEG2_ACCEL_X86_SSE2) { + mpeg2_idct_copy = mpeg2_idct_copy_sse2; + mpeg2_idct_add = mpeg2_idct_add_sse2; + mpeg2_idct_mmx_init (); + } else if (accel & MPEG2_ACCEL_X86_MMXEXT) { mpeg2_idct_copy = mpeg2_idct_copy_mmxext; mpeg2_idct_add = mpeg2_idct_add_mmxext; mpeg2_idct_mmx_init (); diff --git a/libmpeg2/idct_mmx.c b/libmpeg2/idct_mmx.c index 2bef6704ed..3d1ae46f6e 100644 --- a/libmpeg2/idct_mmx.c +++ b/libmpeg2/idct_mmx.c @@ -41,6 +41,7 @@ #define round(bias) ((int)(((bias)+0.5) * (1< [3 2 1 0] */ \ + packssdw_r2r (xmm2, row1); /* 1: row1= result[] */ \ + paddd_r2r (xmm7, row2); /* 2: */ \ + psubd_r2r (row2, xmm6); /* 2: */ \ + paddd_r2r (xmm5, 
row2); /* 2: */ \ + psrad_i2r (ROW_SHIFT, xmm6); /* 2: */ \ + psrad_i2r (ROW_SHIFT, row2); /* 2: */ \ + pshufd_r2r (xmm6, xmm6, 0x1b); /* 2: */ \ + packssdw_r2r (xmm6, row2); /* 2: */ \ +} while (0) + + /* MMXEXT row IDCT */ #define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \ @@ -400,14 +456,264 @@ static inline void idct_col (int16_t * col, int offset) #endif -/* MMX column IDCT */ -static inline void idct_col (int16_t * const col, const int offset) -{ #define T1 13036 #define T2 27146 #define T3 43790 #define C4 23170 + +/* SSE2 column IDCT */ +static inline void sse2_idct_col (int16_t * const col) +{ + /* Almost identical to mmxext version: */ + /* just do both 4x8 columns in parallel */ + + static const short t1_vector[] ATTR_ALIGN(16) = {T1,T1,T1,T1,T1,T1,T1,T1}; + static const short t2_vector[] ATTR_ALIGN(16) = {T2,T2,T2,T2,T2,T2,T2,T2}; + static const short t3_vector[] ATTR_ALIGN(16) = {T3,T3,T3,T3,T3,T3,T3,T3}; + static const short c4_vector[] ATTR_ALIGN(16) = {C4,C4,C4,C4,C4,C4,C4,C4}; + +#if defined(__x86_64__) + + /* INPUT: block in xmm8 ... 
xmm15 */ + + movdqa_m2r (*t1_vector, xmm0); /* xmm0 = T1 */ + movdqa_r2r (xmm9, xmm1); /* xmm1 = x1 */ + + movdqa_r2r (xmm0, xmm2); /* xmm2 = T1 */ + pmulhw_r2r (xmm1, xmm0); /* xmm0 = T1*x1 */ + + movdqa_m2r (*t3_vector, xmm5); /* xmm5 = T3 */ + pmulhw_r2r (xmm15, xmm2); /* xmm2 = T1*x7 */ + + movdqa_r2r (xmm5, xmm7); /* xmm7 = T3-1 */ + psubsw_r2r (xmm15, xmm0); /* xmm0 = v17 */ + + movdqa_m2r (*t2_vector, xmm9); /* xmm9 = T2 */ + pmulhw_r2r (xmm11, xmm5); /* xmm5 = (T3-1)*x3 */ + + paddsw_r2r (xmm2, xmm1); /* xmm1 = u17 */ + pmulhw_r2r (xmm13, xmm7); /* xmm7 = (T3-1)*x5 */ + + movdqa_r2r (xmm9, xmm2); /* xmm2 = T2 */ + paddsw_r2r (xmm11, xmm5); /* xmm5 = T3*x3 */ + + pmulhw_r2r (xmm10, xmm9); /* xmm9 = T2*x2 */ + paddsw_r2r (xmm13, xmm7); /* xmm7 = T3*x5 */ + + psubsw_r2r (xmm13, xmm5); /* xmm5 = v35 */ + paddsw_r2r (xmm11, xmm7); /* xmm7 = u35 */ + + movdqa_r2r (xmm0, xmm6); /* xmm6 = v17 */ + pmulhw_r2r (xmm14, xmm2); /* xmm2 = T2*x6 */ + + psubsw_r2r (xmm5, xmm0); /* xmm0 = b3 */ + psubsw_r2r (xmm14, xmm9); /* xmm9 = v26 */ + + paddsw_r2r (xmm6, xmm5); /* xmm5 = v12 */ + movdqa_r2r (xmm0, xmm11); /* xmm11 = b3 */ + + movdqa_r2r (xmm1, xmm6); /* xmm6 = u17 */ + paddsw_r2r (xmm10, xmm2); /* xmm2 = u26 */ + + paddsw_r2r (xmm7, xmm6); /* xmm6 = b0 */ + psubsw_r2r (xmm7, xmm1); /* xmm1 = u12 */ + + movdqa_r2r (xmm1, xmm7); /* xmm7 = u12 */ + paddsw_r2r (xmm5, xmm1); /* xmm1 = u12+v12 */ + + movdqa_m2r (*c4_vector, xmm0); /* xmm0 = C4/2 */ + psubsw_r2r (xmm5, xmm7); /* xmm7 = u12-v12 */ + + movdqa_r2r (xmm6, xmm4); /* xmm4 = b0 */ + pmulhw_r2r (xmm0, xmm1); /* xmm1 = b1/2 */ + + movdqa_r2r (xmm9, xmm6); /* xmm6 = v26 */ + pmulhw_r2r (xmm0, xmm7); /* xmm7 = b2/2 */ + + movdqa_r2r (xmm8, xmm10); /* xmm10 = x0 */ + movdqa_r2r (xmm8, xmm0); /* xmm0 = x0 */ + + psubsw_r2r (xmm12, xmm10); /* xmm10 = v04 */ + paddsw_r2r (xmm12, xmm0); /* xmm0 = u04 */ + + paddsw_r2r (xmm10, xmm9); /* xmm9 = a1 */ + movdqa_r2r (xmm0, xmm8); /* xmm8 = u04 */ + + psubsw_r2r (xmm6, xmm10); /* 
xmm10 = a2 */ + paddsw_r2r (xmm2, xmm8); /* xmm8 = a0 */ + + paddsw_r2r (xmm1, xmm1); /* xmm1 = b1 */ + psubsw_r2r (xmm2, xmm0); /* xmm0 = a3 */ + + paddsw_r2r (xmm7, xmm7); /* xmm7 = b2 */ + movdqa_r2r (xmm10, xmm13); /* xmm13 = a2 */ + + movdqa_r2r (xmm9, xmm14); /* xmm14 = a1 */ + paddsw_r2r (xmm7, xmm10); /* xmm10 = a2+b2 */ + + psraw_i2r (COL_SHIFT,xmm10); /* xmm10 = y2 */ + paddsw_r2r (xmm1, xmm9); /* xmm9 = a1+b1 */ + + psraw_i2r (COL_SHIFT, xmm9); /* xmm9 = y1 */ + psubsw_r2r (xmm1, xmm14); /* xmm14 = a1-b1 */ + + psubsw_r2r (xmm7, xmm13); /* xmm13 = a2-b2 */ + psraw_i2r (COL_SHIFT,xmm14); /* xmm14 = y6 */ + + movdqa_r2r (xmm8, xmm15); /* xmm15 = a0 */ + psraw_i2r (COL_SHIFT,xmm13); /* xmm13 = y5 */ + + paddsw_r2r (xmm4, xmm8); /* xmm8 = a0+b0 */ + psubsw_r2r (xmm4, xmm15); /* xmm15 = a0-b0 */ + + psraw_i2r (COL_SHIFT, xmm8); /* xmm8 = y0 */ + movdqa_r2r (xmm0, xmm12); /* xmm12 = a3 */ + + psubsw_r2r (xmm11, xmm12); /* xmm12 = a3-b3 */ + psraw_i2r (COL_SHIFT,xmm15); /* xmm15 = y7 */ + + paddsw_r2r (xmm0, xmm11); /* xmm11 = a3+b3 */ + psraw_i2r (COL_SHIFT,xmm12); /* xmm12 = y4 */ + + psraw_i2r (COL_SHIFT,xmm11); /* xmm11 = y3 */ + + /* OUTPUT: block in xmm8 ... 
xmm15 */ + +#else + movdqa_m2r (*t1_vector, xmm0); /* xmm0 = T1 */ + + movdqa_m2r (*(col+1*8), xmm1); /* xmm1 = x1 */ + movdqa_r2r (xmm0, xmm2); /* xmm2 = T1 */ + + movdqa_m2r (*(col+7*8), xmm4); /* xmm4 = x7 */ + pmulhw_r2r (xmm1, xmm0); /* xmm0 = T1*x1 */ + + movdqa_m2r (*t3_vector, xmm5); /* xmm5 = T3 */ + pmulhw_r2r (xmm4, xmm2); /* xmm2 = T1*x7 */ + + movdqa_m2r (*(col+5*8), xmm6); /* xmm6 = x5 */ + movdqa_r2r (xmm5, xmm7); /* xmm7 = T3-1 */ + + movdqa_m2r (*(col+3*8), xmm3); /* xmm3 = x3 */ + psubsw_r2r (xmm4, xmm0); /* xmm0 = v17 */ + + movdqa_m2r (*t2_vector, xmm4); /* xmm4 = T2 */ + pmulhw_r2r (xmm3, xmm5); /* xmm5 = (T3-1)*x3 */ + + paddsw_r2r (xmm2, xmm1); /* xmm1 = u17 */ + pmulhw_r2r (xmm6, xmm7); /* xmm7 = (T3-1)*x5 */ + + /* slot */ + + movdqa_r2r (xmm4, xmm2); /* xmm2 = T2 */ + paddsw_r2r (xmm3, xmm5); /* xmm5 = T3*x3 */ + + pmulhw_m2r (*(col+2*8), xmm4); /* xmm4 = T2*x2 */ + paddsw_r2r (xmm6, xmm7); /* xmm7 = T3*x5 */ + + psubsw_r2r (xmm6, xmm5); /* xmm5 = v35 */ + paddsw_r2r (xmm3, xmm7); /* xmm7 = u35 */ + + movdqa_m2r (*(col+6*8), xmm3); /* xmm3 = x6 */ + movdqa_r2r (xmm0, xmm6); /* xmm6 = v17 */ + + pmulhw_r2r (xmm3, xmm2); /* xmm2 = T2*x6 */ + psubsw_r2r (xmm5, xmm0); /* xmm0 = b3 */ + + psubsw_r2r (xmm3, xmm4); /* xmm4 = v26 */ + paddsw_r2r (xmm6, xmm5); /* xmm5 = v12 */ + + movdqa_r2m (xmm0, *(col+3*8)); /* save b3 in scratch0 */ + movdqa_r2r (xmm1, xmm6); /* xmm6 = u17 */ + + paddsw_m2r (*(col+2*8), xmm2); /* xmm2 = u26 */ + paddsw_r2r (xmm7, xmm6); /* xmm6 = b0 */ + + psubsw_r2r (xmm7, xmm1); /* xmm1 = u12 */ + movdqa_r2r (xmm1, xmm7); /* xmm7 = u12 */ + + movdqa_m2r (*(col+0*8), xmm3); /* xmm3 = x0 */ + paddsw_r2r (xmm5, xmm1); /* xmm1 = u12+v12 */ + + movdqa_m2r (*c4_vector, xmm0); /* xmm0 = C4/2 */ + psubsw_r2r (xmm5, xmm7); /* xmm7 = u12-v12 */ + + movdqa_r2m (xmm6, *(col+5*8)); /* save b0 in scratch1 */ + pmulhw_r2r (xmm0, xmm1); /* xmm1 = b1/2 */ + + movdqa_r2r (xmm4, xmm6); /* xmm6 = v26 */ + pmulhw_r2r (xmm0, xmm7); /* xmm7 = b2/2 
*/ + + movdqa_m2r (*(col+4*8), xmm5); /* xmm5 = x4 */ + movdqa_r2r (xmm3, xmm0); /* xmm0 = x0 */ + + psubsw_r2r (xmm5, xmm3); /* xmm3 = v04 */ + paddsw_r2r (xmm5, xmm0); /* xmm0 = u04 */ + + paddsw_r2r (xmm3, xmm4); /* xmm4 = a1 */ + movdqa_r2r (xmm0, xmm5); /* xmm5 = u04 */ + + psubsw_r2r (xmm6, xmm3); /* xmm3 = a2 */ + paddsw_r2r (xmm2, xmm5); /* xmm5 = a0 */ + + paddsw_r2r (xmm1, xmm1); /* xmm1 = b1 */ + psubsw_r2r (xmm2, xmm0); /* xmm0 = a3 */ + + paddsw_r2r (xmm7, xmm7); /* xmm7 = b2 */ + movdqa_r2r (xmm3, xmm2); /* xmm2 = a2 */ + + movdqa_r2r (xmm4, xmm6); /* xmm6 = a1 */ + paddsw_r2r (xmm7, xmm3); /* xmm3 = a2+b2 */ + + psraw_i2r (COL_SHIFT, xmm3); /* xmm3 = y2 */ + paddsw_r2r (xmm1, xmm4); /* xmm4 = a1+b1 */ + + psraw_i2r (COL_SHIFT, xmm4); /* xmm4 = y1 */ + psubsw_r2r (xmm1, xmm6); /* xmm6 = a1-b1 */ + + movdqa_m2r (*(col+5*8), xmm1); /* xmm1 = b0 */ + psubsw_r2r (xmm7, xmm2); /* xmm2 = a2-b2 */ + + psraw_i2r (COL_SHIFT, xmm6); /* xmm6 = y6 */ + movdqa_r2r (xmm5, xmm7); /* xmm7 = a0 */ + + movdqa_r2m (xmm4, *(col+1*8)); /* save y1 */ + psraw_i2r (COL_SHIFT, xmm2); /* xmm2 = y5 */ + + movdqa_r2m (xmm3, *(col+2*8)); /* save y2 */ + paddsw_r2r (xmm1, xmm5); /* xmm5 = a0+b0 */ + + movdqa_m2r (*(col+3*8), xmm4); /* xmm4 = b3 */ + psubsw_r2r (xmm1, xmm7); /* xmm7 = a0-b0 */ + + psraw_i2r (COL_SHIFT, xmm5); /* xmm5 = y0 */ + movdqa_r2r (xmm0, xmm3); /* xmm3 = a3 */ + + movdqa_r2m (xmm2, *(col+5*8)); /* save y5 */ + psubsw_r2r (xmm4, xmm3); /* xmm3 = a3-b3 */ + + psraw_i2r (COL_SHIFT, xmm7); /* xmm7 = y7 */ + paddsw_r2r (xmm0, xmm4); /* xmm4 = a3+b3 */ + + movdqa_r2m (xmm5, *(col+0*8)); /* save y0 */ + psraw_i2r (COL_SHIFT, xmm3); /* xmm3 = y4 */ + + movdqa_r2m (xmm6, *(col+6*8)); /* save y6 */ + psraw_i2r (COL_SHIFT, xmm4); /* xmm4 = y3 */ + + movdqa_r2m (xmm7, *(col+7*8)); /* save y7 */ + + movdqa_r2m (xmm3, *(col+4*8)); /* save y4 */ + + movdqa_r2m (xmm4, *(col+3*8)); /* save y3 */ +#endif +} + + +/* MMX column IDCT */ +static inline void idct_col (int16_t * 
const col, const int offset) +{ static const short t1_vector[] ATTR_ALIGN(8) = {T1,T1,T1,T1}; static const short t2_vector[] ATTR_ALIGN(8) = {T2,T2,T2,T2}; static const short t3_vector[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; @@ -596,6 +902,129 @@ static inline void idct (int16_t * const block) \ idct_col (block, 4); \ } +static inline void sse2_idct (int16_t * const block) +{ + static const int16_t table04[] ATTR_ALIGN(16) = + sse2_table (22725, 21407, 19266, 16384, 12873, 8867, 4520); + static const int16_t table17[] ATTR_ALIGN(16) = + sse2_table (31521, 29692, 26722, 22725, 17855, 12299, 6270); + static const int16_t table26[] ATTR_ALIGN(16) = + sse2_table (29692, 27969, 25172, 21407, 16819, 11585, 5906); + static const int16_t table35[] ATTR_ALIGN(16) = + sse2_table (26722, 25172, 22654, 19266, 15137, 10426, 5315); + + static const int32_t rounder0_128[] ATTR_ALIGN(16) = + rounder_sse2 ((1 << (COL_SHIFT - 1)) - 0.5); + static const int32_t rounder4_128[] ATTR_ALIGN(16) = rounder_sse2 (0); + static const int32_t rounder1_128[] ATTR_ALIGN(16) = + rounder_sse2 (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ + static const int32_t rounder7_128[] ATTR_ALIGN(16) = + rounder_sse2 (-0.25); /* C1*(C7/C4+C7-C1)/2 */ + static const int32_t rounder2_128[] ATTR_ALIGN(16) = + rounder_sse2 (0.60355339059); /* C2 * (C6+C2)/2 */ + static const int32_t rounder6_128[] ATTR_ALIGN(16) = + rounder_sse2 (-0.25); /* C2 * (C6-C2)/2 */ + static const int32_t rounder3_128[] ATTR_ALIGN(16) = + rounder_sse2 (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ + static const int32_t rounder5_128[] ATTR_ALIGN(16) = + rounder_sse2 (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ + +#if defined(__x86_64__) + movdqa_m2r (block[0*8], xmm8); + movdqa_m2r (block[4*8], xmm12); + SSE2_IDCT_2ROW (table04, xmm8, xmm12, *rounder0_128, *rounder4_128); + + movdqa_m2r (block[1*8], xmm9); + movdqa_m2r (block[7*8], xmm15); + SSE2_IDCT_2ROW (table17, xmm9, xmm15, *rounder1_128, *rounder7_128); + + movdqa_m2r (block[2*8], xmm10); + 
movdqa_m2r (block[6*8], xmm14); + SSE2_IDCT_2ROW (table26, xmm10, xmm14, *rounder2_128, *rounder6_128); + + movdqa_m2r (block[3*8], xmm11); + movdqa_m2r (block[5*8], xmm13); + SSE2_IDCT_2ROW (table35, xmm11, xmm13, *rounder3_128, *rounder5_128); + + /* OUTPUT: block in xmm8 ... xmm15 */ + +#else + movdqa_m2r (block[0*8], xmm0); + movdqa_m2r (block[4*8], xmm4); + SSE2_IDCT_2ROW (table04, xmm0, xmm4, *rounder0_128, *rounder4_128); + movdqa_r2m (xmm0, block[0*8]); + movdqa_r2m (xmm4, block[4*8]); + + movdqa_m2r (block[1*8], xmm0); + movdqa_m2r (block[7*8], xmm4); + SSE2_IDCT_2ROW (table17, xmm0, xmm4, *rounder1_128, *rounder7_128); + movdqa_r2m (xmm0, block[1*8]); + movdqa_r2m (xmm4, block[7*8]); + + movdqa_m2r (block[2*8], xmm0); + movdqa_m2r (block[6*8], xmm4); + SSE2_IDCT_2ROW (table26, xmm0, xmm4, *rounder2_128, *rounder6_128); + movdqa_r2m (xmm0, block[2*8]); + movdqa_r2m (xmm4, block[6*8]); + + movdqa_m2r (block[3*8], xmm0); + movdqa_m2r (block[5*8], xmm4); + SSE2_IDCT_2ROW (table35, xmm0, xmm4, *rounder3_128, *rounder5_128); + movdqa_r2m (xmm0, block[3*8]); + movdqa_r2m (xmm4, block[5*8]); +#endif + + sse2_idct_col (block); +} + +static void sse2_block_copy (int16_t * const block, uint8_t * dest, + const int stride) +{ +#if defined(__x86_64__) + /* INPUT: block in xmm8 ... 
xmm15 */ + packuswb_r2r (xmm8, xmm8); + packuswb_r2r (xmm9, xmm9); + movq_r2m (xmm8, *(dest+0*stride)); + packuswb_r2r (xmm10, xmm10); + movq_r2m (xmm9, *(dest+1*stride)); + packuswb_r2r (xmm11, xmm11); + movq_r2m (xmm10, *(dest+2*stride)); + packuswb_r2r (xmm12, xmm12); + movq_r2m (xmm11, *(dest+3*stride)); + packuswb_r2r (xmm13, xmm13); + movq_r2m (xmm12, *(dest+4*stride)); + packuswb_r2r (xmm14, xmm14); + movq_r2m (xmm13, *(dest+5*stride)); + packuswb_r2r (xmm15, xmm15); + movq_r2m (xmm14, *(dest+6*stride)); + movq_r2m (xmm15, *(dest+7*stride)); +#else + movdqa_m2r (*(block+0*8), xmm0); + movdqa_m2r (*(block+1*8), xmm1); + movdqa_m2r (*(block+2*8), xmm2); + packuswb_r2r (xmm0, xmm0); + movdqa_m2r (*(block+3*8), xmm3); + packuswb_r2r (xmm1, xmm1); + movdqa_m2r (*(block+4*8), xmm4); + packuswb_r2r (xmm2, xmm2); + movdqa_m2r (*(block+5*8), xmm5); + packuswb_r2r (xmm3, xmm3); + movdqa_m2r (*(block+6*8), xmm6); + packuswb_r2r (xmm4, xmm4); + movdqa_m2r (*(block+7*8), xmm7); + movq_r2m (xmm0, *(dest+0*stride)); + packuswb_r2r (xmm5, xmm5); + movq_r2m (xmm1, *(dest+1*stride)); + packuswb_r2r (xmm6, xmm6); + movq_r2m (xmm2, *(dest+2*stride)); + packuswb_r2r (xmm7, xmm7); + movq_r2m (xmm3, *(dest+3*stride)); + movq_r2m (xmm4, *(dest+4*stride)); + movq_r2m (xmm5, *(dest+5*stride)); + movq_r2m (xmm6, *(dest+6*stride)); + movq_r2m (xmm7, *(dest+7*stride)); +#endif +} #define COPY_MMX(offset,r0,r1,r2) \ do { \ @@ -625,6 +1054,38 @@ static inline void block_copy (int16_t * const block, uint8_t * dest, movq_r2m (mm2, *(dest+stride)); } +#define ADD_SSE2_2ROW(op, block0, block1)\ +do { \ + movq_m2r (*(dest), xmm1); \ + movq_m2r (*(dest+stride), xmm2); \ + punpcklbw_r2r (xmm0, xmm1); \ + punpcklbw_r2r (xmm0, xmm2); \ + paddsw_##op (block0, xmm1); \ + paddsw_##op (block1, xmm2); \ + packuswb_r2r (xmm1, xmm1); \ + packuswb_r2r (xmm2, xmm2); \ + movq_r2m (xmm1, *(dest)); \ + movq_r2m (xmm2, *(dest+stride)); \ + dest += 2*stride; \ +} while (0) + +static void sse2_block_add (int16_t 
* const block, uint8_t * dest, + const int stride) +{ + pxor_r2r(xmm0, xmm0); +#if defined(__x86_64__) + /* INPUT: block in xmm8 ... xmm15 */ + ADD_SSE2_2ROW(r2r, xmm8, xmm9); + ADD_SSE2_2ROW(r2r, xmm10, xmm11); + ADD_SSE2_2ROW(r2r, xmm12, xmm13); + ADD_SSE2_2ROW(r2r, xmm14, xmm15); +#else + ADD_SSE2_2ROW(m2r, *(block+0*8), *(block+1*8)); + ADD_SSE2_2ROW(m2r, *(block+2*8), *(block+3*8)); + ADD_SSE2_2ROW(m2r, *(block+4*8), *(block+5*8)); + ADD_SSE2_2ROW(m2r, *(block+6*8), *(block+7*8)); +#endif +} #define ADD_MMX(offset,r1,r2,r3,r4) \ do { \ @@ -668,6 +1129,19 @@ static inline void block_add (int16_t * const block, uint8_t * dest, } +static inline void sse2_block_zero (int16_t * const block) +{ + pxor_r2r (xmm0, xmm0); + movdqa_r2m (xmm0, *(block+0*8)); + movdqa_r2m (xmm0, *(block+1*8)); + movdqa_r2m (xmm0, *(block+2*8)); + movdqa_r2m (xmm0, *(block+3*8)); + movdqa_r2m (xmm0, *(block+4*8)); + movdqa_r2m (xmm0, *(block+5*8)); + movdqa_r2m (xmm0, *(block+6*8)); + movdqa_r2m (xmm0, *(block+7*8)); +} + static inline void block_zero (int16_t * const block) { pxor_r2r (mm0, mm0); @@ -752,6 +1226,25 @@ static inline void block_add_DC (int16_t * const block, uint8_t * dest, movq_r2m (mm3, *(dest + 2*stride)); } +void mpeg2_idct_copy_sse2 (int16_t * const block, uint8_t * const dest, + const int stride) +{ + sse2_idct (block); + sse2_block_copy (block, dest, stride); + sse2_block_zero (block); +} + +void mpeg2_idct_add_sse2 (const int last, int16_t * const block, + uint8_t * const dest, const int stride) +{ + if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) { + sse2_idct (block); + sse2_block_add (block, dest, stride); + sse2_block_zero (block); + } else + block_add_DC (block, dest, stride, CPU_MMXEXT); +} + declare_idct (mmxext_idct, mmxext_table, mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) diff --git a/libmpeg2/mmx.h b/libmpeg2/mmx.h index 08b4d47760..fcb92870d2 100644 --- a/libmpeg2/mmx.h +++ b/libmpeg2/mmx.h @@ -257,6 +257,30 @@ typedef union { 
#define psadbw_m2r(var,reg) mmx_m2r (psadbw, var, reg) #define psadbw_r2r(regs,regd) mmx_r2r (psadbw, regs, regd) + +/* SSE2 */ + +typedef union { + long long q[2]; /* 2 Quadword (64-bit) values */ + unsigned long long uq[2]; /* 2 Unsigned Quadword */ + int d[4]; /* 4 Doubleword (32-bit) values */ + unsigned int ud[4]; /* 4 Unsigned Doubleword */ + short w[8]; /* 8 Word (16-bit) values */ + unsigned short uw[8]; /* 8 Unsigned Word */ + char b[16]; /* 16 Byte (8-bit) values */ + unsigned char ub[16]; /* 16 Unsigned Byte */ + float s[4]; /* 4 Single-precision (32-bit) values */ +} ATTR_ALIGN(16) sse_t; /* On a 16-byte (128-bit) boundary */ + +#define movdqu_m2r(var,reg) mmx_m2r (movdqu, var, reg) +#define movdqu_r2m(reg,var) mmx_r2m (movdqu, reg, var) +#define movdqu_r2r(regs,regd) mmx_r2r (movdqu, regs, regd) +#define movdqa_m2r(var,reg) mmx_m2r (movdqa, var, reg) +#define movdqa_r2m(reg,var) mmx_r2m (movdqa, reg, var) +#define movdqa_r2r(regs,regd) mmx_r2r (movdqa, regs, regd) + +#define pshufd_r2r(regs,regd,imm) mmx_r2ri(pshufd, regs, regd, imm) + #define pshufw_m2r(var,reg,imm) mmx_m2ri(pshufw, var, reg, imm) #define pshufw_r2r(regs,regd,imm) mmx_r2ri(pshufw, regs, regd, imm) diff --git a/libmpeg2/mpeg2_internal.h b/libmpeg2/mpeg2_internal.h index b76e8e245a..37eb61f227 100644 --- a/libmpeg2/mpeg2_internal.h +++ b/libmpeg2/mpeg2_internal.h @@ -265,6 +265,9 @@ void mpeg2_set_fbuf (mpeg2dec_t * mpeg2dec, int b_type); void mpeg2_idct_init (uint32_t accel); /* idct_mmx.c */ +void mpeg2_idct_copy_sse2 (int16_t * block, uint8_t * dest, int stride); +void mpeg2_idct_add_sse2 (int last, int16_t * block, + uint8_t * dest, int stride); void mpeg2_idct_copy_mmxext (int16_t * block, uint8_t * dest, int stride); void mpeg2_idct_add_mmxext (int last, int16_t * block, uint8_t * dest, int stride); -- cgit v1.2.3