From 0c6d92bfda120d2793e03b588e0a96c5a6f9d769 Mon Sep 17 00:00:00 2001 From: michael Date: Tue, 9 Sep 2008 23:30:06 +0000 Subject: Rewrite bgr24->yuv mmx code, the new code is cleaner, more accurate, and does not throw half the chroma away. git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@27561 b3059339-0415-0410-9bf9-f77b7e298cf2 --- libswscale/swscale.c | 17 ++- libswscale/swscale_template.c | 326 +++++++++++++++++------------------------- 2 files changed, 147 insertions(+), 196 deletions(-) (limited to 'libswscale') diff --git a/libswscale/swscale.c b/libswscale/swscale.c index 44069cd408..9dfe1f8e7c 100644 --- a/libswscale/swscale.c +++ b/libswscale/swscale.c @@ -237,6 +237,20 @@ DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00003831D0E6F6EAULL; DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL; DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL; DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL; + +DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toY1Coeff) = 0x0C88000040870C88ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toY2Coeff) = 0x20DE4087000020DEULL; +DECLARE_ALIGNED(8, const uint64_t, ff_rgb24toY1Coeff) = 0x20DE0000408720DEULL; +DECLARE_ALIGNED(8, const uint64_t, ff_rgb24toY2Coeff) = 0x0C88408700000C88ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toYOffset) = 0x0008400000084000ULL; + +DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toUV[2][4]) = { + {0x38380000DAC83838ULL, 0xECFFDAC80000ECFFULL, 0xF6E40000D0E3F6E4ULL, 0x3838D0E300003838ULL}, + {0xECFF0000DAC8ECFFULL, 0x3838DAC800003838ULL, 0x38380000D0E33838ULL, 0xF6E4D0E30000F6E4ULL}, +}; + +DECLARE_ALIGNED(8, const uint64_t, ff_bgr24toUVOffset)= 0x0040400000404000ULL; + #endif /* defined(ARCH_X86) */ // clipping helper table for C implementations: @@ -2201,7 +2215,8 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH if ((isBGR(srcFormat) || isRGB(srcFormat)) && !(flags&SWS_FULL_CHR_H_INP) && srcFormat!=PIX_FMT_RGB8 && srcFormat!=PIX_FMT_BGR8 && srcFormat!=PIX_FMT_RGB4 && srcFormat!=PIX_FMT_BGR4 - && srcFormat!=PIX_FMT_RGB4_BYTE && srcFormat!=PIX_FMT_BGR4_BYTE) + && srcFormat!=PIX_FMT_RGB4_BYTE && srcFormat!=PIX_FMT_BGR4_BYTE + && srcFormat!=PIX_FMT_BGR24 && srcFormat!=PIX_FMT_RGB24) c->chrSrcHSubSample=1; if (param){ diff --git a/libswscale/swscale_template.c b/libswscale/swscale_template.c index ad050ad6ff..adaa59784b 100644 --- a/libswscale/swscale_template.c +++ b/libswscale/swscale_template.c @@ -1875,78 +1875,121 @@ static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1 } } -static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width) -{ #ifdef HAVE_MMX +static inline void bgr24ToY_mmx(uint8_t *dst, uint8_t *src, long width, int srcFormat) +{ + + if(srcFormat == PIX_FMT_BGR24){ + asm volatile( + "movq "MANGLE(ff_bgr24toY1Coeff)", %mm5 \n\t" + "movq "MANGLE(ff_bgr24toY2Coeff)", %mm6 \n\t" + ); + }else{ + asm volatile( + "movq "MANGLE(ff_rgb24toY1Coeff)", %mm5 \n\t" + "movq "MANGLE(ff_rgb24toY2Coeff)", %mm6 \n\t" + ); + } + asm volatile( - "mov %2, %%"REG_a" \n\t" - "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t" - "movq "MANGLE(ff_w1111)", %%mm5 \n\t" - "pxor %%mm7, %%mm7 \n\t" - "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" - ASMALIGN(4) - "1: \n\t" - PREFETCH" 64(%0, %%"REG_d") \n\t" - "movd (%0, %%"REG_d"), %%mm0 \n\t" - "movd 3(%0, %%"REG_d"), %%mm1 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "movd 6(%0, %%"REG_d"), %%mm2 \n\t" - "movd 9(%0, %%"REG_d"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "pmaddwd %%mm6, %%mm0 \n\t" - "pmaddwd %%mm6, %%mm1 \n\t" - "pmaddwd %%mm6, %%mm2 \n\t" - "pmaddwd %%mm6, %%mm3 \n\t" -#ifndef FAST_BGR2YV12 - "psrad $8, %%mm0 \n\t" - "psrad $8, %%mm1 \n\t" - "psrad $8, %%mm2 \n\t" - "psrad $8, %%mm3 \n\t" -#endif - "packssdw %%mm1, %%mm0 \n\t" - "packssdw %%mm3, %%mm2 \n\t" - "pmaddwd %%mm5, %%mm0 \n\t" - "pmaddwd %%mm5, %%mm2 \n\t" - "packssdw %%mm2, %%mm0 \n\t" - "psraw $7, %%mm0 \n\t" - - "movd 12(%0, %%"REG_d"), %%mm4 \n\t" - "movd 15(%0, %%"REG_d"), %%mm1 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "movd 18(%0, %%"REG_d"), %%mm2 \n\t" - "movd 21(%0, %%"REG_d"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "pmaddwd %%mm6, %%mm4 \n\t" - "pmaddwd %%mm6, %%mm1 \n\t" - "pmaddwd %%mm6, %%mm2 \n\t" - "pmaddwd %%mm6, %%mm3 \n\t" -#ifndef FAST_BGR2YV12 - "psrad $8, %%mm4 \n\t" - "psrad $8, %%mm1 \n\t" - "psrad $8, %%mm2 \n\t" - "psrad $8, %%mm3 \n\t" -#endif - "packssdw %%mm1, %%mm4 \n\t" - "packssdw %%mm3, %%mm2 \n\t" - "pmaddwd %%mm5, %%mm4 \n\t" - "pmaddwd %%mm5, %%mm2 \n\t" - "add $24, %%"REG_d" \n\t" - "packssdw %%mm2, %%mm4 \n\t" - "psraw $7, %%mm4 \n\t" - - "packuswb %%mm4, %%mm0 \n\t" - "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t" - - "movq %%mm0, (%1, %%"REG_a") \n\t" - "add $8, %%"REG_a" \n\t" - " js 1b \n\t" - : : "r" (src+width*3), "r" (dst+width), "g" (-width) - : "%"REG_a, "%"REG_d + "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t" + "mov %2, %%"REG_a" \n\t" + "pxor %%mm7, %%mm7 \n\t" + "1: \n\t" + PREFETCH" 64(%0) \n\t" + "movd (%0), %%mm0 \n\t" + "movd 2(%0), %%mm1 \n\t" + "movd 6(%0), %%mm2 \n\t" + "movd 8(%0), %%mm3 \n\t" + "add $12, %0 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "pmaddwd %%mm5, %%mm0 \n\t" + "pmaddwd %%mm6, %%mm1 \n\t" + "pmaddwd %%mm5, %%mm2 \n\t" + "pmaddwd %%mm6, %%mm3 \n\t" + "paddd %%mm1, %%mm0 \n\t" + "paddd %%mm3, %%mm2 \n\t" + "paddd %%mm4, %%mm0 \n\t" + "paddd %%mm4, %%mm2 \n\t" + "psrad $15, %%mm0 \n\t" + "psrad $15, %%mm2 \n\t" + "packssdw %%mm2, %%mm0 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "movd %%mm0, (%1, %%"REG_a") \n\t" + "add $4, %%"REG_a" \n\t" + " js 1b \n\t" + : "+r" (src) + : "r" (dst+width), "g" (-width) + : "%"REG_a ); +} + +static inline void bgr24ToUV_mmx(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat) +{ + asm volatile( + "movq 24+%4, %%mm6 \n\t" + "mov %3, %%"REG_a" \n\t" + "pxor %%mm7, %%mm7 \n\t" + "1: \n\t" + PREFETCH" 64(%0) \n\t" + "movd (%0), %%mm0 \n\t" + "movd 2(%0), %%mm1 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm1, %%mm3 \n\t" + "pmaddwd %4, %%mm0 \n\t" + "pmaddwd 8+%4, %%mm1 \n\t" + "pmaddwd 16+%4, %%mm2 \n\t" + "pmaddwd %%mm6, %%mm3 \n\t" + "paddd %%mm1, %%mm0 \n\t" + "paddd %%mm3, %%mm2 \n\t" + + "movd 6(%0), %%mm1 \n\t" + "movd 8(%0), %%mm3 \n\t" + "add $12, %0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "pmaddwd %4, %%mm1 \n\t" + "pmaddwd 8+%4, %%mm3 \n\t" + "pmaddwd 16+%4, %%mm4 \n\t" + "pmaddwd %%mm6, %%mm5 \n\t" + "paddd %%mm3, %%mm1 \n\t" + "paddd %%mm5, %%mm4 \n\t" + + "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t" + "paddd %%mm3, %%mm0 \n\t" + "paddd %%mm3, %%mm2 \n\t" + "paddd %%mm3, %%mm1 \n\t" + "paddd %%mm3, %%mm4 \n\t" + "psrad $15, %%mm0 \n\t" + "psrad $15, %%mm2 \n\t" + "psrad $15, %%mm1 \n\t" + "psrad $15, %%mm4 \n\t" + "packssdw %%mm1, %%mm0 \n\t" + "packssdw %%mm4, %%mm2 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "packuswb %%mm2, %%mm2 \n\t" + "movd %%mm0, (%1, %%"REG_a") \n\t" + "movd %%mm2, (%2, %%"REG_a") \n\t" + "add $4, %%"REG_a" \n\t" + " js 1b \n\t" + : "+r" (src) + : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0]) + : "%"REG_a + ); +} +#endif + +static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width) +{ +#ifdef HAVE_MMX + bgr24ToY_mmx(dst, src, width, PIX_FMT_BGR24); #else int i; for (i=0; i>(RGB2YUV_SHIFT+1); - dstV[i]= (RV*r + GV*g + BV*b + (257<>(RGB2YUV_SHIFT+1); + dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT; + dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT; } #endif /* HAVE_MMX */ assert(src1 == src2); @@ -2201,6 +2129,9 @@ static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width) { +#ifdef HAVE_MMX + bgr24ToY_mmx(dst, src, width, PIX_FMT_RGB24); +#else int i; for (i=0; i>RGB2YUV_SHIFT); } +#endif } static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) { int i; assert(src1==src2); +#ifdef HAVE_MMX + bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_RGB24); +#else for (i=0; i>(RGB2YUV_SHIFT+1); - dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT)))>>(RGB2YUV_SHIFT+1); + dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT; + dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT; } +#endif } static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, long width) -- cgit v1.2.3