diff options
Diffstat (limited to 'libswscale/swscale_template.c')
-rw-r--r-- | libswscale/swscale_template.c | 557 |
1 files changed, 374 insertions, 183 deletions
diff --git a/libswscale/swscale_template.c b/libswscale/swscale_template.c index 80704f6eb6..4562866b7b 100644 --- a/libswscale/swscale_template.c +++ b/libswscale/swscale_template.c @@ -26,15 +26,6 @@ #undef PAVGB #undef PREFETCH #undef PREFETCHW -#undef EMMS -#undef SFENCE - -#if HAVE_AMD3DNOW -/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */ -#define EMMS "femms" -#else -#define EMMS "emms" -#endif #if HAVE_AMD3DNOW #define PREFETCH "prefetch" @@ -48,12 +39,6 @@ #endif #if HAVE_MMX2 -#define SFENCE "sfence" -#else -#define SFENCE " # nop" -#endif - -#if HAVE_MMX2 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" #elif HAVE_AMD3DNOW #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" @@ -67,7 +52,7 @@ #define MOVNTQ(a,b) REAL_MOVNTQ(a,b) #if HAVE_ALTIVEC -#include "swscale_altivec_template.c" +#include "ppc/swscale_altivec_template.c" #endif #define YSCALEYUV2YV12X(x, offset, dest, width) \ @@ -644,6 +629,14 @@ #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) +#define REAL_YSCALEYUV2RGB1_ALPHA(index) \ + "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\ + "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\ + "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\ + "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\ + "packuswb %%mm1, %%mm7 \n\t" +#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index) + #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \ "movq "#b", "#q2" \n\t" /* B */\ "movq "#r", "#t" \n\t" /* R */\ @@ -909,8 +902,8 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, - int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, - uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW) + int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, int16_t **alpSrc, + uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW) { #if HAVE_MMX if(!(c->flags & SWS_BITEXACT)){ @@ -919,6 +912,9 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t * YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW) YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW) } + if (CONFIG_SWSCALE_ALPHA && aDest){ + YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW) + } YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW) }else{ @@ -926,6 +922,9 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t * YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW) YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW) } + if (CONFIG_SWSCALE_ALPHA && aDest){ + YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW) + } YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW) } @@ -939,7 +938,7 @@ yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize, #else //HAVE_ALTIVEC yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize, chrFilter, chrSrc, chrFilterSize, - dest, uDest, vDest, dstW, chrDstW); + alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW); #endif //!HAVE_ALTIVEC } @@ -952,34 +951,38 @@ yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize, dest, uDest, dstW, chrDstW, dstFormat); } -static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc, - uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW) +static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc, int16_t *alpSrc, + uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW) { int i; #if HAVE_MMX if(!(c->flags & SWS_BITEXACT)){ - long p= uDest ? 3 : 1; - uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW}; - uint8_t *dst[3]= {dest, uDest, vDest}; - long counter[3] = {dstW, chrDstW, chrDstW}; + long p= 4; + uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW}; + uint8_t *dst[4]= {aDest, dest, uDest, vDest}; + x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW}; if (c->flags & SWS_ACCURATE_RND){ while(p--){ - __asm__ volatile( - YSCALEYUV2YV121_ACCURATE - :: "r" (src[p]), "r" (dst[p] + counter[p]), - "g" (-counter[p]) - : "%"REG_a - ); + if (dst[p]){ + __asm__ volatile( + YSCALEYUV2YV121_ACCURATE + :: "r" (src[p]), "r" (dst[p] + counter[p]), + "g" (-counter[p]) + : "%"REG_a + ); + } } }else{ while(p--){ - __asm__ volatile( - YSCALEYUV2YV121 - :: "r" (src[p]), "r" (dst[p] + counter[p]), - "g" (-counter[p]) - : "%"REG_a - ); + if (dst[p]){ + __asm__ volatile( + YSCALEYUV2YV121 + :: "r" (src[p]), "r" (dst[p] + counter[p]), + "g" (-counter[p]) + : "%"REG_a + ); + } } } return; @@ -1013,6 +1016,12 @@ static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chr uDest[i]= u; vDest[i]= v; } + + if (CONFIG_SWSCALE_ALPHA && aDest) + for (i=0; i<dstW; i++){ + int val= (alpSrc[i]+64)>>7; + aDest[i]= av_clip_uint8(val); + } } @@ -1021,20 +1030,36 @@ static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chr */ static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, - uint8_t *dest, long dstW, long dstY) + int16_t **alpSrc, uint8_t *dest, long dstW, long dstY) { #if HAVE_MMX - long dummy=0; + x86_reg dummy=0; if(!(c->flags & SWS_BITEXACT)){ if (c->flags & SWS_ACCURATE_RND){ switch(c->dstFormat){ case PIX_FMT_RGB32: - YSCALEYUV2PACKEDX_ACCURATE - YSCALEYUV2RGBX - "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){ + YSCALEYUV2PACKEDX_ACCURATE + YSCALEYUV2RGBX + "movq %%mm2, "U_TEMP"(%0) \n\t" + "movq %%mm4, "V_TEMP"(%0) \n\t" + "movq %%mm5, "Y_TEMP"(%0) \n\t" + YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET) + "movq "Y_TEMP"(%0), %%mm5 \n\t" + "psraw $3, %%mm1 \n\t" + "psraw $3, %%mm7 \n\t" + "packuswb %%mm7, %%mm1 \n\t" + WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6) + + YSCALEYUV2PACKEDX_END + }else{ + YSCALEYUV2PACKEDX_ACCURATE + YSCALEYUV2RGBX + "pcmpeqd %%mm7, %%mm7 \n\t" + WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) - YSCALEYUV2PACKEDX_END + YSCALEYUV2PACKEDX_END + } return; case PIX_FMT_BGR24: YSCALEYUV2PACKEDX_ACCURATE @@ -1095,11 +1120,22 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_ switch(c->dstFormat) { case PIX_FMT_RGB32: - YSCALEYUV2PACKEDX - YSCALEYUV2RGBX - "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) - YSCALEYUV2PACKEDX_END + if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){ + YSCALEYUV2PACKEDX + YSCALEYUV2RGBX + YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7) + "psraw $3, %%mm1 \n\t" + "psraw $3, %%mm7 \n\t" + "packuswb %%mm7, %%mm1 \n\t" + WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) + YSCALEYUV2PACKEDX_END + }else{ + YSCALEYUV2PACKEDX + YSCALEYUV2RGBX + "pcmpeqd %%mm7, %%mm7 \n\t" + WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + YSCALEYUV2PACKEDX_END + } return; case PIX_FMT_BGR24: YSCALEYUV2PACKEDX @@ -1160,26 +1196,26 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_ #endif /* HAVE_MMX */ #if HAVE_ALTIVEC /* The following list of supported dstFormat values should - match what's found in the body of altivec_yuv2packedX() */ - if (!(c->flags & SWS_BITEXACT) && + match what's found in the body of ff_yuv2packedX_altivec() */ + if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf && (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA || c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 || c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)) - altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize, - chrFilter, chrSrc, chrFilterSize, - dest, dstW, dstY); + ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, + chrFilter, chrSrc, chrFilterSize, + dest, dstW, dstY); else #endif yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize, chrFilter, chrSrc, chrFilterSize, - dest, dstW, dstY); + alpSrc, dest, dstW, dstY); } /** * vertical bilinear scale YV12 to RGB */ static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1, - uint8_t *dest, int dstW, int yalpha, int uvalpha, int y) + uint16_t *abuf0, uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y) { int yalpha1=4095- yalpha; int uvalpha1=4095-uvalpha; @@ -1191,19 +1227,62 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t * { //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( case PIX_FMT_RGB32: - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB(%%REGBP, %5) - "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - - :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), - "a" (&c->redDither) - ); + if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){ +#if ARCH_X86_64 + __asm__ volatile( + YSCALEYUV2RGB(%%REGBP, %5) + YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7) + "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ + "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ + "packuswb %%mm7, %%mm1 \n\t" + WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) + + :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest), + "a" (&c->redDither) + ,"r" (abuf0), "r" (abuf1) + : "%"REG_BP + ); +#else + *(uint16_t **)(&c->u_temp)=abuf0; + *(uint16_t **)(&c->v_temp)=abuf1; + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB(%%REGBP, %5) + "push %0 \n\t" + "push %1 \n\t" + "mov "U_TEMP"(%5), %0 \n\t" + "mov "V_TEMP"(%5), %1 \n\t" + YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1) + "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ + "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ + "packuswb %%mm7, %%mm1 \n\t" + "pop %1 \n\t" + "pop %0 \n\t" + WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + + :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), + "a" (&c->redDither) + ); +#endif + }else{ + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB(%%REGBP, %5) + "pcmpeqd %%mm7, %%mm7 \n\t" + WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + + :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), + "a" (&c->redDither) + ); + } return; case PIX_FMT_BGR24: __asm__ volatile( @@ -1279,14 +1358,14 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t * } } #endif //HAVE_MMX -YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C) +YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C) } /** * YV12 to RGB without scaling or interpolating */ static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1, - uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y) + uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y) { const int yalpha1=0; int i; @@ -1296,7 +1375,7 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t * if (flags&SWS_FULL_CHR_H_INT) { - RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y); + RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y); return; } @@ -1307,19 +1386,35 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t * switch(dstFormat) { case PIX_FMT_RGB32: - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1(%%REGBP, %5) - "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - - :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), - "a" (&c->redDither) - ); + if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){ + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1(%%REGBP, %5) + YSCALEYUV2RGB1_ALPHA(%%REGBP) + WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + + :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), + "a" (&c->redDither) + ); + }else{ + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1(%%REGBP, %5) + "pcmpeqd %%mm7, %%mm7 \n\t" + WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + + :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), + "a" (&c->redDither) + ); + } return; case PIX_FMT_BGR24: __asm__ volatile( @@ -1400,19 +1495,35 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t * switch(dstFormat) { case PIX_FMT_RGB32: - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1b(%%REGBP, %5) - "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - - :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), - "a" (&c->redDither) - ); + if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){ + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1b(%%REGBP, %5) + YSCALEYUV2RGB1_ALPHA(%%REGBP) + WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + + :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), + "a" (&c->redDither) + ); + }else{ + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1b(%%REGBP, %5) + "pcmpeqd %%mm7, %%mm7 \n\t" + WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + + :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), + "a" (&c->redDither) + ); + } return; case PIX_FMT_BGR24: __asm__ volatile( @@ -1492,9 +1603,9 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t * #endif /* HAVE_MMX */ if (uvalpha < 2048) { - YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C) + YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C) }else{ - YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C) + YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C) } } @@ -1515,7 +1626,7 @@ static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint3 "movq %%mm0, (%2, %%"REG_a") \n\t" "add $8, %%"REG_a" \n\t" " js 1b \n\t" - : : "g" (-width), "r" (src+width*2), "r" (dst+width) + : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width) : "%"REG_a ); #else @@ -1546,7 +1657,7 @@ static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, "movd %%mm1, (%2, %%"REG_a") \n\t" "add $4, %%"REG_a" \n\t" " js 1b \n\t" - : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width) + : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width) : "%"REG_a ); #else @@ -1576,7 +1687,7 @@ static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint3 "movq %%mm0, (%2, %%"REG_a") \n\t" "add $8, %%"REG_a" \n\t" " js 1b \n\t" - : : "g" (-width), "r" (src+width*2), "r" (dst+width) + : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width) : "%"REG_a ); #else @@ -1607,7 +1718,7 @@ static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, "movd %%mm1, (%2, %%"REG_a") \n\t" "add $4, %%"REG_a" \n\t" " js 1b \n\t" - : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width) + : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width) : "%"REG_a ); #else @@ -1642,6 +1753,13 @@ BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY , GY<<5, BY<<11, RGB2YUV_SHIFT+8) BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY , GY<<5, BY<<10, RGB2YUV_SHIFT+7) +static inline void RENAME(abgrToA)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused){ + int i; + for (i=0; i<width; i++){ + dst[i]= src[4*i]; + } +} + #define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\ static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\ {\ @@ -1731,7 +1849,7 @@ static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, "add $4, %%"REG_a" \n\t" " js 1b \n\t" : "+r" (src) - : "r" (dst+width), "g" (-width) + : "r" (dst+width), "g" ((x86_reg)-width) : "%"REG_a ); } @@ -1789,7 +1907,7 @@ static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t * "add $4, %%"REG_a" \n\t" " js 1b \n\t" : "+r" (src) - : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0]) + : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0]) : "%"REG_a ); } @@ -1951,7 +2069,7 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW assert(filterSize % 4 == 0 && filterSize>0); if (filterSize==4) // Always true for upscaling, sometimes for down, too. { - long counter= -2*dstW; + x86_reg counter= -2*dstW; filter-= counter*2; filterPos-= counter/2; dst-= counter/2; @@ -1997,7 +2115,7 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW } else if (filterSize==8) { - long counter= -2*dstW; + x86_reg counter= -2*dstW; filter-= counter*4; filterPos-= counter/2; dst-= counter/2; @@ -2055,7 +2173,7 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW else { uint8_t *offset = src+filterSize; - long counter= -2*dstW; + x86_reg counter= -2*dstW; //filter-= counter*filterSize/2; filterPos-= counter/2; dst-= counter/2; @@ -2098,7 +2216,7 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW : "+r" (counter), "+r" (filter) : "m" (filterPos), "m" (dst), "m"(offset), - "m" (src), "r" (filterSize*2) + "m" (src), "r" ((x86_reg)filterSize*2) : "%"REG_a, "%"REG_c, "%"REG_d ); } @@ -2125,13 +2243,34 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW #endif /* HAVE_ALTIVEC */ #endif /* HAVE_MMX */ } + +static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst, + int dstWidth, uint8_t *src, int srcW, + int xInc) +{ + int i; + unsigned int xpos=0; + for (i=0;i<dstWidth;i++) + { + register unsigned int xx=xpos>>16; + register unsigned int xalpha=(xpos&0xFFFF)>>9; + dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha; + xpos+=xInc; + } +} + // *** horizontal scale Y line to temp buffer static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc, - int flags, int canMMX2BeUsed, int16_t *hLumFilter, - int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, - int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, - int32_t *mmx2FilterPos, uint32_t *pal) + int flags, int16_t *hLumFilter, + int16_t *hLumFilterPos, int hLumFilterSize, + int srcFormat, uint8_t *formatConvBuffer, + uint32_t *pal, int isAlpha) { + int32_t *mmx2FilterPos = c->lumMmx2FilterPos; + int16_t *mmx2Filter = c->lumMmx2Filter; + int canMMX2BeUsed = c->canMMX2BeUsed; + void *funnyYCode = c->funnyYCode; + if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE) { RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal); @@ -2144,12 +2283,18 @@ static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, } else if (srcFormat==PIX_FMT_RGB32) { - RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal); + if (isAlpha) + RENAME(abgrToA)(formatConvBuffer, src+3, srcW, pal); + else + RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal); src= formatConvBuffer; } else if (srcFormat==PIX_FMT_RGB32_1) { - RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal); + if (isAlpha) + RENAME(abgrToA)(formatConvBuffer, src, srcW, pal); + else + RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal); src= formatConvBuffer; } else if (srcFormat==PIX_FMT_BGR24) @@ -2169,12 +2314,18 @@ static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, } else if (srcFormat==PIX_FMT_BGR32) { - RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal); + if (isAlpha) + RENAME(abgrToA)(formatConvBuffer, src+3, srcW, pal); + else + RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal); src= formatConvBuffer; } else if (srcFormat==PIX_FMT_BGR32_1) { - RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal); + if (isAlpha) + RENAME(abgrToA)(formatConvBuffer, src, srcW, pal); + else + RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal); src= formatConvBuffer; } else if (srcFormat==PIX_FMT_RGB24) @@ -2289,7 +2440,7 @@ FUNNY_Y_CODE else { #endif /* HAVE_MMX2 */ - long xInc_shr16 = xInc >> 16; + x86_reg xInc_shr16 = xInc >> 16; uint16_t xInc_mask = xInc & 0xffff; //NO MMX just normal asm ... __asm__ volatile( @@ -2335,19 +2486,11 @@ FUNNY_Y_CODE } //if MMX2 can't be used #endif #else - int i; - unsigned int xpos=0; - for (i=0;i<dstWidth;i++) - { - register unsigned int xx=xpos>>16; - register unsigned int xalpha=(xpos&0xFFFF)>>9; - dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha; - xpos+=xInc; - } + RENAME(hyscale_fast)(c, dst, dstWidth, src, srcW, xInc); #endif /* ARCH_X86 */ } - if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){ + if(!isAlpha && c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){ int i; //FIXME all pal and rgb srcFormats could do this convertion as well //FIXME all scalers more complex than bilinear could do half of this transform @@ -2361,12 +2504,37 @@ FUNNY_Y_CODE } } +static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst, + int dstWidth, uint8_t *src1, + uint8_t *src2, int srcW, int xInc) +{ + int i; + unsigned int xpos=0; + for (i=0;i<dstWidth;i++) + { + register unsigned int xx=xpos>>16; + register unsigned int xalpha=(xpos&0xFFFF)>>9; + dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); + dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); + /* slower + dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha; + dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha; + */ + xpos+=xInc; + } +} + inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2, - int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter, - int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode, - int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, - int32_t *mmx2FilterPos, uint32_t *pal) + int srcW, int xInc, int flags, int16_t *hChrFilter, + int16_t *hChrFilterPos, int hChrFilterSize, + int srcFormat, uint8_t *formatConvBuffer, + uint32_t *pal) { + int32_t *mmx2FilterPos = c->chrMmx2FilterPos; + int16_t *mmx2Filter = c->chrMmx2Filter; + int canMMX2BeUsed = c->canMMX2BeUsed; + void *funnyUVCode = c->funnyUVCode; + if (srcFormat==PIX_FMT_YUYV422) { RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); @@ -2575,7 +2743,7 @@ FUNNY_UV_CODE else { #endif /* HAVE_MMX2 */ - long xInc_shr16 = (long) (xInc >> 16); + x86_reg xInc_shr16 = (x86_reg) (xInc >> 16); uint16_t xInc_mask = xInc & 0xffff; __asm__ volatile( "xor %%"REG_a", %%"REG_a" \n\t" // i @@ -2613,9 +2781,9 @@ FUNNY_UV_CODE /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here, which is needed to support GCC 4.0. */ #if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) - :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask), + :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask), #else - :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask), + :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask), #endif "r" (src2) : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi" @@ -2624,20 +2792,7 @@ FUNNY_UV_CODE } //if MMX2 can't be used #endif #else - int i; - unsigned int xpos=0; - for (i=0;i<dstWidth;i++) - { - register unsigned int xx=xpos>>16; - register unsigned int xalpha=(xpos&0xFFFF)>>9; - dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); - dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); - /* slower - dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha; - dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha; - */ - xpos+=xInc; - } + RENAME(hcscale_fast)(c, dst, dstWidth, src1, src2, srcW, xInc); #endif /* ARCH_X86 */ } if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){ @@ -2672,7 +2827,6 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s const int dstFormat= c->dstFormat; const int srcFormat= c->srcFormat; const int flags= c->flags; - const int canMMX2BeUsed= c->canMMX2BeUsed; int16_t *vLumFilterPos= c->vLumFilterPos; int16_t *vChrFilterPos= c->vChrFilterPos; int16_t *hLumFilterPos= c->hLumFilterPos; @@ -2683,16 +2837,16 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s int16_t *hChrFilter= c->hChrFilter; int32_t *lumMmxFilter= c->lumMmxFilter; int32_t *chrMmxFilter= c->chrMmxFilter; + int32_t *alpMmxFilter= c->alpMmxFilter; const int vLumFilterSize= c->vLumFilterSize; const int vChrFilterSize= c->vChrFilterSize; const int hLumFilterSize= c->hLumFilterSize; const int hChrFilterSize= c->hChrFilterSize; int16_t **lumPixBuf= c->lumPixBuf; int16_t **chrPixBuf= c->chrPixBuf; + int16_t **alpPixBuf= c->alpPixBuf; const int vLumBufSize= c->vLumBufSize; const int vChrBufSize= c->vChrBufSize; - uint8_t *funnyYCode= c->funnyYCode; - uint8_t *funnyUVCode= c->funnyUVCode; uint8_t *formatConvBuffer= c->formatConvBuffer; const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample; const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); @@ -2709,10 +2863,12 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s if (isPacked(c->srcFormat)){ src[0]= src[1]= - src[2]= src[0]; + src[2]= + src[3]= src[0]; srcStride[0]= srcStride[1]= - srcStride[2]= srcStride[0]; + srcStride[2]= + srcStride[3]= srcStride[0]; } srcStride[1]<<= c->vChrDrop; srcStride[2]<<= c->vChrDrop; @@ -2733,7 +2889,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2], //dstStride[0],dstStride[1],dstStride[2]); - if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0) + if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) { static int warnedAlready=0; //FIXME move this into the context perhaps if (flags & SWS_PRINT_INFO && !warnedAlready) |