diff options
-rw-r--r-- | postproc/swscale.c | 27 | ||||
-rw-r--r-- | postproc/swscale.h | 36 | ||||
-rw-r--r-- | postproc/swscale_template.c | 185 |
3 files changed, 153 insertions, 95 deletions
diff --git a/postproc/swscale.c b/postproc/swscale.c index 6936f71b0b..044085b9d1 100644 --- a/postproc/swscale.c +++ b/postproc/swscale.c @@ -2019,6 +2019,15 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, c->dstFormat= dstFormat; c->srcFormat= srcFormat; + c->yCoeff= 0x2568256825682568LL; + c->vrCoeff= 0x3343334333433343LL; + c->ubCoeff= 0x40cf40cf40cf40cfLL; + c->vgCoeff= 0xE5E2E5E2E5E2E5E2LL; + c->ugCoeff= 0xF36EF36EF36EF36ELL; + c->yOffset= 0x0080008000800080LL; + c->uOffset= 0x0400040004000400LL; + c->vOffset= 0x0400040004000400LL; + usesFilter=0; if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1; if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1; @@ -2261,19 +2270,6 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, ASSERT(c->chrDstH <= dstH) - // pack filter data for mmx code - if(cpuCaps.hasMMX) - { - c->lumMmxFilter= (int16_t*)memalign(8, c->vLumFilterSize* dstH*4*sizeof(int16_t)); - c->chrMmxFilter= (int16_t*)memalign(8, c->vChrFilterSize*c->chrDstH*4*sizeof(int16_t)); - for(i=0; i<c->vLumFilterSize*dstH; i++) - c->lumMmxFilter[4*i]=c->lumMmxFilter[4*i+1]=c->lumMmxFilter[4*i+2]=c->lumMmxFilter[4*i+3]= - c->vLumFilter[i]; - for(i=0; i<c->vChrFilterSize*c->chrDstH; i++) - c->chrMmxFilter[4*i]=c->chrMmxFilter[4*i+1]=c->chrMmxFilter[4*i+2]=c->chrMmxFilter[4*i+3]= - c->vChrFilter[i]; - } - if(flags&SWS_PRINT_INFO) { #ifdef DITHER1XBPP @@ -2668,11 +2664,6 @@ void freeSwsContext(SwsContext *c){ if(c->hChrFilterPos) free(c->hChrFilterPos); c->hChrFilterPos = NULL; - if(c->lumMmxFilter) free(c->lumMmxFilter); - c->lumMmxFilter = NULL; - if(c->chrMmxFilter) free(c->chrMmxFilter); - c->chrMmxFilter = NULL; - if(c->lumMmx2Filter) free(c->lumMmx2Filter); c->lumMmx2Filter=NULL; if(c->chrMmx2Filter) free(c->chrMmx2Filter); diff --git a/postproc/swscale.h b/postproc/swscale.h index 54f8c71ba0..a03927cd57 100644 --- a/postproc/swscale.h +++ b/postproc/swscale.h @@ -44,6 +44,7 @@ #define SWS_FULL_CHR_H_INP 0x4000 #define SWS_DIRECT_BGR 0x8000 +#define MAX_FILTER_SIZE 256 #define SWS_MAX_REDUCE_CUTOFF 0.002 @@ -70,9 +71,6 @@ typedef struct SwsContext{ int16_t *vChrFilter; int16_t *vChrFilterPos; -// Contain simply the values from v(Lum|Chr)Filter just nicely packed for mmx - int16_t *lumMmxFilter; - int16_t *chrMmxFilter; uint8_t formatConvBuffer[4000]; //FIXME dynamic alloc, but we have to change alot of code for this to be usefull int hLumFilterSize; @@ -105,8 +103,40 @@ typedef struct SwsContext{ void (*swScale)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t* dst[], int dstStride[]); + +#define RED_DITHER "0*8" +#define GREEN_DITHER "1*8" +#define BLUE_DITHER "2*8" +#define Y_COEFF "3*8" +#define VR_COEFF "4*8" +#define UB_COEFF "5*8" +#define VG_COEFF "6*8" +#define UG_COEFF "7*8" +#define Y_OFFSET "8*8" +#define U_OFFSET "9*8" +#define V_OFFSET "10*8" +#define LUM_MMX_FILTER_OFFSET "11*8" +#define CHR_MMX_FILTER_OFFSET "11*8+4*4*256" + + uint64_t redDither __attribute__((aligned(8))); + uint64_t greenDither __attribute__((aligned(8))); + uint64_t blueDither __attribute__((aligned(8))); + + uint64_t yCoeff __attribute__((aligned(8))); + uint64_t vrCoeff __attribute__((aligned(8))); + uint64_t ubCoeff __attribute__((aligned(8))); + uint64_t vgCoeff __attribute__((aligned(8))); + uint64_t ugCoeff __attribute__((aligned(8))); + uint64_t yOffset __attribute__((aligned(8))); + uint64_t uOffset __attribute__((aligned(8))); + uint64_t vOffset __attribute__((aligned(8))); + int32_t lumMmxFilter[4*MAX_FILTER_SIZE]; + int32_t chrMmxFilter[4*MAX_FILTER_SIZE]; + } SwsContext; //FIXME check init (where 0) +//FIXME split private & public + // when used for filters they must have an odd number of elements // coeffs cannot be shared between vectors diff --git a/postproc/swscale_template.c b/postproc/swscale_template.c index 092f11b3a4..dc8755860a 100644 --- a/postproc/swscale_template.c +++ b/postproc/swscale_template.c @@ -59,32 +59,35 @@ #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t" #endif -#define YSCALEYUV2YV12X(x) \ +#define YSCALEYUV2YV12X(x, offset) \ "xorl %%eax, %%eax \n\t"\ "pxor %%mm3, %%mm3 \n\t"\ "pxor %%mm4, %%mm4 \n\t"\ - "movl %0, %%edx \n\t"\ + "leal " offset "(%0), %%edx \n\t"\ + "movl (%%edx), %%esi \n\t"\ ".balign 16 \n\t" /* FIXME Unroll? */\ "1: \n\t"\ - "movl (%1, %%edx, 4), %%esi \n\t"\ - "movq (%2, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\ + "movq 8(%%edx), %%mm0 \n\t" /* filterCoeff */\ "movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t" /* srcData */\ "movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData */\ + "addl $16, %%edx \n\t"\ + "movl (%%edx), %%esi \n\t"\ + "testl %%esi, %%esi \n\t"\ "pmulhw %%mm0, %%mm2 \n\t"\ "pmulhw %%mm0, %%mm5 \n\t"\ "paddw %%mm2, %%mm3 \n\t"\ "paddw %%mm5, %%mm4 \n\t"\ - "addl $1, %%edx \n\t"\ " jnz 1b \n\t"\ "psraw $3, %%mm3 \n\t"\ "psraw $3, %%mm4 \n\t"\ "packuswb %%mm4, %%mm3 \n\t"\ - MOVNTQ(%%mm3, (%3, %%eax))\ + MOVNTQ(%%mm3, (%1, %%eax))\ "addl $8, %%eax \n\t"\ - "cmpl %4, %%eax \n\t"\ + "cmpl %2, %%eax \n\t"\ "pxor %%mm3, %%mm3 \n\t"\ "pxor %%mm4, %%mm4 \n\t"\ - "movl %0, %%edx \n\t"\ + "leal " offset "(%0), %%edx \n\t"\ + "movl (%%edx), %%esi \n\t"\ "jb 1b \n\t" #define YSCALEYUV2YV121 \ @@ -110,57 +113,60 @@ #define YSCALEYUV2PACKEDX \ "xorl %%eax, %%eax \n\t"\ ".balign 16 \n\t"\ + "nop \n\t"\ "1: \n\t"\ - "movl %1, %%edx \n\t" /* -chrFilterSize */\ - "movl %3, %%ebx \n\t" /* chrMmxFilter+chrFilterSize */\ - "movl %7, %%ecx \n\t" /* chrSrc+chrFilterSize */\ + "leal "CHR_MMX_FILTER_OFFSET"(%0), %%edx \n\t"\ + "movl (%%edx), %%esi \n\t"\ "pxor %%mm3, %%mm3 \n\t"\ "pxor %%mm4, %%mm4 \n\t"\ + ".balign 16 \n\t"\ "2: \n\t"\ - "movl (%%ecx, %%edx, 4), %%esi \n\t"\ - "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\ + "movq 8(%%edx), %%mm0 \n\t" /* filterCoeff */\ "movq (%%esi, %%eax), %%mm2 \n\t" /* UsrcData */\ "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\ + "addl $16, %%edx \n\t"\ + "movl (%%edx), %%esi \n\t"\ "pmulhw %%mm0, %%mm2 \n\t"\ "pmulhw %%mm0, %%mm5 \n\t"\ "paddw %%mm2, %%mm3 \n\t"\ "paddw %%mm5, %%mm4 \n\t"\ - "addl $1, %%edx \n\t"\ + "testl %%esi, %%esi \n\t"\ " jnz 2b \n\t"\ \ - "movl %0, %%edx \n\t" /* -lumFilterSize */\ - "movl %2, %%ebx \n\t" /* lumMmxFilter+lumFilterSize */\ - "movl %6, %%ecx \n\t" /* lumSrc+lumFilterSize */\ + "leal "LUM_MMX_FILTER_OFFSET"(%0), %%edx \n\t"\ + "movl (%%edx), %%esi \n\t"\ "pxor %%mm1, %%mm1 \n\t"\ "pxor %%mm7, %%mm7 \n\t"\ + ".balign 16 \n\t"\ "2: \n\t"\ - "movl (%%ecx, %%edx, 4), %%esi \n\t"\ - "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\ + "movq 8(%%edx), %%mm0 \n\t" /* filterCoeff */\ "movq (%%esi, %%eax, 2), %%mm2 \n\t" /* Y1srcData */\ "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\ + "addl $16, %%edx \n\t"\ + "movl (%%edx), %%esi \n\t"\ "pmulhw %%mm0, %%mm2 \n\t"\ "pmulhw %%mm0, %%mm5 \n\t"\ "paddw %%mm2, %%mm1 \n\t"\ "paddw %%mm5, %%mm7 \n\t"\ - "addl $1, %%edx \n\t"\ + "testl %%esi, %%esi \n\t"\ " jnz 2b \n\t"\ #define YSCALEYUV2RGBX \ YSCALEYUV2PACKEDX\ - "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\ - "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\ + "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\ + "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\ "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ - "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\ - "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\ + "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ + "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ - "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\ - "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\ - "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\ - "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\ - "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\ - "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\ + "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ + "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ + "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\ + "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\ + "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ + "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ "paddw %%mm3, %%mm4 \n\t"\ "movq %%mm2, %%mm0 \n\t"\ @@ -183,7 +189,7 @@ "packuswb %%mm6, %%mm5 \n\t"\ "packuswb %%mm3, %%mm4 \n\t"\ "pxor %%mm7, %%mm7 \n\t" - +#if 0 #define FULL_YSCALEYUV2RGB \ "pxor %%mm7, %%mm7 \n\t"\ "movd %6, %%mm6 \n\t" /*yalpha1*/\ @@ -236,6 +242,7 @@ "paddw %%mm2, %%mm1 \n\t" /* G*/\ \ "packuswb %%mm1, %%mm1 \n\t" +#endif #define YSCALEYUV2PACKED \ "movd %6, %%mm6 \n\t" /*yalpha1*/\ @@ -742,33 +749,34 @@ " jb 1b \n\t" -static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, +static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW, - int16_t * lumMmxFilter, int16_t * chrMmxFilter) + int32_t * lumMmxFilter, int32_t * chrMmxFilter) { + int dummy=0; #ifdef HAVE_MMX if(uDest != NULL) { asm volatile( - YSCALEYUV2YV12X(0) - :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize), - "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (chrDstW) + YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET) + :: "r" (&c->redDither), + "r" (uDest), "m" (chrDstW) : "%eax", "%edx", "%esi" ); asm volatile( - YSCALEYUV2YV12X(4096) - :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize), - "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (chrDstW) + YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET) + :: "r" (&c->redDither), + "r" (vDest), "m" (chrDstW) : "%eax", "%edx", "%esi" ); } asm volatile( - YSCALEYUV2YV12X(0) - :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize), - "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW) + YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET) + :: "r" (&c->redDither), + "r" (dest), "m" (dstW) : "%eax", "%edx", "%esi" ); #else @@ -844,8 +852,9 @@ static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc, */ static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, - uint8_t *dest, int dstW, int16_t * lumMmxFilter, int16_t * chrMmxFilter, int dstY) + uint8_t *dest, int dstW, int dstY) { + int dummy=0; switch(c->dstFormat) { #ifdef HAVE_MMX @@ -855,11 +864,10 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_ YSCALEYUV2RGBX WRITEBGR32 - :: "m" (-lumFilterSize), "m" (-chrFilterSize), - "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), - "r" (dest), "m" (dstW), - "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) - : "%eax", "%ebx", "%ecx", "%edx", "%esi" + :: "r" (&c->redDither), + "m" (dummy), "m" (dummy), "m" (dummy), + "r" (dest), "m" (dstW) + : "%eax", "%edx", "%esi" ); } break; @@ -871,11 +879,10 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_ "addl %4, %%ebx \n\t" WRITEBGR24 - :: "m" (-lumFilterSize), "m" (-chrFilterSize), - "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), - "r" (dest), "m" (dstW), - "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) - : "%eax", "%ebx", "%ecx", "%edx", "%esi" + :: "r" (&c->redDither), + "m" (dummy), "m" (dummy), "m" (dummy), + "r" (dest), "m" (dstW) + : "%eax", "%ebx", "%edx", "%esi" //FIXME ebx ); } break; @@ -892,11 +899,10 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_ WRITEBGR15 - :: "m" (-lumFilterSize), "m" (-chrFilterSize), - "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), - "r" (dest), "m" (dstW), - "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) - : "%eax", "%ebx", "%ecx", "%edx", "%esi" + :: "r" (&c->redDither), + "m" (dummy), "m" (dummy), "m" (dummy), + "r" (dest), "m" (dstW) + : "%eax", "%edx", "%esi" ); } break; @@ -913,11 +919,10 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_ WRITEBGR16 - :: "m" (-lumFilterSize), "m" (-chrFilterSize), - "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), - "r" (dest), "m" (dstW), - "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) - : "%eax", "%ebx", "%ecx", "%edx", "%esi" + :: "r" (&c->redDither), + "m" (dummy), "m" (dummy), "m" (dummy), + "r" (dest), "m" (dstW) + : "%eax", "%edx", "%esi" ); } break; @@ -933,11 +938,10 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_ "psraw $3, %%mm7 \n\t" WRITEYUY2 - :: "m" (-lumFilterSize), "m" (-chrFilterSize), - "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), - "r" (dest), "m" (dstW), - "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) - : "%eax", "%ebx", "%ecx", "%edx", "%esi" + :: "r" (&c->redDither), + "m" (dummy), "m" (dummy), "m" (dummy), + "r" (dest), "m" (dstW) + : "%eax", "%edx", "%esi" ); } break; @@ -2528,8 +2532,8 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar int16_t *vChrFilter= c->vChrFilter; int16_t *hLumFilter= c->hLumFilter; int16_t *hChrFilter= c->hChrFilter; - int16_t *lumMmxFilter= c->lumMmxFilter; - int16_t *chrMmxFilter= c->chrMmxFilter; + int32_t *lumMmxFilter= c->lumMmxFilter; + int32_t *chrMmxFilter= c->chrMmxFilter; const int vLumFilterSize= c->vLumFilterSize; const int vChrFilterSize= c->vChrFilterSize; const int hLumFilterSize= c->hLumFilterSize; @@ -2729,11 +2733,28 @@ i--; { int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; - RENAME(yuv2yuvX)( + int i; +#ifdef HAVE_MMX + for(i=0; i<vLumFilterSize; i++) + { + lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i]; + lumMmxFilter[4*i+2]= + lumMmxFilter[4*i+3]= + ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; + } + for(i=0; i<vChrFilterSize; i++) + { + chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i]; + chrMmxFilter[4*i+2]= + chrMmxFilter[4*i+3]= + ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001; + } +#endif + RENAME(yuv2yuvX)(c, vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, dest, uDest, vDest, dstW, chrDstW, - lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+chrDstY*vChrFilterSize*4); + lumMmxFilter, chrMmxFilter); } } else @@ -2760,11 +2781,27 @@ i--; } else //General RGB { + int i; +#ifdef HAVE_MMX + for(i=0; i<vLumFilterSize; i++) + { + lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i]; + lumMmxFilter[4*i+2]= + lumMmxFilter[4*i+3]= + ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; + } + for(i=0; i<vChrFilterSize; i++) + { + chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i]; + chrMmxFilter[4*i+2]= + chrMmxFilter[4*i+3]= + ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001; + } +#endif RENAME(yuv2packedX)(c, vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, - dest, dstW, - lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4, dstY); + dest, dstW, dstY); } } } |