diff options
author | arpi <arpi@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2002-06-22 08:49:45 +0000 |
---|---|---|
committer | arpi <arpi@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2002-06-22 08:49:45 +0000 |
commit | d3c753359a7fa62f6b1d46d43d8eb0ad293df443 (patch) | |
tree | a08eca170c304ab9621cdc31f4056809db1ea873 | |
parent | 1bf774801032825fe68235eb766aaede5c14e046 (diff) | |
download | mpv-d3c753359a7fa62f6b1d46d43d8eb0ad293df443.tar.bz2 mpv-d3c753359a7fa62f6b1d46d43d8eb0ad293df443.tar.xz |
sync with mplayer xp
- partial yvu9 support (copy only)
- rgb 15/16 -> 24/32 converters
- int->unsigned changes
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@6493 b3059339-0415-0410-9bf9-f77b7e298cf2
-rw-r--r-- | postproc/rgb2rgb.c | 124 | ||||
-rw-r--r-- | postproc/rgb2rgb.h | 19 | ||||
-rw-r--r-- | postproc/rgb2rgb_template.c | 690 | ||||
-rw-r--r-- | postproc/swscale.c | 531 | ||||
-rw-r--r-- | postproc/swscale_template.c | 2 | ||||
-rw-r--r-- | postproc/yuv2rgb.c | 12 | ||||
-rw-r--r-- | postproc/yuv2rgb_mlib.c | 14 | ||||
-rw-r--r-- | postproc/yuv2rgb_template.c | 18 |
8 files changed, 1123 insertions, 287 deletions
diff --git a/postproc/rgb2rgb.c b/postproc/rgb2rgb.c index 91983bea0a..962a58945f 100644 --- a/postproc/rgb2rgb.c +++ b/postproc/rgb2rgb.c @@ -20,6 +20,8 @@ #define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit #ifdef CAN_COMPILE_X86_ASM +static const uint64_t mmx_null __attribute__((aligned(8))) = 0x0000000000000000ULL; +static const uint64_t mmx_one __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL; static const uint64_t mask32b __attribute__((aligned(8))) = 0x000000FF000000FFULL; static const uint64_t mask32g __attribute__((aligned(8))) = 0x0000FF000000FF00ULL; static const uint64_t mask32r __attribute__((aligned(8))) = 0x00FF000000FF0000ULL; @@ -35,6 +37,11 @@ static const uint64_t mask24hhhh __attribute__((aligned(8))) = 0xffffffffffff00 static const uint64_t mask15b __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */ static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */ static const uint64_t mask15s __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL; +static const uint64_t mask15g __attribute__((aligned(8))) = 0x03E003E003E003E0ULL; +static const uint64_t mask15r __attribute__((aligned(8))) = 0x7C007C007C007C00ULL; +#define mask16b mask15b +static const uint64_t mask16g __attribute__((aligned(8))) = 0x07E007E007E007E0ULL; +static const uint64_t mask16r __attribute__((aligned(8))) = 0xF800F800F800F800ULL; static const uint64_t red_16mask __attribute__((aligned(8))) = 0x0000f8000000f800ULL; static const uint64_t green_16mask __attribute__((aligned(8)))= 0x000007e0000007e0ULL; static const uint64_t blue_16mask __attribute__((aligned(8))) = 0x0000001f0000001fULL; @@ -137,10 +144,68 @@ void rgb24to32(const uint8_t *src,uint8_t *dst,unsigned src_size) else if(gCpuCaps.hasMMX) rgb24to32_MMX(src, dst, src_size); else +#endif rgb24to32_C(src, dst, src_size); -#else - rgb24to32_C(src, dst, src_size); +} + +void rgb15to24(const uint8_t *src,uint8_t *dst,unsigned src_size) +{ +#ifdef CAN_COMPILE_X86_ASM + // ordered per speed fasterst first + if(gCpuCaps.hasMMX2) + rgb15to24_MMX2(src, dst, src_size); + else if(gCpuCaps.has3DNow) + rgb15to24_3DNow(src, dst, src_size); + else if(gCpuCaps.hasMMX) + rgb15to24_MMX(src, dst, src_size); + else #endif + rgb15to24_C(src, dst, src_size); +} + +void rgb16to24(const uint8_t *src,uint8_t *dst,unsigned src_size) +{ +#ifdef CAN_COMPILE_X86_ASM + // ordered per speed fasterst first + if(gCpuCaps.hasMMX2) + rgb16to24_MMX2(src, dst, src_size); + else if(gCpuCaps.has3DNow) + rgb16to24_3DNow(src, dst, src_size); + else if(gCpuCaps.hasMMX) + rgb16to24_MMX(src, dst, src_size); + else +#endif + rgb16to24_C(src, dst, src_size); +} + +void rgb15to32(const uint8_t *src,uint8_t *dst,unsigned src_size) +{ +#ifdef CAN_COMPILE_X86_ASM + // ordered per speed fasterst first + if(gCpuCaps.hasMMX2) + rgb15to32_MMX2(src, dst, src_size); + else if(gCpuCaps.has3DNow) + rgb15to32_3DNow(src, dst, src_size); + else if(gCpuCaps.hasMMX) + rgb15to32_MMX(src, dst, src_size); + else +#endif + rgb15to32_C(src, dst, src_size); +} + +void rgb16to32(const uint8_t *src,uint8_t *dst,unsigned src_size) +{ +#ifdef CAN_COMPILE_X86_ASM + // ordered per speed fasterst first + if(gCpuCaps.hasMMX2) + rgb16to32_MMX2(src, dst, src_size); + else if(gCpuCaps.has3DNow) + rgb16to32_3DNow(src, dst, src_size); + else if(gCpuCaps.hasMMX) + rgb16to32_MMX(src, dst, src_size); + else +#endif + rgb16to32_C(src, dst, src_size); } void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size) @@ -154,10 +219,8 @@ void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size) else if(gCpuCaps.hasMMX) rgb32to24_MMX(src, dst, src_size); else - rgb32to24_C(src, dst, src_size); -#else - rgb32to24_C(src, dst, src_size); #endif + rgb32to24_C(src, dst, src_size); } /* @@ -177,10 +240,8 @@ void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size) else if(gCpuCaps.hasMMX) rgb15to16_MMX(src, dst, src_size); else - rgb15to16_C(src, dst, src_size); -#else - rgb15to16_C(src, dst, src_size); #endif + rgb15to16_C(src, dst, src_size); } /** @@ -242,10 +303,8 @@ void rgb32to16(const uint8_t *src, uint8_t *dst, unsigned src_size) else if(gCpuCaps.hasMMX) rgb32to16_MMX(src, dst, src_size); else - rgb32to16_C(src, dst, src_size); -#else - rgb32to16_C(src, dst, src_size); #endif + rgb32to16_C(src, dst, src_size); } void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size) @@ -259,10 +318,8 @@ void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size) else if(gCpuCaps.hasMMX) rgb32to15_MMX(src, dst, src_size); else - rgb32to15_C(src, dst, src_size); -#else - rgb32to15_C(src, dst, src_size); #endif + rgb32to15_C(src, dst, src_size); } void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size) @@ -276,10 +333,8 @@ void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size) else if(gCpuCaps.hasMMX) rgb24to16_MMX(src, dst, src_size); else - rgb24to16_C(src, dst, src_size); -#else - rgb24to16_C(src, dst, src_size); #endif + rgb24to16_C(src, dst, src_size); } void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size) @@ -293,10 +348,8 @@ void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size) else if(gCpuCaps.hasMMX) rgb24to15_MMX(src, dst, src_size); else - rgb24to15_C(src, dst, src_size); -#else - rgb24to15_C(src, dst, src_size); #endif + rgb24to15_C(src, dst, src_size); } /** @@ -330,10 +383,8 @@ void rgb32tobgr32(const uint8_t *src, uint8_t *dst, unsigned int src_size) else if(gCpuCaps.hasMMX) rgb32tobgr32_MMX(src, dst, src_size); else - rgb32tobgr32_C(src, dst, src_size); -#else - rgb32tobgr32_C(src, dst, src_size); #endif + rgb32tobgr32_C(src, dst, src_size); } void rgb24tobgr24(const uint8_t *src, uint8_t *dst, unsigned int src_size) @@ -347,10 +398,8 @@ void rgb24tobgr24(const uint8_t *src, uint8_t *dst, unsigned int src_size) else if(gCpuCaps.hasMMX) rgb24tobgr24_MMX(src, dst, src_size); else - rgb24tobgr24_C(src, dst, src_size); -#else - rgb24tobgr24_C(src, dst, src_size); #endif + rgb24tobgr24_C(src, dst, src_size); } /** @@ -371,10 +420,8 @@ void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, u else if(gCpuCaps.hasMMX) yv12toyuy2_MMX(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride); else - yv12toyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride); -#else - yv12toyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride); #endif + yv12toyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride); } /** @@ -394,10 +441,8 @@ void yuv422ptoyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc else if(gCpuCaps.hasMMX) yuv422ptoyuy2_MMX(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride); else - yuv422ptoyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride); -#else - yuv422ptoyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride); #endif + yuv422ptoyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride); } /** @@ -418,10 +463,8 @@ void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, else if(gCpuCaps.hasMMX) yuy2toyv12_MMX(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride); else - yuy2toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride); -#else - yuy2toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride); #endif + yuy2toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride); } /** @@ -488,14 +531,13 @@ void rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst else if(gCpuCaps.hasMMX) rgb24toyv12_MMX(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride); else - rgb24toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride); -#else - rgb24toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride); #endif + rgb24toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride); } void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst, - int width, int height, int src1Stride, int src2Stride, int dstStride) + unsigned width, unsigned height, unsigned src1Stride, + unsigned src2Stride, unsigned dstStride) { #ifdef CAN_COMPILE_X86_ASM // ordered per speed fasterst first @@ -506,8 +548,6 @@ void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst, else if(gCpuCaps.hasMMX) interleaveBytes_MMX(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride); else - interleaveBytes_C(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride); -#else - interleaveBytes_C(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride); #endif + interleaveBytes_C(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride); } diff --git a/postproc/rgb2rgb.h b/postproc/rgb2rgb.h index fb4f04590d..9fb6da6ef1 100644 --- a/postproc/rgb2rgb.h +++ b/postproc/rgb2rgb.h @@ -10,12 +10,16 @@ #define RGB2RGB_INCLUDED extern void rgb24to32(const uint8_t *src,uint8_t *dst,unsigned src_size); +extern void rgb24to16(const uint8_t *src,uint8_t *dst,unsigned src_size); +extern void rgb24to15(const uint8_t *src,uint8_t *dst,unsigned src_size); extern void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size); -extern void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size); extern void rgb32to16(const uint8_t *src,uint8_t *dst,unsigned src_size); extern void rgb32to15(const uint8_t *src,uint8_t *dst,unsigned src_size); -extern void rgb24to16(const uint8_t *src,uint8_t *dst,unsigned src_size); -extern void rgb24to15(const uint8_t *src,uint8_t *dst,unsigned src_size); +extern void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size); +extern void rgb15to24(const uint8_t *src,uint8_t *dst,unsigned src_size); +extern void rgb15to32(const uint8_t *src,uint8_t *dst,unsigned src_size); +extern void rgb16to24(const uint8_t *src,uint8_t *dst,unsigned src_size); +extern void rgb16to32(const uint8_t *src,uint8_t *dst,unsigned src_size); extern void rgb32tobgr32(const uint8_t *src, uint8_t *dst, unsigned src_size); extern void rgb24tobgr24(const uint8_t *src, uint8_t *dst, unsigned src_size); @@ -39,7 +43,8 @@ extern void rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_ unsigned int lumStride, unsigned int chromStride, unsigned int srcStride); extern void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst, - int width, int height, int src1Stride, int src2Stride, int dstStride); + unsigned width, unsigned height, unsigned src1Stride, + unsigned src2Stride, unsigned dstStride); #define MODE_RGB 0x1 @@ -47,11 +52,11 @@ extern void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst, typedef void (* yuv2rgb_fun) (uint8_t * image, uint8_t * py, uint8_t * pu, uint8_t * pv, - int h_size, int v_size, - int rgb_stride, int y_stride, int uv_stride); + unsigned h_size, unsigned v_size, + unsigned rgb_stride, unsigned y_stride, unsigned uv_stride); extern yuv2rgb_fun yuv2rgb; -void yuv2rgb_init (int bpp, int mode); +void yuv2rgb_init (unsigned bpp, int mode); #endif diff --git a/postproc/rgb2rgb_template.c b/postproc/rgb2rgb_template.c index 9d59eabc70..015e7f2d56 100644 --- a/postproc/rgb2rgb_template.c +++ b/postproc/rgb2rgb_template.c @@ -8,6 +8,13 @@ * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL) */ +#include <stddef.h> +#include <inttypes.h> /* for __WORDSIZE */ + +#ifndef __WORDSIZE +#warning You have misconfigured system and probably will lose performance! +#endif + #undef PREFETCH #undef MOVNTQ #undef EMMS @@ -56,13 +63,13 @@ static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned sr const uint8_t *s = src; const uint8_t *end; #ifdef HAVE_MMX - const uint8_t *mm_end; + uint8_t *mm_end; #endif end = s + src_size; #ifdef HAVE_MMX __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); - mm_end = end - 23; __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory"); + mm_end = (uint8_t*)((((unsigned long)end)/24)*24); while(s < mm_end) { __asm __volatile( @@ -107,12 +114,12 @@ static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned sr const uint8_t *s = src; const uint8_t *end; #ifdef HAVE_MMX - const uint8_t *mm_end; + uint8_t *mm_end; #endif end = s + src_size; #ifdef HAVE_MMX __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); - mm_end = end - 31; + mm_end = (uint8_t*)((((unsigned long)end)/32)*32); while(s < mm_end) { __asm __volatile( @@ -186,15 +193,16 @@ static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned sr */ static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size) { + register const uint8_t* s=src; + register uint8_t* d=dst; + register const uint8_t *end; + uint8_t *mm_end; + end = s + src_size; #ifdef HAVE_MMX - register int offs=15-src_size; - register const char* s=src-offs; - register char* d=dst-offs; - __asm __volatile(PREFETCH" %0"::"m"(*(s+offs))); - __asm __volatile( - "movq %0, %%mm4\n\t" - ::"m"(mask15s)); - while(offs<0) + __asm __volatile(PREFETCH" %0"::"m"(*s)); + __asm __volatile("movq %0, %%mm4"::"m"(mask15s)); + mm_end = (uint8_t*)((((unsigned long)end)/16)*16); + while(s<mm_end) { __asm __volatile( PREFETCH" 32%1\n\t" @@ -208,40 +216,28 @@ static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned sr "paddw %%mm3, %%mm2\n\t" MOVNTQ" %%mm0, %0\n\t" MOVNTQ" %%mm2, 8%0" - :"=m"(*(d+offs)) - :"m"(*(s+offs)) + :"=m"(*d) + :"m"(*s) ); - offs+=16; + d+=16; + s+=16; } __asm __volatile(SFENCE:::"memory"); __asm __volatile(EMMS:::"memory"); -#else -#if 0 - const uint16_t *s1=( uint16_t * )src; - uint16_t *d1=( uint16_t * )dst; - uint16_t *e=((uint8_t *)s1)+src_size; - while( s1<e ){ - register int x=*( s1++ ); - /* rrrrrggggggbbbbb - 0rrrrrgggggbbbbb - 0111 1111 1110 0000=0x7FE0 - 00000000000001 1111=0x001F */ - *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 ); - } -#else - const unsigned *s1=( unsigned * )src; - unsigned *d1=( unsigned * )dst; - int i; - int size= src_size>>2; - for(i=0; i<size; i++) - { - register int x= s1[i]; -// d1[i] = x + (x&0x7FE07FE0); //faster but need msbit =0 which might not allways be true - d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0); - - } -#endif #endif + mm_end = (uint8_t*)((((unsigned long)end)/4)*4); + while(s < mm_end) + { + register unsigned x= *((uint32_t *)s); + *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0); + d+=4; + s+=4; + } + if(s < end) + { + register unsigned short x= *((uint16_t *)s); + *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0); + } } static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size) @@ -257,17 +253,20 @@ static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsign static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size) { -#ifdef HAVE_MMX const uint8_t *s = src; - const uint8_t *end,*mm_end; + const uint8_t *end; +#ifdef HAVE_MMX + const uint8_t *mm_end; +#endif uint16_t *d = (uint16_t *)dst; end = s + src_size; - mm_end = end - 15; +#ifdef HAVE_MMX __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); __asm __volatile( "movq %0, %%mm7\n\t" "movq %1, %%mm6\n\t" ::"m"(red_16mask),"m"(green_16mask)); + mm_end = (uint8_t*)((((unsigned long)end)/16)*16); while(s < mm_end) { __asm __volatile( @@ -303,43 +302,35 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned d += 4; s += 16; } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); +#endif while(s < end) { const int b= *s++; const int g= *s++; const int r= *s++; - s++; *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); + s++; } - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); -#else - unsigned j,i,num_pixels=src_size/4; - uint16_t *d = (uint16_t *)dst; - for(i=0,j=0; j<num_pixels; i+=4,j++) - { - const int b= src[i+0]; - const int g= src[i+1]; - const int r= src[i+2]; - - d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); - } -#endif } static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size) { -#ifdef HAVE_MMX const uint8_t *s = src; - const uint8_t *end,*mm_end; + const uint8_t *end; +#ifdef HAVE_MMX + const uint8_t *mm_end; +#endif uint16_t *d = (uint16_t *)dst; end = s + src_size; - mm_end = end - 15; +#ifdef HAVE_MMX __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); __asm __volatile( "movq %0, %%mm7\n\t" "movq %1, %%mm6\n\t" ::"m"(red_15mask),"m"(green_15mask)); + mm_end = (uint8_t*)((((unsigned long)end)/16)*16); while(s < mm_end) { __asm __volatile( @@ -375,43 +366,35 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned d += 4; s += 16; } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); +#endif while(s < end) { const int b= *s++; const int g= *s++; const int r= *s++; - s++; *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); + s++; } - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); -#else - unsigned j,i,num_pixels=src_size/4; - uint16_t *d = (uint16_t *)dst; - for(i=0,j=0; j<num_pixels; i+=4,j++) - { - const int b= src[i+0]; - const int g= src[i+1]; - const int r= src[i+2]; - - d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); - } -#endif } static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size) { -#ifdef HAVE_MMX const uint8_t *s = src; - const uint8_t *end,*mm_end; + const uint8_t *end; +#ifdef HAVE_MMX + const uint8_t *mm_end; +#endif uint16_t *d = (uint16_t *)dst; end = s + src_size; - mm_end = end - 11; +#ifdef HAVE_MMX __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); __asm __volatile( "movq %0, %%mm7\n\t" "movq %1, %%mm6\n\t" ::"m"(red_16mask),"m"(green_16mask)); + mm_end = (uint8_t*)((((unsigned long)end)/16)*16); while(s < mm_end) { __asm __volatile( @@ -447,6 +430,9 @@ static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned d += 4; s += 12; } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); +#endif while(s < end) { const int b= *s++; @@ -454,35 +440,24 @@ static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned const int r= *s++; *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); } - __asm __volatile(SFENCE:::"memory"); - __asm __volatile(EMMS:::"memory"); -#else - unsigned j,i,num_pixels=src_size/3; - uint16_t *d = (uint16_t *)dst; - for(i=0,j=0; j<num_pixels; i+=3,j++) - { - const int b= src[i+0]; - const int g= src[i+1]; - const int r= src[i+2]; - - d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); - } -#endif } static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size) { -#ifdef HAVE_MMX const uint8_t *s = src; - const uint8_t *end,*mm_end; + const uint8_t *end; +#ifdef HAVE_MMX + const uint8_t *mm_end; +#endif uint16_t *d = (uint16_t *)dst; end = s + src_size; - mm_end = end -11; +#ifdef HAVE_MMX __asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); __asm __volatile( "movq %0, %%mm7\n\t" "movq %1, %%mm6\n\t" ::"m"(red_15mask),"m"(green_15mask)); + mm_end = (uint8_t*)((((unsigned long)end)/16)*16); while(s < mm_end) { __asm __volatile( @@ -518,6 +493,9 @@ static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned d += 4; s += 12; } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); +#endif while(s < end) { const int b= *s++; @@ -525,25 +503,448 @@ static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned const int r= *s++; *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); } +} + +/* + I use here less accurate approximation by simply + left-shifting the input + value and filling the low order bits with + zeroes. This method improves png's + compression but this scheme cannot reproduce white exactly, since it does not + generate an all-ones maximum value; the net effect is to darken the + image slightly. + + The better method should be "left bit replication": + + 4 3 2 1 0 + --------- + 1 1 0 1 1 + + 7 6 5 4 3 2 1 0 + ---------------- + 1 1 0 1 1 1 1 0 + |=======| |===| + | Leftmost Bits Repeated to Fill Open Bits + | + Original Bits +*/ +static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size) +{ + const uint16_t *end; +#ifdef HAVE_MMX + const uint16_t *mm_end; +#endif + uint8_t *d = (uint8_t *)dst; + const uint16_t *s = (uint16_t *)src; + end = s + src_size/2; +#ifdef HAVE_MMX + __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); + mm_end = (uint16_t*)((((unsigned long)end)/8)*8); + while(s < mm_end) + { + __asm __volatile( + PREFETCH" 32%1\n\t" + "movq %1, %%mm0\n\t" + "movq %1, %%mm1\n\t" + "movq %1, %%mm2\n\t" + "pand %2, %%mm0\n\t" + "pand %3, %%mm1\n\t" + "pand %4, %%mm2\n\t" + "psllq $3, %%mm0\n\t" + "psrlq $2, %%mm1\n\t" + "psrlq $7, %%mm2\n\t" + "movq %%mm0, %%mm3\n\t" + "movq %%mm1, %%mm4\n\t" + "movq %%mm2, %%mm5\n\t" + "punpcklwd %5, %%mm0\n\t" + "punpcklwd %5, %%mm1\n\t" + "punpcklwd %5, %%mm2\n\t" + "punpckhwd %5, %%mm3\n\t" + "punpckhwd %5, %%mm4\n\t" + "punpckhwd %5, %%mm5\n\t" + "psllq $8, %%mm1\n\t" + "psllq $16, %%mm2\n\t" + "por %%mm1, %%mm0\n\t" + "por %%mm2, %%mm0\n\t" + "psllq $8, %%mm4\n\t" + "psllq $16, %%mm5\n\t" + "por %%mm4, %%mm3\n\t" + "por %%mm5, %%mm3\n\t" + + "movq %%mm0, %%mm6\n\t" + "movq %%mm3, %%mm7\n\t" + + "movq 8%1, %%mm0\n\t" + "movq 8%1, %%mm1\n\t" + "movq 8%1, %%mm2\n\t" + "pand %2, %%mm0\n\t" + "pand %3, %%mm1\n\t" + "pand %4, %%mm2\n\t" + "psllq $3, %%mm0\n\t" + "psrlq $2, %%mm1\n\t" + "psrlq $7, %%mm2\n\t" + "movq %%mm0, %%mm3\n\t" + "movq %%mm1, %%mm4\n\t" + "movq %%mm2, %%mm5\n\t" + "punpcklwd %5, %%mm0\n\t" + "punpcklwd %5, %%mm1\n\t" + "punpcklwd %5, %%mm2\n\t" + "punpckhwd %5, %%mm3\n\t" + "punpckhwd %5, %%mm4\n\t" + "punpckhwd %5, %%mm5\n\t" + "psllq $8, %%mm1\n\t" + "psllq $16, %%mm2\n\t" + "por %%mm1, %%mm0\n\t" + "por %%mm2, %%mm0\n\t" + "psllq $8, %%mm4\n\t" + "psllq $16, %%mm5\n\t" + "por %%mm4, %%mm3\n\t" + "por %%mm5, %%mm3\n\t" + + :"=m"(*d) + :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) + :"memory"); + /* Borrowed 32 to 24 */ + __asm __volatile( + "movq %%mm0, %%mm4\n\t" + "movq %%mm3, %%mm5\n\t" + "movq %%mm6, %%mm0\n\t" + "movq %%mm7, %%mm1\n\t" + + "movq %%mm4, %%mm6\n\t" + "movq %%mm5, %%mm7\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + + "psrlq $8, %%mm2\n\t" + "psrlq $8, %%mm3\n\t" + "psrlq $8, %%mm6\n\t" + "psrlq $8, %%mm7\n\t" + "pand %2, %%mm0\n\t" + "pand %2, %%mm1\n\t" + "pand %2, %%mm4\n\t" + "pand %2, %%mm5\n\t" + "pand %3, %%mm2\n\t" + "pand %3, %%mm3\n\t" + "pand %3, %%mm6\n\t" + "pand %3, %%mm7\n\t" + "por %%mm2, %%mm0\n\t" + "por %%mm3, %%mm1\n\t" + "por %%mm6, %%mm4\n\t" + "por %%mm7, %%mm5\n\t" + + "movq %%mm1, %%mm2\n\t" + "movq %%mm4, %%mm3\n\t" + "psllq $48, %%mm2\n\t" + "psllq $32, %%mm3\n\t" + "pand %4, %%mm2\n\t" + "pand %5, %%mm3\n\t" + "por %%mm2, %%mm0\n\t" + "psrlq $16, %%mm1\n\t" + "psrlq $32, %%mm4\n\t" + "psllq $16, %%mm5\n\t" + "por %%mm3, %%mm1\n\t" + "pand %6, %%mm5\n\t" + "por %%mm5, %%mm4\n\t" + + MOVNTQ" %%mm0, %0\n\t" + MOVNTQ" %%mm1, 8%0\n\t" + MOVNTQ" %%mm4, 16%0" + + :"=m"(*d) + :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) + :"memory"); + d += 24; + s += 8; + } __asm __volatile(SFENCE:::"memory"); __asm __volatile(EMMS:::"memory"); -#else - unsigned j,i,num_pixels=src_size/3; - uint16_t *d = (uint16_t *)dst; - for(i=0,j=0; j<num_pixels; i+=3,j++) +#endif + while(s < end) + { + register uint16_t bgr; + bgr = *s++; + *d++ = (bgr&0x1F)<<3; + *d++ = (bgr&0x3E0)>>2; + *d++ = (bgr&0x7C00)>>7; + } +} + +static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size) +{ + const uint16_t *end; +#ifdef HAVE_MMX + const uint16_t *mm_end; +#endif + uint8_t *d = (uint8_t *)dst; + const uint16_t *s = (const uint16_t *)src; + end = s + src_size/2; +#ifdef HAVE_MMX + __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); + mm_end = (uint16_t*)((((unsigned long)end)/8)*8); + while(s < mm_end) + { + __asm __volatile( + PREFETCH" 32%1\n\t" + "movq %1, %%mm0\n\t" + "movq %1, %%mm1\n\t" + "movq %1, %%mm2\n\t" + "pand %2, %%mm0\n\t" + "pand %3, %%mm1\n\t" + "pand %4, %%mm2\n\t" + "psllq $3, %%mm0\n\t" + "psrlq $3, %%mm1\n\t" + "psrlq $8, %%mm2\n\t" + "movq %%mm0, %%mm3\n\t" + "movq %%mm1, %%mm4\n\t" + "movq %%mm2, %%mm5\n\t" + "punpcklwd %5, %%mm0\n\t" + "punpcklwd %5, %%mm1\n\t" + "punpcklwd %5, %%mm2\n\t" + "punpckhwd %5, %%mm3\n\t" + "punpckhwd %5, %%mm4\n\t" + "punpckhwd %5, %%mm5\n\t" + "psllq $8, %%mm1\n\t" + "psllq $16, %%mm2\n\t" + "por %%mm1, %%mm0\n\t" + "por %%mm2, %%mm0\n\t" + "psllq $8, %%mm4\n\t" + "psllq $16, %%mm5\n\t" + "por %%mm4, %%mm3\n\t" + "por %%mm5, %%mm3\n\t" + + "movq %%mm0, %%mm6\n\t" + "movq %%mm3, %%mm7\n\t" + + "movq 8%1, %%mm0\n\t" + "movq 8%1, %%mm1\n\t" + "movq 8%1, %%mm2\n\t" + "pand %2, %%mm0\n\t" + "pand %3, %%mm1\n\t" + "pand %4, %%mm2\n\t" + "psllq $3, %%mm0\n\t" + "psrlq $3, %%mm1\n\t" + "psrlq $8, %%mm2\n\t" + "movq %%mm0, %%mm3\n\t" + "movq %%mm1, %%mm4\n\t" + "movq %%mm2, %%mm5\n\t" + "punpcklwd %5, %%mm0\n\t" + "punpcklwd %5, %%mm1\n\t" + "punpcklwd %5, %%mm2\n\t" + "punpckhwd %5, %%mm3\n\t" + "punpckhwd %5, %%mm4\n\t" + "punpckhwd %5, %%mm5\n\t" + "psllq $8, %%mm1\n\t" + "psllq $16, %%mm2\n\t" + "por %%mm1, %%mm0\n\t" + "por %%mm2, %%mm0\n\t" + "psllq $8, %%mm4\n\t" + "psllq $16, %%mm5\n\t" + "por %%mm4, %%mm3\n\t" + "por %%mm5, %%mm3\n\t" + :"=m"(*d) + :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) + :"memory"); + /* Borrowed 32 to 24 */ + __asm __volatile( + "movq %%mm0, %%mm4\n\t" + "movq %%mm3, %%mm5\n\t" + "movq %%mm6, %%mm0\n\t" + "movq %%mm7, %%mm1\n\t" + + "movq %%mm4, %%mm6\n\t" + "movq %%mm5, %%mm7\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + + "psrlq $8, %%mm2\n\t" + "psrlq $8, %%mm3\n\t" + "psrlq $8, %%mm6\n\t" + "psrlq $8, %%mm7\n\t" + "pand %2, %%mm0\n\t" + "pand %2, %%mm1\n\t" + "pand %2, %%mm4\n\t" + "pand %2, %%mm5\n\t" + "pand %3, %%mm2\n\t" + "pand %3, %%mm3\n\t" + "pand %3, %%mm6\n\t" + "pand %3, %%mm7\n\t" + "por %%mm2, %%mm0\n\t" + "por %%mm3, %%mm1\n\t" + "por %%mm6, %%mm4\n\t" + "por %%mm7, %%mm5\n\t" + + "movq %%mm1, %%mm2\n\t" + "movq %%mm4, %%mm3\n\t" + "psllq $48, %%mm2\n\t" + "psllq $32, %%mm3\n\t" + "pand %4, %%mm2\n\t" + "pand %5, %%mm3\n\t" + "por %%mm2, %%mm0\n\t" + "psrlq $16, %%mm1\n\t" + "psrlq $32, %%mm4\n\t" + "psllq $16, %%mm5\n\t" + "por %%mm3, %%mm1\n\t" + "pand %6, %%mm5\n\t" + "por %%mm5, %%mm4\n\t" + + MOVNTQ" %%mm0, %0\n\t" + MOVNTQ" %%mm1, 8%0\n\t" + MOVNTQ" %%mm4, 16%0" + + :"=m"(*d) + :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) + :"memory"); + d += 24; + s += 8; + } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); +#endif + while(s < end) + { + register uint16_t bgr; + bgr = *s++; + *d++ = (bgr&0x1F)<<3; + *d++ = (bgr&0x7E0)>>3; + *d++ = (bgr&0xF800)>>8; + } +} + +static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size) +{ + const uint16_t *end; +#ifdef HAVE_MMX + const uint16_t *mm_end; +#endif + uint8_t *d = (uint8_t *)dst; + const uint16_t *s = (const uint16_t *)src; + end = s + src_size/2; +#ifdef HAVE_MMX + __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); + __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory"); + mm_end = (uint16_t*)((((unsigned long)end)/4)*4); + while(s < mm_end) + { + __asm __volatile( + PREFETCH" 32%1\n\t" + "movq %1, %%mm0\n\t" + "movq %1, %%mm1\n\t" + "movq %1, %%mm2\n\t" + "pand %2, %%mm0\n\t" + "pand %3, %%mm1\n\t" + "pand %4, %%mm2\n\t" + "psllq $3, %%mm0\n\t" + "psrlq $2, %%mm1\n\t" + "psrlq $7, %%mm2\n\t" + "movq %%mm0, %%mm3\n\t" + "movq %%mm1, %%mm4\n\t" + "movq %%mm2, %%mm5\n\t" + "punpcklwd %%mm7, %%mm0\n\t" + "punpcklwd %%mm7, %%mm1\n\t" + "punpcklwd %%mm7, %%mm2\n\t" + "punpckhwd %%mm7, %%mm3\n\t" + "punpckhwd %%mm7, %%mm4\n\t" + "punpckhwd %%mm7, %%mm5\n\t" + "psllq $8, %%mm1\n\t" + "psllq $16, %%mm2\n\t" + "por %%mm1, %%mm0\n\t" + "por %%mm2, %%mm0\n\t" + "psllq $8, %%mm4\n\t" + "psllq $16, %%mm5\n\t" + "por %%mm4, %%mm3\n\t" + "por %%mm5, %%mm3\n\t" + MOVNTQ" %%mm0, %0\n\t" + MOVNTQ" %%mm3, 8%0\n\t" + :"=m"(*d) + :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r) + :"memory"); + d += 16; + s += 4; + } + __asm __volatile(SFENCE:::"memory"); + __asm __volatile(EMMS:::"memory"); +#endif + while(s < end) { - const int b= src[i+0]; - const int g= src[i+1]; - const int r= src[i+2]; + register uint16_t bgr; + bgr = *s++; + *d++ = (bgr&0x1F)<<3; + *d++ = (bgr&0x3E0)>>2; + *d++ = (bgr&0x7C00)>>7; + *d++ = 0; + } +} - d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); +static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size) +{ + const uint16_t *end; +#ifdef HAVE_MMX + const uint16_t *mm_end; +#endif + uint8_t *d = (uint8_t *)dst; + const uint16_t *s = (uint16_t *)src; + end = s + src_size/2; +#ifdef HAVE_MMX + __asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); + __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory"); + mm_end = (uint16_t*)((((unsigned long)end)/4)*4); + while(s < mm_end) + { + __asm __volatile( + PREFETCH" 32%1\n\t" + "movq %1, %%mm0\n\t" + "movq %1, %%mm1\n\t" + "movq %1, %%mm2\n\t" + "pand %2, %%mm0\n\t" + "pand %3, %%mm1\n\t" + "pand %4, %%mm2\n\t" + "psllq $3, %%mm0\n\t" + "psrlq $3, %%mm1\n\t" + "psrlq |