summaryrefslogtreecommitdiffstats
path: root/postproc
diff options
context:
space:
mode:
authorarpi <arpi@b3059339-0415-0410-9bf9-f77b7e298cf2>2002-06-22 08:49:45 +0000
committerarpi <arpi@b3059339-0415-0410-9bf9-f77b7e298cf2>2002-06-22 08:49:45 +0000
commitd3c753359a7fa62f6b1d46d43d8eb0ad293df443 (patch)
treea08eca170c304ab9621cdc31f4056809db1ea873 /postproc
parent1bf774801032825fe68235eb766aaede5c14e046 (diff)
downloadmpv-d3c753359a7fa62f6b1d46d43d8eb0ad293df443.tar.bz2
mpv-d3c753359a7fa62f6b1d46d43d8eb0ad293df443.tar.xz
sync with mplayer xp

- partial yvu9 support (copy only)
- rgb 15/16 -> 24/32 converters
- int->unsigned changes

git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@6493 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'postproc')
-rw-r--r--postproc/rgb2rgb.c124
-rw-r--r--postproc/rgb2rgb.h19
-rw-r--r--postproc/rgb2rgb_template.c690
-rw-r--r--postproc/swscale.c531
-rw-r--r--postproc/swscale_template.c2
-rw-r--r--postproc/yuv2rgb.c12
-rw-r--r--postproc/yuv2rgb_mlib.c14
-rw-r--r--postproc/yuv2rgb_template.c18
8 files changed, 1123 insertions, 287 deletions
diff --git a/postproc/rgb2rgb.c b/postproc/rgb2rgb.c
index 91983bea0a..962a58945f 100644
--- a/postproc/rgb2rgb.c
+++ b/postproc/rgb2rgb.c
@@ -20,6 +20,8 @@
#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
#ifdef CAN_COMPILE_X86_ASM
+static const uint64_t mmx_null __attribute__((aligned(8))) = 0x0000000000000000ULL;
+static const uint64_t mmx_one __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL;
static const uint64_t mask32b __attribute__((aligned(8))) = 0x000000FF000000FFULL;
static const uint64_t mask32g __attribute__((aligned(8))) = 0x0000FF000000FF00ULL;
static const uint64_t mask32r __attribute__((aligned(8))) = 0x00FF000000FF0000ULL;
@@ -35,6 +37,11 @@ static const uint64_t mask24hhhh __attribute__((aligned(8))) = 0xffffffffffff00
static const uint64_t mask15b __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
static const uint64_t mask15s __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL;
+static const uint64_t mask15g __attribute__((aligned(8))) = 0x03E003E003E003E0ULL;
+static const uint64_t mask15r __attribute__((aligned(8))) = 0x7C007C007C007C00ULL;
+#define mask16b mask15b
+static const uint64_t mask16g __attribute__((aligned(8))) = 0x07E007E007E007E0ULL;
+static const uint64_t mask16r __attribute__((aligned(8))) = 0xF800F800F800F800ULL;
static const uint64_t red_16mask __attribute__((aligned(8))) = 0x0000f8000000f800ULL;
static const uint64_t green_16mask __attribute__((aligned(8)))= 0x000007e0000007e0ULL;
static const uint64_t blue_16mask __attribute__((aligned(8))) = 0x0000001f0000001fULL;
@@ -137,10 +144,68 @@ void rgb24to32(const uint8_t *src,uint8_t *dst,unsigned src_size)
else if(gCpuCaps.hasMMX)
rgb24to32_MMX(src, dst, src_size);
else
+#endif
rgb24to32_C(src, dst, src_size);
-#else
- rgb24to32_C(src, dst, src_size);
+}
+
+void rgb15to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
+{
+#ifdef CAN_COMPILE_X86_ASM
+ // ordered by speed, fastest first
+ if(gCpuCaps.hasMMX2)
+ rgb15to24_MMX2(src, dst, src_size);
+ else if(gCpuCaps.has3DNow)
+ rgb15to24_3DNow(src, dst, src_size);
+ else if(gCpuCaps.hasMMX)
+ rgb15to24_MMX(src, dst, src_size);
+ else
#endif
+ rgb15to24_C(src, dst, src_size);
+}
+
+void rgb16to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
+{
+#ifdef CAN_COMPILE_X86_ASM
+ // ordered by speed, fastest first
+ if(gCpuCaps.hasMMX2)
+ rgb16to24_MMX2(src, dst, src_size);
+ else if(gCpuCaps.has3DNow)
+ rgb16to24_3DNow(src, dst, src_size);
+ else if(gCpuCaps.hasMMX)
+ rgb16to24_MMX(src, dst, src_size);
+ else
+#endif
+ rgb16to24_C(src, dst, src_size);
+}
+
+void rgb15to32(const uint8_t *src,uint8_t *dst,unsigned src_size)
+{
+#ifdef CAN_COMPILE_X86_ASM
+ // ordered by speed, fastest first
+ if(gCpuCaps.hasMMX2)
+ rgb15to32_MMX2(src, dst, src_size);
+ else if(gCpuCaps.has3DNow)
+ rgb15to32_3DNow(src, dst, src_size);
+ else if(gCpuCaps.hasMMX)
+ rgb15to32_MMX(src, dst, src_size);
+ else
+#endif
+ rgb15to32_C(src, dst, src_size);
+}
+
+void rgb16to32(const uint8_t *src,uint8_t *dst,unsigned src_size)
+{
+#ifdef CAN_COMPILE_X86_ASM
+ // ordered by speed, fastest first
+ if(gCpuCaps.hasMMX2)
+ rgb16to32_MMX2(src, dst, src_size);
+ else if(gCpuCaps.has3DNow)
+ rgb16to32_3DNow(src, dst, src_size);
+ else if(gCpuCaps.hasMMX)
+ rgb16to32_MMX(src, dst, src_size);
+ else
+#endif
+ rgb16to32_C(src, dst, src_size);
}
void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
@@ -154,10 +219,8 @@ void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
else if(gCpuCaps.hasMMX)
rgb32to24_MMX(src, dst, src_size);
else
- rgb32to24_C(src, dst, src_size);
-#else
- rgb32to24_C(src, dst, src_size);
#endif
+ rgb32to24_C(src, dst, src_size);
}
/*
@@ -177,10 +240,8 @@ void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size)
else if(gCpuCaps.hasMMX)
rgb15to16_MMX(src, dst, src_size);
else
- rgb15to16_C(src, dst, src_size);
-#else
- rgb15to16_C(src, dst, src_size);
#endif
+ rgb15to16_C(src, dst, src_size);
}
/**
@@ -242,10 +303,8 @@ void rgb32to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
else if(gCpuCaps.hasMMX)
rgb32to16_MMX(src, dst, src_size);
else
- rgb32to16_C(src, dst, src_size);
-#else
- rgb32to16_C(src, dst, src_size);
#endif
+ rgb32to16_C(src, dst, src_size);
}
void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
@@ -259,10 +318,8 @@ void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
else if(gCpuCaps.hasMMX)
rgb32to15_MMX(src, dst, src_size);
else
- rgb32to15_C(src, dst, src_size);
-#else
- rgb32to15_C(src, dst, src_size);
#endif
+ rgb32to15_C(src, dst, src_size);
}
void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
@@ -276,10 +333,8 @@ void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
else if(gCpuCaps.hasMMX)
rgb24to16_MMX(src, dst, src_size);
else
- rgb24to16_C(src, dst, src_size);
-#else
- rgb24to16_C(src, dst, src_size);
#endif
+ rgb24to16_C(src, dst, src_size);
}
void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
@@ -293,10 +348,8 @@ void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
else if(gCpuCaps.hasMMX)
rgb24to15_MMX(src, dst, src_size);
else
- rgb24to15_C(src, dst, src_size);
-#else
- rgb24to15_C(src, dst, src_size);
#endif
+ rgb24to15_C(src, dst, src_size);
}
/**
@@ -330,10 +383,8 @@ void rgb32tobgr32(const uint8_t *src, uint8_t *dst, unsigned int src_size)
else if(gCpuCaps.hasMMX)
rgb32tobgr32_MMX(src, dst, src_size);
else
- rgb32tobgr32_C(src, dst, src_size);
-#else
- rgb32tobgr32_C(src, dst, src_size);
#endif
+ rgb32tobgr32_C(src, dst, src_size);
}
void rgb24tobgr24(const uint8_t *src, uint8_t *dst, unsigned int src_size)
@@ -347,10 +398,8 @@ void rgb24tobgr24(const uint8_t *src, uint8_t *dst, unsigned int src_size)
else if(gCpuCaps.hasMMX)
rgb24tobgr24_MMX(src, dst, src_size);
else
- rgb24tobgr24_C(src, dst, src_size);
-#else
- rgb24tobgr24_C(src, dst, src_size);
#endif
+ rgb24tobgr24_C(src, dst, src_size);
}
/**
@@ -371,10 +420,8 @@ void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, u
else if(gCpuCaps.hasMMX)
yv12toyuy2_MMX(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
else
- yv12toyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
-#else
- yv12toyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
#endif
+ yv12toyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
}
/**
@@ -394,10 +441,8 @@ void yuv422ptoyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc
else if(gCpuCaps.hasMMX)
yuv422ptoyuy2_MMX(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
else
- yuv422ptoyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
-#else
- yuv422ptoyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
#endif
+ yuv422ptoyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
}
/**
@@ -418,10 +463,8 @@ void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
else if(gCpuCaps.hasMMX)
yuy2toyv12_MMX(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
else
- yuy2toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
-#else
- yuy2toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
#endif
+ yuy2toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
}
/**
@@ -488,14 +531,13 @@ void rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst
else if(gCpuCaps.hasMMX)
rgb24toyv12_MMX(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
else
- rgb24toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
-#else
- rgb24toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
#endif
+ rgb24toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
}
void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst,
- int width, int height, int src1Stride, int src2Stride, int dstStride)
+ unsigned width, unsigned height, unsigned src1Stride,
+ unsigned src2Stride, unsigned dstStride)
{
#ifdef CAN_COMPILE_X86_ASM
// ordered per speed fasterst first
@@ -506,8 +548,6 @@ void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst,
else if(gCpuCaps.hasMMX)
interleaveBytes_MMX(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
else
- interleaveBytes_C(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
-#else
- interleaveBytes_C(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
#endif
+ interleaveBytes_C(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
}
diff --git a/postproc/rgb2rgb.h b/postproc/rgb2rgb.h
index fb4f04590d..9fb6da6ef1 100644
--- a/postproc/rgb2rgb.h
+++ b/postproc/rgb2rgb.h
@@ -10,12 +10,16 @@
#define RGB2RGB_INCLUDED
extern void rgb24to32(const uint8_t *src,uint8_t *dst,unsigned src_size);
+extern void rgb24to16(const uint8_t *src,uint8_t *dst,unsigned src_size);
+extern void rgb24to15(const uint8_t *src,uint8_t *dst,unsigned src_size);
extern void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size);
-extern void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size);
extern void rgb32to16(const uint8_t *src,uint8_t *dst,unsigned src_size);
extern void rgb32to15(const uint8_t *src,uint8_t *dst,unsigned src_size);
-extern void rgb24to16(const uint8_t *src,uint8_t *dst,unsigned src_size);
-extern void rgb24to15(const uint8_t *src,uint8_t *dst,unsigned src_size);
+extern void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size);
+extern void rgb15to24(const uint8_t *src,uint8_t *dst,unsigned src_size);
+extern void rgb15to32(const uint8_t *src,uint8_t *dst,unsigned src_size);
+extern void rgb16to24(const uint8_t *src,uint8_t *dst,unsigned src_size);
+extern void rgb16to32(const uint8_t *src,uint8_t *dst,unsigned src_size);
extern void rgb32tobgr32(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb24tobgr24(const uint8_t *src, uint8_t *dst, unsigned src_size);
@@ -39,7 +43,8 @@ extern void rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_
unsigned int lumStride, unsigned int chromStride, unsigned int srcStride);
extern void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst,
- int width, int height, int src1Stride, int src2Stride, int dstStride);
+ unsigned width, unsigned height, unsigned src1Stride,
+ unsigned src2Stride, unsigned dstStride);
#define MODE_RGB 0x1
@@ -47,11 +52,11 @@ extern void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst,
typedef void (* yuv2rgb_fun) (uint8_t * image, uint8_t * py,
uint8_t * pu, uint8_t * pv,
- int h_size, int v_size,
- int rgb_stride, int y_stride, int uv_stride);
+ unsigned h_size, unsigned v_size,
+ unsigned rgb_stride, unsigned y_stride, unsigned uv_stride);
extern yuv2rgb_fun yuv2rgb;
-void yuv2rgb_init (int bpp, int mode);
+void yuv2rgb_init (unsigned bpp, int mode);
#endif
diff --git a/postproc/rgb2rgb_template.c b/postproc/rgb2rgb_template.c
index 9d59eabc70..015e7f2d56 100644
--- a/postproc/rgb2rgb_template.c
+++ b/postproc/rgb2rgb_template.c
@@ -8,6 +8,13 @@
* palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
*/
+#include <stddef.h>
+#include <inttypes.h> /* for __WORDSIZE */
+
+#ifndef __WORDSIZE
+#warning You have misconfigured system and probably will lose performance!
+#endif
+
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
@@ -56,13 +63,13 @@ static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned sr
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
- const uint8_t *mm_end;
+ uint8_t *mm_end;
#endif
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
- mm_end = end - 23;
__asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
+ mm_end = (uint8_t*)((((unsigned long)end)/24)*24);
while(s < mm_end)
{
__asm __volatile(
@@ -107,12 +114,12 @@ static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned sr
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
- const uint8_t *mm_end;
+ uint8_t *mm_end;
#endif
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
- mm_end = end - 31;
+ mm_end = (uint8_t*)((((unsigned long)end)/32)*32);
while(s < mm_end)
{
__asm __volatile(
@@ -186,15 +193,16 @@ static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned sr
*/
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
+ register const uint8_t* s=src;
+ register uint8_t* d=dst;
+ register const uint8_t *end;
+ uint8_t *mm_end;
+ end = s + src_size;
#ifdef HAVE_MMX
- register int offs=15-src_size;
- register const char* s=src-offs;
- register char* d=dst-offs;
- __asm __volatile(PREFETCH" %0"::"m"(*(s+offs)));
- __asm __volatile(
- "movq %0, %%mm4\n\t"
- ::"m"(mask15s));
- while(offs<0)
+ __asm __volatile(PREFETCH" %0"::"m"(*s));
+ __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
+ mm_end = (uint8_t*)((((unsigned long)end)/16)*16);
+ while(s<mm_end)
{
__asm __volatile(
PREFETCH" 32%1\n\t"
@@ -208,40 +216,28 @@ static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned sr
"paddw %%mm3, %%mm2\n\t"
MOVNTQ" %%mm0, %0\n\t"
MOVNTQ" %%mm2, 8%0"
- :"=m"(*(d+offs))
- :"m"(*(s+offs))
+ :"=m"(*d)
+ :"m"(*s)
);
- offs+=16;
+ d+=16;
+ s+=16;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
-#else
-#if 0
- const uint16_t *s1=( uint16_t * )src;
- uint16_t *d1=( uint16_t * )dst;
- uint16_t *e=((uint8_t *)s1)+src_size;
- while( s1<e ){
- register int x=*( s1++ );
- /* rrrrrggggggbbbbb
- 0rrrrrgggggbbbbb
- 0111 1111 1110 0000=0x7FE0
- 00000000000001 1111=0x001F */
- *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 );
- }
-#else
- const unsigned *s1=( unsigned * )src;
- unsigned *d1=( unsigned * )dst;
- int i;
- int size= src_size>>2;
- for(i=0; i<size; i++)
- {
- register int x= s1[i];
-// d1[i] = x + (x&0x7FE07FE0); //faster but need msbit =0 which might not allways be true
- d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
-
- }
-#endif
#endif
+ mm_end = (uint8_t*)((((unsigned long)end)/4)*4);
+ while(s < mm_end)
+ {
+ register unsigned x= *((uint32_t *)s);
+ *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
+ d+=4;
+ s+=4;
+ }
+ if(s < end)
+ {
+ register unsigned short x= *((uint16_t *)s);
+ *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
+ }
}
static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
@@ -257,17 +253,20 @@ static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsign
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
-#ifdef HAVE_MMX
const uint8_t *s = src;
- const uint8_t *end,*mm_end;
+ const uint8_t *end;
+#ifdef HAVE_MMX
+ const uint8_t *mm_end;
+#endif
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
- mm_end = end - 15;
+#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm __volatile(
"movq %0, %%mm7\n\t"
"movq %1, %%mm6\n\t"
::"m"(red_16mask),"m"(green_16mask));
+ mm_end = (uint8_t*)((((unsigned long)end)/16)*16);
while(s < mm_end)
{
__asm __volatile(
@@ -303,43 +302,35 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned
d += 4;
s += 16;
}
+ __asm __volatile(SFENCE:::"memory");
+ __asm __volatile(EMMS:::"memory");
+#endif
while(s < end)
{
const int b= *s++;
const int g= *s++;
const int r= *s++;
- s++;
*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
+ s++;
}
- __asm __volatile(SFENCE:::"memory");
- __asm __volatile(EMMS:::"memory");
-#else
- unsigned j,i,num_pixels=src_size/4;
- uint16_t *d = (uint16_t *)dst;
- for(i=0,j=0; j<num_pixels; i+=4,j++)
- {
- const int b= src[i+0];
- const int g= src[i+1];
- const int r= src[i+2];
-
- d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
- }
-#endif
}
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
-#ifdef HAVE_MMX
const uint8_t *s = src;
- const uint8_t *end,*mm_end;
+ const uint8_t *end;
+#ifdef HAVE_MMX
+ const uint8_t *mm_end;
+#endif
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
- mm_end = end - 15;
+#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm __volatile(
"movq %0, %%mm7\n\t"
"movq %1, %%mm6\n\t"
::"m"(red_15mask),"m"(green_15mask));
+ mm_end = (uint8_t*)((((unsigned long)end)/16)*16);
while(s < mm_end)
{
__asm __volatile(
@@ -375,43 +366,35 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned
d += 4;
s += 16;
}
+ __asm __volatile(SFENCE:::"memory");
+ __asm __volatile(EMMS:::"memory");
+#endif
while(s < end)
{
const int b= *s++;
const int g= *s++;
const int r= *s++;
- s++;
*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
+ s++;
}
- __asm __volatile(SFENCE:::"memory");
- __asm __volatile(EMMS:::"memory");
-#else
- unsigned j,i,num_pixels=src_size/4;
- uint16_t *d = (uint16_t *)dst;
- for(i=0,j=0; j<num_pixels; i+=4,j++)
- {
- const int b= src[i+0];
- const int g= src[i+1];
- const int r= src[i+2];
-
- d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
- }
-#endif
}
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
-#ifdef HAVE_MMX
const uint8_t *s = src;
- const uint8_t *end,*mm_end;
+ const uint8_t *end;
+#ifdef HAVE_MMX
+ const uint8_t *mm_end;
+#endif
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
- mm_end = end - 11;
+#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm __volatile(
"movq %0, %%mm7\n\t"
"movq %1, %%mm6\n\t"
::"m"(red_16mask),"m"(green_16mask));
+ mm_end = (uint8_t*)((((unsigned long)end)/16)*16);
while(s < mm_end)
{
__asm __volatile(
@@ -447,6 +430,9 @@ static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned
d += 4;
s += 12;
}
+ __asm __volatile(SFENCE:::"memory");
+ __asm __volatile(EMMS:::"memory");
+#endif
while(s < end)
{
const int b= *s++;
@@ -454,35 +440,24 @@ static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned
const int r= *s++;
*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
}
- __asm __volatile(SFENCE:::"memory");
- __asm __volatile(EMMS:::"memory");
-#else
- unsigned j,i,num_pixels=src_size/3;
- uint16_t *d = (uint16_t *)dst;
- for(i=0,j=0; j<num_pixels; i+=3,j++)
- {
- const int b= src[i+0];
- const int g= src[i+1];
- const int r= src[i+2];
-
- d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
- }
-#endif
}
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
-#ifdef HAVE_MMX
const uint8_t *s = src;
- const uint8_t *end,*mm_end;
+ const uint8_t *end;
+#ifdef HAVE_MMX
+ const uint8_t *mm_end;
+#endif
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
- mm_end = end -11;
+#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm __volatile(
"movq %0, %%mm7\n\t"
"movq %1, %%mm6\n\t"
::"m"(red_15mask),"m"(green_15mask));
+ mm_end = (uint8_t*)((((unsigned long)end)/16)*16);
while(s < mm_end)
{
__asm __volatile(
@@ -518,6 +493,9 @@ static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned
d += 4;
s += 12;
}
+ __asm __volatile(SFENCE:::"memory");
+ __asm __volatile(EMMS:::"memory");
+#endif
while(s < end)
{
const int b= *s++;
@@ -525,25 +503,448 @@ static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned
const int r= *s++;
*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
}
+}
+
+/*
+ A less accurate approximation is used here: the input value is
+ simply left-shifted and the low-order bits are filled with
+ zeroes. This method improves PNG compression, but it cannot
+ reproduce white exactly, since it does not generate an all-ones
+ maximum value (a 5-bit 0x1F becomes 0xF8 rather than 0xFF);
+ the net effect is to darken the
+ image slightly.
+
+ A better method would be "left bit replication":
+
+   4 3 2 1 0
+   ---------
+   1 1 0 1 1
+
+   7 6 5 4 3  2 1 0
+   ----------------
+   1 1 0 1 1  1 1 0
+   |=======|  |===|
+       |      Leftmost Bits Repeated to Fill Open Bits
+       |
+   Original Bits
+*/
+static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
+{
+ const uint16_t *end;
+#ifdef HAVE_MMX
+ const uint16_t *mm_end;
+#endif
+ uint8_t *d = (uint8_t *)dst;
+ const uint16_t *s = (uint16_t *)src;
+ end = s + src_size/2;
+#ifdef HAVE_MMX
+ __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
+ mm_end = (uint16_t*)((((unsigned long)end)/8)*8);
+ while(s < mm_end)
+ {
+ __asm __volatile(
+ PREFETCH" 32%1\n\t"
+ "movq %1, %%mm0\n\t"
+ "movq %1, %%mm1\n\t"
+ "movq %1, %%mm2\n\t"
+ "pand %2, %%mm0\n\t"
+ "pand %3, %%mm1\n\t"
+ "pand %4, %%mm2\n\t"
+ "psllq $3, %%mm0\n\t"
+ "psrlq $2, %%mm1\n\t"
+ "psrlq $7, %%mm2\n\t"
+ "movq %%mm0, %%mm3\n\t"
+ "movq %%mm1, %%mm4\n\t"
+ "movq %%mm2, %%mm5\n\t"
+ "punpcklwd %5, %%mm0\n\t"
+ "punpcklwd %5, %%mm1\n\t"
+ "punpcklwd %5, %%mm2\n\t"
+ "punpckhwd %5, %%mm3\n\t"
+ "punpckhwd %5, %%mm4\n\t"
+ "punpckhwd %5, %%mm5\n\t"
+ "psllq $8, %%mm1\n\t"
+ "psllq $16, %%mm2\n\t"
+ "por %%mm1, %%mm0\n\t"
+ "por %%mm2, %%mm0\n\t"
+ "psllq $8, %%mm4\n\t"
+ "psllq $16, %%mm5\n\t"
+ "por %%mm4, %%mm3\n\t"
+ "por %%mm5, %%mm3\n\t"
+
+ "movq %%mm0, %%mm6\n\t"
+ "movq %%mm3, %%mm7\n\t"
+
+ "movq 8%1, %%mm0\n\t"
+ "movq 8%1, %%mm1\n\t"
+ "movq 8%1, %%mm2\n\t"
+ "pand %2, %%mm0\n\t"
+ "pand %3, %%mm1\n\t"
+ "pand %4, %%mm2\n\t"
+ "psllq $3, %%mm0\n\t"
+ "psrlq $2, %%mm1\n\t"
+ "psrlq $7, %%mm2\n\t"
+ "movq %%mm0, %%mm3\n\t"
+ "movq %%mm1, %%mm4\n\t"
+ "movq %%mm2, %%mm5\n\t"
+ "punpcklwd %5, %%mm0\n\t"
+ "punpcklwd %5, %%mm1\n\t"
+ "punpcklwd %5, %%mm2\n\t"
+ "punpckhwd %5, %%mm3\n\t"
+ "punpckhwd %5, %%mm4\n\t"
+ "punpckhwd %5, %%mm5\n\t"
+ "psllq $8, %%mm1\n\t"
+ "psllq $16, %%mm2\n\t"
+ "por %%mm1, %%mm0\n\t"
+ "por %%mm2, %%mm0\n\t"
+ "psllq $8, %%mm4\n\t"
+ "psllq $16, %%mm5\n\t"
+ "por %%mm4, %%mm3\n\t"
+ "por %%mm5, %%mm3\n\t"
+
+ :"=m"(*d)
+ :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
+ :"memory");
+ /* Borrowed 32 to 24 */
+ __asm __volatile(
+ "movq %%mm0, %%mm4\n\t"
+ "movq %%mm3, %%mm5\n\t"
+ "movq %%mm6, %%mm0\n\t"
+ "movq %%mm7, %%mm1\n\t"
+
+ "movq %%mm4, %%mm6\n\t"
+ "movq %%mm5, %%mm7\n\t"
+ "movq %%mm0, %%mm2\n\t"
+ "movq %%mm1, %%mm3\n\t"
+
+ "psrlq $8, %%mm2\n\t"
+ "psrlq $8, %%mm3\n\t"
+ "psrlq $8, %%mm6\n\t"
+ "psrlq $8, %%mm7\n\t"
+ "pand %2, %%mm0\n\t"
+ "pand %2, %%mm1\n\t"
+ "pand %2, %%mm4\n\t"
+ "pand %2, %%mm5\n\t"
+ "pand %3, %%mm2\n\t"
+ "pand %3, %%mm3\n\t"
+ "pand %3, %%mm6\n\t"
+ "pand %3, %%mm7\n\t"
+ "por %%mm2, %%mm0\n\t"
+ "por %%mm3, %%mm1\n\t"
+ "por %%mm6, %%mm4\n\t"
+ "por %%mm7, %%mm5\n\t"
+
+ "movq %%mm1, %%mm2\n\t"
+ "movq %%mm4, %%mm3\n\t"
+ "psllq $48, %%mm2\n\t"
+ "psllq $32, %%mm3\n\t"
+ "pand %4, %%mm2\n\t"
+ "pand %5, %%mm3\n\t"
+ "por %%mm2, %%mm0\n\t"
+ "psrlq $16, %%mm1\n\t"
+ "psrlq $32, %%mm4\n\t"
+ "psllq $16, %%mm5\n\t"
+ "por %%mm3, %%mm1\n\t"
+ "pand %6, %%mm5\n\t"
+ "por %%mm5, %%mm4\n\t"
+
+ MOVNTQ" %%mm0, %0\n\t"
+ MOVNTQ" %%mm1, 8%0\n\t"
+ MOVNTQ" %%mm4, 16%0"
+
+ :"=m"(*d)
+ :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
+ :"memory");
+ d += 24;
+ s += 8;
+ }
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
-#else
- unsigned j,i,num_pixels=src_size/3;
- uint16_t *d = (uint16_t *)dst;
- for(i=0,j=0; j<num_pixels; i+=3,j++)
+#endif
+ while(s < end)
+ {
+ register uint16_t bgr;
+ bgr = *s++;
+ *d++ = (bgr&0x1F)<<3;
+ *d++ = (bgr&0x3E0)>>2;
+ *d++ = (bgr&0x7C00)>>7;
+ }
+}
+
+static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
+{
+ const uint16_t *end;
+#ifdef HAVE_MMX
+ const uint16_t *mm_end;
+#endif
+ uint8_t *d = (uint8_t *)dst;
+ const uint16_t *s = (const uint16_t *)src;
+ end = s + src_size/2;
+#ifdef HAVE_MMX
+ __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
+ mm_end = (uint16_t*)((((unsigned long)end)/8)*8);
+ while(s < mm_end)
+ {
+ __asm __volatile(
+ PREFETCH" 32%1\n\t"
+ "movq %1, %%mm0\n\t"
+ "movq %1, %%mm1\n\t"
+ "movq %1, %%mm2\n\t"
+ "pand %2, %%mm0\n\t"
+ "pand %3, %%mm1\n\t"
+ "pand %4, %%mm2\n\t"
+ "psllq $3, %%mm0\n\t"
+ "psrlq $3, %%mm1\n\t"
+ "psrlq $8, %%mm2\n\t"
+ "movq %%mm0, %%mm3\n\t"
+ "movq %%mm1, %%mm4\n\t"
+ "movq %%mm2, %%mm5\n\t"
+ "punpcklwd %5, %%mm0\n\t"
+ "punpcklwd %5, %%mm1\n\t"
+ "punpcklwd %5, %%mm2\n\t"
+ "punpckhwd %5, %%mm3\n\t"
+ "punpckhwd %5, %%mm4\n\t"
+ "punpckhwd %5, %%mm5\n\t"
+ "psllq $8, %%mm1\n\t"
+ "psllq $16, %%mm2\n\t"
+ "por %%mm1, %%mm0\n\t"
+ "por %%mm2, %%mm0\n\t"
+ "psllq $8, %%mm4\n\t"
+ "psllq $16, %%mm5\n\t"
+ "por %%mm4, %%mm3\n\t"
+ "por %%mm5, %%mm3\n\t"
+
+ "movq %%mm0, %%mm6\n\t"
+ "movq %%mm3, %%mm7\n\t"
+
+ "movq 8%1, %%mm0\n\t"
+ "movq 8%1, %%mm1\n\t"
+ "movq 8%1, %%mm2\n\t"
+ "pand %2, %%mm0\n\t"
+ "pand %3, %%mm1\n\t"
+ "pand %4, %%mm2\n\t"
+ "psllq $3, %%mm0\n\t"
+ "psrlq $3, %%mm1\n\t"
+ "psrlq $8, %%mm2\n\t"
+ "movq %%mm0, %%mm3\n\t"
+ "movq %%mm1, %%mm4\n\t"
+ "movq %%mm2, %%mm5\n\t"
+ "punpcklwd %5, %%mm0\n\t"
+ "punpcklwd %5, %%mm1\n\t"
+ "punpcklwd %5, %%mm2\n\t"
+ "punpckhwd %5, %%mm3\n\t"
+ "punpckhwd %5, %%mm4\n\t"
+ "punpckhwd %5, %%mm5\n\t"
+ "psllq $8, %%mm1\n\t"
+ "psllq $16, %%mm2\n\t"
+ "por %%mm1, %%mm0\n\t"
+ "por %%mm2, %%mm0\n\t"
+ "psllq $8, %%mm4\n\t"
+ "psllq $16, %%mm5\n\t"
+ "por %%mm4, %%mm3\n\t"
+ "por %%mm5, %%mm3\n\t"
+ :"=m"(*d)
+ :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
+ :"memory");
+ /* Borrowed 32 to 24 */
+ __asm __volatile(
+ "movq %%mm0, %%mm4\n\t"
+ "movq %%mm3, %%mm5\n\t"
+ "movq %%mm6, %%mm0\n\t"
+ "movq %%mm7, %%mm1\n\t"
+
+ "movq %%mm4, %%mm6\n\t"
+ "movq %%mm5, %%mm7\n\t"
+ "movq %%mm0, %%mm2\n\t"
+ "movq %%mm1, %%mm3\n\t"
+
+ "psrlq $8, %%mm2\n\t"
+ "psrlq $8, %%mm3\n\t"
+ "psrlq $8, %%mm6\n\t"
+ "psrlq $8, %%mm7\n\t"
+ "pand %2, %%mm0\n\t"
+ "pand %2, %%mm1\n\t"
+ "pand %2, %%mm4\n\t"
+ "pand %2, %%mm5\n\t"
+ "pand %3, %%mm2\n\t"
+ "pand %3, %%mm3\n\t"
+ "pand %3, %%mm6\n\t"
+ "pand %3, %%mm7\n\t"
+ "por %%mm2, %%mm0\n\t"
+ "por %%mm3, %%mm1\n\t"
+ "por %%mm6, %%mm4\n\t"
+ "por %%mm7, %%mm5\n\t"
+
+ "movq %%mm1, %%mm2\n\t"
+ "movq %%mm4, %%mm3\n\t"
+ "psllq $48, %%mm2\n\t"
+ "psllq $32, %%mm3\n\t"
+ "pand %4, %%mm2\n\t"
+ "pand %5, %%mm3\n\t"
+ "por %%mm2, %%mm0\n\t"
+ "psrlq $16, %%mm1\n\t"
+ "psrlq $32, %%mm4\n\t"
+ "psllq $16, %%mm5\n\t"
+ "por %%mm3, %%mm1\n\t"
+ "pand %6, %%mm5\n\t"
+ "por %%mm5, %%mm4\n\t"
+
+ MOVNTQ" %%mm0, %0\n\t"
+ MOVNTQ" %%mm1, 8%0\n\t"
+ MOVNTQ" %%mm4, 16%0"
+
+ :"=m"(*d)
+ :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
+ :"memory");
+ d += 24;
+ s += 8;
+ }
+ __asm __volatile(SFENCE:::"memory");
+ __asm __volatile(EMMS:::"memory");
+#endif
+ while(s < end)
+ {
+ register uint16_t bgr;
+ bgr = *s++;
+ *d++ = (bgr&0x1F)<<3;
+ *d++ = (bgr&0x7E0)>>3;
+ *d++ = (bgr&0xF800)>>8;
+ }
+}
+
+static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
+{
+ const uint16_t *end;
+#ifdef HAVE_MMX
+ const uint16_t *mm_end;
+#endif
+ uint8_t *d = (uint8_t *)dst;
+ const uint16_t *s = (const uint16_t *)src;
+ end = s + src_size/2;
+#ifdef HAVE_MMX
+ __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
+ __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
+ mm_end = (uint16_t*)((((unsigned long)end)/4)*4);
+ while(s < mm_end)
+ {
+ __asm __volatile(
+ PREFETCH" 32%1\n\t"
+ "movq %1, %%mm0\n\t"
+ "movq %1, %%mm1\n\t"
+ "movq %1, %%mm2\n\t"
+ "pand %2, %%mm0\n\t"
+ "pand %3, %%mm1\n\t"
+ "pand %4, %%mm2\n\t"
+ "psllq $3, %%mm0\n\t"
+ "psrlq $2, %%mm1\n\t"
+ "psrlq $7, %%mm2\n\t"
+ "movq %%mm0, %%mm3\n\t"
+ "movq %%mm1, %%mm4\n\t"
+ "movq %%mm2, %%mm5\n\t"
+ "punpcklwd %%mm7, %%mm0\n\t"
+ "punpcklwd %%mm7, %%mm1\n\t"
+ "punpcklwd %%mm7, %%mm2\n\t"
+ "punpckhwd %%mm7, %%mm3\n\t"
+ "punpckhwd %%mm7, %%mm4\n\t"
+ "punpckhwd %%mm7, %%mm5\n\t"
+ "psllq $8, %%mm1\n\t"
+ "psllq $16, %%mm2\n\t"
+ "por %%mm1, %%mm0\n\t"
+ "por %%mm2, %%mm0\n\t"
+ "psllq $8, %%mm4\n\t"
+ "psllq $16, %%mm5\n\t"
+ "por %%mm4, %%mm3\n\t"
+ "por %%mm5, %%mm3\n\t"
+ MOVNTQ" %%mm0, %0\n\t"
+ MOVNTQ" %%mm3, 8%0\n\t"
+ :"=m"(*d)
+ :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
+ :"memory");
+ d += 16;
+ s += 4;
+ }
+ __asm __volatile(SFENCE:::"memory");
+ __asm __volatile(EMMS:::"memory");
+#endif
+ while(s < end)
{
- const int b= src[i+0];
- const int g= src[i+1];
- const int r= src[i+2];
+ register uint16_t bgr;
+ bgr = *s++;
+ *d++ = (bgr&0x1F)<<3;
+ *d++ = (bgr&0x3E0)>>2;
+ *d++ = (bgr&0x7C00)>>7;
+ *d++ = 0;
+ }
+}
- d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
+static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
+{
+ const uint16_t *end;
+#ifdef HAVE_MMX
+ const uint16_t *mm_end;
+#endif
+ uint8_t *d = (uint8_t *)dst;
+ const uint16_t *s = (uint16_t *)src;
+ end = s + src_size/2;
+#ifdef HAVE_MMX
+ __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
+ __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
+ mm_end = (uint16_t*)((((unsigned long)end)/4)*4);
+ while(s < mm_end)
+ {
+ __asm __volatile(
+ PREFETCH" 32%1\n\t"
+ "movq %1, %%mm0\n\t"
+ "movq %1, %%mm1\n\t"
+ "movq %1, %%mm2\n\t"
+ "pand %2, %%mm0\n\t"
+ "pand %3, %%mm1\n\t"
+ "pand %4, %%mm2\n\t"
+ "psllq $3, %%mm0\n\t"
+ "psrlq $3, %%mm1\n\t"
+ "psrlq $8, %%mm2\n\t"