summary | refs | log | tree | commit | diff | stats
path: root/libswscale
diff options
context:
space:
mode:
author: ivo <ivo@b3059339-0415-0410-9bf9-f77b7e298cf2> 2007-04-16 21:41:03 +0000
committer: ivo <ivo@b3059339-0415-0410-9bf9-f77b7e298cf2> 2007-04-16 21:41:03 +0000
commit: 589238fbe55e9f7195cbaba2ef5e3eb19fb19db1 (patch)
tree: 6a8955e368809c474e2fec065836fa032164b333 /libswscale
parent: 5e00b162bb7e99a0154ab63fff57bd944d0eefe4 (diff)
download: mpv-589238fbe55e9f7195cbaba2ef5e3eb19fb19db1.tar.bz2
          mpv-589238fbe55e9f7195cbaba2ef5e3eb19fb19db1.tar.xz
New implementation of rgb32tobgr32
The previous implementation segfaulted with MMX enabled when fed an image
smaller than the size of the units the MMX code processed.

The new code:
- is faster for MMX, MMX2 and plain C
- processes small images correctly
- is LGPL

git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@23009 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'libswscale')
-rw-r--r-- libswscale/rgb2rgb_template.c | 99
1 file changed, 58 insertions, 41 deletions
diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
index 6489a4db91..7147855fed 100644
--- a/libswscale/rgb2rgb_template.c
+++ b/libswscale/rgb2rgb_template.c
@@ -1364,49 +1364,66 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
+ uint8_t *d = dst, *s = (uint8_t *) src;
+ const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
-/* TODO: unroll this loop */
- asm volatile (
- "xor %%"REG_a", %%"REG_a" \n\t"
- ASMALIGN(4)
- "1: \n\t"
- PREFETCH" 32(%0, %%"REG_a") \n\t"
- "movq (%0, %%"REG_a"), %%mm0 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "pslld $16, %%mm0 \n\t"
- "psrld $16, %%mm1 \n\t"
- "pand "MANGLE(mask32r)", %%mm0 \n\t"
- "pand "MANGLE(mask32g)", %%mm2 \n\t"
- "pand "MANGLE(mask32b)", %%mm1 \n\t"
- "por %%mm0, %%mm2 \n\t"
- "por %%mm1, %%mm2 \n\t"
- MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
- "add $8, %%"REG_a" \n\t"
- "cmp %2, %%"REG_a" \n\t"
- " jb 1b \n\t"
- :: "r" (src), "r"(dst), "r" (src_size-7)
- : "%"REG_a
- );
-
- __asm __volatile(SFENCE:::"memory");
- __asm __volatile(EMMS:::"memory");
-#else
- unsigned i;
- unsigned num_pixels = src_size >> 2;
- for(i=0; i<num_pixels; i++)
- {
-#ifdef WORDS_BIGENDIAN
- dst[4*i + 1] = src[4*i + 3];
- dst[4*i + 2] = src[4*i + 2];
- dst[4*i + 3] = src[4*i + 1];
-#else
- dst[4*i + 0] = src[4*i + 2];
- dst[4*i + 1] = src[4*i + 1];
- dst[4*i + 2] = src[4*i + 0];
-#endif
+ __asm __volatile(
+ " "PREFETCH" (%1) \n"
+ " movq %3, %%mm7 \n"
+ " pxor %4, %%mm7 \n"
+ " movq %%mm7, %%mm6 \n"
+ " pxor %5, %%mm7 \n"
+ " jmp 2f \n"
+ ASMALIGN(4)
+ "1: \n"
+ " "PREFETCH" 32(%1) \n"
+ " movq (%1), %%mm0 \n"
+ " movq 8(%1), %%mm1 \n"
+# ifdef HAVE_MMX2
+ " pshufw $177, %%mm0, %%mm3 \n"
+ " pshufw $177, %%mm1, %%mm5 \n"
+ " pand %%mm7, %%mm0 \n"
+ " pand %%mm6, %%mm3 \n"
+ " pand %%mm7, %%mm1 \n"
+ " pand %%mm6, %%mm5 \n"
+ " por %%mm3, %%mm0 \n"
+ " por %%mm5, %%mm1 \n"
+# else
+ " movq %%mm0, %%mm2 \n"
+ " movq %%mm1, %%mm4 \n"
+ " pand %%mm7, %%mm0 \n"
+ " pand %%mm6, %%mm2 \n"
+ " pand %%mm7, %%mm1 \n"
+ " pand %%mm6, %%mm4 \n"
+ " movq %%mm2, %%mm3 \n"
+ " movq %%mm4, %%mm5 \n"
+ " pslld $16, %%mm2 \n"
+ " psrld $16, %%mm3 \n"
+ " pslld $16, %%mm4 \n"
+ " psrld $16, %%mm5 \n"
+ " por %%mm2, %%mm0 \n"
+ " por %%mm4, %%mm1 \n"
+ " por %%mm3, %%mm0 \n"
+ " por %%mm5, %%mm1 \n"
+# endif
+ " "MOVNTQ" %%mm0, (%0) \n"
+ " "MOVNTQ" %%mm1, 8(%0) \n"
+ " add $16, %0 \n"
+ " add $16, %1 \n"
+ "2: \n"
+ " cmp %1, %2 \n"
+ " ja 1b \n"
+ " "SFENCE" \n"
+ " "EMMS" \n"
+ : "+r"(d), "+r"(s)
+ : "r" (end-15), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
+ : "memory");
+#endif
+ for (; s<end; s+=4, d+=4) {
+ int v = *(uint32_t *)s, g = v & 0xff00;
+ v &= 0xff00ff;
+ *(uint32_t *)d = (v>>16) + g + (v<<16);
}
-#endif
}
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)