diff options
author | michael <michael@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2001-11-26 21:12:15 +0000 |
---|---|---|
committer | michael <michael@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2001-11-26 21:12:15 +0000 |
commit | 2e4b5d09085aec28e70bb7127f067c76228cd0f3 (patch) | |
tree | 5509484fc86a75560d913daeac43e25a4c0eacfe /libvo/osd.c | |
parent | 0dca7e991a3b5d059eb6bace42ef4d69e3ce7111 (diff) | |
download | mpv-2e4b5d09085aec28e70bb7127f067c76228cd0f3.tar.bz2 mpv-2e4b5d09085aec28e70bb7127f067c76228cd0f3.tar.xz |
runtime cpu detection
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@3143 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'libvo/osd.c')
-rw-r--r-- | libvo/osd.c | 499 |
1 files changed, 121 insertions, 378 deletions
diff --git a/libvo/osd.c b/libvo/osd.c index 7dfb82f692..770abe7b83 100644 --- a/libvo/osd.c +++ b/libvo/osd.c @@ -1,414 +1,140 @@ // Generic alpha renderers for all YUV modes and RGB depths. // These are "reference implementations", should be optimized later (MMX, etc) -// Optimized by Nick and Michael +// Templating Code from Michael Niedermayer (michaelni@gmx.at) is under GPL //#define FAST_OSD //#define FAST_OSD_TABLE #include "config.h" #include "osd.h" -#include "../mmx_defs.h" //#define ENABLE_PROFILE #include "../my_profile.h" #include <inttypes.h> +#include "../cpudetect.h" -#ifdef HAVE_MMX +extern int verbose; // defined in mplayer.c + +#ifdef ARCH_X86 +#define CAN_COMPILE_X86_ASM +#endif + +#ifdef CAN_COMPILE_X86_ASM static const uint64_t bFF __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL; +static const unsigned long long mask24lh __attribute__((aligned(8))) = 0xFFFF000000000000ULL; +static const unsigned long long mask24hl __attribute__((aligned(8))) = 0x0000FFFFFFFFFFFFULL; #endif +//Note: we have C, X86-nommx, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one +//Plain C versions +#undef HAVE_MMX +#undef HAVE_MMX2 +#undef HAVE_3DNOW +#undef ARCH_X86 +#define RENAME(a) a ## _C +#include "osd_template.c" + +#ifdef CAN_COMPILE_X86_ASM + +//X86 noMMX versions +#undef RENAME +#undef HAVE_MMX +#undef HAVE_MMX2 +#undef HAVE_3DNOW +#define ARCH_X86 +#define RENAME(a) a ## _X86 +#include "osd_template.c" + +//MMX versions +#undef RENAME +#define HAVE_MMX +#undef HAVE_MMX2 +#undef HAVE_3DNOW +#define ARCH_X86 +#define RENAME(a) a ## _MMX +#include "osd_template.c" + +//MMX2 versions +#undef RENAME +#define HAVE_MMX +#define HAVE_MMX2 +#undef HAVE_3DNOW +#define ARCH_X86 +#define RENAME(a) a ## _MMX2 +#include "osd_template.c" + +//3DNOW versions +#undef RENAME +#define HAVE_MMX +#undef HAVE_MMX2 +#define HAVE_3DNOW +#define ARCH_X86 +#define RENAME(a) a ## _3DNow +#include "osd_template.c" + +#endif //CAN_COMPILE_X86_ASM + void vo_draw_alpha_yv12(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){ - int y; -#if defined(FAST_OSD) && !defined(HAVE_MMX) - w=w>>1; -#endif -PROFILE_START(); - for(y=0;y<h;y++){ - register int x; -#ifdef HAVE_MMX - asm volatile( - PREFETCHW" %0\n\t" - PREFETCH" %1\n\t" - PREFETCH" %2\n\t" -// "pxor %%mm7, %%mm7\n\t" - "pcmpeqb %%mm5, %%mm5\n\t" // F..F - "movq %%mm5, %%mm4\n\t" - "psllw $8, %%mm5\n\t" //FF00FF00FF00 - "psrlw $8, %%mm4\n\t" //00FF00FF00FF - ::"m"(*dstbase),"m"(*srca),"m"(*src):"memory"); - for(x=0;x<w;x+=8){ - asm volatile( - "movl %1, %%eax\n\t" - "orl 4%1, %%eax\n\t" - " jz 1f\n\t" - PREFETCHW" 32%0\n\t" - PREFETCH" 32%1\n\t" - PREFETCH" 32%2\n\t" - "movq %0, %%mm0\n\t" // dstbase - "movq %%mm0, %%mm1\n\t" - "pand %%mm4, %%mm0\n\t" //0Y0Y0Y0Y - "psrlw $8, %%mm1\n\t" //0Y0Y0Y0Y - "movq %1, %%mm2\n\t" //srca HGFEDCBA - "paddb bFF, %%mm2\n\t" - "movq %%mm2, %%mm3\n\t" - "pand %%mm4, %%mm2\n\t" //0G0E0C0A - "psrlw $8, %%mm3\n\t" //0H0F0D0B - "pmullw %%mm2, %%mm0\n\t" - "pmullw %%mm3, %%mm1\n\t" - "psrlw $8, %%mm0\n\t" - "pand %%mm5, %%mm1\n\t" - "por %%mm1, %%mm0\n\t" - "paddb %2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - "1:\n\t" - :: "m" (dstbase[x]), "m" (srca[x]), "m" (src[x]) - : "%eax"); - } +#ifdef CAN_COMPILE_X86_ASM + // ordered per speed fasterst first + if(gCpuCaps.hasMMX2) + vo_draw_alpha_yv12_MMX2(w, h, src, srca, srcstride, dstbase, dststride); + else if(gCpuCaps.has3DNow) + vo_draw_alpha_yv12_3DNow(w, h, src, srca, srcstride, dstbase, dststride); + else if(gCpuCaps.hasMMX) + vo_draw_alpha_yv12_MMX(w, h, src, srca, srcstride, dstbase, dststride); + else + vo_draw_alpha_yv12_X86(w, h, src, srca, srcstride, dstbase, dststride); #else - for(x=0;x<w;x++){ -#ifdef FAST_OSD - if(srca[2*x+0]) dstbase[2*x+0]=src[2*x+0]; - if(srca[2*x+1]) dstbase[2*x+1]=src[2*x+1]; -#else - if(srca[x]) dstbase[x]=((dstbase[x]*srca[x])>>8)+src[x]; + vo_draw_alpha_yv12_C(w, h, src, srca, srcstride, dstbase, dststride); #endif - } -#endif - src+=srcstride; - srca+=srcstride; - dstbase+=dststride; - } -#ifdef HAVE_MMX - asm volatile(EMMS:::"memory"); -#endif -PROFILE_END("vo_draw_alpha_yv12"); - return; } void vo_draw_alpha_yuy2(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){ - int y; -#if defined(FAST_OSD) && !defined(HAVE_MMX) - w=w>>1; -#endif -PROFILE_START(); - for(y=0;y<h;y++){ - register int x; -#ifdef HAVE_MMX - asm volatile( - PREFETCHW" %0\n\t" - PREFETCH" %1\n\t" - PREFETCH" %2\n\t" - "pxor %%mm7, %%mm7\n\t" - "pcmpeqb %%mm5, %%mm5\n\t" // F..F - "movq %%mm5, %%mm4\n\t" - "psllw $8, %%mm5\n\t" //FF00FF00FF00 - "psrlw $8, %%mm4\n\t" //00FF00FF00FF - ::"m"(*dstbase),"m"(*srca),"m"(*src)); - for(x=0;x<w;x+=4){ - asm volatile( - "movl %1, %%eax\n\t" - "orl %%eax, %%eax\n\t" - " jz 1f\n\t" - PREFETCHW" 32%0\n\t" - PREFETCH" 32%1\n\t" - PREFETCH" 32%2\n\t" - "movq %0, %%mm0\n\t" // dstbase - "movq %%mm0, %%mm1\n\t" - "pand %%mm4, %%mm0\n\t" //0Y0Y0Y0Y - "movd %%eax, %%mm2\n\t" //srca 0000DCBA - "paddb bFF, %%mm2\n\t" - "punpcklbw %%mm7, %%mm2\n\t" //srca 0D0C0B0A - "pmullw %%mm2, %%mm0\n\t" - "psrlw $8, %%mm0\n\t" - "pand %%mm5, %%mm1\n\t" //U0V0U0V0 - "movd %2, %%mm2\n\t" //src 0000DCBA - "punpcklbw %%mm7, %%mm2\n\t" //srca 0D0C0B0A - "por %%mm1, %%mm0\n\t" - "paddb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - "1:\n\t" - :: "m" (dstbase[x*2]), "m" (srca[x]), "m" (src[x]) - : "%eax"); - } -#else - for(x=0;x<w;x++){ -#ifdef FAST_OSD - if(srca[2*x+0]) dstbase[4*x+0]=src[2*x+0]; - if(srca[2*x+1]) dstbase[4*x+2]=src[2*x+1]; +#ifdef CAN_COMPILE_X86_ASM + // ordered per speed fasterst first + if(gCpuCaps.hasMMX2) + vo_draw_alpha_yuy2_MMX2(w, h, src, srca, srcstride, dstbase, dststride); + else if(gCpuCaps.has3DNow) + vo_draw_alpha_yuy2_3DNow(w, h, src, srca, srcstride, dstbase, dststride); + else if(gCpuCaps.hasMMX) + vo_draw_alpha_yuy2_MMX(w, h, src, srca, srcstride, dstbase, dststride); + else + vo_draw_alpha_yuy2_X86(w, h, src, srca, srcstride, dstbase, dststride); #else - if(srca[x]) dstbase[2*x]=((dstbase[2*x]*srca[x])>>8)+src[x]; -#endif - } + vo_draw_alpha_yuy2_C(w, h, src, srca, srcstride, dstbase, dststride); #endif - src+=srcstride; - srca+=srcstride; - dstbase+=dststride; - } -#ifdef HAVE_MMX - asm volatile(EMMS:::"memory"); -#endif -PROFILE_END("vo_draw_alpha_yuy2"); - return; } -#ifdef HAVE_MMX -static const unsigned long long mask24lh __attribute__((aligned(8))) = 0xFFFF000000000000ULL; -static const unsigned long long mask24hl __attribute__((aligned(8))) = 0x0000FFFFFFFFFFFFULL; -#endif void vo_draw_alpha_rgb24(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){ - int y; - for(y=0;y<h;y++){ - register unsigned char *dst = dstbase; - register int x; -#ifdef ARCH_X86 -#ifdef HAVE_MMX - asm volatile( - PREFETCHW" %0\n\t" - PREFETCH" %1\n\t" - PREFETCH" %2\n\t" - "pxor %%mm7, %%mm7\n\t" - "pcmpeqb %%mm6, %%mm6\n\t" // F..F - ::"m"(*dst),"m"(*srca),"m"(*src):"memory"); - for(x=0;x<w;x+=2){ - if(srca[x] || srca[x+1]) - asm volatile( - PREFETCHW" 32%0\n\t" - PREFETCH" 32%1\n\t" - PREFETCH" 32%2\n\t" - "movq %0, %%mm0\n\t" // dstbase - "movq %%mm0, %%mm1\n\t" - "movq %%mm0, %%mm5\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpckhbw %%mm7, %%mm1\n\t" - "movd %1, %%mm2\n\t" // srca ABCD0000 - "paddb %%mm6, %%mm2\n\t" - "punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD - "punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB - "movq %%mm2, %%mm3\n\t" - "punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0A - "punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B0B - "pmullw %%mm2, %%mm0\n\t" - "pmullw %%mm3, %%mm1\n\t" - "psrlw $8, %%mm0\n\t" - "psrlw $8, %%mm1\n\t" - "packuswb %%mm1, %%mm0\n\t" - "movd %2, %%mm2 \n\t" // src ABCD0000 - "punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD - "punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB - "paddb %%mm2, %%mm0\n\t" - "pand %4, %%mm5\n\t" - "pand %3, %%mm0\n\t" - "por %%mm0, %%mm5\n\t" - "movq %%mm5, %0\n\t" - :: "m" (dst[0]), "m" (srca[x]), "m" (src[x]), "m"(mask24hl), "m"(mask24lh)); - dst += 6; - } -#else /* HAVE_MMX */ - for(x=0;x<w;x++){ - if(srca[x]){ - asm volatile( - "movzbl (%0), %%ecx\n\t" - "movzbl 1(%0), %%eax\n\t" - "movzbl 2(%0), %%edx\n\t" - - "imull %1, %%ecx\n\t" - "imull %1, %%eax\n\t" - "imull %1, %%edx\n\t" - - "addl %2, %%ecx\n\t" - "addl %2, %%eax\n\t" - "addl %2, %%edx\n\t" - - "movb %%ch, (%0)\n\t" - "movb %%ah, 1(%0)\n\t" - "movb %%dh, 2(%0)\n\t" - - : - :"r" (dst), - "r" ((unsigned)srca[x]), - "r" (((unsigned)src[x])<<8) - :"%eax", "%ecx", "%edx" - ); - } - dst += 3; - } -#endif /* HAVE_MMX */ -#else /*non x86 arch*/ - for(x=0;x<w;x++){ - if(srca[x]){ -#ifdef FAST_OSD - dst[0]=dst[1]=dst[2]=src[x]; +#ifdef CAN_COMPILE_X86_ASM + // ordered per speed fasterst first + if(gCpuCaps.hasMMX2) + vo_draw_alpha_rgb24_MMX2(w, h, src, srca, srcstride, dstbase, dststride); + else if(gCpuCaps.has3DNow) + vo_draw_alpha_rgb24_3DNow(w, h, src, srca, srcstride, dstbase, dststride); + else if(gCpuCaps.hasMMX) + vo_draw_alpha_rgb24_MMX(w, h, src, srca, srcstride, dstbase, dststride); + else + vo_draw_alpha_rgb24_X86(w, h, src, srca, srcstride, dstbase, dststride); #else - dst[0]=((dst[0]*srca[x])>>8)+src[x]; - dst[1]=((dst[1]*srca[x])>>8)+src[x]; - dst[2]=((dst[2]*srca[x])>>8)+src[x]; + vo_draw_alpha_rgb24_C(w, h, src, srca, srcstride, dstbase, dststride); #endif - } - dst+=3; // 24bpp - } -#endif /* arch_x86 */ - src+=srcstride; - srca+=srcstride; - dstbase+=dststride; - } -#ifdef HAVE_MMX - asm volatile(EMMS:::"memory"); -#endif - return; } void vo_draw_alpha_rgb32(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){ - int y; -PROFILE_START(); - for(y=0;y<h;y++){ - register int x; -#ifdef ARCH_X86 -#ifdef HAVE_MMX -#ifdef HAVE_3DNOW - asm volatile( - PREFETCHW" %0\n\t" - PREFETCH" %1\n\t" - PREFETCH" %2\n\t" - "pxor %%mm7, %%mm7\n\t" - "pcmpeqb %%mm6, %%mm6\n\t" // F..F - ::"m"(*dstbase),"m"(*srca),"m"(*src):"memory"); - for(x=0;x<w;x+=2){ - if(srca[x] || srca[x+1]) - asm volatile( - PREFETCHW" 32%0\n\t" - PREFETCH" 32%1\n\t" - PREFETCH" 32%2\n\t" - "movq %0, %%mm0\n\t" // dstbase - "movq %%mm0, %%mm1\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpckhbw %%mm7, %%mm1\n\t" - "movd %1, %%mm2\n\t" // srca ABCD0000 - "paddb %%mm6, %%mm2\n\t" - "punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD - "punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB - "movq %%mm2, %%mm3\n\t" - "punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0A - "punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B0B - "pmullw %%mm2, %%mm0\n\t" - "pmullw %%mm3, %%mm1\n\t" - "psrlw $8, %%mm0\n\t" - "psrlw $8, %%mm1\n\t" - "packuswb %%mm1, %%mm0\n\t" - "movd %2, %%mm2 \n\t" // src ABCD0000 - "punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD - "punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB - "paddb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - :: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x])); - } -#else //this is faster for intels crap - asm volatile( - PREFETCHW" %0\n\t" - PREFETCH" %1\n\t" - PREFETCH" %2\n\t" - "pxor %%mm7, %%mm7\n\t" - "pcmpeqb %%mm5, %%mm5\n\t" // F..F - "movq %%mm5, %%mm4\n\t" - "psllw $8, %%mm5\n\t" //FF00FF00FF00 - "psrlw $8, %%mm4\n\t" //00FF00FF00FF - ::"m"(*dstbase),"m"(*srca),"m"(*src):"memory"); - for(x=0;x<w;x+=4){ - asm volatile( - "movl %1, %%eax\n\t" - "orl %%eax, %%eax\n\t" - " jz 1f\n\t" - PREFETCHW" 32%0\n\t" - PREFETCH" 32%1\n\t" - PREFETCH" 32%2\n\t" - "movq %0, %%mm0\n\t" // dstbase - "movq %%mm0, %%mm1\n\t" - "pand %%mm4, %%mm0\n\t" //0R0B0R0B - "psrlw $8, %%mm1\n\t" //0?0G0?0G - "movd %%eax, %%mm2\n\t" //srca 0000DCBA - "paddb bFF, %%mm2\n\t" - "punpcklbw %%mm2, %%mm2\n\t" //srca DDCCBBAA - "movq %%mm2, %%mm3\n\t" - "punpcklbw %%mm7, %%mm2\n\t" //srca 0B0B0A0A - "pmullw %%mm2, %%mm0\n\t" - "pmullw %%mm2, %%mm1\n\t" - "psrlw $8, %%mm0\n\t" - "pand %%mm5, %%mm1\n\t" - "por %%mm1, %%mm0\n\t" - "movd %2, %%mm2 \n\t" //src 0000DCBA - "punpcklbw %%mm2, %%mm2\n\t" //src DDCCBBAA - "movq %%mm2, %%mm6\n\t" - "punpcklbw %%mm2, %%mm2\n\t" //src BBBBAAAA - "paddb %%mm2, %%mm0\n\t" - "movq %%mm0, %0\n\t" - - "movq 8%0, %%mm0\n\t" // dstbase - "movq %%mm0, %%mm1\n\t" - "pand %%mm4, %%mm0\n\t" //0R0B0R0B - "psrlw $8, %%mm1\n\t" //0?0G0?0G - "punpckhbw %%mm7, %%mm3\n\t" //srca 0D0D0C0C - "pmullw %%mm3, %%mm0\n\t" - "pmullw %%mm3, %%mm1\n\t" - "psrlw $8, %%mm0\n\t" - "pand %%mm5, %%mm1\n\t" - "por %%mm1, %%mm0\n\t" - "punpckhbw %%mm6, %%mm6\n\t" //src DDDDCCCC - "paddb %%mm6, %%mm0\n\t" - "movq %%mm0, 8%0\n\t" - "1:\n\t" - :: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x]) - : "%eax"); - } -#endif -#else /* HAVE_MMX */ - for(x=0;x<w;x++){ - if(srca[x]){ - asm volatile( - "movzbl (%0), %%ecx\n\t" - "movzbl 1(%0), %%eax\n\t" - "movzbl 2(%0), %%edx\n\t" - - "imull %1, %%ecx\n\t" - "imull %1, %%eax\n\t" - "imull %1, %%edx\n\t" - - "addl %2, %%ecx\n\t" - "addl %2, %%eax\n\t" - "addl %2, %%edx\n\t" - - "movb %%ch, (%0)\n\t" - "movb %%ah, 1(%0)\n\t" - "movb %%dh, 2(%0)\n\t" - - : - :"r" (&dstbase[4*x]), - "r" ((unsigned)srca[x]), - "r" (((unsigned)src[x])<<8) - :"%eax", "%ecx", "%edx" - ); - } - } -#endif /* HAVE_MMX */ -#else /*non x86 arch*/ - for(x=0;x<w;x++){ - if(srca[x]){ -#ifdef FAST_OSD - dstbase[4*x+0]=dstbase[4*x+1]=dstbase[4*x+2]=src[x]; +#ifdef CAN_COMPILE_X86_ASM + // ordered per speed fasterst first + if(gCpuCaps.hasMMX2) + vo_draw_alpha_rgb32_MMX2(w, h, src, srca, srcstride, dstbase, dststride); + else if(gCpuCaps.has3DNow) + vo_draw_alpha_rgb32_3DNow(w, h, src, srca, srcstride, dstbase, dststride); + else if(gCpuCaps.hasMMX) + vo_draw_alpha_rgb32_MMX(w, h, src, srca, srcstride, dstbase, dststride); + else + vo_draw_alpha_rgb32_X86(w, h, src, srca, srcstride, dstbase, dststride); #else - dstbase[4*x+0]=((dstbase[4*x+0]*srca[x])>>8)+src[x]; - dstbase[4*x+1]=((dstbase[4*x+1]*srca[x])>>8)+src[x]; - dstbase[4*x+2]=((dstbase[4*x+2]*srca[x])>>8)+src[x]; + vo_draw_alpha_rgb32_C(w, h, src, srca, srcstride, dstbase, dststride); #endif - } - } -#endif /* arch_x86 */ - src+=srcstride; - srca+=srcstride; - dstbase+=dststride; - } -#ifdef HAVE_MMX - asm volatile(EMMS:::"memory"); -#endif -PROFILE_END("vo_draw_alpha_rgb32"); - return; } #ifdef FAST_OSD_TABLE @@ -424,6 +150,23 @@ void vo_draw_alpha_init(){ fast_osd_16bpp_table[i]=((i>>3)<<11)|((i>>2)<<5)|(i>>3); } #endif +//FIXME the optimized stuff is a lie for 15/16bpp as they arent optimized yet + if(verbose) + { +#ifdef CAN_COMPILE_X86_ASM + // ordered per speed fasterst first + if(gCpuCaps.hasMMX2) + printf("Using MMX (with tiny bit MMX2) Optimized OnScreenDisplay\n"); + else if(gCpuCaps.has3DNow) + printf("Using MMX (with tiny bit 3DNow) Optimized OnScreenDisplay\n"); + else if(gCpuCaps.hasMMX) + printf("Using MMX Optimized OnScreenDisplay\n"); + else + printf("Using X86 Optimized OnScreenDisplay\n"); +#else + printf("Using Unoptimized OnScreenDisplay\n"); +#endif + } } void vo_draw_alpha_rgb15(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){ |