summaryrefslogtreecommitdiffstats
path: root/libvo
diff options
context:
space:
mode:
authormichael <michael@b3059339-0415-0410-9bf9-f77b7e298cf2>2001-11-26 21:12:15 +0000
committermichael <michael@b3059339-0415-0410-9bf9-f77b7e298cf2>2001-11-26 21:12:15 +0000
commit2e4b5d09085aec28e70bb7127f067c76228cd0f3 (patch)
tree5509484fc86a75560d913daeac43e25a4c0eacfe /libvo
parent0dca7e991a3b5d059eb6bace42ef4d69e3ce7111 (diff)
downloadmpv-2e4b5d09085aec28e70bb7127f067c76228cd0f3.tar.bz2
mpv-2e4b5d09085aec28e70bb7127f067c76228cd0f3.tar.xz
runtime cpu detection
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@3143 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'libvo')
-rw-r--r--libvo/osd.c499
-rw-r--r--libvo/osd_template.c124
2 files changed, 147 insertions, 476 deletions
diff --git a/libvo/osd.c b/libvo/osd.c
index 7dfb82f692..770abe7b83 100644
--- a/libvo/osd.c
+++ b/libvo/osd.c
@@ -1,414 +1,140 @@
// Generic alpha renderers for all YUV modes and RGB depths.
// These are "reference implementations", should be optimized later (MMX, etc)
-// Optimized by Nick and Michael
+// Templating Code from Michael Niedermayer (michaelni@gmx.at) is under GPL
//#define FAST_OSD
//#define FAST_OSD_TABLE
#include "config.h"
#include "osd.h"
-#include "../mmx_defs.h"
//#define ENABLE_PROFILE
#include "../my_profile.h"
#include <inttypes.h>
+#include "../cpudetect.h"
-#ifdef HAVE_MMX
+extern int verbose; // defined in mplayer.c
+
+#ifdef ARCH_X86
+#define CAN_COMPILE_X86_ASM
+#endif
+
+#ifdef CAN_COMPILE_X86_ASM
static const uint64_t bFF __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL;
+static const unsigned long long mask24lh __attribute__((aligned(8))) = 0xFFFF000000000000ULL;
+static const unsigned long long mask24hl __attribute__((aligned(8))) = 0x0000FFFFFFFFFFFFULL;
#endif
+//Note: we have C, X86-noMMX, MMX, MMX2 and 3DNow versions; there is no combined 3DNow+MMX2 one
+//Plain C versions
+#undef HAVE_MMX
+#undef HAVE_MMX2
+#undef HAVE_3DNOW
+#undef ARCH_X86
+#define RENAME(a) a ## _C
+#include "osd_template.c"
+
+#ifdef CAN_COMPILE_X86_ASM
+
+//X86 noMMX versions
+#undef RENAME
+#undef HAVE_MMX
+#undef HAVE_MMX2
+#undef HAVE_3DNOW
+#define ARCH_X86
+#define RENAME(a) a ## _X86
+#include "osd_template.c"
+
+//MMX versions
+#undef RENAME
+#define HAVE_MMX
+#undef HAVE_MMX2
+#undef HAVE_3DNOW
+#define ARCH_X86
+#define RENAME(a) a ## _MMX
+#include "osd_template.c"
+
+//MMX2 versions
+#undef RENAME
+#define HAVE_MMX
+#define HAVE_MMX2
+#undef HAVE_3DNOW
+#define ARCH_X86
+#define RENAME(a) a ## _MMX2
+#include "osd_template.c"
+
+//3DNOW versions
+#undef RENAME
+#define HAVE_MMX
+#undef HAVE_MMX2
+#define HAVE_3DNOW
+#define ARCH_X86
+#define RENAME(a) a ## _3DNow
+#include "osd_template.c"
+
+#endif //CAN_COMPILE_X86_ASM
+
void vo_draw_alpha_yv12(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
- int y;
-#if defined(FAST_OSD) && !defined(HAVE_MMX)
- w=w>>1;
-#endif
-PROFILE_START();
- for(y=0;y<h;y++){
- register int x;
-#ifdef HAVE_MMX
- asm volatile(
- PREFETCHW" %0\n\t"
- PREFETCH" %1\n\t"
- PREFETCH" %2\n\t"
-// "pxor %%mm7, %%mm7\n\t"
- "pcmpeqb %%mm5, %%mm5\n\t" // F..F
- "movq %%mm5, %%mm4\n\t"
- "psllw $8, %%mm5\n\t" //FF00FF00FF00
- "psrlw $8, %%mm4\n\t" //00FF00FF00FF
- ::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
- for(x=0;x<w;x+=8){
- asm volatile(
- "movl %1, %%eax\n\t"
- "orl 4%1, %%eax\n\t"
- " jz 1f\n\t"
- PREFETCHW" 32%0\n\t"
- PREFETCH" 32%1\n\t"
- PREFETCH" 32%2\n\t"
- "movq %0, %%mm0\n\t" // dstbase
- "movq %%mm0, %%mm1\n\t"
- "pand %%mm4, %%mm0\n\t" //0Y0Y0Y0Y
- "psrlw $8, %%mm1\n\t" //0Y0Y0Y0Y
- "movq %1, %%mm2\n\t" //srca HGFEDCBA
- "paddb bFF, %%mm2\n\t"
- "movq %%mm2, %%mm3\n\t"
- "pand %%mm4, %%mm2\n\t" //0G0E0C0A
- "psrlw $8, %%mm3\n\t" //0H0F0D0B
- "pmullw %%mm2, %%mm0\n\t"
- "pmullw %%mm3, %%mm1\n\t"
- "psrlw $8, %%mm0\n\t"
- "pand %%mm5, %%mm1\n\t"
- "por %%mm1, %%mm0\n\t"
- "paddb %2, %%mm0\n\t"
- "movq %%mm0, %0\n\t"
- "1:\n\t"
- :: "m" (dstbase[x]), "m" (srca[x]), "m" (src[x])
- : "%eax");
- }
+#ifdef CAN_COMPILE_X86_ASM
+ // ordered by speed, fastest first
+ if(gCpuCaps.hasMMX2)
+ vo_draw_alpha_yv12_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
+ else if(gCpuCaps.has3DNow)
+ vo_draw_alpha_yv12_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
+ else if(gCpuCaps.hasMMX)
+ vo_draw_alpha_yv12_MMX(w, h, src, srca, srcstride, dstbase, dststride);
+ else
+ vo_draw_alpha_yv12_X86(w, h, src, srca, srcstride, dstbase, dststride);
#else
- for(x=0;x<w;x++){
-#ifdef FAST_OSD
- if(srca[2*x+0]) dstbase[2*x+0]=src[2*x+0];
- if(srca[2*x+1]) dstbase[2*x+1]=src[2*x+1];
-#else
- if(srca[x]) dstbase[x]=((dstbase[x]*srca[x])>>8)+src[x];
+ vo_draw_alpha_yv12_C(w, h, src, srca, srcstride, dstbase, dststride);
#endif
- }
-#endif
- src+=srcstride;
- srca+=srcstride;
- dstbase+=dststride;
- }
-#ifdef HAVE_MMX
- asm volatile(EMMS:::"memory");
-#endif
-PROFILE_END("vo_draw_alpha_yv12");
- return;
}
void vo_draw_alpha_yuy2(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
- int y;
-#if defined(FAST_OSD) && !defined(HAVE_MMX)
- w=w>>1;
-#endif
-PROFILE_START();
- for(y=0;y<h;y++){
- register int x;
-#ifdef HAVE_MMX
- asm volatile(
- PREFETCHW" %0\n\t"
- PREFETCH" %1\n\t"
- PREFETCH" %2\n\t"
- "pxor %%mm7, %%mm7\n\t"
- "pcmpeqb %%mm5, %%mm5\n\t" // F..F
- "movq %%mm5, %%mm4\n\t"
- "psllw $8, %%mm5\n\t" //FF00FF00FF00
- "psrlw $8, %%mm4\n\t" //00FF00FF00FF
- ::"m"(*dstbase),"m"(*srca),"m"(*src));
- for(x=0;x<w;x+=4){
- asm volatile(
- "movl %1, %%eax\n\t"
- "orl %%eax, %%eax\n\t"
- " jz 1f\n\t"
- PREFETCHW" 32%0\n\t"
- PREFETCH" 32%1\n\t"
- PREFETCH" 32%2\n\t"
- "movq %0, %%mm0\n\t" // dstbase
- "movq %%mm0, %%mm1\n\t"
- "pand %%mm4, %%mm0\n\t" //0Y0Y0Y0Y
- "movd %%eax, %%mm2\n\t" //srca 0000DCBA
- "paddb bFF, %%mm2\n\t"
- "punpcklbw %%mm7, %%mm2\n\t" //srca 0D0C0B0A
- "pmullw %%mm2, %%mm0\n\t"
- "psrlw $8, %%mm0\n\t"
- "pand %%mm5, %%mm1\n\t" //U0V0U0V0
- "movd %2, %%mm2\n\t" //src 0000DCBA
- "punpcklbw %%mm7, %%mm2\n\t" //srca 0D0C0B0A
- "por %%mm1, %%mm0\n\t"
- "paddb %%mm2, %%mm0\n\t"
- "movq %%mm0, %0\n\t"
- "1:\n\t"
- :: "m" (dstbase[x*2]), "m" (srca[x]), "m" (src[x])
- : "%eax");
- }
-#else
- for(x=0;x<w;x++){
-#ifdef FAST_OSD
- if(srca[2*x+0]) dstbase[4*x+0]=src[2*x+0];
- if(srca[2*x+1]) dstbase[4*x+2]=src[2*x+1];
+#ifdef CAN_COMPILE_X86_ASM
+ // ordered by speed, fastest first
+ if(gCpuCaps.hasMMX2)
+ vo_draw_alpha_yuy2_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
+ else if(gCpuCaps.has3DNow)
+ vo_draw_alpha_yuy2_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
+ else if(gCpuCaps.hasMMX)
+ vo_draw_alpha_yuy2_MMX(w, h, src, srca, srcstride, dstbase, dststride);
+ else
+ vo_draw_alpha_yuy2_X86(w, h, src, srca, srcstride, dstbase, dststride);
#else
- if(srca[x]) dstbase[2*x]=((dstbase[2*x]*srca[x])>>8)+src[x];
-#endif
- }
+ vo_draw_alpha_yuy2_C(w, h, src, srca, srcstride, dstbase, dststride);
#endif
- src+=srcstride;
- srca+=srcstride;
- dstbase+=dststride;
- }
-#ifdef HAVE_MMX
- asm volatile(EMMS:::"memory");
-#endif
-PROFILE_END("vo_draw_alpha_yuy2");
- return;
}
-#ifdef HAVE_MMX
-static const unsigned long long mask24lh __attribute__((aligned(8))) = 0xFFFF000000000000ULL;
-static const unsigned long long mask24hl __attribute__((aligned(8))) = 0x0000FFFFFFFFFFFFULL;
-#endif
void vo_draw_alpha_rgb24(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
- int y;
- for(y=0;y<h;y++){
- register unsigned char *dst = dstbase;
- register int x;
-#ifdef ARCH_X86
-#ifdef HAVE_MMX
- asm volatile(
- PREFETCHW" %0\n\t"
- PREFETCH" %1\n\t"
- PREFETCH" %2\n\t"
- "pxor %%mm7, %%mm7\n\t"
- "pcmpeqb %%mm6, %%mm6\n\t" // F..F
- ::"m"(*dst),"m"(*srca),"m"(*src):"memory");
- for(x=0;x<w;x+=2){
- if(srca[x] || srca[x+1])
- asm volatile(
- PREFETCHW" 32%0\n\t"
- PREFETCH" 32%1\n\t"
- PREFETCH" 32%2\n\t"
- "movq %0, %%mm0\n\t" // dstbase
- "movq %%mm0, %%mm1\n\t"
- "movq %%mm0, %%mm5\n\t"
- "punpcklbw %%mm7, %%mm0\n\t"
- "punpckhbw %%mm7, %%mm1\n\t"
- "movd %1, %%mm2\n\t" // srca ABCD0000
- "paddb %%mm6, %%mm2\n\t"
- "punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
- "punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
- "movq %%mm2, %%mm3\n\t"
- "punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0A
- "punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B0B
- "pmullw %%mm2, %%mm0\n\t"
- "pmullw %%mm3, %%mm1\n\t"
- "psrlw $8, %%mm0\n\t"
- "psrlw $8, %%mm1\n\t"
- "packuswb %%mm1, %%mm0\n\t"
- "movd %2, %%mm2 \n\t" // src ABCD0000
- "punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
- "punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
- "paddb %%mm2, %%mm0\n\t"
- "pand %4, %%mm5\n\t"
- "pand %3, %%mm0\n\t"
- "por %%mm0, %%mm5\n\t"
- "movq %%mm5, %0\n\t"
- :: "m" (dst[0]), "m" (srca[x]), "m" (src[x]), "m"(mask24hl), "m"(mask24lh));
- dst += 6;
- }
-#else /* HAVE_MMX */
- for(x=0;x<w;x++){
- if(srca[x]){
- asm volatile(
- "movzbl (%0), %%ecx\n\t"
- "movzbl 1(%0), %%eax\n\t"
- "movzbl 2(%0), %%edx\n\t"
-
- "imull %1, %%ecx\n\t"
- "imull %1, %%eax\n\t"
- "imull %1, %%edx\n\t"
-
- "addl %2, %%ecx\n\t"
- "addl %2, %%eax\n\t"
- "addl %2, %%edx\n\t"
-
- "movb %%ch, (%0)\n\t"
- "movb %%ah, 1(%0)\n\t"
- "movb %%dh, 2(%0)\n\t"
-
- :
- :"r" (dst),
- "r" ((unsigned)srca[x]),
- "r" (((unsigned)src[x])<<8)
- :"%eax", "%ecx", "%edx"
- );
- }
- dst += 3;
- }
-#endif /* HAVE_MMX */
-#else /*non x86 arch*/
- for(x=0;x<w;x++){
- if(srca[x]){
-#ifdef FAST_OSD
- dst[0]=dst[1]=dst[2]=src[x];
+#ifdef CAN_COMPILE_X86_ASM
+ // ordered by speed, fastest first
+ if(gCpuCaps.hasMMX2)
+ vo_draw_alpha_rgb24_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
+ else if(gCpuCaps.has3DNow)
+ vo_draw_alpha_rgb24_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
+ else if(gCpuCaps.hasMMX)
+ vo_draw_alpha_rgb24_MMX(w, h, src, srca, srcstride, dstbase, dststride);
+ else
+ vo_draw_alpha_rgb24_X86(w, h, src, srca, srcstride, dstbase, dststride);
#else
- dst[0]=((dst[0]*srca[x])>>8)+src[x];
- dst[1]=((dst[1]*srca[x])>>8)+src[x];
- dst[2]=((dst[2]*srca[x])>>8)+src[x];
+ vo_draw_alpha_rgb24_C(w, h, src, srca, srcstride, dstbase, dststride);
#endif
- }
- dst+=3; // 24bpp
- }
-#endif /* arch_x86 */
- src+=srcstride;
- srca+=srcstride;
- dstbase+=dststride;
- }
-#ifdef HAVE_MMX
- asm volatile(EMMS:::"memory");
-#endif
- return;
}
void vo_draw_alpha_rgb32(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
- int y;
-PROFILE_START();
- for(y=0;y<h;y++){
- register int x;
-#ifdef ARCH_X86
-#ifdef HAVE_MMX
-#ifdef HAVE_3DNOW
- asm volatile(
- PREFETCHW" %0\n\t"
- PREFETCH" %1\n\t"
- PREFETCH" %2\n\t"
- "pxor %%mm7, %%mm7\n\t"
- "pcmpeqb %%mm6, %%mm6\n\t" // F..F
- ::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
- for(x=0;x<w;x+=2){
- if(srca[x] || srca[x+1])
- asm volatile(
- PREFETCHW" 32%0\n\t"
- PREFETCH" 32%1\n\t"
- PREFETCH" 32%2\n\t"
- "movq %0, %%mm0\n\t" // dstbase
- "movq %%mm0, %%mm1\n\t"
- "punpcklbw %%mm7, %%mm0\n\t"
- "punpckhbw %%mm7, %%mm1\n\t"
- "movd %1, %%mm2\n\t" // srca ABCD0000
- "paddb %%mm6, %%mm2\n\t"
- "punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
- "punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
- "movq %%mm2, %%mm3\n\t"
- "punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0A
- "punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B0B
- "pmullw %%mm2, %%mm0\n\t"
- "pmullw %%mm3, %%mm1\n\t"
- "psrlw $8, %%mm0\n\t"
- "psrlw $8, %%mm1\n\t"
- "packuswb %%mm1, %%mm0\n\t"
- "movd %2, %%mm2 \n\t" // src ABCD0000
- "punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
- "punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
- "paddb %%mm2, %%mm0\n\t"
- "movq %%mm0, %0\n\t"
- :: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x]));
- }
-#else //this is faster for intels crap
- asm volatile(
- PREFETCHW" %0\n\t"
- PREFETCH" %1\n\t"
- PREFETCH" %2\n\t"
- "pxor %%mm7, %%mm7\n\t"
- "pcmpeqb %%mm5, %%mm5\n\t" // F..F
- "movq %%mm5, %%mm4\n\t"
- "psllw $8, %%mm5\n\t" //FF00FF00FF00
- "psrlw $8, %%mm4\n\t" //00FF00FF00FF
- ::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
- for(x=0;x<w;x+=4){
- asm volatile(
- "movl %1, %%eax\n\t"
- "orl %%eax, %%eax\n\t"
- " jz 1f\n\t"
- PREFETCHW" 32%0\n\t"
- PREFETCH" 32%1\n\t"
- PREFETCH" 32%2\n\t"
- "movq %0, %%mm0\n\t" // dstbase
- "movq %%mm0, %%mm1\n\t"
- "pand %%mm4, %%mm0\n\t" //0R0B0R0B
- "psrlw $8, %%mm1\n\t" //0?0G0?0G
- "movd %%eax, %%mm2\n\t" //srca 0000DCBA
- "paddb bFF, %%mm2\n\t"
- "punpcklbw %%mm2, %%mm2\n\t" //srca DDCCBBAA
- "movq %%mm2, %%mm3\n\t"
- "punpcklbw %%mm7, %%mm2\n\t" //srca 0B0B0A0A
- "pmullw %%mm2, %%mm0\n\t"
- "pmullw %%mm2, %%mm1\n\t"
- "psrlw $8, %%mm0\n\t"
- "pand %%mm5, %%mm1\n\t"
- "por %%mm1, %%mm0\n\t"
- "movd %2, %%mm2 \n\t" //src 0000DCBA
- "punpcklbw %%mm2, %%mm2\n\t" //src DDCCBBAA
- "movq %%mm2, %%mm6\n\t"
- "punpcklbw %%mm2, %%mm2\n\t" //src BBBBAAAA
- "paddb %%mm2, %%mm0\n\t"
- "movq %%mm0, %0\n\t"
-
- "movq 8%0, %%mm0\n\t" // dstbase
- "movq %%mm0, %%mm1\n\t"
- "pand %%mm4, %%mm0\n\t" //0R0B0R0B
- "psrlw $8, %%mm1\n\t" //0?0G0?0G
- "punpckhbw %%mm7, %%mm3\n\t" //srca 0D0D0C0C
- "pmullw %%mm3, %%mm0\n\t"
- "pmullw %%mm3, %%mm1\n\t"
- "psrlw $8, %%mm0\n\t"
- "pand %%mm5, %%mm1\n\t"
- "por %%mm1, %%mm0\n\t"
- "punpckhbw %%mm6, %%mm6\n\t" //src DDDDCCCC
- "paddb %%mm6, %%mm0\n\t"
- "movq %%mm0, 8%0\n\t"
- "1:\n\t"
- :: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x])
- : "%eax");
- }
-#endif
-#else /* HAVE_MMX */
- for(x=0;x<w;x++){
- if(srca[x]){
- asm volatile(
- "movzbl (%0), %%ecx\n\t"
- "movzbl 1(%0), %%eax\n\t"
- "movzbl 2(%0), %%edx\n\t"
-
- "imull %1, %%ecx\n\t"
- "imull %1, %%eax\n\t"
- "imull %1, %%edx\n\t"
-
- "addl %2, %%ecx\n\t"
- "addl %2, %%eax\n\t"
- "addl %2, %%edx\n\t"
-
- "movb %%ch, (%0)\n\t"
- "movb %%ah, 1(%0)\n\t"
- "movb %%dh, 2(%0)\n\t"
-
- :
- :"r" (&dstbase[4*x]),
- "r" ((unsigned)srca[x]),
- "r" (((unsigned)src[x])<<8)
- :"%eax", "%ecx", "%edx"
- );
- }
- }
-#endif /* HAVE_MMX */
-#else /*non x86 arch*/
- for(x=0;x<w;x++){
- if(srca[x]){
-#ifdef FAST_OSD
- dstbase[4*x+0]=dstbase[4*x+1]=dstbase[4*x+2]=src[x];
+#ifdef CAN_COMPILE_X86_ASM
+ // ordered by speed, fastest first
+ if(gCpuCaps.hasMMX2)
+ vo_draw_alpha_rgb32_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
+ else if(gCpuCaps.has3DNow)
+ vo_draw_alpha_rgb32_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
+ else if(gCpuCaps.hasMMX)
+ vo_draw_alpha_rgb32_MMX(w, h, src, srca, srcstride, dstbase, dststride);
+ else
+ vo_draw_alpha_rgb32_X86(w, h, src, srca, srcstride, dstbase, dststride);
#else
- dstbase[4*x+0]=((dstbase[4*x+0]*srca[x])>>8)+src[x];
- dstbase[4*x+1]=((dstbase[4*x+1]*srca[x])>>8)+src[x];
- dstbase[4*x+2]=((dstbase[4*x+2]*srca[x])>>8)+src[x];
+ vo_draw_alpha_rgb32_C(w, h, src, srca, srcstride, dstbase, dststride);
#endif
- }
- }
-#endif /* arch_x86 */
- src+=srcstride;
- srca+=srcstride;
- dstbase+=dststride;
- }
-#ifdef HAVE_MMX
- asm volatile(EMMS:::"memory");
-#endif
-PROFILE_END("vo_draw_alpha_rgb32");
- return;
}
#ifdef FAST_OSD_TABLE
@@ -424,6 +150,23 @@ void vo_draw_alpha_init(){
fast_osd_16bpp_table[i]=((i>>3)<<11)|((i>>2)<<5)|(i>>3);
}
#endif
+//FIXME the "optimized" message is a lie for 15/16bpp, as those renderers aren't optimized yet
+ if(verbose)
+ {
+#ifdef CAN_COMPILE_X86_ASM
+ // ordered by speed, fastest first
+ if(gCpuCaps.hasMMX2)
+ printf("Using MMX (with tiny bit MMX2) Optimized OnScreenDisplay\n");
+ else if(gCpuCaps.has3DNow)
+ printf("Using MMX (with tiny bit 3DNow) Optimized OnScreenDisplay\n");
+ else if(gCpuCaps.hasMMX)
+ printf("Using MMX Optimized OnScreenDisplay\n");
+ else
+ printf("Using X86 Optimized OnScreenDisplay\n");
+#else
+ printf("Using Unoptimized OnScreenDisplay\n");
+#endif
+ }
}
void vo_draw_alpha_rgb15(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
diff --git a/libvo/osd_template.c b/libvo/osd_template.c
index 7dfb82f692..1dcc056acb 100644
--- a/libvo/osd_template.c
+++ b/libvo/osd_template.c
@@ -1,22 +1,33 @@
// Generic alpha renderers for all YUV modes and RGB depths.
-// These are "reference implementations", should be optimized later (MMX, etc)
// Optimized by Nick and Michael
+// Code from Michael Niedermayer (michaelni@gmx.at) is under GPL
-//#define FAST_OSD
-//#define FAST_OSD_TABLE
+#undef PREFETCH
+#undef EMMS
+#undef PREFETCHW
+#undef PAVGB
-#include "config.h"
-#include "osd.h"
-#include "../mmx_defs.h"
-//#define ENABLE_PROFILE
-#include "../my_profile.h"
-#include <inttypes.h>
+#ifdef HAVE_3DNOW
+#define PREFETCH "prefetch"
+#define PREFETCHW "prefetchw"
+#define PAVGB "pavgusb"
+#elif defined ( HAVE_MMX2 )
+#define PREFETCH "prefetchnta"
+#define PREFETCHW "prefetcht0"
+#define PAVGB "pavgb"
+#else
+#define PREFETCH "/nop"
+#define PREFETCHW "/nop"
+#endif
-#ifdef HAVE_MMX
-static const uint64_t bFF __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL;
+#ifdef HAVE_3DNOW
+/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
+#define EMMS "femms"
+#else
+#define EMMS "emms"
#endif
-void vo_draw_alpha_yv12(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+static inline void RENAME(vo_draw_alpha_yv12)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
int y;
#if defined(FAST_OSD) && !defined(HAVE_MMX)
w=w>>1;
@@ -84,7 +95,7 @@ PROFILE_END("vo_draw_alpha_yv12");
return;
}
-void vo_draw_alpha_yuy2(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+static inline void RENAME(vo_draw_alpha_yuy2)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
int y;
#if defined(FAST_OSD) && !defined(HAVE_MMX)
w=w>>1;
@@ -150,11 +161,7 @@ PROFILE_END("vo_draw_alpha_yuy2");
return;
}
-#ifdef HAVE_MMX
-static const unsigned long long mask24lh __attribute__((aligned(8))) = 0xFFFF000000000000ULL;
-static const unsigned long long mask24hl __attribute__((aligned(8))) = 0x0000FFFFFFFFFFFFULL;
-#endif
-void vo_draw_alpha_rgb24(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+static inline void RENAME(vo_draw_alpha_rgb24)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
int y;
for(y=0;y<h;y++){
register unsigned char *dst = dstbase;
@@ -256,7 +263,7 @@ void vo_draw_alpha_rgb24(int w,int h, unsigned char* src, unsigned char *srca, i
return;
}
-void vo_draw_alpha_rgb32(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+static inline void RENAME(vo_draw_alpha_rgb32)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
int y;
PROFILE_START();
for(y=0;y<h;y++){
@@ -410,82 +417,3 @@ PROFILE_START();
PROFILE_END("vo_draw_alpha_rgb32");
return;
}
-
-#ifdef FAST_OSD_TABLE
-static unsigned short fast_osd_15bpp_table[256];
-static unsigned short fast_osd_16bpp_table[256];
-#endif
-
-void vo_draw_alpha_init(){
-#ifdef FAST_OSD_TABLE
- int i;
- for(i=0;i<256;i++){
- fast_osd_15bpp_table[i]=((i>>3)<<10)|((i>>3)<<5)|(i>>3);
- fast_osd_16bpp_table[i]=((i>>3)<<11)|((i>>2)<<5)|(i>>3);
- }
-#endif
-}
-
-void vo_draw_alpha_rgb15(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
- int y;
- for(y=0;y<h;y++){
- register unsigned short *dst = (unsigned short*) dstbase;
- register int x;
- for(x=0;x<w;x++){
- if(srca[x]){
-#ifdef FAST_OSD
-#ifdef FAST_OSD_TABLE
- dst[x]=fast_osd_15bpp_table[src[x]];
-#else
- register unsigned int a=src[x]>>3;
- dst[x]=(a<<10)|(a<<5)|a;
-#endif
-#else
- unsigned char r=dst[x]&0x1F;
- unsigned char g=(dst[x]>>5)&0x1F;
- unsigned char b=(dst[x]>>10)&0x1F;
- r=(((r*srca[x])>>5)+src[x])>>3;
- g=(((g*srca[x])>>5)+src[x])>>3;
- b=(((b*srca[x])>>5)+src[x])>>3;
- dst[x]=(b<<10)|(g<<5)|r;
-#endif
- }
- }
- src+=srcstride;
- srca+=srcstride;
- dstbase+=dststride;
- }
- return;
-}
-
-void vo_draw_alpha_rgb16(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
- int y;
- for(y=0;y<h;y++){
- register unsigned short *dst = (unsigned short*) dstbase;
- register int x;
- for(x=0;x<w;x++){
- if(srca[x]){
-#ifdef FAST_OSD
-#ifdef FAST_OSD_TABLE
- dst[x]=fast_osd_16bpp_table[src[x]];
-#else
- dst[x]=((src[x]>>3)<<11)|((src[x]>>2)<<5)|(src[x]>>3);
-#endif
-#else
- unsigned char r=dst[x]&0x1F;
- unsigned char g=(dst[x]>>5)&0x3F;
- unsigned char b=(dst[x]>>11)&0x1F;
- r=(((r*srca[x])>>5)+src[x])>>3;
- g=(((g*srca[x])>>6)+src[x])>>2;
- b=(((b*srca[x])>>5)+src[x])>>3;
- dst[x]=(b<<11)|(g<<5)|r;
-#endif
- }
- }
- src+=srcstride;
- srca+=srcstride;
- dstbase+=dststride;
- }
- return;
-}
-