| author    | arpi_esp <arpi_esp@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2001-04-12 14:40:10 +0000 |
|-----------|----------------------------------------------------------|---------------------------|
| committer | arpi_esp <arpi_esp@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2001-04-12 14:40:10 +0000 |
| commit    | 34a46800995e272e65021b23c0932a958a158c2c (patch)         |                           |
| tree      | 9848826311293729bb22dcc5523eb122778b216b /libvo/fastmemcpy.h | |
| parent    | 3dde448fb2a3fa46c45bc6734a0a4c06f550d11b (diff)          |                           |
| download  | mpv-34a46800995e272e65021b23c0932a958a158c2c.tar.bz2, mpv-34a46800995e272e65021b23c0932a958a158c2c.tar.xz | |
P3 fixes...
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@377 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'libvo/fastmemcpy.h')
| -rw-r--r-- | libvo/fastmemcpy.h | 54 |

1 file changed, 24 insertions, 30 deletions
```diff
diff --git a/libvo/fastmemcpy.h b/libvo/fastmemcpy.h
index 44ee7ef473..1f0a41853e 100644
--- a/libvo/fastmemcpy.h
+++ b/libvo/fastmemcpy.h
@@ -2,31 +2,19 @@
   This part of code was taken by from Linux-2.4.3 and slightly modified
 for MMX2 instruction set. I have done it since linux uses page aligned
 blocks but mplayer uses weakly ordered data and original sources can not
-speedup their. Only using prefetch and movntq together have effect!
+speedup their. Only using prefetchnta and movntq together have effect!
   If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru.
 */
-
-#ifndef HAVE_MMX2
-//static inline void * __memcpy(void * to, const void * from, unsigned n)
-inline static void * fast_memcpy(void * to, const void * from, unsigned n)
-{
-int d0, d1, d2;
-__asm__ __volatile__(
-	"rep ; movsl\n\t"
-	"testb $2,%b4\n\t"
-	"je 1f\n\t"
-	"movsw\n"
-	"1:\ttestb $1,%b4\n\t"
-	"je 2f\n\t"
-	"movsb\n"
-	"2:"
-	: "=&c" (d0), "=&D" (d1), "=&S" (d2)
-	:"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
-	: "memory");
-return (to);
+#ifdef HAVE_MMX2
+/* for small memory blocks (<256 bytes) this version is faster */
+#define small_memcpy(to,from,n)\
+{\
+__asm__ __volatile__(\
+	"rep ; movsb\n"\
+	::"D" (to), "S" (from),"c" (n)\
+	: "memory");\
 }
-#else
-//inline static void *__memcpy_mmx2(void *to, const void *from, unsigned len)
+
 inline static void * fast_memcpy(void * to, const void * from, unsigned len)
 {
 	void *p;
@@ -36,12 +24,15 @@ inline static void * fast_memcpy(void * to, const void * from, unsigned len)
 	{
 	  p = to;
 	  i = len >> 6; /* len/64 */
+	  len&=63;
+
 	__asm__ __volatile__ (
-	"1:  prefetch (%0)\n"		/* This set is 28 bytes */
-	"    prefetch 64(%0)\n"
-	"    prefetch 128(%0)\n"
-	"    prefetch 192(%0)\n"
-	"    prefetch 256(%0)\n"
+	"1:  prefetchnta (%0)\n"	/* This set is 28 bytes */
+	"    prefetchnta 64(%0)\n"
+	"    prefetchnta 128(%0)\n"
+	"    prefetchnta 192(%0)\n"
+	"    prefetchnta 256(%0)\n"
+#if 0
 	"2:  \n"
 	".section .fixup, \"ax\"\n"
 	"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
@@ -51,13 +42,14 @@ inline static void * fast_memcpy(void * to, const void * from, unsigned len)
 	"	.align 4\n"
 	"	.long 1b, 3b\n"
 	".previous"
+#endif
 	: : "r" (from) );

 	for(; i>0; i--)
 	{
 		__asm__ __volatile__ (
-		"1:  prefetch 320(%0)\n"
+		"1:  prefetchnta 320(%0)\n"
 		"2:  movq (%0), %%mm0\n"
 		"  movq 8(%0), %%mm1\n"
 		"  movq 16(%0), %%mm2\n"
@@ -74,6 +66,7 @@ inline static void * fast_memcpy(void * to, const void * from, unsigned len)
 		"  movntq %%mm1, 40(%1)\n"
 		"  movntq %%mm2, 48(%1)\n"
 		"  movntq %%mm3, 56(%1)\n"
+#if 0
 		".section .fixup, \"ax\"\n"
 		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
 		"   jmp 2b\n"
@@ -82,6 +75,7 @@ inline static void * fast_memcpy(void * to, const void * from, unsigned len)
 		"	.align 4\n"
 		"	.long 1b, 3b\n"
 		".previous"
+#endif
 		: : "r" (from), "r" (to) : "memory");
 		from+=64;
 		to+=64;
@@ -91,10 +85,10 @@ inline static void * fast_memcpy(void * to, const void * from, unsigned len)
 	/*
 	 *	Now do the tail of the block
 	 */
-	memcpy(to, from, len&63);
+	small_memcpy(to, from, len);
 	return p;
 }
+#define memcpy(a,b,c) fast_memcpy(a,b,c)

 #endif

-#define memcpy(a,b,c) fast_memcpy(a,b,c)
```
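The heart of the patch is the prefetchnta/movntq pairing: prefetchnta pulls the source ahead through a minimal cache footprint, movntq writes the destination with non-temporal stores that bypass the cache, and the sub-64-byte remainder (now split off up front with `len&=63`) falls through to a cheap byte copy. The sketch below shows the same technique in C using GCC/Clang SSE intrinsics rather than inline assembly. It is not code from this commit: the name `nt_memcpy` is made up, and the `_mm_sfence()` is an extra step the original asm does not issue (non-temporal stores are weakly ordered, so a fence is the conventional way to make them visible before returning). Compile with `-msse` on x86.

```c
/* Hedged sketch of the prefetchnta + movntq copy loop from this patch,
 * expressed with intrinsics. nt_memcpy is a hypothetical name. */
#include <stddef.h>
#include <string.h>
#include <mmintrin.h>   /* __m64, _mm_empty */
#include <xmmintrin.h>  /* _mm_prefetch, _mm_stream_pi, _mm_sfence */

static void *nt_memcpy(void *to, const void *from, size_t len)
{
    void *p = to;
    const __m64 *src = (const __m64 *)from;
    __m64 *dst = (__m64 *)to;
    size_t blocks = len >> 6;   /* number of whole 64-byte blocks */
    len &= 63;                  /* bytes left over for the tail copy */

    for (; blocks > 0; blocks--) {
        /* prefetchnta: fetch well ahead without displacing cached data
         * (the patch prefetches 320 bytes ahead of the read pointer) */
        _mm_prefetch((const char *)src + 320, _MM_HINT_NTA);
        /* eight movq loads + movntq non-temporal stores = 64 bytes */
        for (int i = 0; i < 8; i++)
            _mm_stream_pi(dst + i, src[i]);
        src += 8;
        dst += 8;
    }
    _mm_sfence();  /* order the streaming stores (not in the original) */
    _mm_empty();   /* emms: release MMX state before any FP code runs */

    memcpy(dst, src, len);  /* plain copy for the sub-64-byte tail */
    return p;
}
```

The patch's new `small_memcpy` (a bare `rep movsb`) plays the role of that trailing `memcpy`: per its own comment, it is faster for blocks under 256 bytes, where prefetch distance and loop setup cost more than the streaming stores save. Moving the `#define memcpy(a,b,c) fast_memcpy(a,b,c)` inside the `#ifdef HAVE_MMX2` block also means non-MMX2 builds now fall back to plain libc memcpy, since the old `rep movsl` variant was deleted.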