author     arpi_esp <arpi_esp@b3059339-0415-0410-9bf9-f77b7e298cf2>    2001-04-12 14:40:10 +0000
committer  arpi_esp <arpi_esp@b3059339-0415-0410-9bf9-f77b7e298cf2>    2001-04-12 14:40:10 +0000
commit     34a46800995e272e65021b23c0932a958a158c2c (patch)
tree       9848826311293729bb22dcc5523eb122778b216b /libvo/fastmemcpy.h
parent     3dde448fb2a3fa46c45bc6734a0a4c06f550d11b (diff)
P3 fixes...
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@377 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'libvo/fastmemcpy.h')
-rw-r--r--  libvo/fastmemcpy.h | 54
1 file changed, 24 insertions(+), 30 deletions(-)
diff --git a/libvo/fastmemcpy.h b/libvo/fastmemcpy.h
index 44ee7ef473..1f0a41853e 100644
--- a/libvo/fastmemcpy.h
+++ b/libvo/fastmemcpy.h
@@ -2,31 +2,19 @@
This part of code was taken by from Linux-2.4.3 and slightly modified
for MMX2 instruction set. I have done it since linux uses page aligned
blocks but mplayer uses weakly ordered data and original sources can not
-speedup their. Only using prefetch and movntq together have effect!
+speedup their. Only using prefetchnta and movntq together have effect!
If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru.
*/
-
-#ifndef HAVE_MMX2
-//static inline void * __memcpy(void * to, const void * from, unsigned n)
-inline static void * fast_memcpy(void * to, const void * from, unsigned n)
-{
-int d0, d1, d2;
-__asm__ __volatile__(
- "rep ; movsl\n\t"
- "testb $2,%b4\n\t"
- "je 1f\n\t"
- "movsw\n"
- "1:\ttestb $1,%b4\n\t"
- "je 2f\n\t"
- "movsb\n"
- "2:"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
- : "memory");
-return (to);
+#ifdef HAVE_MMX2
+/* for small memory blocks (<256 bytes) this version is faster */
+#define small_memcpy(to,from,n)\
+{\
+__asm__ __volatile__(\
+ "rep ; movsb\n"\
+ ::"D" (to), "S" (from),"c" (n)\
+ : "memory");\
}
-#else
-//inline static void *__memcpy_mmx2(void *to, const void *from, unsigned len)
+
inline static void * fast_memcpy(void * to, const void * from, unsigned len)
{
void *p;
@@ -36,12 +24,15 @@ inline static void * fast_memcpy(void * to, const void * from, unsigned len)
{
p = to;
i = len >> 6; /* len/64 */
+ len&=63;
+
__asm__ __volatile__ (
- "1: prefetch (%0)\n" /* This set is 28 bytes */
- " prefetch 64(%0)\n"
- " prefetch 128(%0)\n"
- " prefetch 192(%0)\n"
- " prefetch 256(%0)\n"
+ "1: prefetchnta (%0)\n" /* This set is 28 bytes */
+ " prefetchnta 64(%0)\n"
+ " prefetchnta 128(%0)\n"
+ " prefetchnta 192(%0)\n"
+ " prefetchnta 256(%0)\n"
+#if 0
"2: \n"
".section .fixup, \"ax\"\n"
"3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
@@ -51,13 +42,14 @@ inline static void * fast_memcpy(void * to, const void * from, unsigned len)
" .align 4\n"
" .long 1b, 3b\n"
".previous"
+#endif
: : "r" (from) );
for(; i>0; i--)
{
__asm__ __volatile__ (
- "1: prefetch 320(%0)\n"
+ "1: prefetchnta 320(%0)\n"
"2: movq (%0), %%mm0\n"
" movq 8(%0), %%mm1\n"
" movq 16(%0), %%mm2\n"
@@ -74,6 +66,7 @@ inline static void * fast_memcpy(void * to, const void * from, unsigned len)
" movntq %%mm1, 40(%1)\n"
" movntq %%mm2, 48(%1)\n"
" movntq %%mm3, 56(%1)\n"
+#if 0
".section .fixup, \"ax\"\n"
"3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
" jmp 2b\n"
@@ -82,6 +75,7 @@ inline static void * fast_memcpy(void * to, const void * from, unsigned len)
" .align 4\n"
" .long 1b, 3b\n"
".previous"
+#endif
: : "r" (from), "r" (to) : "memory");
from+=64;
to+=64;
@@ -91,10 +85,10 @@ inline static void * fast_memcpy(void * to, const void * from, unsigned len)
/*
* Now do the tail of the block
*/
- memcpy(to, from, len&63);
+ small_memcpy(to, from, len);
return p;
}
+#define memcpy(a,b,c) fast_memcpy(a,b,c)
#endif
-#define memcpy(a,b,c) fast_memcpy(a,b,c)
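
For reference, a rough C sketch of the copy strategy this patch settles on: the length is split into whole 64-byte blocks (handled in the real code by the prefetchnta + movq/movntq loop) and a small byte tail (handled by the new small_memcpy, i.e. rep movsb). The function name fast_memcpy_sketch and the plain memcpy calls standing in for the inline assembly are illustrative only and are not part of the patch.

#include <stddef.h>
#include <string.h>

static void *fast_memcpy_sketch(void *to, const void *from, size_t len)
{
    void *p = to;                    /* original destination, returned at the end */
    unsigned char *d = to;
    const unsigned char *s = from;
    size_t blocks = len >> 6;        /* number of whole 64-byte blocks */
    len &= 63;                       /* bytes left over for the tail   */

    while (blocks--) {               /* in the patch: prefetchnta + 8x movq/movntq */
        memcpy(d, s, 64);
        d += 64;
        s += 64;
    }
    if (len)                         /* in the patch: small_memcpy (rep movsb) */
        memcpy(d, s, len);
    return p;
}

Note also that the patch removes the generic rep-movsl fallback and moves the #define memcpy(a,b,c) fast_memcpy(a,b,c) override inside the #ifdef HAVE_MMX2 block, so callers that include fastmemcpy.h only get the replacement when the MMX2 path actually exists; otherwise the libc memcpy is used unchanged.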