author     michael <michael@b3059339-0415-0410-9bf9-f77b7e298cf2>  2001-11-22 19:40:38 +0000
committer  michael <michael@b3059339-0415-0410-9bf9-f77b7e298cf2>  2001-11-22 19:40:38 +0000
commit     6c6cc954f5eef6e800f4cb46db31d9932dc61f00 (patch)
tree       d005591526537e9be19929124940af0f5f8a2295 /libvo
parent     2a6e9d9edae6aaf94e643f026cb63774e907cbe6 (diff)
10-20% faster fastmemcpy :) on my P3 at least, but the algorithm is mostly from the "AMD Athlon Processor x86 Code Optimization Guide", so it should be faster on AMD chips too. I fear it might be slower for mem->vram copies, though (someone should check that, I can't). There are 2 #defines to fine-tune it (BLOCK_SIZE & CONFUSION_FACTOR).
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@3078 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'libvo')
-rw-r--r--  libvo/aclib.c           | 122
-rw-r--r--  libvo/aclib_template.c  | 122
2 files changed, 232 insertions(+), 12 deletions(-)
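The core of the patch is the block-prefetch strategy from the AMD guide: pull one BLOCK_SIZE chunk of the source into the cache with a quick read-only pass (one load per cache line), then stream that chunk to the destination with non-temporal MOVNTQ stores. A rough portable-C sketch of just that control flow follows, for orientation only; block_prefetch_copy, CACHE_LINE and the memcpy() calls are illustrative stand-ins and not what the asm below actually does.

#include <stddef.h>
#include <string.h>

#define BLOCK_SIZE 4096          /* same default as the patch */
#define CACHE_LINE 32            /* P3/K7-era line size assumed here */

/* Illustrative only: the real code uses MMX loads and MOVNTQ
 * non-temporal stores instead of memcpy(). */
static void block_prefetch_copy(unsigned char *to, const unsigned char *from,
                                size_t len)
{
    while (len >= BLOCK_SIZE) {
        volatile const unsigned char *p = from;
        unsigned char sink = 0;
        size_t off;

        /* Pass 1: touch one byte per cache line so the whole
         * BLOCK_SIZE chunk is pulled into the cache in one burst. */
        for (off = 0; off < BLOCK_SIZE; off += CACHE_LINE)
            sink ^= p[off];
        (void)sink;

        /* Pass 2: copy the now-cached block to the destination
         * (the patch streams it out with MOVNTQ, bypassing the cache). */
        memcpy(to, from, BLOCK_SIZE);

        from += BLOCK_SIZE;
        to   += BLOCK_SIZE;
        len  -= BLOCK_SIZE;
    }
    if (len)
        memcpy(to, from, len);   /* tail, handled separately in the patch */
}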
diff --git a/libvo/aclib.c b/libvo/aclib.c
index 2366a28d51..d2c51c3157 100644
--- a/libvo/aclib.c
+++ b/libvo/aclib.c
@@ -1,13 +1,19 @@
#include "../config.h"
#ifdef USE_FASTMEMCPY
-/*
+/*
aclib - advanced C library ;)
This file contains functions which improve and expand standard C-library
*/
#include <stddef.h>
+#define BLOCK_SIZE 4096
+#define CONFUSION_FACTOR 0
+//Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)
+
+//#define STATISTICS
+
#ifndef HAVE_SSE2
/*
P3 processor has only one SSE decoder so can execute only 1 sse insn per
@@ -103,7 +109,7 @@ __asm__ __volatile__(\
#ifdef HAVE_SSE
#define MMREG_SIZE 16
#else
-#define MMREG_SIZE 8
+#define MMREG_SIZE 64 //8
#endif
/* Small defines (for readability only) ;) */
@@ -132,7 +138,20 @@ void * fast_memcpy(void * to, const void * from, size_t len)
{
void *retval;
size_t i;
- retval = to;
+ retval = to;
+#ifdef STATISTICS
+ {
+ static int freq[33];
+ static int t=0;
+ int i;
+ for(i=0; len>(1<<i); i++);
+ freq[i]++;
+ t++;
+ if(1024*1024*1024 % t == 0)
+ for(i=0; i<32; i++)
+ printf("freq < %8d %4d\n", 1<<i, freq[i]);
+ }
+#endif
#ifndef HAVE_MMX1
/* PREFETCH has effect even for MOVSB instruction ;) */
__asm__ __volatile__ (
@@ -184,7 +203,7 @@ void * fast_memcpy(void * to, const void * from, size_t len)
((const unsigned char *)from)+=64;
((unsigned char *)to)+=64;
}
- else
+ else
/*
Only if SRC is aligned on 16-byte boundary.
It allows to use movaps instead of movups, which required data
@@ -207,6 +226,96 @@ void * fast_memcpy(void * to, const void * from, size_t len)
((unsigned char *)to)+=64;
}
#else
+ // Align destination at BLOCK_SIZE boundary
+ for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
+ {
+ __asm__ __volatile__ (
+#ifndef HAVE_MMX1
+ PREFETCH" 320(%0)\n"
+#endif
+ "movq (%0), %%mm0\n"
+ "movq 8(%0), %%mm1\n"
+ "movq 16(%0), %%mm2\n"
+ "movq 24(%0), %%mm3\n"
+ "movq 32(%0), %%mm4\n"
+ "movq 40(%0), %%mm5\n"
+ "movq 48(%0), %%mm6\n"
+ "movq 56(%0), %%mm7\n"
+ MOVNTQ" %%mm0, (%1)\n"
+ MOVNTQ" %%mm1, 8(%1)\n"
+ MOVNTQ" %%mm2, 16(%1)\n"
+ MOVNTQ" %%mm3, 24(%1)\n"
+ MOVNTQ" %%mm4, 32(%1)\n"
+ MOVNTQ" %%mm5, 40(%1)\n"
+ MOVNTQ" %%mm6, 48(%1)\n"
+ MOVNTQ" %%mm7, 56(%1)\n"
+ :: "r" (from), "r" (to) : "memory");
+ ((const unsigned char *)from)+=64;
+ ((unsigned char *)to)+=64;
+ }
+
+// printf(" %d %d\n", (int)from&1023, (int)to&1023);
+ // Pure Assembly cuz gcc is a bit unpredictable ;)
+ if(i>=BLOCK_SIZE/64)
+ asm volatile(
+ "xorl %%eax, %%eax \n\t"
+ ".balign 16 \n\t"
+ "1: \n\t"
+ "movl (%0, %%eax), %%ebx \n\t"
+ "movl 32(%0, %%eax), %%ebx \n\t"
+ "movl 64(%0, %%eax), %%ebx \n\t"
+ "movl 96(%0, %%eax), %%ebx \n\t"
+ "addl $128, %%eax \n\t"
+ "cmpl %3, %%eax \n\t"
+ " jb 1b \n\t"
+
+ "xorl %%eax, %%eax \n\t"
+
+ ".balign 16 \n\t"
+ "2: \n\t"
+ "movq (%0, %%eax), %%mm0\n"
+ "movq 8(%0, %%eax), %%mm1\n"
+ "movq 16(%0, %%eax), %%mm2\n"
+ "movq 24(%0, %%eax), %%mm3\n"
+ "movq 32(%0, %%eax), %%mm4\n"
+ "movq 40(%0, %%eax), %%mm5\n"
+ "movq 48(%0, %%eax), %%mm6\n"
+ "movq 56(%0, %%eax), %%mm7\n"
+ MOVNTQ" %%mm0, (%1, %%eax)\n"
+ MOVNTQ" %%mm1, 8(%1, %%eax)\n"
+ MOVNTQ" %%mm2, 16(%1, %%eax)\n"
+ MOVNTQ" %%mm3, 24(%1, %%eax)\n"
+ MOVNTQ" %%mm4, 32(%1, %%eax)\n"
+ MOVNTQ" %%mm5, 40(%1, %%eax)\n"
+ MOVNTQ" %%mm6, 48(%1, %%eax)\n"
+ MOVNTQ" %%mm7, 56(%1, %%eax)\n"
+ "addl $64, %%eax \n\t"
+ "cmpl %3, %%eax \n\t"
+ "jb 2b \n\t"
+
+#if CONFUSION_FACTOR > 0
+ // a few percent speedup on out of order executing CPUs
+ "movl %5, %%eax \n\t"
+ "2: \n\t"
+ "movl (%0), %%ebx \n\t"
+ "movl (%0), %%ebx \n\t"
+ "movl (%0), %%ebx \n\t"
+ "movl (%0), %%ebx \n\t"
+ "decl %%eax \n\t"
+ " jnz 2b \n\t"
+#endif
+
+ "xorl %%eax, %%eax \n\t"
+ "addl %3, %0 \n\t"
+ "addl %3, %1 \n\t"
+ "subl %4, %2 \n\t"
+ "cmpl %4, %2 \n\t"
+ " jae 1b \n\t"
+ : "+r" (from), "+r" (to), "+r" (i)
+ : "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR)
+ : "%eax", "%ebx"
+ );
+
for(; i>0; i--)
{
__asm__ __volatile__ (
@@ -233,16 +342,17 @@ void * fast_memcpy(void * to, const void * from, size_t len)
((const unsigned char *)from)+=64;
((unsigned char *)to)+=64;
}
+
#endif /* Have SSE */
#ifdef HAVE_MMX2
/* since movntq is weakly-ordered, a "sfence"
* is needed to become ordered again. */
__asm__ __volatile__ ("sfence":::"memory");
#endif
-#ifndef HAVE_SSE
+#ifndef HAVE_SSE
/* enables to use FPU */
__asm__ __volatile__ (EMMS:::"memory");
-#endif
+#endif
}
/*
* Now do the tail of the block
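The CONFUSION_FACTOR section in the asm above is compiled out with the default value of 0. When enabled, it does something small and easy to miss in the asm: between blocks it runs CONFUSION_FACTOR extra iterations of four redundant loads from the start of the source block that was just copied; the patch only claims this gives a few percent speedup on out-of-order CPUs. A hypothetical stand-alone C rendering of just that loop (confusion_delay is an invented name, not in the patch):

/* Re-read the same location a few times between blocks,
 * mirroring the CONFUSION_FACTOR loop in the asm. */
static void confusion_delay(const void *from, int factor)
{
    volatile const unsigned int *p = (volatile const unsigned int *)from;
    int n;

    for (n = 0; n < factor; n++) {
        (void)*p; (void)*p; (void)*p; (void)*p;
    }
}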
diff --git a/libvo/aclib_template.c b/libvo/aclib_template.c
index 2366a28d51..d2c51c3157 100644
--- a/libvo/aclib_template.c
+++ b/libvo/aclib_template.c
@@ -1,13 +1,19 @@
#include "../config.h"
#ifdef USE_FASTMEMCPY
-/*
+/*
aclib - advanced C library ;)
This file contains functions which improve and expand standard C-library
*/
#include <stddef.h>
+#define BLOCK_SIZE 4096
+#define CONFUSION_FACTOR 0
+//Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)
+
+//#define STATISTICS
+
#ifndef HAVE_SSE2
/*
P3 processor has only one SSE decoder so can execute only 1 sse insn per
@@ -103,7 +109,7 @@ __asm__ __volatile__(\
#ifdef HAVE_SSE
#define MMREG_SIZE 16
#else
-#define MMREG_SIZE 8
+#define MMREG_SIZE 64 //8
#endif
/* Small defines (for readability only) ;) */
@@ -132,7 +138,20 @@ void * fast_memcpy(void * to, const void * from, size_t len)
{
void *retval;
size_t i;
- retval = to;
+ retval = to;
+#ifdef STATISTICS
+ {
+ static int freq[33];
+ static int t=0;
+ int i;
+ for(i=0; len>(1<<i); i++);
+ freq[i]++;
+ t++;
+ if(1024*1024*1024 % t == 0)
+ for(i=0; i<32; i++)
+ printf("freq < %8d %4d\n", 1<<i, freq[i]);
+ }
+#endif
#ifndef HAVE_MMX1
/* PREFETCH has effect even for MOVSB instruction ;) */
__asm__ __volatile__ (
@@ -184,7 +203,7 @@ void * fast_memcpy(void * to, const void * from, size_t len)
((const unsigned char *)from)+=64;
((unsigned char *)to)+=64;
}
- else
+ else
/*
Only if SRC is aligned on 16-byte boundary.
It allows to use movaps instead of movups, which required data
@@ -207,6 +226,96 @@ void * fast_memcpy(void * to, const void * from, size_t len)
((unsigned char *)to)+=64;
}
#else
+ // Align destination at BLOCK_SIZE boundary
+ for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
+ {
+ __asm__ __volatile__ (
+#ifndef HAVE_MMX1
+ PREFETCH" 320(%0)\n"
+#endif
+ "movq (%0), %%mm0\n"
+ "movq 8(%0), %%mm1\n"
+ "movq 16(%0), %%mm2\n"
+ "movq 24(%0), %%mm3\n"
+ "movq 32(%0), %%mm4\n"
+ "movq 40(%0), %%mm5\n"
+ "movq 48(%0), %%mm6\n"
+ "movq 56(%0), %%mm7\n"
+ MOVNTQ" %%mm0, (%1)\n"
+ MOVNTQ" %%mm1, 8(%1)\n"
+ MOVNTQ" %%mm2, 16(%1)\n"
+ MOVNTQ" %%mm3, 24(%1)\n"
+ MOVNTQ" %%mm4, 32(%1)\n"
+ MOVNTQ" %%mm5, 40(%1)\n"
+ MOVNTQ" %%mm6, 48(%1)\n"
+ MOVNTQ" %%mm7, 56(%1)\n"
+ :: "r" (from), "r" (to) : "memory");
+ ((const unsigned char *)from)+=64;
+ ((unsigned char *)to)+=64;
+ }
+
+// printf(" %d %d\n", (int)from&1023, (int)to&1023);
+ // Pure Assembly cuz gcc is a bit unpredictable ;)
+ if(i>=BLOCK_SIZE/64)
+ asm volatile(
+ "xorl %%eax, %%eax \n\t"
+ ".balign 16 \n\t"
+ "1: \n\t"
+ "movl (%0, %%eax), %%ebx \n\t"
+ "movl 32(%0, %%eax), %%ebx \n\t"
+ "movl 64(%0, %%eax), %%ebx \n\t"
+ "movl 96(%0, %%eax), %%ebx \n\t"
+ "addl $128, %%eax \n\t"
+ "cmpl %3, %%eax \n\t"
+ " jb 1b \n\t"
+
+ "xorl %%eax, %%eax \n\t"
+
+ ".balign 16 \n\t"
+ "2: \n\t"
+ "movq (%0, %%eax), %%mm0\n"
+ "movq 8(%0, %%eax), %%mm1\n"
+ "movq 16(%0, %%eax), %%mm2\n"
+ "movq 24(%0, %%eax), %%mm3\n"
+ "movq 32(%0, %%eax), %%mm4\n"
+ "movq 40(%0, %%eax), %%mm5\n"
+ "movq 48(%0, %%eax), %%mm6\n"
+ "movq 56(%0, %%eax), %%mm7\n"
+ MOVNTQ" %%mm0, (%1, %%eax)\n"
+ MOVNTQ" %%mm1, 8(%1, %%eax)\n"
+ MOVNTQ" %%mm2, 16(%1, %%eax)\n"
+ MOVNTQ" %%mm3, 24(%1, %%eax)\n"
+ MOVNTQ" %%mm4, 32(%1, %%eax)\n"
+ MOVNTQ" %%mm5, 40(%1, %%eax)\n"
+ MOVNTQ" %%mm6, 48(%1, %%eax)\n"
+ MOVNTQ" %%mm7, 56(%1, %%eax)\n"
+ "addl $64, %%eax \n\t"
+ "cmpl %3, %%eax \n\t"
+ "jb 2b \n\t"
+
+#if CONFUSION_FACTOR > 0
+ // a few percent speedup on out of order executing CPUs
+ "movl %5, %%eax \n\t"
+ "2: \n\t"
+ "movl (%0), %%ebx \n\t"
+ "movl (%0), %%ebx \n\t"
+ "movl (%0), %%ebx \n\t"
+ "movl (%0), %%ebx \n\t"
+ "decl %%eax \n\t"
+ " jnz 2b \n\t"
+#endif
+
+ "xorl %%eax, %%eax \n\t"
+ "addl %3, %0 \n\t"
+ "addl %3, %1 \n\t"
+ "subl %4, %2 \n\t"
+ "cmpl %4, %2 \n\t"
+ " jae 1b \n\t"
+ : "+r" (from), "+r" (to), "+r" (i)
+ : "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR)
+ : "%eax", "%ebx"
+ );
+
for(; i>0; i--)
{
__asm__ __volatile__ (
@@ -233,16 +342,17 @@ void * fast_memcpy(void * to, const void * from, size_t len)
((const unsigned char *)from)+=64;
((unsigned char *)to)+=64;
}
+
#endif /* Have SSE */
#ifdef HAVE_MMX2
/* since movntq is weakly-ordered, a "sfence"
* is needed to become ordered again. */
__asm__ __volatile__ ("sfence":::"memory");
#endif
-#ifndef HAVE_SSE
+#ifndef HAVE_SSE
/* enables to use FPU */
__asm__ __volatile__ (EMMS:::"memory");
-#endif
+#endif
}
/*
* Now do the tail of the block
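The commit message asks for someone to check the mem->vram case. As a starting point, a minimal correctness harness might look like the following; this is entirely hypothetical and not part of the commit, and it assumes fast_memcpy() from aclib.c is linked in with USE_FASTMEMCPY enabled at build time.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void *fast_memcpy(void *to, const void *from, size_t len); /* from aclib.c */

int main(void)
{
    size_t len = 8 * 1024 * 1024;       /* larger than any cache of the era */
    unsigned char *src = malloc(len);
    unsigned char *dst = malloc(len);
    size_t i;

    if (!src || !dst)
        return 1;
    for (i = 0; i < len; i++)
        src[i] = (unsigned char)i;

    fast_memcpy(dst, src, len);
    if (memcmp(dst, src, len) != 0) {
        printf("fast_memcpy: MISMATCH\n");
        return 1;
    }
    printf("fast_memcpy: ok\n");
    /* For the mem->vram question one would point dst at a mapped
     * framebuffer instead and time repeated copies against memcpy(). */
    free(src);
    free(dst);
    return 0;
}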