From 7a4350e4c14add138b66f2c0c43b2796cf01094c Mon Sep 17 00:00:00 2001
From: michael
Date: Tue, 12 Feb 2002 23:17:14 +0000
Subject: mem2agpcpy()

git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@4682 b3059339-0415-0410-9bf9-f77b7e298cf2
---
 libvo/aclib.c          | 32 ++++++++++++++++++-
 libvo/aclib_template.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++
 libvo/fastmemcpy.h     | 10 ++++--
 3 files changed, 124 insertions(+), 3 deletions(-)

diff --git a/libvo/aclib.c b/libvo/aclib.c
index a2931739ea..f569f58460 100644
--- a/libvo/aclib.c
+++ b/libvo/aclib.c
@@ -118,4 +118,34 @@ inline void * fast_memcpy(void * to, const void * from, size_t len)
 #endif //!RUNTIME_CPUDETECT
 }
 
-#endif /* use fastmemcpy */
\ No newline at end of file
+inline void * mem2agpcpy(void * to, const void * from, size_t len)
+{
+#ifdef RUNTIME_CPUDETECT
+#ifdef CAN_COMPILE_X86_ASM
+	// ordered per speed, fastest first
+	if(gCpuCaps.hasMMX2)
+		mem2agpcpy_MMX2(to, from, len);
+	else if(gCpuCaps.has3DNow)
+		mem2agpcpy_3DNow(to, from, len);
+	else if(gCpuCaps.hasMMX)
+		mem2agpcpy_MMX(to, from, len);
+	else
+#endif //CAN_COMPILE_X86_ASM
+		memcpy(to, from, len); // prior to mmx we use the standard memcpy
+#else
+#ifdef HAVE_MMX2
+	mem2agpcpy_MMX2(to, from, len);
+#elif defined (HAVE_3DNOW)
+	mem2agpcpy_3DNow(to, from, len);
+#elif defined (HAVE_MMX)
+	mem2agpcpy_MMX(to, from, len);
+#else
+	memcpy(to, from, len); // prior to mmx we use the standard memcpy
+#endif
+
+#endif //!RUNTIME_CPUDETECT
+}
+
+
+#endif /* use fastmemcpy */
+
diff --git a/libvo/aclib_template.c b/libvo/aclib_template.c
index 9e444c4593..702b8aaa9a 100644
--- a/libvo/aclib_template.c
+++ b/libvo/aclib_template.c
@@ -353,3 +353,88 @@ static inline void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
 	if(len) small_memcpy(to, from, len);
 	return retval;
 }
+
+/**
+ * special copy routine for mem -> agp/pci copy (based upon fast_memcpy)
+ */
+static inline void * RENAME(mem2agpcpy)(void * to, const void * from, size_t len)
+{
+	void *retval;
+	size_t i;
+	retval = to;
+#ifdef STATISTICS
+	{
+		static int freq[33];
+		static int t=0;
+		for(i=0; len>(1<<i); i++);
+		freq[i]++;
+		t++;
+		if(1024*1024*1024 % t == 0)
+			for(i=0; i<32; i++)
+				printf("freq < %8d %4d\n", 1<<i, freq[i]);
+	}
+#endif
+	if(len >= MIN_LEN)
+	{
+		register unsigned long int delta;
+		/* Align destination to MMREG_SIZE -boundary */
+		delta = ((unsigned long int)to)&7;
+		if(delta)
+		{
+			delta=8-delta;
+			len -= delta;
+			small_memcpy(to, from, delta);
+		}
+		i = len >> 6; /* len/64 */
+		len &= 63;
+		/*
+		   This algorithm is most effective when the code consecutively
+		   reads and writes blocks which have the size of a cache line.
+		   The size of a cache line is processor-dependent.
+		   It will, however, be a minimum of 32 bytes on any processor.
+		   It would be better to have the number of read/write instructions
+		   be a multiple of the number of the processor's
+		   decoders, but that's not always possible.
+		*/
+		for(; i>0; i--)
+		{
+			__asm__ __volatile__ (
+			PREFETCH" 320(%0)\n"
+			"movq (%0), %%mm0\n"
+			"movq 8(%0), %%mm1\n"
+			"movq 16(%0), %%mm2\n"
+			"movq 24(%0), %%mm3\n"
+			"movq 32(%0), %%mm4\n"
+			"movq 40(%0), %%mm5\n"
+			"movq 48(%0), %%mm6\n"
+			"movq 56(%0), %%mm7\n"
+			MOVNTQ" %%mm0, (%1)\n"
+			MOVNTQ" %%mm1, 8(%1)\n"
+			MOVNTQ" %%mm2, 16(%1)\n"
+			MOVNTQ" %%mm3, 24(%1)\n"
+			MOVNTQ" %%mm4, 32(%1)\n"
+			MOVNTQ" %%mm5, 40(%1)\n"
+			MOVNTQ" %%mm6, 48(%1)\n"
+			MOVNTQ" %%mm7, 56(%1)\n"
+			:: "r" (from), "r" (to) : "memory");
+			((const unsigned char *)from)+=64;
+			((unsigned char *)to)+=64;
+		}
+#ifdef HAVE_MMX2
+		/* since movntq is weakly-ordered, a "sfence"
+		 * is needed to become ordered again.
+		 */
+		__asm__ __volatile__ ("sfence":::"memory");
+#endif
+#ifndef HAVE_SSE
+		/* enables FPU use */
+		__asm__ __volatile__ (EMMS:::"memory");
+#endif
+	}
+	/*
+	 * Now do the tail of the block
+	 */
+	if(len) small_memcpy(to, from, len);
+	return retval;
+}
+
diff --git a/libvo/fastmemcpy.h b/libvo/fastmemcpy.h
index cff2846bc8..aee1e786cd 100644
--- a/libvo/fastmemcpy.h
+++ b/libvo/fastmemcpy.h
@@ -9,8 +9,14 @@
 #include <stddef.h>
 extern void * fast_memcpy(void * to, const void * from, size_t len);
+extern void * mem2agpcpy(void * to, const void * from, size_t len);
 #define memcpy(a,b,c) fast_memcpy(a,b,c)
 
-#endif /* HAVE_MMX/MMX2/3DNOW/SSE/SSE2 */
-#endif /* USE_FASTMEMCPY */
+#else /* HAVE_MMX/MMX2/3DNOW/SSE/SSE2 */
+#define mem2agpcpy(a,b,c) memcpy(a,b,c)
+#endif
+
+#else /* USE_FASTMEMCPY */
+#define mem2agpcpy(a,b,c) memcpy(a,b,c)
+#endif
 
 #endif
-- 
cgit v1.2.3