1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
|
/*
* Copyright (C) 2011-2014 Hendrik Leppkes
* http://www.1f0.de
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Taken from the QuickSync decoder by Eric Gur
*/
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <emmintrin.h>
#include <smmintrin.h>
// gpu_memcpy is a memcpy-style function that copies data very fast from
// GPU tiled memory (write back).
// Performance tip: the page offsets (12 LSBs) of the two addresses should
// differ; optimally keep a 2K offset between them.
//
// Parameters:
//   d    - destination buffer, at least `size` bytes.
//   s    - source buffer, at least `size` bytes; must not overlap `d`
//          (both pointers are restrict-qualified).
//   size - number of bytes to copy.
//
// Returns `d`, or NULL if `d` or `s` is NULL (nothing is copied then).
// If either pointer is not 16-byte aligned the copy is delegated to memcpy.
__attribute__((target("sse4"))) static inline void
*gpu_memcpy(void *restrict d, const void *restrict s, size_t size)
{
    // Number of 16-byte registers moved per main-loop iteration. It has to
    // equal the number of load/store pairs compiled in below, so it is keyed
    // on the same macro rather than on sizeof(size_t): the two can disagree
    // (e.g. the x32 ABI, where __x86_64__ is defined but size_t is 4 bytes),
    // which would make the loop copy more than it advances and overrun.
#ifdef __x86_64__
    static const size_t regsInLoop = 16;
#else
    static const size_t regsInLoop = 8;
#endif

    if (d == NULL || s == NULL) {
        return NULL;
    }

    // The aligned load/store intrinsics below require 16-byte alignment on
    // both sides; otherwise fall back to plain memcpy.
    if ((((uintptr_t)s | (uintptr_t)d) & 0xF) != 0) {
        return memcpy(d, s, size);
    }

    // Bytes left over after the unrolled loop, which moves 128 or 256 bytes
    // per iteration (regsInLoop * 16 is a power of two, hence the mask).
    size_t reminder = size & (regsInLoop * sizeof(__m128i) - 1);

    __m128i *pTrg = (__m128i *)d;
    __m128i *pTrgEnd = pTrg + ((size - reminder) >> 4);
    // const is cast away only because some compilers declare
    // _mm_stream_load_si128 with a non-const pointer; the source is only read.
    __m128i *pSrc = (__m128i *)s;

    // Make sure source is synced - doesn't hurt if not needed.
    _mm_sfence();

    while (pTrg < pTrgEnd) {
        // _mm_stream_load_si128 emits the SSE4.1 instruction MOVNTDQA.
        // Fastest method for copying GPU RAM. Available since Penryn
        // (45nm Core 2 Duo/Quad). All loads are issued before any store.
        const __m128i xmm0 = _mm_stream_load_si128(pSrc);
        const __m128i xmm1 = _mm_stream_load_si128(pSrc + 1);
        const __m128i xmm2 = _mm_stream_load_si128(pSrc + 2);
        const __m128i xmm3 = _mm_stream_load_si128(pSrc + 3);
        const __m128i xmm4 = _mm_stream_load_si128(pSrc + 4);
        const __m128i xmm5 = _mm_stream_load_si128(pSrc + 5);
        const __m128i xmm6 = _mm_stream_load_si128(pSrc + 6);
        const __m128i xmm7 = _mm_stream_load_si128(pSrc + 7);
#ifdef __x86_64__ // Use all 16 xmm registers
        const __m128i xmm8 = _mm_stream_load_si128(pSrc + 8);
        const __m128i xmm9 = _mm_stream_load_si128(pSrc + 9);
        const __m128i xmm10 = _mm_stream_load_si128(pSrc + 10);
        const __m128i xmm11 = _mm_stream_load_si128(pSrc + 11);
        const __m128i xmm12 = _mm_stream_load_si128(pSrc + 12);
        const __m128i xmm13 = _mm_stream_load_si128(pSrc + 13);
        const __m128i xmm14 = _mm_stream_load_si128(pSrc + 14);
        const __m128i xmm15 = _mm_stream_load_si128(pSrc + 15);
#endif
        pSrc += regsInLoop;

        // _mm_store_si128 emits the SSE2 instruction MOVDQA (aligned store).
        _mm_store_si128(pTrg, xmm0);
        _mm_store_si128(pTrg + 1, xmm1);
        _mm_store_si128(pTrg + 2, xmm2);
        _mm_store_si128(pTrg + 3, xmm3);
        _mm_store_si128(pTrg + 4, xmm4);
        _mm_store_si128(pTrg + 5, xmm5);
        _mm_store_si128(pTrg + 6, xmm6);
        _mm_store_si128(pTrg + 7, xmm7);
#ifdef __x86_64__ // Use all 16 xmm registers
        _mm_store_si128(pTrg + 8, xmm8);
        _mm_store_si128(pTrg + 9, xmm9);
        _mm_store_si128(pTrg + 10, xmm10);
        _mm_store_si128(pTrg + 11, xmm11);
        _mm_store_si128(pTrg + 12, xmm12);
        _mm_store_si128(pTrg + 13, xmm13);
        _mm_store_si128(pTrg + 14, xmm14);
        _mm_store_si128(pTrg + 15, xmm15);
#endif
        pTrg += regsInLoop;
    }

    // Copy the remaining whole 16-byte chunks one register at a time.
    const size_t end = reminder >> 4;
    for (size_t i = 0; i < end; ++i) {
        _mm_store_si128(pTrg + i, _mm_stream_load_si128(pSrc + i));
    }

    // Copy last bytes - shouldn't happen as strides are modulo 16.
    // Only the bytes the caller asked for are read: a full 16-byte load here
    // would touch up to 15 bytes beyond the end of the source buffer.
    reminder &= 15;
    if (reminder) {
        memcpy(pTrg + end, pSrc + end, reminder);
    }

    return d;
}
|