diff options
Diffstat (limited to 'video/decode')
-rw-r--r-- | video/decode/dxva2.c | 62 | ||||
-rw-r--r-- | video/decode/gpu_memcpy_sse4.h | 136 |
2 files changed, 5 insertions, 193 deletions
diff --git a/video/decode/dxva2.c b/video/decode/dxva2.c index 5e06f505ac..0f7542817d 100644 --- a/video/decode/dxva2.c +++ b/video/decode/dxva2.c @@ -37,7 +37,6 @@ #include "video/mp_image_pool.h" #include "video/hwdec.h" #include "video/d3d.h" -#include "gpu_memcpy_sse4.h" // A minor evil. #ifndef FF_DXVA2_WORKAROUND_INTEL_CLEARVIDEO @@ -98,9 +97,6 @@ typedef struct surface_info { typedef struct DXVA2Context { struct mp_log *log; - void (*copy_nv12)(struct mp_image *dest, uint8_t *src_bits, - unsigned src_pitch, unsigned surf_height); - HMODULE d3dlib; HMODULE dxva2lib; @@ -243,8 +239,8 @@ static struct mp_image *dxva2_allocate_image(struct lavc_ctx *s, return mp_image_new_custom_ref(&mpi, w, dxva2_release_img); } -static void copy_nv12_fallback(struct mp_image *dest, uint8_t *src_bits, - unsigned src_pitch, unsigned surf_height) +static void copy_nv12(struct mp_image *dest, uint8_t *src_bits, + unsigned src_pitch, unsigned surf_height) { struct mp_image buf = {0}; mp_image_setfmt(&buf, IMGFMT_NV12); @@ -254,49 +250,9 @@ static void copy_nv12_fallback(struct mp_image *dest, uint8_t *src_bits, buf.stride[0] = src_pitch; buf.planes[1] = src_bits + src_pitch * surf_height; buf.stride[1] = src_pitch; - mp_image_copy(dest, &buf); + mp_image_copy_gpu(dest, &buf); } -#pragma GCC push_options -#pragma GCC target("sse4.1") - -static void copy_nv12_gpu_sse4(struct mp_image *dest, uint8_t *src_bits, - unsigned src_pitch, unsigned surf_height) -{ - const int lines = dest->h; - const int stride_y = dest->stride[0]; - const int stride_uv = dest->stride[1]; - - // If the strides match, the image can be copied in one go - if (stride_y == src_pitch && stride_uv == src_pitch) { - const size_t size = lines * src_pitch; - gpu_memcpy(dest->planes[0], src_bits, size); - gpu_memcpy(dest->planes[1], src_bits + src_pitch * surf_height, size / 2); - - } else { - // Copy the Y plane line-by-line - uint8_t *dest_y = dest->planes[0]; - const uint8_t *src_y = src_bits; - const int bytes_per_line = dest->w; - for (int i = 0; i < lines; i++) { - gpu_memcpy(dest_y, src_y, bytes_per_line); - dest_y += stride_y; - src_y += src_pitch; - } - - // Copy the UV plane line-by-line - uint8_t *dest_uv = dest->planes[1]; - const uint8_t *src_uv = src_bits + src_pitch * surf_height; - for (int i = 0; i < lines / 2; i++) { - gpu_memcpy(dest_uv, src_uv, bytes_per_line); - dest_uv += stride_uv; - src_uv += src_pitch; - } - } -} - -#pragma GCC pop_options - static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s, struct mp_image *img) { @@ -324,7 +280,7 @@ static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s, return img; } - ctx->copy_nv12(sw_img, LockedRect.pBits, LockedRect.Pitch, surfaceDesc.Height); + copy_nv12(sw_img, LockedRect.pBits, LockedRect.Pitch, surfaceDesc.Height); mp_image_set_size(sw_img, img->w, img->h); mp_image_copy_attributes(sw_img, img); @@ -408,15 +364,7 @@ static int dxva2_init(struct lavc_ctx *s) ctx->log = mp_log_new(s, s->log, "dxva2"); ctx->sw_pool = talloc_steal(ctx, mp_image_pool_new(17)); - if (av_get_cpu_flags() & AV_CPU_FLAG_SSE4) { - // Use a memcpy implementation optimised for copying from GPU memory - MP_DBG(ctx, "Using SSE4 memcpy\n"); - ctx->copy_nv12 = copy_nv12_gpu_sse4; - } else { - // Use the CRT memcpy. This can be slower than software decoding. - MP_WARN(ctx, "Using fallback memcpy (slow)\n"); - ctx->copy_nv12 = copy_nv12_fallback; - } + mp_check_gpu_memcpy(ctx->log, NULL); ctx->deviceHandle = INVALID_HANDLE_VALUE; diff --git a/video/decode/gpu_memcpy_sse4.h b/video/decode/gpu_memcpy_sse4.h deleted file mode 100644 index 160209bdc5..0000000000 --- a/video/decode/gpu_memcpy_sse4.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (C) 2011-2014 Hendrik Leppkes - * http://www.1f0.de - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Taken from the QuickSync decoder by Eric Gur - */ - -#ifndef GPU_MEMCPY_SSE4_H_ -#define GPU_MEMCPY_SSE4_H_ - -#pragma GCC push_options -#pragma GCC target("sse4.1") -#include <smmintrin.h> - -// gpu_memcpy is a memcpy style function that copied data very fast from a -// GPU tiled memory (write back) -// Performance tip: page offset (12 lsb) of both addresses should be different -// optimally use a 2K offset between them. -static inline void *gpu_memcpy(void *restrict d, const void *restrict s, size_t size) -{ - static const size_t regsInLoop = sizeof(size_t) * 2; // 8 or 16 - - if (d == NULL || s == NULL) return NULL; - - // If memory is not aligned, use memcpy - bool isAligned = (((size_t)(s) | (size_t)(d)) & 0xF) == 0; - if (!isAligned) - { - return memcpy(d, s, size); - } - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; -#ifdef __x86_64__ - __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; -#endif - - size_t reminder = size & (regsInLoop * sizeof(xmm0) - 1); // Copy 128 or 256 bytes every loop - size_t end = 0; - - __m128i* pTrg = (__m128i*)d; - __m128i* pTrgEnd = pTrg + ((size - reminder) >> 4); - __m128i* pSrc = (__m128i*)s; - - // Make sure source is synced - doesn't hurt if not needed. - _mm_sfence(); - - while (pTrg < pTrgEnd) - { - // _mm_stream_load_si128 emits the Streaming SIMD Extensions 4 (SSE4.1) instruction MOVNTDQA - // Fastest method for copying GPU RAM. Available since Penryn (45nm Core 2 Duo/Quad) - xmm0 = _mm_stream_load_si128(pSrc); - xmm1 = _mm_stream_load_si128(pSrc + 1); - xmm2 = _mm_stream_load_si128(pSrc + 2); - xmm3 = _mm_stream_load_si128(pSrc + 3); - xmm4 = _mm_stream_load_si128(pSrc + 4); - xmm5 = _mm_stream_load_si128(pSrc + 5); - xmm6 = _mm_stream_load_si128(pSrc + 6); - xmm7 = _mm_stream_load_si128(pSrc + 7); -#ifdef __x86_64__ // Use all 16 xmm registers - xmm8 = _mm_stream_load_si128(pSrc + 8); - xmm9 = _mm_stream_load_si128(pSrc + 9); - xmm10 = _mm_stream_load_si128(pSrc + 10); - xmm11 = _mm_stream_load_si128(pSrc + 11); - xmm12 = _mm_stream_load_si128(pSrc + 12); - xmm13 = _mm_stream_load_si128(pSrc + 13); - xmm14 = _mm_stream_load_si128(pSrc + 14); - xmm15 = _mm_stream_load_si128(pSrc + 15); -#endif - pSrc += regsInLoop; - // _mm_store_si128 emit the SSE2 intruction MOVDQA (aligned store) - _mm_store_si128(pTrg , xmm0); - _mm_store_si128(pTrg + 1, xmm1); - _mm_store_si128(pTrg + 2, xmm2); - _mm_store_si128(pTrg + 3, xmm3); - _mm_store_si128(pTrg + 4, xmm4); - _mm_store_si128(pTrg + 5, xmm5); - _mm_store_si128(pTrg + 6, xmm6); - _mm_store_si128(pTrg + 7, xmm7); -#ifdef __x86_64__ // Use all 16 xmm registers - _mm_store_si128(pTrg + 8, xmm8); - _mm_store_si128(pTrg + 9, xmm9); - _mm_store_si128(pTrg + 10, xmm10); - _mm_store_si128(pTrg + 11, xmm11); - _mm_store_si128(pTrg + 12, xmm12); - _mm_store_si128(pTrg + 13, xmm13); - _mm_store_si128(pTrg + 14, xmm14); - _mm_store_si128(pTrg + 15, xmm15); -#endif - pTrg += regsInLoop; - } - - // Copy in 16 byte steps - if (reminder >= 16) - { - size = reminder; - reminder = size & 15; - end = size >> 4; - for (size_t i = 0; i < end; ++i) - { - pTrg[i] = _mm_stream_load_si128(pSrc + i); - } - } - - // Copy last bytes - shouldn't happen as strides are modulu 16 - if (reminder) - { - __m128i temp = _mm_stream_load_si128(pSrc + end); - - char* ps = (char*)(&temp); - char* pt = (char*)(pTrg + end); - - for (size_t i = 0; i < reminder; ++i) - { - pt[i] = ps[i]; - } - } - - return d; -} - -#pragma GCC pop_options -#endif |