2 files changed, 5 insertions, 193 deletions
diff --git a/video/decode/dxva2.c b/video/decode/dxva2.c
index 5e06f505ac..0f7542817d 100644
--- a/video/decode/dxva2.c
+++ b/video/decode/dxva2.c
@@ -37,7 +37,6 @@
 #include "video/mp_image_pool.h"
 #include "video/hwdec.h"
 #include "video/d3d.h"
-#include "gpu_memcpy_sse4.h"
 
 // A minor evil.
 #ifndef FF_DXVA2_WORKAROUND_INTEL_CLEARVIDEO
@@ -98,9 +97,6 @@ typedef struct surface_info {
 typedef struct DXVA2Context {
     struct mp_log *log;
 
-    void (*copy_nv12)(struct mp_image *dest, uint8_t *src_bits,
-                      unsigned src_pitch, unsigned surf_height);
-
     HMODULE d3dlib;
     HMODULE dxva2lib;
 
@@ -243,8 +239,8 @@ static struct mp_image *dxva2_allocate_image(struct lavc_ctx *s,
     return mp_image_new_custom_ref(&mpi, w, dxva2_release_img);
 }
 
-static void copy_nv12_fallback(struct mp_image *dest, uint8_t *src_bits,
-                               unsigned src_pitch, unsigned surf_height)
+static void copy_nv12(struct mp_image *dest, uint8_t *src_bits,
+                      unsigned src_pitch, unsigned surf_height)
 {
     struct mp_image buf = {0};
     mp_image_setfmt(&buf, IMGFMT_NV12);
@@ -254,49 +250,9 @@ static void copy_nv12_fallback(struct mp_image *dest, uint8_t *src_bits,
     buf.stride[0] = src_pitch;
     buf.planes[1] = src_bits + src_pitch * surf_height;
     buf.stride[1] = src_pitch;
-    mp_image_copy(dest, &buf);
+    mp_image_copy_gpu(dest, &buf);
 }
 
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-
-static void copy_nv12_gpu_sse4(struct mp_image *dest, uint8_t *src_bits,
-                               unsigned src_pitch, unsigned surf_height)
-{
-    const int lines = dest->h;
-    const int stride_y = dest->stride[0];
-    const int stride_uv = dest->stride[1];
-
-    // If the strides match, the image can be copied in one go
-    if (stride_y == src_pitch && stride_uv == src_pitch) {
-        const size_t size = lines * src_pitch;
-        gpu_memcpy(dest->planes[0], src_bits, size);
-        gpu_memcpy(dest->planes[1], src_bits + src_pitch * surf_height, size / 2);
-
-    } else {
-        // Copy the Y plane line-by-line
-        uint8_t *dest_y = dest->planes[0];
-        const uint8_t *src_y = src_bits;
-        const int bytes_per_line = dest->w;
-        for (int i = 0; i < lines; i++) {
-            gpu_memcpy(dest_y, src_y, bytes_per_line);
-            dest_y += stride_y;
-            src_y += src_pitch;
-        }
-
-        // Copy the UV plane line-by-line
-        uint8_t *dest_uv = dest->planes[1];
-        const uint8_t *src_uv = src_bits + src_pitch * surf_height;
-        for (int i = 0; i < lines / 2; i++) {
-            gpu_memcpy(dest_uv, src_uv, bytes_per_line);
-            dest_uv += stride_uv;
-            src_uv += src_pitch;
-        }
-    }
-}
-
-#pragma GCC pop_options
-
 static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
                                              struct mp_image *img)
 {
@@ -324,7 +280,7 @@ static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
         return img;
     }
 
-    ctx->copy_nv12(sw_img, LockedRect.pBits, LockedRect.Pitch, surfaceDesc.Height);
+    copy_nv12(sw_img, LockedRect.pBits, LockedRect.Pitch, surfaceDesc.Height);
     mp_image_set_size(sw_img, img->w, img->h);
     mp_image_copy_attributes(sw_img, img);
 
@@ -408,15 +364,7 @@ static int dxva2_init(struct lavc_ctx *s)
     ctx->log = mp_log_new(s, s->log, "dxva2");
     ctx->sw_pool = talloc_steal(ctx, mp_image_pool_new(17));
 
-    if (av_get_cpu_flags() & AV_CPU_FLAG_SSE4) {
-        // Use a memcpy implementation optimised for copying from GPU memory
-        MP_DBG(ctx, "Using SSE4 memcpy\n");
-        ctx->copy_nv12 = copy_nv12_gpu_sse4;
-    } else {
-        // Use the CRT memcpy. This can be slower than software decoding.
-        MP_WARN(ctx, "Using fallback memcpy (slow)\n");
-        ctx->copy_nv12 = copy_nv12_fallback;
-    }
+    mp_check_gpu_memcpy(ctx->log, NULL);
 
     ctx->deviceHandle = INVALID_HANDLE_VALUE;
 
diff --git a/video/decode/gpu_memcpy_sse4.h b/video/decode/gpu_memcpy_sse4.h
deleted file mode 100644
index 160209bdc5..0000000000
--- a/video/decode/gpu_memcpy_sse4.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- *      Copyright (C) 2011-2014 Hendrik Leppkes
- *      http://www.1f0.de
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- *  Taken from the QuickSync decoder by Eric Gur
- */
-
-#ifndef GPU_MEMCPY_SSE4_H_
-#define GPU_MEMCPY_SSE4_H_
-
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-#include <smmintrin.h>
-
-// gpu_memcpy is a memcpy style function that copied data very fast from a
-// GPU tiled memory (write back)
-// Performance tip: page offset (12 lsb) of both addresses should be different
-//  optimally use a 2K offset between them.
-static inline void *gpu_memcpy(void *restrict d, const void *restrict s, size_t size)
-{
-    static const size_t regsInLoop = sizeof(size_t) * 2; // 8 or 16
-
-    if (d == NULL || s == NULL) return NULL;
-
-    // If memory is not aligned, use memcpy
-    bool isAligned = (((size_t)(s) | (size_t)(d)) & 0xF) == 0;
-    if (!isAligned)
-    {
-        return memcpy(d, s, size);
-    }
-
-    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-#ifdef __x86_64__
-    __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-#endif
-
-    size_t reminder = size & (regsInLoop * sizeof(xmm0) - 1); // Copy 128 or 256 bytes every loop
-    size_t end = 0;
-
-    __m128i* pTrg = (__m128i*)d;
-    __m128i* pTrgEnd = pTrg + ((size - reminder) >> 4);
-    __m128i* pSrc = (__m128i*)s;
-
-    // Make sure source is synced - doesn't hurt if not needed.
-    _mm_sfence();
-
-    while (pTrg < pTrgEnd)
-    {
-        // _mm_stream_load_si128 emits the Streaming SIMD Extensions 4 (SSE4.1) instruction MOVNTDQA
-        // Fastest method for copying GPU RAM. Available since Penryn (45nm Core 2 Duo/Quad)
-        xmm0  = _mm_stream_load_si128(pSrc);
-        xmm1  = _mm_stream_load_si128(pSrc + 1);
-        xmm2  = _mm_stream_load_si128(pSrc + 2);
-        xmm3  = _mm_stream_load_si128(pSrc + 3);
-        xmm4  = _mm_stream_load_si128(pSrc + 4);
-        xmm5  = _mm_stream_load_si128(pSrc + 5);
-        xmm6  = _mm_stream_load_si128(pSrc + 6);
-        xmm7  = _mm_stream_load_si128(pSrc + 7);
-#ifdef __x86_64__ // Use all 16 xmm registers
-        xmm8  = _mm_stream_load_si128(pSrc + 8);
-        xmm9  = _mm_stream_load_si128(pSrc + 9);
-        xmm10 = _mm_stream_load_si128(pSrc + 10);
-        xmm11 = _mm_stream_load_si128(pSrc + 11);
-        xmm12 = _mm_stream_load_si128(pSrc + 12);
-        xmm13 = _mm_stream_load_si128(pSrc + 13);
-        xmm14 = _mm_stream_load_si128(pSrc + 14);
-        xmm15 = _mm_stream_load_si128(pSrc + 15);
-#endif
-        pSrc += regsInLoop;
-        // _mm_store_si128 emit the SSE2 intruction MOVDQA (aligned store)
-        _mm_store_si128(pTrg     , xmm0);
-        _mm_store_si128(pTrg +  1, xmm1);
-        _mm_store_si128(pTrg +  2, xmm2);
-        _mm_store_si128(pTrg +  3, xmm3);
-        _mm_store_si128(pTrg +  4, xmm4);
-        _mm_store_si128(pTrg +  5, xmm5);
-        _mm_store_si128(pTrg +  6, xmm6);
-        _mm_store_si128(pTrg +  7, xmm7);
-#ifdef __x86_64__ // Use all 16 xmm registers
-        _mm_store_si128(pTrg +  8, xmm8);
-        _mm_store_si128(pTrg +  9, xmm9);
-        _mm_store_si128(pTrg + 10, xmm10);
-        _mm_store_si128(pTrg + 11, xmm11);
-        _mm_store_si128(pTrg + 12, xmm12);
-        _mm_store_si128(pTrg + 13, xmm13);
-        _mm_store_si128(pTrg + 14, xmm14);
-        _mm_store_si128(pTrg + 15, xmm15);
-#endif
-        pTrg += regsInLoop;
-    }
-
-    // Copy in 16 byte steps
-    if (reminder >= 16)
-    {
-        size = reminder;
-        reminder = size & 15;
-        end = size >> 4;
-        for (size_t i = 0; i < end; ++i)
-        {
-            pTrg[i] = _mm_stream_load_si128(pSrc + i);
-        }
-    }
-
-    // Copy last bytes - shouldn't happen as strides are modulu 16
-    if (reminder)
-    {
-        __m128i temp = _mm_stream_load_si128(pSrc + end);
-
-        char* ps = (char*)(&temp);
-        char* pt = (char*)(pTrg + end);
-
-        for (size_t i = 0; i < reminder; ++i)
-        {
-            pt[i] = ps[i];
-        }
-    }
-
-    return d;
-}
-
-#pragma GCC pop_options
-#endif