5 files changed, 86 insertions, 82 deletions
diff --git a/video/decode/dxva2.c b/video/decode/dxva2.c
index 5e06f505ac..0f7542817d 100644
--- a/video/decode/dxva2.c
+++ b/video/decode/dxva2.c
@@ -37,7 +37,6 @@
 #include "video/mp_image_pool.h"
 #include "video/hwdec.h"
 #include "video/d3d.h"
-#include "gpu_memcpy_sse4.h"
 
 // A minor evil.
 #ifndef FF_DXVA2_WORKAROUND_INTEL_CLEARVIDEO
@@ -98,9 +97,6 @@ typedef struct surface_info {
 typedef struct DXVA2Context {
     struct mp_log *log;
 
-    void (*copy_nv12)(struct mp_image *dest, uint8_t *src_bits,
-                      unsigned src_pitch, unsigned surf_height);
-
     HMODULE d3dlib;
     HMODULE dxva2lib;
 
@@ -243,8 +239,8 @@ static struct mp_image *dxva2_allocate_image(struct lavc_ctx *s,
     return mp_image_new_custom_ref(&mpi, w, dxva2_release_img);
 }
 
-static void copy_nv12_fallback(struct mp_image *dest, uint8_t *src_bits,
-                               unsigned src_pitch, unsigned surf_height)
+static void copy_nv12(struct mp_image *dest, uint8_t *src_bits,
+                      unsigned src_pitch, unsigned surf_height)
 {
     struct mp_image buf = {0};
     mp_image_setfmt(&buf, IMGFMT_NV12);
@@ -254,49 +250,9 @@ static void copy_nv12_fallback(struct mp_image *dest, uint8_t *src_bits,
     buf.stride[0] = src_pitch;
     buf.planes[1] = src_bits + src_pitch * surf_height;
     buf.stride[1] = src_pitch;
-    mp_image_copy(dest, &buf);
+    mp_image_copy_gpu(dest, &buf);
 }
 
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-
-static void copy_nv12_gpu_sse4(struct mp_image *dest, uint8_t *src_bits,
-                               unsigned src_pitch, unsigned surf_height)
-{
-    const int lines = dest->h;
-    const int stride_y = dest->stride[0];
-    const int stride_uv = dest->stride[1];
-
-    // If the strides match, the image can be copied in one go
-    if (stride_y == src_pitch && stride_uv == src_pitch) {
-        const size_t size = lines * src_pitch;
-        gpu_memcpy(dest->planes[0], src_bits, size);
-        gpu_memcpy(dest->planes[1], src_bits + src_pitch * surf_height, size / 2);
-
-    } else {
-        // Copy the Y plane line-by-line
-        uint8_t *dest_y = dest->planes[0];
-        const uint8_t *src_y = src_bits;
-        const int bytes_per_line = dest->w;
-        for (int i = 0; i < lines; i++) {
-            gpu_memcpy(dest_y, src_y, bytes_per_line);
-            dest_y += stride_y;
-            src_y += src_pitch;
-        }
-
-        // Copy the UV plane line-by-line
-        uint8_t *dest_uv = dest->planes[1];
-        const uint8_t *src_uv = src_bits + src_pitch * surf_height;
-        for (int i = 0; i < lines / 2; i++) {
-            gpu_memcpy(dest_uv, src_uv, bytes_per_line);
-            dest_uv += stride_uv;
-            src_uv += src_pitch;
-        }
-    }
-}
-
-#pragma GCC pop_options
-
 static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
                                              struct mp_image *img)
 {
@@ -324,7 +280,7 @@ static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
         return img;
     }
 
-    ctx->copy_nv12(sw_img, LockedRect.pBits, LockedRect.Pitch, surfaceDesc.Height);
+    copy_nv12(sw_img, LockedRect.pBits, LockedRect.Pitch, surfaceDesc.Height);
     mp_image_set_size(sw_img, img->w, img->h);
     mp_image_copy_attributes(sw_img, img);
 
@@ -408,15 +364,7 @@ static int dxva2_init(struct lavc_ctx *s)
     ctx->log = mp_log_new(s, s->log, "dxva2");
     ctx->sw_pool = talloc_steal(ctx, mp_image_pool_new(17));
 
-    if (av_get_cpu_flags() & AV_CPU_FLAG_SSE4) {
-        // Use a memcpy implementation optimised for copying from GPU memory
-        MP_DBG(ctx, "Using SSE4 memcpy\n");
-        ctx->copy_nv12 = copy_nv12_gpu_sse4;
-    } else {
-        // Use the CRT memcpy. This can be slower than software decoding.
-        MP_WARN(ctx, "Using fallback memcpy (slow)\n");
-        ctx->copy_nv12 = copy_nv12_fallback;
-    }
+    mp_check_gpu_memcpy(ctx->log, NULL);
 
     ctx->deviceHandle = INVALID_HANDLE_VALUE;
 
diff --git a/video/decode/gpu_memcpy_sse4.h b/video/gpu_memcpy.c
index 160209bdc5..355da0e2a2 100644
--- a/video/decode/gpu_memcpy_sse4.h
+++ b/video/gpu_memcpy.c
@@ -19,18 +19,20 @@
  *  Taken from the QuickSync decoder by Eric Gur
  */
 
-#ifndef GPU_MEMCPY_SSE4_H_
-#define GPU_MEMCPY_SSE4_H_
-
 #pragma GCC push_options
 #pragma GCC target("sse4.1")
 #include <smmintrin.h>
 
+#include <stdbool.h>
+#include <string.h>
+
+#include "gpu_memcpy.h"
+
 // gpu_memcpy is a memcpy style function that copied data very fast from a
 // GPU tiled memory (write back)
 // Performance tip: page offset (12 lsb) of both addresses should be different
 //  optimally use a 2K offset between them.
-static inline void *gpu_memcpy(void *restrict d, const void *restrict s, size_t size)
+void *gpu_memcpy(void *restrict d, const void *restrict s, size_t size)
 {
     static const size_t regsInLoop = sizeof(size_t) * 2; // 8 or 16
 
@@ -131,6 +133,3 @@ static inline void *gpu_memcpy(void *restrict d, const void *restrict s, size_t
 
     return d;
 }
-
-#pragma GCC pop_options
-#endif
diff --git a/video/gpu_memcpy.h b/video/gpu_memcpy.h
new file mode 100644
index 0000000000..c62f754aac
--- /dev/null
+++ b/video/gpu_memcpy.h
@@ -0,0 +1,8 @@
+#ifndef GPU_MEMCPY_H_
+#define GPU_MEMCPY_H_
+
+#include <stddef.h>
+
+// memcpy-style copy implemented in gpu_memcpy.c, which is built for SSE4.1.
+void *gpu_memcpy(void *restrict d, const void *restrict s, size_t size);
+#endif
diff --git a/video/mp_image.c b/video/mp_image.c
index debdbbb201..57650eea0d 100644
--- a/video/mp_image.c
+++ b/video/mp_image.c
@@ -35,6 +35,7 @@
 #include "mp_image.h"
 #include "sws_utils.h"
 #include "fmt-conversion.h"
+#include "gpu_memcpy.h"
 
 #include "video/filter/vf.h"
 
@@ -300,7 +301,30 @@ void mp_image_unrefp(struct mp_image **p_img)
     *p_img = NULL;
 }
 
-void mp_image_copy(struct mp_image *dst, struct mp_image *src)
+typedef void *(*memcpy_fn)(void *d, const void *s, size_t size);
+// Copy height rows of bytesPerLine bytes from src to dst; cpy moves the bytes.
+static void memcpy_pic_cb(void *dst, const void *src, int bytesPerLine, int height,
+                          int dstStride, int srcStride, memcpy_fn cpy)
+{
+    if (bytesPerLine == dstStride && dstStride == srcStride && height) {
+        if (srcStride < 0) {
+            src = (uint8_t*)src + (height - 1) * srcStride;
+            dst = (uint8_t*)dst + (height - 1) * dstStride;
+            srcStride = -srcStride;
+        }
+        // Line size equals both strides, so one call covers all the rows.
+        cpy(dst, src, srcStride * (height - 1) + bytesPerLine);
+    } else {
+        for (int i = 0; i < height; i++) {
+            cpy(dst, src, bytesPerLine);
+            src = (uint8_t*)src + srcStride;
+            dst = (uint8_t*)dst + dstStride;
+        }
+    }
+}
+
+static void mp_image_copy_cb(struct mp_image *dst, struct mp_image *src,
+                             memcpy_fn cpy)
 {
     assert(dst->imgfmt == src->imgfmt);
     assert(dst->w == src->w && dst->h == src->h);
@@ -308,14 +332,50 @@ void mp_image_copy(struct mp_image *dst, struct mp_image *src)
     for (int n = 0; n < dst->num_planes; n++) {
         int line_bytes = (mp_image_plane_w(dst, n) * dst->fmt.bpp[n] + 7) / 8;
         int plane_h = mp_image_plane_h(dst, n);
-        memcpy_pic(dst->planes[n], src->planes[n], line_bytes, plane_h,
-                   dst->stride[n], src->stride[n]);
+        memcpy_pic_cb(dst->planes[n], src->planes[n], line_bytes, plane_h,
+                      dst->stride[n], src->stride[n], cpy);
     }
     // Watch out for AV_PIX_FMT_FLAG_PSEUDOPAL retardation
     if ((dst->fmt.flags & MP_IMGFLAG_PAL) && dst->planes[1] && src->planes[1])
         memcpy(dst->planes[1], src->planes[1], MP_PALETTE_SIZE);
 }
 
+// Copy the image data of src into dst using plain memcpy.
+void mp_image_copy(struct mp_image *dst, struct mp_image *src)
+{
+    mp_image_copy_cb(dst, src, memcpy);
+}
+
+// Same copy, but through gpu_memcpy when SSE4 is compiled in and detected.
+void mp_image_copy_gpu(struct mp_image *dst, struct mp_image *src)
+{
+    memcpy_fn cpy = memcpy;
+#if HAVE_SSE4_INTRINSICS
+    cpy = (av_get_cpu_flags() & AV_CPU_FLAG_SSE4) ? gpu_memcpy : memcpy;
+#endif
+    mp_image_copy_cb(dst, src, cpy);
+}
+
+// Report via log whether mp_image_copy_gpu() will use the SSE4 copy or memcpy.
+// With non-NULL once: return silently if *once is set, else set it and log.
+void mp_check_gpu_memcpy(struct mp_log *log, bool *once)
+{
+    if (once && *once)
+        return;
+    if (once)
+        *once = true;
+#if HAVE_SSE4_INTRINSICS
+    bool sse4_usable = av_get_cpu_flags() & AV_CPU_FLAG_SSE4;
+#else
+    bool sse4_usable = false;
+#endif
+    if (sse4_usable) {
+        mp_verbose(log, "Using SSE4 memcpy\n");
+    } else {
+        mp_warn(log, "Using fallback memcpy (slow)\n");
+    }
+}
+
 void mp_image_copy_attributes(struct mp_image *dst, struct mp_image *src)
 {
     dst->pict_type = src->pict_type;
@@ -675,21 +735,7 @@ struct AVFrame *mp_image_to_av_frame_and_unref(struct mp_image *img)
 void memcpy_pic(void *dst, const void *src, int bytesPerLine, int height,
                 int dstStride, int srcStride)
 {
-    if (bytesPerLine == dstStride && dstStride == srcStride && height) {
-        if (srcStride < 0) {
-            src = (uint8_t*)src + (height - 1) * srcStride;
-            dst = (uint8_t*)dst + (height - 1) * dstStride;
-            srcStride = -srcStride;
-        }
-
-        memcpy(dst, src, srcStride * (height - 1) + bytesPerLine);
-    } else {
-        for (int i = 0; i < height; i++) {
-            memcpy(dst, src, bytesPerLine);
-            src = (uint8_t*)src + srcStride;
-            dst = (uint8_t*)dst + dstStride;
-        }
-    }
+    memcpy_pic_cb(dst, src, bytesPerLine, height, dstStride, srcStride, memcpy);
 }
 
 void memset_pic(void *dst, int fill, int bytesPerLine, int height, int stride)
diff --git a/video/mp_image.h b/video/mp_image.h
index f5759205f4..25eb42c050 100644
--- a/video/mp_image.h
+++ b/video/mp_image.h
@@ -106,6 +106,7 @@ int mp_chroma_div_up(int size, int shift);
 
 struct mp_image *mp_image_alloc(int fmt, int w, int h);
 void mp_image_copy(struct mp_image *dmpi, struct mp_image *mpi);
+void mp_image_copy_gpu(struct mp_image *dst, struct mp_image *src);
 void mp_image_copy_attributes(struct mp_image *dmpi, struct mp_image *mpi);
 struct mp_image *mp_image_new_copy(struct mp_image *img);
 struct mp_image *mp_image_new_ref(struct mp_image *img);
@@ -159,4 +160,6 @@ void memcpy_pic(void *dst, const void *src, int bytesPerLine, int height,
 void memset_pic(void *dst, int fill, int bytesPerLine, int height, int stride);
 void memset16_pic(void *dst, int fill, int unitsPerLine, int height, int stride);
 
+void mp_check_gpu_memcpy(struct mp_log *log, bool *once);
+
 #endif /* MPLAYER_MP_IMAGE_H */