video: refactor GPU memcpy usage

Make the GPU memcpy from the dxva2 code generally useful to other parts of the player. We need to check at configure time whether SSE intrinsics work at all. (At least in this form, they won't work on clang, for example. It also won't work on non-x86.) Introduce a mp_image_copy_gpu(), and make the dxva2 code use it. Do some awkward stuff to share the existing code used by mp_image_copy(). I'm hoping that FFmpeg will sooner or later provide a function like this, so we can remove most of this again. (There is a patch, but it's stuck in limbo since forever.) All this is used by the following commit.
author: wm4 <wm4@nowhere> 2015-09-25 18:58:17 +0200
committer: wm4 <wm4@nowhere> 2015-09-25 19:18:16 +0200
commit: 0ae8aebb89b5d0b2226a5d3852a9c72cd52da2ff (patch)
tree: 4f9517ae03b50d21b534a8cfcdcbbdcdd8d326b5 /video/decode/dxva2.c
parent: 361040f9d912140832192af78808218d601c7465 (diff)
download: mpv-0ae8aebb89b5d0b2226a5d3852a9c72cd52da2ff.tar.bz2
mpv-0ae8aebb89b5d0b2226a5d3852a9c72cd52da2ff.tar.xz
1 files changed, 5 insertions, 57 deletions
diff --git a/video/decode/dxva2.c b/video/decode/dxva2.c
index 5e06f505ac..0f7542817d 100644
--- a/video/decode/dxva2.c
+++ b/video/decode/dxva2.c
@@ -37,7 +37,6 @@
 #include "video/mp_image_pool.h"
 #include "video/hwdec.h"
 #include "video/d3d.h"
-#include "gpu_memcpy_sse4.h"
 
 // A minor evil.
 #ifndef FF_DXVA2_WORKAROUND_INTEL_CLEARVIDEO
@@ -98,9 +97,6 @@ typedef struct surface_info {
 typedef struct DXVA2Context {
     struct mp_log *log;
 
-    void (*copy_nv12)(struct mp_image *dest, uint8_t *src_bits,
-                      unsigned src_pitch, unsigned surf_height);
-
     HMODULE d3dlib;
     HMODULE dxva2lib;
 
@@ -243,8 +239,8 @@ static struct mp_image *dxva2_allocate_image(struct lavc_ctx *s,
     return mp_image_new_custom_ref(&mpi, w, dxva2_release_img);
 }
 
-static void copy_nv12_fallback(struct mp_image *dest, uint8_t *src_bits,
-                               unsigned src_pitch, unsigned surf_height)
+static void copy_nv12(struct mp_image *dest, uint8_t *src_bits,
+                      unsigned src_pitch, unsigned surf_height)
 {
     struct mp_image buf = {0};
     mp_image_setfmt(&buf, IMGFMT_NV12);
@@ -254,49 +250,9 @@ static void copy_nv12_fallback(struct mp_image *dest, uint8_t *src_bits,
     buf.stride[0] = src_pitch;
     buf.planes[1] = src_bits + src_pitch * surf_height;
     buf.stride[1] = src_pitch;
-    mp_image_copy(dest, &buf);
+    mp_image_copy_gpu(dest, &buf);
 }
 
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-
-static void copy_nv12_gpu_sse4(struct mp_image *dest, uint8_t *src_bits,
-                               unsigned src_pitch, unsigned surf_height)
-{
-    const int lines = dest->h;
-    const int stride_y = dest->stride[0];
-    const int stride_uv = dest->stride[1];
-
-    // If the strides match, the image can be copied in one go
-    if (stride_y == src_pitch && stride_uv == src_pitch) {
-        const size_t size = lines * src_pitch;
-        gpu_memcpy(dest->planes[0], src_bits, size);
-        gpu_memcpy(dest->planes[1], src_bits + src_pitch * surf_height, size / 2);
-
-    } else {
-        // Copy the Y plane line-by-line
-        uint8_t *dest_y = dest->planes[0];
-        const uint8_t *src_y = src_bits;
-        const int bytes_per_line = dest->w;
-        for (int i = 0; i < lines; i++) {
-            gpu_memcpy(dest_y, src_y, bytes_per_line);
-            dest_y += stride_y;
-            src_y += src_pitch;
-        }
-
-        // Copy the UV plane line-by-line
-        uint8_t *dest_uv = dest->planes[1];
-        const uint8_t *src_uv = src_bits + src_pitch * surf_height;
-        for (int i = 0; i < lines / 2; i++) {
-            gpu_memcpy(dest_uv, src_uv, bytes_per_line);
-            dest_uv += stride_uv;
-            src_uv += src_pitch;
-        }
-    }
-}
-
-#pragma GCC pop_options
-
 static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
                                              struct mp_image *img)
 {
@@ -324,7 +280,7 @@ static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
         return img;
     }
 
-    ctx->copy_nv12(sw_img, LockedRect.pBits, LockedRect.Pitch, surfaceDesc.Height);
+    copy_nv12(sw_img, LockedRect.pBits, LockedRect.Pitch, surfaceDesc.Height);
     mp_image_set_size(sw_img, img->w, img->h);
     mp_image_copy_attributes(sw_img, img);
 
@@ -408,15 +364,7 @@ static int dxva2_init(struct lavc_ctx *s)
     ctx->log = mp_log_new(s, s->log, "dxva2");
     ctx->sw_pool = talloc_steal(ctx, mp_image_pool_new(17));
 
-    if (av_get_cpu_flags() & AV_CPU_FLAG_SSE4) {
-        // Use a memcpy implementation optimised for copying from GPU memory
-        MP_DBG(ctx, "Using SSE4 memcpy\n");
-        ctx->copy_nv12 = copy_nv12_gpu_sse4;
-    } else {
-        // Use the CRT memcpy. This can be slower than software decoding.
-        MP_WARN(ctx, "Using fallback memcpy (slow)\n");
-        ctx->copy_nv12 = copy_nv12_fallback;
-    }
+    mp_check_gpu_memcpy(ctx->log, NULL);
 
     ctx->deviceHandle = INVALID_HANDLE_VALUE;
author	wm4 <wm4@nowhere>	2015-09-25 18:58:17 +0200
committer	wm4 <wm4@nowhere>	2015-09-25 19:18:16 +0200
commit	0ae8aebb89b5d0b2226a5d3852a9c72cd52da2ff (patch)
tree	4f9517ae03b50d21b534a8cfcdcbbdcdd8d326b5 /video/decode/dxva2.c
parent	361040f9d912140832192af78808218d601c7465 (diff)
download	mpv-0ae8aebb89b5d0b2226a5d3852a9c72cd52da2ff.tar.bz2 mpv-0ae8aebb89b5d0b2226a5d3852a9c72cd52da2ff.tar.xz