From 0ae8aebb89b5d0b2226a5d3852a9c72cd52da2ff Mon Sep 17 00:00:00 2001
From: wm4
Date: Fri, 25 Sep 2015 18:58:17 +0200
Subject: video: refactor GPU memcpy usage

Make the GPU memcpy from the dxva2 code generally useful to other
parts of the player.

We need to check at configure time whether SSE intrinsics work at all.
(At least in this form, they won't work on clang, for example. They
also won't work on non-x86.)

Introduce an mp_image_copy_gpu(), and make the dxva2 code use it. Do
some awkward stuff to share the existing code used by mp_image_copy().
I'm hoping that FFmpeg will sooner or later provide a function like
this, so we can remove most of this again. (There is a patch, but it's
stuck in limbo since forever.)

All this is used by the following commit.
---
 TOOLS/old-configure            |   1 +
 TOOLS/old-makefile             |   1 +
 video/decode/dxva2.c           |  62 ++-----------------
 video/decode/gpu_memcpy_sse4.h | 136 -----------------------------------------
 video/gpu_memcpy.c             | 135 ++++++++++++++++++++++++++++++++++++++++
 video/gpu_memcpy.h             |   8 +++
 video/mp_image.c               |  82 +++++++++++++++++++------
 video/mp_image.h               |   3 +
 waftools/fragments/sse.c       |  18 ++++++
 wscript                        |   5 ++
 wscript_build.py               |   1 +
 11 files changed, 241 insertions(+), 211 deletions(-)
 delete mode 100644 video/decode/gpu_memcpy_sse4.h
 create mode 100644 video/gpu_memcpy.c
 create mode 100644 video/gpu_memcpy.h
 create mode 100644 waftools/fragments/sse.c

diff --git a/TOOLS/old-configure b/TOOLS/old-configure
index f68f95f3aa..41c9b1c9bd 100755
--- a/TOOLS/old-configure
+++ b/TOOLS/old-configure
@@ -964,6 +964,7 @@ cat > $TMPC << EOF
 #define HAVE_VIDEOTOOLBOX_HWACCEL 0
 #define HAVE_VIDEOTOOLBOX_GL 0
 #define HAVE_VIDEOTOOLBOX_VDA_GL 0
+#define HAVE_SSE4_INTRINSICS 1
 
 #ifdef __OpenBSD__
 #define DEFAULT_CDROM_DEVICE "/dev/rcd0c"
diff --git a/TOOLS/old-makefile b/TOOLS/old-makefile
index 1ed6bc4b3f..bb15f96630 100644
--- a/TOOLS/old-makefile
+++ b/TOOLS/old-makefile
@@ -234,6 +234,7 @@ SOURCES = audio/audio.c \
           ta/ta_talloc.c \
           video/csputils.c \
           video/fmt-conversion.c \
+          video/gpu_memcpy.c \
           video/image_writer.c \
           video/img_format.c \
           video/mp_image.c \
diff --git a/video/decode/dxva2.c b/video/decode/dxva2.c
index 5e06f505ac..0f7542817d 100644
--- a/video/decode/dxva2.c
+++ b/video/decode/dxva2.c
@@ -37,7 +37,6 @@
 #include "video/mp_image_pool.h"
 #include "video/hwdec.h"
 #include "video/d3d.h"
-#include "gpu_memcpy_sse4.h" // A minor evil.
 
 #ifndef FF_DXVA2_WORKAROUND_INTEL_CLEARVIDEO
 
@@ -98,9 +97,6 @@ typedef struct surface_info {
 typedef struct DXVA2Context {
     struct mp_log *log;
 
-    void (*copy_nv12)(struct mp_image *dest, uint8_t *src_bits,
-                      unsigned src_pitch, unsigned surf_height);
-
     HMODULE d3dlib;
     HMODULE dxva2lib;
 
@@ -243,8 +239,8 @@ static struct mp_image *dxva2_allocate_image(struct lavc_ctx *s,
     return mp_image_new_custom_ref(&mpi, w, dxva2_release_img);
 }
 
-static void copy_nv12_fallback(struct mp_image *dest, uint8_t *src_bits,
-                               unsigned src_pitch, unsigned surf_height)
+static void copy_nv12(struct mp_image *dest, uint8_t *src_bits,
+                      unsigned src_pitch, unsigned surf_height)
 {
     struct mp_image buf = {0};
     mp_image_setfmt(&buf, IMGFMT_NV12);
@@ -254,49 +250,9 @@ static void copy_nv12_fallback(struct mp_image *dest, uint8_t *src_bits,
     buf.stride[0] = src_pitch;
     buf.planes[1] = src_bits + src_pitch * surf_height;
     buf.stride[1] = src_pitch;
-    mp_image_copy(dest, &buf);
+    mp_image_copy_gpu(dest, &buf);
 }
 
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-
-static void copy_nv12_gpu_sse4(struct mp_image *dest, uint8_t *src_bits,
-                               unsigned src_pitch, unsigned surf_height)
-{
-    const int lines = dest->h;
-    const int stride_y = dest->stride[0];
-    const int stride_uv = dest->stride[1];
-
-    // If the strides match, the image can be copied in one go
-    if (stride_y == src_pitch && stride_uv == src_pitch) {
-        const size_t size = lines * src_pitch;
-        gpu_memcpy(dest->planes[0], src_bits, size);
-        gpu_memcpy(dest->planes[1], src_bits + src_pitch * surf_height, size / 2);
-
-    } else {
-        // Copy the Y plane line-by-line
-        uint8_t *dest_y = dest->planes[0];
-        const uint8_t *src_y = src_bits;
-        const int bytes_per_line = dest->w;
-        for (int i = 0; i < lines; i++) {
-            gpu_memcpy(dest_y, src_y, bytes_per_line);
-            dest_y += stride_y;
-            src_y += src_pitch;
-        }
-
-        // Copy the UV plane line-by-line
-        uint8_t *dest_uv = dest->planes[1];
-        const uint8_t *src_uv = src_bits + src_pitch * surf_height;
-        for (int i = 0; i < lines / 2; i++) {
-            gpu_memcpy(dest_uv, src_uv, bytes_per_line);
-            dest_uv += stride_uv;
-            src_uv += src_pitch;
-        }
-    }
-}
-
-#pragma GCC pop_options
-
 static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
                                              struct mp_image *img)
 {
@@ -324,7 +280,7 @@ static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
         return img;
     }
 
-    ctx->copy_nv12(sw_img, LockedRect.pBits, LockedRect.Pitch, surfaceDesc.Height);
+    copy_nv12(sw_img, LockedRect.pBits, LockedRect.Pitch, surfaceDesc.Height);
     mp_image_set_size(sw_img, img->w, img->h);
     mp_image_copy_attributes(sw_img, img);
 
@@ -408,15 +364,7 @@ static int dxva2_init(struct lavc_ctx *s)
     ctx->log = mp_log_new(s, s->log, "dxva2");
     ctx->sw_pool = talloc_steal(ctx, mp_image_pool_new(17));
 
-    if (av_get_cpu_flags() & AV_CPU_FLAG_SSE4) {
-        // Use a memcpy implementation optimised for copying from GPU memory
-        MP_DBG(ctx, "Using SSE4 memcpy\n");
-        ctx->copy_nv12 = copy_nv12_gpu_sse4;
-    } else {
-        // Use the CRT memcpy. This can be slower than software decoding.
-        MP_WARN(ctx, "Using fallback memcpy (slow)\n");
-        ctx->copy_nv12 = copy_nv12_fallback;
-    }
+    mp_check_gpu_memcpy(ctx->log, NULL);
 
     ctx->deviceHandle = INVALID_HANDLE_VALUE;
 
diff --git a/video/decode/gpu_memcpy_sse4.h b/video/decode/gpu_memcpy_sse4.h
deleted file mode 100644
index 160209bdc5..0000000000
--- a/video/decode/gpu_memcpy_sse4.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (C) 2011-2014 Hendrik Leppkes
- * http://www.1f0.de
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Taken from the QuickSync decoder by Eric Gur
- */
-
-#ifndef GPU_MEMCPY_SSE4_H_
-#define GPU_MEMCPY_SSE4_H_
-
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-#include <smmintrin.h>
-
-// gpu_memcpy is a memcpy style function that copied data very fast from a
-// GPU tiled memory (write back)
-// Performance tip: page offset (12 lsb) of both addresses should be different
-// optimally use a 2K offset between them.
-static inline void *gpu_memcpy(void *restrict d, const void *restrict s, size_t size)
-{
-    static const size_t regsInLoop = sizeof(size_t) * 2; // 8 or 16
-
-    if (d == NULL || s == NULL) return NULL;
-
-    // If memory is not aligned, use memcpy
-    bool isAligned = (((size_t)(s) | (size_t)(d)) & 0xF) == 0;
-    if (!isAligned)
-    {
-        return memcpy(d, s, size);
-    }
-
-    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-#ifdef __x86_64__
-    __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-#endif
-
-    size_t reminder = size & (regsInLoop * sizeof(xmm0) - 1); // Copy 128 or 256 bytes every loop
-    size_t end = 0;
-
-    __m128i* pTrg = (__m128i*)d;
-    __m128i* pTrgEnd = pTrg + ((size - reminder) >> 4);
-    __m128i* pSrc = (__m128i*)s;
-
-    // Make sure source is synced - doesn't hurt if not needed.
-    _mm_sfence();
-
-    while (pTrg < pTrgEnd)
-    {
-        // _mm_stream_load_si128 emits the Streaming SIMD Extensions 4 (SSE4.1) instruction MOVNTDQA
-        // Fastest method for copying GPU RAM. Available since Penryn (45nm Core 2 Duo/Quad)
-        xmm0 = _mm_stream_load_si128(pSrc);
-        xmm1 = _mm_stream_load_si128(pSrc + 1);
-        xmm2 = _mm_stream_load_si128(pSrc + 2);
-        xmm3 = _mm_stream_load_si128(pSrc + 3);
-        xmm4 = _mm_stream_load_si128(pSrc + 4);
-        xmm5 = _mm_stream_load_si128(pSrc + 5);
-        xmm6 = _mm_stream_load_si128(pSrc + 6);
-        xmm7 = _mm_stream_load_si128(pSrc + 7);
-#ifdef __x86_64__ // Use all 16 xmm registers
-        xmm8 = _mm_stream_load_si128(pSrc + 8);
-        xmm9 = _mm_stream_load_si128(pSrc + 9);
-        xmm10 = _mm_stream_load_si128(pSrc + 10);
-        xmm11 = _mm_stream_load_si128(pSrc + 11);
-        xmm12 = _mm_stream_load_si128(pSrc + 12);
-        xmm13 = _mm_stream_load_si128(pSrc + 13);
-        xmm14 = _mm_stream_load_si128(pSrc + 14);
-        xmm15 = _mm_stream_load_si128(pSrc + 15);
-#endif
-        pSrc += regsInLoop;
-        // _mm_store_si128 emit the SSE2 intruction MOVDQA (aligned store)
-        _mm_store_si128(pTrg    , xmm0);
-        _mm_store_si128(pTrg + 1, xmm1);
-        _mm_store_si128(pTrg + 2, xmm2);
-        _mm_store_si128(pTrg + 3, xmm3);
-        _mm_store_si128(pTrg + 4, xmm4);
-        _mm_store_si128(pTrg + 5, xmm5);
-        _mm_store_si128(pTrg + 6, xmm6);
-        _mm_store_si128(pTrg + 7, xmm7);
-#ifdef __x86_64__ // Use all 16 xmm registers
-        _mm_store_si128(pTrg + 8, xmm8);
-        _mm_store_si128(pTrg + 9, xmm9);
-        _mm_store_si128(pTrg + 10, xmm10);
-        _mm_store_si128(pTrg + 11, xmm11);
-        _mm_store_si128(pTrg + 12, xmm12);
-        _mm_store_si128(pTrg + 13, xmm13);
-        _mm_store_si128(pTrg + 14, xmm14);
-        _mm_store_si128(pTrg + 15, xmm15);
-#endif
-        pTrg += regsInLoop;
-    }
-
-    // Copy in 16 byte steps
-    if (reminder >= 16)
-    {
-        size = reminder;
-        reminder = size & 15;
-        end = size >> 4;
-        for (size_t i = 0; i < end; ++i)
-        {
-            pTrg[i] = _mm_stream_load_si128(pSrc + i);
-        }
-    }
-
-    // Copy last bytes - shouldn't happen as strides are modulu 16
-    if (reminder)
-    {
-        __m128i temp = _mm_stream_load_si128(pSrc + end);
-
-        char* ps = (char*)(&temp);
-        char* pt = (char*)(pTrg + end);
-
-        for (size_t i = 0; i < reminder; ++i)
-        {
-            pt[i] = ps[i];
-        }
-    }
-
-    return d;
-}
-
-#pragma GCC pop_options
-#endif
diff --git a/video/gpu_memcpy.c b/video/gpu_memcpy.c
new file mode 100644
index 0000000000..355da0e2a2
--- /dev/null
+++ b/video/gpu_memcpy.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (C) 2011-2014 Hendrik Leppkes
+ * http://www.1f0.de
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Taken from the QuickSync decoder by Eric Gur
+ */
+
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#include <smmintrin.h>
+
+#include <stdbool.h>
+#include <string.h>
+
+#include "gpu_memcpy.h"
+
+// gpu_memcpy is a memcpy style function that copies data very fast from a
+// GPU tiled memory (write back)
+// Performance tip: page offset (12 lsb) of both addresses should be different;
+// optimally use a 2K offset between them.
+void *gpu_memcpy(void *restrict d, const void *restrict s, size_t size)
+{
+    static const size_t regsInLoop = sizeof(size_t) * 2; // 8 or 16
+
+    if (d == NULL || s == NULL) return NULL;
+
+    // If memory is not aligned, use memcpy
+    bool isAligned = (((size_t)(s) | (size_t)(d)) & 0xF) == 0;
+    if (!isAligned)
+    {
+        return memcpy(d, s, size);
+    }
+
+    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+#ifdef __x86_64__
+    __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+#endif
+
+    size_t reminder = size & (regsInLoop * sizeof(xmm0) - 1); // Copy 128 or 256 bytes every loop
+    size_t end = 0;
+
+    __m128i* pTrg = (__m128i*)d;
+    __m128i* pTrgEnd = pTrg + ((size - reminder) >> 4);
+    __m128i* pSrc = (__m128i*)s;
+
+    // Make sure source is synced - doesn't hurt if not needed.
+    _mm_sfence();
+
+    while (pTrg < pTrgEnd)
+    {
+        // _mm_stream_load_si128 emits the Streaming SIMD Extensions 4 (SSE4.1) instruction MOVNTDQA
+        // Fastest method for copying GPU RAM. Available since Penryn (45nm Core 2 Duo/Quad)
+        xmm0 = _mm_stream_load_si128(pSrc);
+        xmm1 = _mm_stream_load_si128(pSrc + 1);
+        xmm2 = _mm_stream_load_si128(pSrc + 2);
+        xmm3 = _mm_stream_load_si128(pSrc + 3);
+        xmm4 = _mm_stream_load_si128(pSrc + 4);
+        xmm5 = _mm_stream_load_si128(pSrc + 5);
+        xmm6 = _mm_stream_load_si128(pSrc + 6);
+        xmm7 = _mm_stream_load_si128(pSrc + 7);
+#ifdef __x86_64__ // Use all 16 xmm registers
+        xmm8 = _mm_stream_load_si128(pSrc + 8);
+        xmm9 = _mm_stream_load_si128(pSrc + 9);
+        xmm10 = _mm_stream_load_si128(pSrc + 10);
+        xmm11 = _mm_stream_load_si128(pSrc + 11);
+        xmm12 = _mm_stream_load_si128(pSrc + 12);
+        xmm13 = _mm_stream_load_si128(pSrc + 13);
+        xmm14 = _mm_stream_load_si128(pSrc + 14);
+        xmm15 = _mm_stream_load_si128(pSrc + 15);
+#endif
+        pSrc += regsInLoop;
+        // _mm_store_si128 emits the SSE2 instruction MOVDQA (aligned store)
+        _mm_store_si128(pTrg    , xmm0);
+        _mm_store_si128(pTrg + 1, xmm1);
+        _mm_store_si128(pTrg + 2, xmm2);
+        _mm_store_si128(pTrg + 3, xmm3);
+        _mm_store_si128(pTrg + 4, xmm4);
+        _mm_store_si128(pTrg + 5, xmm5);
+        _mm_store_si128(pTrg + 6, xmm6);
+        _mm_store_si128(pTrg + 7, xmm7);
+#ifdef __x86_64__ // Use all 16 xmm registers
+        _mm_store_si128(pTrg + 8, xmm8);
+        _mm_store_si128(pTrg + 9, xmm9);
+        _mm_store_si128(pTrg + 10, xmm10);
+        _mm_store_si128(pTrg + 11, xmm11);
+        _mm_store_si128(pTrg + 12, xmm12);
+        _mm_store_si128(pTrg + 13, xmm13);
+        _mm_store_si128(pTrg + 14, xmm14);
+        _mm_store_si128(pTrg + 15, xmm15);
+#endif
+        pTrg += regsInLoop;
+    }
+
+    // Copy in 16 byte steps
+    if (reminder >= 16)
+    {
+        size = reminder;
+        reminder = size & 15;
+        end = size >> 4;
+        for (size_t i = 0; i < end; ++i)
+        {
+            pTrg[i] = _mm_stream_load_si128(pSrc + i);
+        }
+    }
+
+    // Copy last bytes - shouldn't happen as strides are modulo 16
+    if (reminder)
+    {
+        __m128i temp = _mm_stream_load_si128(pSrc + end);
+
+        char* ps = (char*)(&temp);
+        char* pt = (char*)(pTrg + end);
+
+        for (size_t i = 0; i < reminder; ++i)
+        {
+            pt[i] = ps[i];
+        }
+    }
+
+    return d;
+}
diff --git a/video/gpu_memcpy.h b/video/gpu_memcpy.h
new file mode 100644
index 0000000000..c62f754aac
--- /dev/null
+++ b/video/gpu_memcpy.h
@@ -0,0 +1,8 @@
+#ifndef GPU_MEMCPY_SSE4_H_
+#define GPU_MEMCPY_SSE4_H_
+
+#include <stddef.h>
+
+void *gpu_memcpy(void *restrict d, const void *restrict s, size_t size);
+
+#endif
diff --git a/video/mp_image.c b/video/mp_image.c
index debdbbb201..57650eea0d 100644
--- a/video/mp_image.c
+++ b/video/mp_image.c
@@ -35,6 +35,7 @@
 #include "mp_image.h"
 #include "sws_utils.h"
 #include "fmt-conversion.h"
+#include "gpu_memcpy.h"
"gpu_memcpy.h" #include "video/filter/vf.h" @@ -300,7 +301,30 @@ void mp_image_unrefp(struct mp_image **p_img) *p_img = NULL; } -void mp_image_copy(struct mp_image *dst, struct mp_image *src) +typedef void *(*memcpy_fn)(void *d, const void *s, size_t size); + +static void memcpy_pic_cb(void *dst, const void *src, int bytesPerLine, int height, + int dstStride, int srcStride, memcpy_fn cpy) +{ + if (bytesPerLine == dstStride && dstStride == srcStride && height) { + if (srcStride < 0) { + src = (uint8_t*)src + (height - 1) * srcStride; + dst = (uint8_t*)dst + (height - 1) * dstStride; + srcStride = -srcStride; + } + + cpy(dst, src, srcStride * (height - 1) + bytesPerLine); + } else { + for (int i = 0; i < height; i++) { + cpy(dst, src, bytesPerLine); + src = (uint8_t*)src + srcStride; + dst = (uint8_t*)dst + dstStride; + } + } +} + +static void mp_image_copy_cb(struct mp_image *dst, struct mp_image *src, + memcpy_fn cpy) { assert(dst->imgfmt == src->imgfmt); assert(dst->w == src->w && dst->h == src->h); @@ -308,14 +332,50 @@ void mp_image_copy(struct mp_image *dst, struct mp_image *src) for (int n = 0; n < dst->num_planes; n++) { int line_bytes = (mp_image_plane_w(dst, n) * dst->fmt.bpp[n] + 7) / 8; int plane_h = mp_image_plane_h(dst, n); - memcpy_pic(dst->planes[n], src->planes[n], line_bytes, plane_h, - dst->stride[n], src->stride[n]); + memcpy_pic_cb(dst->planes[n], src->planes[n], line_bytes, plane_h, + dst->stride[n], src->stride[n], cpy); } // Watch out for AV_PIX_FMT_FLAG_PSEUDOPAL retardation if ((dst->fmt.flags & MP_IMGFLAG_PAL) && dst->planes[1] && src->planes[1]) memcpy(dst->planes[1], src->planes[1], MP_PALETTE_SIZE); } +void mp_image_copy(struct mp_image *dst, struct mp_image *src) +{ + mp_image_copy_cb(dst, src, memcpy); +} + +void mp_image_copy_gpu(struct mp_image *dst, struct mp_image *src) +{ +#if HAVE_SSE4_INTRINSICS + if (av_get_cpu_flags() & AV_CPU_FLAG_SSE4) { + mp_image_copy_cb(dst, src, gpu_memcpy); + return; + } +#endif + mp_image_copy(dst, src); +} + +// Helper, only for outputting some log info. 
+void mp_check_gpu_memcpy(struct mp_log *log, bool *once)
+{
+    if (once) {
+        if (*once)
+            return;
+        *once = true;
+    }
+
+    bool have_sse = false;
+#if HAVE_SSE4_INTRINSICS
+    have_sse = av_get_cpu_flags() & AV_CPU_FLAG_SSE4;
+#endif
+    if (have_sse) {
+        mp_verbose(log, "Using SSE4 memcpy\n");
+    } else {
+        mp_warn(log, "Using fallback memcpy (slow)\n");
+    }
+}
+
 void mp_image_copy_attributes(struct mp_image *dst, struct mp_image *src)
 {
     dst->pict_type = src->pict_type;
@@ -675,21 +735,7 @@ struct AVFrame *mp_image_to_av_frame_and_unref(struct mp_image *img)
 void memcpy_pic(void *dst, const void *src, int bytesPerLine, int height,
                 int dstStride, int srcStride)
 {
-    if (bytesPerLine == dstStride && dstStride == srcStride && height) {
-        if (srcStride < 0) {
-            src = (uint8_t*)src + (height - 1) * srcStride;
-            dst = (uint8_t*)dst + (height - 1) * dstStride;
-            srcStride = -srcStride;
-        }
-
-        memcpy(dst, src, srcStride * (height - 1) + bytesPerLine);
-    } else {
-        for (int i = 0; i < height; i++) {
-            memcpy(dst, src, bytesPerLine);
-            src = (uint8_t*)src + srcStride;
-            dst = (uint8_t*)dst + dstStride;
-        }
-    }
+    memcpy_pic_cb(dst, src, bytesPerLine, height, dstStride, srcStride, memcpy);
 }
 
 void memset_pic(void *dst, int fill, int bytesPerLine, int height, int stride)
diff --git a/video/mp_image.h b/video/mp_image.h
index f5759205f4..25eb42c050 100644
--- a/video/mp_image.h
+++ b/video/mp_image.h
@@ -106,6 +106,7 @@ int mp_chroma_div_up(int size, int shift);
 
 struct mp_image *mp_image_alloc(int fmt, int w, int h);
 void mp_image_copy(struct mp_image *dmpi, struct mp_image *mpi);
+void mp_image_copy_gpu(struct mp_image *dst, struct mp_image *src);
 void mp_image_copy_attributes(struct mp_image *dmpi, struct mp_image *mpi);
 struct mp_image *mp_image_new_copy(struct mp_image *img);
 struct mp_image *mp_image_new_ref(struct mp_image *img);
@@ -159,4 +160,6 @@ void memcpy_pic(void *dst, const void *src, int bytesPerLine, int height,
 void memset_pic(void *dst, int fill, int bytesPerLine, int height, int stride);
 void memset16_pic(void *dst, int fill, int unitsPerLine, int height, int stride);
 
+void mp_check_gpu_memcpy(struct mp_log *log, bool *once);
+
 #endif /* MPLAYER_MP_IMAGE_H */
diff --git a/waftools/fragments/sse.c b/waftools/fragments/sse.c
new file mode 100644
index 0000000000..e9689cda17
--- /dev/null
+++ b/waftools/fragments/sse.c
@@ -0,0 +1,18 @@
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#include <smmintrin.h>
+
+void *a_ptr;
+
+int main(void)
+{
+    __m128i xmm0;
+    __m128i* p = (__m128i*)a_ptr;
+
+    _mm_sfence();
+
+    xmm0 = _mm_stream_load_si128(p + 1);
+    _mm_store_si128(p + 2, xmm0);
+
+    return 0;
+}
diff --git a/wscript b/wscript
index 8093ea396c..63c74686fa 100644
--- a/wscript
+++ b/wscript
@@ -769,6 +769,11 @@ hwaccel_features = [
         'desc': 'libavcodec DXVA2 hwaccel',
         'deps': [ 'win32' ],
         'func': check_headers('libavcodec/dxva2.h', use='libav'),
+    }, {
+        'name': 'sse4-intrinsics',
+        'desc': 'GCC SSE4 intrinsics for GPU memcpy',
+        'deps_any': [ 'dxva2-hwaccel' ],
+        'func': check_cc(fragment=load_fragment('sse.c')),
     }
 ]
 
diff --git a/wscript_build.py b/wscript_build.py
index 531da901a7..1d32b1a922 100644
--- a/wscript_build.py
+++ b/wscript_build.py
@@ -267,6 +267,7 @@ def build(ctx):
         ## Video
         ( "video/csputils.c" ),
         ( "video/fmt-conversion.c" ),
+        ( "video/gpu_memcpy.c", "sse4-intrinsics" ),
         ( "video/image_writer.c" ),
         ( "video/img_format.c" ),
         ( "video/mp_image.c" ),
-- 
cgit v1.2.3
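
Usage note (illustration only, not part of the patch): the intended call
pattern for the two new helpers looks roughly like this. download_frame()
and its arguments are hypothetical stand-ins for whatever the calling
decoder has at hand; only mp_check_gpu_memcpy() and mp_image_copy_gpu()
come from the patch.

    #include <stdbool.h>

    #include "video/mp_image.h"

    // Hypothetical sketch: download one decoded frame from GPU-visible
    // memory (src) into a system-memory image of the same format and
    // size (dst).
    static void download_frame(struct mp_log *log, struct mp_image *dst,
                               struct mp_image *src)
    {
        static bool logged_once;
        // Logs "Using SSE4 memcpy" (verbose) or the slow-fallback
        // warning; the `once` flag suppresses repeats.
        mp_check_gpu_memcpy(log, &logged_once);

        // Uses gpu_memcpy() (SSE4.1 MOVNTDQA streaming loads) when built
        // with HAVE_SSE4_INTRINSICS and the CPU reports AV_CPU_FLAG_SSE4;
        // otherwise this is a plain mp_image_copy().
        mp_image_copy_gpu(dst, src);
    }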
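
A note on the shared strided-copy helper: memcpy_pic_cb()'s fast path
copies srcStride * (height - 1) + bytesPerLine bytes rather than
srcStride * height, so the (possibly unallocated) padding after the last
line is never touched. A standalone restatement of that logic, assuming
the same semantics as the patch (the negative-stride flip is omitted for
brevity):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    // Sketch of memcpy_pic_cb() specialized to memcpy as the callback.
    static void copy_plane(uint8_t *dst, const uint8_t *src,
                           int bytes_per_line, int height,
                           int dst_stride, int src_stride)
    {
        if (bytes_per_line == dst_stride && dst_stride == src_stride && height) {
            // All lines (including their padding) are contiguous, so one
            // call suffices; stopping after the last line's payload keeps
            // the copy inside the allocation.
            memcpy(dst, src, (size_t)src_stride * (height - 1) + bytes_per_line);
        } else {
            // Mismatched strides: copy the payload of each line separately.
            for (int i = 0; i < height; i++)
                memcpy(dst + (size_t)i * dst_stride,
                       src + (size_t)i * src_stride, (size_t)bytes_per_line);
        }
    }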