summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rwxr-xr-x TOOLS/old-configure 1
-rw-r--r-- TOOLS/old-makefile 1
-rw-r--r-- video/decode/dxva2.c 62
-rw-r--r-- video/gpu_memcpy.c (renamed from video/decode/gpu_memcpy_sse4.h) 13
-rw-r--r-- video/gpu_memcpy.h 8
-rw-r--r-- video/mp_image.c 82
-rw-r--r-- video/mp_image.h 3
-rw-r--r-- waftools/fragments/sse.c 18
-rw-r--r-- wscript 5
-rw-r--r-- wscript_build.py 1
10 files changed, 112 insertions, 82 deletions
diff --git a/TOOLS/old-configure b/TOOLS/old-configure
index f68f95f3aa..41c9b1c9bd 100755
--- a/TOOLS/old-configure
+++ b/TOOLS/old-configure
@@ -964,6 +964,7 @@ cat > $TMPC << EOF
#define HAVE_VIDEOTOOLBOX_HWACCEL 0
#define HAVE_VIDEOTOOLBOX_GL 0
#define HAVE_VIDEOTOOLBOX_VDA_GL 0
+#define HAVE_SSE4_INTRINSICS 1
#ifdef __OpenBSD__
#define DEFAULT_CDROM_DEVICE "/dev/rcd0c"
diff --git a/TOOLS/old-makefile b/TOOLS/old-makefile
index 1ed6bc4b3f..bb15f96630 100644
--- a/TOOLS/old-makefile
+++ b/TOOLS/old-makefile
@@ -234,6 +234,7 @@ SOURCES = audio/audio.c \
ta/ta_talloc.c \
video/csputils.c \
video/fmt-conversion.c \
+ video/gpu_memcpy.c \
video/image_writer.c \
video/img_format.c \
video/mp_image.c \
diff --git a/video/decode/dxva2.c b/video/decode/dxva2.c
index 5e06f505ac..0f7542817d 100644
--- a/video/decode/dxva2.c
+++ b/video/decode/dxva2.c
@@ -37,7 +37,6 @@
#include "video/mp_image_pool.h"
#include "video/hwdec.h"
#include "video/d3d.h"
-#include "gpu_memcpy_sse4.h"
// A minor evil.
#ifndef FF_DXVA2_WORKAROUND_INTEL_CLEARVIDEO
@@ -98,9 +97,6 @@ typedef struct surface_info {
typedef struct DXVA2Context {
struct mp_log *log;
- void (*copy_nv12)(struct mp_image *dest, uint8_t *src_bits,
- unsigned src_pitch, unsigned surf_height);
-
HMODULE d3dlib;
HMODULE dxva2lib;
@@ -243,8 +239,8 @@ static struct mp_image *dxva2_allocate_image(struct lavc_ctx *s,
return mp_image_new_custom_ref(&mpi, w, dxva2_release_img);
}
-static void copy_nv12_fallback(struct mp_image *dest, uint8_t *src_bits,
- unsigned src_pitch, unsigned surf_height)
+static void copy_nv12(struct mp_image *dest, uint8_t *src_bits,
+ unsigned src_pitch, unsigned surf_height)
{
struct mp_image buf = {0};
mp_image_setfmt(&buf, IMGFMT_NV12);
@@ -254,49 +250,9 @@ static void copy_nv12_fallback(struct mp_image *dest, uint8_t *src_bits,
buf.stride[0] = src_pitch;
buf.planes[1] = src_bits + src_pitch * surf_height;
buf.stride[1] = src_pitch;
- mp_image_copy(dest, &buf);
+ mp_image_copy_gpu(dest, &buf);
}
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-
-static void copy_nv12_gpu_sse4(struct mp_image *dest, uint8_t *src_bits,
- unsigned src_pitch, unsigned surf_height)
-{
- const int lines = dest->h;
- const int stride_y = dest->stride[0];
- const int stride_uv = dest->stride[1];
-
- // If the strides match, the image can be copied in one go
- if (stride_y == src_pitch && stride_uv == src_pitch) {
- const size_t size = lines * src_pitch;
- gpu_memcpy(dest->planes[0], src_bits, size);
- gpu_memcpy(dest->planes[1], src_bits + src_pitch * surf_height, size / 2);
-
- } else {
- // Copy the Y plane line-by-line
- uint8_t *dest_y = dest->planes[0];
- const uint8_t *src_y = src_bits;
- const int bytes_per_line = dest->w;
- for (int i = 0; i < lines; i++) {
- gpu_memcpy(dest_y, src_y, bytes_per_line);
- dest_y += stride_y;
- src_y += src_pitch;
- }
-
- // Copy the UV plane line-by-line
- uint8_t *dest_uv = dest->planes[1];
- const uint8_t *src_uv = src_bits + src_pitch * surf_height;
- for (int i = 0; i < lines / 2; i++) {
- gpu_memcpy(dest_uv, src_uv, bytes_per_line);
- dest_uv += stride_uv;
- src_uv += src_pitch;
- }
- }
-}
-
-#pragma GCC pop_options
-
static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
struct mp_image *img)
{
@@ -324,7 +280,7 @@ static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
return img;
}
- ctx->copy_nv12(sw_img, LockedRect.pBits, LockedRect.Pitch, surfaceDesc.Height);
+ copy_nv12(sw_img, LockedRect.pBits, LockedRect.Pitch, surfaceDesc.Height);
mp_image_set_size(sw_img, img->w, img->h);
mp_image_copy_attributes(sw_img, img);
@@ -408,15 +364,7 @@ static int dxva2_init(struct lavc_ctx *s)
ctx->log = mp_log_new(s, s->log, "dxva2");
ctx->sw_pool = talloc_steal(ctx, mp_image_pool_new(17));
- if (av_get_cpu_flags() & AV_CPU_FLAG_SSE4) {
- // Use a memcpy implementation optimised for copying from GPU memory
- MP_DBG(ctx, "Using SSE4 memcpy\n");
- ctx->copy_nv12 = copy_nv12_gpu_sse4;
- } else {
- // Use the CRT memcpy. This can be slower than software decoding.
- MP_WARN(ctx, "Using fallback memcpy (slow)\n");
- ctx->copy_nv12 = copy_nv12_fallback;
- }
+ mp_check_gpu_memcpy(ctx->log, NULL);
ctx->deviceHandle = INVALID_HANDLE_VALUE;
diff --git a/video/decode/gpu_memcpy_sse4.h b/video/gpu_memcpy.c
index 160209bdc5..355da0e2a2 100644
--- a/video/decode/gpu_memcpy_sse4.h
+++ b/video/gpu_memcpy.c
@@ -19,18 +19,20 @@
* Taken from the QuickSync decoder by Eric Gur
*/
-#ifndef GPU_MEMCPY_SSE4_H_
-#define GPU_MEMCPY_SSE4_H_
-
#pragma GCC push_options
#pragma GCC target("sse4.1")
#include <smmintrin.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "gpu_memcpy.h"
+
// gpu_memcpy is a memcpy style function that copied data very fast from a
// GPU tiled memory (write back)
// Performance tip: page offset (12 lsb) of both addresses should be different
// optimally use a 2K offset between them.
-static inline void *gpu_memcpy(void *restrict d, const void *restrict s, size_t size)
+void *gpu_memcpy(void *restrict d, const void *restrict s, size_t size)
{
static const size_t regsInLoop = sizeof(size_t) * 2; // 8 or 16
@@ -131,6 +133,3 @@ static inline void *gpu_memcpy(void *restrict d, const void *restrict s, size_t
return d;
}
-
-#pragma GCC pop_options
-#endif
diff --git a/video/gpu_memcpy.h b/video/gpu_memcpy.h
new file mode 100644
index 0000000000..c62f754aac
--- /dev/null
+++ b/video/gpu_memcpy.h
@@ -0,0 +1,8 @@
+#ifndef GPU_MEMCPY_SSE4_H_
+#define GPU_MEMCPY_SSE4_H_
+
+#include <stddef.h>
+
+void *gpu_memcpy(void *restrict d, const void *restrict s, size_t size);
+
+#endif
diff --git a/video/mp_image.c b/video/mp_image.c
index debdbbb201..57650eea0d 100644
--- a/video/mp_image.c
+++ b/video/mp_image.c
@@ -35,6 +35,7 @@
#include "mp_image.h"
#include "sws_utils.h"
#include "fmt-conversion.h"
+#include "gpu_memcpy.h"
#include "video/filter/vf.h"
@@ -300,7 +301,30 @@ void mp_image_unrefp(struct mp_image **p_img)
*p_img = NULL;
}
-void mp_image_copy(struct mp_image *dst, struct mp_image *src)
+typedef void *(*memcpy_fn)(void *d, const void *s, size_t size);
+
+static void memcpy_pic_cb(void *dst, const void *src, int bytesPerLine, int height,
+ int dstStride, int srcStride, memcpy_fn cpy)
+{
+ if (bytesPerLine == dstStride && dstStride == srcStride && height) {
+ if (srcStride < 0) {
+ src = (uint8_t*)src + (height - 1) * srcStride;
+ dst = (uint8_t*)dst + (height - 1) * dstStride;
+ srcStride = -srcStride;
+ }
+
+ cpy(dst, src, srcStride * (height - 1) + bytesPerLine);
+ } else {
+ for (int i = 0; i < height; i++) {
+ cpy(dst, src, bytesPerLine);
+ src = (uint8_t*)src + srcStride;
+ dst = (uint8_t*)dst + dstStride;
+ }
+ }
+}
+
+static void mp_image_copy_cb(struct mp_image *dst, struct mp_image *src,
+ memcpy_fn cpy)
{
assert(dst->imgfmt == src->imgfmt);
assert(dst->w == src->w && dst->h == src->h);
@@ -308,14 +332,50 @@ void mp_image_copy(struct mp_image *dst, struct mp_image *src)
for (int n = 0; n < dst->num_planes; n++) {
int line_bytes = (mp_image_plane_w(dst, n) * dst->fmt.bpp[n] + 7) / 8;
int plane_h = mp_image_plane_h(dst, n);
- memcpy_pic(dst->planes[n], src->planes[n], line_bytes, plane_h,
- dst->stride[n], src->stride[n]);
+ memcpy_pic_cb(dst->planes[n], src->planes[n], line_bytes, plane_h,
+ dst->stride[n], src->stride[n], cpy);
}
// Watch out for AV_PIX_FMT_FLAG_PSEUDOPAL retardation
if ((dst->fmt.flags & MP_IMGFLAG_PAL) && dst->planes[1] && src->planes[1])
memcpy(dst->planes[1], src->planes[1], MP_PALETTE_SIZE);
}
+void mp_image_copy(struct mp_image *dst, struct mp_image *src)
+{
+ mp_image_copy_cb(dst, src, memcpy);
+}
+
+void mp_image_copy_gpu(struct mp_image *dst, struct mp_image *src)
+{
+#if HAVE_SSE4_INTRINSICS
+ if (av_get_cpu_flags() & AV_CPU_FLAG_SSE4) {
+ mp_image_copy_cb(dst, src, gpu_memcpy);
+ return;
+ }
+#endif
+ mp_image_copy(dst, src);
+}
+
+// Helper, only for outputting some log info.
+void mp_check_gpu_memcpy(struct mp_log *log, bool *once)
+{
+ if (once) {
+ if (*once)
+ return;
+ *once = true;
+ }
+
+ bool have_sse = false;
+#if HAVE_SSE4_INTRINSICS
+ have_sse = av_get_cpu_flags() & AV_CPU_FLAG_SSE4;
+#endif
+ if (have_sse) {
+ mp_verbose(log, "Using SSE4 memcpy\n");
+ } else {
+ mp_warn(log, "Using fallback memcpy (slow)\n");
+ }
+}
+
void mp_image_copy_attributes(struct mp_image *dst, struct mp_image *src)
{
dst->pict_type = src->pict_type;
@@ -675,21 +735,7 @@ struct AVFrame *mp_image_to_av_frame_and_unref(struct mp_image *img)
void memcpy_pic(void *dst, const void *src, int bytesPerLine, int height,
int dstStride, int srcStride)
{
- if (bytesPerLine == dstStride && dstStride == srcStride && height) {
- if (srcStride < 0) {
- src = (uint8_t*)src + (height - 1) * srcStride;
- dst = (uint8_t*)dst + (height - 1) * dstStride;
- srcStride = -srcStride;
- }
-
- memcpy(dst, src, srcStride * (height - 1) + bytesPerLine);
- } else {
- for (int i = 0; i < height; i++) {
- memcpy(dst, src, bytesPerLine);
- src = (uint8_t*)src + srcStride;
- dst = (uint8_t*)dst + dstStride;
- }
- }
+ memcpy_pic_cb(dst, src, bytesPerLine, height, dstStride, srcStride, memcpy);
}
void memset_pic(void *dst, int fill, int bytesPerLine, int height, int stride)
diff --git a/video/mp_image.h b/video/mp_image.h
index f5759205f4..25eb42c050 100644
--- a/video/mp_image.h
+++ b/video/mp_image.h
@@ -106,6 +106,7 @@ int mp_chroma_div_up(int size, int shift);
struct mp_image *mp_image_alloc(int fmt, int w, int h);
void mp_image_copy(struct mp_image *dmpi, struct mp_image *mpi);
+void mp_image_copy_gpu(struct mp_image *dst, struct mp_image *src);
void mp_image_copy_attributes(struct mp_image *dmpi, struct mp_image *mpi);
struct mp_image *mp_image_new_copy(struct mp_image *img);
struct mp_image *mp_image_new_ref(struct mp_image *img);
@@ -159,4 +160,6 @@ void memcpy_pic(void *dst, const void *src, int bytesPerLine, int height,
void memset_pic(void *dst, int fill, int bytesPerLine, int height, int stride);
void memset16_pic(void *dst, int fill, int unitsPerLine, int height, int stride);
+void mp_check_gpu_memcpy(struct mp_log *log, bool *once);
+
#endif /* MPLAYER_MP_IMAGE_H */
diff --git a/waftools/fragments/sse.c b/waftools/fragments/sse.c
new file mode 100644
index 0000000000..e9689cda17
--- /dev/null
+++ b/waftools/fragments/sse.c
@@ -0,0 +1,18 @@
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#include <smmintrin.h>
+
+void *a_ptr;
+
+int main(void)
+{
+ __m128i xmm0;
+ __m128i* p = (__m128i*)a_ptr;
+
+ _mm_sfence();
+
+ xmm0 = _mm_stream_load_si128(p + 1);
+ _mm_store_si128(p + 2, xmm0);
+
+ return 0;
+}
diff --git a/wscript b/wscript
index 8093ea396c..63c74686fa 100644
--- a/wscript
+++ b/wscript
@@ -769,6 +769,11 @@ hwaccel_features = [
'desc': 'libavcodec DXVA2 hwaccel',
'deps': [ 'win32' ],
'func': check_headers('libavcodec/dxva2.h', use='libav'),
+ }, {
+ 'name': 'sse4-intrinsics',
+ 'desc': 'GCC SSE4 intrinsics for GPU memcpy',
+ 'deps_any': [ 'dxva2-hwaccel' ],
+ 'func': check_cc(fragment=load_fragment('sse.c')),
}
]
diff --git a/wscript_build.py b/wscript_build.py
index 531da901a7..1d32b1a922 100644
--- a/wscript_build.py
+++ b/wscript_build.py
@@ -267,6 +267,7 @@ def build(ctx):
## Video
( "video/csputils.c" ),
( "video/fmt-conversion.c" ),
+ ( "video/gpu_memcpy.c", "sse4-intrinsics" ),
( "video/image_writer.c" ),
( "video/img_format.c" ),
( "video/mp_image.c" ),