From 52b52800ce083364a9c0baa1d6e0564eaae3b4d7 Mon Sep 17 00:00:00 2001 From: James Ross-Gowan Date: Sun, 26 Oct 2014 10:43:15 +1100 Subject: dxva2: use optimized memcpy At least on my machine, reading back the frame with system memcpy is slower than just using software decoding. Use the optimized gpu_memcpy from LAV to speed things up. --- video/decode/dxva2.c | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) (limited to 'video/decode/dxva2.c') diff --git a/video/decode/dxva2.c b/video/decode/dxva2.c index 21f9fcd633..e03014438a 100644 --- a/video/decode/dxva2.c +++ b/video/decode/dxva2.c @@ -41,6 +41,7 @@ #include "video/fmt-conversion.h" #include "video/mp_image_pool.h" #include "video/hwdec.h" +#include "gpu_memcpy_sse4.h" // A minor evil. #ifndef FF_DXVA2_WORKAROUND_INTEL_CLEARVIDEO @@ -98,6 +99,9 @@ typedef struct surface_info { typedef struct DXVA2Context { struct mp_log *log; + void (*copy_nv12)(struct mp_image *dest, uint8_t *src_bits, + unsigned src_pitch, unsigned surf_height); + HMODULE d3dlib; HMODULE dxva2lib; @@ -241,6 +245,26 @@ static struct mp_image *dxva2_allocate_image(struct lavc_ctx *s, int fmt, return mp_image_new_custom_ref(&mpi, w, dxva2_release_img); } +static void copy_nv12_fallback(struct mp_image *dest, uint8_t *src_bits, + unsigned src_pitch, unsigned surf_height) +{ + unsigned height = dest->h * src_pitch; + memcpy(dest->planes[0], src_bits, height); + dest->stride[0] = src_pitch; + memcpy(dest->planes[1], src_bits + src_pitch * surf_height, height / 2); + dest->stride[1] = src_pitch; +} + +static void copy_nv12_gpu_sse4(struct mp_image *dest, uint8_t *src_bits, + unsigned src_pitch, unsigned surf_height) +{ + unsigned height = dest->h * src_pitch; + gpu_memcpy(dest->planes[0], src_bits, height); + dest->stride[0] = src_pitch; + gpu_memcpy(dest->planes[1], src_bits + src_pitch * surf_height, height / 2); + dest->stride[1] = src_pitch; +} + static struct mp_image 
*dxva2_retrieve_image(struct lavc_ctx *s, struct mp_image *img) { @@ -265,17 +289,7 @@ static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s, return img; } - struct mp_image buf = {0}; - mp_image_setfmt(&buf, IMGFMT_NV12); - mp_image_set_size(&buf, img->w, img->h); - - buf.planes[0] = LockedRect.pBits; - buf.stride[0] = LockedRect.Pitch; - buf.planes[1] = (char *)LockedRect.pBits + LockedRect.Pitch * surfaceDesc.Height; - buf.stride[1] = LockedRect.Pitch; - - // This should probably use some sort of "special" memcpy-like function. - mp_image_copy(sw_img, &buf); + ctx->copy_nv12(sw_img, LockedRect.pBits, LockedRect.Pitch, surfaceDesc.Height); IDirect3DSurface9_UnlockRect(surface); @@ -302,6 +316,16 @@ static int dxva2_init(struct lavc_ctx *s) ctx->log = mp_log_new(s, s->log, "dxva2"); ctx->sw_pool = talloc_steal(ctx, mp_image_pool_new(17)); + if (av_get_cpu_flags() & AV_CPU_FLAG_SSE4) { + // Use a memcpy implementation optimised for copying from GPU memory + MP_DBG(ctx, "Using SSE4 memcpy\n"); + ctx->copy_nv12 = copy_nv12_gpu_sse4; + } else { + // Use the CRT memcpy. This can be slower than software decoding. + MP_WARN(ctx, "Using fallback memcpy (slow)\n"); + ctx->copy_nv12 = copy_nv12_fallback; + } + ctx->deviceHandle = INVALID_HANDLE_VALUE; ctx->d3dlib = LoadLibrary("d3d9.dll"); -- cgit v1.2.3