diff options
-rwxr-xr-x | TOOLS/old-configure | 1 | ||||
-rw-r--r-- | TOOLS/old-makefile | 1 | ||||
-rw-r--r-- | video/decode/dxva2.c | 62 | ||||
-rw-r--r-- | video/gpu_memcpy.c (renamed from video/decode/gpu_memcpy_sse4.h) | 13 | ||||
-rw-r--r-- | video/gpu_memcpy.h | 8 | ||||
-rw-r--r-- | video/mp_image.c | 82 | ||||
-rw-r--r-- | video/mp_image.h | 3 | ||||
-rw-r--r-- | waftools/fragments/sse.c | 18 | ||||
-rw-r--r-- | wscript | 5 | ||||
-rw-r--r-- | wscript_build.py | 1 |
10 files changed, 112 insertions, 82 deletions
diff --git a/TOOLS/old-configure b/TOOLS/old-configure index f68f95f3aa..41c9b1c9bd 100755 --- a/TOOLS/old-configure +++ b/TOOLS/old-configure @@ -964,6 +964,7 @@ cat > $TMPC << EOF #define HAVE_VIDEOTOOLBOX_HWACCEL 0 #define HAVE_VIDEOTOOLBOX_GL 0 #define HAVE_VIDEOTOOLBOX_VDA_GL 0 +#define HAVE_SSE4_INTRINSICS 1 #ifdef __OpenBSD__ #define DEFAULT_CDROM_DEVICE "/dev/rcd0c" diff --git a/TOOLS/old-makefile b/TOOLS/old-makefile index 1ed6bc4b3f..bb15f96630 100644 --- a/TOOLS/old-makefile +++ b/TOOLS/old-makefile @@ -234,6 +234,7 @@ SOURCES = audio/audio.c \ ta/ta_talloc.c \ video/csputils.c \ video/fmt-conversion.c \ + video/gpu_memcpy.c \ video/image_writer.c \ video/img_format.c \ video/mp_image.c \ diff --git a/video/decode/dxva2.c b/video/decode/dxva2.c index 5e06f505ac..0f7542817d 100644 --- a/video/decode/dxva2.c +++ b/video/decode/dxva2.c @@ -37,7 +37,6 @@ #include "video/mp_image_pool.h" #include "video/hwdec.h" #include "video/d3d.h" -#include "gpu_memcpy_sse4.h" // A minor evil. #ifndef FF_DXVA2_WORKAROUND_INTEL_CLEARVIDEO @@ -98,9 +97,6 @@ typedef struct surface_info { typedef struct DXVA2Context { struct mp_log *log; - void (*copy_nv12)(struct mp_image *dest, uint8_t *src_bits, - unsigned src_pitch, unsigned surf_height); - HMODULE d3dlib; HMODULE dxva2lib; @@ -243,8 +239,8 @@ static struct mp_image *dxva2_allocate_image(struct lavc_ctx *s, return mp_image_new_custom_ref(&mpi, w, dxva2_release_img); } -static void copy_nv12_fallback(struct mp_image *dest, uint8_t *src_bits, - unsigned src_pitch, unsigned surf_height) +static void copy_nv12(struct mp_image *dest, uint8_t *src_bits, + unsigned src_pitch, unsigned surf_height) { struct mp_image buf = {0}; mp_image_setfmt(&buf, IMGFMT_NV12); @@ -254,49 +250,9 @@ static void copy_nv12_fallback(struct mp_image *dest, uint8_t *src_bits, buf.stride[0] = src_pitch; buf.planes[1] = src_bits + src_pitch * surf_height; buf.stride[1] = src_pitch; - mp_image_copy(dest, &buf); + mp_image_copy_gpu(dest, &buf); } -#pragma GCC push_options -#pragma GCC target("sse4.1") - -static void copy_nv12_gpu_sse4(struct mp_image *dest, uint8_t *src_bits, - unsigned src_pitch, unsigned surf_height) -{ - const int lines = dest->h; - const int stride_y = dest->stride[0]; - const int stride_uv = dest->stride[1]; - - // If the strides match, the image can be copied in one go - if (stride_y == src_pitch && stride_uv == src_pitch) { - const size_t size = lines * src_pitch; - gpu_memcpy(dest->planes[0], src_bits, size); - gpu_memcpy(dest->planes[1], src_bits + src_pitch * surf_height, size / 2); - - } else { - // Copy the Y plane line-by-line - uint8_t *dest_y = dest->planes[0]; - const uint8_t *src_y = src_bits; - const int bytes_per_line = dest->w; - for (int i = 0; i < lines; i++) { - gpu_memcpy(dest_y, src_y, bytes_per_line); - dest_y += stride_y; - src_y += src_pitch; - } - - // Copy the UV plane line-by-line - uint8_t *dest_uv = dest->planes[1]; - const uint8_t *src_uv = src_bits + src_pitch * surf_height; - for (int i = 0; i < lines / 2; i++) { - gpu_memcpy(dest_uv, src_uv, bytes_per_line); - dest_uv += stride_uv; - src_uv += src_pitch; - } - } -} - -#pragma GCC pop_options - static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s, struct mp_image *img) { @@ -324,7 +280,7 @@ static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s, return img; } - ctx->copy_nv12(sw_img, LockedRect.pBits, LockedRect.Pitch, surfaceDesc.Height); + copy_nv12(sw_img, LockedRect.pBits, LockedRect.Pitch, surfaceDesc.Height); mp_image_set_size(sw_img, img->w, img->h); mp_image_copy_attributes(sw_img, img); @@ -408,15 +364,7 @@ static int dxva2_init(struct lavc_ctx *s) ctx->log = mp_log_new(s, s->log, "dxva2"); ctx->sw_pool = talloc_steal(ctx, mp_image_pool_new(17)); - if (av_get_cpu_flags() & AV_CPU_FLAG_SSE4) { - // Use a memcpy implementation optimised for copying from GPU memory - MP_DBG(ctx, "Using SSE4 memcpy\n"); - ctx->copy_nv12 = copy_nv12_gpu_sse4; - } else { - // Use the CRT memcpy. This can be slower than software decoding. - MP_WARN(ctx, "Using fallback memcpy (slow)\n"); - ctx->copy_nv12 = copy_nv12_fallback; - } + mp_check_gpu_memcpy(ctx->log, NULL); ctx->deviceHandle = INVALID_HANDLE_VALUE; diff --git a/video/decode/gpu_memcpy_sse4.h b/video/gpu_memcpy.c index 160209bdc5..355da0e2a2 100644 --- a/video/decode/gpu_memcpy_sse4.h +++ b/video/gpu_memcpy.c @@ -19,18 +19,20 @@ * Taken from the QuickSync decoder by Eric Gur */ -#ifndef GPU_MEMCPY_SSE4_H_ -#define GPU_MEMCPY_SSE4_H_ - #pragma GCC push_options #pragma GCC target("sse4.1") #include <smmintrin.h> +#include <stdbool.h> +#include <string.h> + +#include "gpu_memcpy.h" + // gpu_memcpy is a memcpy style function that copied data very fast from a // GPU tiled memory (write back) // Performance tip: page offset (12 lsb) of both addresses should be different // optimally use a 2K offset between them. -static inline void *gpu_memcpy(void *restrict d, const void *restrict s, size_t size) +void *gpu_memcpy(void *restrict d, const void *restrict s, size_t size) { static const size_t regsInLoop = sizeof(size_t) * 2; // 8 or 16 @@ -131,6 +133,3 @@ static inline void *gpu_memcpy(void *restrict d, const void *restrict s, size_t return d; } - -#pragma GCC pop_options -#endif diff --git a/video/gpu_memcpy.h b/video/gpu_memcpy.h new file mode 100644 index 0000000000..c62f754aac --- /dev/null +++ b/video/gpu_memcpy.h @@ -0,0 +1,8 @@ +#ifndef GPU_MEMCPY_SSE4_H_ +#define GPU_MEMCPY_SSE4_H_ + +#include <stddef.h> + +void *gpu_memcpy(void *restrict d, const void *restrict s, size_t size); + +#endif diff --git a/video/mp_image.c b/video/mp_image.c index debdbbb201..57650eea0d 100644 --- a/video/mp_image.c +++ b/video/mp_image.c @@ -35,6 +35,7 @@ #include "mp_image.h" #include "sws_utils.h" #include "fmt-conversion.h" +#include "gpu_memcpy.h" #include "video/filter/vf.h" @@ -300,7 +301,30 @@ void mp_image_unrefp(struct mp_image **p_img) *p_img = NULL; } -void mp_image_copy(struct mp_image *dst, struct mp_image *src) +typedef void *(*memcpy_fn)(void *d, const void *s, size_t size); + +static void memcpy_pic_cb(void *dst, const void *src, int bytesPerLine, int height, + int dstStride, int srcStride, memcpy_fn cpy) +{ + if (bytesPerLine == dstStride && dstStride == srcStride && height) { + if (srcStride < 0) { + src = (uint8_t*)src + (height - 1) * srcStride; + dst = (uint8_t*)dst + (height - 1) * dstStride; + srcStride = -srcStride; + } + + cpy(dst, src, srcStride * (height - 1) + bytesPerLine); + } else { + for (int i = 0; i < height; i++) { + cpy(dst, src, bytesPerLine); + src = (uint8_t*)src + srcStride; + dst = (uint8_t*)dst + dstStride; + } + } +} + +static void mp_image_copy_cb(struct mp_image *dst, struct mp_image *src, + memcpy_fn cpy) { assert(dst->imgfmt == src->imgfmt); assert(dst->w == src->w && dst->h == src->h); @@ -308,14 +332,50 @@ void mp_image_copy(struct mp_image *dst, struct mp_image *src) for (int n = 0; n < dst->num_planes; n++) { int line_bytes = (mp_image_plane_w(dst, n) * dst->fmt.bpp[n] + 7) / 8; int plane_h = mp_image_plane_h(dst, n); - memcpy_pic(dst->planes[n], src->planes[n], line_bytes, plane_h, - dst->stride[n], src->stride[n]); + memcpy_pic_cb(dst->planes[n], src->planes[n], line_bytes, plane_h, + dst->stride[n], src->stride[n], cpy); } // Watch out for AV_PIX_FMT_FLAG_PSEUDOPAL retardation if ((dst->fmt.flags & MP_IMGFLAG_PAL) && dst->planes[1] && src->planes[1]) memcpy(dst->planes[1], src->planes[1], MP_PALETTE_SIZE); } +void mp_image_copy(struct mp_image *dst, struct mp_image *src) +{ + mp_image_copy_cb(dst, src, memcpy); +} + +void mp_image_copy_gpu(struct mp_image *dst, struct mp_image *src) +{ +#if HAVE_SSE4_INTRINSICS + if (av_get_cpu_flags() & AV_CPU_FLAG_SSE4) { + mp_image_copy_cb(dst, src, gpu_memcpy); + return; + } +#endif + mp_image_copy(dst, src); +} + +// Helper, only for outputting some log info. +void mp_check_gpu_memcpy(struct mp_log *log, bool *once) +{ + if (once) { + if (*once) + return; + *once = true; + } + + bool have_sse = false; +#if HAVE_SSE4_INTRINSICS + have_sse = av_get_cpu_flags() & AV_CPU_FLAG_SSE4; +#endif + if (have_sse) { + mp_verbose(log, "Using SSE4 memcpy\n"); + } else { + mp_warn(log, "Using fallback memcpy (slow)\n"); + } +} + void mp_image_copy_attributes(struct mp_image *dst, struct mp_image *src) { dst->pict_type = src->pict_type; @@ -675,21 +735,7 @@ struct AVFrame *mp_image_to_av_frame_and_unref(struct mp_image *img) void memcpy_pic(void *dst, const void *src, int bytesPerLine, int height, int dstStride, int srcStride) { - if (bytesPerLine == dstStride && dstStride == srcStride && height) { - if (srcStride < 0) { - src = (uint8_t*)src + (height - 1) * srcStride; - dst = (uint8_t*)dst + (height - 1) * dstStride; - srcStride = -srcStride; - } - - memcpy(dst, src, srcStride * (height - 1) + bytesPerLine); - } else { - for (int i = 0; i < height; i++) { - memcpy(dst, src, bytesPerLine); - src = (uint8_t*)src + srcStride; - dst = (uint8_t*)dst + dstStride; - } - } + memcpy_pic_cb(dst, src, bytesPerLine, height, dstStride, srcStride, memcpy); } void memset_pic(void *dst, int fill, int bytesPerLine, int height, int stride) diff --git a/video/mp_image.h b/video/mp_image.h index f5759205f4..25eb42c050 100644 --- a/video/mp_image.h +++ b/video/mp_image.h @@ -106,6 +106,7 @@ int mp_chroma_div_up(int size, int shift); struct mp_image *mp_image_alloc(int fmt, int w, int h); void mp_image_copy(struct mp_image *dmpi, struct mp_image *mpi); +void mp_image_copy_gpu(struct mp_image *dst, struct mp_image *src); void mp_image_copy_attributes(struct mp_image *dmpi, struct mp_image *mpi); struct mp_image *mp_image_new_copy(struct mp_image *img); struct mp_image *mp_image_new_ref(struct mp_image *img); @@ -159,4 +160,6 @@ void memcpy_pic(void *dst, const void *src, int bytesPerLine, int height, void memset_pic(void *dst, int fill, int bytesPerLine, int height, int stride); void memset16_pic(void *dst, int fill, int unitsPerLine, int height, int stride); +void mp_check_gpu_memcpy(struct mp_log *log, bool *once); + #endif /* MPLAYER_MP_IMAGE_H */ diff --git a/waftools/fragments/sse.c b/waftools/fragments/sse.c new file mode 100644 index 0000000000..e9689cda17 --- /dev/null +++ b/waftools/fragments/sse.c @@ -0,0 +1,18 @@ +#pragma GCC push_options +#pragma GCC target("sse4.1") +#include <smmintrin.h> + +void *a_ptr; + +int main(void) +{ + __m128i xmm0; + __m128i* p = (__m128i*)a_ptr; + + _mm_sfence(); + + xmm0 = _mm_stream_load_si128(p + 1); + _mm_store_si128(p + 2, xmm0); + + return 0; +} @@ -769,6 +769,11 @@ hwaccel_features = [ 'desc': 'libavcodec DXVA2 hwaccel', 'deps': [ 'win32' ], 'func': check_headers('libavcodec/dxva2.h', use='libav'), + }, { + 'name': 'sse4-intrinsics', + 'desc': 'GCC SSE4 intrinsics for GPU memcpy', + 'deps_any': [ 'dxva2-hwaccel' ], + 'func': check_cc(fragment=load_fragment('sse.c')), } ] diff --git a/wscript_build.py b/wscript_build.py index 531da901a7..1d32b1a922 100644 --- a/wscript_build.py +++ b/wscript_build.py @@ -267,6 +267,7 @@ def build(ctx): ## Video ( "video/csputils.c" ), ( "video/fmt-conversion.c" ), + ( "video/gpu_memcpy.c", "sse4-intrinsics" ), ( "video/image_writer.c" ), ( "video/img_format.c" ), ( "video/mp_image.c" ), |