1 files changed, 60 insertions, 894 deletions
diff --git a/video/zimg.c b/video/zimg.c
index ae3602d297..4e7711f61a 100644
--- a/video/zimg.c
+++ b/video/zimg.c
@@ -25,6 +25,7 @@
 #include "csputils.h"
 #include "options/m_config.h"
 #include "options/m_option.h"
+#include "repack.h"
 #include "video/fmt-conversion.h"
 #include "video/img_format.h"
 #include "zimg.h"
@@ -83,50 +84,14 @@ struct mp_zimg_repack {
     int num_planes;             // number of planes involved
     unsigned zmask[4];          // zmask[mp_index] = zimg mask (using mp index!)
     int z_planes[4];            // z_planes[zimg_index] = mp_index (or -1)
-    bool pass_through_y;        // luma plane optimization for e.g. nv12
 
-    // If set, the pack/unpack callback to pass to zimg.
-    // Called with user==mp_zimg_repack.
-    zimg_filter_graph_callback repack;
-
-    // Endian-swap (done before/after actual repacker).
-    int endian_size;            // 0=no swapping, 2/4=word byte size to swap
-    int endian_items[4];        // number of words per pixel/plane
-
-    // For packed_repack.
-    int components[4];          // p2[n] = mp_image.planes[components[n]]
-    //  pack:   p1 is dst, p2 is src
-    //  unpack: p1 is src, p2 is dst
-    void (*packed_repack_scanline)(void *p1, void *p2[], int x0, int x1);
-
-    // Fringe RGB/YUV.
-    uint8_t comp_size;
-    uint8_t *comp_map;
-    uint8_t comp_shifts[3];
-    uint8_t *comp_lut; // 256 * 3
+    struct mp_repack *repack;   // converting to/from planar
 
     // Temporary memory for slice-wise repacking. This may be set even if repack
     // is not set (then it may be used to avoid alignment issues). This has
     // about one slice worth of data.
     struct mp_image *tmp;
 
-    // Temporary memory for endian swapping. This has about one slice worth
-    // of data; set and used only if endian swapping is used (endian_size>0).
-    // It's also used only for pack==false; packers do this in-place.
-    struct mp_image *tmp_endian;
-
-    // Temporary, per-call source/target frame.
-    struct mp_image *mpi;
-    // Y coordinate of first line in mpi; usually 0 if mpi==user_mpi, or the
-    // start of the current slice (in the current repack cb).
-    // repackers should use: mpi->data[p] + mpi->stride[p] * (i - mpi_y0)
-    int mpi_y0;
-
-    struct mp_image *user_mpi;
-
-    // Also temporary, per-call. use_buf[n] == plane n uses tmp (and not mpi).
-    bool use_buf[4];
-
     int real_w, real_h;         // aligned size
 };
 
@@ -243,532 +208,44 @@ void mp_zimg_enable_cmdline_opts(struct mp_zimg_context *ctx,
     mp_zimg_update_from_cmdline(ctx); // first update
 }
 
-static int repack_align(void *user, unsigned i, unsigned x0, unsigned x1)
-{
-    struct mp_zimg_repack *r = user;
-
-    for (int p = 0; p < r->mpi->fmt.num_planes; p++) {
-        if (!r->use_buf[p])
-            continue;
-
-        int bpp = r->mpi->fmt.bytes[p];
-        int xs = r->mpi->fmt.xs[p];
-        int ys = r->mpi->fmt.ys[p];
-        // Number of lines on this plane.
-        int h = (1 << r->mpi->fmt.chroma_ys) - (1 << ys) + 1;
-
-        for (int y = i; y < i + h; y++) {
-            void *a = r->mpi->planes[p] +
-                      r->mpi->stride[p] * (ptrdiff_t)((y - r->mpi_y0) >> ys) +
-                      bpp * (x0 >> xs);
-            void *b = r->tmp->planes[p] +
-                      r->tmp->stride[p] * (ptrdiff_t)((y >> ys) & r->zmask[p]) +
-                      bpp * (x0 >> xs);
-            size_t size = ((x1 - x0) >> xs) * bpp;
-            if (r->pack) {
-                memcpy(a, b, size);
-            } else {
-                memcpy(b, a, size);
-            }
-        }
-    }
-
-    return 0;
-}
-
-// Swap endian for one line.
-static void swap_endian(struct mp_zimg_repack *r, struct mp_image *dst, int dst_y,
-                        struct mp_image *src, int src_y, int x0, int x1)
-{
-    for (int p = 0; p < dst->fmt.num_planes; p++) {
-        int xs = dst->fmt.xs[p];
-        int ys = dst->fmt.ys[p];
-        int words_per_pixel = r->endian_items[p];
-        int bpp = words_per_pixel * r->endian_size;
-        // Number of lines on this plane.
-        int h = (1 << dst->fmt.chroma_ys) - (1 << ys) + 1;
-        int num_words = ((x1 - x0) >> xs) * words_per_pixel;
-
-        for (int y = 0; y < h; y++) {
-            void *s = src->planes[p] +
-                      src->stride[p] * (ptrdiff_t)((y + src_y) >> ys) +
-                      bpp * (x0 >> xs);
-            void *d = dst->planes[p] +
-                      dst->stride[p] * (ptrdiff_t)((y + dst_y) >> ys) +
-                      bpp * (x0 >> xs);
-            switch (r->endian_size) {
-            case 2:
-                for (int w = 0; w < num_words; w++)
-                    ((uint16_t *)d)[w] = av_bswap16(((uint16_t *)s)[w]);
-                break;
-            case 4:
-                for (int w = 0; w < num_words; w++)
-                    ((uint32_t *)d)[w] = av_bswap32(((uint32_t *)s)[w]);
-                break;
-            default:
-                assert(0);
-            }
-        }
-    }
-}
-
-// PA = PAck, copy planar input to single packed array
-// UN = UNpack, copy packed input to planar output
-// Naming convention:
-//  pa_/un_ prefix to identify conversion direction.
-//  Left (LSB, lowest byte address) -> Right (MSB, highest byte address).
-//      (This is unusual; MSB to LSB is more commonly used to describe formats,
-//       but our convention makes more sense for byte access in little endian.)
-//  "c" identifies a color component.
-//  "z" identifies known zero padding.
-//  "x" identifies uninitialized padding.
-//  A component is followed by its size in bits.
-//  Size can be omitted for multiple uniform components (c8c8c8 == ccc8).
-// Unpackers will often use "x" for padding, because they ignore it, while
-// packers will use "z" because they write zero.
-
-#define PA_WORD_4(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, sh_c3)      \
-    static void name(void *dst, void *src[], int x0, int x1) {              \
-        for (int x = x0; x < x1; x++) {                                     \
-            ((packed_t *)dst)[x] =                                          \
-                ((packed_t)((plane_t *)src[0])[x] << (sh_c0)) |             \
-                ((packed_t)((plane_t *)src[1])[x] << (sh_c1)) |             \
-                ((packed_t)((plane_t *)src[2])[x] << (sh_c2)) |             \
-                ((packed_t)((plane_t *)src[3])[x] << (sh_c3));              \
-        }                                                                   \
-    }
-
-#define UN_WORD_4(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, sh_c3, mask)\
-    static void name(void *src, void *dst[], int x0, int x1) {              \
-        for (int x = x0; x < x1; x++) {                                     \
-            packed_t c = ((packed_t *)src)[x];                              \
-            ((plane_t *)dst[0])[x] = (c >> (sh_c0)) & (mask);               \
-            ((plane_t *)dst[1])[x] = (c >> (sh_c1)) & (mask);               \
-            ((plane_t *)dst[2])[x] = (c >> (sh_c2)) & (mask);               \
-            ((plane_t *)dst[3])[x] = (c >> (sh_c3)) & (mask);               \
-        }                                                                   \
-    }
-
-
-#define PA_WORD_3(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, pad)        \
-    static void name(void *dst, void *src[], int x0, int x1) {              \
-        for (int x = x0; x < x1; x++) {                                     \
-            ((packed_t *)dst)[x] = (pad) |                                  \
-                ((packed_t)((plane_t *)src[0])[x] << (sh_c0)) |             \
-                ((packed_t)((plane_t *)src[1])[x] << (sh_c1)) |             \
-                ((packed_t)((plane_t *)src[2])[x] << (sh_c2));              \
-        }                                                                   \
-    }
-
-UN_WORD_4(un_cccc8,  uint32_t, uint8_t,  0, 8,  16, 24, 0xFFu)
-PA_WORD_4(pa_cccc8,  uint32_t, uint8_t,  0, 8,  16, 24)
-// Not sure if this is a good idea; there may be no alignment guarantee.
-UN_WORD_4(un_cccc16,  uint64_t, uint16_t,  0, 16,  32, 48, 0xFFFFu)
-PA_WORD_4(pa_cccc16,  uint64_t, uint16_t,  0, 16,  32, 48)
-
-#define UN_WORD_3(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, mask)       \
-    static void name(void *src, void *dst[], int x0, int x1) {              \
-        for (int x = x0; x < x1; x++) {                                     \
-            packed_t c = ((packed_t *)src)[x];                              \
-            ((plane_t *)dst[0])[x] = (c >> (sh_c0)) & (mask);               \
-            ((plane_t *)dst[1])[x] = (c >> (sh_c1)) & (mask);               \
-            ((plane_t *)dst[2])[x] = (c >> (sh_c2)) & (mask);               \
-        }                                                                   \
-    }
-
-UN_WORD_3(un_ccc8x8,  uint32_t, uint8_t,  0, 8,  16, 0xFFu)
-PA_WORD_3(pa_ccc8z8,  uint32_t, uint8_t,  0, 8,  16, 0)
-UN_WORD_3(un_x8ccc8,  uint32_t, uint8_t,  8, 16, 24, 0xFFu)
-PA_WORD_3(pa_z8ccc8,  uint32_t, uint8_t,  8, 16, 24, 0)
-UN_WORD_3(un_ccc10x2, uint32_t, uint16_t, 0, 10, 20, 0x3FFu)
-PA_WORD_3(pa_ccc10z2, uint32_t, uint16_t, 20, 10, 0, 0)
-
-#define PA_WORD_2(name, packed_t, plane_t, sh_c0, sh_c1, pad)               \
-    static void name(void *dst, void *src[], int x0, int x1) {              \
-        for (int x = x0; x < x1; x++) {                                     \
-            ((packed_t *)dst)[x] = (pad) |                                  \
-                ((packed_t)((plane_t *)src[0])[x] << (sh_c0)) |             \
-                ((packed_t)((plane_t *)src[1])[x] << (sh_c1));              \
-        }                                                                   \
-    }
-
-#define UN_WORD_2(name, packed_t, plane_t, sh_c0, sh_c1, mask)              \
-    static void name(void *src, void *dst[], int x0, int x1) {              \
-        for (int x = x0; x < x1; x++) {                                     \
-            packed_t c = ((packed_t *)src)[x];                              \
-            ((plane_t *)dst[0])[x] = (c >> (sh_c0)) & (mask);               \
-            ((plane_t *)dst[1])[x] = (c >> (sh_c1)) & (mask);               \
-        }                                                                   \
-    }
-
-UN_WORD_2(un_cc8,  uint16_t, uint8_t,  0, 8,  0xFFu)
-PA_WORD_2(pa_cc8,  uint16_t, uint8_t,  0, 8,  0)
-UN_WORD_2(un_cc16, uint32_t, uint16_t, 0, 16, 0xFFFFu)
-PA_WORD_2(pa_cc16, uint32_t, uint16_t, 0, 16, 0)
-
-#define PA_SEQ_3(name, comp_t)                                              \
-    static void name(void *dst, void *src[], int x0, int x1) {              \
-        comp_t *r = dst;                                                    \
-        for (int x = x0; x < x1; x++) {                                     \
-            *r++ = ((comp_t *)src[0])[x];                                   \
-            *r++ = ((comp_t *)src[1])[x];                                   \
-            *r++ = ((comp_t *)src[2])[x];                                   \
-        }                                                                   \
-    }
-
-#define UN_SEQ_3(name, comp_t)                                              \
-    static void name(void *src, void *dst[], int x0, int x1) {              \
-        comp_t *r = src;                                                    \
-        for (int x = x0; x < x1; x++) {                                     \
-            ((comp_t *)dst[0])[x] = *r++;                                   \
-            ((comp_t *)dst[1])[x] = *r++;                                   \
-            ((comp_t *)dst[2])[x] = *r++;                                   \
-        }                                                                   \
-    }
-
-UN_SEQ_3(un_ccc8,  uint8_t)
-PA_SEQ_3(pa_ccc8,  uint8_t)
-UN_SEQ_3(un_ccc16, uint16_t)
-PA_SEQ_3(pa_ccc16, uint16_t)
-
-// "regular": single packed plane, all components have same width (except padding)
-struct regular_repacker {
-    int packed_width;       // number of bits of the packed pixel
-    int component_width;    // number of bits for a single component
-    int prepadding;         // number of bits of LSB padding
-    int num_components;     // number of components that can be accessed
-    void (*pa_scanline)(void *p1, void *p2[], int x0, int x1);
-    void (*un_scanline)(void *p1, void *p2[], int x0, int x1);
-};
-
-static const struct regular_repacker regular_repackers[] = {
-    {32, 8,  0, 3, pa_ccc8z8,  un_ccc8x8},
-    {32, 8,  8, 3, pa_z8ccc8,  un_x8ccc8},
-    {32, 8,  0, 4, pa_cccc8,   un_cccc8},
-    {64, 16, 0, 4, pa_cccc16,  un_cccc16},
-    {24, 8,  0, 3, pa_ccc8,    un_ccc8},
-    {48, 16, 0, 3, pa_ccc16,   un_ccc16},
-    {16, 8,  0, 2, pa_cc8,     un_cc8},
-    {32, 16, 0, 2, pa_cc16,    un_cc16},
-    {32, 10, 0, 3, pa_ccc10z2, un_ccc10x2},
-};
-
-static int packed_repack(void *user, unsigned i, unsigned x0, unsigned x1)
-{
-    struct mp_zimg_repack *r = user;
-
-    uint32_t *p1 = (void *)(r->mpi->planes[0] +
-                            r->mpi->stride[0] * (ptrdiff_t)(i - r->mpi_y0));
-
-    void *p2[4] = {0};
-    for (int p = 0; p < r->num_planes; p++) {
-        int s = r->components[p];
-        p2[p] = r->tmp->planes[s] +
-                r->tmp->stride[s] * (ptrdiff_t)(i & r->zmask[s]);
-    }
-
-    r->packed_repack_scanline(p1, p2, x0, x1);
-
-    return 0;
-}
-
-struct fringe_rgb_repacker {
-    // To avoid making a mess of IMGFMT_*, we use av formats directly.
-    enum AVPixelFormat avfmt;
-    // If true, use BGR instead of RGB.
-    //  False:  LSB - R - G - B - pad - MSB
-    //  True:   LSB - B - G - R - pad - MSB
-    bool rev_order;
-    // Size in bit for each component, strictly from LSB to MSB.
-    int bits[3];
-    bool be;
-};
-
-static const struct fringe_rgb_repacker fringe_rgb_repackers[] = {
-    {AV_PIX_FMT_BGR4_BYTE,  false,  {1, 2, 1}},
-    {AV_PIX_FMT_RGB4_BYTE,  true,   {1, 2, 1}},
-    {AV_PIX_FMT_BGR8,       false,  {3, 3, 2}},
-    {AV_PIX_FMT_RGB8,       true,   {2, 3, 3}}, // pixdesc desc. and doc. bug?
-    {AV_PIX_FMT_RGB444LE,   true,   {4, 4, 4}},
-    {AV_PIX_FMT_RGB444BE,   true,   {4, 4, 4}, .be = true},
-    {AV_PIX_FMT_BGR444LE,   false,  {4, 4, 4}},
-    {AV_PIX_FMT_BGR444BE,   false,  {4, 4, 4}, .be = true},
-    {AV_PIX_FMT_BGR565LE,   false,  {5, 6, 5}},
-    {AV_PIX_FMT_BGR565BE,   false,  {5, 6, 5}, .be = true},
-    {AV_PIX_FMT_RGB565LE,   true,   {5, 6, 5}},
-    {AV_PIX_FMT_RGB565BE,   true,   {5, 6, 5}, .be = true},
-    {AV_PIX_FMT_BGR555LE,   false,  {5, 5, 5}},
-    {AV_PIX_FMT_BGR555BE,   false,  {5, 5, 5}, .be = true},
-    {AV_PIX_FMT_RGB555LE,   true,   {5, 5, 5}},
-    {AV_PIX_FMT_RGB555BE,   true,   {5, 5, 5}, .be = true},
-};
-
-#define PA_SHIFT_LUT8(name, packed_t)                                       \
-    static void name(void *dst, void *src[], int x0, int x1, uint8_t *lut,  \
-                     uint8_t s0, uint8_t s1, uint8_t s2) {                  \
-        for (int x = x0; x < x1; x++) {                                     \
-            ((packed_t *)dst)[x] =                                          \
-                (lut[((uint8_t *)src[0])[x] + 256 * 0] << s0) |             \
-                (lut[((uint8_t *)src[1])[x] + 256 * 1] << s1) |             \
-                (lut[((uint8_t *)src[2])[x] + 256 * 2] << s2);              \
-        }                                                                   \
-    }
-
-
-#define UN_SHIFT_LUT8(name, packed_t)                                       \
-    static void name(void *src, void *dst[], int x0, int x1, uint8_t *lut,  \
-                     uint8_t s0, uint8_t s1, uint8_t s2) {                  \
-        for (int x = x0; x < x1; x++) {                                     \
-            packed_t c = ((packed_t *)src)[x];                              \
-            ((uint8_t *)dst[0])[x] = lut[((c >> s0) & 0xFF) + 256 * 0];     \
-            ((uint8_t *)dst[1])[x] = lut[((c >> s1) & 0xFF) + 256 * 1];     \
-            ((uint8_t *)dst[2])[x] = lut[((c >> s2) & 0xFF) + 256 * 2];     \
-        }                                                                   \
-    }
-
-PA_SHIFT_LUT8(pa_shift_lut8_8,  uint8_t)
-PA_SHIFT_LUT8(pa_shift_lut8_16, uint16_t)
-UN_SHIFT_LUT8(un_shift_lut8_8,  uint8_t)
-UN_SHIFT_LUT8(un_shift_lut8_16, uint16_t)
-
-static int fringe_rgb_repack(void *user, unsigned i, unsigned x0, unsigned x1)
-{
-    struct mp_zimg_repack *r = user;
-
-    void *p1 = r->mpi->planes[0] + r->mpi->stride[0] * (ptrdiff_t)(i - r->mpi_y0);
-
-    void *p2[4] = {0};
-    for (int p = 0; p < r->num_planes; p++) {
-        int s = r->components[p];
-        p2[p] = r->tmp->planes[s] +
-                r->tmp->stride[s] * (ptrdiff_t)(i & r->zmask[s]);
-    }
-
-    assert(r->comp_size == 1 || r->comp_size == 2);
-
-    void (*repack)(void *p1, void *p2[], int x0, int x1, uint8_t *lut,
-                   uint8_t s0, uint8_t s1, uint8_t s2) = NULL;
-    if (r->pack) {
-        repack = r->comp_size == 1 ? pa_shift_lut8_8 : pa_shift_lut8_16;
-    } else {
-        repack = r->comp_size == 1 ? un_shift_lut8_8 : un_shift_lut8_16;
-    }
-    repack(p1, p2, x0, x1, r->comp_lut,
-           r->comp_shifts[0], r->comp_shifts[1], r->comp_shifts[2]);
-
-    return 0;
-}
-
-static int bitmap_repack(void *user, unsigned i, unsigned x0, unsigned x1)
-{
-    struct mp_zimg_repack *r = user;
-
-    uint8_t *p1 =
-        r->mpi->planes[0] + r->mpi->stride[0] * (ptrdiff_t)(i - r->mpi_y0);
-    uint8_t *p2 =
-        r->tmp->planes[0] + r->tmp->stride[0] * (ptrdiff_t)(i & r->zmask[0]);
-
-    uint8_t swap = r->comp_size ? 0xFF : 0;
-    if (r->pack) {
-        // Supposedly zimg aligns this at least on 64 byte boundaries. Simplifies a
-        // lot for us.
-        assert(!(x0 & 7));
-
-        for (int x = x0; x < x1; x += 8) {
-            uint8_t d = 0;
-            int max_b = MPMIN(8, x1 - x);
-            for (int b = 0; b < max_b; b++)
-                d |= (!!p2[x + b]) << (7 - b);
-            p1[x / 8] = d ^ swap;
-        }
-    } else {
-        x0 &= ~0x7;
-
-        for (int x = x0; x < x1; x += 8) {
-            uint8_t d = p1[x / 8] ^ swap;
-            int max_b = MPMIN(8, x1 - x);
-            for (int b = 0; b < max_b; b++)
-                p2[x + b] = !!(d & (1 << (7 - b)));
-        }
-    }
-
-    return 0;
-}
-
-static int unpack_pal(void *user, unsigned i, unsigned x0, unsigned x1)
-{
-    struct mp_zimg_repack *r = user;
-
-    uint8_t *src = (void *)(r->mpi->planes[0] +
-                            r->mpi->stride[0] * (ptrdiff_t)(i - r->mpi_y0));
-    uint32_t *pal = (void *)r->mpi->planes[1];
-
-    uint8_t *dst[4] = {0};
-    for (int p = 0; p < r->num_planes; p++) {
-        dst[p] = r->tmp->planes[p] +
-                 r->tmp->stride[p] * (ptrdiff_t)(i & r->zmask[p]);
-    }
-
-    for (int x = x0; x < x1; x++) {
-        uint32_t c = pal[src[x]];
-        dst[0][x] = (c >>  8) & 0xFF; // G
-        dst[1][x] = (c >>  0) & 0xFF; // B
-        dst[2][x] = (c >> 16) & 0xFF; // R
-        dst[3][x] = (c >> 24) & 0xFF; // A
-    }
-
-    return 0;
-}
-
-struct fringe_yuv422_repacker {
-    // To avoid making a mess of IMGFMT_*, we use av formats directly.
-    enum AVPixelFormat avfmt;
-    // In bits (depth/8 rounded up gives byte size)
-    int8_t depth;
-    // Word index of each sample: {y0, y1, cb, cr}
-    uint8_t comp[4];
-    bool be;
-};
-
-static const struct fringe_yuv422_repacker fringe_yuv422_repackers[] = {
-    {AV_PIX_FMT_YUYV422,  8, {0, 2, 1, 3}},
-    {AV_PIX_FMT_UYVY422,  8, {1, 3, 0, 2}},
-    {AV_PIX_FMT_YVYU422,  8, {0, 2, 3, 1}},
-#ifdef AV_PIX_FMT_Y210
-    {AV_PIX_FMT_Y210LE,  10, {0, 2, 1, 3}},
-    {AV_PIX_FMT_Y210BE,  10, {0, 2, 1, 3}, .be = true},
-#endif
-};
-
-#define PA_P422(name, comp_t)                                               \
-    static void name(void *dst, void *src[], int x0, int x1, uint8_t *c) {  \
-        for (int x = x0; x < x1; x += 2) {                                  \
-            ((comp_t *)dst)[x * 2 + c[0]] = ((comp_t *)src[0])[x + 0];      \
-            ((comp_t *)dst)[x * 2 + c[1]] = ((comp_t *)src[0])[x + 1];      \
-            ((comp_t *)dst)[x * 2 + c[2]] = ((comp_t *)src[1])[x >> 1];     \
-            ((comp_t *)dst)[x * 2 + c[3]] = ((comp_t *)src[2])[x >> 1];     \
-        }                                                                   \
-    }
-
-
-#define UN_P422(name, comp_t)                                               \
-    static void name(void *src, void *dst[], int x0, int x1, uint8_t *c) {  \
-        for (int x = x0; x < x1; x += 2) {                                  \
-            ((comp_t *)dst[0])[x + 0]  = ((comp_t *)src)[x * 2 + c[0]];     \
-            ((comp_t *)dst[0])[x + 1]  = ((comp_t *)src)[x * 2 + c[1]];     \
-            ((comp_t *)dst[1])[x >> 1] = ((comp_t *)src)[x * 2 + c[2]];     \
-            ((comp_t *)dst[2])[x >> 1] = ((comp_t *)src)[x * 2 + c[3]];     \
-        }                                                                   \
-    }
-
-PA_P422(pa_p422_8,  uint8_t)
-PA_P422(pa_p422_16, uint16_t)
-UN_P422(un_p422_8,  uint8_t)
-UN_P422(un_p422_16, uint16_t)
-
-static int fringe_yuv422_repack(void *user, unsigned i, unsigned x0, unsigned x1)
-{
-    struct mp_zimg_repack *r = user;
-
-    void *p1 = r->mpi->planes[0] + r->mpi->stride[0] * (ptrdiff_t)(i - r->mpi_y0);
-
-    void *p2[4] = {0};
-    for (int p = 0; p < r->num_planes; p++) {
-        p2[p] = r->tmp->planes[p] +
-                r->tmp->stride[p] * (ptrdiff_t)(i & r->zmask[p]);
-    }
-
-    assert(r->comp_size == 1 || r->comp_size == 2);
-
-    void (*repack)(void *p1, void *p2[], int x0, int x1, uint8_t *c) = NULL;
-    if (r->pack) {
-        repack = r->comp_size == 1 ? pa_p422_8 : pa_p422_16;
-    } else {
-        repack = r->comp_size == 1 ? un_p422_8 : un_p422_16;
-    }
-    repack(p1, p2, x0, x1, r->comp_map);
-
-    return 0;
-}
-
-static int repack_nv(void *user, unsigned i, unsigned x0, unsigned x1)
-{
-    struct mp_zimg_repack *r = user;
-
-    int xs = r->mpi->fmt.chroma_xs;
-    int ys = r->mpi->fmt.chroma_ys;
-
-    if (r->use_buf[0]) {
-        // Copy Y.
-        int l_h = 1 << ys;
-        for (int y = i; y < i + l_h; y++) {
-            ptrdiff_t bpp = r->mpi->fmt.bytes[0];
-            void *a = r->mpi->planes[0] +
-                    r->mpi->stride[0] * (ptrdiff_t)(y - r->mpi_y0) + bpp * x0;
-            void *b = r->tmp->planes[0] +
-                    r->tmp->stride[0] * (ptrdiff_t)(y & r->zmask[0]) + bpp * x0;
-            size_t size = (x1 - x0) * bpp;
-            if (r->pack) {
-                memcpy(a, b, size);
-            } else {
-                memcpy(b, a, size);
-            }
-        }
-    }
-
-    uint32_t *p1 = (void *)(r->mpi->planes[1] +
-                            r->mpi->stride[1] * (ptrdiff_t)((i - r->mpi_y0) >> ys));
-
-    void *p2[2];
-    for (int p = 0; p < 2; p++) {
-        int s = r->components[p];
-        p2[p] = r->tmp->planes[s] +
-                r->tmp->stride[s] * (ptrdiff_t)((i >> ys) & r->zmask[s]);
-    }
-
-    r->packed_repack_scanline(p1, p2, x0 >> xs, x1 >> xs);
-
-    return 0;
-}
-
 static int repack_entrypoint(void *user, unsigned i, unsigned x0, unsigned x1)
 {
     struct mp_zimg_repack *r = user;
 
-    if (r->endian_size && !r->pack) {
-        r->mpi = r->tmp_endian;
-        r->mpi_y0 = i;
-        swap_endian(r, r->mpi, 0, r->user_mpi, i, x0, x1);
-    } else {
-        r->mpi = r->user_mpi;
-        r->mpi_y0 = 0;
-    }
+    // When unpacking, round an unaligned x0 down; this just reads more data.
+    if (!r->pack)
+        x0 &= ~(unsigned)(mp_repack_get_align_x(r->repack) - 1);
 
-    if (r->repack) {
-        r->repack(r, i, x0, x1);
-    } else {
-        repack_align(r, i, x0, x1);
-    }
+    // Alignment that mp_repack requires and that zimg guarantees.
+    assert(!(i & (mp_repack_get_align_y(r->repack) - 1)));
+    assert(!(x0 & (mp_repack_get_align_x(r->repack) - 1)));
+
+    unsigned i_src = i & (r->pack ? r->zmask[0] : ZIMG_BUFFER_MAX);
+    unsigned i_dst = i & (r->pack ? ZIMG_BUFFER_MAX : r->zmask[0]);
 
-    if (r->endian_size && r->pack)
-        swap_endian(r, r->user_mpi, i, r->mpi, i - r->mpi_y0, x0, x1);
+    repack_line(r->repack, x0, i_dst, x0, i_src, x1 - x0);
 
-    r->mpi = NULL;
     return 0;
 }
 
-static void wrap_buffer(struct mp_zimg_repack *r,
+static bool wrap_buffer(struct mp_zimg_repack *r,
                         zimg_image_buffer *buf,
                         struct mp_image *mpi)
 {
     *buf = (zimg_image_buffer){ZIMG_API_VERSION};
 
-    bool plane_aligned[4] = {0};
-    for (int n = 0; n < r->num_planes; n++) {
-        plane_aligned[n] = !((uintptr_t)mpi->planes[n] % ZIMG_ALIGN) &&
-                           !(mpi->stride[n] % ZIMG_ALIGN);
+    bool direct[MP_MAX_PLANES] = {0};
+
+    for (int p = 0; p < mpi->num_planes; p++) {
+        // If alignment is good, try to avoid copy.
+        direct[p] = !((uintptr_t)mpi->planes[p] % ZIMG_ALIGN) &&
+                    !(mpi->stride[p] % ZIMG_ALIGN);
     }
 
+    if (!repack_config_buffers(r->repack, 0, r->pack ? mpi : r->tmp,
+                                          0, r->pack ? r->tmp : mpi, direct))
+        return false;
+
     for (int n = 0; n < MP_ARRAY_SIZE(buf->plane); n++) {
         // Note: this is really the only place we have to care about plane
         // permutation (zimg_image_buffer may have a different plane order
@@ -778,355 +255,67 @@ static void wrap_buffer(struct mp_zimg_repack *r,
         if (mplane < 0)
             continue;
 
-        r->use_buf[mplane] = !plane_aligned[mplane] || r->endian_size;
-        if (!(r->pass_through_y && mplane == 0))
-            r->use_buf[mplane] |= !!r->repack;
-
-        struct mp_image *tmpi = r->use_buf[mplane] ? r->tmp : mpi;
+        struct mp_image *tmpi = direct[mplane] ? mpi : r->tmp;
         buf->plane[n].data = tmpi->planes[mplane];
         buf->plane[n].stride = tmpi->stride[mplane];
-        buf->plane[n].mask = r->use_buf[mplane] ? r->zmask[mplane]
-                                                : ZIMG_BUFFER_MAX;
+        buf->plane[n].mask = direct[mplane] ? ZIMG_BUFFER_MAX : r->zmask[mplane];
     }
 
-    r->user_mpi = mpi;
-}
-
-// depth = number of LSB in use
-static int find_gbrp_format(int depth, int num_planes)
-{
-    if (num_planes != 3 && num_planes != 4)
-        return 0;
-    struct mp_regular_imgfmt desc = {
-        .component_type = MP_COMPONENT_TYPE_UINT,
-        .forced_csp = MP_CSP_RGB,
-        .component_size = depth > 8 ? 2 : 1,
-        .component_pad = depth - (depth > 8 ? 16 : 8),
-        .num_planes = num_planes,
-        .planes = { {1, {2}}, {1, {3}}, {1, {1}}, {1, {4}} },
-    };
-    return mp_find_regular_imgfmt(&desc);
-}
-
-// depth = number of LSB in use
-static int find_gray_format(int depth, int num_planes)
-{
-    if (num_planes != 1 && num_planes != 2)
-        return 0;
-    struct mp_regular_imgfmt desc = {
-        .component_type = MP_COMPONENT_TYPE_UINT,
-        .component_size = depth > 8 ? 2 : 1,
-        .component_pad = depth - (depth > 8 ? 16 : 8),
-        .num_planes = num_planes,
-        .planes = { {1, {1}}, {1, {4}} },
-    };
-    return mp_find_regular_imgfmt(&desc);
+    return true;
 }
 
-static void setup_fringe_rgb_packer(struct mp_zimg_repack *r,
-                                    struct mp_zimg_context *ctx)
+// (ctx can be NULL for probing.)
+static bool setup_format(zimg_image_format *zfmt, struct mp_zimg_repack *r,
+                         bool pack, struct mp_image_params *user_fmt,
+                         struct mp_zimg_context *ctx)
 {
-    enum AVPixelFormat avfmt = imgfmt2pixfmt(r->zimgfmt);
+    r->fmt = *user_fmt;
+    r->pack = pack;
 
-    const struct fringe_rgb_repacker *fmt = NULL;
-    for (int n = 0; n < MP_ARRAY_SIZE(fringe_rgb_repackers); n++) {
-        if (fringe_rgb_repackers[n].avfmt == avfmt) {
-            fmt = &fringe_rgb_repackers[n];
-            break;
-        }
-    }
+    zimg_image_format_default(zfmt, ZIMG_API_VERSION);
 
-    if (!fmt)
-        return;
+    int rp_flags = 0;
 
-    int depth = 8;
+    // For e.g. RGB565, go to lowest depth on pack for less weird dithering.
     if (r->pack) {
-        // Dither to lowest depth - loses some precision, but result is saner.
-        depth = fmt->bits[0];
-        for (int n = 0; n < 3; n++)
-            depth = MPMIN(depth, fmt->bits[n]);
-    }
-
-    r->zimgfmt = find_gbrp_format(depth, 3);
-    if (!r->zimgfmt)
-        return;
-    if (ctx)
-        r->comp_lut = talloc_array(ctx, uint8_t, 256 * 3);
-    r->repack = fringe_rgb_repack;
-    static const int c_order_rgb[] = {3, 1, 2};
-    static const int c_order_bgr[] = {2, 1, 3};
-    for (int n = 0; n < 3; n++)
-        r->components[n] = (fmt->rev_order ? c_order_bgr : c_order_rgb)[n] - 1;
-
-    int bitpos = 0;
-    for (int n = 0; n < 3; n++) {
-        int bits = fmt->bits[n];
-        r->comp_shifts[n] = bitpos;
-        if (r->comp_lut) {
-            uint8_t *lut = r->comp_lut + 256 * n;
-            uint8_t zmax = (1 << depth) - 1;
-            uint8_t cmax = (1 << bits) - 1;
-            for (int v = 0; v < 256; v++) {
-                if (r->pack) {
-                    lut[v] = (v * cmax + zmax / 2) / zmax;
-                } else {
-                    lut[v] = (v & cmax) * zmax / cmax;
-                }
-            }
-        }
-        bitpos += bits;
-    }
-
-    r->comp_size = (bitpos + 7) / 8;
-    assert(r->comp_size == 1 || r->comp_size == 2);
-
-    if (fmt->be) {
-        assert(r->comp_size == 2);
-        r->endian_size = 2;
-        r->endian_items[0] = 1;
-    }
-}
-
-static void setup_fringe_yuv422_packer(struct mp_zimg_repack *r)
-{
-    enum AVPixelFormat avfmt = imgfmt2pixfmt(r->zimgfmt);
-
-    const struct fringe_yuv422_repacker *fmt = NULL;
-    for (int n = 0; n < MP_ARRAY_SIZE(fringe_yuv422_repackers); n++) {
-        if (fringe_yuv422_repackers[n].avfmt == avfmt) {
-            fmt = &fringe_yuv422_repackers[n];
-            break;
-        }
-    }
-
-    if (!fmt)
-        return;
-
-    r->comp_size = (fmt->depth + 7) / 8;
-    assert(r->comp_size == 1 || r->comp_size == 2);
-
-    struct mp_regular_imgfmt yuvfmt = {
-        .component_type = MP_COMPONENT_TYPE_UINT,
-        // NB: same problem with P010 and not clearing padding.
-        .component_size = r->comp_size,
-        .num_planes = 3,
-        .planes = { {1, {1}}, {1, {2}}, {1, {3}} },
-        .chroma_xs = 1,
-        .chroma_ys = 0,
-    };
-    r->zimgfmt = mp_find_regular_imgfmt(&yuvfmt);
-    r->repack = fringe_yuv422_repack;
-    r->comp_map = (uint8_t *)fmt->comp;
-
-    if (fmt->be) {
-        assert(r->comp_size == 2);
-        r->endian_size = 2;
-        r->endian_items[0] = 4;
-    }
-}
-
-static void setup_nv_packer(struct mp_zimg_repack *r)
-{
-    struct mp_regular_imgfmt desc;
-    if (!mp_get_regular_imgfmt(&desc, r->zimgfmt))
-        return;
-
-    // Check for NV.
-    if (desc.num_planes != 2)
-        return;
-    if (desc.planes[0].num_components != 1 || desc.planes[0].components[0] != 1)
-        return;
-    if (desc.planes[1].num_components != 2)
-        return;
-    int cr0 = desc.planes[1].components[0];
-    int cr1 = desc.planes[1].components[1];
-    if (cr0 > cr1)
-        MPSWAP(int, cr0, cr1);
-    if (cr0 != 2 || cr1 != 3)
-        return;
-
-    // Construct equivalent planar format.
-    struct mp_regular_imgfmt desc2 = desc;
-    desc2.num_planes = 3;
-    desc2.planes[1].num_components = 1;
-    desc2.planes[1].components[0] = 2;
-    desc2.planes[2].num_components = 1;
-    desc2.planes[2].components[0] = 3;
-    // For P010. Strangely this concept exists only for the NV format.
-    if (desc2.component_pad > 0)
-        desc2.component_pad = 0;
-
-    int planar_fmt = mp_find_regular_imgfmt(&desc2);
-    if (!planar_fmt)
-        return;
-
-    for (int i = 0; i < MP_ARRAY_SIZE(regular_repackers); i++) {
-        const struct regular_repacker *pa = &regular_repackers[i];
-
-        void (*repack_cb)(void *p1, void *p2[], int x0, int x1) =
-            r->pack ? pa->pa_scanline : pa->un_scanline;
-
-        if (pa->packed_width != desc.component_size * 2 * 8 ||
-            pa->component_width != desc.component_size * 8 ||
-            pa->num_components != 2 ||
-            pa->prepadding != 0 ||
-            !repack_cb)
-            continue;
-
-        r->repack = repack_nv;
-        r->pass_through_y = true;
-        r->packed_repack_scanline = repack_cb;
-        r->zimgfmt = planar_fmt;
-        r->components[0] = desc.planes[1].components[0] - 1;
-        r->components[1] = desc.planes[1].components[1] - 1;
-        return;
-    }
-}
-
-static void setup_misc_packer(struct mp_zimg_repack *r)
-{
-    // Although it's in regular_repackers[], the generic mpv imgfmt metadata
-    // can't handle it yet.
-    if (r->zimgfmt == IMGFMT_RGB30) {
-        int planar_fmt = find_gbrp_format(10, 3);
-        if (!planar_fmt)
-            return;
-        r->zimgfmt = planar_fmt;
-        r->repack = packed_repack;
-        r->packed_repack_scanline = r->pack ? pa_ccc10z2 : un_ccc10x2;
-        static int c_order[] = {3, 2, 1};
-        for (int n = 0; n < 3; n++)
-            r->components[n] = c_order[n] - 1;
-    } else if (r->zimgfmt == IMGFMT_PAL8 && !r->pack) {
-        int grap_fmt = find_gbrp_format(8, 4);
-        if (!grap_fmt)
-            return;
-        r->zimgfmt = grap_fmt;
-        r->repack = unpack_pal;
+        rp_flags |= REPACK_CREATE_ROUND_DOWN;
     } else {
-        enum AVPixelFormat avfmt = imgfmt2pixfmt(r->zimgfmt);
-        if (avfmt == AV_PIX_FMT_MONOWHITE || avfmt == AV_PIX_FMT_MONOBLACK) {
-            r->zimgfmt = IMGFMT_Y1;
-            r->repack = bitmap_repack;
-            r->comp_size = avfmt == AV_PIX_FMT_MONOWHITE; // abuse to pass a flag
-            return;
-        }
+        rp_flags |= REPACK_CREATE_EXPAND_8BIT;
     }
-}
-
-// Tries to set a packer/unpacker for component-wise byte aligned RGB formats.
-static void setup_regular_rgb_packer(struct mp_zimg_repack *r)
-{
-    struct mp_regular_imgfmt desc;
-    if (!mp_get_regular_imgfmt(&desc, r->zimgfmt))
-        return;
 
-    if (desc.num_planes != 1 || desc.planes[0].num_components < 2)
-        return;
-    struct mp_regular_imgfmt_plane *p = &desc.planes[0];
-
-    int num_real_components = 0;
-    bool has_alpha = false;
-    for (int n = 0; n < p->num_components; n++) {
-        if (p->components[n]) {
-            has_alpha |= p->components[n] == 4;
-            num_real_components += 1;
-        } else {
-            // padding must be in MSB or LSB
-            if (n != 0 && n != p->num_components - 1)
-                return;
-        }
-    }
+    r->repack = mp_repack_create_planar(r->fmt.imgfmt, r->pack, rp_flags);
+    if (!r->repack)
+        return false;
 
-    int depth = desc.component_size * 8 + MPMIN(0, desc.component_pad);
+    int align_x = mp_repack_get_align_x(r->repack);
 
-    int planar_fmt = num_real_components > 2
-        ? find_gbrp_format(depth, num_real_components)
-        : find_gray_format(depth, num_real_components);
-    if (!planar_fmt)
-        return;
-    static const int reorder_gbrp[] = {0, 3, 1, 2, 4};
-    static const int reorder_gray[] = {0, 1, 0, 0, 4};
-    const int *reorder = num_real_components > 2 ? reorder_gbrp : reorder_gray;
-
-    for (int i = 0; i < MP_ARRAY_SIZE(regular_repackers); i++) {
-        const struct regular_repacker *pa = &regular_repackers[i];
-
-        // The following may assume little endian (because some repack backends
-        // use word access, while the metadata here uses byte access).
-
-        int prepad = p->components[0] ? 0 : 8;
-        int first_comp = p->components[0] ? 0 : 1;
-        void (*repack_cb)(void *p1, void *p2[], int x0, int x1) =
-            r->pack ? pa->pa_scanline : pa->un_scanline;
-
-        if (pa->packed_width != desc.component_size * p->num_components * 8 ||
-            pa->component_width != depth ||
-            pa->num_components != num_real_components ||
-