From d8002f1dde94771952b301f4ebe331c52bc71871 Mon Sep 17 00:00:00 2001 From: wm4 Date: Sat, 9 May 2020 17:56:44 +0200 Subject: video: separate repacking code from zimg and make it independent For whatever purpose. If anything, this makes the zimg wrapper cleaner. The added tests are not particular exhaustive, but nice to have. This also makes the scale_zimg.c test pretty useless, because it only tests repacking (going through the zimg wrapper). In theory, the repack_tests things could also be used on scalers, but I guess it doesn't matter. Some things are added over the previous zimg wrapper code. For example, some fringe formats can now be expanded to 8 bit per component for convenience. --- video/img_format.h | 3 +- video/repack.c | 1110 ++++++++++++++++++++++++++++++++++++++++++++++++++++ video/repack.h | 72 ++++ video/zimg.c | 954 +++----------------------------------------- 4 files changed, 1244 insertions(+), 895 deletions(-) create mode 100644 video/repack.c create mode 100644 video/repack.h (limited to 'video') diff --git a/video/img_format.h b/video/img_format.h index 8e55cc9493..b0fdef8a50 100644 --- a/video/img_format.h +++ b/video/img_format.h @@ -69,8 +69,9 @@ struct mp_imgfmt_desc { int flags; // MP_IMGFLAG_* bitfield int8_t num_planes; int8_t chroma_xs, chroma_ys; // chroma shift (i.e. log2 of chroma pixel size) - int8_t align_x, align_y; // pixel size to get byte alignment and to get + int8_t align_x, align_y; // pixel count to get byte alignment and to get // to a pixel pos where luma & chroma aligns + // always power of 2 int8_t bytes[MP_MAX_PLANES]; // bytes per pixel (MP_IMGFLAG_BYTE_ALIGNED) int8_t bpp[MP_MAX_PLANES]; // bits per pixel int8_t plane_bits; // number of bits in use for plane 0 diff --git a/video/repack.c b/video/repack.c new file mode 100644 index 0000000000..359e32996d --- /dev/null +++ b/video/repack.c @@ -0,0 +1,1110 @@ +/* + * This file is part of mpv. 
+ * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <stdint.h> + +#include <libavutil/bswap.h> +#include <libavutil/pixfmt.h> + +#include "common/common.h" +#include "repack.h" +#include "video/fmt-conversion.h" +#include "video/img_format.h" +#include "video/mp_image.h" + +enum repack_step_type { + REPACK_STEP_REPACK, + REPACK_STEP_ENDIAN, +}; + +struct repack_step { + enum repack_step_type type; + // 0=input, 1=output + struct mp_image *buf[2]; + bool user_buf[2]; // user_buf[n]==true if buf[n] = user src/dst buffer + struct mp_imgfmt_desc fmt[2]; + struct mp_image *tmp; // output buffer, if needed +}; + +struct mp_repack { + bool pack; // if false, this is for unpacking + int flags; + int imgfmt_user; // original mp format (unchanged endian) + int imgfmt_a; // original mp format (possibly packed format, + // swapped endian) + int imgfmt_b; // equivalent unpacked/planar format + struct mp_imgfmt_desc fmt_a;// ==imgfmt_a + struct mp_imgfmt_desc fmt_b;// ==imgfmt_b + + void (*repack)(struct mp_repack *rp, + struct mp_image *a, int a_x, int a_y, + struct mp_image *b, int b_x, int b_y, int w); + + bool passthrough_y; // possible luma plane optimization for e.g. nv12 + int endian_size; // endian swap; 0=none, 2/4=swap word size + + // For packed_repack. 
+ int components[4]; // b[n] = mp_image.planes[components[n]] + // pack: a is dst, b is src + // unpack: a is src, b is dst + void (*packed_repack_scanline)(void *a, void *b[], int w); + + // Fringe RGB/YUV. + uint8_t comp_size; + uint8_t *comp_map; + uint8_t comp_shifts[3]; + uint8_t *comp_lut; + + // REPACK_STEP_REPACK: if true, need to copy this plane + bool copy_buf[4]; + + struct repack_step steps[4]; + int num_steps; + + bool configured; +}; + +// depth = number of LSB in use +static int find_gbrp_format(int depth, int num_planes) +{ + if (num_planes != 3 && num_planes != 4) + return 0; + struct mp_regular_imgfmt desc = { + .component_type = MP_COMPONENT_TYPE_UINT, + .forced_csp = MP_CSP_RGB, + .component_size = depth > 8 ? 2 : 1, + .component_pad = depth - (depth > 8 ? 16 : 8), + .num_planes = num_planes, + .planes = { {1, {2}}, {1, {3}}, {1, {1}}, {1, {4}} }, + }; + return mp_find_regular_imgfmt(&desc); +} + +// depth = number of LSB in use +static int find_yuv_format(int depth, int num_planes) +{ + if (num_planes < 1 || num_planes > 4) + return 0; + struct mp_regular_imgfmt desc = { + .component_type = MP_COMPONENT_TYPE_UINT, + .component_size = depth > 8 ? 2 : 1, + .component_pad = depth - (depth > 8 ? 16 : 8), + .num_planes = num_planes, + .planes = { {1, {1}}, {1, {2}}, {1, {3}}, {1, {4}} }, + }; + if (num_planes == 2) + desc.planes[1].components[0] = 4; + return mp_find_regular_imgfmt(&desc); +} + +// Copy one line on the plane p. +static void copy_plane(struct mp_image *dst, int dst_x, int dst_y, + struct mp_image *src, int src_x, int src_y, + int w, int p) +{ + // Number of lines on this plane. 
+ int h = (1 << dst->fmt.chroma_ys) - (1 << dst->fmt.ys[p]) + 1; + size_t size = mp_image_plane_bytes(dst, p, dst_x, w); + + assert(dst->fmt.bpp[p] == src->fmt.bpp[p]); + + for (int y = 0; y < h; y++) { + void *pd = mp_image_pixel_ptr(dst, p, dst_x, dst_y + y); + void *ps = mp_image_pixel_ptr(src, p, src_x, src_y + y); + memcpy(pd, ps, size); + } +} + +// Swap endian for one line. +static void swap_endian(struct mp_image *dst, int dst_x, int dst_y, + struct mp_image *src, int src_x, int src_y, + int w, int endian_size) +{ + assert(src->fmt.num_planes == dst->fmt.num_planes); + + for (int p = 0; p < dst->fmt.num_planes; p++) { + int xs = dst->fmt.xs[p]; + int bpp = dst->fmt.bytes[p]; + int words_per_pixel = bpp / endian_size; + int num_words = ((w + (1 << xs) - 1) >> xs) * words_per_pixel; + // Number of lines on this plane. + int h = (1 << dst->fmt.chroma_ys) - (1 << dst->fmt.ys[p]) + 1; + + assert(src->fmt.bytes[p] == bpp); + + for (int y = 0; y < h; y++) { + void *s = mp_image_pixel_ptr(src, p, src_x, src_y + y); + void *d = mp_image_pixel_ptr(dst, p, dst_x, dst_y + y); + switch (endian_size) { + case 2: + for (int x = 0; x < num_words; x++) + ((uint16_t *)d)[x] = av_bswap16(((uint16_t *)s)[x]); + break; + case 4: + for (int x = 0; x < num_words; x++) + ((uint32_t *)d)[x] = av_bswap32(((uint32_t *)s)[x]); + break; + default: + assert(0); + } + } + } +} + +// PA = PAck, copy planar input to single packed array +// UN = UNpack, copy packed input to planar output +// Naming convention: +// pa_/un_ prefix to identify conversion direction. +// Left (LSB, lowest byte address) -> Right (MSB, highest byte address). +// (This is unusual; MSB to LSB is more commonly used to describe formats, +// but our convention makes more sense for byte access in little endian.) +// "c" identifies a color component. +// "z" identifies known zero padding. +// "x" identifies uninitialized padding. +// A component is followed by its size in bits. 
+// Size can be omitted for multiple uniform components (c8c8c8 == ccc8). +// Unpackers will often use "x" for padding, because they ignore it, while +// packers will use "z" because they write zero. + +#define PA_WORD_4(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, sh_c3) \ + static void name(void *dst, void *src[], int w) { \ + for (int x = 0; x < w; x++) { \ + ((packed_t *)dst)[x] = \ + ((packed_t)((plane_t *)src[0])[x] << (sh_c0)) | \ + ((packed_t)((plane_t *)src[1])[x] << (sh_c1)) | \ + ((packed_t)((plane_t *)src[2])[x] << (sh_c2)) | \ + ((packed_t)((plane_t *)src[3])[x] << (sh_c3)); \ + } \ + } + +#define UN_WORD_4(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, sh_c3, mask)\ + static void name(void *src, void *dst[], int w) { \ + for (int x = 0; x < w; x++) { \ + packed_t c = ((packed_t *)src)[x]; \ + ((plane_t *)dst[0])[x] = (c >> (sh_c0)) & (mask); \ + ((plane_t *)dst[1])[x] = (c >> (sh_c1)) & (mask); \ + ((plane_t *)dst[2])[x] = (c >> (sh_c2)) & (mask); \ + ((plane_t *)dst[3])[x] = (c >> (sh_c3)) & (mask); \ + } \ + } + + +#define PA_WORD_3(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, pad) \ + static void name(void *dst, void *src[], int w) { \ + for (int x = 0; x < w; x++) { \ + ((packed_t *)dst)[x] = (pad) | \ + ((packed_t)((plane_t *)src[0])[x] << (sh_c0)) | \ + ((packed_t)((plane_t *)src[1])[x] << (sh_c1)) | \ + ((packed_t)((plane_t *)src[2])[x] << (sh_c2)); \ + } \ + } + +UN_WORD_4(un_cccc8, uint32_t, uint8_t, 0, 8, 16, 24, 0xFFu) +PA_WORD_4(pa_cccc8, uint32_t, uint8_t, 0, 8, 16, 24) +// Not sure if this is a good idea; there may be no alignment guarantee. 
+UN_WORD_4(un_cccc16, uint64_t, uint16_t, 0, 16, 32, 48, 0xFFFFu) +PA_WORD_4(pa_cccc16, uint64_t, uint16_t, 0, 16, 32, 48) + +#define UN_WORD_3(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, mask) \ + static void name(void *src, void *dst[], int w) { \ + for (int x = 0; x < w; x++) { \ + packed_t c = ((packed_t *)src)[x]; \ + ((plane_t *)dst[0])[x] = (c >> (sh_c0)) & (mask); \ + ((plane_t *)dst[1])[x] = (c >> (sh_c1)) & (mask); \ + ((plane_t *)dst[2])[x] = (c >> (sh_c2)) & (mask); \ + } \ + } + +UN_WORD_3(un_ccc8x8, uint32_t, uint8_t, 0, 8, 16, 0xFFu) +PA_WORD_3(pa_ccc8z8, uint32_t, uint8_t, 0, 8, 16, 0) +UN_WORD_3(un_x8ccc8, uint32_t, uint8_t, 8, 16, 24, 0xFFu) +PA_WORD_3(pa_z8ccc8, uint32_t, uint8_t, 8, 16, 24, 0) +UN_WORD_3(un_ccc10x2, uint32_t, uint16_t, 0, 10, 20, 0x3FFu) +PA_WORD_3(pa_ccc10z2, uint32_t, uint16_t, 20, 10, 0, 0) + +#define PA_WORD_2(name, packed_t, plane_t, sh_c0, sh_c1, pad) \ + static void name(void *dst, void *src[], int w) { \ + for (int x = 0; x < w; x++) { \ + ((packed_t *)dst)[x] = (pad) | \ + ((packed_t)((plane_t *)src[0])[x] << (sh_c0)) | \ + ((packed_t)((plane_t *)src[1])[x] << (sh_c1)); \ + } \ + } + +#define UN_WORD_2(name, packed_t, plane_t, sh_c0, sh_c1, mask) \ + static void name(void *src, void *dst[], int w) { \ + for (int x = 0; x < w; x++) { \ + packed_t c = ((packed_t *)src)[x]; \ + ((plane_t *)dst[0])[x] = (c >> (sh_c0)) & (mask); \ + ((plane_t *)dst[1])[x] = (c >> (sh_c1)) & (mask); \ + } \ + } + +UN_WORD_2(un_cc8, uint16_t, uint8_t, 0, 8, 0xFFu) +PA_WORD_2(pa_cc8, uint16_t, uint8_t, 0, 8, 0) +UN_WORD_2(un_cc16, uint32_t, uint16_t, 0, 16, 0xFFFFu) +PA_WORD_2(pa_cc16, uint32_t, uint16_t, 0, 16, 0) + +#define PA_SEQ_3(name, comp_t) \ + static void name(void *dst, void *src[], int w) { \ + comp_t *r = dst; \ + for (int x = 0; x < w; x++) { \ + *r++ = ((comp_t *)src[0])[x]; \ + *r++ = ((comp_t *)src[1])[x]; \ + *r++ = ((comp_t *)src[2])[x]; \ + } \ + } + +#define UN_SEQ_3(name, comp_t) \ + static void name(void *src, void 
*dst[], int w) { \ + comp_t *r = src; \ + for (int x = 0; x < w; x++) { \ + ((comp_t *)dst[0])[x] = *r++; \ + ((comp_t *)dst[1])[x] = *r++; \ + ((comp_t *)dst[2])[x] = *r++; \ + } \ + } + +UN_SEQ_3(un_ccc8, uint8_t) +PA_SEQ_3(pa_ccc8, uint8_t) +UN_SEQ_3(un_ccc16, uint16_t) +PA_SEQ_3(pa_ccc16, uint16_t) + +// "regular": single packed plane, all components have same width (except padding) +struct regular_repacker { + int packed_width; // number of bits of the packed pixel + int component_width; // number of bits for a single component + int prepadding; // number of bits of LSB padding + int num_components; // number of components that can be accessed + void (*pa_scanline)(void *a, void *b[], int w); + void (*un_scanline)(void *a, void *b[], int w); +}; + +static const struct regular_repacker regular_repackers[] = { + {32, 8, 0, 3, pa_ccc8z8, un_ccc8x8}, + {32, 8, 8, 3, pa_z8ccc8, un_x8ccc8}, + {32, 8, 0, 4, pa_cccc8, un_cccc8}, + {64, 16, 0, 4, pa_cccc16, un_cccc16}, + {24, 8, 0, 3, pa_ccc8, un_ccc8}, + {48, 16, 0, 3, pa_ccc16, un_ccc16}, + {16, 8, 0, 2, pa_cc8, un_cc8}, + {32, 16, 0, 2, pa_cc16, un_cc16}, + {32, 10, 0, 3, pa_ccc10z2, un_ccc10x2}, +}; + +static void packed_repack(struct mp_repack *rp, + struct mp_image *a, int a_x, int a_y, + struct mp_image *b, int b_x, int b_y, int w) +{ + uint32_t *pa = mp_image_pixel_ptr(a, 0, a_x, a_y); + + void *pb[4] = {0}; + for (int p = 0; p < b->num_planes; p++) { + int s = rp->components[p]; + pb[p] = mp_image_pixel_ptr(b, s, b_x, b_y); + } + + rp->packed_repack_scanline(pa, pb, w); +} + +// Tries to set a packer/unpacker for component-wise byte aligned formats. 
+static void setup_packed_packer(struct mp_repack *rp) +{ + struct mp_regular_imgfmt desc; + if (!mp_get_regular_imgfmt(&desc, rp->imgfmt_a)) + return; + + if (desc.num_planes != 1 || desc.planes[0].num_components < 2) + return; + struct mp_regular_imgfmt_plane *p = &desc.planes[0]; + + int num_real_components = 0; + bool has_alpha = false; + for (int n = 0; n < p->num_components; n++) { + if (p->components[n]) { + has_alpha |= p->components[n] == 4; + num_real_components += 1; + } else { + // padding must be in MSB or LSB + if (n != 0 && n != p->num_components - 1) + return; + } + } + + int depth = desc.component_size * 8 + MPMIN(0, desc.component_pad); + + static const int reorder_gbrp[] = {0, 3, 1, 2, 4}; + static const int reorder_yuv[] = {0, 1, 2, 3, 4}; + int planar_fmt = 0; + const int *reorder = NULL; + if (desc.forced_csp) { + if (desc.forced_csp != MP_CSP_RGB && desc.forced_csp != MP_CSP_XYZ) + return; + planar_fmt = find_gbrp_format(depth, num_real_components); + reorder = reorder_gbrp; + } else { + planar_fmt = find_yuv_format(depth, num_real_components); + reorder = reorder_yuv; + } + if (!planar_fmt) + return; + + for (int i = 0; i < MP_ARRAY_SIZE(regular_repackers); i++) { + const struct regular_repacker *pa = ®ular_repackers[i]; + + // The following may assume little endian (because some repack backends + // use word access, while the metadata here uses byte access). + + int prepad = p->components[0] ? 0 : 8; + int first_comp = p->components[0] ? 0 : 1; + void (*repack_cb)(void *pa, void *pb[], int w) = + rp->pack ? 
pa->pa_scanline : pa->un_scanline; + + if (pa->packed_width != desc.component_size * p->num_components * 8 || + pa->component_width != depth || + pa->num_components != num_real_components || + pa->prepadding != prepad || + !repack_cb) + continue; + + rp->repack = packed_repack; + rp->packed_repack_scanline = repack_cb; + rp->imgfmt_b = planar_fmt; + for (int n = 0; n < num_real_components; n++) { + // Determine permutation that maps component order between the two + // formats, with has_alpha special case (see above). + int c = reorder[p->components[first_comp + n]]; + rp->components[n] = c == 4 ? num_real_components - 1 : c - 1; + } + return; + } +} + +struct fringe_rgb_repacker { + // To avoid making a mess of IMGFMT_*, we use av formats directly. + enum AVPixelFormat avfmt; + // If true, use BGR instead of RGB. + // False: LSB - R - G - B - pad - MSB + // True: LSB - B - G - R - pad - MSB + bool rev_order; + // Size in bit for each component, strictly from LSB to MSB. + int bits[3]; + bool be; +}; + +static const struct fringe_rgb_repacker fringe_rgb_repackers[] = { + {AV_PIX_FMT_BGR4_BYTE, false, {1, 2, 1}}, + {AV_PIX_FMT_RGB4_BYTE, true, {1, 2, 1}}, + {AV_PIX_FMT_BGR8, false, {3, 3, 2}}, + {AV_PIX_FMT_RGB8, true, {2, 3, 3}}, // pixdesc desc. and doc. bug? 
+ {AV_PIX_FMT_RGB444LE, true, {4, 4, 4}}, + {AV_PIX_FMT_RGB444BE, true, {4, 4, 4}, .be = true}, + {AV_PIX_FMT_BGR444LE, false, {4, 4, 4}}, + {AV_PIX_FMT_BGR444BE, false, {4, 4, 4}, .be = true}, + {AV_PIX_FMT_BGR565LE, false, {5, 6, 5}}, + {AV_PIX_FMT_BGR565BE, false, {5, 6, 5}, .be = true}, + {AV_PIX_FMT_RGB565LE, true, {5, 6, 5}}, + {AV_PIX_FMT_RGB565BE, true, {5, 6, 5}, .be = true}, + {AV_PIX_FMT_BGR555LE, false, {5, 5, 5}}, + {AV_PIX_FMT_BGR555BE, false, {5, 5, 5}, .be = true}, + {AV_PIX_FMT_RGB555LE, true, {5, 5, 5}}, + {AV_PIX_FMT_RGB555BE, true, {5, 5, 5}, .be = true}, +}; + +#define PA_SHIFT_LUT8(name, packed_t) \ + static void name(void *dst, void *src[], int w, uint8_t *lut, \ + uint8_t s0, uint8_t s1, uint8_t s2) { \ + for (int x = 0; x < w; x++) { \ + ((packed_t *)dst)[x] = \ + (lut[((uint8_t *)src[0])[x] + 256 * 0] << s0) | \ + (lut[((uint8_t *)src[1])[x] + 256 * 1] << s1) | \ + (lut[((uint8_t *)src[2])[x] + 256 * 2] << s2); \ + } \ + } + + +#define UN_SHIFT_LUT8(name, packed_t) \ + static void name(void *src, void *dst[], int w, uint8_t *lut, \ + uint8_t s0, uint8_t s1, uint8_t s2) { \ + for (int x = 0; x < w; x++) { \ + packed_t c = ((packed_t *)src)[x]; \ + ((uint8_t *)dst[0])[x] = lut[((c >> s0) & 0xFF) + 256 * 0]; \ + ((uint8_t *)dst[1])[x] = lut[((c >> s1) & 0xFF) + 256 * 1]; \ + ((uint8_t *)dst[2])[x] = lut[((c >> s2) & 0xFF) + 256 * 2]; \ + } \ + } + +PA_SHIFT_LUT8(pa_shift_lut8_8, uint8_t) +PA_SHIFT_LUT8(pa_shift_lut8_16, uint16_t) +UN_SHIFT_LUT8(un_shift_lut8_8, uint8_t) +UN_SHIFT_LUT8(un_shift_lut8_16, uint16_t) + +static void fringe_rgb_repack(struct mp_repack *rp, + struct mp_image *a, int a_x, int a_y, + struct mp_image *b, int b_x, int b_y, int w) +{ + void *pa = mp_image_pixel_ptr(a, 0, a_x, a_y); + + void *pb[4] = {0}; + for (int p = 0; p < b->num_planes; p++) { + int s = rp->components[p]; + pb[p] = mp_image_pixel_ptr(b, s, b_x, b_y); + } + + assert(rp->comp_size == 1 || rp->comp_size == 2); + + void (*repack)(void *pa, void *pb[], int 
w, uint8_t *lut, + uint8_t s0, uint8_t s1, uint8_t s2) = NULL; + if (rp->pack) { + repack = rp->comp_size == 1 ? pa_shift_lut8_8 : pa_shift_lut8_16; + } else { + repack = rp->comp_size == 1 ? un_shift_lut8_8 : un_shift_lut8_16; + } + repack(pa, pb, w, rp->comp_lut, + rp->comp_shifts[0], rp->comp_shifts[1], rp->comp_shifts[2]); +} + +static void setup_fringe_rgb_packer(struct mp_repack *rp) +{ + enum AVPixelFormat avfmt = imgfmt2pixfmt(rp->imgfmt_a); + + const struct fringe_rgb_repacker *fmt = NULL; + for (int n = 0; n < MP_ARRAY_SIZE(fringe_rgb_repackers); n++) { + if (fringe_rgb_repackers[n].avfmt == avfmt) { + fmt = &fringe_rgb_repackers[n]; + break; + } + } + + if (!fmt) + return; + + int depth = fmt->bits[0]; + for (int n = 0; n < 3; n++) { + if (rp->flags & REPACK_CREATE_ROUND_DOWN) { + depth = MPMIN(depth, fmt->bits[n]); + } else { + depth = MPMAX(depth, fmt->bits[n]); + } + } + if (rp->flags & REPACK_CREATE_EXPAND_8BIT) + depth = 8; + + rp->imgfmt_b = find_gbrp_format(depth, 3); + if (!rp->imgfmt_b) + return; + rp->comp_lut = talloc_array(rp, uint8_t, 256 * 3); + rp->repack = fringe_rgb_repack; + static const int c_order_rgb[] = {3, 1, 2}; + static const int c_order_bgr[] = {2, 1, 3}; + for (int n = 0; n < 3; n++) + rp->components[n] = (fmt->rev_order ? 
c_order_bgr : c_order_rgb)[n] - 1; + + int bitpos = 0; + for (int n = 0; n < 3; n++) { + int bits = fmt->bits[n]; + rp->comp_shifts[n] = bitpos; + if (rp->comp_lut) { + uint8_t *lut = rp->comp_lut + 256 * n; + uint8_t zmax = (1 << depth) - 1; + uint8_t cmax = (1 << bits) - 1; + for (int v = 0; v < 256; v++) { + if (rp->pack) { + lut[v] = (v * cmax + zmax / 2) / zmax; + } else { + lut[v] = (v & cmax) * zmax / cmax; + } + } + } + bitpos += bits; + } + + rp->comp_size = (bitpos + 7) / 8; + assert(rp->comp_size == 1 || rp->comp_size == 2); + + if (fmt->be) { + assert(rp->comp_size == 2); + rp->endian_size = 2; + } +} + +static void unpack_pal(struct mp_repack *rp, + struct mp_image *a, int a_x, int a_y, + struct mp_image *b, int b_x, int b_y, int w) +{ + uint8_t *src = mp_image_pixel_ptr(a, 0, a_x, a_y); + uint32_t *pal = (void *)a->planes[1]; + + uint8_t *dst[4] = {0}; + for (int p = 0; p < b->num_planes; p++) + dst[p] = mp_image_pixel_ptr(b, p, b_x, b_y); + + for (int x = 0; x < w; x++) { + uint32_t c = pal[src[x]]; + dst[0][x] = (c >> 8) & 0xFF; // G + dst[1][x] = (c >> 0) & 0xFF; // B + dst[2][x] = (c >> 16) & 0xFF; // R + dst[3][x] = (c >> 24) & 0xFF; // A + } +} + +static void bitmap_repack(struct mp_repack *rp, + struct mp_image *a, int a_x, int a_y, + struct mp_image *b, int b_x, int b_y, int w) +{ + uint8_t *pa = mp_image_pixel_ptr(a, 0, a_x, a_y); + uint8_t *pb = mp_image_pixel_ptr(b, 0, b_x, b_y); + + if (rp->pack) { + for (unsigned x = 0; x < w; x += 8) { + uint8_t d = 0; + int max_b = MPMIN(8, w - x); + for (int bp = 0; bp < max_b; bp++) + d |= (rp->comp_lut[pb[x + bp]]) << (7 - bp); + pa[x / 8] = d; + } + } else { + for (unsigned x = 0; x < w; x += 8) { + uint8_t d = pa[x / 8]; + int max_b = MPMIN(8, w - x); + for (int bp = 0; bp < max_b; bp++) + pb[x + bp] = rp->comp_lut[d & (1 << (7 - bp))]; + } + } +} + +static void setup_misc_packer(struct mp_repack *rp) +{ + // Although it's in regular_repackers[], the generic mpv imgfmt metadata + // can't handle it 
yet. + if (rp->imgfmt_a == IMGFMT_RGB30) { + int planar_fmt = find_gbrp_format(10, 3); + if (!planar_fmt) + return; + rp->imgfmt_b = planar_fmt; + rp->repack = packed_repack; + rp->packed_repack_scanline = rp->pack ? pa_ccc10z2 : un_ccc10x2; + static int c_order[] = {3, 2, 1}; + for (int n = 0; n < 3; n++) + rp->components[n] = c_order[n] - 1; + } else if (rp->imgfmt_a == IMGFMT_PAL8 && !rp->pack) { + int grap_fmt = find_gbrp_format(8, 4); + if (!grap_fmt) + return; + rp->imgfmt_b = grap_fmt; + rp->repack = unpack_pal; + } else { + enum AVPixelFormat avfmt = imgfmt2pixfmt(rp->imgfmt_a); + if (avfmt == AV_PIX_FMT_MONOWHITE || avfmt == AV_PIX_FMT_MONOBLACK) { + rp->comp_lut = talloc_array(rp, uint8_t, 256); + rp->imgfmt_b = IMGFMT_Y1; + int max = 1; + if (rp->flags & REPACK_CREATE_EXPAND_8BIT) { + rp->imgfmt_b = IMGFMT_Y8; + max = 255; + } + bool inv = avfmt == AV_PIX_FMT_MONOWHITE; + for (int n = 0; n < 256; n++) { + rp->comp_lut[n] = rp->pack ? (inv ^ (n >= (max + 1) / 2)) + : ((inv ^ !!n) ? max : 0); + } + rp->repack = bitmap_repack; + return; + } + } +} + +struct fringe_yuv422_repacker { + // To avoid making a mess of IMGFMT_*, we use av formats directly. 
+ enum AVPixelFormat avfmt; + // In bits (depth/8 rounded up gives byte size) + int8_t depth; + // Word index of each sample: {y0, y1, cb, cr} + uint8_t comp[4]; + bool be; +}; + +static const struct fringe_yuv422_repacker fringe_yuv422_repackers[] = { + {AV_PIX_FMT_YUYV422, 8, {0, 2, 1, 3}}, + {AV_PIX_FMT_UYVY422, 8, {1, 3, 0, 2}}, + {AV_PIX_FMT_YVYU422, 8, {0, 2, 3, 1}}, +#ifdef AV_PIX_FMT_Y210 + {AV_PIX_FMT_Y210LE, 10, {0, 2, 1, 3}}, + {AV_PIX_FMT_Y210BE, 10, {0, 2, 1, 3}, .be = true}, +#endif +}; + +#define PA_P422(name, comp_t) \ + static void name(void *dst, void *src[], int w, uint8_t *c) { \ + for (int x = 0; x < w; x += 2) { \ + ((comp_t *)dst)[x * 2 + c[0]] = ((comp_t *)src[0])[x + 0]; \ + ((comp_t *)dst)[x * 2 + c[1]] = ((comp_t *)src[0])[x + 1]; \ + ((comp_t *)dst)[x * 2 + c[2]] = ((comp_t *)src[1])[x >> 1]; \ + ((comp_t *)dst)[x * 2 + c[3]] = ((comp_t *)src[2])[x >> 1]; \ + } \ + } + + +#define UN_P422(name, comp_t) \ + static void name(void *src, void *dst[], int w, uint8_t *c) { \ + for (int x = 0; x < w; x += 2) { \ + ((comp_t *)dst[0])[x + 0] = ((comp_t *)src)[x * 2 + c[0]]; \ + ((comp_t *)dst[0])[x + 1] = ((comp_t *)src)[x * 2 + c[1]]; \ + ((comp_t *)dst[1])[x >> 1] = ((comp_t *)src)[x * 2 + c[2]]; \ + ((comp_t *)dst[2])[x >> 1] = ((comp_t *)src)[x * 2 + c[3]]; \ + } \ + } + +PA_P422(pa_p422_8, uint8_t) +PA_P422(pa_p422_16, uint16_t) +UN_P422(un_p422_8, uint8_t) +UN_P422(un_p422_16, uint16_t) + +static void fringe_yuv422_repack(struct mp_repack *rp, + struct mp_image *a, int a_x, int a_y, + struct mp_image *b, int b_x, int b_y, int w) +{ + void *pa = mp_image_pixel_ptr(a, 0, a_x, a_y); + + void *pb[4] = {0}; + for (int p = 0; p < b->num_planes; p++) + pb[p] = mp_image_pixel_ptr(b, p, b_x, b_y); + + assert(rp->comp_size == 1 || rp->comp_size == 2); + + void (*repack)(void *a, void *b[], int w, uint8_t *c) = NULL; + if (rp->pack) { + repack = rp->comp_size == 1 ? pa_p422_8 : pa_p422_16; + } else { + repack = rp->comp_size == 1 ? 
un_p422_8 : un_p422_16; + } + repack(pa, pb, w, rp->comp_map); +} + +static void setup_fringe_yuv422_packer(struct mp_repack *rp) +{ + enum AVPixelFormat avfmt = imgfmt2pixfmt(rp->imgfmt_a); + + const struct fringe_yuv422_repacker *fmt = NULL; + for (int n = 0; n < MP_ARRAY_SIZE(fringe_yuv422_repackers); n++) { + if (fringe_yuv422_repackers[n].avfmt == avfmt) { + fmt = &fringe_yuv422_repackers[n]; + break; + } + } + + if (!fmt) + return; + + rp->comp_size = (fmt->depth + 7) / 8; + assert(rp->comp_size == 1 || rp->comp_size == 2); + + struct mp_regular_imgfmt yuvfmt = { + .component_type = MP_COMPONENT_TYPE_UINT, + // NB: same problem with P010 and not clearing padding. + .component_size = rp->comp_size, + .num_planes = 3, + .planes = { {1, {1}}, {1, {2}}, {1, {3}} }, + .chroma_xs = 1, + .chroma_ys = 0, + }; + rp->imgfmt_b = mp_find_regular_imgfmt(&yuvfmt); + rp->repack = fringe_yuv422_repack; + rp->comp_map = (uint8_t *)fmt->comp; + + if (fmt->be) { + assert(rp->comp_size == 2); + rp->endian_size = 2; + } +} + +static void repack_nv(struct mp_repack *rp, + struct mp_image *a, int a_x, int a_y, + struct mp_image *b, int b_x, int b_y, int w) +{ + int xs = a->fmt.chroma_xs; + + uint32_t *pa = mp_image_pixel_ptr(a, 1, a_x, a_y); + + void *pb[2]; + for (int p = 0; p < 2; p++) { + int s = rp->components[p]; + pb[p] = mp_image_pixel_ptr(b, s, b_x, b_y); + } + + rp->packed_repack_scanline(pa, pb, (w + (1 << xs) - 1) >> xs); +} + +static void setup_nv_packer(struct mp_repack *rp) +{ + struct mp_regular_imgfmt desc; + if (!mp_get_regular_imgfmt(&desc, rp->imgfmt_a)) + return; + + // Check for NV. + if (desc.num_planes != 2) + return; + if (desc.planes[0].num_components != 1 || desc.planes[0].components[0] != 1) + return; + if (desc.planes[1].num_components != 2) + return; + int cr0 = desc.planes[1].components[0]; + int cr1 = desc.planes[1].components[1]; + if (cr0 > cr1) + MPSWAP(int, cr0, cr1); + if (cr0 != 2 || cr1 != 3) + return; + + // Construct equivalent planar format. 
+ struct mp_regular_imgfmt desc2 = desc; + desc2.num_planes = 3; + desc2.planes[1].num_components = 1; + desc2.planes[1].components[0] = 2; + desc2.planes[2].num_components = 1; + desc2.planes[2].components[0] = 3; + // For P010. Strangely this concept exists only for the NV format. + if (desc2.component_pad > 0) + desc2.component_pad = 0; + + int planar_fmt = mp_find_regular_imgfmt(&desc2); + if (!planar_fmt) + return; + + for (int i = 0; i < MP_ARRAY_SIZE(regular_repackers); i++) { + const struct regular_repacker *pa = ®ular_repackers[i]; + + void (*repack_cb)(void *pa, void *pb[], int w) = + rp->pack ? pa->pa_scanline : pa->un_scanline; + + if (pa->packed_width != desc.component_size * 2 * 8 || + pa->component_width != desc.component_size * 8 || + pa->num_components != 2 || + pa->prepadding != 0 || + !repack_cb) + continue; + + rp->repack = repack_nv; + rp->passthrough_y = true; + rp->packed_repack_scanline = repack_cb; + rp->imgfmt_b = planar_fmt; + rp->components[0] = desc.planes[1].components[0] - 1; + rp->components[1] = desc.planes[1].components[1] - 1; + return; + } +} + +void repack_line(struct mp_repack *rp, int dst_x, int dst_y, + int src_x, int src_y, int w) +{ + assert(rp->configured); + + struct repack_step *first = &rp->steps[0]; + struct repack_step *last = &rp->steps[rp->num_steps - 1]; + + assert(dst_x >= 0 && dst_y >= 0 && src_x >= 0 && src_y >= 0 && w >= 0); + assert(dst_x + w <= MP_ALIGN_UP(last->buf[1]->w, last->fmt[1].align_x)); + assert(src_x + w <= MP_ALIGN_UP(first->buf[1]->w, first->fmt[0].align_x)); + assert(dst_y < last->buf[1]->h); + assert(src_y < first->buf[0]->h); + assert(!(dst_x & (last->fmt[1].align_x - 1))); + assert(!(src_x & (first->fmt[0].align_x - 1))); + assert(!(w & ((1 << first->fmt[0].chroma_xs) - 1))); + assert(!(dst_y & (last->fmt[1].align_y - 1))); + assert(!(src_y & (first->fmt[0].align_y - 1))); + + for (int n = 0; n < rp->num_steps; n++) { + struct repack_step *rs = &rp->steps[n]; + + // When writing to temporary 
buffers, always write to the start (maybe + // helps with locality). + int sx = rs->user_buf[0] ? src_x : 0; + int sy = rs->user_buf[0] ? src_y : 0; + int dx = rs->user_buf[1] ? dst_x : 0; + int dy = rs->user_buf[1] ? dst_y : 0; + + struct mp_image *buf_a = rs->buf[rp->pack]; + struct mp_image *buf_b = rs->buf[!rp->pack]; + int a_x = rp->pack ? dx : sx; + int a_y = rp->pack ? dy : sy; + int b_x = rp->pack ? sx : dx; + int b_y = rp->pack ? sy : dy; + + switch (rs->type) { + case REPACK_STEP_REPACK: { + if (rp->repack) + rp->repack(rp, buf_a, a_x, a_y, buf_b, b_x, b_y, w); + + for (int p = 0; p < rs->fmt[0].num_planes; p++) { + if (rp->copy_buf[p]) + copy_plane(rs->buf[1], dx, dy, rs->buf[0], sx, sy, w, p); + } + break; + } + case REPACK_STEP_ENDIAN: + swap_endian(rs->buf[1], dx, dy, rs->buf[0], sx, sy, w, + rp->endian_size); + break; + } + } +} + +static bool setup_format_ne(struct mp_repack *rp) +{ + if (!rp->imgfmt_b) + setup_nv_packer(rp); + if (!rp->imgfmt_b) + setup_misc_packer(rp); + if (!rp->imgfmt_b) + setup_packed_packer(rp); + if (!rp->imgfmt_b) + setup_fringe_rgb_packer(rp); + if (!rp->imgfmt_b) + setup_fringe_yuv422_packer(rp); + if (!rp->imgfmt_b) + rp->imgfmt_b = rp->imgfmt_a; // maybe it was planar after all + + struct mp_regular_imgfmt desc; + if (!mp_get_regular_imgfmt(&desc, rp->imgfmt_b)) + return false; + + // no weird stuff + if (desc.num_planes > 4) + return false; + + // Endian swapping. + if (rp->imgfmt_a != rp->imgfmt_user) { + struct mp_regular_imgfmt ndesc; + if (!mp_get_regular_imgfmt(&ndesc, rp->imgfmt_a) || ndesc.num_planes > 4) + return false; + rp->endian_size = ndesc.component_size; + if (rp->endian_size != 2 && rp->endian_size != 4) + return false; + } + + // Accept only true planar formats (with known components and no padding). 
+ for (int n = 0; n < desc.num_planes; n++) { + if (desc.planes[n].num_components != 1) + return false; + int c = desc.planes[n].components[0]; + if (c < 1 || c > 4) + return false; + } + + rp->fmt_a = mp_imgfmt_get_desc(rp->imgfmt_a); + rp->fmt_b = mp_imgfmt_get_desc(rp->imgfmt_b); + + // This is if we did a pack step. + + rp->steps[rp->num_steps++] = (struct repack_step) { + .type = REPACK_STEP_REPACK, + .fmt = { rp->fmt_b, rp->fmt_a }, + }; + + if (rp->endian_size) { + rp->steps[rp->num_steps++] = (struct repack_step) { + .type = REPACK_STEP_ENDIAN, + .fmt = { + rp->fmt_a, + mp_imgfmt_get_desc(rp->imgfmt_user), + }, + }; + } + + // Reverse if unpack (to reflect actual data flow) + if (!rp->pack) { + for (int n = 0; n < rp->num_steps / 2; n++) { + MPSWAP(struct repack_step, rp->steps[n], + rp->steps[rp->num_steps - 1 - n]); + } + for (int n = 0; n < rp->num_steps; n++) { + struct repack_step *rs = &rp->steps[n]; + MPSWAP(struct mp_imgfmt_desc, rs->fmt[0], rs->fmt[1]); + } + } + + for (int n = 0; n < rp->num_steps - 1; n++) + assert(rp->steps[n].fmt[1].id == rp->steps[n + 1].fmt[0].id); + + return true; +} + +static void reset_params(struct mp_repack *rp) +{ + rp->num_steps = 0; + rp->imgfmt_b = 0; + rp->repack = NULL; + rp->passthrough_y = false; + rp->endian_size = 0; + rp->packed_repack_scanline = NULL; + rp->comp_size = 0; + rp->comp_map = NULL; + talloc_free(rp->comp_lut); + rp->comp_lut = NULL; +} + +static bool setup_format(struct mp_repack *rp) +{ + reset_params(rp); + rp->imgfmt_a = rp->imgfmt_user; + if (setup_format_ne(rp)) + return true; + // Try reverse endian. 
+ reset_params(rp); + rp->imgfmt_a = mp_find_other_endian(rp->imgfmt_user); + return rp->imgfmt_a && setup_format_ne(rp); +} + +struct mp_repack *mp_repack_create_planar(int imgfmt, bool pack, int flags) +{ + struct mp_repack *rp = talloc_zero(NULL, struct mp_repack); + rp->imgfmt_user = imgfmt; + rp->pack = pack; + rp->flags = flags; + + if (!setup_format(rp)) { + talloc_free(rp); + return NULL; + } + + return rp; +} + +int mp_repack_get_format_src(struct mp_repack *rp) +{ + return rp->steps[0].fmt[0].id; +} + +int mp_repack_get_format_dst(struct mp_repack *rp) +{ + return rp->steps[rp->num_steps - 1].fmt[1].id; +} + +int mp_repack_get_align_x(struct mp_repack *rp) +{ + // We really want the LCM between those, but since only one of them is + // packed (or they're the same format), and the chroma subsampling is the + // same for both, only the packed one matters. + return rp->fmt_a.align_x; +} + +int mp_repack_get_align_y(struct mp_repack *rp) +{ + return rp->fmt_a.align_y; // should be the same for packed/planar formats +} + +static void image_realloc(struct mp_image **img, int fmt, int w, int h) +{ + if (*img && (*img)->imgfmt == fmt && (*img)->w == w && (*img)->h == h) + return; + talloc_free(*img); + *img = mp_image_alloc(fmt, w, h); +} + +bool repack_config_buffers(struct mp_repack *rp, + int dst_flags, struct mp_image *dst, + int src_flags, struct mp_image *src, + bool *enable_passthrough) +{ + struct repack_step *rs_first = &rp->steps[0]; + struct repack_step *rs_last = &rp->steps[rp->num_steps - 1]; + + rp->configured = false; + + assert(dst && src); + + int buf_w = MPMAX(dst->w, src->w); + + assert(dst->imgfmt == rs_last->fmt[1].id); + assert(src->imgfmt == rs_first->fmt[0].id); + + // Chain/allocate buffers. 
+ + for (int n = 0; n < rp->num_steps; n++) + rp->steps[n].buf[0] = rp->steps[n].buf[1] = NULL; + + rs_first->buf[0] = src; + rs_last->buf[1] = dst; + + for (int n = 0; n < rp->num_steps; n++) { + struct repack_step *rs = &rp->steps[n]; + + if (!rs->buf[0]) { + assert(n > 0); + rs->buf[0] = rp->steps[n - 1].buf[1]; + } + + if (rs->buf[1]) + continue; + + // Note: since repack_line() can have different src/dst offsets, we + // can't do true in-place in general. + bool can_inplace = rs->type == REPACK_STEP_ENDIAN && + rs->buf[0] != src && rs->buf[0] != dst; + if (can_inplace) { + rs->buf[1] = rs->buf[0]; + continue; + } + + if (rs != rs_last) { + struct repack_step *next = &rp->steps[n + 1]; + if (next->buf[0]) { + rs->buf[1] = next->buf[0]; + continue; + } + } + + image_realloc(&rs->tmp, rs->fmt[1].id, buf_w, rs->fmt[1].align_y); + if (!rs->tmp) + return false; + talloc_steal(rp, rs->tmp); + rs->buf[1] = rs->tmp; + } + + for (int n = 0; n < rp->num_steps; n++) { + struct repack_step *rs = &rp->steps[n]; + rs->user_buf[0] = rs->buf[0] == src || rs->buf[0] == dst; + rs->user_buf[1] = rs->buf[1] == src || rs->buf[1] == dst; + } + + // If repacking is the only operation. It's also responsible for simply + // copying src to dst if absolutely no filtering is done. + bool may_passthrough = + rp->num_steps == 1 && rp->steps[0].type == REPACK_STEP_REPACK; + + for (int p = 0; p < rp->fmt_b.num_planes; p++) { + // (All repack callbacks copy, except nv12 does not copy luma.) + bool repack_copies_plane = rp->repack && !(rp->passthrough_y && p == 0); + + bool can_pt = may_passthrough && !repack_copies_plane && + enable_passthrough && enable_passthrough[p]; + + // Copy if needed, unless the repack callback does it anyway. 
+ rp->copy_buf[p] = !repack_copies_plane && !can_pt;
+
+ if (enable_passthrough)
+ enable_passthrough[p] = can_pt && !rp->copy_buf[p];
+ }
+
+ if (enable_passthrough) {
+ for (int n = rp->fmt_b.num_planes; n < MP_MAX_PLANES; n++)
+ enable_passthrough[n] = false;
+ }
+
+ rp->configured = true;
+
+ return true;
+}
diff --git a/video/repack.h b/video/repack.h
new file mode 100644
index 0000000000..fa81ca9df2
--- /dev/null
+++ b/video/repack.h
@@ -0,0 +1,72 @@
+#pragma once
+
+#include 
+
+enum {
+ // This controls behavior with different bit widths per component (like
+ // RGB565). If ROUND_DOWN is specified, the planar format will use the min.
+ // bit width of all components, otherwise the transformation is lossless.
+ REPACK_CREATE_ROUND_DOWN = (1 << 0),
+
+ // Expand some (not all) low bit depth fringe formats to 8 bit on unpack.
+ REPACK_CREATE_EXPAND_8BIT = (1 << 1),
+};
+
+struct mp_repack;
+struct mp_image;
+
+// Create a repacker between any format (imgfmt parameter) and an equivalent
+// planar format (that is native endian). If pack==true, imgfmt is the output,
+// otherwise it is the input. The respective other input/output is the planar
+// format. The planar format can be queried with mp_repack_get_format_*().
+// Note that some formats may change the "implied" colorspace (for example,
+// packed xyz unpacks as rgb).
+// If imgfmt is already planar, a passthrough repacker may be created.
+// imgfmt: src or dst format (usually packed, non-planar, etc.)
+// pack: true if imgfmt is dst, false if imgfmt is src
+// flags: any of REPACK_CREATE_* flags
+// returns: NULL on failure, otherwise free with talloc_free().
+struct mp_repack *mp_repack_create_planar(int imgfmt, bool pack, int flags);
+
+// Return input and output formats for which rp was created.
+int mp_repack_get_format_src(struct mp_repack *rp);
+int mp_repack_get_format_dst(struct mp_repack *rp);
+
+// Return pixel alignment. For x, this is the lowest pixel count at which there is
+// a byte boundary and a full chroma pixel (horizontal subsampling) on src/dst.
+// For y, this is the pixel height of the vertical subsampling.
+// Always returns a power of 2.
+int mp_repack_get_align_x(struct mp_repack *rp);
+int mp_repack_get_align_y(struct mp_repack *rp);
+
+// Repack a single line from src to dst, as set in repack_config_buffers().
+// For subsampled chroma formats, this copies as many luma/alpha rows as needed
+// for a complete line (e.g. 2 luma lines, 1 chroma line for 4:2:0).
+// dst_x, src_x, y must be aligned to the pixel alignment. w may be unaligned
+// if at the right crop-border of the image, but must be always aligned to
+// horiz. sub-sampling. y is subject to hslice.
+void repack_line(struct mp_repack *rp, int dst_x, int dst_y,
+ int src_x, int src_y, int w);
+
+// Configure with a source and target buffer. The rp instance will keep the
+// mp_image pointers and access them on repack_line() calls. Refcounting is
+// not respected - the caller needs to make sure dst is always writable.
+// The images can have different sizes (as repack_line() lets you use different
+// target coordinates for dst/src).
+// This also allocates potentially required temporary buffers.
+// dst_flags: REPACK_BUF_* flags for dst
+// dst: where repack_line() writes to
+// src_flags: REPACK_BUF_* flags for src
+// src: where repack_line() reads from
+// enable_passthrough: if non-NULL, a bool array of size MP_MAX_PLANES indexed
+// by plane; a true entry requests disabling copying the
+// plane data to the dst plane. The function will write to
+// this array whether the plane can really be passed through
+// (i.e. will set array entries from true to false if pass-
+// through is not possible). It writes to all MP_MAX_PLANES
+// entries. If NULL, all entries are implicitly false. 
+// returns: success (fails on OOM) +bool repack_config_buffers(struct mp_repack *rp, + int dst_flags, struct mp_image *dst, + int src_flags, struct mp_image *src, + bool *enable_passthrough); diff --git a/video/zimg.c b/video/zimg.c index ae3602d297..4e7711f61a 100644 --- a/video/zimg.c +++ b/video/zimg.c @@ -25,6 +25,7 @@ #include "csputils.h" #include "options/m_config.h" #include "options/m_option.h" +#include "repack.h" #include "video/fmt-conversion.h" #include "video/img_format.h" #include "zimg.h" @@ -83,50 +84,14 @@ struct mp_zimg_repack { int num_planes; // number of planes involved unsigned zmask[4]; // zmask[mp_index] = zimg mask (using mp index!) int z_planes[4]; // z_planes[zimg_index] = mp_index (or -1) - bool pass_through_y; // luma plane optimization for e.g. nv12 - // If set, the pack/unpack callback to pass to zimg. - // Called with user==mp_zimg_repack. - zimg_filter_graph_callback repack; - - // Endian-swap (done before/after actual repacker). - int endian_size; // 0=no swapping, 2/4=word byte size to swap - int endian_items[4]; // number of words per pixel/plane - - // For packed_repack. - int components[4]; // p2[n] = mp_image.planes[components[n]] - // pack: p1 is dst, p2 is src - // unpack: p1 is src, p2 is dst - void (*packed_repack_scanline)(void *p1, void *p2[], int x0, int x1); - - // Fringe RGB/YUV. - uint8_t comp_size; - uint8_t *comp_map; - uint8_t comp_shifts[3]; - uint8_t *comp_lut; // 256 * 3 + struct mp_repack *repack; // converting to/from planar // Temporary memory for slice-wise repacking. This may be set even if repack // is not set (then it may be used to avoid alignment issues). This has // about one slice worth of data. struct mp_image *tmp; - // Temporary memory for endian swapping. This has about one slice worth - // of data; set and used only if endian swapping is used (endian_size>0). - // It's also used only for pack==false; packers do this in-place. 
- struct mp_image *tmp_endian; - - // Temporary, per-call source/target frame. - struct mp_image *mpi; - // Y coordinate of first line in mpi; usually 0 if mpi==user_mpi, or the - // start of the current slice (in the current repack cb). - // repackers should use: mpi->data[p] + mpi->stride[p] * (i - mpi_y0) - int mpi_y0; - - struct mp_image *user_mpi; - - // Also temporary, per-call. use_buf[n] == plane n uses tmp (and not mpi). - bool use_buf[4]; - int real_w, real_h; // aligned size }; @@ -243,532 +208,44 @@ void mp_zimg_enable_cmdline_opts(struct mp_zimg_context *ctx, mp_zimg_update_from_cmdline(ctx); // first update } -static int repack_align(void *user, unsigned i, unsigned x0, unsigned x1) -{ - struct mp_zimg_repack *r = user; - - for (int p = 0; p < r->mpi->fmt.num_planes; p++) { - if (!r->use_buf[p]) - continue; - - int bpp = r->mpi->fmt.bytes[p]; - int xs = r->mpi->fmt.xs[p]; - int ys = r->mpi->fmt.ys[p]; - // Number of lines on this plane. - int h = (1 << r->mpi->fmt.chroma_ys) - (1 << ys) + 1; - - for (int y = i; y < i + h; y++) { - void *a = r->mpi->planes[p] + - r->mpi->stride[p] * (ptrdiff_t)((y - r->mpi_y0) >> ys) + - bpp * (x0 >> xs); - void *b = r->tmp->planes[p] + - r->tmp->stride[p] * (ptrdiff_t)((y >> ys) & r->zmask[p]) + - bpp * (x0 >> xs); - size_t size = ((x1 - x0) >> xs) * bpp; - if (r->pack) { - memcpy(a, b, size); - } else { - memcpy(b, a, size); - } - } - } - - return 0; -} - -// Swap endian for one line. -static void swap_endian(struct mp_zimg_repack *r, struct mp_image *dst, int dst_y, - struct mp_image *src, int src_y, int x0, int x1) -{ - for (int p = 0; p < dst->fmt.num_planes; p++) { - int xs = dst->fmt.xs[p]; - int ys = dst->fmt.ys[p]; - int words_per_pixel = r->endian_items[p]; - int bpp = words_per_pixel * r->endian_size; - // Number of lines on this plane. 
- int h = (1 << dst->fmt.chroma_ys) - (1 << ys) + 1; - int num_words = ((x1 - x0) >> xs) * words_per_pixel; - - for (int y = 0; y < h; y++) { - void *s = src->planes[p] + - src->stride[p] * (ptrdiff_t)((y + src_y) >> ys) + - bpp * (x0 >> xs); - void *d = dst->planes[p] + - dst->stride[p] * (ptrdiff_t)((y + dst_y) >> ys) + - bpp * (x0 >> xs); - switch (r->endian_size) { - case 2: - for (int w = 0; w < num_words; w++) - ((uint16_t *)d)[w] = av_bswap16(((uint16_t *)s)[w]); - break; - case 4: - for (int w = 0; w < num_words; w++) - ((uint32_t *)d)[w] = av_bswap32(((uint32_t *)s)[w]); - break; - default: - assert(0); - } - } - } -} - -// PA = PAck, copy planar input to single packed array -// UN = UNpack, copy packed input to planar output -// Naming convention: -// pa_/un_ prefix to identify conversion direction. -// Left (LSB, lowest byte address) -> Right (MSB, highest byte address). -// (This is unusual; MSB to LSB is more commonly used to describe formats, -// but our convention makes more sense for byte access in little endian.) -// "c" identifies a color component. -// "z" identifies known zero padding. -// "x" identifies uninitialized padding. -// A component is followed by its size in bits. -// Size can be omitted for multiple uniform components (c8c8c8 == ccc8). -// Unpackers will often use "x" for padding, because they ignore it, while -// packers will use "z" because they write zero. 
- -#define PA_WORD_4(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, sh_c3) \ - static void name(void *dst, void *src[], int x0, int x1) { \ - for (int x = x0; x < x1; x++) { \ - ((packed_t *)dst)[x] = \ - ((packed_t)((plane_t *)src[0])[x] << (sh_c0)) | \ - ((packed_t)((plane_t *)src[1])[x] << (sh_c1)) | \ - ((packed_t)((plane_t *)src[2])[x] << (sh_c2)) | \ - ((packed_t)((plane_t *)src[3])[x] << (sh_c3)); \ - } \ - } - -#define UN_WORD_4(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, sh_c3, mask)\ - static void name(void *src, void *dst[], int x0, int x1) { \ - for (int x = x0; x < x1; x++) { \ - packed_t c = ((packed_t *)src)[x]; \ - ((plane_t *)dst[0])[x] = (c >> (sh_c0)) & (mask); \ - ((plane_t *)dst[1])[x] = (c >> (sh_c1)) & (mask); \ - ((plane_t *)dst[2])[x] = (c >> (sh_c2)) & (mask); \ - ((plane_t *)dst[3])[x] = (c >> (sh_c3)) & (mask); \ - } \ - } - - -#define PA_WORD_3(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, pad) \ - static void name(void *dst, void *src[], int x0, int x1) { \ - for (int x = x0; x < x1; x++) { \ - ((packed_t *)dst)[x] = (pad) | \ - ((packed_t)((plane_t *)src[0])[x] << (sh_c0)) | \ - ((packed_t)((plane_t *)src[1])[x] << (sh_c1)) | \ - ((packed_t)((plane_t *)src[2])[x] << (sh_c2)); \ - } \ - } - -UN_WORD_4(un_cccc8, uint32_t, uint8_t, 0, 8, 16, 24, 0xFFu) -PA_WORD_4(pa_cccc8, uint32_t, uint8_t, 0, 8, 16, 24) -// Not sure if this is a good idea; there may be no alignment guarantee. 
-UN_WORD_4(un_cccc16, uint64_t, uint16_t, 0, 16, 32, 48, 0xFFFFu) -PA_WORD_4(pa_cccc16, uint64_t, uint16_t, 0, 16, 32, 48) - -#define UN_WORD_3(name, packed_t, plane_t, sh_c0, sh_c1, sh_c2, mask) \ - static void name(void *src, void *dst[], int x0, int x1) { \ - for (int x = x0; x < x1; x++) { \ - packed_t c = ((packed_t *)src)[x]; \ - ((plane_t *)dst[0])[x] = (c >> (sh_c0)) & (mask); \ - ((plane_t *)dst[1])[x] = (c >> (sh_c1)) & (mask); \ - ((plane_t *)dst[2])[x] = (c >> (sh_c2)) & (mask); \ - } \ - } - -UN_WORD_3(un_ccc8x8, uint32_t, uint8_t, 0, 8, 16, 0xFFu) -PA_WORD_3(pa_ccc8z8, uint32_t, uint8_t, 0, 8, 16, 0) -UN_WORD_3(un_x8ccc8, uint32_t, uint8_t, 8, 16, 24, 0xFFu) -PA_WORD_3(pa_z8ccc8, uint32_t, uint8_t, 8, 16, 24, 0) -UN_WORD_3(un_ccc10x2, uint32_t, uint16_t, 0, 10, 20, 0x3FFu) -PA_WORD_3(pa_ccc10z2, uint32_t, uint16_t, 20, 10, 0, 0) - -#define PA_WORD_2(name, packed_t, plane_t, sh_c0, sh_c1, pad) \ - static void name(void *dst, void *src[], int x0, int x1) { \ - for (int x = x0; x < x1; x++) { \ - ((packed_t *)dst)[x] = (pad) | \ - ((packed_t)((plane_t *)src[0])[x] << (sh_c0)) | \ - ((packed_t)((plane_t *)src[1])[x] << (sh_c1)); \ - } \ - } - -#define UN_WORD_2(name, packed_t, plane_t, sh_c0, sh_c1, mask) \ - static void name(void *src, void *dst[], int x0, int x1) { \ - for (int x = x0; x < x1; x++) { \ - packed_t c = ((packed_t *)src)[x]; \ - ((plane_t *)dst[0])[x] = (c >> (sh_c0)) & (mask); \ - ((plane_t *)dst[1])[x] = (c >> (sh_c1)) & (mask); \ - } \ - } - -UN_WORD_2(un_cc8, uint16_t, uint8_t, 0, 8, 0xFFu) -PA_WORD_2(pa_cc8, uint16_t, uint8_t, 0, 8, 0) -UN_WORD_2(un_cc16, uint32_t, uint16_t, 0, 16, 0xFFFFu) -PA_WORD_2(pa_cc16, uint32_t, uint16_t, 0, 16, 0) - -#define PA_SEQ_3(name, comp_t) \ - static void name(void *dst, void *src[], int x0, int x1) { \ - comp_t *r = dst; \ - for (int x = x0; x < x1; x++) { \ - *r++ = ((comp_t *)src[0])[x]; \ - *r++ = ((comp_t *)src[1])[x]; \ - *r++ = ((comp_t *)src[2])[x]; \ - } \ - } - -#define UN_SEQ_3(name, 
comp_t) \ - static void name(void *src, void *dst[], int x0, int x1) { \ - comp_t *r = src; \ - for (int x = x0; x < x1; x++) { \ - ((comp_t *)dst[0])[x] = *r++; \ - ((comp_t *)dst[1])[x] = *r++; \ - ((comp_t *)dst[2])[x] = *r++; \ - } \ - } - -UN_SEQ_3(un_ccc8, uint8_t) -PA_SEQ_3(pa_ccc8, uint8_t) -UN_SEQ_3(un_ccc16, uint16_t) -PA_SEQ_3(pa_ccc16, uint16_t) - -// "regular": single packed plane, all components have same width (except padding) -struct regular_repacker { - int packed_width; // number of bits of the packed pixel - int component_width; // number of bits for a single component - int prepadding; // number of bits of LSB padding - int num_components; // number of components that can be accessed - void (*pa_scanline)(void *p1, void *p2[], int x0, int x1); - void (*un_scanline)(void *p1, void *p2[], int x0, int x1); -}; - -static const struct regular_repacker regular_repackers[] = { - {32, 8, 0, 3, pa_ccc8z8, un_ccc8x8}, - {32, 8, 8, 3, pa_z8ccc8, un_x8ccc8}, - {32, 8, 0, 4, pa_cccc8, un_cccc8}, - {64, 16, 0, 4, pa_cccc16, un_cccc16}, - {24, 8, 0, 3, pa_ccc8, un_ccc8}, - {48, 16, 0, 3, pa_ccc16, un_ccc16}, - {16, 8, 0, 2, pa_cc8, un_cc8}, - {32, 16, 0, 2, pa_cc16, un_cc16}, - {32, 10, 0, 3, pa_ccc10z2, un_ccc10x2}, -}; - -static int packed_repack(void *user, unsigned i, unsigned x0, unsigned x1) -{ - struct mp_zimg_repack *r = user; - - uint32_t *p1 = (void *)(r->mpi->planes[0] + - r->mpi->stride[0] * (ptrdiff_t)(i - r->mpi_y0)); - - void *p2[4] = {0}; - for (int p = 0; p < r->num_planes; p++) { - int s = r->components[p]; - p2[p] = r->tmp->planes[s] + - r->tmp->stride[s] * (ptrdiff_t)(i & r->zmask[s]); - } - - r->packed_repack_scanline(p1, p2, x0, x1); - - return 0; -} - -struct fringe_rgb_repacker { - // To avoid making a mess of IMGFMT_*, we use av formats directly. - enum AVPixelFormat avfmt; - // If true, use BGR instead of RGB. 
- // False: LSB - R - G - B - pad - MSB - // True: LSB - B - G - R - pad - MSB - bool rev_order; - // Size in bit for each component, strictly from LSB to MSB. - int bits[3]; - bool be; -}; - -static const struct fringe_rgb_repacker fringe_rgb_repackers[] = { - {AV_PIX_FMT_BGR4_BYTE, false, {1, 2, 1}}, - {AV_PIX_FMT_RGB4_BYTE, true, {1, 2, 1}}, - {AV_PIX_FMT_BGR8, false, {3, 3, 2}}, - {AV_PIX_FMT_RGB8, true, {2, 3, 3}}, // pixdesc desc. and doc. bug? - {AV_PIX_FMT_RGB444LE, true, {4, 4, 4}}, - {AV_PIX_FMT_RGB444BE, true, {4, 4, 4}, .be = true}, - {AV_PIX_FMT_BGR444LE, false, {4, 4, 4}}, - {AV_PIX_FMT_BGR444BE, false, {4, 4, 4}, .be = true}, - {AV_PIX_FMT_BGR565LE, false, {5, 6, 5}}, - {AV_PIX_FMT_BGR565BE, false, {5, 6, 5}, .be = true}, - {AV_PIX_FMT_RGB565LE, true, {5, 6, 5}}, - {AV_PIX_FMT_RGB565BE, true, {5, 6, 5}, .be = true}, - {AV_PIX_FMT_BGR555LE, false, {5, 5, 5}}, - {AV_PIX_FMT_BGR555BE, false, {5, 5, 5}, .be = true}, - {AV_PIX_FMT_RGB555LE, true, {5, 5, 5}}, - {AV_PIX_FMT_RGB555BE, true, {5, 5, 5}, .be = true}, -}; - -#define PA_SHIFT_LUT8(name, packed_t) \ - static void name(void *dst, void *src[], int x0, int x1, uint8_t *lut, \ - uint8_t s0, uint8_t s1, uint8_t s2) { \ - for (int x = x0; x < x1; x++) { \ - ((packed_t *)dst)[x] = \ - (lut[((uint8_t *)src[0])[x] + 256 * 0] << s0) | \ - (lut[((uint8_t *)src[1])[x] + 256 * 1] << s1) | \ - (lut[((uint8_t *)src[2])[x] + 256 * 2] << s2); \ - } \ - } - - -#define UN_SHIFT_LUT8(name, packed_t) \ - static void name(void *src, void *dst[], int x0, int x1, uint8_t *lut, \ - uint8_t s0, uint8_t s1, uint8_t s2) { \ - for (int x = x0; x < x1; x++) { \ - packed_t c = ((packed_t *)src)[x]; \ - ((uint8_t *)dst[0])[x] = lut[((c >> s0) & 0xFF) + 256 * 0]; \ - ((uint8_t *)dst[1])[x] = lut[((c >> s1) & 0xFF) + 256 * 1]; \ - ((uint8_t *)dst[2])[x] = lut[((c >> s2) & 0xFF) + 256 * 2]; \ - } \ - } - -PA_SHIFT_LUT8(pa_shift_lut8_8, uint8_t) -PA_SHIFT_LUT8(pa_shift_lut8_16, uint16_t) -UN_SHIFT_LUT8(un_shift_lut8_8, uint8_t) 
-UN_SHIFT_LUT8(un_shift_lut8_16, uint16_t) - -static int fringe_rgb_repack(void *user, unsigned i, unsigned x0, unsigned x1) -{ - struct mp_zimg_repack *r = user; - - void *p1 = r->mpi->planes[0] + r->mpi->stride[0] * (ptrdiff_t)(i - r->mpi_y0); - - void *p2[4] = {0}; - for (int p = 0; p < r->num_planes; p++) { - int s = r->components[p]; - p2[p] = r->tmp->planes[s] + - r->tmp->stride[s] * (ptrdiff_t)(i & r->zmask[s]); - } - - assert(r->comp_size == 1 || r->comp_size == 2); - - void (*repack)(void *p1, void *p2[], int x0, int x1, uint8_t *lut, - uint8_t s0, uint8_t s1, uint8_t s2) = NULL; - if (r->pack) { - repack = r->comp_size == 1 ? pa_shift_lut8_8 : pa_shift_lut8_16; - } else { - repack = r->comp_size == 1 ? un_shift_lut8_8 : un_shift_lut8_16; - } - repack(p1, p2, x0, x1, r->comp_lut, - r->comp_shifts[0], r->comp_shifts[1], r->comp_shifts[2]); - - return 0; -} - -static int bitmap_repack(void *user, unsigned i, unsigned x0, unsigned x1) -{ - struct mp_zimg_repack *r = user; - - uint8_t *p1 = - r->mpi->planes[0] + r->mpi->stride[0] * (ptrdiff_t)(i - r->mpi_y0); - uint8_t *p2 = - r->tmp->planes[0] + r->tmp->stride[0] * (ptrdiff_t)(i &