1 files changed, 649 insertions, 406 deletions
diff --git a/sub/draw_bmp.c b/sub/draw_bmp.c
index ba027838ec..e1caea1e0a 100644
--- a/sub/draw_bmp.c
+++ b/sub/draw_bmp.c
@@ -21,12 +21,11 @@
 #include <math.h>
 #include <inttypes.h>
 
-#include <libswscale/swscale.h>
-
 #include "common/common.h"
 #include "draw_bmp.h"
 #include "img_convert.h"
 #include "video/mp_image.h"
+#include "video/repack.h"
 #include "video/sws_utils.h"
 #include "video/img_format.h"
 #include "video/csputils.h"
@@ -36,517 +35,761 @@ const bool mp_draw_sub_formats[SUBBITMAP_COUNT] = {
     [SUBBITMAP_RGBA] = true,
 };
 
-struct sub_cache {
-    struct mp_image *i, *a;
-};
-
 struct part {
     int change_id;
-    int imgfmt;
-    enum mp_csp colorspace;
-    enum mp_csp_levels levels;
+    // Sub-bitmaps scaled to final sizes.
     int num_imgs;
-    struct sub_cache *imgs;
+    struct mp_image **imgs;
+};
+
+// Must be a power of 2. Height is 1, but mark_rect() effectively operates on
+// multiples of chroma sized macro-pixels. (E.g. 4:2:0 -> every second line is
+// the same as the previous one, and x0%2==x1%2==0.)
+#define SLICE_W 256u
+
+// Whether to scale in tiles. Faster, but can't use correct chroma position.
+// Should be a runtime option. SLICE_W is used as tile width. The tile size
+// should probably be small; too small or too big will cause overhead when
+// scaling.
+#define SCALE_IN_TILES 1
+#define TILE_H 4u
+
+struct slice {
+    uint16_t x0, x1;
 };
 
 struct mp_draw_sub_cache
 {
-    struct part *parts[MAX_OSD_PARTS];
-    struct mp_image *upsample_img;
-    struct mp_image upsample_temp;
-};
+    // Possibly cached parts. Also implies what's in the video_overlay.
+    struct part parts[MAX_OSD_PARTS];
+    int64_t change_id;
 
+    struct mp_image_params params;  // target image params
 
-static struct part *get_cache(struct mp_draw_sub_cache *cache,
-                              struct sub_bitmaps *sbs, struct mp_image *format);
-static bool get_sub_area(struct mp_rect bb, struct mp_image *temp,
-                         struct sub_bitmap *sb, struct mp_image *out_area,
-                         int *out_src_x, int *out_src_y);
+    int w, h;                       // like params.w/h, but rounded up to chroma
+    unsigned align_x, align_y;      // alignment for all video pixels
 
-#define CONDITIONAL 1
+    struct mp_image *rgba_overlay;  // all OSD in RGBA
+    struct mp_image *video_overlay; // rgba_overlay converted to video colorspace
+    struct mp_image *alpha_overlay; // alpha plane ref. to video_overlay
+    struct mp_image *calpha_overlay; // alpha_overlay scaled to chroma plane size
 
-#define BLEND_CONST_ALPHA(TYPE)                                                 \
-    TYPE *dst_r = dst_rp;                                                       \
-    for (int x = 0; x < w; x++) {                                               \
-        uint32_t srcap = srca_r[x];                                             \
-        if (CONDITIONAL && !srcap) continue;                                    \
-        srcap *= srcamul; /* now 0..65025 */                                    \
-        dst_r[x] = (srcp * srcap + dst_r[x] * (65025 - srcap) + 32512) / 65025; \
-    }
+    unsigned s_w;                   // number of slices per line
+    struct slice *slices;           // slices[y * s_w + x / SLICE_W]
+    bool any_osd;
+
+    struct mp_sws_context *rgba_to_overlay; // scaler for rgba -> video csp.
+    struct mp_sws_context *alpha_to_calpha; // scaler for overlay -> calpha
+    bool scale_in_tiles;
 
-// dst = srcp * (srca * srcamul) + dst * (1 - (srca * srcamul))
-static void blend_const_alpha(void *dst, int dst_stride, int srcp,
-                              uint8_t *srca, int srca_stride, uint8_t srcamul,
-                              int w, int h, int bytes)
+    struct mp_sws_context *sub_scale; // scaler for SUBBITMAP_RGBA
+
+    struct mp_repack *overlay_to_f32; // convert video_overlay to float
+    struct mp_image *overlay_tmp;   // slice in float32
+
+    struct mp_repack *calpha_to_f32; // convert calpha_overlay to float
+    struct mp_image *calpha_tmp;    // slice in float32
+
+    struct mp_repack *video_to_f32; // convert video to float
+    struct mp_repack *video_from_f32; // convert float back to video
+    struct mp_image *video_tmp;     // slice in float32
+
+    struct mp_sws_context *premul;  // video -> premultiplied video
+    struct mp_sws_context *unpremul; // reverse
+    struct mp_image *premul_tmp;
+
+    // Function that works on the _f32 data.
+    void (*blend_line)(void *dst, void *src, void *src_a, int w);
+};
+
+static void blend_line_f32(void *dst, void *src, void *src_a, int w) // blend w float samples of src onto dst, in place
 {
-    if (!srcamul)
-        return;
-    for (int y = 0; y < h; y++) {
-        void *dst_rp = (uint8_t *)dst + dst_stride * y;
-        uint8_t *srca_r = srca + srca_stride * y;
-        if (bytes == 2) {
-            BLEND_CONST_ALPHA(uint16_t)
-        } else if (bytes == 1) {
-            BLEND_CONST_ALPHA(uint8_t)
-        }
-    }
-}
+    float *dst_f = dst; // all three buffers are accessed as float arrays of length w
+    float *src_f = src;
+    float *src_a_f = src_a; // per-sample alpha belonging to src
 
-#define BLEND_SRC_ALPHA(TYPE)                                                   \
-    TYPE *dst_r = dst_rp, *src_r = src_rp;                                      \
-    for (int x = 0; x < w; x++) {                                               \
-        uint32_t srcap = srca_r[x];                                             \
-        if (CONDITIONAL && !srcap) continue;                                    \
-        dst_r[x] = (src_r[x] * srcap + dst_r[x] * (255 - srcap) + 127) / 255;   \
-    }
+    for (int x = 0; x < w; x++)
+        dst_f[x] = src_f[x] + dst_f[x] * (1.0f - src_a_f[x]); // src is added unscaled; only dst is weighted by (1 - alpha)
+}
 
-// dst = src * srca + dst * (1 - srca)
-static void blend_src_alpha(void *dst, int dst_stride, void *src,
-                            int src_stride, uint8_t *srca, int srca_stride,
-                            int w, int h, int bytes)
+static void blend_slice(struct mp_draw_sub_cache *p, int rgb_y) // blend overlay_tmp onto video_tmp; rgb_y is not read here
 {
-    for (int y = 0; y < h; y++) {
-        void *dst_rp = (uint8_t *)dst + dst_stride * y;
-        void *src_rp = (uint8_t *)src + src_stride * y;
-        uint8_t *srca_r = srca + srca_stride * y;
-        if (bytes == 2) {
-            BLEND_SRC_ALPHA(uint16_t)
-        } else if (bytes == 1) {
-            BLEND_SRC_ALPHA(uint8_t)
+    struct mp_image *ov = p->overlay_tmp;
+    struct mp_image *ca = p->calpha_tmp;
+    struct mp_image *vid = p->video_tmp;
+
+    for (int plane = 0; plane < vid->num_planes; plane++) {
+        int xs = vid->fmt.xs[plane];
+        int ys = vid->fmt.ys[plane];
+        int h = (1 << vid->fmt.chroma_ys) - (1 << ys) + 1; // 1 line if ys == chroma_ys, 1 << chroma_ys lines if ys == 0
+        int cw = mp_chroma_div_up(vid->w, xs); // number of samples per line in this plane
+        for (int y = 0; y < h; y++) {
+            p->blend_line(mp_image_pixel_ptr(vid, plane, 0, y),
+                          mp_image_pixel_ptr(ov, plane, 0, y),
+                          xs || ys ? mp_image_pixel_ptr(ca, 0, 0, y) // subsampled plane: alpha from calpha_tmp
+                            : mp_image_pixel_ptr(ov, ov->num_planes - 1, 0, y), // otherwise: last overlay plane
+                          cw);
         }
     }
 }
 
-#define BLEND_SRC_DST_MUL(TYPE, MAX)                                            \
-    TYPE *dst_r = dst_rp;                                                       \
-    for (int x = 0; x < w; x++) {                                               \
-        uint16_t srcp = src_r[x] * srcmul; /* now 0..65025 */                   \
-        dst_r[x] = (srcp * (MAX) + dst_r[x] * (65025 - srcp) + 32512) / 65025;  \
+static bool blend_overlay_with_video(struct mp_draw_sub_cache *p, // blend every marked slice into dst;
+                                     struct mp_image *dst) // returns false if the repackers cannot be set up
+{
+    if (!repack_config_buffers(p->video_to_f32, 0, p->video_tmp, 0, dst, NULL)) // dst -> video_tmp
+        return false;
+    if (!repack_config_buffers(p->video_from_f32, 0, dst, 0, p->video_tmp, NULL)) // video_tmp -> dst
+        return false;
+
+    int xs = dst->fmt.chroma_xs;
+    int ys = dst->fmt.chroma_ys;
+
+    for (int y = 0; y < dst->h; y += p->align_y) { // one step per align_y lines
+        struct slice *line = &p->slices[y * p->s_w];
+
+        for (int sx = 0; sx < p->s_w; sx++) {
+            struct slice *s = &line[sx];
+
+            int w = s->x1 - s->x0;
+            if (w <= 0) // nothing marked in this slice
+                continue;
+            int x = sx * SLICE_W + s->x0; // slice-local x0 -> image x
+
+            assert(MP_IS_ALIGNED(x, p->align_x));
+            assert(MP_IS_ALIGNED(w, p->align_x));
+            assert(x + w <= p->w);
+
+            repack_line(p->overlay_to_f32, 0, 0, x, y, w);
+            repack_line(p->video_to_f32, 0, 0, x, y, w);
+            if (p->calpha_to_f32) // optional; coordinates are shifted by the chroma shifts
+                repack_line(p->calpha_to_f32, 0, 0, x >> xs, y >> ys, w >> xs);
+
+            blend_slice(p, y);
+
+            repack_line(p->video_from_f32, x, y, 0, 0, w); // write the blended slice back
+        }
     }
 
-// dst = src * srcmul + dst * (1 - src * srcmul)
-static void blend_src_dst_mul(void *dst, int dst_stride,
-                              uint8_t *src, int src_stride, uint8_t srcmul,
-                              int w, int h, int dst_bytes)
+    return true;
+}
+
+static bool convert_overlay_part(struct mp_draw_sub_cache *p, // convert one rectangle of rgba_overlay into
+                                 int x0, int y0, int w, int h) // video_overlay (and calpha_overlay, if set)
 {
-    for (int y = 0; y < h; y++) {
-        void *dst_rp = (uint8_t *)dst + dst_stride * y;
-        uint8_t *src_r = (uint8_t *)src + src_stride * y;
-        if (dst_bytes == 2) {
-            BLEND_SRC_DST_MUL(uint16_t, 65025)
-        } else if (dst_bytes == 1) {
-            BLEND_SRC_DST_MUL(uint8_t, 255)
-        }
+    struct mp_image src = *p->rgba_overlay; // struct copies, so cropping leaves the originals untouched
+    struct mp_image dst = *p->video_overlay;
+
+    mp_image_crop(&src, x0, y0, x0 + w, y0 + h);
+    mp_image_crop(&dst, x0, y0, x0 + w, y0 + h);
+
+    if (mp_sws_scale(p->rgba_to_overlay, &dst, &src) < 0)
+        return false;
+
+    if (p->calpha_overlay) {
+        src = *p->alpha_overlay;
+        dst = *p->calpha_overlay;
+
+        int xs = p->video_overlay->fmt.chroma_xs;
+        int ys = p->video_overlay->fmt.chroma_ys;
+        mp_image_crop(&src, x0, y0, x0 + w, y0 + h);
+        mp_image_crop(&dst, x0 >> xs, y0 >> ys, (x0 + w) >> xs, (y0 + h) >> ys); // same rectangle, shifted by the chroma shifts
+
+        if (mp_sws_scale(p->alpha_to_calpha, &dst, &src) < 0)
+            return false;
     }
+
+    return true;
 }
 
-static void unpremultiply_and_split_BGR32(struct mp_image *img,
-                                          struct mp_image *alpha)
+static bool convert_to_video_overlay(struct mp_draw_sub_cache *p) // convert rgba_overlay; false if a conversion fails
 {
-    for (int y = 0; y < img->h; ++y) {
-        uint32_t *irow = (uint32_t *) &img->planes[0][img->stride[0] * y];
-        uint8_t *arow = &alpha->planes[0][alpha->stride[0] * y];
-        for (int x = 0; x < img->w; ++x) {
-            uint32_t pval = irow[x];
-            uint32_t aval = (pval >> 24);
-            uint32_t rval = (pval >> 16) & 0xFF;
-            uint32_t gval = (pval >> 8) & 0xFF;
-            uint32_t bval = pval & 0xFF;
-            // multiplied = separate * alpha / 255
-            // separate = rint(multiplied * 255 / alpha)
-            //          = floor(multiplied * 255 / alpha + 0.5)
-            //          = floor((multiplied * 255 + 0.5 * alpha) / alpha)
-            //          = floor((multiplied * 255 + floor(0.5 * alpha)) / alpha)
-            int div = (int) aval;
-            int add = div / 2;
-            if (aval) {
-                rval = MPMIN(255, (rval * 255 + add) / div);
-                gval = MPMIN(255, (gval * 255 + add) / div);
-                bval = MPMIN(255, (bval * 255 + add) / div);
-                irow[x] = bval + (gval << 8) + (rval << 16) + (aval << 24);
+    if (!p->video_overlay) // no separate video overlay: nothing to convert
+        return true;
+
+    if (p->scale_in_tiles) {
+        int t_h = p->rgba_overlay->h / TILE_H; // rows of tiles; NOTE(review): a remainder of h % TILE_H lines is not visited
+        for (int ty = 0; ty < t_h; ty++) {
+            for (int sx = 0; sx < p->s_w; sx++) {
+                struct slice *s = &p->slices[ty * TILE_H * p->s_w + sx]; // slice sx on the tile's first line
+                bool pixels_set = false;
+                for (int y = 0; y < TILE_H; y++) { // is anything marked on any line of this tile?
+                    if (s[0].x0 < s[0].x1) {
+                        pixels_set = true;
+                        break;
+                    }
+                    s += p->s_w; // same slice column, next line
+                }
+                if (!pixels_set)
+                    continue;
+                if (!convert_overlay_part(p, sx * SLICE_W, ty * TILE_H,
+                                          SLICE_W, TILE_H))
+                    return false;
             }
-            arow[x] = aval;
         }
+    } else {
+        if (!convert_overlay_part(p, 0, 0, p->rgba_overlay->w, p->rgba_overlay->h)) // whole image in one go
+            return false;
     }
+
+    return true;
 }
 
-// dst_format merely contains the target colorspace/format information
-static void scale_sb_rgba(struct sub_bitmap *sb, const struct mp_image *dst_format,
-                          struct mp_image **out_sbi, struct mp_image **out_sba)
+// Mark the pre-clipped rectangle [x0,x1) x [y0,y1) as possibly non-transparent.
+// It is first expanded to align_x/align_y, then merged into p->slices per line.
+static void mark_rect(struct mp_draw_sub_cache *p, int x0, int y0, int x1, int y1)
 {
-    struct mp_image sbisrc = {0};
-    mp_image_setfmt(&sbisrc, IMGFMT_BGR32);
-    mp_image_set_size(&sbisrc, sb->w, sb->h);
-    sbisrc.planes[0] = sb->bitmap;
-    sbisrc.stride[0] = sb->stride;
-    struct mp_image *sbisrc2 = mp_image_alloc(IMGFMT_BGR32, sb->dw, sb->dh);
-    struct mp_image *sba = mp_image_alloc(IMGFMT_Y8, sb->dw, sb->dh);
-    struct mp_image *sbi = mp_image_alloc(dst_format->imgfmt, sb->dw, sb->dh);
-    if (!sbisrc2 || !sba || !sbi) {
-        talloc_free(sbisrc2);
-        talloc_free(sba);
-        talloc_free(sbi);
-        return;
-    }
+    x0 = MP_ALIGN_DOWN(x0, p->align_x);
+    y0 = MP_ALIGN_DOWN(y0, p->align_y);
+    x1 = MP_ALIGN_UP(x1, p->align_x);
+    y1 = MP_ALIGN_UP(y1, p->align_y);
+
+    assert(x0 >= 0 && x0 <= x1 && x1 <= p->w);
+    assert(y0 >= 0 && y0 <= y1 && y1 <= p->h);
+    if (x0 >= x1 || y0 >= y1) return; // empty: (x1 - 1) below needs x1 > 0
+    int sx0 = x0 / SLICE_W;
+    int sx1 = (x1 - 1) / SLICE_W; // slice of the last pixel; x1 itself is exclusive
 
-    mp_image_swscale(sbisrc2, &sbisrc, SWS_BILINEAR);
-    unpremultiply_and_split_BGR32(sbisrc2, sba);
+    for (int y = y0; y < y1; y++) {
+        struct slice *line = &p->slices[y * p->s_w];
 
-    sbi->params.color = dst_format->params.color;
-    mp_image_swscale(sbi, sbisrc2, SWS_BILINEAR);
+        struct slice *s0 = &line[sx0];
+        struct slice *s1 = &line[sx1];
 
-    talloc_free(sbisrc2);
+        s0->x0 = MPMIN(s0->x0, x0 % SLICE_W);
+        s1->x1 = MPMAX(s1->x1, (x1 - 1) % SLICE_W + 1); // 1..SLICE_W, never wraps to 0
 
-    *out_sbi = sbi;
-    *out_sba = sba;
+        if (s0 != s1) {
+            s0->x1 = SLICE_W; // the first slice is covered up to its right edge
+            s1->x0 = 0;       // the last slice is covered from its left edge
+
+            for (int x = sx0 + 1; x < sx1; x++) { // slices in between are fully covered
+                struct slice *s = &line[x];
+                s->x0 = 0;
+                s->x1 = SLICE_W;
+            }
+        }
+
+        p->any_osd = true;
+    }
 }
 
-static void draw_rgba(struct mp_draw_sub_cache *cache, struct mp_rect bb,
-                      struct mp_image *temp, int bits,
-                      struct sub_bitmaps *sbs)
+static void draw_ass_rgba(uint8_t *dst, ptrdiff_t dst_stride, // blend a w*h 8-bit coverage bitmap (src),
+                          uint8_t *src, ptrdiff_t src_stride, // tinted with color, onto 32-bit pixels (dst);
+                          int w, int h, uint32_t color) // strides are in bytes
 {
-    struct part *part = get_cache(cache, sbs, temp);
-    assert(part);
+    const unsigned int r = (color >> 24) & 0xff;
+    const unsigned int g = (color >> 16) & 0xff;
+    const unsigned int b = (color >>  8) & 0xff;
+    const unsigned int a = 0xff - (color & 0xff); // low byte of color is inverted to get alpha
 
-    for (int i = 0; i < sbs->num_parts; ++i) {
-        struct sub_bitmap *sb = &sbs->parts[i];
+    for (int y = 0; y < h; y++) {
+        uint32_t *dstrow = (uint32_t *) dst;
+        for (int x = 0; x < w; x++) {
+            const unsigned int v = src[x]; // coverage of this pixel, 0..255
+            unsigned int aa = a * v; // effective alpha, 0..255*255
+            uint32_t dstpix = dstrow[x];
+            unsigned int dstb =  dstpix        & 0xFF;
+            unsigned int dstg = (dstpix >>  8) & 0xFF;
+            unsigned int dstr = (dstpix >> 16) & 0xFF;
+            unsigned int dsta = (dstpix >> 24) & 0xFF;
+            dstb = (v * b * a   + dstb * (255 * 255 - aa)) / (255 * 255); // colour scaled by aa, dst by (1 - aa)
+            dstg = (v * g * a   + dstg * (255 * 255 - aa)) / (255 * 255);
+            dstr = (v * r * a   + dstr * (255 * 255 - aa)) / (255 * 255);
+            dsta = (aa * 255    + dsta * (255 * 255 - aa)) / (255 * 255);
+            dstrow[x] = dstb | (dstg << 8) | (dstr << 16) | (dsta << 24);
+        }
+        dst += dst_stride;
+        src += src_stride;
+    }
+}
 
-        if (sb->w < 1 || sb->h < 1)
-            continue;
+static void render_ass(struct mp_draw_sub_cache *p, struct sub_bitmaps *sb) // draw all LIBASS parts into rgba_overlay
+{
+    assert(sb->format == SUBBITMAP_LIBASS);
 
-        struct mp_image dst;
-        int src_x, src_y;
-        if (!get_sub_area(bb, temp, sb, &dst, &src_x, &src_y))
-            continue;
+    for (int i = 0; i < sb->num_parts; i++) {
+        struct sub_bitmap *s = &sb->parts[i]; // NOTE(review): no clipping here; mark_rect() asserts the rect is inside p->w/h
 
-        struct mp_image *sbi = part->imgs[i].i;
-        struct mp_image *sba = part->imgs[i].a;
+        draw_ass_rgba(mp_image_pixel_ptr(p->rgba_overlay, 0, s->x, s->y),
+                      p->rgba_overlay->stride[0], s->bitmap, s->stride,
+                      s->w, s->h, s->libass.color);
 
-        if (!(sbi && sba))
-            scale_sb_rgba(sb, temp, &sbi, &sba);
-        // on OOM, skip drawing
-        if (!(sbi && sba))
-            continue;
+        mark_rect(p, s->x, s->y, s->x + s->w, s->y + s->h); // record the touched area in the slices
+    }
+}
 
-        int bytes = (bits + 7) / 8;
-        uint8_t *alpha_p = sba->planes[0] + src_y * sba->stride[0] + src_x;
-        for (int p = 0; p < (temp->num_planes > 2 ? 3 : 1); p++) {
-            void *src = sbi->planes[p] + src_y * sbi->stride[p] + src_x * bytes;
-            blend_src_alpha(dst.planes[p], dst.stride[p], src, sbi->stride[p],
-                            alpha_p, sba->stride[0], dst.w, dst.h, bytes);
-        }
-        if (temp->num_planes >= 4) {
-            blend_src_dst_mul(dst.planes[3], dst.stride[3], alpha_p,
-                              sba->stride[0], 255, dst.w, dst.h, bytes);
+static void draw_rgba(uint8_t *dst, ptrdiff_t dst_stride, // blend w*h 32-bit pixels of src over dst;
+                      uint8_t *src, ptrdiff_t src_stride, int w, int h) // strides are in bytes
+{
+    for (int y = 0; y < h; y++) {
+        uint32_t *srcrow = (uint32_t *)src;
+        uint32_t *dstrow = (uint32_t *)dst;
+        for (int x = 0; x < w; x++) {
+            uint32_t srcpix = srcrow[x];
+            uint32_t dstpix = dstrow[x];
+            unsigned int srcb =  srcpix        & 0xFF;
+            unsigned int srcg = (srcpix >>  8) & 0xFF;
+            unsigned int srcr = (srcpix >> 16) & 0xFF;
+            unsigned int srca = (srcpix >> 24) & 0xFF;
+            unsigned int dstb =  dstpix        & 0xFF;
+            unsigned int dstg = (dstpix >>  8) & 0xFF;
+            unsigned int dstr = (dstpix >> 16) & 0xFF;
+            unsigned int dsta = (dstpix >> 24) & 0xFF;
+            dstb = MPMIN(srcb + dstb * (255 - srca) / 255, 255); // src is added unscaled (premultiplied "over")
+            dstg = MPMIN(srcg + dstg * (255 - srca) / 255, 255); // srca is 0..255, so the weight is (255 - srca) / 255
+            dstr = MPMIN(srcr + dstr * (255 - srca) / 255, 255); // clamp: a colour > alpha must not spill into
+            dsta = MPMIN(srca + dsta * (255 - srca) / 255, 255); // the neighbouring channel when repacked
+            dstrow[x] = dstb | (dstg << 8) | (dstr << 16) | (dsta << 24);
         }
-
-        part->imgs[i].i = talloc_steal(part, sbi);
-        part->imgs[i].a = talloc_steal(part, sba);
+        dst += dst_stride;
+        src += src_stride;
     }
 }
 
-static void draw_ass(struct mp_draw_sub_cache *cache, struct mp_rect bb,
-                     struct mp_image *temp, int bits, struct sub_bitmaps *sbs)
+static bool render_rgba(struct mp_draw_sub_cache *p, struct part *part, // draw all RGBA parts into rgba_overlay;
+                        struct sub_bitmaps *sb) // part caches scaled bitmaps; false on alloc/scale failure
 {
-    struct mp_csp_params cspar = MP_CSP_PARAMS_DEFAULTS;
-    mp_csp_set_image_params(&cspar, &temp->params);
-    cspar.levels_out = MP_CSP_LEVELS_PC; // RGB (libass.color)
-    cspar.input_bits = bits;
-    cspar.texture_bits = (bits + 7) / 8 * 8;
-
-    struct mp_cmat yuv2rgb, rgb2yuv;
-    bool need_conv = temp->fmt.flags & MP_IMGFLAG_YUV;
-    if (need_conv) {
-        mp_get_csp_matrix(&cspar, &yuv2rgb);
-        mp_invert_cmat(&rgb2yuv, &yuv2rgb);
+    assert(sb->format == SUBBITMAP_RGBA);
+
+    if (part->change_id != sb->change_id) { // bitmaps changed: drop all cached scaled images
+        for (int n = 0; n < part->num_imgs; n++)
+            talloc_free(part->imgs[n]);
+        part->num_imgs = sb->num_parts;
+        MP_TARRAY_GROW(p, part->imgs, part->num_imgs);
+        for (int n = 0; n < part->num_imgs; n++)
+            part->imgs[n] = NULL;
+
+        part->change_id = sb->change_id;
     }
 
-    for (int i = 0; i < sbs->num_parts; ++i) {
-        struct sub_bitmap *sb = &sbs->parts[i];
+    for (int i = 0; i < sb->num_parts; i++) {
+        struct sub_bitmap *s = &sb->parts[i];
+
+        // Clipping is rare but necessary.
+        int sx0 = s->x;
+        int sy0 = s->y;
+        int sx1 = s->x + s->dw;
+        int sy1 = s->y + s->dh;
+
+        int x0 = MPCLAMP(sx0, 0, p->w);
+        int y0 = MPCLAMP(sy0, 0, p->h);
+        int x1 = MPCLAMP(sx1, 0, p->w);
+        int y1 = MPCLAMP(sy1, 0, p->h);
 
-        struct mp_image dst;
-        int src_x, src_y;
-        if (!get_sub_area(bb, temp, sb, &dst, &src_x, &src_y))
+        int dw = x1 - x0;
+        int dh = y1 - y0;
+        if (dw <= 0 || dh <= 0 || s->w < 1 || s->h < 1) // fully clipped, or empty source bitmap
             continue;
 
-        int r = (sb->libass.color >> 24) & 0xFF;
-        int g = (sb->libass.color >> 16) & 0xFF;
-        int b = (sb->libass.color >> 8) & 0xFF;
-        int a = 255 - (sb->libass.color & 0xFF);
-        int color_yuv[3];
-        if (need_conv) {
-            int rgb[3] = {r, g, b};
-            mp_map_fixp_color(&rgb2yuv, 8, rgb, cspar.texture_bits, color_yuv);
-        } else {
-            const int shift = (bits > 8) ? bits - 8 : 0;
-            color_yuv[0] = g << shift;
-            color_yuv[1] = b << shift;
-            color_yuv[2] = r << shift;
+        // We clip the source instead of the scaled image, because that avoids
+        // excessive memory usage when applying a ridiculous scale factor, even
+        // if it distorts the result by up to 1 pixel due to integer rounding.
+        int sx = 0;
+        int sy = 0;
+        int sw = s->w;
+        int sh = s->h;
+        if (x0 != sx0 || y0 != sy0 || x1 != sx1 || y1 != sy1) {
+            double fx = s->dw / (double)s->w; // scale factors source -> display
+            double fy = s->dh / (double)s->h;
+            sx = MPCLAMP((x0 - sx0) / fx, 0, s->w);
+            sy = MPCLAMP((y0 - sy0) / fy, 0, s->h);
+            sw = MPCLAMP(dw / fx, 1, s->w);
+            sh = MPCLAMP(dh / fy, 1, s->h);
         }
 
-        int bytes = (bits + 7) / 8;
-        uint8_t *alpha_p = (uint8_t *)sb->bitmap + src_y * sb->stride + src_x;
-        for (int p = 0; p < (temp->num_planes > 2 ? 3 : 1); p++) {
-            blend_const_alpha(dst.planes[p], dst.stride[p], color_yuv[p],
-                              alpha_p, sb->stride, a, dst.w, dst.h, bytes);
-        }
-        if (temp->num_planes >= 4) {
-            blend_src_dst_mul(dst.planes[3], dst.stride[3], alpha_p,
-                              sb->stride, a, dst.w, dst.h, bytes);
+        assert(sx >= 0 && sw > 0 && sx + sw <= s->w);
+        assert(sy >= 0 && sh > 0 && sy + sh <= s->h);
+
+        ptrdiff_t s_stride = s->stride;
+        void *s_ptr = (char *)s->bitmap + s_stride * sy + sx * 4; // 4 bytes per source pixel
+
+        if (dw != sw || dh != sh) { // sizes differ: scale (or reuse the cached scaled image)
+            struct mp_image *scaled = part->imgs[i];
+
+            if (!scaled) {
+                struct mp_image src_img = {0};
+                mp_image_setfmt(&src_img, IMGFMT_BGR32);
+                mp_image_set_size(&src_img, sw, sh);
+                src_img.planes[0] = s_ptr;
+                src_img.stride[0] = s_stride;
+                src_img.params.alpha = MP_ALPHA_PREMUL;
+
+                scaled = mp_image_alloc(IMGFMT_BGR32, dw, dh);
+                if (!scaled)
+                    return false;
+                mp_image_copy_attributes(scaled, &src_img);
+                if (mp_sws_scale(p->sub_scale, scaled, &src_img) < 0) {
+                    talloc_free(scaled); // never cache an image that was not fully written
+                    return false;
+                }
+                part->imgs[i] = talloc_steal(p, scaled); // cache only after a successful scale
+            }
+            assert(scaled->w == dw);
+            assert(scaled->h == dh);
+
+            s_stride = scaled->stride[0];
+            s_ptr = scaled->planes[0];
         }
+
+        draw_rgba(mp_image_pixel_ptr(p->rgba_overlay, 0, x0, y0),
+                  p->rgba_overlay->stride[0], s_ptr, s_stride, dw, dh);
+
+        mark_rect(p, x0, y0, x1, y1);
     }
+
+    return true;
 }
 
-static void get_swscale_alignment(const struct mp_image *img, int *out_xstep,
-                                  int *out_ystep)
+static bool render_sb(struct mp_draw_sub_cache *p, struct sub_bitmaps *sb) // dispatch on sb->format; false on failure or unknown format
 {
-    int sx = (1 << img->fmt.chroma_xs);
-    int sy = (1 << img->fmt.chroma_ys);
-
-    for (int p = 0; p < img->num_planes; ++p) {
-        int bits = img->fmt.bpp[p];
-        // the * 2 fixes problems with writing past the destination width
-        while (((sx >> img->fmt.chroma_xs) * bits) % (SWS_MIN_BYTE_ALIGN * 8 * 2))
-            sx *= 2;
+    struct part *part = &p->parts[sb->render_index]; // per-render-index cache; only passed to render_rgba()
+
+    switch (sb->format) {
+    case SUBBITMAP_LIBASS:
+        render_ass(p, sb); // returns nothing, so this path always reports success
+        return true;
+    case SUBBITMAP_RGBA:
+        return render_rgba(p, part, sb);
     }
 
-    *out_xstep = sx;
-    *out_ystep = sy;
+    return false; // any other format is not handled
 }
 
-static void align_bbox(int xstep, int ystep, struct mp_rect *rc)
+static void clear_rgba_overlay(struct mp_draw_sub_cache *p) // zero all marked pixels of rgba_overlay and reset the slices
 {
-    rc->x0 = rc->x0 & ~(xstep - 1);
-    rc->y0 = rc->y0 & ~(ystep - 1);
-    rc->x1 = FFALIGN(rc->x1, xstep);
-    rc->y1 = FFALIGN(rc->y1, ystep);
-}
+    assert(p->rgba_overlay->imgfmt == IMGFMT_BGR32); // the memset below relies on 4 bytes per pixel
 
-// Post condition, if true returned: rc is inside img
-static bool align_bbox_for_swscale(struct mp_image *img, struct mp_rect *rc)
-{
-    struct mp_rect img_rect = {0, 0, img->w, img->h};
-    // Get rid of negative coordinates
-    if (!mp_rect_intersection(rc, &img_rect))
-        return false;
-    int xstep, ystep;
-    get_swscale_alignment(img, &xstep, &ystep);
-    align_bbox(xstep, ystep, rc);
-    return mp_rect_intersection(rc, &img_rect);
-}
+    for (int y = 0; y < p->rgba_overlay->h; y++) {
+        uint32_t *px = mp_image_pixel_ptr(p->rgba_overlay, 0, 0, y); // first pixel of the current slice
+        struct slice *line = &p->slices[y * p->s_w];
 
-// Try to find best/closest YUV 444 format (or similar) for imgfmt
-static void get_closest_y444_format(int imgfmt, int *out_format, int *out_bits)
-{
-    struct mp_imgfmt_desc desc = mp_imgfmt_get_desc(imgfmt);
-    int planes = desc.flags & MP_IMGFLAG_ALPHA ? 4 : 3;
-    if (desc.flags & MP_IMGFLAG_RGB) {
-        // For RGB try to match the amount of bits exactly (but no less than 8, or larger than 16)
-        int bits = (desc.component_bits > 8) ? desc.component_bits : 8;
-        if (bits > 16)
-            bits = 16;
-        *out_format = mp_imgfmt_find(0, 0, planes, bits, MP_IMGFLAG_RGB_P);
-        if (!mp_sws_supported_format(*out_format))
-            *out_format = mp_imgfmt_find(0, 0, planes, 8, MP_IMGFLAG_RGB_P);
-    } else if (desc.flags & MP_IMGFLAG_YUV_P) {
-        const int bits = (desc.component_bits > 8) ? 16 : 8;
-        *out_format = mp_imgfmt_find(0, 0, planes, bits, MP_IMGFLAG_YUV_P);
-    } else {
-        *out_format = 0;
-    }
-    if (!mp_sws_supported_format(*out_format))
-        *out_format = IMGFMT_444P; // generic fallback
-    *out_bits = mp_imgfmt_get_desc(*out_format).component_bits;
-}
+        for (int sx = 0; sx < p->s_w; sx++) {
+            struct slice *s = &line[sx];
 
-static struct part *get_cache(struct mp_draw_sub_cache *cache,
-                              struct sub_bitmaps *sbs, struct mp_image *format)
-{
-    struct part *part = NULL;
-
-    bool use_cache = sbs->format == SUBBITMAP_RGBA;
-    if (use_cache) {
-        part = cache->parts[sbs->render_index];
-        if (part) {
-            if (part->change_id != sbs->change_id
-                || part->imgfmt != format->imgfmt
-                || part->colorspace != format->params.color.space
-                || part->levels != format->params.color.levels)
-            {
-                talloc_free(part);
-                part = NULL;
+            if (s->x0 <= s->x1) { // false for slices already in the reset state {SLICE_W, 0}
+                memset(px + s->x0, 0, (s->x1 - s->x0) * 4); // clear only the marked span
+                *s = (struct slice){SLICE_W, 0}; // x0 > x1 encodes "nothing marked"
             }
+
+            px += SLICE_W; // advance to the next slice on this line
         }
-        if (!part) {
-            part = talloc(cache, struct part);
-            *part = (struct part) {
-                .change_id = sbs->change_id,
-                .num_imgs = sbs->num_parts,
-                .imgfmt = format->imgfmt,
-                .levels = format->params.color.levels,
-                .colorspace = format->params.color.space,
-            };
-            part->imgs = talloc_zero_array(part, struct sub_cache,
-                                           part->num_imgs);
-        }
-        assert(part->num_imgs == sbs->num_parts);
-        cache->parts[sbs->render_index] = part;
     }
 
-    return part;
+    p->any_osd = false;
 }
 
-// Return area of intersection between target and sub-bitmap as cropped image
-static bool get_sub_area(struct mp_rect bb, struct mp_image *temp,
-                         struct sub_bitmap *sb, struct mp_image *out_area,
-                         int *out_src_x, int *out_src_y)
+static bool reinit(struct mp_draw_sub_cache *p, struct mp_image_params *params)
 {
-    // coordinates are relative to the bbox
-    struct mp_rect dst = {sb->x - bb.x0, sb->y - bb.y0};
-    dst.x1 = dst.x0 + sb->dw;
-    dst.y1 = dst.y0 + sb->dh;
-    if (!mp_rect_intersection(&dst, &(struct mp_rect){0, 0, temp->w, temp->h}))
+    talloc_free_children(p);
+    *p = (struct mp_draw_sub_cache){.params = *params};
+
+    bool need_premul = params->alpha != MP_ALPHA_PREMUL &&
+        (mp_imgfmt_get_desc(params->imgfmt).flags & MP_IMGFLAG_ALPHA);
+
+    int rflags = REPACK_CREATE_EXPAND_8BIT | REPACK_CREATE_PLANAR_F32;
+    p->blend_line = blend_line_f32;
+
+    p->video_to_f32 = mp_repack_create_planar(params->imgfmt, false, rflags);
+    talloc_steal(p, p->video_to_f32);
+    if (!p->video_to_f32)
         return false;
 
-    *out_src_x = (dst.x0 - sb->x) + bb.x0;
-    *out_src_y = (dst.y0 - sb->y) + bb.y0;
-    *out_area = *temp;
-    mp_image_crop_rc(out_area, dst);
+    p->scale_in_tiles = SCALE_IN_TILES;
 
-    return true;
-}
+    int vid_f32_fmt = mp_repack_get_format_dst(p->video_to_f32);
 
-// Convert the src image to imgfmt (which should be a 444 format)
-static struct mp_image *chroma_up(struct mp_draw_sub_cache *cache, int imgfmt,
-                                  struct mp_image *src)
-{
-    if (src->imgfmt == imgfmt)
-        return src;
+    p->video_from_f32 = mp_repack_create_planar(params->imgfmt, true, rflags);
+    talloc_steal(p, p->video_from_f32);
+    if (!p->video_from_f32)
+        return false;
 
-    if (!cache->upsample_img || cache->upsample_img->imgfmt != imgfmt ||
-        cache->upsample_img->w < src->w || cache->upsample_img->h < src->h)
-    {
-        talloc_free(cache->upsample_img);
-        cache->upsample_img = mp_image_alloc(imgfmt, src->w, src->h);
-        talloc_steal(cache, cache->upsample_img);
-        if (!cache->upsample_img)
-            return NULL;
+    assert(mp_repack_get_format_dst(p->video_to_f32) ==
+           mp_repack_get_format_src(p->video_from_f32));
+
+    // Find a reasonable intermediate format for video_overlay. Requirements:
+    //  - same subsampling
+    //  - has alpha
+    //  - uses video colorspace
+    //  - REPACK_CREATE_PLANAR_F32 support
+    //  - probably not using float (vaguely wastes memory)
+    struct mp_regular_imgfmt vfdesc = {0};
+    mp_get_regular_imgfmt(&vfdesc, mp_repack_get_format_dst(p->video_to_f32));
+    assert(vfdesc.component_type == MP_COMPONENT_TYPE_FLOAT);
+
+    int overlay_fmt = 0;
+    if (params->color.space == MP_CSP_RGB && vfdesc.num_planes >= 3) {
+        // No point in doing anything fancy.
+        overlay_fmt = IMGFMT_BGR32;
+        p->scale_in_tiles = false;
+    } else {
+        struct mp_regular_imgfmt odesc = vfdesc;
+        // Just use 8 bit as well (should be fine, may use less memory).
+        odesc.component_type = MP_COMPONENT_TYPE_UINT;
+        odesc.component_size = 1;
+        odesc.component_pad = 0;
+
+        // Ensure there's alpha.
+        if (odesc.planes[odesc.num_planes - 1].components[0] != 4) {
+            if (odesc.num_planes >= 4)
+                return false; // wat
+            odesc.planes[odesc.num_planes++] =
+                (struct mp_regular_imgfmt_plane){1, {4}};
+        }
+
+        overlay_fmt = mp_find_regular_imgfmt(&odesc);
+        p->scale_in_tiles = odesc.chroma_xs || odesc.chroma_ys;
     }
+    if (!overlay_fmt)
+        return false;
 
-    cache->upsample_temp = *cache->upsample_img;
-    struct mp_image *temp = &cache->upsample_temp;
-    mp_image_set_size(temp, src->w, src->h);
-
-    // The temp image is always YUV, but src not necessarily.
-    // Reduce amount of conversions in YUV case (upsampling/shifting only)
-    if (src->fmt.flags & MP_IMGFLAG_YUV)
-        temp->params.color = src->params.color;
-
-    if (src->imgfmt == IMGFMT_420P) {
-        assert(imgfmt == IMGFMT_444P);
-        // Faster upsampling: keep Y plane, upsample chroma planes only
-        // The whole point is not having swscale copy the Y plane
-        struct mp_image t_dst = *temp;
-        mp_image_setfmt(&t_dst, IMGFMT_Y8);
-        mp_image_set_size(&t_dst, temp->w, temp->h);
-        struct mp_image t_src = t_dst;
-        mp_image_set_size(&t_src, src->w >> 1, src->h >> 1);
-        for (int c = 0; c < 2; c++) {
-            t_dst.planes[0] = temp->planes[1 + c];
-            t_dst.stride[0] = temp->stride[1 + c];
-            t_src.planes[0] = src->planes[1 + c];
-            t_src.stride[0] = src->stride[1 + c];
-            mp_image_swscale(&t_dst, &t_src, SWS_POINT);
-        }
-        temp->planes[0] = src->planes[0];
-        temp->stride[0] = src->stride[0];
-    } else {
-        mp_image_swscale(temp, src, SWS_POINT);
+    p->overlay_to_f32 = mp_repack_create_planar(overlay_fmt, false, rflags);
+    talloc_steal(p, p->overlay_to_f32);
+    if (!p->overlay_to_f32)
+        return false;
+
+    int render_fmt = mp_repack_get_format_dst(p->overlay_to_f32);
+
+    struct mp_regular_imgfmt ofdesc = {0};
+    mp_get_regular_imgfmt(&ofdesc, render_fmt);
+
+    if (ofdesc.planes[ofdesc.num_planes - 1].components[0] != 4)
+        return false;
+
+    // The formats must be the same, minus possible lack of alpha in vfdesc.
+    if (ofdesc.num_planes != vfdesc.num_planes &&
+        ofdesc.num_planes - 1 != vfdesc.num_planes)
+        return false;
+    for (int n = 0; n < vfdesc.num_planes; n++) {
+        if (vfdesc.planes[n].components[0] != ofdesc.planes[n].components[0])
+            return false;
     }
 
-    return temp;
-}
+    p->align_x = mp_repack_get_align_x(p->video_to_f32);
+    p->align_y = mp_repack_get_align_y(p->video_to_f32);
 
-// Undo chroma_up() (copy temp to old_src if needed)
-static void chroma_down(struct mp_image *old_src, struct mp_image *temp)
-{
-    assert(old_src->w == temp->w && old_src->h == temp->h);
-    if (temp != old_src) {
-        if (old_src->imgfmt == IMGFMT_420P) {
-            // Downsampling, skipping the Y plane (see chroma_up())
-            assert(temp->imgfmt == IMGFMT_444P);
-            assert(temp->planes[0] == old_src->planes[0]);
-            struct mp_image t_dst = *temp;
-            mp_image_setfmt(&t_dst, IMGFMT_Y8);
-            mp_image_set_size(&t_dst, old_src->w >> 1, old_src->h >> 1);
-            struct mp_image t_src = t_dst;
-            mp_image_set_size(&t_src, temp->w, temp->h);
-            for (int c = 0; c < 2; c++) {
-                t_dst.planes[0] = old_src->planes[1 + c];
-                t_dst.stride[0] = old_src->stride[1 + c];
-                t_src.