From 18d4eebedb7f80493da87f8506ff0b2db796510a Mon Sep 17 00:00:00 2001
From: wm4 <wm4@nowhere>
Date: Thu, 25 Oct 2012 19:37:43 +0200
Subject: draw_bmp: cosmetics, refactor

Mostly pedantic bikeshedding issues.

Move some code around, so that the sub_bitmap_to_mp_images() function
can be split into two parts. This is better than having a big function
with many inputs and outputs, of which only half are used in each code
path.

Also, try to make code simpler by using an mp_rect type.
---
 sub/draw_bmp.c | 738 +++++++++++++++++++++++++--------------------------------
 sub/sub.c      |  24 +-
 sub/sub.h      |   4 +-
 3 files changed, 332 insertions(+), 434 deletions(-)

diff --git a/sub/draw_bmp.c b/sub/draw_bmp.c
index f191389785..120e581af8 100644
--- a/sub/draw_bmp.c
+++ b/sub/draw_bmp.c
@@ -16,13 +16,16 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
-#include "sub/draw_bmp.h"
-
 #include <stddef.h>
 #include <stdbool.h>
 #include <assert.h>
 #include <math.h>
+#include <inttypes.h>
+
+#include <libavutil/common.h>
 
+#include "mpcommon.h"
+#include "sub/draw_bmp.h"
 #include "sub/sub.h"
 #include "libmpcodecs/mp_image.h"
 #include "libmpcodecs/sws_utils.h"
@@ -49,201 +52,142 @@ struct mp_draw_sub_cache
     struct part *parts[MAX_OSD_PARTS];
 };
 
+static struct part *get_cache(struct mp_draw_sub_cache **cache,
+                              struct sub_bitmaps *sbs);
+static bool get_sub_area(struct mp_rect bb, struct mp_image *temp,
+                         struct sub_bitmap *sb, struct mp_image *out_area,
+                         int *out_src_x, int *out_src_y);
+
 #define ACCURATE
 #define CONDITIONAL
 
-static void blend_const16_alpha(uint8_t *dst,
-                                ssize_t dstRowStride,
-                                uint16_t srcp,
-                                const uint8_t *srca,
-                                ssize_t srcaRowStride,
-                                uint8_t srcamul, int rows,
-                                int cols)
+static void blend_const16_alpha(void *dst, int dst_stride, uint16_t srcp,
+                                uint8_t *srca, int srca_stride, uint8_t srcamul,
+                                int w, int h)
 {
-    int i, j;
-#ifdef CONDITIONAL
     if (!srcamul)
         return;
-#endif
-    for (i = 0; i < rows; ++i) {
-        uint16_t *dstr = (uint16_t *) (dst + dstRowStride * i);
-        const uint8_t *srcar = srca + srcaRowStride * i;
-        for (j = 0; j < cols; ++j) {
-            uint32_t srcap = srcar[j];
-                // 32bit to force the math ops to operate on 32 bit
+    for (int y = 0; y < h; y++) {
+        uint16_t *dst_r = (uint16_t *)((uint8_t *)dst + dst_stride * y);
+        uint8_t *srca_r = srca + srca_stride * y;
+        for (int x = 0; x < w; x++) {
+            uint32_t srcap = srca_r[x];
 #ifdef CONDITIONAL
             if (!srcap)
                 continue;
 #endif
-            uint16_t dstp = dstr[j];
             srcap *= srcamul; // now 0..65025
-            uint16_t outp =
-                (srcp * srcap + dstp * (65025 - srcap) + 32512) / 65025;
-            dstr[j] = outp;
+            dst_r[x] = (srcp * srcap + dst_r[x] * (65025 - srcap) + 32512) / 65025;
         }
     }
 }
 
-static void blend_src16_alpha(uint8_t *dst,
-                              ssize_t dstRowStride,
-                              const uint8_t *src,
-                              ssize_t srcRowStride,
-                              const uint8_t *srca,
-                              ssize_t srcaRowStride,
-                              int rows,
-                              int cols)
+static void blend_const8_alpha(void *dst, int dst_stride, uint16_t srcp,
+                               uint8_t *srca, int srca_stride, uint8_t srcamul,
+                               int w, int h)
 {
-    int i, j;
-    for (i = 0; i < rows; ++i) {
-        uint16_t *dstr = (uint16_t *) (dst + dstRowStride * i);
-        const uint16_t *srcr = (const uint16_t *) (src + srcRowStride * i);
-        const uint8_t *srcar = srca + srcaRowStride * i;
-        for (j = 0; j < cols; ++j) {
-            uint32_t srcap = srcar[j];
-                // 32bit to force the math ops to operate on 32 bit
+    if (!srcamul)
+        return;
+    for (int y = 0; y < h; y++) {
+        uint8_t *dst_r = (uint8_t *)dst + dst_stride * y;
+        uint8_t *srca_r = srca + srca_stride * y;
+        for (int x = 0; x < w; x++) {
+            uint32_t srcap = srca_r[x];
 #ifdef CONDITIONAL
             if (!srcap)
                 continue;
 #endif
-            uint16_t srcp = srcr[j];
-            uint16_t dstp = dstr[j];
-            uint16_t outp =
-                (srcp * srcap + dstp * (255 - srcap) + 127) / 255;
-            dstr[j] = outp;
+#ifdef ACCURATE
+            srcap *= srcamul; // now 0..65025
+            dst_r[x] = (srcp * srcap + dst_r[x] * (65025 - srcap) + 32512) / 65025;
+#else
+            srcap = (srcap * srcamul + 255) >> 8;
+            dst_r[x] = (srcp * srcap + dst_r[x] * (255 - srcap) + 255) >> 8;
+#endif
         }
     }
 }
 
-static void blend_const8_alpha(uint8_t *dst,
-                               ssize_t dstRowStride,
-                               uint16_t srcp,
-                               const uint8_t *srca,
-                               ssize_t srcaRowStride,
-                               uint8_t srcamul, int rows,
-                               int cols)
+static void blend_const_alpha(void *dst, int dst_stride, int srcp,
+                              uint8_t *srca, int srca_stride, uint8_t srcamul,
+                              int w, int h, int bytes)
 {
-    int i, j;
-#ifdef CONDITIONAL
-    if (!srcamul)
-        return;
-#endif
-    for (i = 0; i < rows; ++i) {
-        uint8_t *dstr = dst + dstRowStride * i;
-        const uint8_t *srcar = srca + srcaRowStride * i;
-        for (j = 0; j < cols; ++j) {
-            uint32_t srcap = srcar[j];
-                // 32bit to force the math ops to operate on 32 bit
+    if (bytes == 2) {
+        blend_const16_alpha(dst, dst_stride, srcp, srca, srca_stride, srcamul,
+                            w, h);
+    } else if (bytes == 1) {
+        blend_const8_alpha(dst, dst_stride, srcp, srca, srca_stride, srcamul,
+                           w, h);
+    }
+}
+
+static void blend_src16_alpha(void *dst, int dst_stride, void *src,
+                              int src_stride, uint8_t *srca, int srca_stride,
+                              int w, int h)
+{
+    for (int y = 0; y < h; y++) {
+        uint16_t *dst_r = (uint16_t *)((uint8_t *)dst + dst_stride * y);
+        uint16_t *src_r = (uint16_t *)((uint8_t *)src + src_stride * y);
+        uint8_t *srca_r = srca + srca_stride * y;
+        for (int x = 0; x < w; x++) {
+            uint32_t srcap = srca_r[x];
 #ifdef CONDITIONAL
             if (!srcap)
                 continue;
 #endif
-            uint8_t dstp = dstr[j];
-#ifdef ACCURATE
-            srcap *= srcamul; // now 0..65025
-            uint8_t outp =
-                (srcp * srcap + dstp * (65025 - srcap) + 32512) / 65025;
-            dstr[j] = outp;
-#else
-            srcap = (srcap * srcamul + 255) >> 8;
-            uint8_t outp =
-                (srcp * srcap + dstp * (255 - srcap) + 255) >> 8;
-            dstr[j] = outp;
-#endif
+            dst_r[x] = (src_r[x] * srcap + dst_r[x] * (255 - srcap) + 127) / 255;
         }
     }
 }
 
-static void blend_src8_alpha(uint8_t *dst,
-                             ssize_t dstRowStride,
-                             const uint8_t *src,
-                             ssize_t srcRowStride,
-                             const uint8_t *srca,
-                             ssize_t srcaRowStride,
-                             int rows,
-                             int cols)
+static void blend_src8_alpha(void *dst, int dst_stride, void *src,
+                             int src_stride, uint8_t *srca, int srca_stride,
+                             int w, int h)
 {
-    int i, j;
-    for (i = 0; i < rows; ++i) {
-        uint8_t *dstr = dst + dstRowStride * i;
-        const uint8_t *srcr = src + srcRowStride * i;
-        const uint8_t *srcar = srca + srcaRowStride * i;
-        for (j = 0; j < cols; ++j) {
-            uint16_t srcap = srcar[j];
-                // 16bit to force the math ops to operate on 16 bit
+    for (int y = 0; y < h; y++) {
+        uint8_t *dst_r = (uint8_t *)dst + dst_stride * y;
+        uint8_t *src_r = (uint8_t *)src + src_stride * y;
+        uint8_t *srca_r = srca + srca_stride * y;
+        for (int x = 0; x < w; x++) {
+            uint16_t srcap = srca_r[x];
 #ifdef CONDITIONAL
             if (!srcap)
                 continue;
 #endif
-            uint8_t srcp = srcr[j];
-            uint8_t dstp = dstr[j];
 #ifdef ACCURATE
-            uint8_t outp =
-                (srcp * srcap + dstp * (255 - srcap) + 127) / 255;
+            dst_r[x] = (src_r[x] * srcap + dst_r[x] * (255 - srcap) + 127) / 255;
 #else
-            uint8_t outp =
-                (srcp * srcap + dstp * (255 - srcap) + 255) >> 8;
+            dst_r[x] = (src_r[x] * srcap + dst_r[x] * (255 - srcap) + 255) >> 8;
 #endif
-            dstr[j] = outp;
         }
     }
 }
 
-static void blend_src_alpha(uint8_t *dst, ssize_t dstRowStride,
-                            const uint8_t *src, ssize_t srcRowStride,
-                            const uint8_t *srca, ssize_t srcaRowStride,
-                            int rows, int cols, int bytes)
-{
-    if (bytes == 2) {
-        blend_src16_alpha(dst, dstRowStride, src,
-                          srcRowStride, srca,
-                          srcaRowStride, rows, cols);
-    } else if (bytes == 1) {
-        blend_src8_alpha(dst, dstRowStride, src,
-                         srcRowStride, srca,
-                         srcaRowStride, rows, cols);
-    }
-}
-
-static void blend_const_alpha(uint8_t *dst, ssize_t dstRowStride,
-                              uint16_t srcp,
-                              const uint8_t *srca, ssize_t srcaRowStride,
-                              uint8_t srcamul,
-                              int rows, int cols, int bytes)
+static void blend_src_alpha(void *dst, int dst_stride, void *src,
+                            int src_stride, uint8_t *srca, int srca_stride,
+                            int w, int h, int bytes)
 {
     if (bytes == 2) {
-        blend_const16_alpha(dst, dstRowStride, srcp,
-                            srca, srcaRowStride,
-                            srcamul, rows,
-                            cols);
+        blend_src16_alpha(dst, dst_stride, src, src_stride, srca, srca_stride,
+                          w, h);
     } else if (bytes == 1) {
-        blend_const8_alpha(dst, dstRowStride, srcp,
-                           srca, srcaRowStride, srcamul,
-                           rows,
-                           cols);
+        blend_src8_alpha(dst, dst_stride, src, src_stride, srca, srca_stride,
+                         w, h);
     }
 }
 
-static inline int min(int x, int y)
-{
-    if (x < y)
-        return x;
-    else
-        return y;
-}
-
-static void unpremultiply_and_split_IMGFMT_BGR32(mp_image_t *img,
-        mp_image_t *alpha)
+static void unpremultiply_and_split_BGR32(struct mp_image *img,
+                                          struct mp_image *alpha)
 {
-    int x, y;
-    for (y = 0; y < img->h; ++y) {
+    for (int y = 0; y < img->h; ++y) {
         uint32_t *irow = (uint32_t *) &img->planes[0][img->stride[0] * y];
-        unsigned char *arow = &alpha->planes[0][alpha->stride[0] * y];
-        for (x = 0; x < img->w; ++x) {
+        uint8_t *arow = &alpha->planes[0][alpha->stride[0] * y];
+        for (int x = 0; x < img->w; ++x) {
             uint32_t pval = irow[x];
-            unsigned char aval = (pval >> 24);
-            unsigned char rval = (pval >> 16) & 0xFF;
-            unsigned char gval = (pval >> 8) & 0xFF;
-            unsigned char bval = pval & 0xFF;
+            uint8_t aval = (pval >> 24);
+            uint8_t rval = (pval >> 16) & 0xFF;
+            uint8_t gval = (pval >> 8) & 0xFF;
+            uint8_t bval = pval & 0xFF;
             // multiplied = separate * alpha / 255
             // separate = rint(multiplied * 255 / alpha)
             //          = floor(multiplied * 255 / alpha + 0.5)
@@ -252,9 +196,9 @@ static void unpremultiply_and_split_IMGFMT_BGR32(mp_image_t *img,
             int div = (int) aval;
             int add = div / 2;
             if (aval) {
-                rval = min(255, (rval * 255 + add) / div);
-                gval = min(255, (gval * 255 + add) / div);
-                bval = min(255, (bval * 255 + add) / div);
+                rval = FFMIN(255, (rval * 255 + add) / div);
+                gval = FFMIN(255, (gval * 255 + add) / div);
+                bval = FFMIN(255, (bval * 255 + add) / div);
                 irow[x] = bval + (gval << 8) + (rval << 16) + (aval << 24);
             }
             arow[x] = aval;
@@ -262,55 +206,106 @@ static void unpremultiply_and_split_IMGFMT_BGR32(mp_image_t *img,
     }
 }
 
-static bool sub_bitmap_to_mp_images(struct mp_image **sbi, int *color_yuv,
-                                    int *color_a, struct mp_image **sba,
-                                    struct sub_bitmap *sb,
-                                    int format, struct mp_csp_details *csp,
-                                    float rgb2yuv[3][4], int imgfmt, int bits)
+static void scale_sb_rgba(struct sub_bitmap *sb, struct mp_csp_details *csp,
+                          int imgfmt, struct mp_image **out_sbi,
+                          struct mp_image **out_sba)
+{
+    struct mp_image *sbisrc = new_mp_image(sb->w, sb->h);
+    mp_image_setfmt(sbisrc, IMGFMT_BGR32);
+    sbisrc->planes[0] = sb->bitmap;
+    sbisrc->stride[0] = sb->stride;
+    struct mp_image *sbisrc2 = alloc_mpi(sb->dw, sb->dh, IMGFMT_BGR32);
+    mp_image_swscale(sbisrc2, sbisrc, csp, SWS_BILINEAR);
+
+    struct mp_image *sba = alloc_mpi(sb->dw, sb->dh, IMGFMT_Y8);
+    unpremultiply_and_split_BGR32(sbisrc2, sba);
+
+    struct mp_image *sbi = alloc_mpi(sb->dw, sb->dh, imgfmt);
+    mp_image_swscale(sbi, sbisrc2, csp, SWS_BILINEAR);
+
+    free_mp_image(sbisrc);
+    free_mp_image(sbisrc2);
+
+    *out_sbi = sbi;
+    *out_sba = sba;
+}
+
+static void draw_rgba(struct mp_draw_sub_cache **cache, struct mp_rect bb,
+                      struct mp_image *temp, int bits, struct mp_csp_details *csp,
+                      struct sub_bitmaps *sbs)
+{
+    struct part *part = get_cache(cache, sbs);
+
+    for (int i = 0; i < sbs->num_parts; ++i) {
+        struct sub_bitmap *sb = &sbs->parts[i];
+
+        // libswscale madness: it requires a minimum width
+        // skip it, we can't reasonably handle it
+        if (sb->w < 8)
+            continue;
+
+        struct mp_image dst;
+        int src_x, src_y;
+        if (!get_sub_area(bb, temp, sb, &dst, &src_x, &src_y))
+            continue;
+
+        struct mp_image *sbi = NULL;
+        struct mp_image *sba = NULL;
+        if (part) {
+            sbi = part->imgs[i].i;
+            sba = part->imgs[i].a;
+        }
+
+        if (!(sbi && sba))
+            scale_sb_rgba(sb, csp, temp->imgfmt, &sbi, &sba);
+
+        int bytes = (bits + 7) / 8;
+        uint8_t *alpha_p = sba->planes[0] + src_y * sba->stride[0] + src_x;
+        for (int p = 0; p < 3; p++) {
+            void *src = sbi->planes[p] + src_y * sbi->stride[p] + src_x * bytes;
+            blend_src_alpha(dst.planes[p], dst.stride[p], src, sbi->stride[p],
+                            alpha_p, sba->stride[0], dst.w, dst.h, bytes);
+        }
+
+        if (part) {
+            part->imgs[i].i = talloc_steal(part, sbi);
+            part->imgs[i].a = talloc_steal(part, sba);
+        } else {
+            free_mp_image(sbi);
+            free_mp_image(sba);
+        }
+    }
+}
+
+static void draw_ass(struct mp_draw_sub_cache **cache, struct mp_rect bb,
+                     struct mp_image *temp, int bits, struct mp_csp_details *csp,
+                     struct sub_bitmaps *sbs)
 {
-    *sbi = NULL;
-    *sba = NULL;
-    if (format == SUBBITMAP_RGBA && sb->w >= 8) {
-        // >= 8 because of libswscale madness
-        // swscale the bitmap from w*h to dw*dh, changing BGRA8 into YUV444P16
-        // and make a scaled copy of A8
-        mp_image_t *sbisrc = new_mp_image(sb->w, sb->h);
-        mp_image_setfmt(sbisrc, IMGFMT_BGR32);
-        sbisrc->planes[0] = sb->bitmap;
-        sbisrc->stride[0] = sb->stride;
-        mp_image_t *sbisrc2 = alloc_mpi(sb->dw, sb->dh, IMGFMT_BGR32);
-        mp_image_swscale(sbisrc2, sbisrc, csp, SWS_BILINEAR);
-
-        // sbisrc2 now is the original image in premultiplied alpha, but
-        // properly scaled...
-        // now, un-premultiply so we can work in YUV color space, also extract
-        // alpha
-        *sba = alloc_mpi(sb->dw, sb->dh, IMGFMT_Y8);
-        unpremultiply_and_split_IMGFMT_BGR32(sbisrc2, *sba);
-
-        // convert to the output format
-        *sbi = alloc_mpi(sb->dw, sb->dh, imgfmt);
-        mp_image_swscale(*sbi, sbisrc2, csp, SWS_BILINEAR);
-
-        free_mp_image(sbisrc);
-        free_mp_image(sbisrc2);
-
-        color_yuv[0] = 255;
-        color_yuv[1] = 128;
-        color_yuv[2] = 128;
-        *color_a = 255;
-        return true;
-    } else if (format == SUBBITMAP_LIBASS &&
-            sb->w == sb->dw && sb->h == sb->dh) {
-        // swscale alpha only
-        *sba = new_mp_image(sb->w, sb->h);
-        mp_image_setfmt(*sba, IMGFMT_Y8);
-        (*sba)->planes[0] = sb->bitmap;
-        (*sba)->stride[0] = sb->stride;
+    struct mp_csp_params cspar = {
+        .colorspace = *csp,
+        .brightness = 0, .contrast = 1,
+        .hue = 0, .saturation = 1,
+        .rgamma = 1, .ggamma = 1, .bgamma = 1,
+        .texture_bits = 8, .input_bits = 8
+    };
+
+    float yuv2rgb[3][4], rgb2yuv[3][4];
+    mp_get_yuv2rgb_coeffs(&cspar, yuv2rgb);
+    mp_invert_yuv2rgb(rgb2yuv, yuv2rgb);
+
+    for (int i = 0; i < sbs->num_parts; ++i) {
+        struct sub_bitmap *sb = &sbs->parts[i];
+
+        struct mp_image dst;
+        int src_x, src_y;
+        if (!get_sub_area(bb, temp, sb, &dst, &src_x, &src_y))
+            continue;
+
         int r = (sb->libass.color >> 24) & 0xFF;
         int g = (sb->libass.color >> 16) & 0xFF;
         int b = (sb->libass.color >> 8) & 0xFF;
-        int a = sb->libass.color & 0xFF;
+        int a = 255 - (sb->libass.color & 0xFF);
+        int color_yuv[4];
         color_yuv[0] =
             rint(MP_MAP_RGB2YUV_COLOR(rgb2yuv, r, g, b, 255, 0)
                     * (1 << (bits - 8)));
@@ -320,135 +315,124 @@ static bool sub_bitmap_to_mp_images(struct mp_image **sbi, int *color_yuv,
         color_yuv[2] =
             rint(MP_MAP_RGB2YUV_COLOR(rgb2yuv, r, g, b, 255, 2)
                     * (1 << (bits - 8)));
-        *color_a = 255 - a;
         // NOTE: these overflows can actually happen (when subtitles use color
         // 0,0,0 while output levels only allows 16,16,16 upwards...)
-        if (color_yuv[0] < 0)
-            color_yuv[0] = 0;
-        if (color_yuv[1] < 0)
-            color_yuv[1] = 0;
-        if (color_yuv[2] < 0)
-            color_yuv[2] = 0;
-        if (*color_a < 0)
-            *color_a = 0;
-        if (color_yuv[0] > ((1 << bits) - 1))
-            color_yuv[0] = ((1 << bits) - 1);
-        if (color_yuv[1] > ((1 << bits) - 1))
-            color_yuv[1] = ((1 << bits) - 1);
-        if (color_yuv[2] > ((1 << bits) - 1))
-            color_yuv[2] = ((1 << bits) - 1);
-        if (*color_a > 255)
-            *color_a = 255;
-        return true;
-    } else
-        return false;
+        for (int i = 0; i < 3; i++)
+            color_yuv[i] = av_clip(color_yuv[i], 0, ((1 << bits) - 1));
+
+        int bytes = (bits + 7) / 8;
+        uint8_t *alpha_p = (uint8_t *)sb->bitmap + src_y * sb->stride + src_x;
+        for (int p = 0; p < 3; p++) {
+            blend_const_alpha(dst.planes[p], dst.stride[p], color_yuv[p],
+                              alpha_p, sb->stride, a, dst.w, dst.h, bytes);
+        }
+    }
 }
 
-static void mp_image_crop(struct mp_image *img, int x, int y, int w, int h)
+static void mp_image_crop(struct mp_image *img, struct mp_rect rc)
 {
-    int p;
-    for (p = 0; p < img->num_planes; ++p) {
+    for (int p = 0; p < img->num_planes; ++p) {
         int bits = MP_IMAGE_BITS_PER_PIXEL_ON_PLANE(img, p);
         img->planes[p] +=
-            (y >> (p ? img->chroma_y_shift : 0)) * img->stride[p] +
-            ((x >> (p ? img->chroma_x_shift : 0)) * bits) / 8;
+            (rc.y0 >> (p ? img->chroma_y_shift : 0)) * img->stride[p] +
+            (rc.x0 >> (p ? img->chroma_x_shift : 0)) * bits / 8;
     }
-    img->w = w;
-    img->h = h;
+    img->w = rc.x1 - rc.x0;
+    img->h = rc.y1 - rc.y0;
+    img->chroma_width = img->w >> img->chroma_x_shift;
+    img->chroma_height = img->h >> img->chroma_y_shift;
+    img->display_w = img->display_h = 0;
 }
 
-static bool clip_to_bounds(int *x, int *y, int *w, int *h,
-                           int bx, int by, int bw, int bh)
+static bool clip_to_bb(struct mp_rect bb, struct mp_rect *rc)
 {
-    if (*x < bx) {
-        *w += *x - bx;
-        *x = bx;
-    }
-    if (*y < 0) {
-        *h += *y - by;
-        *y = by;
-    }
-    if (*x + *w > bx + bw)
-        *w = bx + bw - *x;
-    if (*y + *h > by + bh)
-        *h = by + bh - *y;
+    rc->x0 = FFMAX(bb.x0, rc->x0);
+    rc->y0 = FFMAX(bb.y0, rc->y0);
+    rc->x1 = FFMIN(bb.x1, rc->x1);
+    rc->y1 = FFMIN(bb.y1, rc->y1);
 
-    if (*w <= 0 || *h <= 0)
-        return false;  // nothing left
-
-    return true;
+    return rc->x1 > rc->x0 && rc->y1 > rc->y0;
 }
 
-static void get_swscale_requirements(int *sx, int *sy,
-                                       const struct mp_image *img)
+static void get_swscale_alignment(const struct mp_image *img, int *out_xstep,
+                                  int *out_ystep)
 {
-    int p;
-
-    if (img->chroma_x_shift == 31)
-        *sx = 1;
-    else
-        *sx = (1 << img->chroma_x_shift);
+    int sx = (1 << img->chroma_x_shift);
+    int sy = (1 << img->chroma_y_shift);
 
-    if (img->chroma_y_shift == 31)
-        *sy = 1;
-    else
-        *sy = (1 << img->chroma_y_shift);
+    // Hack for IMGFMT_Y8
+    if (img->chroma_x_shift == 31 && img->chroma_y_shift == 31) {
+        sx = 1;
+        sy = 1;
+    }
 
-    for (p = 0; p < img->num_planes; ++p) {
+    for (int p = 0; p < img->num_planes; ++p) {
         int bits = MP_IMAGE_BITS_PER_PIXEL_ON_PLANE(img, p);
         // the * 2 fixes problems with writing past the destination width
-        while (((*sx >> img->chroma_x_shift) * bits) % (SWS_MIN_BYTE_ALIGN * 8 * 2))
-            *sx *= 2;
+        while (((sx >> img->chroma_x_shift) * bits) % (SWS_MIN_BYTE_ALIGN * 8 * 2))
+            sx *= 2;
     }
+
+    *out_xstep = sx;
+    *out_ystep = sy;
 }
 
-static void align_bbox(int *x1, int *y1, int *x2, int *y2, int xstep, int ystep)
+static void align_bbox(int xstep, int ystep, struct mp_rect *rc)
 {
-    *x1 -= (*x1 % xstep);
-    *y1 -= (*y1 % ystep);
-
-    *x2 += xstep - 1;
-    *y2 += ystep - 1;
-    *x2 -= (*x2 % xstep);
-    *y2 -= (*y2 % ystep);
+    rc->x0 = rc->x0 & ~(xstep - 1);
+    rc->y0 = rc->y0 & ~(ystep - 1);
+    rc->x1 = FFALIGN(rc->x1, xstep);
+    rc->y1 = FFALIGN(rc->y1, ystep);
 }
 
-static bool align_bbox_to_swscale_requirements(int *x1, int *y1,
-                                               int *x2, int *y2,
-                                               struct mp_image *img)
+static bool align_bbox_for_swscale(struct mp_image *img, struct mp_rect *rc)
 {
+    struct mp_rect img_rect = {0, 0, img->w, img->h};
+    // Get rid of negative coordinates
+    if (!clip_to_bb(img_rect, rc))
+        return false;
     int xstep, ystep;
-    get_swscale_requirements(&xstep, &ystep, img);
-    align_bbox(x1, y1, x2, y2, xstep, ystep);
-
-    if (*x1 < 0)
-        *x1 = 0;
-    if (*y1 < 0)
-        *y1 = 0;
-    if (*x2 > img->w)
-        *x2 = img->w;
-    if (*y2 > img->h)
-        *y2 = img->h;
-
-    return (*x2 > *x1) && (*y2 > *y1);
+    get_swscale_alignment(img, &xstep, &ystep);
+    align_bbox(xstep, ystep, rc);
+    return clip_to_bb(img_rect, rc);
 }
 
-// cache: if not NULL, the function will set *cache to a talloc-allocated cache
-//        containing scaled versions of sbs contents - free the cache with
-//        talloc_free()
-void mp_draw_sub_bitmaps(struct mp_draw_sub_cache **cache, struct mp_image *dst,
-                         struct sub_bitmaps *sbs, struct mp_csp_details *csp)
+// Try to find best/closest YUV 444 format for imgfmt
+static void get_closest_y444_format(int imgfmt, int *out_format, int *out_bits)
 {
-    int i;
-    int x1, y1, x2, y2;
-    int color_yuv[3];
-    int color_a;
-    float yuv2rgb[3][4];
-    float rgb2yuv[3][4];
-
-    if (!mp_sws_supported_format(dst->imgfmt))
-        return;
+#ifdef ACCURATE
+    struct mp_image tmp = {0};
+    mp_image_setfmt(&tmp, imgfmt);
+    if (tmp.flags & MP_IMGFLAG_YUV) {
+        int bits;
+        if (mp_get_chroma_shift(imgfmt, NULL, NULL, &bits)) {
+            switch (bits) {
+                case 8:
+                    *out_format = IMGFMT_444P;
+                    *out_bits = 8;
+                    return;
+                case 9:
+                    *out_format = IMGFMT_444P9;
+                    *out_bits = 9;
+                    return;
+                case 10:
+                    *out_format = IMGFMT_444P10;
+                    *out_bits = 10;
+                    return;
+            }
+        }
+    }
+    *out_format = IMGFMT_444P16;
+    *out_bits = 16;
+#else
+    *out_format = IMGFMT_444P;
+    *out_bits = 8;
+#endif
+}
 
+static struct part *get_cache(struct mp_draw_sub_cache **cache,
+                              struct sub_bitmaps *sbs)
+{
     if (cache && !*cache)
         *cache = talloc_zero(NULL, struct mp_draw_sub_cache);
 
@@ -472,153 +456,67 @@ void mp_draw_sub_bitmaps(struct mp_draw_sub_cache **cache, struct mp_image *dst,
         (*cache)->parts[sbs->render_index] = part;
     }
 
-#ifdef ACCURATE
-    int format = IMGFMT_444P16;
-    int bits = 16;
-    // however, we can try matching 8bit, 9bit, 10bit yuv formats!
-    if (dst->flags & MP_IMGFLAG_YUV) {
-        if (mp_get_chroma_shift(dst->imgfmt, NULL, NULL, &bits)) {
-            switch (bits) {
-                case 8:
-                    format = IMGFMT_444P;
-                    break;
-                case 9:
-                    format = IMGFMT_444P9;
-                    break;
-                case 10:
-                    format = IMGFMT_444P10;
-                    break;
-                default:
-                    // revert back
-                    bits = 16;
-                    break;
-            }
-        } else
-            bits = 16;
-    }
-#else
-    int format = IMGFMT_444P;
-    int bits = 8;
-#endif
-    int bytes = (bits + 7) / 8;
+    return part;
+}
 
-    struct mp_csp_params cspar = {
-        .colorspace = *csp,
-        .brightness = 0, .contrast = 1,
-        .hue = 0, .saturation = 1,
-        .rgamma = 1, .ggamma = 1, .bgamma = 1,
-        .texture_bits = 8, .input_bits = 8
-    };
+// Return area of intersection between target and sub-bitmap as cropped image
+static bool get_sub_area(struct mp_rect bb, struct mp_image *temp,
+                         struct sub_bitmap *sb, struct mp_image *out_area,
+                         int *out_src_x, int *out_src_y)
+{
+    // coordinates are relative to the bbox
+    struct mp_rect dst = {sb->x - bb.x0, sb->y - bb.y0};
+    dst.x1 = dst.x0 + sb->dw;
+    dst.y1 = dst.y0 + sb->dh;
+    if (!clip_to_bb((struct mp_rect){0, 0, temp->w, temp->h}, &dst))
+        return false;
 
-    // prepare YUV/RGB conversion values
-    mp_get_yuv2rgb_coeffs(&cspar, yuv2rgb);
-    mp_invert_yuv2rgb(rgb2yuv, yuv2rgb);
+    *out_src_x = (dst.x0 - sb->x) + bb.x0;
+    *out_src_y = (dst.y0 - sb->y) + bb.y0;
+    *out_area = *temp;
+    mp_image_crop(out_area, dst);
+
+    return true;
+}
+
+// cache: if not NULL, the function will set *cache to a talloc-allocated cache
+//        containing scaled versions of sbs contents - free the cache with
+//        talloc_free()
+void mp_draw_sub_bitmaps(struct mp_draw_sub_cache **cache, struct mp_image *dst,
+                         struct sub_bitmaps *sbs, struct mp_csp_details *csp)
+{
+    assert(mp_draw_sub_formats[sbs->format]);
+    if (!mp_sws_supported_format(dst->imgfmt))
+        return;
+
+    int format, bits;
+    get_closest_y444_format(dst->imgfmt, &format, &bits);
 
-    //mp_msg(MSGT_VO, MSGL_ERR, "%f %f %f %f // %f %f %f %f // %f %f %f %f\n",
-    //        rgb2yuv[0][0],
-    //        rgb2yuv[0][1],
-    //        rgb2yuv[0][2],
-    //        rgb2yuv[0][3],
-    //        rgb2yuv[1][0],
-    //        rgb2yuv[1][1],
-    //        rgb2yuv[1][2],
-    //        rgb2yuv[1][3],
-    //        rgb2yuv[2][0],
-    //        rgb2yuv[2][1],
-    //        rgb2yuv[2][2],
-    //        rgb2yuv[2][3]);
-
-    // calculate bounding range
-    if (!sub_bitmaps_bb(sbs, &x1, &y1, &x2, &y2))
+    struct mp_rect bb;
+    if (!sub_bitmaps_bb(sbs, &bb))
         return;
 
-    if (!align_bbox_to_swscale_requirements(&x1, &y1, &x2, &y2, dst))
-        return;  // nothing to do
+    if (!align_bbox_for_swscale(dst, &bb))
+        return;
 
-    // convert to a temp image
-    mp_image_t *temp;
-    mp_image_t dst_region = *dst;
+    struct mp_image *temp;
+    struct mp_image dst_region = *dst;
+    mp_image_crop(&dst_region, bb);
     if (dst->imgfmt == format) {
-        mp_image_crop(&dst_region, x1, y1, x2 - x1, y2 - y1);
         temp = &dst_region;
     } else {
-        mp_image_crop(&dst_region, x1, y1, x2 - x1, y2 - y1);
-        temp = alloc_mpi(x2 - x1, y2 - y1, format);
+        temp = alloc_mpi(bb.x1 - bb.x0, bb.y1 - bb.y0, format);
         mp_image_swscale(temp, &dst_region, csp, SWS_POINT); // chroma up
     }
 
-    for (i = 0; i < sbs->num_parts; ++i) {
-        struct sub_bitmap *sb = &sbs->parts[i];
-        mp_image_t *sbi = NULL;
-        mp_image_t *sba = NULL;
-
-        // cut off areas outside the image
-        int dst_x = sb->x - x1; // coordinates are relative to the bbox
-        int dst_y = sb->y - y1; // coordinates are relative to the bbox
-        int dst_w = sb->dw;
-        int dst_h = sb->dh;
-        if (!clip_to_bounds(&dst_x, &dst_y, &dst_w, &dst_h,
-                            0, 0, temp->w, temp->h))
-            continue;
-
-        if (part) {
-            sbi = part->imgs[i].i;
-            sba = part->imgs[i].a;
-        }
-
-        if (!(sbi && sba)) {
-            if (!sub_bitmap_to_mp_images(&sbi, color_yuv, &color_a, &sba, sb,
-                                         sbs->format, csp, rgb2yuv, format,
-                                         bits))
-            {
-                mp_msg(MSGT_VO, MSGL_ERR,
-                       "render_sub_bitmap: invalid sub bitmap type\n");
-                continue;
-            }
-        }
-
-        // call blend_alpha 3 times
-        int p;
-        int src_x = (dst_x + x1) - sb->x;
-        int src_y = (dst_y + y1) - sb->y;
-        unsigned char *alpha_p =
-            sba->planes[0] + src_y * sba->stride[0] + src_x;
-        for (p = 0; p < 3; ++p) {
-            unsigned char *dst_p =
-                temp->planes[p] + dst_y * temp->stride[p] + dst_x * bytes;
-            if (sbi) {
-                unsigned char *src_p =
-                    sbi->planes[p] + src_y * sbi->stride[p] + src_x * bytes;
-                blend_src_alpha(
-                    dst_p, temp->stride[p],
-                    src_p, sbi->stride[p],
-                    alpha_p, sba->stride[0],
-                    dst_h, dst_w, bytes
-                    );
-            } else {
-                blend_const_alpha(
-                    dst_p, temp->stride[p],
-                    color_yuv[p],
-                    alpha_p, sba->stride[0], color_a,
-                    dst_h, dst_w, bytes
-                    );
-            }
-        }
-
-        if (part) {
-            part->imgs[i].i = talloc_steal(part, sbi);
-            part->imgs[i].a = talloc_steal(part, sba);
-        } else {
-            free_mp_image(sbi);
-            free_mp_image(sba);
-        }
+    if (sbs->format == SUBBITMAP_RGBA) {
+        draw_rgba(cache, bb, temp, bits, csp, sbs);
+    } else if (sbs->format == SUBBITMAP_LIBASS) {
+        draw_ass(cache, bb, temp, bits, csp, sbs);
     }
 
     if (temp != &dst_region) {
-        // convert back
         mp_image_swscale(&dst_region, temp, csp, SWS_AREA); // chroma down
-
-        // clean up
         free_mp_image(temp);
     }
 }
diff --git a/sub/sub.c b/sub/sub.c
index 029d902604..7cea5a3cd1 100644
--- a/sub/sub.c
+++ b/sub/sub.c
@@ -23,7 +23,7 @@
 
 #include <libavutil/common.h>
 
-#include "config.h"
+#include "mpcommon.h"
 
 #include "stream/stream.h"
 
@@ -269,22 +269,22 @@ void vo_osd_changed(int new_value)
     osd->want_redraw = true;
 }
 
-bool sub_bitmaps_bb(struct sub_bitmaps *imgs, int *x1, int *y1,
-                    int *x2, int *y2)
+bool sub_bitmaps_bb(struct sub_bitmaps *imgs, struct mp_rect *out_bb)
 {
-    *x1 = *y1 = INT_MAX;
-    *x2 = *y2 = INT_MIN;
+    struct mp_rect bb = {INT_MAX, INT_MAX, INT_MIN, INT_MIN};
     for (int n = 0; n < imgs->num_parts; n++) {
         struct sub_bitmap *p = &imgs->parts[n];
-        *x1 = FFMIN(*x1, p->x);
-        *y1 = FFMIN(*y1, p->y);
-        *x2 = FFMAX(*x2, p->x + p->dw);
-        *y2 = FFMAX(*y2, p->y + p->dh);
+        bb.x0 = FFMIN(bb.x0, p->x);
+        bb.y0 = FFMIN(bb.y0, p->y);
+        bb.x1 = FFMAX(bb.x1, p->x + p->dw);
+        bb.y1 = FFMAX(bb.y1, p->y + p->dh);
     }
 
     // avoid degenerate bounding box if empty
-    *x1 = FFMIN(*x1, *x2);
-    *y1 = FFMIN(*y1, *y2);
+    bb.x0 = FFMIN(bb.x0, bb.x1);
+    bb.y0 = FFMIN(bb.y0, bb.y1);
 
-    return *x1 < *x2 && *y1 < *y2;
+    *out_bb = bb;
+
+    return bb.x0 < bb.x1 && bb.y0 < bb.y1;
 }
diff --git a/sub/sub.h b/sub/sub.h
index 4adf0a23d8..5013611e61 100644
--- a/sub/sub.h
+++ b/sub/sub.h
@@ -210,8 +210,8 @@ bool osd_draw_on_image(struct osd_state *osd, struct mp_osd_res res,
                        double video_pts, int draw_flags, struct mp_image *dest,
                        struct mp_csp_details *dest_csp);
 
-bool sub_bitmaps_bb(struct sub_bitmaps *imgs, int *x1, int *y1,
-                    int *x2, int *y2);
+struct mp_rect;
+bool sub_bitmaps_bb(struct sub_bitmaps *imgs, struct mp_rect *out_bb);
 
 // defined in osd_libass.c and osd_dummy.c
 
-- 
cgit v1.2.3