From 6bec6ac55828495356e3498dc52294e28c38c3c1 Mon Sep 17 00:00:00 2001
From: wm4 <wm4@nowhere>
Date: Thu, 24 Dec 2015 14:43:23 +0100
Subject: sub: better alpha blending when rendering to alpha surfaces

This actually treats destination alpha correctly, and gives much better
results than before. I don't know if this is perfectly correct yet,
though. A slight difference from vo_opengl's behavior suggests it might
not be.

Note that this does not affect VOs with true alpha support. vo_opengl
does not use this code at all, and does the alpha calculations in OpenGL
instead.
---
 sub/draw_bmp.c         | 36 +++++++++++++++++++++++++++++++++++-
 video/fmt-conversion.c |  1 +
 video/img_format.h     |  1 +
 3 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/sub/draw_bmp.c b/sub/draw_bmp.c
index da4760e105..ba5c6271e4 100644
--- a/sub/draw_bmp.c
+++ b/sub/draw_bmp.c
@@ -114,6 +114,7 @@ static void blend_const8_alpha(void *dst, int dst_stride, uint16_t srcp,
     }
 }
 
+// dst = srcp * (srca * srcamul) + dst * (1 - (srca * srcamul))
 static void blend_const_alpha(void *dst, int dst_stride, int srcp,
                               uint8_t *srca, int srca_stride, uint8_t srcamul,
                               int w, int h, int bytes)
@@ -169,6 +170,7 @@ static void blend_src8_alpha(void *dst, int dst_stride, void *src,
     }
 }
 
+// dst = src * srca + dst * (1 - srca)
 static void blend_src_alpha(void *dst, int dst_stride, void *src,
                             int src_stride, uint8_t *srca, int srca_stride,
                             int w, int h, int bytes)
@@ -182,6 +184,30 @@ static void blend_src_alpha(void *dst, int dst_stride, void *src,
     }
 }
 
+// dst = src * srcmul + dst * (1 - src * srcmul); src and srcmul are 0..255 factors
+static void blend_src_dst_mul(void *dst, int dst_stride,
+                              uint8_t *src, int src_stride, uint8_t srcmul,
+                              int w, int h, int dst_bytes)
+{
+    for (int y = 0; y < h; y++) {
+        void *dst_rp = (uint8_t *)dst + dst_stride * y; // strides are in bytes
+        uint8_t *src_r = (uint8_t *)src + src_stride * y;
+        if (dst_bytes == 2) {
+            uint16_t *dst_r = dst_rp;
+            for (int x = 0; x < w; x++) {
+                uint32_t srcp = src_r[x] * srcmul; // 0..65025; unsigned 32 bit, as int would overflow below
+                dst_r[x] = (srcp * 65025 + dst_r[x] * (65025 - srcp) + 32512) / 65025; // sum <= 65535 * 65025 + 32512
+            }
+        } else if (dst_bytes == 1) {
+            uint8_t *dst_r = dst_rp;
+            for (int x = 0; x < w; x++) {
+                uint16_t srcp = src_r[x] * srcmul; // now 0..65025
+                dst_r[x] = (srcp * 255 + dst_r[x] * (65025 - srcp) + 32512) / 65025; // + 32512 rounds to nearest
+            }
+        } // any other dst_bytes value leaves dst unmodified
+    }
+}
+
 static void unpremultiply_and_split_BGR32(struct mp_image *img,
                                           struct mp_image *alpha)
 {
@@ -278,6 +304,10 @@ static void draw_rgba(struct mp_draw_sub_cache *cache, struct mp_rect bb,
             blend_src_alpha(dst.planes[p], dst.stride[p], src, sbi->stride[p],
                             alpha_p, sba->stride[0], dst.w, dst.h, bytes);
         }
+        if (temp->num_planes >= 4) {
+            blend_src_dst_mul(dst.planes[3], dst.stride[3], alpha_p,
+                              sba->stride[0], 255, dst.w, dst.h, bytes);
+        }
 
         part->imgs[i].i = talloc_steal(part, sbi);
         part->imgs[i].a = talloc_steal(part, sba);
@@ -328,6 +358,10 @@ static void draw_ass(struct mp_draw_sub_cache *cache, struct mp_rect bb,
             blend_const_alpha(dst.planes[p], dst.stride[p], color_yuv[p],
                               alpha_p, sb->stride, a, dst.w, dst.h, bytes);
         }
+        if (temp->num_planes >= 4) {
+            blend_src_dst_mul(dst.planes[3], dst.stride[3], alpha_p,
+                              sb->stride, a, dst.w, dst.h, bytes);
+        }
     }
 }
 
@@ -374,7 +408,7 @@ static void get_closest_y444_format(int imgfmt, int *out_format, int *out_bits)
 {
     struct mp_imgfmt_desc desc = mp_imgfmt_get_desc(imgfmt);
     if (desc.flags & MP_IMGFLAG_RGB) {
-        *out_format = IMGFMT_GBRP;
+        *out_format = desc.flags & MP_IMGFLAG_ALPHA ? IMGFMT_GBRAP : IMGFMT_GBRP;
         *out_bits = 8;
         return;
     } else if (desc.flags & MP_IMGFLAG_YUV_P) {
diff --git a/video/fmt-conversion.c b/video/fmt-conversion.c
index 6482f5429f..71cd044169 100644
--- a/video/fmt-conversion.c
+++ b/video/fmt-conversion.c
@@ -49,6 +49,7 @@ static const struct {
     {IMGFMT_BGR4,  AV_PIX_FMT_BGR4},
     {IMGFMT_PAL8,  AV_PIX_FMT_PAL8},
     {IMGFMT_GBRP,  AV_PIX_FMT_GBRP},
+    {IMGFMT_GBRAP, AV_PIX_FMT_GBRAP},
     {IMGFMT_YUYV,  AV_PIX_FMT_YUYV422},
     {IMGFMT_UYVY,  AV_PIX_FMT_UYVY422},
     {IMGFMT_NV12,  AV_PIX_FMT_NV12},
diff --git a/video/img_format.h b/video/img_format.h
index 0d8c699850..8788f86c35 100644
--- a/video/img_format.h
+++ b/video/img_format.h
@@ -197,6 +197,7 @@ enum mp_imgfmt {
 
     // Planar RGB (planes are shuffled: plane 0 is G, etc.)
     IMGFMT_GBRP,
+    IMGFMT_GBRAP,
 
     // XYZ colorspace, similar organization to RGB48. Even though it says "12",
     // the components are stored as 16 bit, with lower 4 bits set to 0.
-- 
cgit v1.2.3