From b077d0583ce9332621e2e2904a53896b12f85401 Mon Sep 17 00:00:00 2001
From: "Dr.Smile" <vabnick@gmail.com>
Date: Mon, 13 Apr 2020 10:12:37 +0300
Subject: Simplify blur algorithm

This commit removes the prefilters altogether at the cost of
an enlarged main filter kernel.
---
 libass/ass_bitmap.h        |   3 +-
 libass/ass_blur.c          | 652 +++++++++-------------------------
 libass/ass_func_template.h |  66 ++--
 libass/x86/blur.asm        | 859 ++++++++++++---------------------------------
 libass/x86/utils.asm       |  56 +++
 5 files changed, 470 insertions(+), 1166 deletions(-)

diff --git a/libass/ass_bitmap.h b/libass/ass_bitmap.h
index 783dd6d..99052e4 100644
--- a/libass/ass_bitmap.h
+++ b/libass/ass_bitmap.h
@@ -80,8 +80,7 @@ typedef struct {
     Convert16to8Func stripe_pack;
     FilterFunc shrink_horz, shrink_vert;
     FilterFunc expand_horz, expand_vert;
-    FilterFunc pre_blur_horz[3], pre_blur_vert[3];
-    ParamFilterFunc main_blur_horz[3], main_blur_vert[3];
+    ParamFilterFunc blur_horz[5], blur_vert[5];
 } BitmapEngine;
 
 extern const BitmapEngine ass_bitmap_engine_c;
diff --git a/libass/ass_blur.c b/libass/ass_blur.c
index 0a622ea..2630086 100644
--- a/libass/ass_blur.c
+++ b/libass/ass_blur.c
@@ -29,17 +29,16 @@
 /*
  * Cascade Blur Algorithm
  *
- * The main idea is simple: to approximate gaussian blur with large radius
- * you can downscale, then apply filter with small pattern, then upscale back.
+ * The main idea is simple: to approximate a Gaussian blur with a large radius,
+ * you can scale down, apply a filter with a relatively small pattern, then scale back up.
  *
- * To achieve desired precision down/upscaling should be done with sufficiently smooth kernel.
- * Experiment shows that downscaling of factor 2 with kernel [1, 5, 10, 10, 5, 1] and
+ * To achieve the desired precision, scaling should be done with a sufficiently smooth kernel.
+ * Experiments show that downscaling by a factor of 2 with the kernel [1, 5, 10, 10, 5, 1] and
  * corresponding upscaling are enough for 8-bit precision.
  *
- * For central filter here is used generic 9-tap filter with one of 3 different patterns
- * combined with one of optional prefilters with fixed kernels. Kernel coefficients
- * of the main filter are obtained from solution of least squares problem
- * for Fourier transform of resulting kernel.
+ * For the central filter we use a generic filter with one of 5 kernel widths (9, 11, 13, 15 or 17 taps).
+ * The kernel coefficients of these filters are obtained from the solution of a least-squares problem
+ * for the Fourier transform of the resulting kernel.
  */
 
 
@@ -63,9 +62,7 @@ inline static const int16_t *get_line(const int16_t *ptr, uintptr_t offs, uintpt
 
 inline static void copy_line(int16_t *buf, const int16_t *ptr, uintptr_t offs, uintptr_t size)
 {
-    ptr = get_line(ptr, offs, size);
-    for (int k = 0; k < STRIPE_WIDTH; ++k)
-        buf[k] = ptr[k];
+    memcpy(buf, get_line(ptr, offs, size), STRIPE_WIDTH * sizeof(buf[0]));
 }
 
 /*
@@ -265,393 +262,143 @@ void ass_expand_vert_c(int16_t *dst, const int16_t *src,
 }
 
 /*
- * First Supplementary Filters
+ * Main Parametric Filters
  *
- * Perform 1D convolution with kernel [1, 2, 1].
+ * Perform 1D convolution with the symmetric kernel [..., c2, c1, c0, d, c0, c1, c2, ...],
+ * where cN = param[N] / 0x10000 (fixed point) and d = 1 - 2 * (c0 + c1 + c2 + ...).
+ * The number of parameters (the kernel radius) is part of the function name.
  */
 
-static inline int16_t pre_blur1_func(int16_t p1, int16_t z0, int16_t n1)
+static inline void blur_horz(int16_t *dst, const int16_t *src,
+                             uintptr_t src_width, uintptr_t src_height,
+                             const int16_t *param, const int n)
 {
-    /*
-    return (1 * p1 + 2 * z0 + 1 * n1 + 2) >> 2;
-    */
-    return (uint16_t) (((uint16_t) (p1 + n1) >> 1) + z0 + 1) >> 1;
-}
-
-void ass_pre_blur1_horz_c(int16_t *dst, const int16_t *src,
-                          uintptr_t src_width, uintptr_t src_height)
-{
-    uintptr_t dst_width = src_width + 2;
+    uintptr_t dst_width = src_width + 2 * n;
     uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
     uintptr_t step = STRIPE_WIDTH * src_height;
 
     uintptr_t offs = 0;
-    int16_t buf[2 * STRIPE_WIDTH];
-    int16_t *ptr = buf + STRIPE_WIDTH;
+    int16_t buf[3 * STRIPE_WIDTH];
+    int16_t *ptr = buf + 2 * STRIPE_WIDTH;
     for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
-        for (uintptr_t y = 0; y < src_height; ++y) {
-            copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
-            copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
-            for (int k = 0; k < STRIPE_WIDTH; ++k)
-                dst[k] = pre_blur1_func(ptr[k - 2], ptr[k - 1], ptr[k]);
-            dst += STRIPE_WIDTH;
+        for (uintptr_t y = 0; y < src_height; y++) {
+            for (int i = -((2 * n + STRIPE_WIDTH - 1u) / STRIPE_WIDTH); i <= 0; i++)
+                copy_line(ptr + i * STRIPE_WIDTH, src, offs + i * step, size);
+            int32_t acc[STRIPE_WIDTH];
+            for (int k = 0; k < STRIPE_WIDTH; k++)
+                acc[k] = 0x8000;
+            for (int i = n; i > 0; i--)
+                for (int k = 0; k < STRIPE_WIDTH; k++)
+                    acc[k] += (int16_t) (ptr[k - n - i] - ptr[k - n]) * param[i - 1] +
+                              (int16_t) (ptr[k - n + i] - ptr[k - n]) * param[i - 1];
+            for (int k = 0; k < STRIPE_WIDTH; k++)
+                dst[k] = ptr[k - n] + (acc[k] >> 16);
+
+            dst  += STRIPE_WIDTH;
             offs += STRIPE_WIDTH;
         }
     }
 }
 
-void ass_pre_blur1_vert_c(int16_t *dst, const int16_t *src,
-                          uintptr_t src_width, uintptr_t src_height)
+static inline void blur_vert(int16_t *dst, const int16_t *src,
+                             uintptr_t src_width, uintptr_t src_height,
+                             const int16_t *param, const int n)
 {
-    uintptr_t dst_height = src_height + 2;
+    uintptr_t dst_height = src_height + 2 * n;
     uintptr_t step = STRIPE_WIDTH * src_height;
 
     for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
         uintptr_t offs = 0;
-        for (uintptr_t y = 0; y < dst_height; ++y) {
-            const int16_t *p1 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
-            const int16_t *z0 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
-            const int16_t *n1 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
-            for (int k = 0; k < STRIPE_WIDTH; ++k)
-                dst[k] = pre_blur1_func(p1[k], z0[k], n1[k]);
-            dst += STRIPE_WIDTH;
+        for (uintptr_t y = 0; y < dst_height; y++) {
+            int32_t acc[STRIPE_WIDTH];
+            for (int k = 0; k < STRIPE_WIDTH; k++)
+                acc[k] = 0x8000;
+            const int16_t *center = get_line(src, offs - n * STRIPE_WIDTH, step);
+            for (int i = n; i > 0; i--) {
+                const int16_t *line1 = get_line(src, offs - (n + i) * STRIPE_WIDTH, step);
+                const int16_t *line2 = get_line(src, offs - (n - i) * STRIPE_WIDTH, step);
+                for (int k = 0; k < STRIPE_WIDTH; k++)
+                    acc[k] += (int16_t) (line1[k] - center[k]) * param[i - 1] +
+                              (int16_t) (line2[k] - center[k]) * param[i - 1];
+            }
+            for (int k = 0; k < STRIPE_WIDTH; k++)
+                dst[k] = center[k] + (acc[k] >> 16);
+
+            dst  += STRIPE_WIDTH;
             offs += STRIPE_WIDTH;
         }
         src += step;
     }
 }
 
-/*
- * Second Supplementary Filters
- *
- * Perform 1D convolution with kernel [1, 4, 6, 4, 1].
- */
-
-static inline int16_t pre_blur2_func(int16_t p2, int16_t p1, int16_t z0,
-                                     int16_t n1, int16_t n2)
+void ass_blur4_horz_c(int16_t *dst, const int16_t *src,
+                      uintptr_t src_width, uintptr_t src_height,
+                      const int16_t *param)
 {
-    /*
-    return (1 * p2 + 4 * p1 + 6 * z0 + 4 * n1 + 1 * n2 + 8) >> 4;
-    */
-    uint16_t r1 = ((uint16_t) (((uint16_t) (p2 + n2) >> 1) + z0) >> 1) + z0;
-    uint16_t r2 = p1 + n1;
-    uint16_t r = ((uint16_t) (r1 + r2) >> 1) | (0x8000 & r1 & r2);
-    return (uint16_t) (r + 1) >> 1;
+    blur_horz(dst, src, src_width, src_height, param, 4);
 }
 
-void ass_pre_blur2_horz_c(int16_t *dst, const int16_t *src,
-                          uintptr_t src_width, uintptr_t src_height)
+void ass_blur4_vert_c(int16_t *dst, const int16_t *src,
+                      uintptr_t src_width, uintptr_t src_height,
+                      const int16_t *param)
 {
-    uintptr_t dst_width = src_width + 4;
-    uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
-    uintptr_t step = STRIPE_WIDTH * src_height;
-
-    uintptr_t offs = 0;
-    int16_t buf[2 * STRIPE_WIDTH];
-    int16_t *ptr = buf + STRIPE_WIDTH;
-    for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
-        for (uintptr_t y = 0; y < src_height; ++y) {
-            copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
-            copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
-            for (int k = 0; k < STRIPE_WIDTH; ++k)
-                dst[k] = pre_blur2_func(ptr[k - 4], ptr[k - 3], ptr[k - 2], ptr[k - 1], ptr[k]);
-            dst += STRIPE_WIDTH;
-            offs += STRIPE_WIDTH;
-        }
-    }
+    blur_vert(dst, src, src_width, src_height, param, 4);
 }
 
-void ass_pre_blur2_vert_c(int16_t *dst, const int16_t *src,
-                          uintptr_t src_width, uintptr_t src_height)
+void ass_blur5_horz_c(int16_t *dst, const int16_t *src,
+                      uintptr_t src_width, uintptr_t src_height,
+                      const int16_t *param)
 {
-    uintptr_t dst_height = src_height + 4;
-    uintptr_t step = STRIPE_WIDTH * src_height;
-
-    for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
-        uintptr_t offs = 0;
-        for (uintptr_t y = 0; y < dst_height; ++y) {
-            const int16_t *p2 = get_line(src, offs - 4 * STRIPE_WIDTH, step);
-            const int16_t *p1 = get_line(src, offs - 3 * STRIPE_WIDTH, step);
-            const int16_t *z0 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
-            const int16_t *n1 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
-            const int16_t *n2 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
-            for (int k = 0; k < STRIPE_WIDTH; ++k)
-                dst[k] = pre_blur2_func(p2[k], p1[k], z0[k], n1[k], n2[k]);
-            dst += STRIPE_WIDTH;
-            offs += STRIPE_WIDTH;
-        }
-        src += step;
-    }
+    blur_horz(dst, src, src_width, src_height, param, 5);
 }
 
-/*
- * Third Supplementary Filters
- *
- * Perform 1D convolution with kernel [1, 6, 15, 20, 15, 6, 1].
- */
-
-static inline int16_t pre_blur3_func(int16_t p3, int16_t p2, int16_t p1, int16_t z0,
-                                     int16_t n1, int16_t n2, int16_t n3)
+void ass_blur5_vert_c(int16_t *dst, const int16_t *src,
+                      uintptr_t src_width, uintptr_t src_height,
+                      const int16_t *param)
 {
-    /*
-    return (1 * p3 + 6 * p2 + 15 * p1 + 20 * z0 + 15 * n1 + 6 * n2 + 1 * n3 + 32) >> 6;
-    */
-    return (20 * (uint16_t) z0 +
-            15 * (uint16_t) (p1 + n1) +
-             6 * (uint16_t) (p2 + n2) +
-             1 * (uint16_t) (p3 + n3) + 32) >> 6;
+    blur_vert(dst, src, src_width, src_height, param, 5);
 }
 
-void ass_pre_blur3_horz_c(int16_t *dst, const int16_t *src,
-                          uintptr_t src_width, uintptr_t src_height)
+void ass_blur6_horz_c(int16_t *dst, const int16_t *src,
+                      uintptr_t src_width, uintptr_t src_height,
+                      const int16_t *param)
 {
-    uintptr_t dst_width = src_width + 6;
-    uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
-    uintptr_t step = STRIPE_WIDTH * src_height;
-
-    uintptr_t offs = 0;
-    int16_t buf[2 * STRIPE_WIDTH];
-    int16_t *ptr = buf + STRIPE_WIDTH;
-    for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
-        for (uintptr_t y = 0; y < src_height; ++y) {
-            copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
-            copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
-            for (int k = 0; k < STRIPE_WIDTH; ++k)
-                dst[k] = pre_blur3_func(ptr[k - 6], ptr[k - 5], ptr[k - 4], ptr[k - 3],
-                                        ptr[k - 2], ptr[k - 1], ptr[k]);
-            dst += STRIPE_WIDTH;
-            offs += STRIPE_WIDTH;
-        }
-    }
+    blur_horz(dst, src, src_width, src_height, param, 6);
 }
 
-void ass_pre_blur3_vert_c(int16_t *dst, const int16_t *src,
-                          uintptr_t src_width, uintptr_t src_height)
+void ass_blur6_vert_c(int16_t *dst, const int16_t *src,
+                      uintptr_t src_width, uintptr_t src_height,
+                      const int16_t *param)
 {
-    uintptr_t dst_height = src_height + 6;
-    uintptr_t step = STRIPE_WIDTH * src_height;
-
-    for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
-        uintptr_t offs = 0;
-        for (uintptr_t y = 0; y < dst_height; ++y) {
-            const int16_t *p3 = get_line(src, offs - 6 * STRIPE_WIDTH, step);
-            const int16_t *p2 = get_line(src, offs - 5 * STRIPE_WIDTH, step);
-            const int16_t *p1 = get_line(src, offs - 4 * STRIPE_WIDTH, step);
-            const int16_t *z0 = get_line(src, offs - 3 * STRIPE_WIDTH, step);
-            const int16_t *n1 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
-            const int16_t *n2 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
-            const int16_t *n3 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
-            for (int k = 0; k < STRIPE_WIDTH; ++k)
-                dst[k] = pre_blur3_func(p3[k], p2[k], p1[k], z0[k], n1[k], n2[k], n3[k]);
-            dst += STRIPE_WIDTH;
-            offs += STRIPE_WIDTH;
-        }
-        src += step;
-    }
+    blur_vert(dst, src, src_width, src_height, param, 6);
 }
 
-/*
- * Main 9-tap Parametric Filters
- *
- * Perform 1D convolution with kernel
- *         [c3, c2, c1, c0, d, c0, c1, c2, c3] or
- *     [c3,  0, c2, c1, c0, d, c0, c1, c2,  0, c3] or
- * [c3,  0, c2,  0, c1, c0, d, c0, c1,  0, c2,  0, c3] accordingly.
- *
- * cN = param[N], d = 1 - 2 * (c0 + c1 + c2 + c3).
- */
-
-static inline int16_t blur_func(int16_t p4, int16_t p3, int16_t p2, int16_t p1, int16_t z0,
-                                int16_t n1, int16_t n2, int16_t n3, int16_t n4, const int16_t c[])
+void ass_blur7_horz_c(int16_t *dst, const int16_t *src,
+                      uintptr_t src_width, uintptr_t src_height,
+                      const int16_t *param)
 {
-    p1 -= z0;
-    p2 -= z0;
-    p3 -= z0;
-    p4 -= z0;
-    n1 -= z0;
-    n2 -= z0;
-    n3 -= z0;
-    n4 -= z0;
-    return (((p1 + n1) * c[0] +
-             (p2 + n2) * c[1] +
-             (p3 + n3) * c[2] +
-             (p4 + n4) * c[3] +
-             0x8000) >> 16) + z0;
+    blur_horz(dst, src, src_width, src_height, param, 7);
 }
 
-void ass_blur1234_horz_c(int16_t *dst, const int16_t *src,
-                         uintptr_t src_width, uintptr_t src_height,
-                         const int16_t *param)
+void ass_blur7_vert_c(int16_t *dst, const int16_t *src,
+                      uintptr_t src_width, uintptr_t src_height,
+                      const int16_t *param)
 {
-    uintptr_t dst_width = src_width + 8;
-    uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
-    uintptr_t step = STRIPE_WIDTH * src_height;
-
-    uintptr_t offs = 0;
-    int16_t buf[2 * STRIPE_WIDTH];
-    int16_t *ptr = buf + STRIPE_WIDTH;
-    for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
-        for (uintptr_t y = 0; y < src_height; ++y) {
-            copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
-            copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
-            for (int k = 0; k < STRIPE_WIDTH; ++k)
-                dst[k] = blur_func(ptr[k - 8], ptr[k - 7], ptr[k - 6], ptr[k - 5], ptr[k - 4],
-                                   ptr[k - 3], ptr[k - 2], ptr[k - 1], ptr[k - 0], param);
-            dst += STRIPE_WIDTH;
-            offs += STRIPE_WIDTH;
-        }
-    }
+    blur_vert(dst, src, src_width, src_height, param, 7);
 }
 
-void ass_blur1234_vert_c(int16_t *dst, const int16_t *src,
-                         uintptr_t src_width, uintptr_t src_height,
-                         const int16_t *param)
+void ass_blur8_horz_c(int16_t *dst, const int16_t *src,
+                      uintptr_t src_width, uintptr_t src_height,
+                      const int16_t *param)
 {
-    uintptr_t dst_height = src_height + 8;
-    uintptr_t step = STRIPE_WIDTH * src_height;
-
-    for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
-        uintptr_t offs = 0;
-        for (uintptr_t y = 0; y < dst_height; ++y) {
-            const int16_t *p4 = get_line(src, offs - 8 * STRIPE_WIDTH, step);
-            const int16_t *p3 = get_line(src, offs - 7 * STRIPE_WIDTH, step);
-            const int16_t *p2 = get_line(src, offs - 6 * STRIPE_WIDTH, step);
-            const int16_t *p1 = get_line(src, offs - 5 * STRIPE_WIDTH, step);
-            const int16_t *z0 = get_line(src, offs - 4 * STRIPE_WIDTH, step);
-            const int16_t *n1 = get_line(src, offs - 3 * STRIPE_WIDTH, step);
-            const int16_t *n2 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
-            const int16_t *n3 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
-            const int16_t *n4 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
-            for (int k = 0; k < STRIPE_WIDTH; ++k)
-                dst[k] = blur_func(p4[k], p3[k], p2[k], p1[k], z0[k],
-                                   n1[k], n2[k], n3[k], n4[k], param);
-            dst += STRIPE_WIDTH;
-            offs += STRIPE_WIDTH;
-        }
-        src += step;
-    }
+    blur_horz(dst, src, src_width, src_height, param, 8);
 }
 
-void ass_blur1235_horz_c(int16_t *dst, const int16_t *src,
-                         uintptr_t src_width, uintptr_t src_height,
-                         const int16_t *param)
+void ass_blur8_vert_c(int16_t *dst, const int16_t *src,
+                      uintptr_t src_width, uintptr_t src_height,
+                      const int16_t *param)
 {
-    uintptr_t dst_width = src_width + 10;
-    uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
-    uintptr_t step = STRIPE_WIDTH * src_height;
-
-    uintptr_t offs = 0;
-#if STRIPE_WIDTH < 10
-    int16_t buf[3 * STRIPE_WIDTH];
-    int16_t *ptr = buf + 2 * STRIPE_WIDTH;
-#else
-    int16_t buf[2 * STRIPE_WIDTH];
-    int16_t *ptr = buf + STRIPE_WIDTH;
-#endif
-    for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
-        for (uintptr_t y = 0; y < src_height; ++y) {
-#if STRIPE_WIDTH < 10
-            copy_line(ptr - 2 * STRIPE_WIDTH, src, offs - 2 * step, size);
-#endif
-            copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
-            copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
-            for (int k = 0; k < STRIPE_WIDTH; ++k)
-                dst[k] = blur_func(ptr[k - 10], ptr[k - 8], ptr[k - 7], ptr[k - 6], ptr[k - 5],
-                                   ptr[k -  4], ptr[k - 3], ptr[k - 2], ptr[k - 0], param);
-            dst += STRIPE_WIDTH;
-            offs += STRIPE_WIDTH;
-        }
-    }
-}
-
-void ass_blur1235_vert_c(int16_t *dst, const int16_t *src,
-                         uintptr_t src_width, uintptr_t src_height,
-                         const int16_t *param)
-{
-    uintptr_t dst_height = src_height + 10;
-    uintptr_t step = STRIPE_WIDTH * src_height;
-
-    for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
-        uintptr_t offs = 0;
-        for (uintptr_t y = 0; y < dst_height; ++y) {
-            const int16_t *p4 = get_line(src, offs - 10 * STRIPE_WIDTH, step);
-            const int16_t *p3 = get_line(src, offs -  8 * STRIPE_WIDTH, step);
-            const int16_t *p2 = get_line(src, offs -  7 * STRIPE_WIDTH, step);
-            const int16_t *p1 = get_line(src, offs -  6 * STRIPE_WIDTH, step);
-            const int16_t *z0 = get_line(src, offs -  5 * STRIPE_WIDTH, step);
-            const int16_t *n1 = get_line(src, offs -  4 * STRIPE_WIDTH, step);
-            const int16_t *n2 = get_line(src, offs -  3 * STRIPE_WIDTH, step);
-            const int16_t *n3 = get_line(src, offs -  2 * STRIPE_WIDTH, step);
-            const int16_t *n4 = get_line(src, offs -  0 * STRIPE_WIDTH, step);
-            for (int k = 0; k < STRIPE_WIDTH; ++k)
-                dst[k] = blur_func(p4[k], p3[k], p2[k], p1[k], z0[k],
-                                   n1[k], n2[k], n3[k], n4[k], param);
-            dst += STRIPE_WIDTH;
-            offs += STRIPE_WIDTH;
-        }
-        src += step;
-    }
-}
-
-void ass_blur1246_horz_c(int16_t *dst, const int16_t *src,
-                         uintptr_t src_width, uintptr_t src_height,
-                         const int16_t *param)
-{
-    uintptr_t dst_width = src_width + 12;
-    uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
-    uintptr_t step = STRIPE_WIDTH * src_height;
-
-    uintptr_t offs = 0;
-#if STRIPE_WIDTH < 12
-    int16_t buf[3 * STRIPE_WIDTH];
-    int16_t *ptr = buf + 2 * STRIPE_WIDTH;
-#else
-    int16_t buf[2 * STRIPE_WIDTH];
-    int16_t *ptr = buf + STRIPE_WIDTH;
-#endif
-    for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
-        for (uintptr_t y = 0; y < src_height; ++y) {
-#if STRIPE_WIDTH < 12
-            copy_line(ptr - 2 * STRIPE_WIDTH, src, offs - 2 * step, size);
-#endif
-            copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
-            copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
-            for (int k = 0; k < STRIPE_WIDTH; ++k)
-                dst[k] = blur_func(ptr[k - 12], ptr[k - 10], ptr[k - 8], ptr[k - 7], ptr[k - 6],
-                                   ptr[k -  5], ptr[k -  4], ptr[k - 2], ptr[k - 0], param);
-            dst += STRIPE_WIDTH;
-            offs += STRIPE_WIDTH;
-        }
-    }
-}
-
-void ass_blur1246_vert_c(int16_t *dst, const int16_t *src,
-                         uintptr_t src_width, uintptr_t src_height,
-                         const int16_t *param)
-{
-    uintptr_t dst_height = src_height + 12;
-    uintptr_t step = STRIPE_WIDTH * src_height;
-
-    for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
-        uintptr_t offs = 0;
-        for (uintptr_t y = 0; y < dst_height; ++y) {
-            const int16_t *p4 = get_line(src, offs - 12 * STRIPE_WIDTH, step);
-            const int16_t *p3 = get_line(src, offs - 10 * STRIPE_WIDTH, step);
-            const int16_t *p2 = get_line(src, offs -  8 * STRIPE_WIDTH, step);
-            const int16_t *p1 = get_line(src, offs -  7 * STRIPE_WIDTH, step);
-            const int16_t *z0 = get_line(src, offs -  6 * STRIPE_WIDTH, step);
-            const int16_t *n1 = get_line(src, offs -  5 * STRIPE_WIDTH, step);
-            const int16_t *n2 = get_line(src, offs -  4 * STRIPE_WIDTH, step);
-            const int16_t *n3 = get_line(src, offs -  2 * STRIPE_WIDTH, step);
-            const int16_t *n4 = get_line(src, offs -  0 * STRIPE_WIDTH, step);
-            for (int k = 0; k < STRIPE_WIDTH; ++k)
-                dst[k] = blur_func(p4[k], p3[k], p2[k], p1[k], z0[k],
-                                   n1[k], n2[k], n3[k], n4[k], param);
-            dst += STRIPE_WIDTH;
-            offs += STRIPE_WIDTH;
-        }
-        src += step;
-    }
+    blur_vert(dst, src, src_width, src_height, param, 8);
 }
 
 
@@ -665,27 +412,17 @@ static void calc_gauss(double *res, int n, double r2)
     res[0] = cur;
     cur *= mul;
     res[1] = cur;
-    for (int i = 2; i <= n; ++i) {
+    for (int i = 2; i < n; i++) {
         mul *= mul2;
         cur *= mul;
         res[i] = cur;
     }
 }
 
-static void coeff_blur121(double *coeff, int n)
-{
-    double prev = coeff[1];
-    for (int i = 0; i <= n; ++i) {
-        double res = (prev + 2 * coeff[i] + coeff[i + 1]) / 4;
-        prev = coeff[i];
-        coeff[i] = res;
-    }
-}
-
 static void coeff_filter(double *coeff, int n, const double kernel[4])
 {
     double prev1 = coeff[1], prev2 = coeff[2], prev3 = coeff[3];
-    for (int i = 0; i <= n; ++i) {
+    for (int i = 0; i < n; i++) {
         double res = coeff[i + 0]  * kernel[0] +
             (prev1 + coeff[i + 1]) * kernel[1] +
             (prev2 + coeff[i + 2]) * kernel[2] +
@@ -697,142 +434,97 @@ static void coeff_filter(double *coeff, int n, const double kernel[4])
     }
 }
 
-static void calc_matrix(double mat[4][4], const double *mat_freq, const int *index)
+static void calc_matrix(double mat[][8], const double *mat_freq, int n)
 {
-    for (int i = 0; i < 4; ++i) {
-        mat[i][i] = mat_freq[2 * index[i]] + 3 * mat_freq[0] - 4 * mat_freq[index[i]];
-        for (int j = i + 1; j < 4; ++j)
-            mat[i][j] = mat[j][i] =
-                mat_freq[index[i] + index[j]] + mat_freq[index[j] - index[i]] +
-                2 * (mat_freq[0] - mat_freq[index[i]] - mat_freq[index[j]]);
+    for (int i = 0; i < n; i++) {
+        mat[i][i] = mat_freq[2 * i + 2] + 3 * mat_freq[0] - 4 * mat_freq[i + 1];
+        for (int j = i + 1; j < n; j++)
+            mat[i][j] = mat[j][i] = mat_freq[i + j + 2] + mat_freq[j - i] +
+                2 * (mat_freq[0] - mat_freq[i + 1] - mat_freq[j + 1]);
     }
 
     // invert transpose
-    for (int k = 0; k < 4; ++k) {
-        int ip = k, jp = k;  // pivot
-        double z = 1 / mat[ip][jp];
-        mat[ip][jp] = 1;
-        for (int i = 0; i < 4; ++i) {
-            if (i == ip)
+    for (int k = 0; k < n; k++) {
+        double z = 1 / mat[k][k];
+        mat[k][k] = 1;
+        for (int i = 0; i < n; i++) {
+            if (i == k)
                 continue;
 
-            double mul = mat[i][jp] * z;
-            mat[i][jp] = 0;
-            for (int j = 0; j < 4; ++j)
-                mat[i][j] -= mat[ip][j] * mul;
+            double mul = mat[i][k] * z;
+            mat[i][k] = 0;
+            for (int j = 0; j < n; j++)
+                mat[i][j] -= mat[k][j] * mul;
         }
-        for (int j = 0; j < 4; ++j)
-            mat[ip][j] *= z;
+        for (int j = 0; j < n; j++)
+            mat[k][j] *= z;
     }
 }
 
 /**
  * \brief Solve least squares problem for kernel of the main filter
  * \param mu out: output coefficients
- * \param index in: filter tap positions
- * \param prefilter in: supplementary filter type
+ * \param n in: filter kernel radius, i.e. number of coefficients (1 to 8)
  * \param r2 in: desired standard deviation squared
  * \param mul in: scale multiplier
  */
-static void calc_coeff(double mu[4], const int index[4], int prefilter, double r2, double mul)
+static void calc_coeff(double mu[], int n, double r2, double mul)
 {
-    double mul2 = mul * mul, mul3 = mul2 * mul;
+    assert(n > 0 && n <= 8);
+
+    const double w = 12096;
     double kernel[] = {
-        (5204 + 2520 * mul + 1092 * mul2 + 3280 * mul3) / 12096,
-        (2943 -  210 * mul -  273 * mul2 - 2460 * mul3) / 12096,
-        ( 486 -  924 * mul -  546 * mul2 +  984 * mul3) / 12096,
-        (  17 -  126 * mul +  273 * mul2 -  164 * mul3) / 12096,
+        ((( + 3280 / w) * mul + 1092 / w) * mul + 2520 / w) * mul + 5204 / w,
+        ((( - 2460 / w) * mul -  273 / w) * mul -  210 / w) * mul + 2943 / w,
+        ((( +  984 / w) * mul -  546 / w) * mul -  924 / w) * mul +  486 / w,
+        ((( -  164 / w) * mul +  273 / w) * mul -  126 / w) * mul +   17 / w,
     };
 
-    double mat_freq[14];
-    memcpy(mat_freq, kernel, sizeof(kernel));
-    memset(mat_freq + 4, 0, sizeof(mat_freq) - sizeof(kernel));
-    int n = 6;
-    coeff_filter(mat_freq, n, kernel);
-    for (int k = 0; k < 2 * prefilter; ++k)
-        coeff_blur121(mat_freq, ++n);
-
-    double vec_freq[13];
-    n = index[3] + prefilter + 3;
-    calc_gauss(vec_freq, n, r2);
-    memset(vec_freq + n + 1, 0, sizeof(vec_freq) - (n + 1) * sizeof(vec_freq[0]));
-    n -= 3;
-    coeff_filter(vec_freq, n, kernel);
-    for (int k = 0; k < prefilter; ++k)
-        coeff_blur121(vec_freq, --n);
-
-    double mat[4][4];
-    calc_matrix(mat, mat_freq, index);
-
-    double vec[4];
-    for (int i = 0; i < 4; ++i)
-        vec[i] = mat_freq[0] - mat_freq[index[i]] - vec_freq[0] + vec_freq[index[i]];
-
-    for (int i = 0; i < 4; ++i) {
+    double mat_freq[17] = { kernel[0], kernel[1], kernel[2], kernel[3] };
+    coeff_filter(mat_freq, 7, kernel);
+
+    double vec_freq[12];
+    calc_gauss(vec_freq, n + 4, r2 * mul);
+    coeff_filter(vec_freq, n + 1, kernel);
+
+    double mat[8][8];
+    calc_matrix(mat, mat_freq, n);
+
+    double vec[8];
+    for (int i = 0; i < n; i++)
+        vec[i] = mat_freq[0] - mat_freq[i + 1] - vec_freq[0] + vec_freq[i + 1];
+
+    for (int i = 0; i < n; i++) {
         double res = 0;
-        for (int j = 0; j < 4; ++j)
+        for (int j = 0; j < n; j++)
             res += mat[i][j] * vec[j];
         mu[i] = FFMAX(0, res);
     }
 }
 
 typedef struct {
-    int level, prefilter, filter;
-    int16_t coeff[4];
+    int level, radius;
+    int16_t coeff[8];
 } BlurMethod;
 
 static void find_best_method(BlurMethod *blur, double r2)
 {
-    static const int index[][4] = {
-        { 1, 2, 3, 4 },
-        { 1, 2, 3, 5 },
-        { 1, 2, 4, 6 },
-    };
-
-    double mu[5];
-    if (r2 < 1.9) {
-        blur->level = blur->prefilter = blur->filter = 0;
-
-        if (r2 < 0.5) {
-            mu[2] = 0.085 * r2 * r2 * r2;
-            mu[1] = 0.5 * r2 - 4 * mu[2];
-            mu[3] = mu[4] = 0;
-        } else {
-            calc_gauss(mu, 4, r2);
-        }
+    double mu[8];
+    if (r2 < 0.5) {
+        blur->level = 0;
+        blur->radius = 4;
+        mu[1] = 0.085 * r2 * r2 * r2;
+        mu[0] = 0.5 * r2 - 4 * mu[1];
+        mu[2] = mu[3] = 0;
     } else {
-        double mul = 1;
-        if (r2 < 6.693) {
-            blur->level = 0;
-
-            if (r2 < 2.8)
-                blur->prefilter = 1;
-            else if (r2 < 4.4)
-                blur->prefilter = 2;
-            else
-                blur->prefilter = 3;
-
-            blur->filter = blur->prefilter - 1;
-        } else {
-            frexp((r2 + 0.7) / 26.5, &blur->level);
-            blur->level = (blur->level + 3) >> 1;
-            mul = pow(0.25, blur->level);
-            r2 *= mul;
-
-            if (r2 < 3.15 - 1.5 * mul)
-                blur->prefilter = 0;
-            else if (r2 < 5.3 - 5.2 * mul)
-                blur->prefilter = 1;
-            else
-                blur->prefilter = 2;
-
-            blur->filter = blur->prefilter;
-        }
-        calc_coeff(mu + 1, index[blur->filter], blur->prefilter, r2, mul);
+        double frac = frexp(sqrt(0.11569 * r2 + 0.20591047), &blur->level);
+        double mul = pow(0.25, blur->level);
+        blur->radius = 8 - (int) ((10.1525 + 0.8335 * mul) * (1 - frac));
+        blur->radius = FFMAX(blur->radius, 4);
+        calc_coeff(mu, blur->radius, r2, mul);
     }
-
-    for (int i = 1; i <= 4; ++i)
-        blur->coeff[i - 1] = (int) (0x10000 * mu[i] + 0.5);
+    for (int i = 0; i < blur->radius; i++)
+        blur->coeff[i] = (int) (0x10000 * mu[i] + 0.5);
 }
 
 /**
@@ -844,19 +536,16 @@ bool ass_gaussian_blur(const BitmapEngine *engine, Bitmap *bm, double r2)
     BlurMethod blur;
     find_best_method(&blur, r2);
 
-    int w = bm->w, h = bm->h;
-    int offset = ((2 * (blur.prefilter + blur.filter) + 17) << blur.level) - 5;
-    int end_w = ((w + offset) & ~((1 << blur.level) - 1)) - 4;
-    int end_h = ((h + offset) & ~((1 << blur.level) - 1)) - 4;
-
-    if (end_w >= INT_MAX / 4)
-        return false;
+    uint32_t w = bm->w, h = bm->h;
+    int offset = ((2 * blur.radius + 9) << blur.level) - 5;
+    uint32_t end_w = ((w + offset) & ~((1 << blur.level) - 1)) - 4;
+    uint32_t end_h = ((h + offset) & ~((1 << blur.level) - 1)) - 4;
 
     const int stripe_width = 1 << (engine->align_order - 1);
-    int aligned_end_w = (end_w + stripe_width - 1) & ~(stripe_width - 1);
-    if (end_h >= INT_MAX / 8 / aligned_end_w)
+    uint64_t size = (((uint64_t) end_w + stripe_width - 1) & ~(stripe_width - 1)) * end_h;
+    if (size > INT_MAX / 4)
         return false;
-    int size = end_h * aligned_end_w;
+
     int16_t *tmp = ass_aligned_alloc(2 * stripe_width, 4 * size, false);
     if (!tmp)
         return false;
@@ -875,27 +564,18 @@ bool ass_gaussian_blur(const BitmapEngine *engine, Bitmap *bm, double r2)
         w = (w + 5) >> 1;
         index ^= 1;
     }
-    if (blur.prefilter) {
-        engine->pre_blur_horz[blur.prefilter - 1](buf[index ^ 1], buf[index], w, h);
-        w += 2 * blur.prefilter;
-        index ^= 1;
-    }
-    engine->main_blur_horz[blur.filter](buf[index ^ 1], buf[index], w, h, blur.coeff);
-    w += 2 * blur.filter + 8;
+    assert(blur.radius >= 4 && blur.radius <= 8);  // blur_horz/blur_vert have 5 entries, indexed by radius - 4
+    engine->blur_horz[blur.radius - 4](buf[index ^ 1], buf[index], w, h, blur.coeff);
+    w += 2 * blur.radius;  // track the width after the horizontal pass
+    index ^= 1;  // swap source and destination buffers
+    engine->blur_vert[blur.radius - 4](buf[index ^ 1], buf[index], w, h, blur.coeff);  // vertical pass runs before the expand loops
+    h += 2 * blur.radius;  // track the height after the vertical pass
     index ^= 1;
     for (int i = 0; i < blur.level; ++i) {
         engine->expand_horz(buf[index ^ 1], buf[index], w, h);
         w = 2 * w + 4;
         index ^= 1;
     }
-    if (blur.prefilter) {
-        engine->pre_blur_vert[blur.prefilter - 1](buf[index ^ 1], buf[index], w, h);
-        h += 2 * blur.prefilter;
-        index ^= 1;
-    }
-    engine->main_blur_vert[blur.filter](buf[index ^ 1], buf[index], w, h, blur.coeff);
-    h += 2 * blur.filter + 8;
-    index ^= 1;
     for (int i = 0; i < blur.level; ++i) {
         engine->expand_vert(buf[index ^ 1], buf[index], w, h);
         h = 2 * h + 4;
@@ -907,7 +587,7 @@ bool ass_gaussian_blur(const BitmapEngine *engine, Bitmap *bm, double r2)
         ass_aligned_free(tmp);
         return false;
     }
-    offset = ((blur.prefilter + blur.filter + 8) << blur.level) - 4;
+    offset = ((blur.radius + 4) << blur.level) - 4;  // subtracted from bm->left and bm->top below
     bm->left -= offset;
     bm->top  -= offset;
 
diff --git a/libass/ass_func_template.h b/libass/ass_func_template.h
index 381d3fb..79ca3a6 100644
--- a/libass/ass_func_template.h
+++ b/libass/ass_func_template.h
@@ -57,36 +57,36 @@ void DECORATE(expand_horz)(int16_t *dst, const int16_t *src,
                            uintptr_t src_width, uintptr_t src_height);
 void DECORATE(expand_vert)(int16_t *dst, const int16_t *src,
                            uintptr_t src_width, uintptr_t src_height);
-void DECORATE(pre_blur1_horz)(int16_t *dst, const int16_t *src,
-                              uintptr_t src_width, uintptr_t src_height);
-void DECORATE(pre_blur1_vert)(int16_t *dst, const int16_t *src,
-                              uintptr_t src_width, uintptr_t src_height);
-void DECORATE(pre_blur2_horz)(int16_t *dst, const int16_t *src,
-                              uintptr_t src_width, uintptr_t src_height);
-void DECORATE(pre_blur2_vert)(int16_t *dst, const int16_t *src,
-                              uintptr_t src_width, uintptr_t src_height);
-void DECORATE(pre_blur3_horz)(int16_t *dst, const int16_t *src,
-                              uintptr_t src_width, uintptr_t src_height);
-void DECORATE(pre_blur3_vert)(int16_t *dst, const int16_t *src,
-                              uintptr_t src_width, uintptr_t src_height);
-void DECORATE(blur1234_horz)(int16_t *dst, const int16_t *src,
-                             uintptr_t src_width, uintptr_t src_height,
-                             const int16_t *param);
-void DECORATE(blur1234_vert)(int16_t *dst, const int16_t *src,
-                             uintptr_t src_width, uintptr_t src_height,
-                             const int16_t *param);
-void DECORATE(blur1235_horz)(int16_t *dst, const int16_t *src,
-                             uintptr_t src_width, uintptr_t src_height,
-                             const int16_t *param);
-void DECORATE(blur1235_vert)(int16_t *dst, const int16_t *src,
-                             uintptr_t src_width, uintptr_t src_height,
-                             const int16_t *param);
-void DECORATE(blur1246_horz)(int16_t *dst, const int16_t *src,
-                             uintptr_t src_width, uintptr_t src_height,
-                             const int16_t *param);
-void DECORATE(blur1246_vert)(int16_t *dst, const int16_t *src,
-                             uintptr_t src_width, uintptr_t src_height,
-                             const int16_t *param);
+void DECORATE(blur4_horz)(int16_t *dst, const int16_t *src,
+                          uintptr_t src_width, uintptr_t src_height,
+                          const int16_t *param);
+void DECORATE(blur4_vert)(int16_t *dst, const int16_t *src,
+                          uintptr_t src_width, uintptr_t src_height,
+                          const int16_t *param);
+void DECORATE(blur5_horz)(int16_t *dst, const int16_t *src,
+                          uintptr_t src_width, uintptr_t src_height,
+                          const int16_t *param);
+void DECORATE(blur5_vert)(int16_t *dst, const int16_t *src,
+                          uintptr_t src_width, uintptr_t src_height,
+                          const int16_t *param);
+void DECORATE(blur6_horz)(int16_t *dst, const int16_t *src,
+                          uintptr_t src_width, uintptr_t src_height,
+                          const int16_t *param);
+void DECORATE(blur6_vert)(int16_t *dst, const int16_t *src,
+                          uintptr_t src_width, uintptr_t src_height,
+                          const int16_t *param);
+void DECORATE(blur7_horz)(int16_t *dst, const int16_t *src,
+                          uintptr_t src_width, uintptr_t src_height,
+                          const int16_t *param);
+void DECORATE(blur7_vert)(int16_t *dst, const int16_t *src,
+                          uintptr_t src_width, uintptr_t src_height,
+                          const int16_t *param);
+void DECORATE(blur8_horz)(int16_t *dst, const int16_t *src,
+                          uintptr_t src_width, uintptr_t src_height,
+                          const int16_t *param);
+void DECORATE(blur8_vert)(int16_t *dst, const int16_t *src,
+                          uintptr_t src_width, uintptr_t src_height,
+                          const int16_t *param);
 
 
 const BitmapEngine DECORATE(bitmap_engine) = {
@@ -125,8 +125,6 @@ const BitmapEngine DECORATE(bitmap_engine) = {
     .shrink_vert = DECORATE(shrink_vert),
     .expand_horz = DECORATE(expand_horz),
     .expand_vert = DECORATE(expand_vert),
-    .pre_blur_horz = { DECORATE(pre_blur1_horz), DECORATE(pre_blur2_horz), DECORATE(pre_blur3_horz) },
-    .pre_blur_vert = { DECORATE(pre_blur1_vert), DECORATE(pre_blur2_vert), DECORATE(pre_blur3_vert) },
-    .main_blur_horz = { DECORATE(blur1234_horz), DECORATE(blur1235_horz), DECORATE(blur1246_horz) },
-    .main_blur_vert = { DECORATE(blur1234_vert), DECORATE(blur1235_vert), DECORATE(blur1246_vert) },
+    .blur_horz = { DECORATE(blur4_horz), DECORATE(blur5_horz), DECORATE(blur6_horz), DECORATE(blur7_horz), DECORATE(blur8_horz) },
+    .blur_vert = { DECORATE(blur4_vert), DECORATE(blur5_vert), DECORATE(blur6_vert), DECORATE(blur7_vert), DECORATE(blur8_vert) },
 };
diff --git a/libass/x86/blur.asm b/libass/x86/blur.asm
index ba35f9d..88636a6 100644
--- a/libass/x86/blur.asm
+++ b/libass/x86/blur.asm
@@ -203,7 +203,7 @@ STRIPE_PACK
     lea %6, [%5]
     cmp %6, %3
     cmovae %6, %4
-%if (mmsize != 32) || (%0 < 7)
+%if mmsize != 32 || %0 < 7
     mova m%1, [%2 + %6]
 %elifidn %7, left
     mova xm%1, [%2 + %6]
@@ -219,7 +219,7 @@ STRIPE_PACK
     sub %5, %2
     cmp %4, %3
     cmovb %5, %4
-%if (mmsize != 32) || (%0 < 6)
+%if mmsize != 32 || %0 < 6
     mova m%1, [%2 + %5]
 %elifidn %6, left
     mova xm%1, [%2 + %5]
@@ -286,12 +286,8 @@ cglobal shrink_horz, 4,7,8
     mova m3, m0
     mova m4, m1
 %endif
-    psrldq m3, 10
-    psrldq m4, 10
-    pslldq m6, m1, 6
-    por m3, m6
-    pslldq m6, m2, 6
-    por m4, m6
+    PALIGNR m3,m1,m3, m6, 10
+    PALIGNR m4,m2,m4, m6, 10
     paddw m3, m1
     paddw m4, m2
     pand m3, m7
@@ -310,14 +306,10 @@ cglobal shrink_horz, 4,7,8
 %if mmsize == 32
     vperm2i128 m0, m0, m1, 0x20
 %endif
-    psrldq m0, 8
-    pslldq m6, m1, 8
-    por m0, m6
-    paddd m5, m0, m1
+    PALIGNR m5,m1,m0, m6, 8
+    paddd m5, m1
     psrld m5, 1
-    psrldq m0, 4
-    pslldq m6, m1, 4
-    por m0, m6
+    PALIGNR m0,m1,m0, m6, 12
     paddd m5, m0
     psrld m5, 1
     paddd m5, m3
@@ -327,14 +319,10 @@ cglobal shrink_horz, 4,7,8
 %if mmsize == 32
     vperm2i128 m1, m1, m2, 0x21
 %endif
-    psrldq m1, 8
-    pslldq m6, m2, 8
-    por m1, m6
-    paddd m5, m1, m2
+    PALIGNR m5,m2,m1, m6, 8
+    paddd m5, m2
     psrld m5, 1
-    psrldq m1, 4
-    pslldq m6, m2, 4
-    por m1, m6
+    PALIGNR m1,m2,m1, m6, 12
     paddd m5, m1
     psrld m5, 1
     paddd m5, m4
@@ -501,25 +489,20 @@ cglobal expand_horz, 4,7,5
 %endif
 .main_loop:
 %if ARCH_X86_64
-    LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+    LOAD_LINE 2, r1,r2,r7, r4 + 0 * r3, r6, right
     LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
 %else
-    LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+    LOAD_LINE_COMPACT 2, r1,r2,r4, r6, right
     add r4, r3
     LOAD_LINE_COMPACT 1, r1,r2,r4, r6
     sub r4, r3
 %endif
 
 %if mmsize == 32
-    vperm2i128 m0, m0, m1, 0x20
+    vperm2i128 m2, m2, m1, 0x20
 %endif
-    psrldq m0, 12
-    pslldq m3, m1, 4
-    por m0, m3
-    psrldq m2, m0, 2
-    pslldq m3, m1, 2
-    por m2, m3
-
+    PALIGNR m0,m1,m2, m3, 12
+    PALIGNR m2,m1,m2, m3, 14
     paddw m3, m0, m1
     psrlw m3, 1
     paddw m3, m2
@@ -564,22 +547,17 @@ cglobal expand_horz, 4,7,5
 
 .odd_stripe:
 %if ARCH_X86_64
-    LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+    LOAD_LINE 2, r1,r2,r7, r4 + 0 * r3, r6, right
     LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6, left
 %else
-    LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+    LOAD_LINE_COMPACT 2, r1,r2,r4, r6, right
     add r4, r3
     LOAD_LINE_COMPACT 1, r1,r2,r4, r6, left
     sub r4, r3
 %endif
 
-    psrldq xm0, 12
-    pslldq xm3, xm1, 4
-    por xm0, xm3
-    psrldq xm2, xm0, 2
-    pslldq xm3, xm1, 2
-    por xm2, xm3
-
+    PALIGNR xm0,xm1,xm2, xm3, 12
+    PALIGNR xm2,xm1,xm2, xm3, 14
     paddw xm3, xm0, xm1
     psrlw xm3, 1
     paddw xm3, xm2
@@ -674,313 +652,52 @@ INIT_YMM avx2
 EXPAND_VERT
 
 ;------------------------------------------------------------------------------
-; PRE_BLUR1_HORZ
-; void pre_blur1_horz(int16_t *dst, const int16_t *src,
-;                     uintptr_t src_width, uintptr_t src_height);
+; LOAD_MULTIPLIER 1:n, 2:m_mul, 3:src, 4:tmp
+; Load n blur coefficients from src; x86-64 broadcasts each pair to its own register from m_mul on
 ;------------------------------------------------------------------------------
 
-%macro PRE_BLUR1_HORZ 0
+%macro LOAD_MULTIPLIER 4
 %if ARCH_X86_64
-cglobal pre_blur1_horz, 4,8,4
+    %assign %%t %2 + (%1 - 1) / 2
 %else
-cglobal pre_blur1_horz, 4,7,4
-%endif
-    lea r5, [2 * r2 + mmsize + 3]
-    lea r2, [2 * r2 + mmsize - 1]
-    and r5, ~(mmsize - 1)
-    and r2, ~(mmsize - 1)
-    imul r5, r3
-    imul r2, r3
-    add r5, r0
-    xor r4, r4
-    MUL r3, mmsize
-    sub r4, r3
-    mova m3, [words_one]
-%if ARCH_X86_64
-    lea r7, [words_zero]
-    sub r7, r1
+    %assign %%t %2
 %endif
-
-.main_loop:
-%if ARCH_X86_64
-    LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
-    LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
-%else
-    LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
-    add r4, r3
-    LOAD_LINE_COMPACT 1, r1,r2,r4, r6
-    sub r4, r3
+    movu xm %+ %%t, [%3]
+%if %1 % 2
+    pextrw %4d, xm %+ %%t, 0
+    pslldq xm %+ %%t, 2
+    pinsrw xm %+ %%t, %4d, 0
 %endif
-
 %if mmsize == 32
-    vperm2i128 m0, m0, m1, 0x20
-%endif
-    psrldq m0, 12
-    pslldq m2, m1, 4
-    por m0, m2
-    psrldq m2, m0, 2
-    paddw m0, m1
-    pslldq m1, 2
-    psrlw m0, 1
-    por m1, m2
-    paddw m0, m1
-    paddw m0, m3
-    psrlw m0, 1
-
-    mova [r0], m0
-    add r0, mmsize
-    add r4, mmsize
-    cmp r0, r5
-    jb .main_loop
-    RET
-%endmacro
-
-INIT_XMM sse2
-PRE_BLUR1_HORZ
-INIT_YMM avx2
-PRE_BLUR1_HORZ
-
-;------------------------------------------------------------------------------
-; PRE_BLUR1_VERT
-; void pre_blur1_vert(int16_t *dst, const int16_t *src,
-;                     uintptr_t src_width, uintptr_t src_height);
-;------------------------------------------------------------------------------
-
-%macro PRE_BLUR1_VERT 0
-cglobal pre_blur1_vert, 4,7,4
-    lea r2, [2 * r2 + mmsize - 1]
-    lea r5, [r3 + 2]
-    and r2, ~(mmsize - 1)
-    imul r2, r5
-    MUL r3, mmsize
-    add r2, r0
-    mova m3, [words_one]
-    lea r6, [words_zero]
-    sub r6, r1
-
-.col_loop:
-    mov r4, -2 * mmsize
-    pxor m0, m0
-    pxor m1, m1
-.row_loop:
-    LOAD_LINE 2, r1,r3,r6, r4 + 2 * mmsize, r5
-
-    paddw m0, m2
-    psrlw m0, 1
-    paddw m0, m1
-    paddw m0, m3
-    psrlw m0, 1
-
-    mova [r0], m0
-    add r4, mmsize
-    add r0, mmsize
-    mova m0, m1
-    mova m1, m2
-    cmp r4, r3
-    jl .row_loop
-    add r1, r3
-    sub r6, r3
-    cmp r0, r2
-    jb .col_loop
-    RET
-%endmacro
-
-INIT_XMM sse2
-PRE_BLUR1_VERT
-INIT_YMM avx2
-PRE_BLUR1_VERT
-
-;------------------------------------------------------------------------------
-; PRE_BLUR2_HORZ
-; void pre_blur2_horz(int16_t *dst, const int16_t *src,
-;                     uintptr_t src_width, uintptr_t src_height);
-;------------------------------------------------------------------------------
-
-%macro PRE_BLUR2_HORZ 0
-%if ARCH_X86_64
-cglobal pre_blur2_horz, 4,8,7
-%else
-cglobal pre_blur2_horz, 4,7,7
-%endif
-    lea r5, [2 * r2 + mmsize + 7]
-    lea r2, [2 * r2 + mmsize - 1]
-    and r5, ~(mmsize - 1)
-    and r2, ~(mmsize - 1)
-    imul r5, r3
-    imul r2, r3
-    add r5, r0
-    xor r4, r4
-    MUL r3, mmsize
-    sub r4, r3
-    mova m5, [words_one]
-    mova m6, [words_sign]
-%if ARCH_X86_64
-    lea r7, [words_zero]
-    sub r7, r1
+    vpermq m %+ %%t, m %+ %%t, q1010
 %endif
-
-.main_loop:
 %if ARCH_X86_64
-    LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
-    LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
-%else
-    LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
-    add r4, r3
-    LOAD_LINE_COMPACT 1, r1,r2,r4, r6
-    sub r4, r3
+    %assign %%i 0
+%rep (%1 + 1) / 2
+    %assign %%c %2 + %%i
+    pshufd m %+ %%c, m %+ %%t, q1111 * %%i
+    %assign %%i %%i + 1
+%endrep
 %endif
-
-%if mmsize == 32
-    vperm2i128 m0, m0, m1, 0x20
-%endif
-    psrldq m0, 8
-    pslldq m2, m1, 8
-    por m2, m0
-    paddw m2, m1
-    psrlw m2, 1
-    psrldq m0, 2
-    pslldq m3, m1, 6
-    por m3, m0
-    psrldq m0, 2
-    pslldq m4, m1, 4
-    por m4, m0
-    paddw m2, m4
-    psrlw m2, 1
-    paddw m2, m4
-    psrldq m0, 2
-    pslldq m1, 2
-    por m0, m1
-    paddw m0, m3
-    mova m1, m6
-    pand m1, m0
-    pand m1, m2
-    paddw m0, m2
-    psrlw m0, 1
-    por m0, m1
-    paddw m0, m5
-    psrlw m0, 1
-
-    mova [r0], m0
-    add r0, mmsize
-    add r4, mmsize
-    cmp r0, r5
-    jb .main_loop
-    RET
 %endmacro
 
-INIT_XMM sse2
-PRE_BLUR2_HORZ
-INIT_YMM avx2
-PRE_BLUR2_HORZ
-
 ;------------------------------------------------------------------------------
-; PRE_BLUR2_VERT
-; void pre_blur2_vert(int16_t *dst, const int16_t *src,
-;                     uintptr_t src_width, uintptr_t src_height);
+; FILTER_PAIR 1-2:m_acc[2], 3-4:m_line[2], 5:m_tmp, 6:m_mul, 7:pos
+; Calculate acc += line[0] * mul[odd] + line[1] * mul[even] with mul pair (pos - 1) / 2
 ;------------------------------------------------------------------------------
 
-%macro PRE_BLUR2_VERT 0
-%if ARCH_X86_64
-cglobal pre_blur2_vert, 4,7,9
-%else
-cglobal pre_blur2_vert, 4,7,8
-%endif
-    lea r2, [2 * r2 + mmsize - 1]
-    lea r5, [r3 + 4]
-    and r2, ~(mmsize - 1)
-    imul r2, r5
-    MUL r3, mmsize
-    add r2, r0
-    mova m7, [words_one]
-%if ARCH_X86_64
-    mova m8, [words_sign]
-%endif
-    lea r6, [words_zero]
-    sub r6, r1
-
-.col_loop:
-    mov r4, -4 * mmsize
-    pxor m0, m0
-    pxor m1, m1
-    pxor m2, m2
-    pxor m3, m3
-.row_loop:
-    LOAD_LINE 4, r1,r3,r6, r4 + 4 * mmsize, r5
-
-%if ARCH_X86_64
-    mova m6, m8
-%else
-    psllw m6, m7, 15
-%endif
-    paddw m0, m4
-    psrlw m0, 1
-    paddw m0, m2
-    psrlw m0, 1
-    paddw m0, m2
-    paddw m5, m1, m3
-    pand m6, m0
-    pand m6, m5
-    paddw m0, m5
-    psrlw m0, 1
-    por m0, m6
-    paddw m0, m7
-    psrlw m0, 1
-
-    mova [r0], m0
-    add r4, mmsize
-    add r0, mmsize
-    mova m0, m1
-    mova m1, m2
-    mova m2, m3
-    mova m3, m4
-    cmp r4, r3
-    jl .row_loop
-    add r1, r3
-    sub r6, r3
-    cmp r0, r2
-    jb .col_loop
-    RET
-%endmacro
-
-INIT_XMM sse2
-PRE_BLUR2_VERT
-INIT_YMM avx2
-PRE_BLUR2_VERT
-
-;------------------------------------------------------------------------------
-; ADD_LINE 1:m_acc1, 2:m_acc2, 3:m_line, 4-5:m_tmp
-; Calculate acc += line
-;------------------------------------------------------------------------------
-
-%macro ADD_LINE 5
-    psraw m%4, m%3, 15
-    punpcklwd m%5, m%3, m%4
-    punpckhwd m%3, m%4
-%ifidn %1, %5
-    paddd m%1, m%2
-%else
-    paddd m%1, m%5
-%endif
-    paddd m%2, m%3
-%endmacro
-
-;------------------------------------------------------------------------------
-; FILTER_PAIR 1:m_acc1, 2:m_acc2, 3:m_line1, 4:m_line2,
-;             5:m_tmp, 6:m_mul64, [7:m_mul32, 8:swizzle]
-; Calculate acc += line1 * mul[odd] + line2 * mul[even]
-;------------------------------------------------------------------------------
-
-%macro FILTER_PAIR 6-8
+%macro FILTER_PAIR 7
     punpcklwd m%5, m%4, m%3
     punpckhwd m%4, m%3
-%if ARCH_X86_64 || (%0 < 8)
-    pmaddwd m%5, m%6
-    pmaddwd m%4, m%6
+    %assign %%p ((%7) - 1) / 2
+%if ARCH_X86_64
+    %assign %%p %6 + %%p
 %else
-    pshufd m%3, m%7, %8
-    pmaddwd m%5, m%3
-    pmaddwd m%4, m%3
+    pshufd m%3, m%6, q1111 * %%p
+    %assign %%p %3
 %endif
+    pmaddwd m%5, m %+ %%p
+    pmaddwd m%4, m %+ %%p
 %ifidn %1, %5
     paddd m%1, m%2
 %else
@@ -990,225 +707,54 @@ PRE_BLUR2_VERT
 %endmacro
 
 ;------------------------------------------------------------------------------
-; PRE_BLUR3_HORZ
-; void pre_blur3_horz(int16_t *dst, const int16_t *src,
-;                     uintptr_t src_width, uintptr_t src_height);
+; NEXT_DIFF 1:m_res, 2:m_side, 3:m_center, 4:position, 5:left/right
+; Calculate difference between next offset line and center line
 ;------------------------------------------------------------------------------
 
-%macro PRE_BLUR3_HORZ 0
-%if ARCH_X86_64
-cglobal pre_blur3_horz, 4,8,9
-%else
-cglobal pre_blur3_horz, 4,7,8
-%endif
-    lea r5, [2 * r2 + mmsize + 11]
-    lea r2, [2 * r2 + mmsize - 1]
-    and r5, ~(mmsize - 1)
-    and r2, ~(mmsize - 1)
-    imul r5, r3
-    imul r2, r3
-    add r5, r0
-    xor r4, r4
-    MUL r3, mmsize
-    sub r4, r3
-    mova m5, [words_15_6]
-%if ARCH_X86_64
-    mova m8, [dwords_32]
-    lea r7, [words_zero]
-    sub r7, r1
-%endif
+%macro NEXT_DIFF 5
+%ifidn %5, left
 
-.main_loop:
-%if ARCH_X86_64
-    LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
-    LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
+%if cpuflag(ssse3)
+    palignr m%1, m%3, m%2, 16 - (%4)
 %else
-    LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
-    add r4, r3
-    LOAD_LINE_COMPACT 1, r1,r2,r4, r6
-    sub r4, r3
+    psrldq m%2, 2
+    pslldq m%1, m%3, %4
+    por m%1, m%2
 %endif
 
-%if ARCH_X86_64
-    mova m7, m8
-%else
-    mova m7, [dwords_32]
-%endif
-%if mmsize == 32
-    vperm2i128 m0, m0, m1, 0x20
-%endif
-    psrldq m2, m0, 10
-    pslldq m3, m1, 6
-    por m2, m3
-
-    psrldq m0, 4
-    pslldq m3, m2, 6
-    por m3, m0
-    psubw m3, m2
-    ADD_LINE 6,7, 3,4, 6
-
-    psrldq m0, 2
-    pslldq m3, m2, 4
-    por m3, m0
-    psubw m3, m2
-    psrldq m0, 2
-    pslldq m4, m2, 2
-    por m4, m0
-    psubw m4, m2
-    FILTER_PAIR 6,7, 3,4, 0, 5
-
-    psubw m3, m1, m2
-    ADD_LINE 6,7, 3,4, 0
-
-    pslldq m1, 2
-    psrldq m3, m2, 4
-    por m3, m1
-    psubw m3, m2
-    pslldq m1, 2
-    psrldq m4, m2, 2
-    por m4, m1
-    psubw m4, m2
-    FILTER_PAIR 6,7, 3,4, 0, 5
-
-    psrad m6, 6
-    psrad m7, 6
-    packssdw m6, m7
-    paddw m2, m6
-    mova [r0], m2
-    add r0, mmsize
-    add r4, mmsize
-    cmp r0, r5
-    jb .main_loop
-    RET
-%endmacro
-
-INIT_XMM sse2
-PRE_BLUR3_HORZ
-INIT_YMM avx2
-PRE_BLUR3_HORZ
+%elifidn %5, right
 
-;------------------------------------------------------------------------------
-; PRE_BLUR3_VERT
-; void pre_blur3_vert(int16_t *dst, const int16_t *src,
-;                     uintptr_t src_width, uintptr_t src_height);
-;------------------------------------------------------------------------------
-
-%macro PRE_BLUR3_VERT 0
-%if ARCH_X86_64
-cglobal pre_blur3_vert, 4,7,8
+%if cpuflag(ssse3)
+    palignr m%1, m%2, m%3, %4
 %else
-cglobal pre_blur3_vert, 4,7,8
+    pslldq m%2, 2
+    psrldq m%1, m%3, %4
+    por m%1, m%2
 %endif
-    lea r2, [2 * r2 + mmsize - 1]
-    lea r5, [r3 + 6]
-    and r2, ~(mmsize - 1)
-    imul r2, r5
-    MUL r3, mmsize
-    add r2, r0
-    mova m4, [dwords_32]
-    mova m5, [words_15_6]
-    lea r6, [words_zero]
-    sub r6, r1
 
-.col_loop:
-    mov r4, -6 * mmsize
-.row_loop:
-    mova m6, m4
-    mova m7, m4
-    LOAD_LINE 0, r1,r3,r6, r4 + 3 * mmsize, r5
-
-    LOAD_LINE 1, r1,r3,r6, r4 + 0 * mmsize, r5
-    psubw m1, m0
-    ADD_LINE 6,7, 1,2, 3
-
-    LOAD_LINE 1, r1,r3,r6, r4 + 1 * mmsize, r5
-    LOAD_LINE 2, r1,r3,r6, r4 + 2 * mmsize, r5
-    psubw m1, m0
-    psubw m2, m0
-    FILTER_PAIR 6,7, 1,2, 3, 5
-
-    LOAD_LINE 1, r1,r3,r6, r4 + 6 * mmsize, r5
-    psubw m1, m0
-    ADD_LINE 6,7, 1,2, 3
-
-    LOAD_LINE 1, r1,r3,r6, r4 + 5 * mmsize, r5
-    LOAD_LINE 2, r1,r3,r6, r4 + 4 * mmsize, r5
-    psubw m1, m0
-    psubw m2, m0
-    FILTER_PAIR 6,7, 1,2, 3, 5
-
-    psrad m6, 6
-    psrad m7, 6
-    packssdw m6, m7
-    paddw m0, m6
-    mova [r0], m0
-    add r4, mmsize
-    add r0, mmsize
-    cmp r4, r3
-    jl .row_loop
-    add r1, r3
-    sub r6, r3
-    cmp r0, r2
-    jb .col_loop
-    RET
-%endmacro
-
-INIT_XMM sse2
-PRE_BLUR3_VERT
-INIT_YMM avx2
-PRE_BLUR3_VERT
-
-;------------------------------------------------------------------------------
-; LOAD_MULTIPLIER 1:m_mul1, 2:m_mul2, 3:src, 4:tmp
-; Load blur parameters into xmm/ymm registers
-;------------------------------------------------------------------------------
-
-%macro LOAD_MULTIPLIER 4
-    mov %4, [%3]
-    movd xm%1, %4d
-%if ARCH_X86_64
-    shr %4, 32
-%else
-    mov %4, [%3 + 4]
-%endif
-    movd xm%2, %4d
-%if ARCH_X86_64 == 0
-    punpckldq xm%1, xm%2
-%if mmsize == 32
-    vpbroadcastq m%1, xm%1
-%endif
-%elif mmsize == 32
-    vpbroadcastd m%1, xm%1
-    vpbroadcastd m%2, xm%2
 %else
-    pshufd m%1, m%1, q0000
-    pshufd m%2, m%2, q0000
+    %error "left/right expected"
 %endif
+    psubw m%1, m%3
 %endmacro
 
 ;------------------------------------------------------------------------------
-; BLUR_HORZ 1:pattern
-; void blurNNNN_horz(int16_t *dst, const int16_t *src,
-;                    uintptr_t src_width, uintptr_t src_height,
-;                    const int16_t *param);
+; BLUR_HORZ 1:radius
+; void blurN_horz(int16_t *dst, const int16_t *src,
+;                 uintptr_t src_width, uintptr_t src_height,
+;                 const int16_t *param);
 ;------------------------------------------------------------------------------
 
 %macro BLUR_HORZ 1
-    %assign %%i1 %1 / 1000 % 10
-    %assign %%i2 %1 / 100 % 10
-    %assign %%i3 %1 / 10 % 10
-    %assign %%i4 %1 / 1 % 10
 %if ARCH_X86_64
-cglobal blur%1_horz, 5,8,10
+    %assign %%narg 9 + (%1 + 1) / 2
+cglobal blur%1_horz, 5,8,%%narg
 %else
 cglobal blur%1_horz, 5,7,8
+    SWAP 7, 9
 %endif
-%if ARCH_X86_64
-    LOAD_MULTIPLIER 8,9, r4, r5
-%else
-    LOAD_MULTIPLIER 5,0, r4, r5
-%endif
-    lea r5, [2 * r2 + mmsize + 4 * %%i4 - 1]
+    LOAD_MULTIPLIER %1, 9, r4, r5
+    lea r5, [2 * r2 + mmsize + 4 * %1 - 1]
     lea r2, [2 * r2 + mmsize - 1]
     and r5, ~(mmsize - 1)
     and r2, ~(mmsize - 1)
@@ -1217,106 +763,129 @@ cglobal blur%1_horz, 5,7,8
     add r5, r0
     xor r4, r4
     MUL r3, mmsize
-%if (mmsize != 32) && (%%i4 > 4)
+%if mmsize != 32 && %1 > 4
     sub r4, r3
 %endif
     sub r4, r3
 %if ARCH_X86_64
-    mova m5, [dwords_round]
+    mova m7, [dwords_round]
     lea r7, [words_zero]
     sub r7, r1
 %endif
 
 .main_loop:
 %if ARCH_X86_64
-%if %%i4 > 4
-    LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6
+%if %1 > 4
+    LOAD_LINE 1, r1,r2,r7, r4 + 0 * r3, r6
 %else
-    LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+    LOAD_LINE 1, r1,r2,r7, r4 + 0 * r3, r6, right
 %endif
-    LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
-%if (mmsize != 32) && (%%i4 > 4)
-    LOAD_LINE 2, r1,r2,r7, r4 + 2 * r3, r6
-    SWAP 1, 2
+    LOAD_LINE 2, r1,r2,r7, r4 + 1 * r3, r6
+%if mmsize != 32 && %1 > 4
+    LOAD_LINE 0, r1,r2,r7, r4 + 2 * r3, r6
+    SWAP 0, 2
 %endif
 %else
-%if %%i4 > 4
-    LOAD_LINE_COMPACT 0, r1,r2,r4, r6
+%if %1 > 4
+    LOAD_LINE_COMPACT 1, r1,r2,r4, r6
 %else
-    LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+    LOAD_LINE_COMPACT 1, r1,r2,r4, r6, right
 %endif
-    add r4, r3
-    LOAD_LINE_COMPACT 1, r1,r2,r4, r6
-%if (mmsize != 32) && (%%i4 > 4)
     add r4, r3
     LOAD_LINE_COMPACT 2, r1,r2,r4, r6
-    SWAP 1, 2
+%if mmsize != 32 && %1 > 4
+    add r4, r3
+    LOAD_LINE_COMPACT 0, r1,r2,r4, r6
+    SWAP 0, 2
     sub r4, r3
 %endif
     sub r4, r3
 %endif
 
-%if ARCH_X86_64
-    mova m7, m5
+%if %1 > 4
+%if mmsize == 32
+    vperm2i128 m0, m1, m2, 0x21
+%endif
+%if cpuflag(ssse3)
+    PALIGNR m1,m0,m1, m3, 16 - 2 * %1
 %else
-    mova m7, [dwords_round]
+    PALIGNR m1,m0,m1, m3, 32 - 4 * %1
 %endif
-%if %%i4 > 4
+    PALIGNR m0,m2,m0, m3, 16 - 2 * %1
+%else
 %if mmsize == 32
-    vperm2i128 m2, m0, m1, 0x21
+    vperm2i128 m1, m1, m2, 0x20
 %endif
-    psrldq m0, 32 - 4 * %%i4
-    pslldq m3, m2, 4 * %%i4 - 16
-    por m0, m3
-    psrldq m2, 16 - 2 * %%i4
+%if cpuflag(ssse3)
+    palignr m0, m2, m1, 8
+    pslldq m1, 8
 %else
-%if mmsize == 32
-    vperm2i128 m0, m0, m1, 0x20
+    shufpd m0, m1, m2, 5
 %endif
-    psrldq m2, m0, 16 - 2 * %%i4
 %endif
-    pslldq m3, m1, 2 * %%i4
-    por m2, m3
-
-    psubw m3, m1, m2
-    pslldq m1, 2 * (%%i4 - %%i3)
-    psrldq m4, m2, 2 * %%i3
-    por m4, m1
-    psubw m4, m2
-    FILTER_PAIR 6,7, 3,4, 6, 9,5,q1111
-
-    pslldq m1, 2 * (%%i3 - %%i2)
-    psrldq m3, m2, 2 * %%i2
-    por m3, m1
-    psubw m3, m2
-    pslldq m1, 2 * (%%i2 - %%i1)
-    psrldq m4, m2, 2 * %%i1
-    por m4, m1
-    psubw m4, m2
-    FILTER_PAIR 6,7, 3,4, 1, 8,5,q0000
-
-    psubw m3, m0, m2
-    psrldq m0, 2 * (%%i4 - %%i3)
-    pslldq m4, m2, 2 * %%i3
-    por m4, m0
-    psubw m4, m2
-    FILTER_PAIR 6,7, 3,4, 1, 9,5,q1111
-
-    psrldq m0, 2 * (%%i3 - %%i2)
-    pslldq m3, m2, 2 * %%i2
-    por m3, m0
-    psubw m3, m2
-    psrldq m0, 2 * (%%i2 - %%i1)
-    pslldq m4, m2, 2 * %%i1
-    por m4, m0
-    psubw m4, m2
-    FILTER_PAIR 6,7, 3,4, 1, 8,5,q0000
 
-    psrad m6, 16
-    psrad m7, 16
-    packssdw m6, m7
-    paddw m2, m6
+%if ARCH_X86_64
+    mova m6, m7
+%else
+    mova m6, [dwords_round]
+    mova [r0], m1
+    SWAP 1, 8
+%endif
+
+    %assign %%i %1
+    psubw m3, m2, m0
+%if cpuflag(ssse3) && %1 < 8
+    psrldq m2, 16 - 2 * %1
+%endif
+    NEXT_DIFF 4,2,0, 2 * %%i - 2, right
+    FILTER_PAIR 5,6, 3,4, 5, 9,%%i
+%rep %1 / 2 - 1
+    %assign %%i %%i - 2
+    NEXT_DIFF 3,2,0, 2 * %%i, right
+    NEXT_DIFF 4,2,0, 2 * %%i - 2, right
+    FILTER_PAIR 5,6, 3,4, 8, 9,%%i
+%endrep
+
+%if ARCH_X86_64 == 0
+    SWAP 1, 8
+    mova m1, [r0]
+%if %1 % 2
     mova [r0], m2
+%endif
+    SWAP 2, 8
+%endif
+
+    %assign %%i %1
+%if cpuflag(ssse3) && %1 < 8
+    NEXT_DIFF 3,1,0, 2 * %%i, left
+%else
+    psubw m3, m1, m0
+%endif
+    NEXT_DIFF 4,1,0, 2 * %%i - 2, left
+    FILTER_PAIR 5,6, 3,4, 8, 9,%%i
+%rep %1 / 2 - 1
+    %assign %%i %%i - 2
+    NEXT_DIFF 3,1,0, 2 * %%i, left
+    NEXT_DIFF 4,1,0, 2 * %%i - 2, left
+    FILTER_PAIR 5,6, 3,4, 8, 9,%%i
+%endrep
+
+%if %%i > 2
+    %assign %%i %%i - 2
+%if ARCH_X86_64 == 0
+    SWAP 2, 8
+    mova m2, [r0]
+%endif
+    NEXT_DIFF 3,1,0, 2 * %%i, left
+    NEXT_DIFF 4,2,0, 2 * %%i, right
+    FILTER_PAIR 5,6, 3,4, 1, 9,%%i
+%endif
+
+    psrad m5, 16
+    psrad m6, 16
+    packssdw m5, m6
+    paddw m0, m5
+    mova [r0], m0
     add r0, mmsize
     add r4, mmsize
     cmp r0, r5
@@ -1325,82 +894,80 @@ cglobal blur%1_horz, 5,7,8
 %endmacro
 
 INIT_XMM sse2
-BLUR_HORZ 1234
-BLUR_HORZ 1235
-BLUR_HORZ 1246
+BLUR_HORZ 4
+BLUR_HORZ 5
+BLUR_HORZ 6
+BLUR_HORZ 7
+BLUR_HORZ 8
 INIT_YMM avx2
-BLUR_HORZ 1234
-BLUR_HORZ 1235
-BLUR_HORZ 1246
+BLUR_HORZ 4
+BLUR_HORZ 5
+BLUR_HORZ 6
+BLUR_HORZ 7
+BLUR_HORZ 8
 
 ;------------------------------------------------------------------------------
-; BLUR_VERT 1:pattern
-; void blurNNNN_vert(int16_t *dst, const int16_t *src,
-;                    uintptr_t src_width, uintptr_t src_height,
-;                    const int16_t *param);
+; BLUR_VERT 1:radius
+; void blurN_vert(int16_t *dst, const int16_t *src,
+;                 uintptr_t src_width, uintptr_t src_height,
+;                 const int16_t *param);
 ;------------------------------------------------------------------------------
 
 %macro BLUR_VERT 1
-    %assign %%i1 %1 / 1000 % 10
-    %assign %%i2 %1 / 100 % 10
-    %assign %%i3 %1 / 10 % 10
-    %assign %%i4 %1 / 1 % 10
 %if ARCH_X86_64
-cglobal blur%1_vert, 5,7,9
+    %assign %%narg 7 + (%1 + 1) / 2
+cglobal blur%1_vert, 5,7,%%narg
 %else
 cglobal blur%1_vert, 5,7,8
 %endif
-%if ARCH_X86_64
-    LOAD_MULTIPLIER 4,5, r4, r5
-%else
-    LOAD_MULTIPLIER 5,0, r4, r5
-    SWAP 4, 8
-%endif
+    LOAD_MULTIPLIER %1, 7, r4, r5
     lea r2, [2 * r2 + mmsize - 1]
-    lea r5, [r3 + 2 * %%i4]
+    lea r5, [r3 + 2 * %1]
     and r2, ~(mmsize - 1)
     imul r2, r5
     MUL r3, mmsize
     add r2, r0
-    mova m8, [dwords_round]
+    mova m4, [dwords_round]
     lea r6, [words_zero]
     sub r6, r1
 
 .col_loop:
-    mov r4, -2 * %%i4 * mmsize
+    mov r4, -2 * %1 * mmsize
 .row_loop:
-    mova m6, m8
-    mova m7, m8
-    LOAD_LINE 0, r1,r3,r6, r4 + %%i4 * mmsize, r5
+    mova m5, m4
+    mova m6, m4
+    LOAD_LINE 0, r1,r3,r6, r4 + %1 * mmsize, r5
 
-    LOAD_LINE 1, r1,r3,r6, r4 + (%%i4 - %%i4) * mmsize, r5
-    LOAD_LINE 2, r1,r3,r6, r4 + (%%i4 - %%i3) * mmsize, r5
-    psubw m1, m0
-    psubw m2, m0
-    FILTER_PAIR 6,7, 1,2, 3, 5,5,q1111
+    %assign %%i %1
+%rep %1 / 2
 
-    LOAD_LINE 1, r1,r3,r6, r4 + (%%i4 - %%i2) * mmsize, r5
-    LOAD_LINE 2, r1,r3,r6, r4 + (%%i4 - %%i1) * mmsize, r5
+    LOAD_LINE 1, r1,r3,r6, r4 + (%1 - %%i) * mmsize, r5
+    LOAD_LINE 2, r1,r3,r6, r4 + (%1 - %%i + 1) * mmsize, r5
     psubw m1, m0
     psubw m2, m0
-    FILTER_PAIR 6,7, 1,2, 3, 4,5,q0000
+    FILTER_PAIR 5,6, 1,2, 3, 7,%%i
 
-    LOAD_LINE 1, r1,r3,r6, r4 + (%%i4 + %%i4) * mmsize, r5
-    LOAD_LINE 2, r1,r3,r6, r4 + (%%i4 + %%i3) * mmsize, r5
+    LOAD_LINE 1, r1,r3,r6, r4 + (%1 + %%i) * mmsize, r5
+    LOAD_LINE 2, r1,r3,r6, r4 + (%1 + %%i - 1) * mmsize, r5
     psubw m1, m0
     psubw m2, m0
-    FILTER_PAIR 6,7, 1,2, 3, 5,5,q1111
+    FILTER_PAIR 5,6, 1,2, 3, 7,%%i
+
+    %assign %%i %%i - 2
+%endrep
 
-    LOAD_LINE 1, r1,r3,r6, r4 + (%%i4 + %%i2) * mmsize, r5
-    LOAD_LINE 2, r1,r3,r6, r4 + (%%i4 + %%i1) * mmsize, r5
+%if %%i > 0
+    LOAD_LINE 1, r1,r3,r6, r4 + (%1 - %%i) * mmsize, r5
+    LOAD_LINE 2, r1,r3,r6, r4 + (%1 + %%i) * mmsize, r5
     psubw m1, m0
     psubw m2, m0
-    FILTER_PAIR 6,7, 1,2, 3, 4,5,q0000
+    FILTER_PAIR 5,6, 1,2, 3, 7,%%i
+%endif
 
+    psrad m5, 16
     psrad m6, 16
-    psrad m7, 16
-    packssdw m6, m7
-    paddw m0, m6
+    packssdw m5, m6
+    paddw m0, m5
     mova [r0], m0
     add r4, mmsize
     add r0, mmsize
@@ -1414,10 +981,14 @@ cglobal blur%1_vert, 5,7,8
 %endmacro
 
 INIT_XMM sse2
-BLUR_VERT 1234
-BLUR_VERT 1235
-BLUR_VERT 1246
+BLUR_VERT 4
+BLUR_VERT 5
+BLUR_VERT 6
+BLUR_VERT 7
+BLUR_VERT 8
 INIT_YMM avx2
-BLUR_VERT 1234
-BLUR_VERT 1235
-BLUR_VERT 1246
+BLUR_VERT 4
+BLUR_VERT 5
+BLUR_VERT 6
+BLUR_VERT 7
+BLUR_VERT 8
diff --git a/libass/x86/utils.asm b/libass/x86/utils.asm
index 7da4e4e..9d0ecb9 100644
--- a/libass/x86/utils.asm
+++ b/libass/x86/utils.asm
@@ -83,3 +83,59 @@
     pmaxsw m%1, m%2
 %endif
 %endmacro
+
+;------------------------------------------------------------------------------
+; PALIGNR 1:m_dst, 2:m_src1, 3:m_src2, 4:m_tmp, 5:amount
+;------------------------------------------------------------------------------
+
+%macro PALIGNR 5
+%if (%5) == 0
+%ifnidn %1, %3
+    mova %1, %3
+%endif
+%elif mmsize == 32
+    palignr %1, %2, %3, %5
+%elif cpuflag(ssse3)
+
+%ifnidn %1, %3
+    palignr %1, %2, %3, %5
+%elifidn %2, %4
+    palignr %2, %3, %5
+    mova %1, %2
+%else
+    mova %4, %3
+    palignr %1, %2, %4, %5
+%endif
+
+%elif (%5) == 8
+
+%ifnidn %1, %2
+    shufpd %1, %3, %2, 5
+%elifidn %3, %4
+    shufpd %3, %2, 5
+    mova %1, %3
+%else
+    mova %4, %2
+    shufpd %1, %3, %4, 5
+%endif
+
+%else
+
+    %assign %%flip 0
+%ifidn %1, %3
+    %assign %%flip 1
+%endif
+%ifidn %2, %4
+    %assign %%flip 1
+%endif
+