/*
 * Copyright (C) 2015 Vabishchevich Nikolay <vabnick@gmail.com>
 *
 * This file is part of libass.
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include "config.h"
#include "ass_compat.h"

#include <math.h>
#include <stdbool.h>

#include "ass_utils.h"
#include "ass_bitmap.h"


/*
 * Cascade Blur Algorithm
 *
 * The main idea is simple: to approximate a gaussian blur with large radius,
 * you can scale down, apply a filter with a relatively small pattern, then scale back up.
 *
 * To achieve the desired precision, scaling should be done with sufficiently smooth kernel.
 * Experiments show that downscaling of factor 2 with kernel [1, 5, 10, 10, 5, 1] and
 * corresponding upscaling are enough for 8-bit precision.
 *
 * Here we use generic filters with 5 different kernel widths (9 to 17-tap).
 * Kernel coefficients of that filter are obtained from the solution of the least-squares problem
 * for the Fourier transform of the resulting kernel.
 */


#define STRIPE_WIDTH  (1 << (C_ALIGN_ORDER - 1))
#define STRIPE_MASK   (STRIPE_WIDTH - 1)
static int16_t zero_line[STRIPE_WIDTH];
static int16_t dither_line[2 * STRIPE_WIDTH] = {
#if STRIPE_WIDTH > 8
     8, 40,  8, 40,  8, 40,  8, 40,  8, 40,  8, 40,  8, 40,  8, 40,
    56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24,
#else
     8, 40,  8, 40,  8, 40,  8, 40,
    56, 24, 56, 24, 56, 24, 56, 24,
#endif
};

inline static const int16_t *get_line(const int16_t *ptr, uintptr_t offs, uintptr_t size)
{
    return offs < size ? ptr + offs : zero_line;
}

inline static void copy_line(int16_t *buf, const int16_t *ptr, uintptr_t offs, uintptr_t size)
{
    memcpy(buf, get_line(ptr, offs, size), STRIPE_WIDTH * sizeof(buf[0]));
}

/*
 * Unpack/Pack Functions
 *
 * Convert between regular 8-bit bitmap and internal format.
 * Internal image is stored as set of vertical stripes of size [STRIPE_WIDTH x height].
 * Each pixel is represented as 16-bit integer in range of [0-0x4000].
 */

void ass_stripe_unpack_c(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
                         uintptr_t width, uintptr_t height)
{
    for (uintptr_t y = 0; y < height; y++) {
        int16_t *ptr = dst;
        for (uintptr_t x = 0; x < width; x += STRIPE_WIDTH) {
            for (int k = 0; k < STRIPE_WIDTH; k++)
                ptr[k] = (uint16_t) (((src[x + k] << 7) | (src[x + k] >> 1)) + 1) >> 1;
                //ptr[k] = (0x4000 * src[x + k] + 127) / 255;
            ptr += STRIPE_WIDTH * height;
        }
        dst += STRIPE_WIDTH;
        src += src_stride;
    }
}

void ass_stripe_pack_c(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src,
                       uintptr_t width, uintptr_t height)
{
    for (uintptr_t x = 0; x < width; x += STRIPE_WIDTH) {
        uint8_t *ptr = dst;
        for (uintptr_t y = 0; y < height; y++) {
            const int16_t *dither = dither_line + (y & 1) * STRIPE_WIDTH;
            for (int k = 0; k < STRIPE_WIDTH; k++)
                ptr[k] = (uint16_t) (src[k] - (src[k] >> 8) + dither[k]) >> 6;
                //ptr[k] = (255 * src[k] + 0x1FFF) / 0x4000;
            ptr += dst_stride;
            src += STRIPE_WIDTH;
        }
        dst += STRIPE_WIDTH;
    }
    uintptr_t left = dst_stride - ((width + STRIPE_MASK) & ~STRIPE_MASK);
    for (uintptr_t y = 0; y < height; y++) {
        for (uintptr_t x = 0; x < left; x++)
            dst[x] = 0;
        dst += dst_stride;
    }
}

/*
 * Contract Filters
 *
 * Contract image by factor 2 with kernel [1, 5, 10, 10, 5, 1].
 */

static inline int16_t shrink_func(int16_t p1p, int16_t p1n,
                                  int16_t z0p, int16_t z0n,
                                  int16_t n1p, int16_t n1n)
{
    /*
    return (1 * p1p + 5 * p1n + 10 * z0p + 10 * z0n + 5 * n1p + 1 * n1n + 16) >> 5;
    */
    int32_t r = (p1p + p1n + n1p + n1n) >> 1;
    r = (r + z0p + z0n) >> 1;
    r = (r + p1n + n1p) >> 1;
    return (r + z0p + z0n + 2) >> 2;
}

void ass_shrink_horz_c(int16_t *dst, const int16_t *src,
                       uintptr_t src_width, uintptr_t src_height)
{
    uintptr_t dst_width = (src_width + 5) >> 1;
    uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
    uintptr_t step = STRIPE_WIDTH * src_height;

    uintptr_t offs = 0;
    int16_t buf[3 * STRIPE_WIDTH];
    int16_t *ptr = buf + STRIPE_WIDTH;
    for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
        for (uintptr_t y = 0; y < src_height; y++) {
            copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
            copy_line(ptr + 0 * STRIPE_WIDTH, src, offs + 0 * step, size);
            copy_line(ptr + 1 * STRIPE_WIDTH, src, offs + 1 * step, size);
            for (int k = 0; k < STRIPE_WIDTH; k++)
                dst[k] = shrink_func(ptr[2 * k - 4], ptr[2 * k - 3],
                                     ptr[2 * k - 2], ptr[2 * k - 1],
                                     ptr[2 * k + 0], ptr[2 * k + 1]);
            dst  += STRIPE_WIDTH;
            offs += STRIPE_WIDTH;
        }
        offs += step;
    }
}

void ass_shrink_vert_c(int16_t *dst, const int16_t *src,
                       uintptr_t src_width, uintptr_t src_height)
{
    uintptr_t dst_height = (src_height + 5) >> 1;
    uintptr_t step = STRIPE_WIDTH * src_height;

    for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
        uintptr_t offs = 0;
        for (uintptr_t y = 0; y < dst_height; y++) {
            const int16_t *p1p = get_line(src, offs - 4 * STRIPE_WIDTH, step);
            const int16_t *p1n = get_line(src, offs - 3 * STRIPE_WIDTH, step);
            const int16_t *z0p = get_line(src, offs - 2 * STRIPE_WIDTH, step);
            const int16_t *z0n = get_line(src, offs - 1 * STRIPE_WIDTH, step);
            const int16_t *n1p = get_line(src, offs - 0 * STRIPE_WIDTH, step);
            const int16_t *n1n = get_line(src, offs + 1 * STRIPE_WIDTH, step);
            for (int k = 0; k < STRIPE_WIDTH; k++)
                dst[k] = shrink_func(p1p[k], p1n[k], z0p[k], z0n[k], n1p[k], n1n[k]);
            dst  += 1 * STRIPE_WIDTH;
            offs += 2 * STRIPE_WIDTH;
        }
        src += step;
    }
}

/*
 * Expand Filters
 *
 * Expand image by factor 2 with kernel [5, 10, 1], [1, 10, 5].
 */

static inline void expand_func(int16_t *rp, int16_t *rn,
                               int16_t p1, int16_t z0, int16_t n1)
{
    /*
    *rp = (5 * p1 + 10 * z0 + 1 * n1 + 8) >> 4;
    *rn = (1 * p1 + 10 * z0 + 5 * n1 + 8) >> 4;
    */
    uint16_t r = (uint16_t) (((uint16_t) (p1 + n1) >> 1) + z0) >> 1;
    *rp = (uint16_t) (((uint16_t) (r + p1) >> 1) + z0 + 1) >> 1;
    *rn = (uint16_t) (((uint16_t) (r + n1) >> 1) + z0 + 1) >> 1;
}

void ass_expand_horz_c(int16_t *dst, const int16_t *src,
                       uintptr_t src_width, uintptr_t src_height)
{
    uintptr_t dst_width = 2 * src_width + 4;
    uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
    uintptr_t step = STRIPE_WIDTH * src_height;

    uintptr_t offs = 0;
    int16_t buf[2 * STRIPE_WIDTH];
    int16_t *ptr = buf + STRIPE_WIDTH;
    for (uintptr_t x = STRIPE_WIDTH; x < dst_width; x += 2 * STRIPE_WIDTH) {
        for (uintptr_t y = 0; y < src_height; y++) {
            copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
            copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
            for (int k = 0; k < STRIPE_WIDTH / 2; k++)
                expand_func(&dst[2 * k], &dst[2 * k + 1],
                            ptr[k - 2], ptr[k - 1], ptr[k]);
            int16_t *next = dst + step - STRIPE_WIDTH;
            for (int k = STRIPE_WIDTH / 2; k < STRIPE_WIDTH; k++)
                expand_func(&next[2 * k], &next[2 * k + 1],
                            ptr[k - 2], ptr[k - 1], ptr[k]);
            dst  += STRIPE_WIDTH;
            offs += STRIPE_WIDTH;
        }
        dst += step;
    }
    if ((dst_width - 1) & STRIPE_WIDTH)
        return;

    for (uintptr_t y = 0; y < src_height; y++) {
        copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
        copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
        for (int k = 0; k < STRIPE_WIDTH / 2; k++)
            expand_func(&dst[2 * k], &dst[2 * k + 1],
                        ptr[k - 2], ptr[k - 1], ptr[k]);
        dst  += STRIPE_WIDTH;
        offs += STRIPE_WIDTH;
    }
}

void ass_expand_vert_c(int16_t *dst, const int16_t *src,
                       uintptr_t src_width, uintptr_t src_height)
{
    uintptr_t dst_height = 2 * src_height + 4;
    uintptr_t step = STRIPE_WIDTH * src_height;

    for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
        uintptr_t offs = 0;
        for (uintptr_t y = 0; y < dst_height; y += 2) {
            const int16_t *p1 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
            const int16_t *z0 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
            const int16_t *n1 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
            for (int k = 0; k < STRIPE_WIDTH; k++)
                expand_func(&dst[k], &dst[k + STRIPE_WIDTH],
                            p1[k], z0[k], n1[k]);
            dst  += 2 * STRIPE_WIDTH;
            offs += 1 * STRIPE_WIDTH;
        }
        src += step;
    }
}

/*
 * Main Parametric Filters
 *
 * Perform 1D convolution with kernel [..., c2, c1, c0, d, c0, c1, c2, ...],
 * cN = param[N], d = 1 - 2 * (c0 + c1 + c2 + ...),
 * number of parameters is part of the function name.
 */

static inline void blur_horz(int16_t *dst, const int16_t *src,
                             uintptr_t src_width, uintptr_t src_height,
                             const int16_t *param, const int n)
{
    uintptr_t dst_width = src_width + 2 * n;
    uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
    uintptr_t step = STRIPE_WIDTH * src_height;

    uintptr_t offs = 0;
    int16_t buf[3 * STRIPE_WIDTH];
    int16_t *ptr = buf + 2 * STRIPE_WIDTH;
    for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
        for (uintptr_t y = 0; y < src_height; y++) {
            for (int i = -((2 * n + STRIPE_WIDTH - 1u) / STRIPE_WIDTH); i <= 0; i++)
                copy_line(ptr + i * STRIPE_WIDTH, src, offs + i * step, size);
            int32_t acc[STRIPE_WIDTH];
            for (int k = 0; k < STRIPE_WIDTH; k++)
                acc[k] = 0x8000;
            for (int i = n; i > 0; i--)
                for (int k = 0; k < STRIPE_WIDTH; k++)
                    acc[k] += (int16_t) (ptr[k - n - i] - ptr[k - n]) * param[i - 1] +
                              (int16_t) (ptr[k - n + i] - ptr[k - n]) * param[i - 1];
            for (int k = 0; k < STRIPE_WIDTH; k++)
                dst[k] = ptr[k - n] + (acc[k] >> 16);

            dst  += STRIPE_WIDTH;
            offs += STRIPE_WIDTH;
        }
    }
}

static inline void blur_vert(int16_t *dst, const int16_t *src,
                             uintptr_t src_width, uintptr_t src_height,
                             const int16_t *param, const int n)
{
    uintptr_t dst_height = src_height + 2 * n;
    uintptr_t step = STRIPE_WIDTH * src_height;

    for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
        uintptr_t offs = 0;
        for (uintptr_t y = 0; y < dst_height; y++) {
            int32_t acc[STRIPE_WIDTH];
            for (int k = 0; k < STRIPE_WIDTH; k++)
                acc[k] = 0x8000;
            const int16_t *center = get_line(src, offs - n * STRIPE_WIDTH, step);
            for (int i = n; i > 0; i--) {
                const int16_t *line1 = get_line(src, offs - (n + i) * STRIPE_WIDTH, step);
                const int16_t *line2 = get_line(src, offs - (n - i) * STRIPE_WIDTH, step);
                for (int k = 0; k < STRIPE_WIDTH; k++)
                    acc[k] += (int16_t) (line1[k] - center[k]) * param[i - 1] +
                              (int16_t) (line2[k] - center[k]) * param[i - 1];
            }
            for (int k = 0; k < STRIPE_WIDTH; k++)
                dst[k] = center[k] + (acc[k] >> 16);

            dst  += STRIPE_WIDTH;
            offs += STRIPE_WIDTH;
        }
        src += step;
    }
}

void ass_blur4_horz_c(int16_t *dst, const int16_t *src,
                      uintptr_t src_width, uintptr_t src_height,
                      const int16_t *param)
{
    blur_horz(dst, src, src_width, src_height, param, 4);
}

void ass_blur4_vert_c(int16_t *dst, const int16_t *src,
                      uintptr_t src_width, uintptr_t src_height,
                      const int16_t *param)
{
    blur_vert(dst, src, src_width, src_height, param, 4);
}

void ass_blur5_horz_c(int16_t *dst, const int16_t *src,
                      uintptr_t src_width, uintptr_t src_height,
                      const int16_t *param)
{
    blur_horz(dst, src, src_width, src_height, param, 5);
}

void ass_blur5_vert_c(int16_t *dst, const int16_t *src,
                      uintptr_t src_width, uintptr_t src_height,
                      const int16_t *param)
{
    blur_vert(dst, src, src_width, src_height, param, 5);
}

void ass_blur6_horz_c(int16_t *dst, const int16_t *src,
                      uintptr_t src_width, uintptr_t src_height,
                      const int16_t *param)
{
    blur_horz(dst, src, src_width, src_height, param, 6);
}

void ass_blur6_vert_c(int16_t *dst, const int16_t *src,
                      uintptr_t src_width, uintptr_t src_height,
                      const int16_t *param)
{
    blur_vert(dst, src, src_width, src_height, param, 6);
}

void ass_blur7_horz_c(int16_t *dst, const int16_t *src,
                      uintptr_t src_width, uintptr_t src_height,
                      const int16_t *param)
{
    blur_horz(dst, src, src_width, src_height, param, 7);
}

void ass_blur7_vert_c(int16_t *dst, const int16_t *src,
                      uintptr_t src_width, uintptr_t src_height,
                      const int16_t *param)
{
    blur_vert(dst, src, src_width, src_height, param, 7);
}

void ass_blur8_horz_c(int16_t *dst, const int16_t *src,
                      uintptr_t src_width, uintptr_t src_height,
                      const int16_t *param)
{
    blur_horz(dst, src, src_width, src_height, param, 8);
}

void ass_blur8_vert_c(int16_t *dst, const int16_t *src,
                      uintptr_t src_width, uintptr_t src_height,
                      const int16_t *param)
{
    blur_vert(dst, src, src_width, src_height, param, 8);
}


static void calc_gauss(double *res, int n, double r2)
{
    double alpha = 0.5 / r2;
    double mul = exp(-alpha), mul2 = mul * mul;
    double cur = sqrt(alpha / M_PI);

    res[0] = cur;
    cur *= mul;
    res[1] = cur;
    for (int i = 2; i < n; i++) {
        mul *= mul2;
        cur *= mul;
        res[i] = cur;
    }
}

static void coeff_filter(double *coeff, int n, const double kernel[4])
{
    double prev1 = coeff[1], prev2 = coeff[2], prev3 = coeff[3];
    for (int i = 0; i < n; i++) {
        double res = coeff[i + 0]  * kernel[0] +
            (prev1 + coeff[i + 1]) * kernel[1] +
            (prev2 + coeff[i + 2]) * kernel[2] +
            (prev3 + coeff[i + 3]) * kernel[3];
        prev3 = prev2;
        prev2 = prev1;
        prev1 = coeff[i];
        coeff[i] = res;
    }
}

static void calc_matrix(double mat[][8], const double *mat_freq, int n)
{
    for (int i = 0; i < n; i++) {
        mat[i][i] = mat_freq[2 * i + 2] + 3 * mat_freq[0] - 4 * mat_freq[i + 1];
        for (int j = i + 1; j < n; j++)
            mat[i][j] = mat[j][i] = mat_freq[i + j + 2] + mat_freq[j - i] +
                2 * (mat_freq[0] - mat_freq[i + 1] - mat_freq[j + 1]);
    }

    // invert transpose
    for (int k = 0; k < n; k++) {
        double z = 1 / mat[k][k];
        mat[k][k] = 1;
        for (int i = 0; i < n; i++) {
            if (i == k)
                continue;

            double mul = mat[i][k] * z;
            mat[i][k] = 0;
            for (int j = 0; j < n; j++)
                mat[i][j] -= mat[k][j] * mul;
        }
        for (int j = 0; j < n; j++)
            mat[k][j] *= z;
    }
}

/**
 * \brief Solve least squares problem for kernel of the main filter
 * \param mu out: output coefficients
 * \param n in: filter kernel radius
 * \param r2 in: desired standard deviation squared
 * \param mul in: scale multiplier
 */
static void calc_coeff(double mu[], int n, double r2, double mul)
{
    assert(n > 0 && n <= 8);

    const double w = 12096;
    double kernel[] = {
        ((( + 3280 / w) * mul + 1092 / w) * mul + 2520 / w) * mul + 5204 / w,
        ((( - 2460 / w) * mul -  273 / w) * mul -  210 / w) * mul + 2943 / w,
        ((( +  984 / w) * mul -  546 / w) * mul -  924 / w) * mul +  486 / w,
        ((( -  164 / w) * mul +  273 / w) * mul -  126 / w) * mul +   17 / w,
    };

    double mat_freq[17] = { kernel[0], kernel[1], kernel[2], kernel[3] };
    coeff_filter(mat_freq, 7, kernel);

    double vec_freq[12];
    calc_gauss(vec_freq, n + 4, r2 * mul);
    coeff_filter(vec_freq, n + 1, kernel);

    double mat[8][8];
    calc_matrix(mat, mat_freq, n);

    double vec[8];
    for (int i = 0; i < n; i++)
        vec[i] = mat_freq[0] - mat_freq[i + 1] - vec_freq[0] + vec_freq[i + 1];

    for (int i = 0; i < n; i++) {
        double res = 0;
        for (int j = 0; j < n; j++)
            res += mat[i][j] * vec[j];
        mu[i] = FFMAX(0, res);
    }
}

typedef struct {
    int level, radius;
    int16_t coeff[8];
} BlurMethod;

static void find_best_method(BlurMethod *blur, double r2)
{
    double mu[8];
    if (r2 < 0.5) {
        blur->level = 0;
        blur->radius = 4;
        mu[1] = 0.085 * r2 * r2 * r2;
        mu[0] = 0.5 * r2 - 4 * mu[1];
        mu[2] = mu[3] = 0;
    } else {
        double frac = frexp(sqrt(0.11569 * r2 + 0.20591047), &blur->level);
        double mul = pow(0.25, blur->level);
        blur->radius = 8 - (int) ((10.1525 + 0.8335 * mul) * (1 - frac));
        blur->radius = FFMAX(blur->radius, 4);
        calc_coeff(mu, blur->radius, r2, mul);
    }
    for (int i = 0; i < blur->radius; i++)
        blur->coeff[i] = (int) (0x10000 * mu[i] + 0.5);
}

/**
 * \brief Perform approximate gaussian blur
 * \param r2 in: desired standard deviation squared
 */
bool ass_gaussian_blur(const BitmapEngine *engine, Bitmap *bm, double r2)
{
    BlurMethod blur;
    find_best_method(&blur, r2);

    uint32_t w = bm->w, h = bm->h;
    int offset = ((2 * blur.radius + 9) << blur.level) - 5;
    uint32_t end_w = ((w + offset) & ~((1 << blur.level) - 1)) - 4;
    uint32_t end_h = ((h + offset) & ~((1 << blur.level) - 1)) - 4;

    const int stripe_width = 1 << (engine->align_order - 1);
    uint64_t size = (((uint64_t) end_w + stripe_width - 1) & ~(stripe_width - 1)) * end_h;
    if (size > INT_MAX / 4)
        return false;

    int16_t *tmp = ass_aligned_alloc(2 * stripe_width, 4 * size, false);
    if (!tmp)
        return false;

    engine->stripe_unpack(tmp, bm->buffer, bm->stride, w, h);
    int16_t *buf[2] = {tmp, tmp + size};
    int index = 0;

    for (int i = 0; i < blur.level; i++) {
        engine->shrink_vert(buf[index ^ 1], buf[index], w, h);
        h = (h + 5) >> 1;
        index ^= 1;
    }
    for (int i = 0; i < blur.level; i++) {
        engine->shrink_horz(buf[index ^ 1], buf[index], w, h);
        w = (w + 5) >> 1;
        index ^= 1;
    }
    assert(blur.radius >= 4 && blur.radius <= 8);
    engine->blur_horz[blur.radius - 4](buf[index ^ 1], buf[index], w, h, blur.coeff);
    w += 2 * blur.radius;
    index ^= 1;
    engine->blur_vert[blur.radius - 4](buf[index ^ 1], buf[index], w, h, blur.coeff);
    h += 2 * blur.radius;
    index ^= 1;
    for (int i = 0; i < blur.level; i++) {
        engine->expand_horz(buf[index ^ 1], buf[index], w, h);
        w = 2 * w + 4;
        index ^= 1;
    }
    for (int i = 0; i < blur.level; i++) {
        engine->expand_vert(buf[index ^ 1], buf[index], w, h);
        h = 2 * h + 4;
        index ^= 1;
    }
    assert(w == end_w && h == end_h);

    if (!realloc_bitmap(engine, bm, w, h)) {
        ass_aligned_free(tmp);
        return false;
    }
    offset = ((blur.radius + 4) << blur.level) - 4;
    bm->left -= offset;
    bm->top  -= offset;

    engine->stripe_pack(bm->buffer, bm->stride, buf[index], w, h);
    ass_aligned_free(tmp);
    return true;
}