15 files changed, 513 insertions, 477 deletions
diff --git a/video/out/opengl/angle_common.c b/video/out/opengl/angle_common.c
deleted file mode 100644
index 21cc924714..0000000000
--- a/video/out/opengl/angle_common.c
+++ /dev/null
@@ -1,13 +0,0 @@
-#include "angle_common.h"
-
-// Test if Direct3D11 can be used by us. Basically, this prevents trying to use
-// D3D11 on Win7, and then failing somewhere in the process.
-bool d3d11_check_decoding(ID3D11Device *dev)
-{
-    HRESULT hr;
-    // We assume that NV12 is always supported, if hw decoding is supported at
-    // all.
-    UINT supported = 0;
-    hr = ID3D11Device_CheckFormatSupport(dev, DXGI_FORMAT_NV12, &supported);
-    return !FAILED(hr) && (supported & D3D11_BIND_DECODER);
-}
diff --git a/video/out/opengl/angle_common.h b/video/out/opengl/angle_common.h
deleted file mode 100644
index 14ecd6ab3c..0000000000
--- a/video/out/opengl/angle_common.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef MP_ANGLE_COMMON_H
-#define MP_ANGLE_COMMON_H
-
-#include <initguid.h>
-#include <assert.h>
-#include <windows.h>
-#include <d3d11.h>
-
-#include <stdbool.h>
-
-bool d3d11_check_decoding(ID3D11Device *dev);
-
-#endif
-\ No newline at end of file
diff --git a/video/out/opengl/context_angle.c b/video/out/opengl/context_angle.c
index cc14fc32c6..28515f431f 100644
--- a/video/out/opengl/context_angle.c
+++ b/video/out/opengl/context_angle.c
@@ -38,6 +38,7 @@ struct priv {
     EGLContext egl_context;
     EGLSurface egl_surface;
     bool use_es2;
+    PFNEGLPOSTSUBBUFFERNVPROC eglPostSubBufferNV;
 };
 
 static void angle_uninit(MPGLContext *ctx)
@@ -288,6 +289,11 @@ static int angle_init(struct MPGLContext *ctx, int flags)
     // Configure the underlying Direct3D device
     d3d_init(ctx);
 
+    if (strstr(exts, "EGL_NV_post_sub_buffer")) {
+        p->eglPostSubBufferNV =
+            (PFNEGLPOSTSUBBUFFERNVPROC)eglGetProcAddress("eglPostSubBufferNV");
+    }
+
     mpgl_load_functions(ctx->gl, get_proc_address, NULL, vo->log);
     return 0;
 
@@ -315,7 +321,16 @@ static int angle_reconfig(struct MPGLContext *ctx)
 
 static int angle_control(MPGLContext *ctx, int *events, int request, void *arg)
 {
-    return vo_w32_control(ctx->vo, events, request, arg);
+    struct priv *p = ctx->priv;
+    int r = vo_w32_control(ctx->vo, events, request, arg); // returned as-is
+
+    // Calling eglPostSubBufferNV with a 0-sized region doesn't present a frame
+    // or block, but it does update the swapchain to match the window size.
+    // See: https://groups.google.com/d/msg/angleproject/RvyVkjRCQGU/gfKfT64IAgAJ
+    if ((*events & VO_EVENT_RESIZE) && p->eglPostSubBufferNV)
+        p->eglPostSubBufferNV(p->egl_display, p->egl_surface, 0, 0, 0, 0);
+
+    return r;
 }
 
 static void angle_swap_buffers(MPGLContext *ctx)
diff --git a/video/out/opengl/hwdec_d3d11egl.c b/video/out/opengl/hwdec_d3d11egl.c
index 549d3f5cac..07333c372e 100644
--- a/video/out/opengl/hwdec_d3d11egl.c
+++ b/video/out/opengl/hwdec_d3d11egl.c
@@ -23,7 +23,6 @@
 #include <EGL/egl.h>
 #include <EGL/eglext.h>
 
-#include "angle_common.h"
 #include "angle_dynamic.h"
 
 #include "common/common.h"
@@ -31,6 +30,7 @@
 #include "osdep/windows_utils.h"
 #include "hwdec.h"
 #include "video/hwdec.h"
+#include "video/decode/d3d.h"
 
 #ifndef EGL_D3D_TEXTURE_SUBRESOURCE_ID_ANGLE
 #define EGL_D3D_TEXTURE_SUBRESOURCE_ID_ANGLE 0x3AAB
@@ -195,6 +195,7 @@ static int create(struct gl_hwdec *hw)
         .type = HWDEC_D3D11VA,
         .driver_name = hw->driver->name,
         .ctx = p->d3d11_device,
+        .download_image = d3d11_download_image,
     };
     hwdec_devices_add(hw->devs, &p->hwctx);
 
diff --git a/video/out/opengl/hwdec_d3d11eglrgb.c b/video/out/opengl/hwdec_d3d11eglrgb.c
index 2e61189154..be8057cde3 100644
--- a/video/out/opengl/hwdec_d3d11eglrgb.c
+++ b/video/out/opengl/hwdec_d3d11eglrgb.c
@@ -23,7 +23,6 @@
 #include <EGL/egl.h>
 #include <EGL/eglext.h>
 
-#include "angle_common.h"
 #include "angle_dynamic.h"
 
 #include "common/common.h"
@@ -31,6 +30,7 @@
 #include "osdep/windows_utils.h"
 #include "hwdec.h"
 #include "video/hwdec.h"
+#include "video/decode/d3d.h"
 
 #ifndef EGL_D3D_TEXTURE_SUBRESOURCE_ID_ANGLE
 #define EGL_D3D_TEXTURE_SUBRESOURCE_ID_ANGLE 0x3AAB
@@ -87,6 +87,8 @@ static int create(struct gl_hwdec *hw)
     if (!angle_load())
         return -1;
 
+    d3d_load_dlls();
+
     EGLDisplay egl_display = eglGetCurrentDisplay();
     if (!egl_display)
         return -1;
@@ -104,7 +106,6 @@ static int create(struct gl_hwdec *hw)
 
     p->egl_display = egl_display;
 
-    HANDLE d3d11_dll = GetModuleHandleW(L"d3d11.dll");
     if (!d3d11_dll) {
         if (!hw->probing)
             MP_ERR(hw, "Failed to load D3D11 library\n");
diff --git a/video/out/opengl/hwdec_dxva2egl.c b/video/out/opengl/hwdec_dxva2egl.c
index d67a85bff5..f206b962d1 100644
--- a/video/out/opengl/hwdec_dxva2egl.c
+++ b/video/out/opengl/hwdec_dxva2egl.c
@@ -29,11 +29,11 @@
 #include "osdep/windows_utils.h"
 #include "hwdec.h"
 #include "video/hwdec.h"
+#include "video/decode/d3d.h"
 
 struct priv {
     struct mp_hwdec_ctx hwctx;
 
-    HMODULE             d3d9_dll;
     IDirect3D9Ex       *d3d9ex;
     IDirect3DDevice9Ex *device9ex;
     IDirect3DQuery9    *query9;
@@ -89,9 +89,6 @@ static void destroy(struct gl_hwdec *hw)
 
     if (p->d3d9ex)
         IDirect3D9Ex_Release(p->d3d9ex);
-
-    if (p->d3d9_dll)
-        FreeLibrary(p->d3d9_dll);
 }
 
 static int create(struct gl_hwdec *hw)
@@ -99,6 +96,8 @@ static int create(struct gl_hwdec *hw)
     if (!angle_load())
         return -1;
 
+    d3d_load_dlls();
+
     EGLDisplay egl_display = eglGetCurrentDisplay();
     if (!egl_display)
         return -1;
@@ -118,15 +117,14 @@ static int create(struct gl_hwdec *hw)
 
     p->egl_display = egl_display;
 
-    p->d3d9_dll = LoadLibraryW(L"d3d9.dll");
-    if (!p->d3d9_dll) {
+    if (!d3d9_dll) {
         MP_FATAL(hw, "Failed to load \"d3d9.dll\": %s\n",
                  mp_LastError_to_str());
         goto fail;
     }
 
     HRESULT (WINAPI *Direct3DCreate9Ex)(UINT SDKVersion, IDirect3D9Ex **ppD3D);
-    Direct3DCreate9Ex = (void *)GetProcAddress(p->d3d9_dll, "Direct3DCreate9Ex");
+    Direct3DCreate9Ex = (void *)GetProcAddress(d3d9_dll, "Direct3DCreate9Ex");
     if (!Direct3DCreate9Ex) {
         MP_FATAL(hw, "Direct3D 9Ex not supported\n");
         goto fail;
diff --git a/video/out/opengl/hwdec_vaglx.c b/video/out/opengl/hwdec_vaglx.c
index 2e3017c193..0400604067 100644
--- a/video/out/opengl/hwdec_vaglx.c
+++ b/video/out/opengl/hwdec_vaglx.c
@@ -185,7 +185,7 @@ static int map_frame(struct gl_hwdec *hw, struct mp_image *hw_image,
                           0, 0, hw_image->w, hw_image->h,
                           0, 0, hw_image->w, hw_image->h,
                           NULL, 0,
-                          va_get_colorspace_flag(hw_image->params.colorspace));
+                          va_get_colorspace_flag(hw_image->params.color.space));
     CHECK_VA_STATUS(p, "vaPutSurface()");
     va_unlock(p->ctx);
 
diff --git a/video/out/opengl/osd.c b/video/out/opengl/osd.c
index 7b1ec162fd..5df5bb199a 100644
--- a/video/out/opengl/osd.c
+++ b/video/out/opengl/osd.c
@@ -21,8 +21,6 @@
 
 #include <libavutil/common.h>
 
-#include "video/out/bitmap_packer.h"
-
 #include "formats.h"
 #include "utils.h"
 #include "osd.h"
@@ -53,20 +51,17 @@ struct mpgl_osd_part {
     int change_id;
     GLuint texture;
     int w, h;
-    GLuint buffer;
+    struct gl_pbo_upload pbo;
     int num_subparts;
     int prev_num_subparts;
     struct sub_bitmap *subparts;
     struct vertex *vertices;
-    struct bitmap_packer *packer;
-    void *upload;
 };
 
 struct mpgl_osd {
     struct mp_log *log;
     struct osd_state *osd;
     GL *gl;
-    GLint max_tex_wh;
     bool use_pbo;
     struct mpgl_osd_part *parts[MAX_OSD_PARTS];
     const struct gl_format *fmt_table[SUBBITMAP_COUNT];
@@ -89,21 +84,11 @@ struct mpgl_osd *mpgl_osd_init(GL *gl, struct mp_log *log, struct osd_state *osd
         .scratch = talloc_zero_size(ctx, 1),
     };
 
-    gl->GetIntegerv(GL_MAX_TEXTURE_SIZE, &ctx->max_tex_wh);
-
     ctx->fmt_table[SUBBITMAP_LIBASS] = gl_find_unorm_format(gl, 1, 1);
     ctx->fmt_table[SUBBITMAP_RGBA]   = gl_find_unorm_format(gl, 1, 4);
 
-    for (int n = 0; n < MAX_OSD_PARTS; n++) {
-        struct mpgl_osd_part *p = talloc_ptrtype(ctx, p);
-        *p = (struct mpgl_osd_part) {
-            .packer = talloc_struct(p, struct bitmap_packer, {
-                .w_max = ctx->max_tex_wh,
-                .h_max = ctx->max_tex_wh,
-            }),
-        };
-        ctx->parts[n] = p;
-    }
+    for (int n = 0; n < MAX_OSD_PARTS; n++)
+        ctx->parts[n] = talloc_zero(ctx, struct mpgl_osd_part);
 
     for (int n = 0; n < SUBBITMAP_COUNT; n++)
         ctx->formats[n] = !!ctx->fmt_table[n];
@@ -125,9 +110,7 @@ void mpgl_osd_destroy(struct mpgl_osd *ctx)
     for (int n = 0; n < MAX_OSD_PARTS; n++) {
         struct mpgl_osd_part *p = ctx->parts[n];
         gl->DeleteTextures(1, &p->texture);
-        if (gl->DeleteBuffers)
-            gl->DeleteBuffers(1, &p->buffer);
-        talloc_free(p->upload);
+        gl_pbo_upload_uninit(&p->pbo);
     }
     talloc_free(ctx);
 }
@@ -137,87 +120,6 @@ void mpgl_osd_set_options(struct mpgl_osd *ctx, bool pbo)
     ctx->use_pbo = pbo;
 }
 
-static bool upload(struct mpgl_osd *ctx, struct mpgl_osd_part *osd,
-                   struct sub_bitmaps *imgs, bool pbo)
-{
-    GL *gl = ctx->gl;
-    bool success = true;
-    const struct gl_format *fmt = ctx->fmt_table[imgs->format];
-    size_t pix_stride = gl_bytes_per_pixel(fmt->format, fmt->type);
-    size_t buffer_size = pix_stride * osd->h * osd->w;
-
-    char *data = NULL;
-    void *texdata = NULL;
-
-    if (pbo) {
-        if (!osd->buffer) {
-            gl->GenBuffers(1, &osd->buffer);
-            gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, osd->buffer);
-            gl->BufferData(GL_PIXEL_UNPACK_BUFFER, buffer_size, NULL,
-                           GL_DYNAMIC_COPY);
-        }
-
-        gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, osd->buffer);
-        data = gl->MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, buffer_size,
-                                  GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT);
-        if (!data) {
-            success = false;
-            goto done;
-        }
-    } else {
-        if (!imgs->packed) {
-            if (!osd->upload)
-                osd->upload = talloc_size(NULL, buffer_size);
-            data = osd->upload;
-            texdata = data;
-        }
-    }
-
-    int copy_w = 0;
-    int copy_h = 0;
-    size_t stride = 0;
-    if (imgs->packed) {
-        copy_w = imgs->packed_w;
-        copy_h = imgs->packed_h;
-        stride = imgs->packed->stride[0];
-        texdata = imgs->packed->planes[0];
-        if (pbo) {
-            memcpy_pic(data, texdata, pix_stride * copy_w,  copy_h,
-                       osd->w * pix_stride, stride);
-            stride = osd->w * pix_stride;
-            texdata = NULL;
-        }
-    } else {
-        struct pos bb[2];
-        packer_get_bb(osd->packer, bb);
-        copy_w = bb[1].x;
-        copy_h = bb[1].y;
-        stride = osd->w * pix_stride;
-        packer_copy_subbitmaps(osd->packer, imgs, data, pix_stride, stride);
-    }
-
-    if (pbo) {
-        if (!gl->UnmapBuffer(GL_PIXEL_UNPACK_BUFFER)) {
-            success = false;
-            goto done;
-        }
-    }
-
-    gl_upload_tex(gl, GL_TEXTURE_2D, fmt->format, fmt->type, texdata, stride,
-                  0, 0, copy_w, copy_h);
-
-    if (pbo)
-        gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
-done:
-    if (!success) {
-        MP_FATAL(ctx, "Error: can't upload subtitles! "
-                 "Remove the 'pbo' suboption.\n");
-    }
-
-    return success;
-}
-
 static int next_pow2(int v)
 {
     for (int x = 0; x < 30; x++) {
@@ -231,31 +133,12 @@ static bool upload_osd(struct mpgl_osd *ctx, struct mpgl_osd_part *osd,
                        struct sub_bitmaps *imgs)
 {
     GL *gl = ctx->gl;
+    bool ok = false;
 
-    int req_w = 0;
-    int req_h = 0;
-
-    if (imgs->packed) {
-        req_w = next_pow2(imgs->packed_w);
-        req_h = next_pow2(imgs->packed_h);
-    } else {
-        // assume 2x2 filter on scaling
-        osd->packer->padding = imgs->scaled;
-        int r = packer_pack_from_subbitmaps(osd->packer, imgs);
-        if (r < 0) {
-            MP_ERR(ctx, "OSD bitmaps do not fit on a surface with the maximum "
-                "supported size %dx%d.\n", osd->packer->w_max, osd->packer->h_max);
-            return false;
-        }
-        req_w = osd->packer->w;
-        req_h = osd->packer->h;
-    }
+    assert(imgs->packed);
 
-    if (req_w > ctx->max_tex_wh || req_h > ctx->max_tex_wh) {
-        MP_ERR(ctx, "OSD bitmaps do not fit on a surface with the maximum "
-                "supported size %dx%d.\n", ctx->max_tex_wh, ctx->max_tex_wh);
-        return false;
-    }
+    int req_w = next_pow2(imgs->packed_w);
+    int req_h = next_pow2(imgs->packed_h);
 
     const struct gl_format *fmt = ctx->fmt_table[imgs->format];
     assert(fmt);
@@ -270,6 +153,17 @@ static bool upload_osd(struct mpgl_osd *ctx, struct mpgl_osd_part *osd,
         osd->w = FFMAX(32, req_w);
         osd->h = FFMAX(32, req_h);
 
+        MP_VERBOSE(ctx, "Reallocating OSD texture to %dx%d.\n", osd->w, osd->h);
+
+        GLint max_wh;
+        gl->GetIntegerv(GL_MAX_TEXTURE_SIZE, &max_wh);
+
+        if (osd->w > max_wh || osd->h > max_wh) {
+            MP_ERR(ctx, "OSD bitmaps do not fit on a surface with the maximum "
+                   "supported size %dx%d.\n", max_wh, max_wh);
+            goto done;
+        }
+
         gl->TexImage2D(GL_TEXTURE_2D, 0, fmt->internal_format, osd->w, osd->h,
                        0, fmt->format, fmt->type, NULL);
 
@@ -277,24 +171,17 @@ static bool upload_osd(struct mpgl_osd *ctx, struct mpgl_osd_part *osd,
         gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
         gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
         gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
-
-        if (gl->DeleteBuffers)
-            gl->DeleteBuffers(1, &osd->buffer);
-        osd->buffer = 0;
-
-        talloc_free(osd->upload);
-        osd->upload = NULL;
     }
 
-    bool uploaded = false;
-    if (ctx->use_pbo)
-        uploaded = upload(ctx, osd, imgs, true);
-    if (!uploaded)
-        upload(ctx, osd, imgs, false);
+    gl_pbo_upload_tex(&osd->pbo, gl, ctx->use_pbo, GL_TEXTURE_2D, fmt->format,
+                      fmt->type, osd->w, osd->h, imgs->packed->planes[0],
+                      imgs->packed->stride[0], 0, 0,
+                      imgs->packed_w, imgs->packed_h);
+    ok = true;
 
+done:
     gl->BindTexture(GL_TEXTURE_2D, 0);
-
-    return true;
+    return ok;
 }
 
 static void gen_osd_cb(void *pctx, struct sub_bitmaps *imgs)
@@ -319,13 +206,6 @@ static void gen_osd_cb(void *pctx, struct sub_bitmaps *imgs)
     MP_TARRAY_GROW(osd, osd->subparts, osd->num_subparts);
     memcpy(osd->subparts, imgs->parts,
            osd->num_subparts * sizeof(osd->subparts[0]));
-
-    if (!imgs->packed) {
-        for (int n = 0; n < osd->num_subparts; n++) {
-            osd->subparts[n].src_x = osd->packer->result[n].x;
-            osd->subparts[n].src_y = osd->packer->result[n].y;
-        }
-    }
 }
 
 static void write_quad(struct vertex *va, struct gl_transform t,
diff --git a/video/out/opengl/user_shaders.c b/video/out/opengl/user_shaders.c
index 8f915a56e3..112012f04f 100644
--- a/video/out/opengl/user_shaders.c
+++ b/video/out/opengl/user_shaders.c
@@ -16,6 +16,7 @@
  */
 
 #include <ctype.h>
+#include <assert.h>
 
 #include "user_shaders.h"
 
@@ -69,6 +70,94 @@ static bool parse_rpn_szexpr(struct bstr line, struct szexp out[MAX_SZEXP_SIZE])
     return true;
 }
 
+// Evaluate an RPN szexp. Returns success; 'result' is untouched on failure.
+bool eval_szexpr(struct mp_log *log, void *priv,
+                 bool (*lookup)(void *priv, struct bstr var, float size[2]),
+                 struct szexp expr[MAX_SZEXP_SIZE], float *result)
+{
+    float stack[MAX_SZEXP_SIZE] = {0};
+    int idx = 0; // points to next element to push
+    // Process elements until SZEXP_END, or until MAX_SZEXP_SIZE were read.
+    for (int i = 0; i < MAX_SZEXP_SIZE; i++) {
+        switch (expr[i].tag) {
+        case SZEXP_END:
+            goto done;
+
+        case SZEXP_CONST:
+            // Since our SZEXPs are bound by MAX_SZEXP_SIZE, it should be
+            // impossible to overflow the stack
+            assert(idx < MAX_SZEXP_SIZE);
+            stack[idx++] = expr[i].val.cval;
+            continue;
+
+        case SZEXP_OP1: // unary operator: rewrites the top of the stack
+            if (idx < 1) {
+                mp_warn(log, "Stack underflow in RPN expression!\n");
+                return false;
+            }
+
+            switch (expr[i].val.op) {
+            case SZEXP_OP_NOT: stack[idx-1] = !stack[idx-1]; break;
+            default: abort();
+            }
+            continue;
+
+        case SZEXP_OP2: // binary operator: pops two values, pushes one
+            if (idx < 2) {
+                mp_warn(log, "Stack underflow in RPN expression!\n");
+                return false;
+            }
+
+            // Pop the operands in reverse order
+            float op2 = stack[--idx];
+            float op1 = stack[--idx];
+            float res = 0.0;
+            switch (expr[i].val.op) {
+            case SZEXP_OP_ADD: res = op1 + op2; break;
+            case SZEXP_OP_SUB: res = op1 - op2; break;
+            case SZEXP_OP_MUL: res = op1 * op2; break;
+            case SZEXP_OP_DIV: res = op1 / op2; break;
+            case SZEXP_OP_GT:  res = op1 > op2; break;
+            case SZEXP_OP_LT:  res = op1 < op2; break;
+            default: abort();
+            }
+            // Reject non-finite results (inf/NaN), e.g. from a division by 0
+            if (!isfinite(res)) {
+                mp_warn(log, "Illegal operation in RPN expression!\n");
+                return false;
+            }
+
+            stack[idx++] = res;
+            continue;
+
+        case SZEXP_VAR_W:
+        case SZEXP_VAR_H: {
+            struct bstr name = expr[i].val.varname;
+            float size[2];
+            // If lookup() succeeds, size[0] is used for _W and size[1] for _H
+            if (!lookup(priv, name, size)) {
+                mp_warn(log, "Variable %.*s not found in RPN expression!\n",
+                        BSTR_P(name));
+                return false;
+            }
+
+            stack[idx++] = (expr[i].tag == SZEXP_VAR_W) ? size[0] : size[1];
+            continue;
+            }
+        }
+    }
+
+done:
+    // Anything but exactly one remaining stack element is treated as an error
+    if (idx != 1) {
+        mp_warn(log, "Malformed stack after RPN expression!\n");
+        return false;
+    }
+
+    *result = stack[0];
+    return true;
+}
+
 // Returns false if no more shaders could be parsed
 bool parse_user_shader_pass(struct mp_log *log, struct bstr *body,
                             struct gl_user_shader *out)
diff --git a/video/out/opengl/user_shaders.h b/video/out/opengl/user_shaders.h
index b8c287b6bd..7527eb3ba2 100644
--- a/video/out/opengl/user_shaders.h
+++ b/video/out/opengl/user_shaders.h
@@ -71,4 +71,9 @@ struct gl_user_shader {
 bool parse_user_shader_pass(struct mp_log *log, struct bstr *body,
                             struct gl_user_shader *out);
 
+// Evaluate a szexp, given a lookup function for named textures
+bool eval_szexpr(struct mp_log *log, void *priv,
+                 bool (*lookup)(void *priv, struct bstr var, float size[2]),
+                 struct szexp expr[MAX_SZEXP_SIZE], float *result);
+
 #endif
diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c
index 73b411e66c..72a748a82d 100644
--- a/video/out/opengl/utils.c
+++ b/video/out/opengl/utils.c
@@ -109,8 +109,10 @@ mp_image_t *gl_read_window_contents(GL *gl)
     mp_image_t *image = mp_image_alloc(IMGFMT_RGB24, vp[2], vp[3]);
     if (!image)
         return NULL;
+    gl->BindFramebuffer(GL_FRAMEBUFFER, gl->main_fb);
+    GLenum obj = gl->main_fb ? GL_COLOR_ATTACHMENT0 : GL_FRONT;
     gl->PixelStorei(GL_PACK_ALIGNMENT, 1);
-    gl->ReadBuffer(GL_FRONT);
+    gl->ReadBuffer(obj);
     //flip image while reading (and also avoid stride-related trouble)
     for (int y = 0; y < vp[3]; y++) {
         gl->ReadPixels(vp[0], vp[1] + vp[3] - y - 1, vp[2], 1,
@@ -118,6 +120,7 @@ mp_image_t *gl_read_window_contents(GL *gl)
                        image->planes[0] + y * image->stride[0]);
     }
     gl->PixelStorei(GL_PACK_ALIGNMENT, 4);
+    gl->BindFramebuffer(GL_FRAMEBUFFER, 0);
     return image;
 }
 
@@ -1121,3 +1124,73 @@ void gl_timer_stop(struct gl_timer *timer)
     if (gl->EndQuery)
         gl->EndQuery(GL_TIME_ELAPSED);
 }
+
+// Upload a texture, going through a PBO. PBO supposedly can facilitate
+// asynchronous copy from CPU to GPU, so this is an optimization. Note that
+// changing format/type/tex_w/tex_h or reusing the PBO in the same frame can
+// ruin performance. This call is like gl_upload_tex(), plus PBO management.
+// target, format, type, dataptr, stride, x, y, w, h: texture upload params
+//                                                    (see gl_upload_tex())
+// tex_w, tex_h: maximum size of the used texture (determines the PBO size)
+// use_pbo: for convenience, if false redirects the call to gl_upload_tex
+// If the PBO can't be mapped or unmapped, the data is uploaded without PBO.
+void gl_pbo_upload_tex(struct gl_pbo_upload *pbo, GL *gl, bool use_pbo,
+                       GLenum target, GLenum format, GLenum type,
+                       int tex_w, int tex_h, const void *dataptr, int stride,
+                       int x, int y, int w, int h)
+{
+    assert(x >= 0 && y >= 0 && w >= 0 && h >= 0);
+    assert(x + w <= tex_w && y + h <= tex_h);
+
+    if (!use_pbo || !gl->MapBufferRange)
+        goto no_pbo;
+
+    size_t pix_stride = gl_bytes_per_pixel(format, type);
+    size_t buffer_size = pix_stride * tex_w * tex_h;
+    size_t needed_size = pix_stride * w * h; // tightly packed w*h region
+
+    if (buffer_size != pbo->buffer_size)
+        gl_pbo_upload_uninit(pbo);
+
+    if (!pbo->buffers[0]) {
+        pbo->gl = gl;
+        pbo->buffer_size = buffer_size;
+        gl->GenBuffers(2, &pbo->buffers[0]);
+        for (int n = 0; n < 2; n++) {
+            gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo->buffers[n]);
+            gl->BufferData(GL_PIXEL_UNPACK_BUFFER, buffer_size, NULL,
+                           GL_DYNAMIC_COPY);
+        }
+    }
+
+    pbo->index = (pbo->index + 1) % 2; // alternate between both buffers
+
+    gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo->buffers[pbo->index]);
+    void *data = gl->MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, needed_size,
+                                    GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT);
+    if (!data)
+        goto unbind_pbo;
+
+    memcpy_pic(data, dataptr, pix_stride * w,  h, pix_stride * w, stride);
+
+    if (!gl->UnmapBuffer(GL_PIXEL_UNPACK_BUFFER))
+        goto unbind_pbo;
+    // With the PBO bound, a NULL data pointer means offset 0 into the PBO.
+    gl_upload_tex(gl, target, format, type, NULL, pix_stride * w, x, y, w, h);
+    gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+    return;
+
+unbind_pbo:
+    // Never fall back with the PBO bound: dataptr would be read as an offset.
+    gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+no_pbo:
+    gl_upload_tex(gl, target, format, type, dataptr, stride, x, y, w, h);
+}
+
+void gl_pbo_upload_uninit(struct gl_pbo_upload *pbo)
+{   // Deletes the PBOs (if any were created) and resets *pbo to all-zero.
+    if (pbo->gl) // only set once gl_pbo_upload_tex() has created the buffers
+        pbo->gl->DeleteBuffers(2, &pbo->buffers[0]);
+    *pbo = (struct gl_pbo_upload){0}; // next PBO upload recreates the buffers
+}
diff --git a/video/out/opengl/utils.h b/video/out/opengl/utils.h
index 9b4fd8471d..ec54d19b8a 100644
--- a/video/out/opengl/utils.h
+++ b/video/out/opengl/utils.h
@@ -182,4 +182,17 @@ uint64_t gl_timer_last_us(struct gl_timer *timer);
 uint64_t gl_timer_avg_us(struct gl_timer *timer);
 uint64_t gl_timer_peak_us(struct gl_timer *timer);
 
+struct gl_pbo_upload {
+    GL *gl;             // GL the buffers were created with; NULL if none exist
+    int index;          // which of the two buffers the last PBO upload used
+    GLuint buffers[2];  // PBO names; uploads alternate between them
+    size_t buffer_size; // size in bytes each PBO was allocated with
+};
+
+void gl_pbo_upload_tex(struct gl_pbo_upload *pbo, GL *gl, bool use_pbo,
+                       GLenum target, GLenum format,  GLenum type,
+                       int tex_w, int tex_h, const void *dataptr, int stride,
+                       int x, int y, int w, int h);
+void gl_pbo_upload_uninit(struct gl_pbo_upload *pbo);
+
 #endif
diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c
index f46fdc1c9f..468bee90b5 100644
--- a/video/out/opengl/video.c
+++ b/video/out/opengl/video.c
@@ -41,7 +41,6 @@
 #include "user_shaders.h"
 #include "video/out/filter_kernels.h"
 #include "video/out/aspect.h"
-#include "video/out/bitmap_packer.h"
 #include "video/out/dither.h"
 #include "video/out/vo.h"
 
@@ -97,13 +96,13 @@ struct texplane {
     GLenum gl_format;
     GLenum gl_type;
     GLuint gl_texture;
-    int gl_buffer;
     char swizzle[5];
+    bool flipped;
+    struct gl_pbo_upload pbo;
 };
 
 struct video_image {
     struct texplane planes[4];
-    bool image_flipped;
     struct mp_image *mpi;       // original input image
     bool hwdec_mapped;
 };
@@ -676,7 +675,7 @@ static int pass_bind(struct gl_video *p, struct img_tex tex)
 }
 
 // Rotation by 90° and flipping.
-static void get_plane_source_transform(struct gl_video *p, int w, int h,
+static void get_plane_source_transform(struct gl_video *p, struct texplane *t,
                                        struct gl_transform *out_tr)
 {
     struct gl_transform tr = identity_trans;
@@ -689,11 +688,11 @@ static void get_plane_source_transform(struct gl_video *p, int w, int h,
     // basically, recenter to keep the whole image in view
     float b[2] = {1, 1};
     gl_transform_vec(rot, &b[0], &b[1]);
-    tr.t[0] += b[0] < 0 ? w : 0;
-    tr.t[1] += b[1] < 0 ? h : 0;
+    tr.t[0] += b[0] < 0 ? t->w : 0;
+    tr.t[1] += b[1] < 0 ? t->h : 0;
 
-    if (p->image.image_flipped) {
-        struct gl_transform flip = {{{1, 0}, {0, -1}}, {0, h}};
+    if (t->flipped) {
+        struct gl_transform flip = {{{1, 0}, {0, -1}}, {0, t->h}};
         gl_transform_trans(flip, &tr);
     }
 
@@ -730,7 +729,7 @@ static void pass_get_img_tex(struct gl_video *p, struct video_image *vimg,
 
     // The existing code assumes we just have a single tex multiplier for
     // all of the planes. This may change in the future
-    float tex_mul = 1.0 / mp_get_csp_mul(p->image_params.colorspace,
+    float tex_mul = 1.0 / mp_get_csp_mul(p->image_params.color.space,
                                          p->image_desc.component_bits,
                                          p->image_desc.component_full_bits);
 
@@ -764,7 +763,7 @@ static void pass_get_img_tex(struct gl_video *p, struct video_image *vimg,
             .components = p->image_desc.components[n],
         };
         snprintf(tex[n].swizzle, sizeof(tex[n].swizzle), "%s", t->swizzle);
-        get_plane_source_transform(p, t->w, t->h, &tex[n].transform);
+        get_plane_source_transform(p, t, &tex[n].transform);
         if (p->image_params.rotate % 180 == 90)
             MPSWAP(int, tex[n].w, tex[n].h);
 
@@ -794,7 +793,7 @@ static void init_video(struct gl_video *p)
     mp_image_params_guess_csp(&p->image_params);
 
     int eq_caps = MP_CSP_EQ_CAPS_GAMMA;
-    if (p->image_params.colorspace != MP_CSP_BT_2020_C)
+    if (p->image_params.color.space != MP_CSP_BT_2020_C)
         eq_caps |= MP_CSP_EQ_CAPS_COLORMATRIX;
     if (p->image_desc.flags & MP_IMGFLAG_XYZ)
         eq_caps |= MP_CSP_EQ_CAPS_BRIGHTNESS;
@@ -879,7 +878,7 @@ static void uninit_video(struct gl_video *p)
         struct texplane *plane = &vimg->planes[n];
 
         gl->DeleteTextures(1, &plane->gl_texture);
-        gl->DeleteBuffers(1, &plane->gl_buffer);
+        gl_pbo_upload_uninit(&plane->pbo);
     }
     *vimg = (struct video_image){0};
 
@@ -1239,6 +1238,9 @@ static void load_shader(struct gl_video *p, struct bstr body)
     gl_sc_uniform_f(p->sc, "frame", p->frames_uploaded);
     gl_sc_uniform_vec2(p->sc, "image_size", (GLfloat[]){p->image_params.w,
                                                         p->image_params.h});
+    gl_sc_uniform_vec2(p->sc, "target_size",
+                       (GLfloat[]){p->dst_rect.x1 - p->dst_rect.x0,
+                                   p->dst_rect.y1 - p->dst_rect.y0});
 }
 
 static const char *get_custom_shader_fn(struct gl_video *p, const char *body)
@@ -1542,112 +1544,40 @@ static void user_hook_old(struct gl_video *p, struct img_tex tex,
     GLSLF("color = %s(HOOKED_raw, HOOKED_pos, HOOKED_size);\n", fn_name);
 }
 
-// Returns whether successful. 'result' is left untouched on failure
-static bool eval_szexpr(struct gl_video *p, struct img_tex tex,
-                        struct szexp expr[MAX_SZEXP_SIZE],
-                        float *result)
-{
-    float stack[MAX_SZEXP_SIZE] = {0};
-    int idx = 0; // points to next element to push
-
-    for (int i = 0; i < MAX_SZEXP_SIZE; i++) {
-        switch (expr[i].tag) {
-        case SZEXP_END:
-            goto done;
-
-        case SZEXP_CONST:
-            // Since our SZEXPs are bound by MAX_SZEXP_SIZE, it should be
-            // impossible to overflow the stack
-            assert(idx < MAX_SZEXP_SIZE);
-            stack[idx++] = expr[i].val.cval;
-            continue;
-
-        case SZEXP_OP1:
-            if (idx < 1) {
-                MP_WARN(p, "Stack underflow in RPN expression!\n");
-                return false;
-            }
-
-            switch (expr[i].val.op) {
-            case SZEXP_OP_NOT: stack[idx-1] = !stack[idx-1]; break;
-            default: abort();
-            }
-            continue;
-
-        case SZEXP_OP2:
-            if (idx < 2) {
-                MP_WARN(p, "Stack underflow in RPN expression!\n");
-                return false;
-            }
-
-            // Pop the operands in reverse order
-            float op2 = stack[--idx];
-            float op1 = stack[--idx];
-            float res = 0.0;
-            switch (expr[i].val.op) {
-            case SZEXP_OP_ADD: res = op1 + op2; break;
-            case SZEXP_OP_SUB: res = op1 - op2; break;
-            case SZEXP_OP_MUL: res = op1 * op2; break;
-            case SZEXP_OP_DIV: res = op1 / op2; break;
-            case SZEXP_OP_GT:  res = op1 > op2; break;
-            case SZEXP_OP_LT:  res = op1 < op2; break;
-            default: abort();
-            }
-
-            if (!isfinite(res)) {
-                MP_WARN(p, "Illegal operation in RPN expression!\n");
-                return false;
-            }
-
-            stack[idx++] = res;