-rw-r--r--  DOCS/man/options.rst       |   3
-rw-r--r--  video/out/vulkan/common.h  |  19
-rw-r--r--  video/out/vulkan/context.c |  11
-rw-r--r--  video/out/vulkan/malloc.c  |  12
-rw-r--r--  video/out/vulkan/ra_vk.c   | 107
-rw-r--r--  video/out/vulkan/utils.c   | 268
-rw-r--r--  video/out/vulkan/utils.h   |  27
7 files changed, 283 insertions(+), 164 deletions(-)
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index a92a8df962..f50f67a203 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -4295,7 +4295,8 @@ The following video options are currently all specific to ``--vo=gpu`` and
Controls the number of VkQueues used for rendering (limited by how many
your device supports). In theory, using more queues could enable some
parallelism between frames (when using a ``--swapchain-depth`` higher than
- 1). (Default: 1)
+ 1), but it can also slow things down on hardware where there's no true
+ parallelism between queues. (Default: 1)
``--d3d11-warp=<yes|no|auto>``
Use WARP (Windows Advanced Rasterization Platform) with the D3D11 GPU
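Returning to the ``--vulkan-queue-count`` documentation above: a quick way to experiment with the setting (illustrative invocation; ``video.mkv`` is a placeholder, and as the new text notes, more queues may help or hurt depending on the hardware)::

    mpv --vo=gpu --gpu-api=vulkan --vulkan-queue-count=2 video.mkv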
diff --git a/video/out/vulkan/common.h b/video/out/vulkan/common.h
index de49c6f1af..b849b6dc0b 100644
--- a/video/out/vulkan/common.h
+++ b/video/out/vulkan/common.h
@@ -48,10 +48,23 @@ struct mpvk_ctx {
VkSurfaceKHR surf;
VkSurfaceFormatKHR surf_format; // picked at surface initialization time
- struct vk_malloc *alloc; // memory allocator for this device
- struct vk_cmdpool *pool; // primary command pool for this device
- struct vk_cmd *last_cmd; // most recently submitted (pending) command
+ struct vk_malloc *alloc; // memory allocator for this device
struct spirv_compiler *spirv; // GLSL -> SPIR-V compiler
+ struct vk_cmdpool **pools; // command pools (one per queue family)
+ int num_pools;
+ struct vk_cmd *last_cmd; // most recently submitted command
+
+ // Queued/pending commands. These are shared for the entire mpvk_ctx to
+ // ensure submission and callbacks are FIFO
+ struct vk_cmd **cmds_queued; // recorded but not yet submitted
+ struct vk_cmd **cmds_pending; // submitted but not completed
+ int num_cmds_queued;
+ int num_cmds_pending;
+
+ // Pointers into *pools
+ struct vk_cmdpool *pool_graphics; // required
+ struct vk_cmdpool *pool_compute; // optional
+ struct vk_cmdpool *pool_transfer; // optional
// Common pool of signals, to avoid having to re-create these objects often
struct vk_signal **signals;
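The new layout above moves command-buffer bookkeeping out of the per-family pools: a pool only keeps buffers available for re-recording, while recorded (cmds_queued) and submitted (cmds_pending) commands live on the shared mpvk_ctx, so submission order and completion callbacks stay FIFO across all pools. A minimal standalone sketch of that invariant, with plain integers standing in for mpv's vk_cmd objects (all names here are illustrative, not mpv API):

    #include <stdio.h>
    #include <string.h>

    #define MAX 8

    // Illustrative stand-ins for vk->cmds_queued / vk->cmds_pending: commands
    // move from queued to pending in strict submission order, so completion
    // callbacks also fire FIFO, regardless of which pool a command came from.
    static int queued[MAX], num_queued;
    static int pending[MAX], num_pending;

    static void queue_cmd(int id) { queued[num_queued++] = id; }

    static void flush_cmds(void) // analogous to mpvk_flush_commands
    {
        for (int i = 0; i < num_queued; i++)
            pending[num_pending++] = queued[i];
        num_queued = 0;
    }

    static void poll_cmds(void) // analogous to mpvk_poll_commands
    {
        while (num_pending > 0) {
            printf("completed cmd %d\n", pending[0]); // callbacks fire here
            memmove(pending, pending + 1, --num_pending * sizeof(int));
        }
    }

    int main(void)
    {
        queue_cmd(1); queue_cmd(2); flush_cmds();
        queue_cmd(3); flush_cmds();
        poll_cmds(); // prints 1, 2, 3 in submission order
        return 0;
    }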
diff --git a/video/out/vulkan/context.c b/video/out/vulkan/context.c
index ddc80be042..4f96440652 100644
--- a/video/out/vulkan/context.c
+++ b/video/out/vulkan/context.c
@@ -245,7 +245,8 @@ void ra_vk_ctx_uninit(struct ra_ctx *ctx)
struct priv *p = ctx->swapchain->priv;
struct mpvk_ctx *vk = p->vk;
- mpvk_dev_wait_cmds(vk, UINT64_MAX);
+ mpvk_flush_commands(vk);
+ mpvk_poll_commands(vk, UINT64_MAX);
for (int i = 0; i < p->num_images; i++)
ra_tex_free(ctx->ra, &p->images[i]);
@@ -355,7 +356,7 @@ bool ra_vk_ctx_resize(struct ra_swapchain *sw, int w, int h)
// more than one swapchain already active, so we need to flush any pending
// asynchronous swapchain release operations that may be ongoing.
while (p->old_swapchain)
- mpvk_dev_wait_cmds(vk, 100000); // 100μs
+ mpvk_poll_commands(vk, 100000); // 100μs
VkSwapchainCreateInfoKHR sinfo = p->protoInfo;
sinfo.imageExtent = (VkExtent2D){ w, h };
@@ -501,14 +502,14 @@ static bool submit_frame(struct ra_swapchain *sw, const struct vo_frame *frame)
vk_cmd_callback(cmd, (vk_cb) present_cb, p, NULL);
vk_cmd_queue(vk, cmd);
- if (!vk_flush_commands(vk))
+ if (!mpvk_flush_commands(vk))
goto error;
// Older nvidia drivers can spontaneously combust when submitting to the
// same queue as we're rendering from, in a multi-queue scenario. Safest
// option is to flush the commands first and then submit to the next queue.
// We can drop this hack in the future, I suppose.
- struct vk_cmdpool *pool = vk->pool;
+ struct vk_cmdpool *pool = vk->pool_graphics;
VkQueue queue = pool->queues[pool->idx_queues];
VkPresentInfoKHR pinfo = {
@@ -533,7 +534,7 @@ static void swap_buffers(struct ra_swapchain *sw)
struct priv *p = sw->priv;
while (p->frames_in_flight >= sw->ctx->opts.swapchain_depth)
- mpvk_dev_wait_cmds(p->vk, 100000); // 100μs
+ mpvk_poll_commands(p->vk, 100000); // 100μs
}
static const struct ra_swapchain_fns vulkan_swapchain = {
diff --git a/video/out/vulkan/malloc.c b/video/out/vulkan/malloc.c
index f6cb1143bb..a9aced33d8 100644
--- a/video/out/vulkan/malloc.c
+++ b/video/out/vulkan/malloc.c
@@ -133,11 +133,23 @@ static struct vk_slab *slab_alloc(struct mpvk_ctx *vk, struct vk_heap *heap,
uint32_t typeBits = heap->typeBits ? heap->typeBits : UINT32_MAX;
if (heap->usage) {
+ // FIXME: Since we can't keep track of queue family ownership properly,
+ // and we don't know in advance what types of queue families this buffer
+ // will belong to, we're forced to share all of our buffers between all
+ // command pools.
+ uint32_t qfs[3] = {0};
+ for (int i = 0; i < vk->num_pools; i++)
+ qfs[i] = vk->pools[i]->qf;
+
VkBufferCreateInfo binfo = {
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.size = slab->size,
.usage = heap->usage,
- .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+ .sharingMode = vk->num_pools > 1 ? VK_SHARING_MODE_CONCURRENT
+ : VK_SHARING_MODE_EXCLUSIVE,
+ .queueFamilyIndexCount = vk->num_pools,
+ .pQueueFamilyIndices = qfs,
};
VK(vkCreateBuffer(vk->dev, &binfo, MPVK_ALLOCATOR, &slab->buffer));
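The FIXME above is the crux of the sharing change: buffer slabs are created with CONCURRENT sharing whenever more than one command pool exists, because queue family ownership transfers aren't tracked. A standalone illustration of just the create-info logic (hypothetical family indices; this fills the struct without touching a real device):

    #include <stdio.h>
    #include <vulkan/vulkan.h>

    int main(void)
    {
        // Hypothetical queue family indices, mirroring vk->pools[i]->qf
        uint32_t qfs[3] = {0, 1, 2};
        uint32_t num_pools = 3;

        VkBufferCreateInfo binfo = {
            .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
            .size  = 1 << 20,
            .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
            // More than one pool forces CONCURRENT sharing, since we can't
            // know in advance which queue families will touch the buffer
            .sharingMode = num_pools > 1 ? VK_SHARING_MODE_CONCURRENT
                                         : VK_SHARING_MODE_EXCLUSIVE,
            .queueFamilyIndexCount = num_pools,
            .pQueueFamilyIndices = qfs,
        };

        printf("sharingMode = %s\n",
               binfo.sharingMode == VK_SHARING_MODE_CONCURRENT
                   ? "CONCURRENT" : "EXCLUSIVE");
        return 0;
    }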
diff --git a/video/out/vulkan/ra_vk.c b/video/out/vulkan/ra_vk.c
index 42e6c2f64c..905fc89596 100644
--- a/video/out/vulkan/ra_vk.c
+++ b/video/out/vulkan/ra_vk.c
@@ -6,6 +6,12 @@
static struct ra_fns ra_fns_vk;
+enum queue_type {
+ GRAPHICS,
+ COMPUTE,
+ TRANSFER,
+};
+
// For ra.priv
struct ra_vk {
struct mpvk_ctx *vk;
@@ -22,18 +28,6 @@ struct mpvk_ctx *ra_vk_get(struct ra *ra)
return p->vk;
}
-// Returns a command buffer, or NULL on error
-static struct vk_cmd *vk_require_cmd(struct ra *ra)
-{
- struct ra_vk *p = ra->priv;
- struct mpvk_ctx *vk = ra_vk_get(ra);
-
- if (!p->cmd)
- p->cmd = vk_cmd_begin(vk, vk->pool);
-
- return p->cmd;
-}
-
static void vk_submit(struct ra *ra)
{
struct ra_vk *p = ra->priv;
@@ -45,22 +39,46 @@ static void vk_submit(struct ra *ra)
}
}
-// The callback's *priv will always be set to `ra`
-static void vk_callback(struct ra *ra, vk_cb callback, void *arg)
+// Returns a command buffer, or NULL on error
+static struct vk_cmd *vk_require_cmd(struct ra *ra, enum queue_type type)
{
struct ra_vk *p = ra->priv;
struct mpvk_ctx *vk = ra_vk_get(ra);
- if (p->cmd) {
- vk_cmd_callback(p->cmd, callback, ra, arg);
- } else {
- vk_dev_callback(vk, callback, ra, arg);
+ struct vk_cmdpool *pool;
+ switch (type) {
+ case GRAPHICS: pool = vk->pool_graphics; break;
+ case COMPUTE: pool = vk->pool_compute; break;
+
+ // GRAPHICS and COMPUTE also imply TRANSFER capability (vulkan spec)
+ case TRANSFER:
+ pool = vk->pool_transfer;
+ if (!pool)
+ pool = vk->pool_compute;
+ if (!pool)
+ pool = vk->pool_graphics;
+ break;
+ default: abort();
}
+
+ assert(pool);
+ if (p->cmd && p->cmd->pool == pool)
+ return p->cmd;
+
+ vk_submit(ra);
+ p->cmd = vk_cmd_begin(vk, pool);
+ return p->cmd;
}
#define MAKE_LAZY_DESTRUCTOR(fun, argtype) \
static void fun##_lazy(struct ra *ra, argtype *arg) { \
- vk_callback(ra, (vk_cb) fun, arg); \
+ struct ra_vk *p = ra->priv; \
+ struct mpvk_ctx *vk = ra_vk_get(ra); \
+ if (p->cmd) { \
+ vk_cmd_callback(p->cmd, (vk_cb) fun, ra, arg); \
+ } else { \
+ vk_dev_callback(vk, (vk_cb) fun, ra, arg); \
+ } \
}
static void vk_destroy_ra(struct ra *ra)
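One behavioral subtlety in the new vk_require_cmd: requesting TRANSFER falls back through COMPUTE to GRAPHICS, which is safe because the Vulkan spec guarantees GRAPHICS and COMPUTE queues implicitly support transfer operations. A minimal sketch of just that selection chain, with stub structs standing in for mpv's vk_cmdpool (illustrative names only):

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    enum queue_type { GRAPHICS, COMPUTE, TRANSFER };

    struct pool { const char *name; }; // stub for struct vk_cmdpool

    static struct pool *pick_pool(enum queue_type type, struct pool *gfx,
                                  struct pool *comp, struct pool *tf)
    {
        struct pool *pool = NULL;
        switch (type) {
        case GRAPHICS: pool = gfx; break;
        case COMPUTE:  pool = comp; break;
        case TRANSFER: // GRAPHICS/COMPUTE imply TRANSFER capability
            pool = tf ? tf : comp ? comp : gfx;
            break;
        }
        assert(pool); // GRAPHICS always exists; COMPUTE only used if present
        return pool;
    }

    int main(void)
    {
        struct pool gfx = {"graphics"}, comp = {"compute"};
        // No dedicated transfer family: TRANSFER work lands on compute
        printf("%s\n", pick_pool(TRANSFER, &gfx, &comp, NULL)->name);
        return 0;
    }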
@@ -69,8 +87,8 @@ static void vk_destroy_ra(struct ra *ra)
struct mpvk_ctx *vk = ra_vk_get(ra);
vk_submit(ra);
- vk_flush_commands(vk);
- mpvk_dev_wait_cmds(vk, UINT64_MAX);
+ mpvk_flush_commands(vk);
+ mpvk_poll_commands(vk, UINT64_MAX);
ra_tex_free(ra, &p->clear_tex);
talloc_free(ra);
@@ -190,7 +208,7 @@ struct ra *ra_create_vk(struct mpvk_ctx *vk, struct mp_log *log)
ra->max_shmem = vk->limits.maxComputeSharedMemorySize;
ra->max_pushc_size = vk->limits.maxPushConstantsSize;
- if (vk->pool->props.queueFlags & VK_QUEUE_COMPUTE_BIT)
+ if (vk->pool_compute)
ra->caps |= RA_CAP_COMPUTE;
if (!vk_setup_formats(ra))
@@ -280,6 +298,7 @@ static VkResult vk_create_render_pass(VkDevice dev, const struct ra_format *fmt,
// For ra_tex.priv
struct ra_tex_vk {
bool external_img;
+ enum queue_type upload_queue;
VkImageType type;
VkImage img;
struct vk_memslice mem;
@@ -480,6 +499,7 @@ static struct ra_tex *vk_tex_create(struct ra *ra,
tex->params.initial_data = NULL;
struct ra_tex_vk *tex_vk = tex->priv = talloc_zero(tex, struct ra_tex_vk);
+ tex_vk->upload_queue = GRAPHICS;
const struct vk_format *fmt = params->format->priv;
switch (params->dimensions) {
@@ -501,6 +521,10 @@ static struct ra_tex *vk_tex_create(struct ra *ra,
if (params->host_mutable || params->blit_dst || params->initial_data)
usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
+ // Always use the transfer pool if available, for efficiency
+ if (params->host_mutable && vk->pool_transfer)
+ tex_vk->upload_queue = TRANSFER;
+
// Double-check image usage support and fail immediately if invalid
VkImageFormatProperties iprop;
VkResult res = vkGetPhysicalDeviceImageFormatProperties(vk->physd,
@@ -528,6 +552,14 @@ static struct ra_tex *vk_tex_create(struct ra *ra,
return NULL;
}
+ // FIXME: Since we can't keep track of queue family ownership properly,
+ // and we don't know in advance what types of queue families this image
+ // will belong to, we're forced to share all of our images between all
+ // command pools.
+ uint32_t qfs[3] = {0};
+ for (int i = 0; i < vk->num_pools; i++)
+ qfs[i] = vk->pools[i]->qf;
+
VkImageCreateInfo iinfo = {
.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
.imageType = tex_vk->type,
@@ -539,9 +571,10 @@ static struct ra_tex *vk_tex_create(struct ra *ra,
.tiling = VK_IMAGE_TILING_OPTIMAL,
.usage = usage,
.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
- .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
- .queueFamilyIndexCount = 1,
- .pQueueFamilyIndices = &vk->pool->qf,
+ .sharingMode = vk->num_pools > 1 ? VK_SHARING_MODE_CONCURRENT
+ : VK_SHARING_MODE_EXCLUSIVE,
+ .queueFamilyIndexCount = vk->num_pools,
+ .pQueueFamilyIndices = qfs,
};
VK(vkCreateImage(vk->dev, &iinfo, MPVK_ALLOCATOR, &tex_vk->img));
@@ -632,6 +665,7 @@ struct ra_buf_vk {
struct vk_bufslice slice;
int refcount; // 1 = object allocated but not in use, > 1 = in use
bool needsflush;
+ enum queue_type update_queue;
// "current" metadata, can change during course of execution
VkPipelineStageFlags current_stage;
VkAccessFlags current_access;
@@ -700,7 +734,7 @@ static void vk_buf_update(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset,
memcpy((void *)addr, data, size);
buf_vk->needsflush = true;
} else {
- struct vk_cmd *cmd = vk_require_cmd(ra);
+ struct vk_cmd *cmd = vk_require_cmd(ra, buf_vk->update_queue);
if (!cmd) {
MP_ERR(ra, "Failed updating buffer!\n");
return;
@@ -736,6 +770,9 @@ static struct ra_buf *vk_buf_create(struct ra *ra,
case RA_BUF_TYPE_TEX_UPLOAD:
bufFlags |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+ // Use TRANSFER-style updates for large enough buffers for efficiency
+ if (params->size > 1024*1024) // 1 MB
+ buf_vk->update_queue = TRANSFER;
break;
case RA_BUF_TYPE_UNIFORM:
bufFlags |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
@@ -746,6 +783,7 @@ static struct ra_buf *vk_buf_create(struct ra *ra,
bufFlags |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
align = MP_ALIGN_UP(align, vk->limits.minStorageBufferOffsetAlignment);
+ buf_vk->update_queue = COMPUTE;
break;
case RA_BUF_TYPE_VERTEX:
bufFlags |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
@@ -832,7 +870,7 @@ static bool vk_tex_upload(struct ra *ra,
uint64_t size = region.bufferRowLength * region.bufferImageHeight *
region.imageExtent.depth;
- struct vk_cmd *cmd = vk_require_cmd(ra);
+ struct vk_cmd *cmd = vk_require_cmd(ra, tex_vk->upload_queue);
if (!cmd)
goto error;
@@ -1478,7 +1516,12 @@ static void vk_renderpass_run(struct ra *ra,
struct ra_renderpass *pass = params->pass;
struct ra_renderpass_vk *pass_vk = pass->priv;
- struct vk_cmd *cmd = vk_require_cmd(ra);
+ static const enum queue_type types[] = {
+ [RA_RENDERPASS_TYPE_RASTER] = GRAPHICS,
+ [RA_RENDERPASS_TYPE_COMPUTE] = COMPUTE,
+ };
+
+ struct vk_cmd *cmd = vk_require_cmd(ra, types[pass->params.type]);
if (!cmd)
goto error;
@@ -1603,7 +1646,7 @@ static void vk_blit(struct ra *ra, struct ra_tex *dst, struct ra_tex *src,
struct ra_tex_vk *src_vk = src->priv;
struct ra_tex_vk *dst_vk = dst->priv;
- struct vk_cmd *cmd = vk_require_cmd(ra);
+ struct vk_cmd *cmd = vk_require_cmd(ra, GRAPHICS);
if (!cmd)
return;
@@ -1643,7 +1686,7 @@ static void vk_clear(struct ra *ra, struct ra_tex *tex, float color[4],
struct ra_tex_vk *tex_vk = tex->priv;
assert(tex->params.blit_dst);
- struct vk_cmd *cmd = vk_require_cmd(ra);
+ struct vk_cmd *cmd = vk_require_cmd(ra, GRAPHICS);
if (!cmd)
return;
@@ -1726,7 +1769,7 @@ error:
static void vk_timer_record(struct ra *ra, VkQueryPool pool, int index,
VkPipelineStageFlags stage)
{
- struct vk_cmd *cmd = vk_require_cmd(ra);
+ struct vk_cmd *cmd = vk_require_cmd(ra, GRAPHICS);
if (!cmd)
return;
@@ -1795,7 +1838,7 @@ static struct ra_fns ra_fns_vk = {
struct vk_cmd *ra_vk_submit(struct ra *ra, struct ra_tex *tex)
{
struct ra_vk *p = ra->priv;
- struct vk_cmd *cmd = vk_require_cmd(ra);
+ struct vk_cmd *cmd = vk_require_cmd(ra, GRAPHICS);
if (!cmd)
return NULL;
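Two heuristics in this file pick the upload path: host-mutable textures switch to the dedicated transfer queue when one exists, and tex-upload staging buffers use TRANSFER-style updates above 1 MB (storage buffers always update on COMPUTE). A standalone mirror of those two decisions (stub arguments replace the real params/mpvk_ctx):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    enum queue_type { GRAPHICS, COMPUTE, TRANSFER };

    // Mirrors tex_vk->upload_queue selection in vk_tex_create
    static enum queue_type tex_upload_queue(bool host_mutable, bool have_tf)
    {
        return (host_mutable && have_tf) ? TRANSFER : GRAPHICS;
    }

    // Mirrors buf_vk->update_queue for RA_BUF_TYPE_TEX_UPLOAD buffers
    static enum queue_type buf_update_queue(size_t size)
    {
        return size > 1024 * 1024 ? TRANSFER : GRAPHICS; // 1 MB threshold
    }

    int main(void)
    {
        printf("%d %d\n", tex_upload_queue(true, true), // TRANSFER
                          buf_update_queue(4096));      // GRAPHICS
        return 0;
    }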
diff --git a/video/out/vulkan/utils.c b/video/out/vulkan/utils.c
index 9d9d8d9820..cb73e7d8ac 100644
--- a/video/out/vulkan/utils.c
+++ b/video/out/vulkan/utils.c
@@ -139,7 +139,15 @@ void mpvk_uninit(struct mpvk_ctx *vk)
return;
if (vk->dev) {
- vk_cmdpool_destroy(vk, vk->pool);
+ mpvk_flush_commands(vk);
+ mpvk_poll_commands(vk, UINT64_MAX);
+ assert(vk->num_cmds_queued == 0);
+ assert(vk->num_cmds_pending == 0);
+ talloc_free(vk->cmds_queued);
+ talloc_free(vk->cmds_pending);
+ for (int i = 0; i < vk->num_pools; i++)
+ vk_cmdpool_destroy(vk, vk->pools[i]);
+ talloc_free(vk->pools);
for (int i = 0; i < vk->num_signals; i++)
vk_signal_destroy(vk, &vk->signals[i]);
talloc_free(vk->signals);
@@ -377,6 +385,53 @@ error:
return false;
}
+// Find the most specialized queue family supporting a combination of flags.
+// In cases where there are multiple queue families at the same specialization
+// level, this finds the one with the most queues. Returns -1 if no suitable
+// queue family was found.
+static int find_qf(VkQueueFamilyProperties *qfs, int qfnum, VkQueueFlags flags)
+{
+ int idx = -1;
+ for (int i = 0; i < qfnum; i++) {
+ if (!(qfs[i].queueFlags & flags))
+ continue;
+
+ // QF is more specialized
+ if (idx < 0 || qfs[i].queueFlags < qfs[idx].queueFlags)
+ idx = i;
+
+ // QF has more queues (at the same specialization level)
+ if (qfs[i].queueFlags == qfs[idx].queueFlags &&
+ qfs[i].queueCount > qfs[idx].queueCount)
+ idx = i;
+ }
+
+ return idx;
+}
+
+static void add_qinfo(void *tactx, VkDeviceQueueCreateInfo **qinfos,
+ int *num_qinfos, VkQueueFamilyProperties *qfs, int idx,
+ int qcount)
+{
+ if (idx < 0)
+ return;
+
+ // Check to see if we've already added this queue family
+ for (int i = 0; i < *num_qinfos; i++) {
+ if ((*qinfos)[i].queueFamilyIndex == idx)
+ return;
+ }
+
+ float *priorities = talloc_zero_array(tactx, float, qcount);
+ VkDeviceQueueCreateInfo qinfo = {
+ .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
+ .queueFamilyIndex = idx,
+ .queueCount = MPMIN(qcount, qfs[idx].queueCount),
+ .pQueuePriorities = priorities,
+ };
+
+ MP_TARRAY_APPEND(tactx, *qinfos, *num_qinfos, qinfo);
+}
+
bool mpvk_device_init(struct mpvk_ctx *vk, struct mpvk_device_opts opts)
{
assert(vk->physd);
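To see what the new find_qf above selects in practice, consider a hypothetical device exposing one do-everything family and one transfer-only family, a common desktop layout. The function treats a numerically smaller queueFlags value as "more specialized", which in practice favors families with fewer capability bits, so the dedicated family wins for TRANSFER. A standalone check (local copy of the logic; the family properties are made up):

    #include <stdio.h>
    #include <vulkan/vulkan.h>

    // Local copy of find_qf's selection logic, for demonstration
    static int find_qf(VkQueueFamilyProperties *qfs, int qfnum,
                       VkQueueFlags flags)
    {
        int idx = -1;
        for (int i = 0; i < qfnum; i++) {
            if (!(qfs[i].queueFlags & flags))
                continue;
            if (idx < 0 || qfs[i].queueFlags < qfs[idx].queueFlags)
                idx = i; // numerically smaller flags => more specialized
            if (qfs[i].queueFlags == qfs[idx].queueFlags &&
                qfs[i].queueCount > qfs[idx].queueCount)
                idx = i; // tie-break on queue count
        }
        return idx;
    }

    int main(void)
    {
        // Hypothetical families: one general-purpose, one transfer-only
        VkQueueFamilyProperties qfs[2] = {
            { .queueFlags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT |
                            VK_QUEUE_TRANSFER_BIT, .queueCount = 16 },
            { .queueFlags = VK_QUEUE_TRANSFER_BIT, .queueCount = 2 },
        };

        printf("graphics -> QF %d\n", find_qf(qfs, 2, VK_QUEUE_GRAPHICS_BIT));
        printf("transfer -> QF %d\n", find_qf(qfs, 2, VK_QUEUE_TRANSFER_BIT));
        return 0; // prints QF 0 for graphics, QF 1 for transfer
    }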
@@ -395,45 +450,34 @@ bool mpvk_device_init(struct mpvk_ctx *vk, struct mpvk_device_opts opts)
(unsigned)qfs[i].queueFlags, (int)qfs[i].queueCount);
}
- // For most of our rendering operations, we want to use one "primary" pool,
- // so just pick the queue family with the most features.
- int idx = -1;
- for (int i = 0; i < qfnum; i++) {
- if (!(qfs[i].queueFlags & VK_QUEUE_GRAPHICS_BIT))
- continue;
-
- // QF supports more features
- if (idx < 0 || qfs[i].queueFlags > qfs[idx].queueFlags)
- idx = i;
-
- // QF supports more queues (at the same specialization level)
- if (qfs[i].queueFlags == qfs[idx].queueFlags &&
- qfs[i].queueCount > qfs[idx].queueCount)
- {
- idx = i;
- }
- }
+ int idx_gfx = find_qf(qfs, qfnum, VK_QUEUE_GRAPHICS_BIT),
+ idx_comp = find_qf(qfs, qfnum, VK_QUEUE_COMPUTE_BIT),
+ idx_tf = find_qf(qfs, qfnum, VK_QUEUE_TRANSFER_BIT);
// Vulkan requires at least one GRAPHICS queue, so if this fails something
// is horribly wrong.
- assert(idx >= 0);
+ assert(idx_gfx >= 0);
+ MP_VERBOSE(vk, "Using graphics queue (QF %d)\n", idx_gfx);
// Ensure we can actually present to the surface using this queue
VkBool32 sup;
- VK(vkGetPhysicalDeviceSurfaceSupportKHR(vk->physd, idx, vk->surf, &sup));
+ VK(vkGetPhysicalDeviceSurfaceSupportKHR(vk->physd, idx_gfx, vk->surf, &sup));
if (!sup) {
MP_ERR(vk, "Queue family does not support surface presentation!\n");
goto error;
}
+ if (idx_tf >= 0 && idx_tf != idx_gfx)
+ MP_VERBOSE(vk, "Using async transfer (QF %d)\n", idx_tf);
+ if (idx_comp >= 0 && idx_comp != idx_gfx)
+ MP_VERBOSE(vk, "Using async compute (QF %d)\n", idx_comp);
+
// Now that we know which QFs we want, we can create the logical device
- float *priorities = talloc_zero_array(tmp, float, opts.queue_count);
- VkDeviceQueueCreateInfo qinfo = {
- .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
- .queueFamilyIndex = idx,
- .queueCount = MPMIN(qfs[idx].queueCount, opts.queue_count),
- .pQueuePriorities = priorities,
- };
+ VkDeviceQueueCreateInfo *qinfos = NULL;
+ int num_qinfos = 0;
+ add_qinfo(tmp, &qinfos, &num_qinfos, qfs, idx_gfx, opts.queue_count);
+ add_qinfo(tmp, &qinfos, &num_qinfos, qfs, idx_comp, opts.queue_count);
+ add_qinfo(tmp, &qinfos, &num_qinfos, qfs, idx_tf, opts.queue_count);
const char **exts = NULL;
int num_exts = 0;
@@ -443,8 +487,8 @@ bool mpvk_device_init(struct mpvk_ctx *vk, struct mpvk_device_opts opts)
VkDeviceCreateInfo dinfo = {
.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
- .queueCreateInfoCount = 1,
- .pQueueCreateInfos = &qinfo,
+ .pQueueCreateInfos = qinfos,
+ .queueCreateInfoCount = num_qinfos,
.ppEnabledExtensionNames = exts,
.enabledExtensionCount = num_exts,
};
@@ -455,12 +499,20 @@ bool mpvk_device_init(struct mpvk_ctx *vk, struct mpvk_device_opts opts)
VK(vkCreateDevice(vk->physd, &dinfo, MPVK_ALLOCATOR, &vk->dev));
- vk_malloc_init(vk);
+ // Create the command pools and memory allocator
+ for (int i = 0; i < num_qinfos; i++) {
+ int qf = qinfos[i].queueFamilyIndex;
+ struct vk_cmdpool *pool = vk_cmdpool_create(vk, qinfos[i], qfs[qf]);
+ if (!pool)
+ goto error;
+ MP_TARRAY_APPEND(NULL, vk->pools, vk->num_pools, pool);
+ }
- // Create the command pool(s)
- vk->pool = vk_cmdpool_create(vk, qinfo, qfs[idx]);
- if (!vk->pool)
- goto error;
+ // Map the pool_* pointers by QF index; vk->pools is ordered by qinfos,
+ // so it cannot be indexed by the QF indices directly
+ for (int i = 0; i < vk->num_pools; i++) {
+ struct vk_cmdpool *pool = vk->pools[i];
+ vk->pool_graphics = pool->qf == idx_gfx ? pool : vk->pool_graphics;
+ vk->pool_compute = pool->qf == idx_comp ? pool : vk->pool_compute;
+ vk->pool_transfer = pool->qf == idx_tf ? pool : vk->pool_transfer;
+ }
+
+ vk_malloc_init(vk);
talloc_free(tmp);
return true;
@@ -563,10 +615,8 @@ static void vk_cmdpool_destroy(struct mpvk_ctx *vk, struct vk_cmdpool *pool)
if (!pool)
return;
- for (int i = 0; i < pool->num_cmds_available; i++)
- vk_cmd_destroy(vk, pool->cmds_available[i]);
- for (int i = 0; i < pool->num_cmds_pending; i++)
- vk_cmd_destroy(vk, pool->cmds_pending[i]);
+ for (int i = 0; i < pool->num_cmds; i++)
+ vk_cmd_destroy(vk, pool->cmds[i]);
vkDestroyCommandPool(vk->dev, pool->pool, MPVK_ALLOCATOR);
talloc_free(pool);
@@ -603,26 +653,67 @@ error:
return NULL;
}
-void mpvk_pool_wait_cmds(struct mpvk_ctx *vk, struct vk_cmdpool *pool,
- uint64_t timeout)
+void mpvk_poll_commands(struct mpvk_ctx *vk, uint64_t timeout)
{
- if (!pool)
- return;
-
- while (pool->num_cmds_pending > 0) {
- struct vk_cmd *cmd = pool->cmds_pending[0];
+ while (vk->num_cmds_pending > 0) {
+ struct vk_cmd *cmd = vk->cmds_pending[0];
+ struct vk_cmdpool *pool = cmd->pool;
VkResult res = vk_cmd_poll(vk, cmd, timeout);
if (res == VK_TIMEOUT)
break;
vk_cmd_reset(vk, cmd);
- MP_TARRAY_REMOVE_AT(pool->cmds_pending, pool->num_cmds_pending, 0);
- MP_TARRAY_APPEND(pool, pool->cmds_available, pool->num_cmds_available, cmd);
+ MP_TARRAY_REMOVE_AT(vk->cmds_pending, vk->num_cmds_pending, 0);
+ MP_TARRAY_APPEND(pool, pool->cmds, pool->num_cmds, cmd);
}
}
-void mpvk_dev_wait_cmds(struct mpvk_ctx *vk, uint64_t timeout)
+bool mpvk_flush_commands(struct mpvk_ctx *vk)
{
- mpvk_pool_wait_cmds(vk, vk->pool, timeout);
+ bool ret = true;
+
+ for (int i = 0; i < vk->num_cmds_queued; i++) {
+ struct vk_cmd *cmd = vk->cmds_queued[i];
+ struct vk_cmdpool *pool = cmd->pool;
+
+ VkSubmitInfo sinfo = {
+ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+ .commandBufferCount = 1,
+ .pCommandBuffers = &cmd->buf,
+ .waitSemaphoreCount = cmd->num_deps,
+ .pWaitSemaphores = cmd->deps,
+ .pWaitDstStageMask = cmd->depstages,
+ .signalSemaphoreCount = cmd->num_sigs,
+ .pSignalSemaphores = cmd->sigs,
+ };
+
+ VK(vkQueueSubmit(cmd->queue, 1, &sinfo, cmd->fence));
+ MP_TARRAY_APPEND(NULL, vk->cmds_pending, vk->num_cmds_pending, cmd);
+
+ if (mp_msg_test(vk->log, MSGL_TRACE)) {
+ MP_TRACE(vk, "Submitted command on queue %p (QF %d):\n",
+ (void *)cmd->queue, pool->qf);
+ for (int n = 0; n < cmd->num_deps; n++)
+ MP_TRACE(vk, " waits on semaphore %p\n", (void *)cmd->deps[n]);
+ for (int n = 0; n < cmd->num_sigs; n++)
+ MP_TRACE(vk, " signals semaphore %p\n", (void *)cmd->sigs[n]);
+ }
+ continue;
+
+error:
+ vk_cmd_reset(vk, cmd);
+ MP_TARRAY_APPEND(pool, pool->cmds, pool->num_cmds, cmd);
+ ret = false;
+ }
+
+ vk->num_cmds_queued = 0;
+
+ // Rotate the queues to ensure good parallelism across frames
+ for (int i = 0; i < vk->num_pools; i++) {
+ struct vk_cmdpool *pool = vk->pools[i];
+ pool->idx_queues = (pool->idx_queues + 1) % pool->num_queues;
+ }
+
+ return ret;
}
void vk_dev_callback(struct mpvk_ctx *vk, vk_cb callback, void *p, void *arg)
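The rotation at the end of mpvk_flush_commands advances each pool's idx_queues once per flush, i.e. once per frame, which is how ``--vulkan-queue-count`` greater than 1 spreads consecutive frames across queues. A trivial standalone demonstration of the round-robin (the queue count here is made up):

    #include <stdio.h>

    int main(void)
    {
        int num_queues = 3, idx_queues = 0; // e.g. --vulkan-queue-count=3

        for (int frame = 0; frame < 5; frame++) {
            printf("frame %d submits to queue %d\n", frame, idx_queues);
            // Same rotation mpvk_flush_commands performs after each flush
            idx_queues = (idx_queues + 1) % num_queues;
        }
        return 0;
    }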
@@ -639,10 +730,10 @@ struct vk_cmd *vk_cmd_begin(struct mpvk_ctx *vk, struct vk_cmdpool *pool)
{
// garbage collect the cmdpool first, to increase the chances of getting
// an already-available command buffer
- mpvk_pool_wait_cmds(vk, pool, 0);
+ mpvk_poll_commands(vk, 0);
struct vk_cmd *cmd = NULL;
- if (MP_TARRAY_POP(pool->cmds_available, pool->num_cmds_available, &cmd))
+ if (MP_TARRAY_POP(pool->cmds, pool->num_cmds, &cmd))
goto done;
// No free command buffers => allocate another one
@@ -675,58 +766,13 @@ void vk_cmd_queue(struct mpvk_ctx *vk, struct vk_cmd *cmd)
VK(vkEndCommandBuffer(cmd->buf));
VK(vkResetFences(vk->dev, 1, &cmd->fence));
- MP_TARRAY_APPEND(pool, pool->cmds_queued, pool->num_cmds_queued, cmd);
+ MP_TARRAY_APPEND(NULL, vk->cmds_queued, vk->num_cmds_queued, cmd);
vk->last_cmd = cmd;
return;
error:
vk_cmd_reset(vk, cmd);
- MP_TARRAY_APPEND(pool, pool->cmds_available, pool->num_cmds_available, cmd);
-}
-
-bool vk_flush_commands(struct mpvk_ctx *vk)
-{
- bool ret = true;
-
- struct vk_cmdpool *pool = vk->pool;
- for (int i = 0; i < pool->num_cmds_queued; i++) {
- struct vk_cmd *cmd = pool->cmds_queued[i];
-
- VkSubmitInfo sinfo = {
- .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
- .commandBufferCount = 1,
- .pCommandBuffers = &cmd->buf,
- .waitSemaphoreCount = cmd->num_deps,
- .pWaitSemaphores = cmd->deps,
- .pWaitDstStageMask = cmd->depstages,
- .signalSemaphoreCount = cmd->num_sigs,
- .pSignalSemaphores = cmd->sigs,
- };
-
- VK(vkQueueSubmit(cmd->queue, 1, &sinfo, cmd->fence));
- MP_TARRAY_APPEND(pool, pool->cmds_pending, pool->num_cmds_pending, cmd);
-
- if (mp_msg_test(vk->log, MSGL_TRACE)) {
- MP_TRACE(vk, "Submitted command on queue %p (QF %d):\n",
- (void *)cmd->queue, pool->qf);
- for (int n = 0; n < cmd->num_deps; n++)
- MP_TRACE(vk, " waits on semaphore %p\n", (void *)cmd->deps[n]);
- for (int n = 0; n < cmd->num_sigs; n++)
- MP_TRACE(vk, " signals semaphore %p\n", (void *)cmd->sigs[n]);
- }
- continue;
-
-error:
- vk_cmd_reset(vk, cmd);
- MP_TARRAY_APPEND(pool, pool->cmds_available, pool->num_cmds_available, cmd);
- ret = false;
- }
-
- pool->num_cmds_queued = 0;
-
- // Rotate the queues to ensure good parallelism across frames
- pool->idx_queues = (pool->idx_queues + 1) % pool->num_queues;
- return ret;
+ MP_TARRAY_APPEND(pool, pool->cmds, pool->num_cmds, cmd);
}
void vk_signal_destroy(struct mpvk_ctx *vk, struct vk_signal **sig)
@@ -762,10 +808,16 @@ struct vk_signal *vk_cmd_signal(struct mpvk_ctx *vk, struct vk_cmd *cmd,
VK(vkCreateEvent(vk->dev, &einfo, MPVK_ALLOCATOR, &sig->event));
done:
- // Signal both the semaphore and the event. (We will only end up using one)
+ // Signal both the semaphore and the event if possible. (We will only
+ // end up using one or the other)
vk_cmd_sig(cmd, sig->semaphore);
- vkCmdSetEvent(cmd->buf, sig->event, stage);
- sig->event_source = cmd->queue;
+
+ VkQueueFlags req = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT;
+ if (cmd->pool->props.queueFlags & req) {
+ vkCmdSetEvent(cmd->buf, sig->event, stage);
+ sig->event_source = cmd->queue;
+ }
+
return sig;
error:
@@ -787,14 +839,14 @@ static bool unsignal_cmd(struct vk_cmd *cmd, VkSemaphore sem)
// Attempts to remove a queued signal operation. Returns true if successful,
// i.e. the signal could be removed before it ever got fired.
-static bool unsignal(struct vk_cmd *cmd, VkSemaphore sem)
+static bool unsignal(struct mpvk_ctx *vk, struct vk_cmd *cmd, VkSemaphore sem)
{
if (unsignal_cmd(cmd, sem))
return true;
// Attempt to remove it from any queued commands
- for (int i = 0; i < cmd->pool->num_cmds_queued; i++) {
- if (unsignal_cmd(cmd->pool->cmds_queued[i], sem))
+ for (int i = 0; i < vk->num_cmds_queued; i++) {
+ if (unsignal_cmd(vk->cmds_queued[i], sem))
return true;
}
@@ -806,7 +858,9 @@ static void release_signal(struct mpvk_ctx *vk, struct vk_signal *sig)
// The semaphore never needs to be recreated, because it's either
// unsignaled while still queued, or unsignaled as a result of a device
// wait. But the event *may* need to be reset, so just always reset it.
- vkResetEvent(vk->dev, sig->event);
+ if (sig->event_source)
+ vkResetEvent(vk->dev, sig->event);
+ sig->event_source = NULL;
MP_TARRAY_APPEND(NULL, vk->signals, vk->num_signals, sig);
}
@@ -819,7 +873,7 @@ void vk_cmd_wait(struct mpvk_ctx *vk, struct vk_cmd *cmd,
return;
if (out_event && sig->event && sig->event_source == cmd->queue &&
- unsignal(cmd, sig->semaphore))
+ unsignal(vk, cmd, sig->semaphore))
{
// If we can remove the semaphore signal operation from the history and
// pretend it never happened, then we get to use the VkEvent. This also
diff --git a/video/out/vulkan/utils.h b/video/out/vulkan/utils.h
index 538897afae..de3a757be3 100644
--- a/video/out/vulkan/utils.h
+++ b/video/out/vulkan/utils.h
@@ -66,9 +66,13 @@ bool mpvk_device_init(struct mpvk_ctx *vk, struct mpvk_device_opts opts);
// UINT64_MAX effectively means waiting until the pool/device is idle. The
// timeout may also be passed as 0, in which case this function will not block,
// but only poll for completed commands.
-void mpvk_pool_wait_cmds(struct mpvk_ctx *vk, struct vk_cmdpool *pool,
- uint64_t timeout);
-void mpvk_dev_wait_cmds(struct mpvk_ctx *vk, uint64_t timeout);
+void mpvk_poll_commands(struct mpvk_ctx *vk, uint64_t timeout);
+
+// Flush all currently queued commands. Call this once per frame, after
+// submitting all of the command buffers for that frame. Calling this more
+// often than that is possible but bad for performance.
+// Returns whether successful. Failed commands will be implicitly dropped.
+bool mpvk_flush_commands(struct mpvk_ctx *vk);
// Since lots of vulkan operations need to be done lazily once the affected
// resources are no longer in use, provide an abstraction for tracking these.
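Taken together, the declarations above suggest the following per-frame driving pattern. This is a sketch against mpv's internal headers rather than a standalone program, with error handling elided:

    #include "video/out/vulkan/utils.h"

    // Hypothetical per-frame flow using the new API (sketch only)
    static void render_frame(struct mpvk_ctx *vk)
    {
        struct vk_cmd *cmd = vk_cmd_begin(vk, vk->pool_graphics);
        if (!cmd)
            return;

        // ... record rendering commands into cmd->buf ...

        vk_cmd_queue(vk, cmd);     // cmd ownership passes back; don't touch it
        mpvk_flush_commands(vk);   // submit everything queued this frame
        mpvk_poll_commands(vk, 0); // non-blocking: recycle finished buffers
    }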
@@ -158,13 +162,10 @@ struct vk_cmdpool {
VkQueue *queues;
int num_queues;
int idx_queues;
- // Command buffers associated with this queue
- struct vk_cmd **cmds_available; // available for re-recording
- struct vk_cmd **cmds_queued; // recorded but not yet submitted
- struct vk_cmd **cmds_pending; // submitted but not completed
- int num_cmds_available;
- int num_cmds_queued;
- int num_cmds_pending;
+ // Command buffers associated with this queue. These are available for
+ // re-recording
+ struct vk_cmd **cmds;
+ int num_cmds;
};
// Fetch a command buffer from a command pool and begin recording to it.
@@ -175,12 +176,6 @@ struct vk_cmd *vk_cmd_begin(struct mpvk_ctx *vk, struct vk_cmdpool *pool);
// takes over ownership of *cmd, i.e. the caller should not touch it again.
void vk_cmd_queue(struct mpvk_ctx *vk, struct vk_cmd *cmd);
-// Flush all currently queued commands. Call this once per frame, after
-// submitting all of the command buffers for that frame. Calling this more
-// often than that is possible but bad for performance.
-// Returns whether successful. Failed commands will be implicitly dropped.
-bool vk_flush_commands(struct mpvk_ctx *vk);
-
// Predefined structs for a simple non-layered, non-mipped image
extern const VkImageSubresourceRange vk_range;
extern const VkImageSubresourceLayers vk_layers;