author    Niklas Haas <git@haasn.xyz>    2017-09-24 15:05:24 +0200
committer Martin Herkt <652892+lachs0r@users.noreply.github.com>    2017-12-25 00:47:53 +0100
commit    bded247fb53558dd5cba26560d1f24e9234ae24e (patch)
tree      1e2c1819fc009acf9eea0d481a003799f7ffda8c
parent    a3c9685257e60e32646bb54a895ef7574a945f69 (diff)
vo_gpu: vulkan: support split command pools
Instead of using a single primary queue, we generate multiple vk_cmdpools
and pick the right one dynamically based on the intent. This has a number
of immediate benefits:

1. We can use async texture uploads
2. We can use the DMA engine for buffer updates
3. We can benefit from async compute on AMD GPUs

Unfortunately, the major downside is that due to the lack of QF ownership
tracking, we need to use CONCURRENT sharing for all resources (buffers
*and* images!). In theory, we could try figuring out a way to get rid of
the concurrent sharing for buffers (which is only needed for compute
shader UBOs), but even so, the concurrent sharing mode doesn't really seem
to have a significant impact over here (nvidia). It's possible that other
platforms may disagree.

Our deadlock-avoidance strategy is stupidly simple: Just flush the command
every time we need to switch queues, and make sure all submission and
callbacks happen in FIFO order. This required lifting the cmds_pending and
cmds_queued out from vk_cmdpool to mpvk_ctx, and some functions died/got
moved as a result, but that's a relatively minor change.

On my hardware this is a fairly significant performance boost, mainly due
to async transfers. (Nvidia doesn't expose separate compute queues
anyway). On AMD, this should be a performance boost as well due to async
compute.
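As a reading aid for the routing described above, the following sketch
paraphrases the patch's new vk_require_cmd() from ra_vk.c below. It is a
simplified illustration rather than the literal code, and relies only on the
pool pointers and helpers introduced by this commit:

    enum queue_type { GRAPHICS, COMPUTE, TRANSFER };

    // Pick a command pool based on the caller's intent, then make sure the
    // currently recording command belongs to that pool. Switching pools
    // queues (flushes) the current command first, which together with FIFO
    // submission is the entire deadlock-avoidance strategy.
    static struct vk_cmd *require_cmd_sketch(struct ra *ra, enum queue_type type)
    {
        struct ra_vk *p = ra->priv;
        struct mpvk_ctx *vk = ra_vk_get(ra);
        struct vk_cmdpool *pool;

        switch (type) {
        case GRAPHICS: pool = vk->pool_graphics; break; // always present
        case COMPUTE:  pool = vk->pool_compute;  break; // NULL => no compute caps
        case TRANSFER:
            // GRAPHICS and COMPUTE queues also imply TRANSFER capability, so
            // fall back to them when no dedicated transfer QF exists.
            pool = vk->pool_transfer ? vk->pool_transfer
                 : vk->pool_compute  ? vk->pool_compute
                 : vk->pool_graphics;
            break;
        default: abort();
        }

        if (p->cmd && p->cmd->pool == pool)
            return p->cmd;                   // same queue family, keep recording

        vk_submit(ra);                       // queue the current command, if any
        p->cmd = vk_cmd_begin(vk, pool);     // may return NULL on error
        return p->cmd;
    }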
-rw-r--r--  DOCS/man/options.rst        3
-rw-r--r--  video/out/vulkan/common.h   19
-rw-r--r--  video/out/vulkan/context.c  11
-rw-r--r--  video/out/vulkan/malloc.c   12
-rw-r--r--  video/out/vulkan/ra_vk.c    107
-rw-r--r--  video/out/vulkan/utils.c    268
-rw-r--r--  video/out/vulkan/utils.h    27
7 files changed, 283 insertions, 164 deletions
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index a92a8df962..f50f67a203 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -4295,7 +4295,8 @@ The following video options are currently all specific to ``--vo=gpu`` and
Controls the number of VkQueues used for rendering (limited by how many
your device supports). In theory, using more queues could enable some
parallelism between frames (when using a ``--swapchain-depth`` higher than
- 1). (Default: 1)
+ 1), but it can also slow things down on hardware where there's no true
+ parallelism between queues. (Default: 1)
``--d3d11-warp=<yes|no|auto>``
Use WARP (Windows Advanced Rasterization Platform) with the D3D11 GPU
diff --git a/video/out/vulkan/common.h b/video/out/vulkan/common.h
index de49c6f1af..b849b6dc0b 100644
--- a/video/out/vulkan/common.h
+++ b/video/out/vulkan/common.h
@@ -48,10 +48,23 @@ struct mpvk_ctx {
VkSurfaceKHR surf;
VkSurfaceFormatKHR surf_format; // picked at surface initialization time
- struct vk_malloc *alloc; // memory allocator for this device
- struct vk_cmdpool *pool; // primary command pool for this device
- struct vk_cmd *last_cmd; // most recently submitted (pending) command
+ struct vk_malloc *alloc; // memory allocator for this device
struct spirv_compiler *spirv; // GLSL -> SPIR-V compiler
+ struct vk_cmdpool **pools; // command pools (one per queue family)
+ int num_pools;
+ struct vk_cmd *last_cmd; // most recently submitted command
+
+ // Queued/pending commands. These are shared for the entire mpvk_ctx to
+ // ensure submission and callbacks are FIFO
+ struct vk_cmd **cmds_queued; // recorded but not yet submitted
+ struct vk_cmd **cmds_pending; // submitted but not completed
+ int num_cmds_queued;
+ int num_cmds_pending;
+
+ // Pointers into *pools
+ struct vk_cmdpool *pool_graphics; // required
+ struct vk_cmdpool *pool_compute; // optional
+ struct vk_cmdpool *pool_transfer; // optional
// Common pool of signals, to avoid having to re-create these objects often
struct vk_signal **signals;
diff --git a/video/out/vulkan/context.c b/video/out/vulkan/context.c
index ddc80be042..4f96440652 100644
--- a/video/out/vulkan/context.c
+++ b/video/out/vulkan/context.c
@@ -245,7 +245,8 @@ void ra_vk_ctx_uninit(struct ra_ctx *ctx)
struct priv *p = ctx->swapchain->priv;
struct mpvk_ctx *vk = p->vk;
- mpvk_dev_wait_cmds(vk, UINT64_MAX);
+ mpvk_flush_commands(vk);
+ mpvk_poll_commands(vk, UINT64_MAX);
for (int i = 0; i < p->num_images; i++)
ra_tex_free(ctx->ra, &p->images[i]);
@@ -355,7 +356,7 @@ bool ra_vk_ctx_resize(struct ra_swapchain *sw, int w, int h)
// more than one swapchain already active, so we need to flush any pending
// asynchronous swapchain release operations that may be ongoing.
while (p->old_swapchain)
- mpvk_dev_wait_cmds(vk, 100000); // 100μs
+ mpvk_poll_commands(vk, 100000); // 100μs
VkSwapchainCreateInfoKHR sinfo = p->protoInfo;
sinfo.imageExtent = (VkExtent2D){ w, h };
@@ -501,14 +502,14 @@ static bool submit_frame(struct ra_swapchain *sw, const struct vo_frame *frame)
vk_cmd_callback(cmd, (vk_cb) present_cb, p, NULL);
vk_cmd_queue(vk, cmd);
- if (!vk_flush_commands(vk))
+ if (!mpvk_flush_commands(vk))
goto error;
// Older nvidia drivers can spontaneously combust when submitting to the
// same queue as we're rendering from, in a multi-queue scenario. Safest
// option is to flush the commands first and then submit to the next queue.
// We can drop this hack in the future, I suppose.
- struct vk_cmdpool *pool = vk->pool;
+ struct vk_cmdpool *pool = vk->pool_graphics;
VkQueue queue = pool->queues[pool->idx_queues];
VkPresentInfoKHR pinfo = {
@@ -533,7 +534,7 @@ static void swap_buffers(struct ra_swapchain *sw)
struct priv *p = sw->priv;
while (p->frames_in_flight >= sw->ctx->opts.swapchain_depth)
- mpvk_dev_wait_cmds(p->vk, 100000); // 100μs
+ mpvk_poll_commands(p->vk, 100000); // 100μs
}
static const struct ra_swapchain_fns vulkan_swapchain = {
diff --git a/video/out/vulkan/malloc.c b/video/out/vulkan/malloc.c
index f6cb1143bb..a9aced33d8 100644
--- a/video/out/vulkan/malloc.c
+++ b/video/out/vulkan/malloc.c
@@ -133,11 +133,23 @@ static struct vk_slab *slab_alloc(struct mpvk_ctx *vk, struct vk_heap *heap,
uint32_t typeBits = heap->typeBits ? heap->typeBits : UINT32_MAX;
if (heap->usage) {
+ // FIXME: Since we can't keep track of queue family ownership properly,
+ // and we don't know in advance what types of queue families this buffer
+ // will belong to, we're forced to share all of our buffers between all
+ // command pools.
+ uint32_t qfs[3] = {0};
+ for (int i = 0; i < vk->num_pools; i++)
+ qfs[i] = vk->pools[i]->qf;
+
VkBufferCreateInfo binfo = {
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.size = slab->size,
.usage = heap->usage,
+ .sharingMode = vk->num_pools > 1 ? VK_SHARING_MODE_CONCURRENT
+ : VK_SHARING_MODE_EXCLUSIVE,
+ .queueFamilyIndexCount = vk->num_pools,
+ .pQueueFamilyIndices = qfs,
};
VK(vkCreateBuffer(vk->dev, &binfo, MPVK_ALLOCATOR, &slab->buffer));
diff --git a/video/out/vulkan/ra_vk.c b/video/out/vulkan/ra_vk.c
index 42e6c2f64c..905fc89596 100644
--- a/video/out/vulkan/ra_vk.c
+++ b/video/out/vulkan/ra_vk.c
@@ -6,6 +6,12 @@
static struct ra_fns ra_fns_vk;
+enum queue_type {
+ GRAPHICS,
+ COMPUTE,
+ TRANSFER,
+};
+
// For ra.priv
struct ra_vk {
struct mpvk_ctx *vk;
@@ -22,18 +28,6 @@ struct mpvk_ctx *ra_vk_get(struct ra *ra)
return p->vk;
}
-// Returns a command buffer, or NULL on error
-static struct vk_cmd *vk_require_cmd(struct ra *ra)
-{
- struct ra_vk *p = ra->priv;
- struct mpvk_ctx *vk = ra_vk_get(ra);
-
- if (!p->cmd)
- p->cmd = vk_cmd_begin(vk, vk->pool);
-
- return p->cmd;
-}
-
static void vk_submit(struct ra *ra)
{
struct ra_vk *p = ra->priv;
@@ -45,22 +39,46 @@ static void vk_submit(struct ra *ra)
}
}
-// The callback's *priv will always be set to `ra`
-static void vk_callback(struct ra *ra, vk_cb callback, void *arg)
+// Returns a command buffer, or NULL on error
+static struct vk_cmd *vk_require_cmd(struct ra *ra, enum queue_type type)
{
struct ra_vk *p = ra->priv;
struct mpvk_ctx *vk = ra_vk_get(ra);
- if (p->cmd) {
- vk_cmd_callback(p->cmd, callback, ra, arg);
- } else {
- vk_dev_callback(vk, callback, ra, arg);
+ struct vk_cmdpool *pool;
+ switch (type) {
+ case GRAPHICS: pool = vk->pool_graphics; break;
+ case COMPUTE: pool = vk->pool_compute; break;
+
+ // GRAPHICS and COMPUTE also imply TRANSFER capability (vulkan spec)
+ case TRANSFER:
+ pool = vk->pool_transfer;
+ if (!pool)
+ pool = vk->pool_compute;
+ if (!pool)
+ pool = vk->pool_graphics;
+ break;
+ default: abort();
}
+
+ assert(pool);
+ if (p->cmd && p->cmd->pool == pool)
+ return p->cmd;
+
+ vk_submit(ra);
+ p->cmd = vk_cmd_begin(vk, pool);
+ return p->cmd;
}
#define MAKE_LAZY_DESTRUCTOR(fun, argtype) \
static void fun##_lazy(struct ra *ra, argtype *arg) { \
- vk_callback(ra, (vk_cb) fun, arg); \
+ struct ra_vk *p = ra->priv; \
+ struct mpvk_ctx *vk = ra_vk_get(ra); \
+ if (p->cmd) { \
+ vk_cmd_callback(p->cmd, (vk_cb) fun, ra, arg); \
+ } else { \
+ vk_dev_callback(vk, (vk_cb) fun, ra, arg); \
+ } \
}
static void vk_destroy_ra(struct ra *ra)
@@ -69,8 +87,8 @@ static void vk_destroy_ra(struct ra *ra)
struct mpvk_ctx *vk = ra_vk_get(ra);
vk_submit(ra);
- vk_flush_commands(vk);
- mpvk_dev_wait_cmds(vk, UINT64_MAX);
+ mpvk_flush_commands(vk);
+ mpvk_poll_commands(vk, UINT64_MAX);
ra_tex_free(ra, &p->clear_tex);
talloc_free(ra);
@@ -190,7 +208,7 @@ struct ra *ra_create_vk(struct mpvk_ctx *vk, struct mp_log *log)
ra->max_shmem = vk->limits.maxComputeSharedMemorySize;
ra->max_pushc_size = vk->limits.maxPushConstantsSize;
- if (vk->pool->props.queueFlags & VK_QUEUE_COMPUTE_BIT)
+ if (vk->pool_compute)
ra->caps |= RA_CAP_COMPUTE;
if (!vk_setup_formats(ra))
@@ -280,6 +298,7 @@ static VkResult vk_create_render_pass(VkDevice dev, const struct ra_format *fmt,
// For ra_tex.priv
struct ra_tex_vk {
bool external_img;
+ enum queue_type upload_queue;
VkImageType type;
VkImage img;
struct vk_memslice mem;
@@ -480,6 +499,7 @@ static struct ra_tex *vk_tex_create(struct ra *ra,
tex->params.initial_data = NULL;
struct ra_tex_vk *tex_vk = tex->priv = talloc_zero(tex, struct ra_tex_vk);
+ tex_vk->upload_queue = GRAPHICS;
const struct vk_format *fmt = params->format->priv;
switch (params->dimensions) {
@@ -501,6 +521,10 @@ static struct ra_tex *vk_tex_create(struct ra *ra,
if (params->host_mutable || params->blit_dst || params->initial_data)
usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
+ // Always use the transfer pool if available, for efficiency
+ if (params->host_mutable && vk->pool_transfer)
+ tex_vk->upload_queue = TRANSFER;
+
// Double-check image usage support and fail immediately if invalid
VkImageFormatProperties iprop;
VkResult res = vkGetPhysicalDeviceImageFormatProperties(vk->physd,
@@ -528,6 +552,14 @@ static struct ra_tex *vk_tex_create(struct ra *ra,
return NULL;
}
+ // FIXME: Since we can't keep track of queue family ownership properly,
+ // and we don't know in advance what types of queue families this image
+ // will belong to, we're forced to share all of our images between all
+ // command pools.
+ uint32_t qfs[3] = {0};
+ for (int i = 0; i < vk->num_pools; i++)
+ qfs[i] = vk->pools[i]->qf;
+
VkImageCreateInfo iinfo = {
.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
.imageType = tex_vk->type,
@@ -539,9 +571,10 @@ static struct ra_tex *vk_tex_create(struct ra *ra,
.tiling = VK_IMAGE_TILING_OPTIMAL,
.usage = usage,
.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
- .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
- .queueFamilyIndexCount = 1,
- .pQueueFamilyIndices = &vk->pool->qf,
+ .sharingMode = vk->num_pools > 1 ? VK_SHARING_MODE_CONCURRENT
+ : VK_SHARING_MODE_EXCLUSIVE,
+ .queueFamilyIndexCount = vk->num_pools,
+ .pQueueFamilyIndices = qfs,
};
VK(vkCreateImage(vk->dev, &iinfo, MPVK_ALLOCATOR, &tex_vk->img));
@@ -632,6 +665,7 @@ struct ra_buf_vk {
struct vk_bufslice slice;
int refcount; // 1 = object allocated but not in use, > 1 = in use
bool needsflush;
+ enum queue_type update_queue;
// "current" metadata, can change during course of execution
VkPipelineStageFlags current_stage;
VkAccessFlags current_access;
@@ -700,7 +734,7 @@ static void vk_buf_update(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset,
memcpy((void *)addr, data, size);
buf_vk->needsflush = true;
} else {
- struct vk_cmd *cmd = vk_require_cmd(ra);
+ struct vk_cmd *cmd = vk_require_cmd(ra, buf_vk->update_queue);
if (!cmd) {
MP_ERR(ra, "Failed updating buffer!\n");
return;
@@ -736,6 +770,9 @@ static struct ra_buf *vk_buf_create(struct ra *ra,
case RA_BUF_TYPE_TEX_UPLOAD:
bufFlags |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+ // Use TRANSFER-style updates for large enough buffers for efficiency
+ if (params->size > 1024*1024) // 1 MB
+ buf_vk->update_queue = TRANSFER;
break;
case RA_BUF_TYPE_UNIFORM:
bufFlags |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
@@ -746,6 +783,7 @@ static struct ra_buf *vk_buf_create(struct ra *ra,
bufFlags |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
align = MP_ALIGN_UP(align, vk->limits.minStorageBufferOffsetAlignment);
+ buf_vk->update_queue = COMPUTE;
break;
case RA_BUF_TYPE_VERTEX:
bufFlags |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
@@ -832,7 +870,7 @@ static bool vk_tex_upload(struct ra *ra,
uint64_t size = region.bufferRowLength * region.bufferImageHeight *
region.imageExtent.depth;
- struct vk_cmd *cmd = vk_require_cmd(ra);
+ struct vk_cmd *cmd = vk_require_cmd(ra, tex_vk->upload_queue);
if (!cmd)
goto error;
@@ -1478,7 +1516,12 @@ static void vk_renderpass_run(struct ra *ra,
struct ra_renderpass *pass = params->pass;
struct ra_renderpass_vk *pass_vk = pass->priv;
- struct vk_cmd *cmd = vk_require_cmd(ra);
+ static const enum queue_type types[] = {
+ [RA_RENDERPASS_TYPE_RASTER] = GRAPHICS,
+ [RA_RENDERPASS_TYPE_COMPUTE] = COMPUTE,
+ };
+
+ struct vk_cmd *cmd = vk_require_cmd(ra, types[pass->params.type]);
if (!cmd)
goto error;
@@ -1603,7 +1646,7 @@ static void vk_blit(struct ra *ra, struct ra_tex *dst, struct ra_tex *src,
struct ra_tex_vk *src_vk = src->priv;
struct ra_tex_vk *dst_vk = dst->priv;
- struct vk_cmd *cmd = vk_require_cmd(ra);
+ struct vk_cmd *cmd = vk_require_cmd(ra, GRAPHICS);
if (!cmd)
return;
@@ -1643,7 +1686,7 @@ static void vk_clear(struct ra *ra, struct ra_tex *tex, float color[4],
struct ra_tex_vk *tex_vk = tex->priv;
assert(tex->params.blit_dst);
- struct vk_cmd *cmd = vk_require_cmd(ra);
+ struct vk_cmd *cmd = vk_require_cmd(ra, GRAPHICS);
if (!cmd)
return;
@@ -1726,7 +1769,7 @@ error:
static void vk_timer_record(struct ra *ra, VkQueryPool pool, int index,
VkPipelineStageFlags stage)
{
- struct vk_cmd *cmd = vk_require_cmd(ra);
+ struct vk_cmd *cmd = vk_require_cmd(ra, GRAPHICS);
if (!cmd)
return;
@@ -1795,7 +1838,7 @@ static struct ra_fns ra_fns_vk = {
struct vk_cmd *ra_vk_submit(struct ra *ra, struct ra_tex *tex)
{
struct ra_vk *p = ra->priv;
- struct vk_cmd *cmd = vk_require_cmd(ra);
+ struct vk_cmd *cmd = vk_require_cmd(ra, GRAPHICS);
if (!cmd)
return NULL;
diff --git a/video/out/vulkan/utils.c b/video/out/vulkan/utils.c
index 9d9d8d9820..cb73e7d8ac 100644
--- a/video/out/vulkan/utils.c
+++ b/video/out/vulkan/utils.c
@@ -139,7 +139,15 @@ void mpvk_uninit(struct mpvk_ctx *vk)
return;
if (vk->dev) {
- vk_cmdpool_destroy(vk, vk->pool);
+ mpvk_flush_commands(vk);
+ mpvk_poll_commands(vk, UINT64_MAX);
+ assert(vk->num_cmds_queued == 0);
+ assert(vk->num_cmds_pending == 0);
+ talloc_free(vk->cmds_queued);
+ talloc_free(vk->cmds_pending);
+ for (int i = 0; i < vk->num_pools; i++)
+ vk_cmdpool_destroy(vk, vk->pools[i]);
+ talloc_free(vk->pools);
for (int i = 0; i < vk->num_signals; i++)
vk_signal_destroy(vk, &vk->signals[i]);
talloc_free(vk->signals);
@@ -377,6 +385,53 @@ error:
return false;
}
+// Find the most specialized queue supporting a combination of flags. In cases
+// where there are multiple queue families at the same specialization level,
+// this finds the one with the most queues. Returns -1 if no queue was found.
+static int find_qf(VkQueueFamilyProperties *qfs, int qfnum, VkQueueFlags flags)
+{
+ int idx = -1;
+ for (int i = 0; i < qfnum; i++) {
+ if (!(qfs[i].queueFlags & flags))
+ continue;
+
+ // QF is more specialized
+ if (idx < 0 || qfs[i].queueFlags < qfs[idx].queueFlags)
+ idx = i;
+
+ // QF has more queues (at the same specialization level)
+ if (qfs[i].queueFlags == qfs[idx].queueFlags &&
+ qfs[i].queueCount > qfs[idx].queueCount)
+ idx = i;
+ }
+
+ return idx;
+}
+
+static void add_qinfo(void *tactx, VkDeviceQueueCreateInfo **qinfos,
+ int *num_qinfos, VkQueueFamilyProperties *qfs, int idx,
+ int qcount)
+{
+ if (idx < 0)
+ return;
+
+ // Check to see if we've already added this queue family
+ for (int i = 0; i < *num_qinfos; i++) {
+ if ((*qinfos)[i].queueFamilyIndex == idx)
+ return;
+ }
+
+ float *priorities = talloc_zero_array(tactx, float, qcount);
+ VkDeviceQueueCreateInfo qinfo = {
+ .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
+ .queueFamilyIndex = idx,
+ .queueCount = MPMIN(qcount, qfs[idx].queueCount),
+ .pQueuePriorities = priorities,
+ };
+
+ MP_TARRAY_APPEND(tactx, *qinfos, *num_qinfos, qinfo);
+}
+
bool mpvk_device_init(struct mpvk_ctx *vk, struct mpvk_device_opts opts)
{
assert(vk->physd);
@@ -395,45 +450,34 @@ bool mpvk_device_init(struct mpvk_ctx *vk, struct mpvk_device_opts opts)
(unsigned)qfs[i].queueFlags, (int)qfs[i].queueCount);
}
- // For most of our rendering operations, we want to use one "primary" pool,
- // so just pick the queue family with the most features.
- int idx = -1;
- for (int i = 0; i < qfnum; i++) {
- if (!(qfs[i].queueFlags & VK_QUEUE_GRAPHICS_BIT))
- continue;
-
- // QF supports more features
- if (idx < 0 || qfs[i].queueFlags > qfs[idx].queueFlags)
- idx = i;
-
- // QF supports more queues (at the same specialization level)
- if (qfs[i].queueFlags == qfs[idx].queueFlags &&
- qfs[i].queueCount > qfs[idx].queueCount)
- {
- idx = i;
- }
- }
+ int idx_gfx = find_qf(qfs, qfnum, VK_QUEUE_GRAPHICS_BIT),
+ idx_comp = find_qf(qfs, qfnum, VK_QUEUE_COMPUTE_BIT),
+ idx_tf = find_qf(qfs, qfnum, VK_QUEUE_TRANSFER_BIT);
// Vulkan requires at least one GRAPHICS queue, so if this fails something
// is horribly wrong.
- assert(idx >= 0);
+ assert(idx_gfx >= 0);
+ MP_VERBOSE(vk, "Using graphics queue (QF %d)\n", idx_gfx);
// Ensure we can actually present to the surface using this queue
VkBool32 sup;
- VK(vkGetPhysicalDeviceSurfaceSupportKHR(vk->physd, idx, vk->surf, &sup));
+ VK(vkGetPhysicalDeviceSurfaceSupportKHR(vk->physd, idx_gfx, vk->surf, &sup));
if (!sup) {
MP_ERR(vk, "Queue family does not support surface presentation!\n");
goto error;
}
+ if (idx_tf >= 0 && idx_tf != idx_gfx)
+ MP_VERBOSE(vk, "Using async transfer (QF %d)\n", idx_tf);
+ if (idx_comp >= 0 && idx_comp != idx_gfx)
+ MP_VERBOSE(vk, "Using async compute (QF %d)\n", idx_comp);
+
// Now that we know which QFs we want, we can create the logical device
- float *priorities = talloc_zero_array(tmp, float, opts.queue_count);
- VkDeviceQueueCreateInfo qinfo = {
- .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
- .queueFamilyIndex = idx,
- .queueCount = MPMIN(qfs[idx].queueCount, opts.queue_count),
- .pQueuePriorities = priorities,
- };
+ VkDeviceQueueCreateInfo *qinfos = NULL;
+ int num_qinfos = 0;
+ add_qinfo(tmp, &qinfos, &num_qinfos, qfs, idx_gfx, opts.queue_count);
+ add_qinfo(tmp, &qinfos, &num_qinfos, qfs, idx_comp, opts.queue_count);
+ add_qinfo(tmp, &qinfos, &num_qinfos, qfs, idx_tf, opts.queue_count);
const char **exts = NULL;
int num_exts = 0;
@@ -443,8 +487,8 @@ bool mpvk_device_init(struct mpvk_ctx *vk, struct mpvk_device_opts opts)
VkDeviceCreateInfo dinfo = {
.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
- .queueCreateInfoCount = 1,
- .pQueueCreateInfos = &qinfo,
+ .pQueueCreateInfos = qinfos,
+ .queueCreateInfoCount = num_qinfos,
.ppEnabledExtensionNames = exts,
.enabledExtensionCount = num_exts,
};
@@ -455,12 +499,20 @@ bool mpvk_device_init(struct mpvk_ctx *vk, struct mpvk_device_opts opts)
VK(vkCreateDevice(vk->physd, &dinfo, MPVK_ALLOCATOR, &vk->dev));
- vk_malloc_init(vk);
+ // Create the command pools and memory allocator
+ for (int i = 0; i < num_qinfos; i++) {
+ int qf = qinfos[i].queueFamilyIndex;
+ struct vk_cmdpool *pool = vk_cmdpool_create(vk, qinfos[i], qfs[qf]);
+ if (!pool)
+ goto error;
+ MP_TARRAY_APPEND(NULL, vk->pools, vk->num_pools, pool);
+ }
- // Create the command pool(s)
- vk->pool = vk_cmdpool_create(vk, qinfo, qfs[idx]);
- if (!vk->pool)
- goto error;
+ vk->pool_graphics = vk->pools[idx_gfx];
+ vk->pool_compute = idx_comp >= 0 ? vk->pools[idx_comp] : NULL;
+ vk->pool_transfer = idx_tf >= 0 ? vk->pools[idx_tf] : NULL;
+
+ vk_malloc_init(vk);
talloc_free(tmp);
return true;
@@ -563,10 +615,8 @@ static void vk_cmdpool_destroy(struct mpvk_ctx *vk, struct vk_cmdpool *pool)
if (!pool)
return;
- for (int i = 0; i < pool->num_cmds_available; i++)
- vk_cmd_destroy(vk, pool->cmds_available[i]);
- for (int i = 0; i < pool->num_cmds_pending; i++)
- vk_cmd_destroy(vk, pool->cmds_pending[i]);
+ for (int i = 0; i < pool->num_cmds; i++)
+ vk_cmd_destroy(vk, pool->cmds[i]);
vkDestroyCommandPool(vk->dev, pool->pool, MPVK_ALLOCATOR);
talloc_free(pool);
@@ -603,26 +653,67 @@ error:
return NULL;
}
-void mpvk_pool_wait_cmds(struct mpvk_ctx *vk, struct vk_cmdpool *pool,
- uint64_t timeout)
+void mpvk_poll_commands(struct mpvk_ctx *vk, uint64_t timeout)
{
- if (!pool)
- return;
-
- while (pool->num_cmds_pending > 0) {
- struct vk_cmd *cmd = pool->cmds_pending[0];
+ while (vk->num_cmds_pending > 0) {
+ struct vk_cmd *cmd = vk->cmds_pending[0];
+ struct vk_cmdpool *pool = cmd->pool;
VkResult res = vk_cmd_poll(vk, cmd, timeout);
if (res == VK_TIMEOUT)
break;
vk_cmd_reset(vk, cmd);
- MP_TARRAY_REMOVE_AT(pool->cmds_pending, pool->num_cmds_pending, 0);
- MP_TARRAY_APPEND(pool, pool->cmds_available, pool->num_cmds_available, cmd);
+ MP_TARRAY_REMOVE_AT(vk->cmds_pending, vk->num_cmds_pending, 0);
+ MP_TARRAY_APPEND(pool, pool->cmds, pool->num_cmds, cmd);
}
}
-void mpvk_dev_wait_cmds(struct mpvk_ctx *vk, uint64_t timeout)
+bool mpvk_flush_commands(struct mpvk_ctx *vk)
{
- mpvk_pool_wait_cmds(vk, vk->pool, timeout);
+ bool ret = true;
+
+ for (int i = 0; i < vk->num_cmds_queued; i++) {
+ struct vk_cmd *cmd = vk->cmds_queued[i];
+ struct vk_cmdpool *pool = cmd->pool;
+
+ VkSubmitInfo sinfo = {
+ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+ .commandBufferCount = 1,
+ .pCommandBuffers = &cmd->buf,
+ .waitSemaphoreCount = cmd->num_deps,
+ .pWaitSemaphores = cmd->deps,
+ .pWaitDstStageMask = cmd->depstages,
+ .signalSemaphoreCount = cmd->num_sigs,
+ .pSignalSemaphores = cmd->sigs,
+ };
+
+ VK(vkQueueSubmit(cmd->queue, 1, &sinfo, cmd->fence));
+ MP_TARRAY_APPEND(NULL, vk->cmds_pending, vk->num_cmds_pending, cmd);
+
+ if (mp_msg_test(vk->log, MSGL_TRACE)) {
+ MP_TRACE(vk, "Submitted command on queue %p (QF %d):\n",
+ (void *)cmd->queue, pool->qf);
+ for (int n = 0; n < cmd->num_deps; n++)
+ MP_TRACE(vk, " waits on semaphore %p\n", (void *)cmd->deps[n]);
+ for (int n = 0; n < cmd->num_sigs; n++)
+ MP_TRACE(vk, " signals semaphore %p\n", (void *)cmd->sigs[n]);
+ }
+ continue;
+
+error:
+ vk_cmd_reset(vk, cmd);
+ MP_TARRAY_APPEND(pool, pool->cmds, pool->num_cmds, cmd);
+ ret = false;
+ }
+
+ vk->num_cmds_queued = 0;
+
+ // Rotate the queues to ensure good parallelism across frames
+ for (int i = 0; i < vk->num_pools; i++) {
+ struct vk_cmdpool *pool = vk->pools[i];
+ pool->idx_queues = (pool->idx_queues + 1) % pool->num_queues;
+ }
+
+ return ret;
}
void vk_dev_callback(struct mpvk_ctx *vk, vk_cb callback, void *p, void *arg)
@@ -639,10 +730,10 @@ struct vk_cmd *vk_cmd_begin(struct mpvk_ctx *vk, struct vk_cmdpool *pool)
{
// garbage collect the cmdpool first, to increase the chances of getting
// an already-available command buffer
- mpvk_pool_wait_cmds(vk, pool, 0);
+ mpvk_poll_commands(vk, 0);
struct vk_cmd *cmd = NULL;
- if (MP_TARRAY_POP(pool->cmds_available, pool->num_cmds_available, &cmd))
+ if (MP_TARRAY_POP(pool->cmds, pool->num_cmds, &cmd))
goto done;
// No free command buffers => allocate another one
@@ -675,58 +766,13 @@ void vk_cmd_queue(struct mpvk_ctx *vk, struct vk_cmd *cmd)
VK(vkEndCommandBuffer(cmd->buf));
VK(vkResetFences(vk->dev, 1, &cmd->fence));
- MP_TARRAY_APPEND(pool, pool->cmds_queued, pool->num_cmds_queued, cmd);
+ MP_TARRAY_APPEND(NULL, vk->cmds_queued, vk->num_cmds_queued, cmd);
vk->last_cmd = cmd;
return;
error:
vk_cmd_reset(vk, cmd);
- MP_TARRAY_APPEND(pool, pool->cmds_available, pool->num_cmds_available, cmd);
-}
-
-bool vk_flush_commands(struct mpvk_ctx *vk)
-{
- bool ret = true;
-
- struct vk_cmdpool *pool = vk->pool;
- for (int i = 0; i < pool->num_cmds_queued; i++) {
- struct vk_cmd *cmd = pool->cmds_queued[i];
-
- VkSubmitInfo sinfo = {
- .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
- .commandBufferCount = 1,
- .pCommandBuffers = &cmd->buf,
- .waitSemaphoreCount = cmd->num_deps,
- .pWaitSemaphores = cmd->deps,
- .pWaitDstStageMask = cmd->depstages,
- .signalSemaphoreCount = cmd->num_sigs,
- .pSignalSemaphores = cmd->sigs,
- };
-
- VK(vkQueueSubmit(cmd->queue, 1, &sinfo, cmd->fence));
- MP_TARRAY_APPEND(pool, pool->cmds_pending, pool->num_cmds_pending, cmd);
-
- if (mp_msg_test(vk->log, MSGL_TRACE)) {
- MP_TRACE(vk, "Submitted command on queue %p (QF %d):\n",
- (void *)cmd->queue, pool->qf);
- for (int n = 0; n < cmd->num_deps; n++)
- MP_TRACE(vk, " waits on semaphore %p\n", (void *)cmd->deps[n]);
- for (int n = 0; n < cmd->num_sigs; n++)
- MP_TRACE(vk, " signals semaphore %p\n", (void *)cmd->sigs[n]);
- }
- continue;
-
-error:
- vk_cmd_reset(vk, cmd);
- MP_TARRAY_APPEND(pool, pool->cmds_available, pool->num_cmds_available, cmd);
- ret = false;
- }
-
- pool->num_cmds_queued = 0;
-
- // Rotate the queues to ensure good parallelism across frames
- pool->idx_queues = (pool->idx_queues + 1) % pool->num_queues;
- return ret;
+ MP_TARRAY_APPEND(pool, pool->cmds, pool->num_cmds, cmd);
}
void vk_signal_destroy(struct mpvk_ctx *vk, struct vk_signal **sig)
@@ -762,10 +808,16 @@ struct vk_signal *vk_cmd_signal(struct mpvk_ctx *vk, struct vk_cmd *cmd,
VK(vkCreateEvent(vk->dev, &einfo, MPVK_ALLOCATOR, &sig->event));
done:
- // Signal both the semaphore and the event. (We will only end up using one)
+ // Signal both the semaphore and the event if possible. (We will only
+ // end up using one or the other)
vk_cmd_sig(cmd, sig->semaphore);
- vkCmdSetEvent(cmd->buf, sig->event, stage);
- sig->event_source = cmd->queue;
+
+ VkQueueFlags req = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT;
+ if (cmd->pool->props.queueFlags & req) {
+ vkCmdSetEvent(cmd->buf, sig->event, stage);
+ sig->event_source = cmd->queue;
+ }
+
return sig;
error:
@@ -787,14 +839,14 @@ static bool unsignal_cmd(struct vk_cmd *cmd, VkSemaphore sem)
// Attempts to remove a queued signal operation. Returns true if successful,
// i.e. the signal could be removed before it ever got fired.
-static bool unsignal(struct vk_cmd *cmd, VkSemaphore sem)
+static bool unsignal(struct mpvk_ctx *vk, struct vk_cmd *cmd, VkSemaphore sem)
{
if (unsignal_cmd(cmd, sem))
return true;
// Attempt to remove it from any queued commands
- for (int i = 0; i < cmd->pool->num_cmds_queued; i++) {
- if (unsignal_cmd(cmd->pool->cmds_queued[i], sem))
+ for (int i = 0; i < vk->num_cmds_queued; i++) {
+ if (unsignal_cmd(vk->cmds_queued[i], sem))
return true;
}
@@ -806,7 +858,9 @@ static void release_signal(struct mpvk_ctx *vk, struct vk_signal *sig)
// The semaphore never needs to be recreated, because it's either
// unsignaled while still queued, or unsignaled as a result of a device
// wait. But the event *may* need to be reset, so just always reset it.
- vkResetEvent(vk->dev, sig->event);
+ if (sig->event_source)
+ vkResetEvent(vk->dev, sig->event);
+ sig->event_source = NULL;
MP_TARRAY_APPEND(NULL, vk->signals, vk->num_signals, sig);
}
@@ -819,7 +873,7 @@ void vk_cmd_wait(struct mpvk_ctx *vk, struct vk_cmd *cmd,
return;
if (out_event && sig->event && sig->event_source == cmd->queue &&
- unsignal(cmd, sig->semaphore))
+ unsignal(vk, cmd, sig->semaphore))
{
// If we can remove the semaphore signal operation from the history and
// pretend it never happened, then we get to use the VkEvent. This also
diff --git a/video/out/vulkan/utils.h b/video/out/vulkan/utils.h
index 538897afae..de3a757be3 100644
--- a/video/out/vulkan/utils.h
+++ b/video/out/vulkan/utils.h
@@ -66,9 +66,13 @@ bool mpvk_device_init(struct mpvk_ctx *vk, struct mpvk_device_opts opts);
// UINT64_MAX effectively means waiting until the pool/device is idle. The
// timeout may also be passed as 0, in which case this function will not block,
// but only poll for completed commands.
-void mpvk_pool_wait_cmds(struct mpvk_ctx *vk, struct vk_cmdpool *pool,
- uint64_t timeout);
-void mpvk_dev_wait_cmds(struct mpvk_ctx *vk, uint64_t timeout);
+void mpvk_poll_commands(struct mpvk_ctx *vk, uint64_t timeout);
+
+// Flush all currently queued commands. Call this once per frame, after
+// submitting all of the command buffers for that frame. Calling this more
+// often than that is possible but bad for performance.
+// Returns whether successful. Failed commands will be implicitly dropped.
+bool mpvk_flush_commands(struct mpvk_ctx *vk);
// Since lots of vulkan operations need to be done lazily once the affected
// resources are no longer in use, provide an abstraction for tracking these.
@@ -158,13 +162,10 @@ struct vk_cmdpool {
VkQueue *queues;
int num_queues;
int idx_queues;
- // Command buffers associated with this queue
- struct vk_cmd **cmds_available; // available for re-recording
- struct vk_cmd **cmds_queued; // recorded but not yet submitted
- struct vk_cmd **cmds_pending; // submitted but not completed
- int num_cmds_available;
- int num_cmds_queued;
- int num_cmds_pending;
+ // Command buffers associated with this queue. These are available for
+ // re-recording
+ struct vk_cmd **cmds;
+ int num_cmds;
};
// Fetch a command buffer from a command pool and begin recording to it.
@@ -175,12 +176,6 @@ struct vk_cmd *vk_cmd_begin(struct mpvk_ctx *vk, struct vk_cmdpool *pool);
// takes over ownership of *cmd, i.e. the caller should not touch it again.
void vk_cmd_queue(struct mpvk_ctx *vk, struct vk_cmd *cmd);
-// Flush all currently queued commands. Call this once per frame, after
-// submitting all of the command buffers for that frame. Calling this more
-// often than that is possible but bad for performance.
-// Returns whether successful. Failed commands will be implicitly dropped.
-bool vk_flush_commands(struct mpvk_ctx *vk);
-
// Predefined structs for a simple non-layered, non-mipped image
extern const VkImageSubresourceRange vk_range;
extern const VkImageSubresourceLayers vk_layers;
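
For orientation, a minimal sketch of the per-frame driving pattern that the
header comments above describe (poll to reclaim completed commands, flush the
queued ones once per frame). render_frame_commands() is a hypothetical
stand-in for the caller's own recording code, not part of this patch:

    void render_frame_commands(struct mpvk_ctx *vk); // hypothetical

    // Sketch only, assuming the caller records command buffers and hands them
    // over with vk_cmd_queue() inside render_frame_commands().
    static bool run_one_frame(struct mpvk_ctx *vk)
    {
        // Non-blocking poll (timeout 0): recycle any command buffers whose
        // fences have signalled, so their pools can hand them out again.
        mpvk_poll_commands(vk, 0);

        render_frame_commands(vk);

        // Submit everything queued this frame, in FIFO order. Per the header
        // comment, this should run once per frame for best performance.
        return mpvk_flush_commands(vk);
    }

    // Teardown mirrors ra_vk_ctx_uninit() above: flush whatever is still
    // queued, then wait for completion with an effectively infinite timeout:
    //     mpvk_flush_commands(vk);
    //     mpvk_poll_commands(vk, UINT64_MAX);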
</