Diffstat (limited to 'video/out/vulkan/ra_vk.c')
-rw-r--r-- | video/out/vulkan/ra_vk.c | 1590
1 file changed, 1590 insertions, 0 deletions
diff --git a/video/out/vulkan/ra_vk.c b/video/out/vulkan/ra_vk.c
new file mode 100644
index 0000000000..ce0cbc66e9
--- /dev/null
+++ b/video/out/vulkan/ra_vk.c
@@ -0,0 +1,1590 @@
+#include "ra_vk.h"
+#include "malloc.h"
+#include "video/out/opengl/utils.h"
+
+static struct ra_fns ra_fns_vk;
+
+// For ra.priv
+struct ra_vk {
+    struct mpvk_ctx *vk;
+    struct ra_tex *clear_tex; // stupid hack for clear()
+    struct vk_cmd *cmd; // currently recording cmd
+};
+
+struct mpvk_ctx *ra_vk_get(struct ra *ra)
+{
+    if (ra->fns != &ra_fns_vk)
+        return NULL;
+
+    struct ra_vk *p = ra->priv;
+    return p->vk;
+}
+
+// Returns a command buffer, or NULL on error
+static struct vk_cmd *vk_require_cmd(struct ra *ra)
+{
+    struct ra_vk *p = ra->priv;
+    struct mpvk_ctx *vk = ra_vk_get(ra);
+
+    if (!p->cmd)
+        p->cmd = vk_cmd_begin(vk, vk->pool);
+
+    return p->cmd;
+}
+
+// Note: This technically follows the flush() API, but we don't need
+// to expose that (and in fact, it's a bad idea) since we control flushing
+// behavior with ra_vk_present_frame already.
+static bool vk_flush(struct ra *ra, VkSemaphore *done)
+{
+    struct ra_vk *p = ra->priv;
+    struct mpvk_ctx *vk = ra_vk_get(ra);
+
+    if (p->cmd) {
+        if (!vk_cmd_submit(vk, p->cmd, done))
+            return false;
+        p->cmd = NULL;
+    }
+
+    return true;
+}
+
+// The callback's *priv will always be set to `ra`
+static void vk_callback(struct ra *ra, vk_cb callback, void *arg)
+{
+    struct ra_vk *p = ra->priv;
+    struct mpvk_ctx *vk = ra_vk_get(ra);
+
+    if (p->cmd) {
+        vk_cmd_callback(p->cmd, callback, ra, arg);
+    } else {
+        vk_dev_callback(vk, callback, ra, arg);
+    }
+}
+
+#define MAKE_LAZY_DESTRUCTOR(fun, argtype)                  \
+    static void fun##_lazy(struct ra *ra, argtype *arg) {   \
+        vk_callback(ra, (vk_cb) fun, arg);                  \
+    }
+
+static void vk_destroy_ra(struct ra *ra)
+{
+    struct ra_vk *p = ra->priv;
+    struct mpvk_ctx *vk = ra_vk_get(ra);
+
+    vk_flush(ra, NULL);
+    mpvk_dev_wait_idle(vk);
+    ra_tex_free(ra, &p->clear_tex);
+
+    talloc_free(ra);
+}
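The MAKE_LAZY_DESTRUCTOR macro is the backbone of the deferred-destruction scheme: rather than destroying a resource while the GPU may still be using it, the generated wrapper queues the real destructor as a completion callback on the recording command buffer (or on the device, if nothing is recording). As a rough illustration, this is what the generated wrapper for the texture destructor defined further down expands to:

    // Expansion of MAKE_LAZY_DESTRUCTOR(vk_tex_destroy, struct ra_tex):
    static void vk_tex_destroy_lazy(struct ra *ra, struct ra_tex *arg)
    {
        // defers vk_tex_destroy(ra, arg) until the currently recording
        // command buffer (or the idle device) is done with the resource
        vk_callback(ra, (vk_cb) vk_tex_destroy, arg);
    }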
+
+static bool vk_setup_formats(struct ra *ra)
+{
+    struct mpvk_ctx *vk = ra_vk_get(ra);
+
+    for (const struct vk_format *vk_fmt = vk_formats; vk_fmt->name; vk_fmt++) {
+        VkFormatProperties prop;
+        vkGetPhysicalDeviceFormatProperties(vk->physd, vk_fmt->iformat, &prop);
+
+        // As a bare minimum, we need to sample from an allocated image
+        VkFormatFeatureFlags flags = prop.optimalTilingFeatures;
+        if (!(flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT))
+            continue;
+
+        VkFormatFeatureFlags linear_bits, render_bits;
+        linear_bits = VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT;
+        render_bits = VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
+                      VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT;
+
+        struct ra_format *fmt = talloc_zero(ra, struct ra_format);
+        *fmt = (struct ra_format) {
+            .name           = vk_fmt->name,
+            .priv           = (void *)vk_fmt,
+            .ctype          = vk_fmt->ctype,
+            .ordered        = !vk_fmt->fucked_order,
+            .num_components = vk_fmt->components,
+            .pixel_size     = vk_fmt->bytes,
+            .linear_filter  = !!(flags & linear_bits),
+            .renderable     = !!(flags & render_bits),
+        };
+
+        for (int i = 0; i < 4; i++)
+            fmt->component_size[i] = fmt->component_depth[i] = vk_fmt->bits[i];
+
+        MP_TARRAY_APPEND(ra, ra->formats, ra->num_formats, fmt);
+    }
+
+    // Populate some other capabilities related to formats while we're at it
+    VkImageType imgType[3] = {
+        VK_IMAGE_TYPE_1D,
+        VK_IMAGE_TYPE_2D,
+        VK_IMAGE_TYPE_3D
+    };
+
+    // R8_UNORM is supported on literally every single vulkan implementation
+    const VkFormat testfmt = VK_FORMAT_R8_UNORM;
+
+    for (int d = 0; d < 3; d++) {
+        VkImageFormatProperties iprop;
+        VkResult res = vkGetPhysicalDeviceImageFormatProperties(vk->physd,
+                testfmt, imgType[d], VK_IMAGE_TILING_OPTIMAL,
+                VK_IMAGE_USAGE_SAMPLED_BIT, 0, &iprop);
+
+        switch (imgType[d]) {
+        case VK_IMAGE_TYPE_1D:
+            if (res == VK_SUCCESS)
+                ra->caps |= RA_CAP_TEX_1D;
+            break;
+        case VK_IMAGE_TYPE_2D:
+            // 2D formats must be supported by RA, so ensure this is the case
+            VK_ASSERT(res, "Querying 2D format limits");
+            ra->max_texture_wh = MPMIN(iprop.maxExtent.width, iprop.maxExtent.height);
+            break;
+        case VK_IMAGE_TYPE_3D:
+            if (res == VK_SUCCESS)
+                ra->caps |= RA_CAP_TEX_3D;
+            break;
+        }
+    }
+
+    // RA_CAP_BLIT implies both blitting between images as well as blitting
+    // directly to the swapchain image, so check for all three operations
+    bool blittable = true;
+    VkFormatProperties prop;
+    vkGetPhysicalDeviceFormatProperties(vk->physd, testfmt, &prop);
+    if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_SRC_BIT))
+        blittable = false;
+    if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_DST_BIT))
+        blittable = false;
+
+    vkGetPhysicalDeviceFormatProperties(vk->physd, vk->surf_format.format, &prop);
+    if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_DST_BIT))
+        blittable = false;
+
+    if (blittable)
+        ra->caps |= RA_CAP_BLIT;
+
+    return true;
+
+error:
+    return false;
+}
+
+static struct ra_fns ra_fns_vk;
+
+struct ra *ra_create_vk(struct mpvk_ctx *vk, struct mp_log *log)
+{
+    assert(vk->dev);
+    assert(vk->alloc);
+
+    struct ra *ra = talloc_zero(NULL, struct ra);
+    ra->log = log;
+    ra->fns = &ra_fns_vk;
+
+    struct ra_vk *p = ra->priv = talloc_zero(ra, struct ra_vk);
+    p->vk = vk;
+
+    // There's no way to query the supported GLSL version from VK_NV_glsl_shader
+    // (thanks nvidia), so just pick the GL version that modern nvidia devices
+    // support..
+    ra->glsl_version = 450;
+    ra->glsl_vulkan = true;
+    ra->max_shmem = vk->limits.maxComputeSharedMemorySize;
+    ra->caps = RA_CAP_NESTED_ARRAY;
+
+    if (vk->pool->props.queueFlags & VK_QUEUE_COMPUTE_BIT)
+        ra->caps |= RA_CAP_COMPUTE;
+
+    if (!vk_setup_formats(ra))
+        goto error;
+
+    // UBO support is required
+    ra->caps |= RA_CAP_BUF_RO;
+
+    // Try creating a shader storage buffer
+    struct ra_buf_params ssbo_params = {
+        .type = RA_BUF_TYPE_SHADER_STORAGE,
+        .size = 16,
+    };
+
+    struct ra_buf *ssbo = ra_buf_create(ra, &ssbo_params);
+    if (ssbo) {
+        ra->caps |= RA_CAP_BUF_RW;
+        ra_buf_free(ra, &ssbo);
+    }
+
+    // To support clear() by region, we need to allocate a dummy 1x1 image that
+    // will be used as the source of blit operations
+    struct ra_tex_params clear_params = {
+        .dimensions = 1, // no point in using a 2D image if height = 1
+        .w = 1,
+        .h = 1,
+        .d = 1,
+        .format = ra_find_float16_format(ra, 4),
+        .blit_src = 1,
+        .host_mutable = 1,
+    };
+
+    p->clear_tex = ra_tex_create(ra, &clear_params);
+    if (!p->clear_tex) {
+        MP_ERR(ra, "Failed creating 1x1 dummy texture for clear()!\n");
+        goto error;
+    }
+
+    return ra;
+
+error:
+    vk_destroy_ra(ra);
+    return NULL;
+}
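For context, a minimal sketch of how a windowing context might drive this entry point; the wrapper function and its name are invented for illustration, but ra_create_vk and the capability bits are the real API set up above:

    // Hypothetical caller (name and structure are illustrative only):
    static struct ra *setup_ra(struct mpvk_ctx *vk, struct mp_log *log)
    {
        struct ra *ra = ra_create_vk(vk, log);
        if (!ra)
            return NULL;

        // capabilities negotiated above can now be queried as bitflags
        if (!(ra->caps & RA_CAP_COMPUTE))
            MP_VERBOSE(ra, "No compute support on this queue family.\n");

        return ra;
    }

Teardown goes through the fns table's destroy entry, which this file presumably wires to vk_destroy_ra in the (not shown) ra_fns_vk definition.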
+
+// Boilerplate wrapper around vkCreateRenderPass to ensure passes remain
+// compatible
+static VkResult vk_create_render_pass(VkDevice dev, const struct ra_format *fmt,
+                                      bool load_fbo, VkRenderPass *out)
+{
+    struct vk_format *vk_fmt = fmt->priv;
+    assert(fmt->renderable);
+
+    VkRenderPassCreateInfo rinfo = {
+        .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
+        .attachmentCount = 1,
+        .pAttachments = &(VkAttachmentDescription) {
+            .format = vk_fmt->iformat,
+            .samples = VK_SAMPLE_COUNT_1_BIT,
+            .loadOp = load_fbo ? VK_ATTACHMENT_LOAD_OP_LOAD
+                               : VK_ATTACHMENT_LOAD_OP_DONT_CARE,
+            .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
+            .initialLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+            .finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+        },
+        .subpassCount = 1,
+        .pSubpasses = &(VkSubpassDescription) {
+            .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
+            .colorAttachmentCount = 1,
+            .pColorAttachments = &(VkAttachmentReference) {
+                .attachment = 0,
+                .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+            },
+        },
+    };
+
+    return vkCreateRenderPass(dev, &rinfo, MPVK_ALLOCATOR, out);
+}
+
+// For ra_tex.priv
+struct ra_tex_vk {
+    bool external_img;
+    VkImageType type;
+    VkImage img;
+    struct vk_memslice mem;
+    // for sampling
+    VkImageView view;
+    VkSampler sampler;
+    // for rendering
+    VkFramebuffer framebuffer;
+    VkRenderPass dummyPass;
+    // for uploading
+    struct ra_buf_pool pbo;
+    // "current" metadata, can change during the course of execution
+    VkImageLayout current_layout;
+    VkPipelineStageFlagBits current_stage;
+    VkAccessFlagBits current_access;
+};
+
+// Small helper to ease image barrier creation. if `discard` is set, the contents
+// of the image will be undefined after the barrier
+static void tex_barrier(struct vk_cmd *cmd, struct ra_tex_vk *tex_vk,
+                        VkPipelineStageFlagBits newStage,
+                        VkAccessFlagBits newAccess, VkImageLayout newLayout,
+                        bool discard)
+{
+    VkImageMemoryBarrier imgBarrier = {
+        .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+        .oldLayout = tex_vk->current_layout,
+        .newLayout = newLayout,
+        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .srcAccessMask = tex_vk->current_access,
+        .dstAccessMask = newAccess,
+        .image = tex_vk->img,
+        .subresourceRange = vk_range,
+    };
+
+    if (discard) {
+        imgBarrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+        imgBarrier.srcAccessMask = 0;
+    }
+
+    if (imgBarrier.oldLayout != imgBarrier.newLayout ||
+        imgBarrier.srcAccessMask != imgBarrier.dstAccessMask)
+    {
+        vkCmdPipelineBarrier(cmd->buf, tex_vk->current_stage, newStage, 0,
+                             0, NULL, 0, NULL, 1, &imgBarrier);
+    }
+
+    tex_vk->current_stage = newStage;
+    tex_vk->current_layout = newLayout;
+    tex_vk->current_access = newAccess;
+}
+
+static void vk_tex_destroy(struct ra *ra, struct ra_tex *tex)
+{
+    if (!tex)
+        return;
+
+    struct mpvk_ctx *vk = ra_vk_get(ra);
+    struct ra_tex_vk *tex_vk = tex->priv;
+
+    ra_buf_pool_uninit(ra, &tex_vk->pbo);
+    vkDestroyFramebuffer(vk->dev, tex_vk->framebuffer, MPVK_ALLOCATOR);
+    vkDestroyRenderPass(vk->dev, tex_vk->dummyPass, MPVK_ALLOCATOR);
+    vkDestroySampler(vk->dev, tex_vk->sampler, MPVK_ALLOCATOR);
+    vkDestroyImageView(vk->dev, tex_vk->view, MPVK_ALLOCATOR);
+    if (!tex_vk->external_img) {
+        vkDestroyImage(vk->dev, tex_vk->img, MPVK_ALLOCATOR);
+        vk_free_memslice(vk, tex_vk->mem);
+    }
+
+    talloc_free(tex);
+}
+
+MAKE_LAZY_DESTRUCTOR(vk_tex_destroy, struct ra_tex);
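tex_barrier collapses Vulkan's layout/stage/access bookkeeping into a single call by tracking the "current" values in ra_tex_vk and emitting a pipeline barrier only when something actually changes. For instance, the upload path further down prepares an image as a transfer destination like this (taken verbatim from vk_tex_upload below; `discard` corresponds to params->invalidate):

    tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_TRANSFER_BIT,
                VK_ACCESS_TRANSFER_WRITE_BIT,
                VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
                params->invalidate); // discard: old contents become undefined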
+
+// Initializes non-VkImage values like the image view, samplers, etc.
+static bool vk_init_image(struct ra *ra, struct ra_tex *tex)
+{
+    struct mpvk_ctx *vk = ra_vk_get(ra);
+
+    struct ra_tex_params *params = &tex->params;
+    struct ra_tex_vk *tex_vk = tex->priv;
+    assert(tex_vk->img);
+
+    tex_vk->current_layout = VK_IMAGE_LAYOUT_UNDEFINED;
+    tex_vk->current_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+    tex_vk->current_access = 0;
+
+    if (params->render_src || params->render_dst) {
+        static const VkImageViewType viewType[] = {
+            [VK_IMAGE_TYPE_1D] = VK_IMAGE_VIEW_TYPE_1D,
+            [VK_IMAGE_TYPE_2D] = VK_IMAGE_VIEW_TYPE_2D,
+            [VK_IMAGE_TYPE_3D] = VK_IMAGE_VIEW_TYPE_3D,
+        };
+
+        const struct vk_format *fmt = params->format->priv;
+        VkImageViewCreateInfo vinfo = {
+            .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+            .image = tex_vk->img,
+            .viewType = viewType[tex_vk->type],
+            .format = fmt->iformat,
+            .subresourceRange = vk_range,
+        };
+
+        VK(vkCreateImageView(vk->dev, &vinfo, MPVK_ALLOCATOR, &tex_vk->view));
+    }
+
+    if (params->render_src) {
+        assert(params->format->linear_filter || !params->src_linear);
+        VkFilter filter = params->src_linear
+            ? VK_FILTER_LINEAR
+            : VK_FILTER_NEAREST;
+        VkSamplerAddressMode wrap = params->src_repeat
+            ? VK_SAMPLER_ADDRESS_MODE_REPEAT
+            : VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+        VkSamplerCreateInfo sinfo = {
+            .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
+            .magFilter = filter,
+            .minFilter = filter,
+            .addressModeU = wrap,
+            .addressModeV = wrap,
+            .addressModeW = wrap,
+            .maxAnisotropy = 1.0,
+        };
+
+        VK(vkCreateSampler(vk->dev, &sinfo, MPVK_ALLOCATOR, &tex_vk->sampler));
+    }
+
+    if (params->render_dst) {
+        // Framebuffers need to be created against a specific render pass
+        // layout, so we need to temporarily create a skeleton/dummy render
+        // pass for vulkan to figure out the compatibility
+        VK(vk_create_render_pass(vk->dev, params->format, false, &tex_vk->dummyPass));
+
+        VkFramebufferCreateInfo finfo = {
+            .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
+            .renderPass = tex_vk->dummyPass,
+            .attachmentCount = 1,
+            .pAttachments = &tex_vk->view,
+            .width = tex->params.w,
+            .height = tex->params.h,
+            .layers = 1,
+        };
+
+        VK(vkCreateFramebuffer(vk->dev, &finfo, MPVK_ALLOCATOR,
+                               &tex_vk->framebuffer));
+
+        // NOTE: Normally we would free the dummyPass again here, but a bug
+        // in the nvidia vulkan driver causes a segfault if you do.
+    }
+
+    return true;
+
+error:
+    return false;
+}
+
+static struct ra_tex *vk_tex_create(struct ra *ra,
+                                    const struct ra_tex_params *params)
+{
+    struct mpvk_ctx *vk = ra_vk_get(ra);
+
+    struct ra_tex *tex = talloc_zero(NULL, struct ra_tex);
+    tex->params = *params;
+    tex->params.initial_data = NULL;
+
+    struct ra_tex_vk *tex_vk = tex->priv = talloc_zero(tex, struct ra_tex_vk);
+
+    const struct vk_format *fmt = params->format->priv;
+    switch (params->dimensions) {
+    case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break;
+    case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break;
+    case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break;
+    default: abort();
+    }
+
+    VkImageUsageFlags usage = 0;
+    if (params->render_src)
+        usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
+    if (params->render_dst)
+        usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
+    if (params->storage_dst)
+        usage |= VK_IMAGE_USAGE_STORAGE_BIT;
+    if (params->blit_src)
+        usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
+    if (params->host_mutable || params->blit_dst || params->initial_data)
+        usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
+
+    // Double-check image usage support and fail immediately if invalid
+    VkImageFormatProperties iprop;
+    VkResult res = vkGetPhysicalDeviceImageFormatProperties(vk->physd,
+            fmt->iformat, tex_vk->type, VK_IMAGE_TILING_OPTIMAL, usage, 0,
+            &iprop);
+    if (res == VK_ERROR_FORMAT_NOT_SUPPORTED) {
+        return NULL;
+    } else {
+        VK_ASSERT(res, "Querying image format properties");
+    }
+
+    VkFormatProperties prop;
+    vkGetPhysicalDeviceFormatProperties(vk->physd, fmt->iformat, &prop);
+    VkFormatFeatureFlags flags = prop.optimalTilingFeatures;
+
+    bool has_blit_src   = flags & VK_FORMAT_FEATURE_BLIT_SRC_BIT,
+         has_src_linear = flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT;
+
+    if (params->w > iprop.maxExtent.width ||
+        params->h > iprop.maxExtent.height ||
+        params->d > iprop.maxExtent.depth ||
+        (params->blit_src && !has_blit_src) ||
+        (params->src_linear && !has_src_linear))
+    {
+        return NULL;
+    }
+
+    VkImageCreateInfo iinfo = {
+        .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+        .imageType = tex_vk->type,
+        .format = fmt->iformat,
+        .extent = (VkExtent3D) { params->w, params->h, params->d },
+        .mipLevels = 1,
+        .arrayLayers = 1,
+        .samples = VK_SAMPLE_COUNT_1_BIT,
+        .tiling = VK_IMAGE_TILING_OPTIMAL,
+        .usage = usage,
+        .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
+        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+        .queueFamilyIndexCount = 1,
+        .pQueueFamilyIndices = &vk->pool->qf,
+    };
+
+    VK(vkCreateImage(vk->dev, &iinfo, MPVK_ALLOCATOR, &tex_vk->img));
+
+    VkMemoryPropertyFlagBits memFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+    VkMemoryRequirements reqs;
+    vkGetImageMemoryRequirements(vk->dev, tex_vk->img, &reqs);
+
+    struct vk_memslice *mem = &tex_vk->mem;
+    if (!vk_malloc_generic(vk, reqs, memFlags, mem))
+        goto error;
+
+    VK(vkBindImageMemory(vk->dev, tex_vk->img, mem->vkmem, mem->offset));
+
+    if (!vk_init_image(ra, tex))
+        goto error;
+
+    if (params->initial_data) {
+        struct ra_tex_upload_params ul_params = {
+            .tex = tex,
+            .invalidate = true,
+            .src = params->initial_data,
+            .stride = params->w * fmt->bytes,
+        };
+        if (!ra->fns->tex_upload(ra, &ul_params))
+            goto error;
+    }
+
+    return tex;
+
+error:
+    vk_tex_destroy(ra, tex);
+    return NULL;
+}
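As a usage sketch: creating a small 2D texture with initial contents exercises the TRANSFER_DST usage flag and the initial-data upload path above. The helper below is hypothetical; ra_tex_create and ra_find_unorm_format are the generic ra entry points, which dispatch into vk_tex_create here:

    // Hypothetical helper (name and sizes invented for illustration):
    static struct ra_tex *create_lut16(struct ra *ra, const void *pixels)
    {
        struct ra_tex_params params = {
            .dimensions = 2,
            .w = 16, .h = 16, .d = 1,
            .format = ra_find_unorm_format(ra, 1, 4), // 4x 8-bit unorm
            .render_src = true,
            .initial_data = pixels, // triggers tex_upload at creation
        };
        return ra_tex_create(ra, &params);
    }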
+
+struct ra_tex *ra_vk_wrap_swapchain_img(struct ra *ra, VkImage vkimg,
+                                        VkSwapchainCreateInfoKHR info)
+{
+    struct mpvk_ctx *vk = ra_vk_get(ra);
+    struct ra_tex *tex = NULL;
+
+    const struct ra_format *format = NULL;
+    for (int i = 0; i < ra->num_formats; i++) {
+        const struct vk_format *fmt = ra->formats[i]->priv;
+        if (fmt->iformat == vk->surf_format.format) {
+            format = ra->formats[i];
+            break;
+        }
+    }
+
+    if (!format) {
+        MP_ERR(ra, "Could not find ra_format suitable for wrapped swchain image "
+                   "with surface format 0x%x\n", vk->surf_format.format);
+        goto error;
+    }
+
+    tex = talloc_zero(NULL, struct ra_tex);
+    tex->params = (struct ra_tex_params) {
+        .format = format,
+        .dimensions = 2,
+        .w = info.imageExtent.width,
+        .h = info.imageExtent.height,
+        .d = 1,
+        .blit_src    = !!(info.imageUsage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT),
+        .blit_dst    = !!(info.imageUsage & VK_IMAGE_USAGE_TRANSFER_DST_BIT),
+        .render_src  = !!(info.imageUsage & VK_IMAGE_USAGE_SAMPLED_BIT),
+        .render_dst  = !!(info.imageUsage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT),
+        .storage_dst = !!(info.imageUsage & VK_IMAGE_USAGE_STORAGE_BIT),
+    };
+
+    struct ra_tex_vk *tex_vk = tex->priv = talloc_zero(tex, struct ra_tex_vk);
+    tex_vk->type = VK_IMAGE_TYPE_2D;
+    tex_vk->external_img = true;
+    tex_vk->img = vkimg;
+
+    if (!vk_init_image(ra, tex))
+        goto error;
+
+    return tex;
+
+error:
+    vk_tex_destroy(ra, tex);
+    return NULL;
+}
+
+// For ra_buf.priv
+struct ra_buf_vk {
+    struct vk_bufslice slice;
+    int refcount; // 1 = object allocated but not in use, > 1 = in use
+    bool needsflush;
+    // "current" metadata, can change during course of execution
+    VkPipelineStageFlagBits current_stage;
+    VkAccessFlagBits current_access;
+};
+
+static void vk_buf_deref(struct ra *ra, struct ra_buf *buf)
+{
+    if (!buf)
+        return;
+
+    struct mpvk_ctx *vk = ra_vk_get(ra);
+    struct ra_buf_vk *buf_vk = buf->priv;
+
+    if (--buf_vk->refcount == 0) {
+        vk_free_memslice(vk, buf_vk->slice.mem);
+        talloc_free(buf);
+    }
+}
+
+static void buf_barrier(struct ra *ra, struct vk_cmd *cmd, struct ra_buf *buf,
+                        VkPipelineStageFlagBits newStage,
+                        VkAccessFlagBits newAccess, int offset, size_t size)
+{
+    struct ra_buf_vk *buf_vk = buf->priv;
+
+    VkBufferMemoryBarrier buffBarrier = {
+        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
+        .srcAccessMask = buf_vk->current_access,
+        .dstAccessMask = newAccess,
+        .buffer = buf_vk->slice.buf,
+        .offset = offset,
+        .size = size,
+    };
+
+    if (buf_vk->needsflush || buf->params.host_mapped) {
+        buffBarrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT;
+        buf_vk->current_stage = VK_PIPELINE_STAGE_HOST_BIT;
+        buf_vk->needsflush = false;
+    }
+
+    if (buffBarrier.srcAccessMask != buffBarrier.dstAccessMask) {
+        vkCmdPipelineBarrier(cmd->buf, buf_vk->current_stage, newStage, 0,
+                             0, NULL, 1, &buffBarrier, 0, NULL);
+    }
+
+    buf_vk->current_stage = newStage;
+    buf_vk->current_access = newAccess;
+    buf_vk->refcount++;
+    vk_cmd_callback(cmd, (vk_cb) vk_buf_deref, ra, buf);
+}
+
+#define vk_buf_destroy vk_buf_deref
+MAKE_LAZY_DESTRUCTOR(vk_buf_destroy, struct ra_buf);
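Taken together, vk_buf_deref and buf_barrier implement a small GPU-side lifetime protocol: a buffer starts with refcount 1 (allocated, idle), every command buffer that touches it takes an extra reference and returns it from a completion callback, and vk_buf_poll below reports the buffer as reusable exactly when the count has dropped back to 1. A schematic timeline:

    // vk_buf_create()             refcount = 1 (idle)
    // buf_barrier(cmd, buf, ...)  refcount = 2, vk_buf_deref queued on cmd
    // ...GPU executes cmd...
    // vk_buf_deref() callback     refcount = 1 (idle; vk_buf_poll -> true)
    // vk_buf_destroy()            refcount = 0, memslice freed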
+
+static void vk_buf_update(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset,
+                          const void *data, size_t size)
+{
+    assert(buf->params.host_mutable || buf->params.initial_data);
+    struct ra_buf_vk *buf_vk = buf->priv;
+
+    // For host-mapped buffers, we can just directly memcpy the buffer contents.
+    // Otherwise, we can update the buffer from the GPU using a command buffer.
+    if (buf_vk->slice.data) {
+        assert(offset + size <= buf->params.size);
+        uintptr_t addr = (uintptr_t)buf_vk->slice.data + offset;
+        memcpy((void *)addr, data, size);
+        buf_vk->needsflush = true;
+    } else {
+        struct vk_cmd *cmd = vk_require_cmd(ra);
+        if (!cmd) {
+            MP_ERR(ra, "Failed updating buffer!\n");
+            return;
+        }
+
+        buf_barrier(ra, cmd, buf, VK_PIPELINE_STAGE_TRANSFER_BIT,
+                    VK_ACCESS_TRANSFER_WRITE_BIT, offset, size);
+
+        VkDeviceSize bufOffset = buf_vk->slice.mem.offset + offset;
+        assert(bufOffset == MP_ALIGN_UP(bufOffset, 4));
+        vkCmdUpdateBuffer(cmd->buf, buf_vk->slice.buf, bufOffset, size, data);
+    }
+}
+
+static struct ra_buf *vk_buf_create(struct ra *ra,
+                                    const struct ra_buf_params *params)
+{
+    struct mpvk_ctx *vk = ra_vk_get(ra);
+
+    struct ra_buf *buf = talloc_zero(NULL, struct ra_buf);
+    buf->params = *params;
+
+    struct ra_buf_vk *buf_vk = buf->priv = talloc_zero(buf, struct ra_buf_vk);
+    buf_vk->current_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+    buf_vk->current_access = 0;
+    buf_vk->refcount = 1;
+
+    VkBufferUsageFlagBits bufFlags = 0;
+    VkMemoryPropertyFlagBits memFlags = 0;
+    VkDeviceSize align = 4; // alignment 4 is needed for buf_update
+
+    switch (params->type) {
+    case RA_BUF_TYPE_TEX_UPLOAD:
+        bufFlags |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
+        memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+        break;
+    case RA_BUF_TYPE_UNIFORM:
+        bufFlags |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
+        memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+        align = MP_ALIGN_UP(align, vk->limits.minUniformBufferOffsetAlignment);
+        break;
+    case RA_BUF_TYPE_SHADER_STORAGE:
+        bufFlags |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+        memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+        align = MP_ALIGN_UP(align, vk->limits.minStorageBufferOffsetAlignment);
+        break;
+    case RA_BUF_TYPE_VERTEX:
+        bufFlags |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
+        memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+        break;
+    default: abort();
+    }
+
+    if (params->host_mutable || params->initial_data) {
+        bufFlags |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
+        align = MP_ALIGN_UP(align, vk->limits.optimalBufferCopyOffsetAlignment);
+    }
+
+    if (params->host_mapped) {
+        memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                    VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
+                    VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+    }
+
+    if (!vk_malloc_buffer(vk, bufFlags, memFlags, params->size, align,
+                          &buf_vk->slice))
+    {
+        goto error;
+    }
+
+    if (params->host_mapped)
+        buf->data = buf_vk->slice.data;
+
+    if (params->initial_data)
+        vk_buf_update(ra, buf, 0, params->initial_data, params->size);
+
+    buf->params.initial_data = NULL; // do this after vk_buf_update
+    return buf;
+
+error:
+    vk_buf_destroy(ra, buf);
+    return NULL;
+}
+
+static bool vk_buf_poll(struct ra *ra, struct ra_buf *buf)
+{
+    struct ra_buf_vk *buf_vk = buf->priv;
+    return buf_vk->refcount == 1;
+}
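The alignment bookkeeping in vk_buf_create can be made concrete with a worked example; the device limit used here is an assumption for illustration:

    // Suppose vk->limits.minUniformBufferOffsetAlignment == 256 (a common
    // value). For a RA_BUF_TYPE_UNIFORM buffer:
    //   align = 4;                    // baseline, needed by vkCmdUpdateBuffer
    //   align = MP_ALIGN_UP(4, 256);  // -> 256
    // so the slice handed out by vk_malloc_buffer starts on a 256-byte
    // boundary, and the 4-byte alignment asserted in vk_buf_update holds
    // automatically.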
+
+static bool vk_tex_upload(struct ra *ra,
+                          const struct ra_tex_upload_params *params)
+{
+    struct ra_tex *tex = params->tex;
+    struct ra_tex_vk *tex_vk = tex->priv;
+
+    if (!params->buf)
+        return ra_tex_upload_pbo(ra, &tex_vk->pbo, params);
+
+    assert(!params->src);
+    assert(params->buf);
+    struct ra_buf *buf = params->buf;
+    struct ra_buf_vk *buf_vk = buf->priv;
+
+    VkBufferImageCopy region = {
+        .bufferOffset = buf_vk->slice.mem.offset + params->buf_offset,
+        .bufferRowLength = tex->params.w,
+        .bufferImageHeight = tex->params.h,
+        .imageSubresource = vk_layers,
+        .imageExtent = (VkExtent3D){tex->params.w, tex->params.h, tex->params.d},
+    };
+
+    if (tex->params.dimensions == 2) {
+        int pix_size = tex->params.format->pixel_size;
+        region.bufferRowLength = params->stride / pix_size;
+        if (region.bufferRowLength * pix_size != params->stride) {
+            MP_ERR(ra, "Texture upload strides must be a multiple of the texel "
+                       "size!\n");
+            goto error;
+        }
+
+        if (params->rc) {
+            struct mp_rect *rc = params->rc;
+            region.imageOffset = (VkOffset3D){rc->x0, rc->y0, 0};
+            region.imageExtent = (VkExtent3D){mp_rect_w(*rc), mp_rect_h(*rc), 1};
+        }
+    }
+
+    uint64_t size = region.bufferRowLength * region.bufferImageHeight *
+                    region.imageExtent.depth;
+
+    struct vk_cmd *cmd = vk_require_cmd(ra);
+    if (!cmd)
+        goto error;
+
+    buf_barrier(ra, cmd, buf, VK_PIPELINE_STAGE_TRANSFER_BIT,
+                VK_ACCESS_TRANSFER_READ_BIT, region.bufferOffset, size);
+
+    tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_TRANSFER_BIT,
+                VK_ACCESS_TRANSFER_WRITE_BIT,
+                VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+                params->invalidate);
+
+    vkCmdCopyBufferToImage(cmd->buf, buf_vk->slice.buf, tex_vk->img,
+                           tex_vk->current_layout, 1, &region);
+
+    return true;
+
+error:
+    return false;
+}
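A quick worked example of the stride check in vk_tex_upload: with a 4-byte texel format, a stride of 412 bytes gives bufferRowLength = 412 / 4 = 103 texels per row and passes the check (103 * 4 == 412), while a stride of 410 fails it (410 / 4 == 102 in integer division, and 102 * 4 != 410), producing the "strides must be a multiple of the texel size" error.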
+
+#define MPVK_NUM_DS MPVK_MAX_STREAMING_DEPTH
+
+// For ra_renderpass.priv
+struct ra_renderpass_vk {
+    // Compiled shaders
+    VkShaderModule vert;
+    VkShaderModule frag;
+    VkShaderModule comp;
+    // Pipeline / render pass
+    VkPipeline pipe;
+    VkPipelineLayout pipeLayout;
+    VkPipelineCache pipeCache;
+    VkRenderPass renderPass;
+    // Descriptor set (bindings)
+    VkDescriptorSetLayout dsLayout;
+    VkDescriptorPool dsPool;
+    VkDescriptorSet dss[MPVK_NUM_DS];
+    int dindex;
+    // Vertex buffers (vertices)
+    struct ra_buf_pool vbo;
+
+    // For updating
+    VkWriteDescriptorSet *dswrite;
+    VkDescriptorImageInfo *dsiinfo;
+    VkDescriptorBufferInfo *dsbinfo;
+};
+
+static void vk_renderpass_destroy(struct ra *ra, struct ra_renderpass *pass)
+{
+    if (!pass)
+        return;
+
+    struct mpvk_ctx *vk = ra_vk_get(ra);
+    struct ra_renderpass_vk *pass_vk = pass->priv;
+
+    ra_buf_pool_uninit(ra, &pass_vk->vbo);
+    vkDestroyPipeline(vk->dev, pass_vk->pipe, MPVK_ALLOCATOR);
+    vkDestroyPipelineCache(vk->dev, pass_vk->pipeCache, MPVK_ALLOCATOR);
+    vkDestroyRenderPass(vk->dev, pass_vk->renderPass, MPVK_ALLOCATOR);
+    vkDestroyPipelineLayout(vk->dev, pass_vk->pipeLayout, MPVK_ALLOCATOR);
+    vkDestroyDescriptorPool(vk->dev, pass_vk->dsPool, MPVK_ALLOCATOR);
+    vkDestroyDescriptorSetLayout(vk->dev, pass_vk->dsLayout, MPVK_ALLOCATOR);
+    vkDestroyShaderModule(vk->dev, pass_vk->vert, MPVK_ALLOCATOR);
+    vkDestroyShaderModule(vk->dev, pass_vk->frag, MPVK_ALLOCATOR);
+    vkDestroyShaderModule(vk->dev, pass_vk->comp, MPVK_ALLOCATOR);
+
+    talloc_free(pass);
+}
+
+MAKE_LAZY_DESTRUCTOR(vk_renderpass_destroy, struct ra_renderpass);
+
+static const VkDescriptorType dsType[] = {
+    [RA_VARTYPE_TEX]    = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+    [RA_VARTYPE_IMG_W]  = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+    [RA_VARTYPE_BUF_RO] = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+    [RA_VARTYPE_BUF_RW] = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+};
+
+static bool vk_get_input_format(struct ra *ra, struct ra_renderpass_input *inp,
+                                VkFormat *out_fmt)
+{
+    struct mpvk_ctx *vk = ra_vk_get(ra);
+
+    enum ra_ctype ctype;
+    switch (inp->type) {
+    case RA_VARTYPE_FLOAT:      ctype = RA_CTYPE_FLOAT; break;
+    case RA_VARTYPE_BYTE_UNORM: ctype = RA_CTYPE_UNORM; break;
+    default: abort();
+    }
+
+    assert(inp->dim_m == 1);
+    for (const struct vk_format *fmt = vk_formats; fmt->name; fmt++) {
+        if (fmt->ctype != ctype)
+            continue;
+        if (fmt->components != inp->dim_v)
+            continue;
+        if (fmt->bytes != ra_renderpass_input_layout(inp).size)
+            continue;
+
+        // Ensure this format is valid for vertex attributes
+        VkFormatProperties prop;
+        vkGetPhysicalDeviceFormatProperties(vk->physd, fmt->iformat, &prop);
+        if (!(prop.bufferFeatures & VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT))
+            continue;
+
+        *out_fmt = fmt->iformat;
+        return true;
+    }
+
+    return false;
+}
+
+static const VkPipelineStageFlagBits stageFlags[] = {
+    [RA_RENDERPASS_TYPE_RASTER]  = VK_SHADER_STAGE_FRAGMENT_BIT,
+    [RA_RENDERPASS_TYPE_COMPUTE] = VK_SHADER_STAGE_COMPUTE_BIT,
+};
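To make the matching in vk_get_input_format concrete: a typical vec2 vertex position input would arrive as { .type = RA_VARTYPE_FLOAT, .dim_v = 2, .dim_m = 1 } with an 8-byte per-element layout, and would be matched against a two-component 32-bit float entry in vk_formats (presumably VK_FORMAT_R32G32_SFLOAT; the exact table contents are not shown in this file), accepted only if the driver also reports VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT for it in bufferFeatures.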
+
+static struct ra_renderpass *vk_renderpass_create(struct ra *ra,
+                                    const struct ra_renderpass_params *params)
+{
+    struct mpvk_ctx *vk = ra_vk_get(ra);
+
+    struct ra_renderpass *pass = talloc_zero(NULL, struct ra_renderpass);
+    pass->params = *ra_renderpass_params_copy(pass, params);
+    pass->params.cached_program = (bstr){0};
+    struct ra_renderpass_vk *pass_vk = pass->priv =
+        talloc_zero(pass, struct ra_renderpass_vk);
+
+    static int dsCount[RA_VARTYPE_COUNT] = {0};
+    VkDescriptorSetLayoutBinding *bindings = NULL;
+    int num_bindings = 0;
+
+    for (int i = 0; i < params->num_inputs; i++) {
+        struct ra_renderpass_input *inp = &params->inputs[i];
+        switch (inp->type) {
+        case RA_VARTYPE_TEX:
+        case RA_VARTYPE_IMG_W:
+        case RA_VARTYPE_BUF_RO:
+        case RA_VARTYPE_BUF_RW: {
+            VkDescriptorSetLayoutBinding desc = {
+                .binding = inp->binding,
+                .descriptorType = dsType[inp->type],
+                .descriptorCount = 1,
+                .stageFlags = stageFlags[params->type],
+            };
+
+            MP_TARRAY_APPEND(pass, bindings, num_bindings, desc);
+            dsCount[inp->type]++;
+            break;
+        }
+        default: abort();
+        }
+    }
+
+    VkDescriptorPoolSize *dsPoolSizes = NULL;
+    int poolSizeCount = 0;
+    for (enum ra_vartype t = 0; t < RA_VARTYPE_COUNT; t++) {
+        if (dsCount[t] > 0) {
+            VkDescriptorPoolSize dssize = {
+                .type = dsType[t],
+                .descriptorCount = dsCount[t] * MPVK_NUM_DS,
+            };
+
+            MP_TARRAY_APPEND(pass, dsPoolSizes, poolSizeCount, dssize);
+        }
+    }
+
+    VkDescriptorPoolCreateInfo pinfo = {
+        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
+        .maxSets = MPVK_NUM_DS,
+        .pPoolSizes = dsPoolSizes,
+        .poolSizeCount = poolSizeCount,
+    };
+
+    VK(vkCreateDescriptorPool(vk->dev, &pinfo, MPVK_ALLOCATOR, &pass_vk->dsPool));
+    talloc_free(dsPoolSizes);
+
+    pass_vk->dswrite = talloc_array(pass, VkWriteDescriptorSet, num_bindings);
+    pass_vk->dsiinfo = talloc_array(pass, VkDescriptorImageInfo, num_bindings);
+    pass_vk->dsbinfo = talloc_array(pass, VkDescriptorBufferInfo, num_bindings);
+
+    VkDescriptorSetLayoutCreateInfo dinfo = {
+        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+        .pBindings = bindings,
+        .bindingCount = num_bindings,
+    };
+
+    VK(vkCreateDescriptorSetLayout(vk->dev, &dinfo, MPVK_ALLOCATOR,
+                                   &pass_vk->dsLayout));
+
+    VkDescriptorSetLayout layouts[MPVK_NUM_DS];
+    for (int i = 0; i < MPVK_NUM_DS; i++)
+        layouts[i] = pass_vk->dsLayout;
+
+    VkDescriptorSetAllocateInfo ainfo = {
+        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
+        .descriptorPool = pass_vk->dsPool,
+        .descriptorSetCount = MPVK_NUM_DS,
+        .pSetLayouts = layouts,
+    };
+
+    VK(vkAllocateDescriptorSets(vk->dev, &ainfo, pass_vk->dss));
+
+    VkPipelineLayoutCreateInfo linfo = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+        .setLayoutCount = 1,
+        .pSetLayouts = &pass_vk->dsLayout,
+    };
+
+    VK(vkCreatePipelineLayout(vk->dev, &linfo, MPVK_ALLOCATOR,
+                              &pass_vk->pipeLayout));
+
+    VkPipelineCacheCreateInfo pcinfo = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO,
+        .pInitialData = params->cached_program.start,
+        .initialDataSize = params->cached_program.len,
+    };
+
+    VK(vkCreatePipelineCache(vk->dev, &pcinfo, MPVK_ALLOCATOR, &pass_vk->pipeCache));
+
+    VkShaderModuleCreateInfo sinfo = {