/*
 * Copyright © 2020 Raspberry Pi
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "v3dv_meta_common.h"

#include "compiler/nir/nir_builder.h"
#include "vk_format_info.h"
#include "util/u_pack_color.h"

static void
get_hw_clear_color(struct v3dv_device *device,
                   const VkClearColorValue *color,
                   VkFormat fb_format,
                   VkFormat image_format,
                   uint32_t internal_type,
                   uint32_t internal_bpp,
                   uint32_t *hw_color)
{
   const uint32_t internal_size = 4 << internal_bpp;

   /* If the image format doesn't match the framebuffer format, then we are
    * trying to clear an unsupported tlb format using a compatible
    * format for the framebuffer. In this case, we want to make sure that
    * we pack the clear value according to the original format semantics,
    * not the compatible format.
    */
   if (fb_format == image_format) {
      v3dv_X(device, get_hw_clear_color)(color, internal_type, internal_size,
                                         hw_color);
   } else {
      union util_color uc;
      enum pipe_format pipe_image_format =
         vk_format_to_pipe_format(image_format);
      util_pack_color(color->float32, pipe_image_format, &uc);
      memcpy(hw_color, uc.ui, internal_size);
   }
}

/* Returns true if the implementation is able to handle the case, false
 * otherwise.
*/
static bool
clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
                struct v3dv_image *image,
                const VkClearValue *clear_value,
                const VkImageSubresourceRange *range)
{
   const VkOffset3D origin = { 0, 0, 0 };
   VkFormat fb_format;
   if (!v3dv_meta_can_use_tlb(image, &origin, &fb_format))
      return false;

   uint32_t internal_type, internal_bpp;
   v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
      (fb_format, range->aspectMask,
       &internal_type, &internal_bpp);

   union v3dv_clear_value hw_clear_value = { 0 };
   if (range->aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
      get_hw_clear_color(cmd_buffer->device, &clear_value->color, fb_format,
                         image->vk.format, internal_type, internal_bpp,
                         &hw_clear_value.color[0]);
   } else {
      assert((range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) ||
             (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT));
      hw_clear_value.z = clear_value->depthStencil.depth;
      hw_clear_value.s = clear_value->depthStencil.stencil;
   }

   uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
   uint32_t min_level = range->baseMipLevel;
   uint32_t max_level = range->baseMipLevel + level_count;

   /* For 3D images baseArrayLayer and layerCount must be 0 and 1 respectively.
    * Instead, we need to consider the full depth dimension of the image, which
    * goes from 0 up to the level's depth extent.
    */
   uint32_t min_layer;
   uint32_t max_layer;
   if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
      min_layer = range->baseArrayLayer;
      max_layer = range->baseArrayLayer +
                  vk_image_subresource_layer_count(&image->vk, range);
   } else {
      min_layer = 0;
      max_layer = 0;
   }

   for (uint32_t level = min_level; level < max_level; level++) {
      if (image->vk.image_type == VK_IMAGE_TYPE_3D)
         max_layer = u_minify(image->vk.extent.depth, level);

      uint32_t width = u_minify(image->vk.extent.width, level);
      uint32_t height = u_minify(image->vk.extent.height, level);

      struct v3dv_job *job =
         v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);

      if (!job)
         return true;

      v3dv_job_start_frame(job, width, height, max_layer, false,
                           1, internal_bpp,
                           image->vk.samples > VK_SAMPLE_COUNT_1_BIT);

      struct v3dv_meta_framebuffer framebuffer;
      v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
                                                 internal_type,
                                                 &job->frame_tiling);

      v3dv_X(job->device, job_emit_binning_flush)(job);

      /* If this triggers it is an application bug: the spec requires
       * that any aspects to clear are present in the image.
       */
      assert(range->aspectMask & image->vk.aspects);

      v3dv_X(job->device, meta_emit_clear_image_rcl)
         (job, image, &framebuffer, &hw_clear_value,
          range->aspectMask, min_layer, max_layer, level);

      v3dv_cmd_buffer_finish_job(cmd_buffer);
   }

   return true;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdClearColorImage(VkCommandBuffer commandBuffer,
                        VkImage _image,
                        VkImageLayout imageLayout,
                        const VkClearColorValue *pColor,
                        uint32_t rangeCount,
                        const VkImageSubresourceRange *pRanges)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_image, image, _image);

   const VkClearValue clear_value = {
      .color = *pColor,
   };

   for (uint32_t i = 0; i < rangeCount; i++) {
      if (clear_image_tlb(cmd_buffer, image, &clear_value, &pRanges[i]))
         continue;
      unreachable("Unsupported color clear.");
   }
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
                               VkImage _image,
                               VkImageLayout imageLayout,
                               const VkClearDepthStencilValue *pDepthStencil,
                               uint32_t rangeCount,
                               const VkImageSubresourceRange *pRanges)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_image, image, _image);

   const VkClearValue clear_value = {
      .depthStencil = *pDepthStencil,
   };

   for (uint32_t i = 0; i < rangeCount; i++) {
      if (clear_image_tlb(cmd_buffer, image, &clear_value, &pRanges[i]))
         continue;
      unreachable("Unsupported depth/stencil clear.");
   }
}

static void
destroy_color_clear_pipeline(VkDevice _device,
                             uint64_t pipeline,
                             VkAllocationCallbacks *alloc)
{
   struct v3dv_meta_color_clear_pipeline *p =
      (struct v3dv_meta_color_clear_pipeline *) (uintptr_t) pipeline;
   v3dv_DestroyPipeline(_device, p->pipeline, alloc);
   if (p->cached)
      v3dv_DestroyRenderPass(_device, p->pass, alloc);
   vk_free(alloc, p);
}

static void
destroy_depth_clear_pipeline(VkDevice _device,
                             struct v3dv_meta_depth_clear_pipeline *p,
                             VkAllocationCallbacks *alloc)
{
   v3dv_DestroyPipeline(_device, p->pipeline, alloc);
   vk_free(alloc, p);
}

static VkResult
create_color_clear_pipeline_layout(struct v3dv_device *device,
                                   VkPipelineLayout *pipeline_layout)
{
   /* FIXME: this is abusing a bit the API, since not all of our clear
    * pipelines have a geometry shader. We could create 2 different pipeline
    * layouts, but this works for us for now.
    */
   VkPushConstantRange ranges[2] = {
      { VK_SHADER_STAGE_FRAGMENT_BIT, 0, 16 },
      { VK_SHADER_STAGE_GEOMETRY_BIT, 16, 4 },
   };

   VkPipelineLayoutCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
      .setLayoutCount = 0,
      .pushConstantRangeCount = 2,
      .pPushConstantRanges = ranges,
   };

   return v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                    &info, &device->vk.alloc, pipeline_layout);
}

static VkResult
create_depth_clear_pipeline_layout(struct v3dv_device *device,
                                   VkPipelineLayout *pipeline_layout)
{
   /* FIXME: this is abusing a bit the API, since not all of our clear
    * pipelines have a geometry shader. We could create 2 different pipeline
    * layouts, but this works for us for now.
    */
   VkPushConstantRange ranges[2] = {
      { VK_SHADER_STAGE_FRAGMENT_BIT, 0, 4 },
      { VK_SHADER_STAGE_GEOMETRY_BIT, 4, 4 },
   };

   VkPipelineLayoutCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
      .setLayoutCount = 0,
      .pushConstantRangeCount = 2,
      .pPushConstantRanges = ranges
   };

   return v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                    &info, &device->vk.alloc, pipeline_layout);
}

void
v3dv_meta_clear_init(struct v3dv_device *device)
{
   device->meta.color_clear.cache =
      _mesa_hash_table_create(NULL, u64_hash, u64_compare);

   create_color_clear_pipeline_layout(device,
                                      &device->meta.color_clear.p_layout);

   device->meta.depth_clear.cache =
      _mesa_hash_table_create(NULL, u64_hash, u64_compare);

   create_depth_clear_pipeline_layout(device,
                                      &device->meta.depth_clear.p_layout);
}

void
v3dv_meta_clear_finish(struct v3dv_device *device)
{
   VkDevice _device = v3dv_device_to_handle(device);

   hash_table_foreach(device->meta.color_clear.cache, entry) {
      struct v3dv_meta_color_clear_pipeline *item = entry->data;
      destroy_color_clear_pipeline(_device, (uintptr_t)item, &device->vk.alloc);
   }
   _mesa_hash_table_destroy(device->meta.color_clear.cache, NULL);

   if (device->meta.color_clear.p_layout) {
      v3dv_DestroyPipelineLayout(_device, device->meta.color_clear.p_layout,
                                 &device->vk.alloc);
   }

   hash_table_foreach(device->meta.depth_clear.cache, entry) {
      struct v3dv_meta_depth_clear_pipeline *item = entry->data;
      destroy_depth_clear_pipeline(_device, item, &device->vk.alloc);
   }
   _mesa_hash_table_destroy(device->meta.depth_clear.cache, NULL);

   if (device->meta.depth_clear.p_layout) {
      v3dv_DestroyPipelineLayout(_device, device->meta.depth_clear.p_layout,
                                 &device->vk.alloc);
   }
}

static nir_ssa_def *
gen_rect_vertices(nir_builder *b)
{
   nir_ssa_def *vertex_id = nir_load_vertex_id(b);

   /* vertex 0: -1.0, -1.0
    * vertex 1: -1.0,  1.0
    * vertex 2:  1.0, -1.0
    * vertex 3:  1.0,  1.0
    *
    * so:
    *
    * channel 0 is vertex_id < 2 ? -1.0 :  1.0
    * channel 1 is vertex id & 1 ?  1.0 : -1.0
    */

   nir_ssa_def *one = nir_imm_int(b, 1);
   nir_ssa_def *c0cmp = nir_ilt(b, vertex_id, nir_imm_int(b, 2));
   nir_ssa_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one);

   nir_ssa_def *comp[4];
   comp[0] = nir_bcsel(b, c0cmp,
                       nir_imm_float(b, -1.0f),
                       nir_imm_float(b, 1.0f));

   comp[1] = nir_bcsel(b, c1cmp,
                       nir_imm_float(b, 1.0f),
                       nir_imm_float(b, -1.0f));
   comp[2] = nir_imm_float(b, 0.0f);
   comp[3] = nir_imm_float(b, 1.0f);
   return nir_vec(b, comp, 4);
}

static nir_shader *
get_clear_rect_vs()
{
   const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_VERTEX, options,
                                                  "meta clear vs");

   const struct glsl_type *vec4 = glsl_vec4_type();
   nir_variable *vs_out_pos =
      nir_variable_create(b.shader, nir_var_shader_out, vec4, "gl_Position");
   vs_out_pos->data.location = VARYING_SLOT_POS;

   nir_ssa_def *pos = gen_rect_vertices(&b);
   nir_store_var(&b, vs_out_pos, pos, 0xf);

   return b.shader;
}

static nir_shader *
get_clear_rect_gs(uint32_t push_constant_layer_base)
{
   /* FIXME: this creates a geometry shader that takes the index of a single
    * layer to clear from push constants, so we need to emit a draw call for
    * each layer that we want to clear. We could actually do better and have it
    * take a range of layers and then emit one triangle per layer to clear,
    * however, if we were to do this we would need to be careful not to exceed
    * the maximum number of output vertices allowed in a geometry shader.
    */
   const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_GEOMETRY, options,
                                                  "meta clear gs");
   nir_shader *nir = b.shader;
   nir->info.inputs_read = 1ull << VARYING_SLOT_POS;
   nir->info.outputs_written = (1ull << VARYING_SLOT_POS) |
                               (1ull << VARYING_SLOT_LAYER);
   nir->info.gs.input_primitive = GL_TRIANGLES;
   nir->info.gs.output_primitive = GL_TRIANGLE_STRIP;
   nir->info.gs.vertices_in = 3;
   nir->info.gs.vertices_out = 3;
   nir->info.gs.invocations = 1;
   nir->info.gs.active_stream_mask = 0x1;

   /* in vec4 gl_Position[3] */
   nir_variable *gs_in_pos =
      nir_variable_create(b.shader, nir_var_shader_in,
                          glsl_array_type(glsl_vec4_type(), 3, 0),
                          "in_gl_Position");
   gs_in_pos->data.location = VARYING_SLOT_POS;

   /* out vec4 gl_Position */
   nir_variable *gs_out_pos =
      nir_variable_create(b.shader, nir_var_shader_out, glsl_vec4_type(),
                          "out_gl_Position");
   gs_out_pos->data.location = VARYING_SLOT_POS;

   /* out float gl_Layer */
   nir_variable *gs_out_layer =
      nir_variable_create(b.shader, nir_var_shader_out, glsl_float_type(),
                          "out_gl_Layer");
   gs_out_layer->data.location = VARYING_SLOT_LAYER;

   /* Emit output triangle */
   for (uint32_t i = 0; i < 3; i++) {
      /* gl_Position from shader input */
      nir_deref_instr *in_pos_i =
         nir_build_deref_array_imm(&b, nir_build_deref_var(&b, gs_in_pos), i);
      nir_copy_deref(&b, nir_build_deref_var(&b, gs_out_pos), in_pos_i);

      /* gl_Layer from push constants */
      nir_ssa_def *layer =
         nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
                                .base = push_constant_layer_base, .range = 4);
      nir_store_var(&b, gs_out_layer, layer, 0x1);

      nir_emit_vertex(&b, 0);
   }

   nir_end_primitive(&b, 0);

   return nir;
}

static nir_shader *
get_color_clear_rect_fs(uint32_t rt_idx, VkFormat format)
{
   const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, options,
                                                  "meta clear fs");

   enum pipe_format pformat = vk_format_to_pipe_format(format);
   const struct glsl_type *fs_out_type =
      util_format_is_float(pformat) ? glsl_vec4_type() : glsl_uvec4_type();

   nir_variable *fs_out_color =
      nir_variable_create(b.shader, nir_var_shader_out, fs_out_type, "out_color");
   fs_out_color->data.location = FRAG_RESULT_DATA0 + rt_idx;

   nir_ssa_def *color_load = nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16);
   nir_store_var(&b, fs_out_color, color_load, 0xf);

   return b.shader;
}

static nir_shader *
get_depth_clear_rect_fs()
{
   const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, options,
                                                  "meta depth clear fs");

   nir_variable *fs_out_depth =
      nir_variable_create(b.shader, nir_var_shader_out, glsl_float_type(),
                          "out_depth");
   fs_out_depth->data.location = FRAG_RESULT_DEPTH;

   nir_ssa_def *depth_load =
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);

   nir_store_var(&b, fs_out_depth, depth_load, 0x1);

   return b.shader;
}

static VkResult
create_pipeline(struct v3dv_device *device,
                struct v3dv_render_pass *pass,
                uint32_t subpass_idx,
                uint32_t samples,
                struct nir_shader *vs_nir,
                struct nir_shader *gs_nir,
                struct nir_shader *fs_nir,
                const VkPipelineVertexInputStateCreateInfo *vi_state,
                const VkPipelineDepthStencilStateCreateInfo *ds_state,
                const VkPipelineColorBlendStateCreateInfo *cb_state,
                const VkPipelineLayout layout,
                VkPipeline *pipeline)
{
   VkPipelineShaderStageCreateInfo stages[3] = { 0 };
   struct vk_shader_module vs_m;
   struct vk_shader_module gs_m;
   struct vk_shader_module fs_m;

   uint32_t stage_count = 0;
   v3dv_shader_module_internal_init(device, &vs_m, vs_nir);
   stages[stage_count].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
   stages[stage_count].stage = VK_SHADER_STAGE_VERTEX_BIT;
   stages[stage_count].module = vk_shader_module_to_handle(&vs_m);
   stages[stage_count].pName = "main";
   stage_count++;

   if (gs_nir) {
      v3dv_shader_module_internal_init(device, &gs_m, gs_nir);
      stages[stage_count].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
      stages[stage_count].stage = VK_SHADER_STAGE_GEOMETRY_BIT;
      stages[stage_count].module = vk_shader_module_to_handle(&gs_m);
      stages[stage_count].pName = "main";
      stage_count++;
   }

   if (fs_nir) {
      v3dv_shader_module_internal_init(device, &fs_m, fs_nir);
      stages[stage_count].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
      stages[stage_count].stage = VK_SHADER_STAGE_FRAGMENT_BIT;
      stages[stage_count].module = vk_shader_module_to_handle(&fs_m);
      stages[stage_count].pName = "main";
      stage_count++;
   }

   VkGraphicsPipelineCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,

      .stageCount = stage_count,
      .pStages = stages,

      .pVertexInputState = vi_state,

      .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
         .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP,
         .primitiveRestartEnable = false,
      },

      .pViewportState = &(VkPipelineViewportStateCreateInfo) {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
         .viewportCount = 1,
         .scissorCount = 1,
      },

      .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
         .rasterizerDiscardEnable = false,
         .polygonMode = VK_POLYGON_MODE_FILL,
         .cullMode = VK_CULL_MODE_NONE,
         .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE,
         .depthBiasEnable = false,
      },

      .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
         .rasterizationSamples = samples,
         .sampleShadingEnable = false,
         .pSampleMask = NULL,
         .alphaToCoverageEnable = false,
         .alphaToOneEnable = false,
      },

      .pDepthStencilState = ds_state,

      .pColorBlendState = cb_state,

      /* The meta clear pipeline declares all state as dynamic.
       * As a consequence, vkCmdBindPipeline writes no dynamic state
       * to the cmd buffer. Therefore, at the end of the meta clear,
       * we need only restore dynamic state that was vkCmdSet.
       */
      .pDynamicState = &(VkPipelineDynamicStateCreateInfo) {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
         .dynamicStateCount = 6,
         .pDynamicStates = (VkDynamicState[]) {
            VK_DYNAMIC_STATE_VIEWPORT,
            VK_DYNAMIC_STATE_SCISSOR,
            VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK,
            VK_DYNAMIC_STATE_STENCIL_WRITE_MASK,
            VK_DYNAMIC_STATE_STENCIL_REFERENCE,
            VK_DYNAMIC_STATE_BLEND_CONSTANTS,
            VK_DYNAMIC_STATE_DEPTH_BIAS,
            VK_DYNAMIC_STATE_LINE_WIDTH,
         },
      },

      .flags = 0,
      .layout = layout,
      .renderPass = v3dv_render_pass_to_handle(pass),
      .subpass = subpass_idx,
   };

   VkResult result =
      v3dv_CreateGraphicsPipelines(v3dv_device_to_handle(device),
                                   VK_NULL_HANDLE,
                                   1, &info,
                                   &device->vk.alloc,
                                   pipeline);

   ralloc_free(vs_nir);
   ralloc_free(fs_nir);

   return result;
}

static VkResult
create_color_clear_pipeline(struct v3dv_device *device,
                            struct v3dv_render_pass *pass,
                            uint32_t subpass_idx,
                            uint32_t rt_idx,
                            VkFormat format,
                            uint32_t samples,
                            uint32_t components,
                            bool is_layered,
                            VkPipelineLayout pipeline_layout,
                            VkPipeline *pipeline)
{
   nir_shader *vs_nir = get_clear_rect_vs();
   nir_shader *fs_nir = get_color_clear_rect_fs(rt_idx, format);
   nir_shader *gs_nir = is_layered ? get_clear_rect_gs(16) : NULL;

   const VkPipelineVertexInputStateCreateInfo vi_state = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
      .vertexBindingDescriptionCount = 0,
      .vertexAttributeDescriptionCount = 0,
   };

   const VkPipelineDepthStencilStateCreateInfo ds_state = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
      .depthTestEnable = false,
      .depthWriteEnable = false,
      .depthBoundsTestEnable = false,
      .stencilTestEnable = false,
   };

   assert(subpass_idx < pass->subpass_count);
   const uint32_t color_count = pass->subpasses[subpass_idx].color_count;
   assert(rt_idx < color_count);

   VkPipelineColorBlendAttachmentState blend_att_state[V3D_MAX_DRAW_BUFFERS];
   for (uint32_t i = 0; i < color_count; i++) {
      blend_att_state[i] = (VkPipelineColorBlendAttachmentState) {
         .blendEnable = false,
         .colorWriteMask = i == rt_idx ? components : 0,
      };
   }

   const VkPipelineColorBlendStateCreateInfo cb_state = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
      .logicOpEnable = false,
      .attachmentCount = color_count,
      .pAttachments = blend_att_state
   };

   return create_pipeline(device,
                          pass, subpass_idx,
                          samples,
                          vs_nir, gs_nir, fs_nir,
                          &vi_state,
                          &ds_state,
                          &cb_state,
                          pipeline_layout,
                          pipeline);
}

static VkResult
create_depth_clear_pipeline(struct v3dv_device *device,
                            VkImageAspectFlags aspects,
                            struct v3dv_render_pass *pass,
                            uint32_t subpass_idx,
                            uint32_t samples,
                            bool is_layered,
                            VkPipelineLayout pipeline_layout,
                            VkPipeline *pipeline)
{
   const bool has_depth = aspects & VK_IMAGE_ASPECT_DEPTH_BIT;
   const bool has_stencil = aspects & VK_IMAGE_ASPECT_STENCIL_BIT;
   assert(has_depth || has_stencil);

   nir_shader *vs_nir = get_clear_rect_vs();
   nir_shader *fs_nir = has_depth ? get_depth_clear_rect_fs() : NULL;
   nir_shader *gs_nir = is_layered ? get_clear_rect_gs(4) : NULL;

   const VkPipelineVertexInputStateCreateInfo vi_state = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
      .vertexBindingDescriptionCount = 0,
      .vertexAttributeDescriptionCount = 0,
   };

   const VkPipelineDepthStencilStateCreateInfo ds_state = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
      .depthTestEnable = has_depth,
      .depthWriteEnable = has_depth,
      .depthCompareOp = VK_COMPARE_OP_ALWAYS,
      .depthBoundsTestEnable = false,
      .stencilTestEnable = has_stencil,
      .front = {
         .passOp = VK_STENCIL_OP_REPLACE,
         .compareOp = VK_COMPARE_OP_ALWAYS,
         /* compareMask, writeMask and reference are dynamic state */
      },
      .back = { 0 },
   };

   assert(subpass_idx < pass->subpass_count);
   VkPipelineColorBlendAttachmentState blend_att_state[V3D_MAX_DRAW_BUFFERS] = { 0 };
   const VkPipelineColorBlendStateCreateInfo cb_state = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
      .logicOpEnable = false,
      .attachmentCount = pass->subpasses[subpass_idx].color_count,
      .pAttachments = blend_att_state,
   };

   return create_pipeline(device,
                          pass, subpass_idx,
                          samples,
                          vs_nir, gs_nir, fs_nir,
                          &vi_state,
                          &ds_state,
                          &cb_state,
                          pipeline_layout,
                          pipeline);
}

static VkResult
create_color_clear_render_pass(struct v3dv_device *device,
                               uint32_t rt_idx,
                               VkFormat format,
                               uint32_t samples,
                               VkRenderPass *pass)
{
   VkAttachmentDescription att = {
      .format = format,
      .samples = samples,
      .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
      .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
      .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
      .finalLayout = VK_IMAGE_LAYOUT_GENERAL,
   };

   VkAttachmentReference att_ref = {
      .attachment = rt_idx,
      .layout = VK_IMAGE_LAYOUT_GENERAL,
   };

   VkSubpassDescription subpass = {
      .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
      .inputAttachmentCount = 0,
      .colorAttachmentCount = 1,
      .pColorAttachments = &att_ref,
      .pResolveAttachments = NULL,
      .pDepthStencilAttachment = NULL,
      .preserveAttachmentCount = 0,
      .pPreserveAttachments = NULL,
   };

   VkRenderPassCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
      .attachmentCount = 1,
      .pAttachments = &att,
      .subpassCount = 1,
      .pSubpasses = &subpass,
      .dependencyCount = 0,
      .pDependencies = NULL,
   };

   return v3dv_CreateRenderPass(v3dv_device_to_handle(device),
                                &info, &device->vk.alloc, pass);
}

static inline uint64_t
get_color_clear_pipeline_cache_key(uint32_t rt_idx,
                                   VkFormat format,
                                   uint32_t samples,
                                   uint32_t components,
                                   bool is_layered)
{
   assert(rt_idx < V3D_MAX_DRAW_BUFFERS);

   uint64_t key = 0;
   uint32_t bit_offset = 0;

   key |= rt_idx;
   bit_offset += 2;

   key |= ((uint64_t) format) << bit_offset;
   bit_offset += 32;

   key |= ((uint64_t) samples) << bit_offset;
   bit_offset += 4;

   key |= ((uint64_t) components) << bit_offset;
   bit_offset += 4;

   key |= (is_layered ? 1ull : 0ull) << bit_offset;
   bit_offset += 1;

   assert(bit_offset <= 64);
   return key;
}

static inline uint64_t
get_depth_clear_pipeline_cache_key(VkImageAspectFlags aspects,
                                   VkFormat format,
                                   uint32_t samples,
                                   bool is_layered)
{
   uint64_t key = 0;
   uint32_t bit_offset = 0;

   key |= format;
   bit_offset += 32;

   key |= ((uint64_t) samples) << bit_offset;
   bit_offset += 4;

   const bool has_depth = (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) ? 1 : 0;
   key |= ((uint64_t) has_depth) << bit_offset;
   bit_offset++;

   const bool has_stencil = (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) ? 1 : 0;
   key |= ((uint64_t) has_stencil) << bit_offset;
   bit_offset++;;

   key |= (is_layered ? 1ull : 0ull) << bit_offset;
   bit_offset += 1;

   assert(bit_offset <= 64);
   return key;
}

static VkResult
get_color_clear_pipeline(struct v3dv_device *device,
                         struct v3dv_render_pass *pass,
                         uint32_t subpass_idx,
                         uint32_t rt_idx,
                         uint32_t attachment_idx,
                         VkFormat format,
                         uint32_t samples,
                         uint32_t components,
                         bool is_layered,
                         struct v3dv_meta_color_clear_pipeline **pipeline)
{
   assert(vk_format_is_color(format));

   VkResult result = VK_SUCCESS;

   /* If pass != NULL it means that we are emitting the clear as a draw call
    * in the current pass bound by the application. In that case, we can't
    * cache the pipeline, since it will be referencing that pass and the
    * application could be destroying it at any point. Hopefully, the perf
    * impact is not too big since we still have the device pipeline cache
    * around and we won't end up re-compiling the clear shader.
    *
    * FIXME: alternatively, we could refcount (or maybe clone) the render pass
    * provided by the application and include it in the pipeline key setup
    * to make caching safe in this scenario, however, based on tests with
    * vkQuake3, the fact that we are not caching here doesn't seem to have
    * any significant impact in performance, so it might not be worth it.
    */
   const bool can_cache_pipeline = (pass == NULL);

   uint64_t key;
   if (can_cache_pipeline) {
      key = get_color_clear_pipeline_cache_key(rt_idx, format, samples,
                                               components, is_layered);
      mtx_lock(&device->meta.mtx);
      struct hash_entry *entry =
         _mesa_hash_table_search(device->meta.color_clear.cache, &key);
      if (entry) {
         mtx_unlock(&device->meta.mtx);
         *pipeline = entry->data;
         return VK_SUCCESS;
      }
   }

   *pipeline = vk_zalloc2(&device->vk.alloc, NULL, sizeof(**pipeline), 8,
                          VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);

   if (*pipeline == NULL) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail;
   }

   if (!pass) {
      result = create_color_clear_render_pass(device,
                                              rt_idx,
                                              format,
                                              samples,
                                              &(*pipeline)->pass);
      if (result != VK_SUCCESS)
         goto fail;

      pass = v3dv_render_pass_from_handle((*pipeline)->pass);
   } else {
      (*pipeline)->pass = v3dv_render_pass_to_handle(pass);
   }

   result = create_color_clear_pipeline(device,
                                        pass,
                                        subpass_idx,
                                        rt_idx,
                                        format,
                                        samples,
                                        components,
                                        is_layered,
                                        device->meta.color_clear.p_layout,
                                        &(*pipeline)->pipeline);
   if (result != VK_SUCCESS)
      goto fail;

   if (can_cache_pipeline) {
      (*pipeline)->key = key;
      (*pipeline)->cached = true;
      _mesa_hash_table_insert(device->meta.color_clear.cache,
                              &(*pipeline)->key, *pipeline);

      mtx_unlock(&device->meta.mtx);
   }

   return VK_SUCCESS;

fail:
   if (can_cache_pipeline)
      mtx_unlock(&device->meta.mtx);

   VkDevice _device = v3dv_device_to_handle(device);
   if (*pipeline) {
      if ((*pipeline)->cached)
         v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->vk.alloc);
      if ((*pipeline)->pipeline)
         v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->vk.alloc);
      vk_free(&device->vk.alloc, *pipeline);
      *pipeline = NULL;
   }

   return result;
}

static VkResult
get_depth_clear_pipeline(struct v3dv_device *device,
                         VkImageAspectFlags aspects,
                         struct v3dv_render_pass *pass,
                         uint32_t subpass_idx,
                         uint32_t attachment_idx,
                         bool is_layered,
                         struct v3dv_meta_depth_clear_pipeline **pipeline)
{
   assert(subpass_idx < pass->subpass_count);
   assert(attachment_idx != VK_ATTACHMENT_UNUSED);
   assert(attachment_idx < pass->attachment_count);

   VkResult result = VK_SUCCESS;

   const uint32_t samples = pass->attachments[attachment_idx].desc.samples;
   const VkFormat format = pass->attachments[attachment_idx].desc.format;
   assert(vk_format_is_depth_or_stencil(format));

   const uint64_t key =
      get_depth_clear_pipeline_cache_key(aspects, format, samples, is_layered);
   mtx_lock(&device->meta.mtx);
   struct hash_entry *entry =
      _mesa_hash_table_search(device->meta.depth_clear.cache, &key);
   if (entry) {
      mtx_unlock(&device->meta.mtx);
      *pipeline = entry->data;
      return VK_SUCCESS;
   }

   *pipeline = vk_zalloc2(&device->vk.alloc, NULL, sizeof(**pipeline), 8,
                          VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);

   if (*pipeline == NULL) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail;
   }

   result = create_depth_clear_pipeline(device,
                                        aspects,
                                        pass,
                                        subpass_idx,
                                        samples,
                                        is_layered,
                                        device->meta.depth_clear.p_layout,
                                        &(*pipeline)->pipeline);
   if (result != VK_SUCCESS)
      goto fail;

   (*pipeline)->key = key;
   _mesa_hash_table_insert(device->meta.depth_clear.cache,
                           &(*pipeline)->key, *pipeline);

   mtx_unlock(&device->meta.mtx);
   return VK_SUCCESS;

fail:
   mtx_unlock(&device->meta.mtx);

   VkDevice _device = v3dv_device_to_handle(device);
   if (*pipeline) {
      if ((*pipeline)->pipeline)
         v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->vk.alloc);
      vk_free(&device->vk.alloc, *pipeline);
      *pipeline = NULL;
   }

   return result;
}

/* Emits a scissored quad in the clear color */
static void
emit_subpass_color_clear_rects(struct v3dv_cmd_buffer *cmd_buffer,
                               struct v3dv_render_pass *pass,
                               struct v3dv_subpass *subpass,
                               uint32_t rt_idx,
                               const VkClearColorValue *clear_color,
                               bool is_layered,
                               bool all_rects_same_layers,
                               uint32_t rect_count,
                               const VkClearRect *rects)
{
   /* Skip if attachment is unused in the current subpass */
   assert(rt_idx < subpass->color_count);
   const uint32_t attachment_idx = subpass->color_attachments[rt_idx].attachment;
   if (attachment_idx == VK_ATTACHMENT_UNUSED)
      return;

   /* Obtain a pipeline for this clear */
   assert(attachment_idx < cmd_buffer->state.pass->attachment_count);
   const VkFormat format =
      cmd_buffer->state.pass->attachments[attachment_idx].desc.format;
   const VkFormat samples =
      cmd_buffer->state.pass->attachments[attachment_idx].desc.samples;
   const uint32_t components = VK_COLOR_COMPONENT_R_BIT |
                               VK_COLOR_COMPONENT_G_BIT |
                               VK_COLOR_COMPONENT_B_BIT |
                               VK_COLOR_COMPONENT_A_BIT;
   struct v3dv_meta_color_clear_pipeline *pipeline = NULL;
   VkResult result = get_color_clear_pipeline(cmd_buffer->device,
                                              pass,
                                              cmd_buffer->state.subpass_idx,
                                              rt_idx,
                                              attachment_idx,
                                              format,
                                              samples,
                                              components,
                                              is_layered,
                                              &pipeline);
   if (result != VK_SUCCESS) {
      if (result == VK_ERROR_OUT_OF_HOST_MEMORY)
         v3dv_flag_oom(cmd_buffer, NULL);
      return;
   }
   assert(pipeline && pipeline->pipeline);

   /* Emit clear rects */
   v3dv_cmd_buffer_meta_state_push(cmd_buffer, false);

   VkCommandBuffer cmd_buffer_handle = v3dv_cmd_buffer_to_handle(cmd_buffer);
   v3dv_CmdPushConstants(cmd_buffer_handle,
                         cmd_buffer->device->meta.depth_clear.p_layout,
                         VK_SHADER_STAGE_FRAGMENT_BIT, 0, 16,
                         clear_color->float32);

   v3dv_CmdBindPipeline(cmd_buffer_handle,
                        VK_PIPELINE_BIND_POINT_GRAPHICS,
                        pipeline->pipeline);

   uint32_t dynamic_states = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR;

   for (uint32_t i = 0; i < rect_count; i++) {
      const VkViewport viewport = {
         .x = rects[i].rect.offset.x,
         .y = rects[i].rect.offset.y,
         .width = rects[i].rect.extent.width,
         .height = rects[i].rect.extent.height,
         .minDepth = 0.0f,
         .maxDepth = 1.0f
      };
      v3dv_CmdSetViewport(cmd_buffer_handle, 0, 1, &viewport);
      v3dv_CmdSetScissor(cmd_buffer_handle, 0, 1, &rects[i].rect);

      if (is_layered) {
         for (uint32_t layer_offset = 0; layer_offset < rects[i].layerCount;
              layer_offset++) {
            uint32_t layer = rects[i].baseArrayLayer + layer_offset;
            v3dv_CmdPushConstants(cmd_buffer_handle,
                                  cmd_buffer->device->meta.depth_clear.p_layout,
                                  VK_SHADER_STAGE_GEOMETRY_BIT, 16, 4, &layer);
            v3dv_CmdDraw(cmd_buffer_handle, 4, 1, 0, 0);
         }
      } else {
         assert(rects[i].baseArrayLayer == 0 && rects[i].layerCount == 1);
         v3dv_CmdDraw(cmd_buffer_handle, 4, 1, 0, 0);
      }
   }

   /* Subpass pipelines can't be cached because they include a reference to the
    * render pass currently bound by the application, which means that we need
    * to destroy them manually here.
    */
   assert(!pipeline->cached);
   v3dv_cmd_buffer_add_private_obj(
      cmd_buffer, (uintptr_t)pipeline,
      (v3dv_cmd_buffer_private_obj_destroy_cb) destroy_color_clear_pipeline);

   v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dynamic_states, false);
}

/* Emits a scissored quad, clearing the depth aspect by writing to gl_FragDepth
 * and the stencil aspect by using stencil testing.
 */
static void
emit_subpass_ds_clear_rects(struct v3dv_cmd_buffer *cmd_buffer,
                            struct v3dv_render_pass *pass,
                            struct v3dv_subpass *subpass,
                            VkImageAspectFlags aspects,
                            const VkClearDepthStencilValue *clear_ds,
                            bool is_layered,
                            bool all_rects_same_layers,
                            uint32_t rect_count,
                            const VkClearRect *rects)
{
   /* Skip if attachment is unused in the current subpass */
   const uint32_t attachment_idx = subpass->ds_attachment.attachment;
   if (attachment_idx == VK_ATTACHMENT_UNUSED)
      return;

   /* Obtain a pipeline for this clear */
   assert(attachment_idx < cmd_buffer->state.pass->attachment_count);
   struct v3dv_meta_depth_clear_pipeline *pipeline = NULL;
   VkResult result = get_depth_clear_pipeline(cmd_buffer->device,
                                              aspects,
                                              pass,
                                              cmd_buffer->state.subpass_idx,
                                              attachment_idx,
                                              is_layered,
                                              &pipeline);
   if (result != VK_SUCCESS) {
      if (result == VK_ERROR_OUT_OF_HOST_MEMORY)
         v3dv_flag_oom(cmd_buffer, NULL);
      return;
   }
   assert(pipeline && pipeline->pipeline);

   /* Emit clear rects */
   v3dv_cmd_buffer_meta_state_push(cmd_buffer, false);

   VkCommandBuffer cmd_buffer_handle = v3dv_cmd_buffer_to_handle(cmd_buffer);
   v3dv_CmdPushConstants(cmd_buffer_handle,
                         cmd_buffer->device->meta.depth_clear.p_layout,
                         VK_SHADER_STAGE_FRAGMENT_BIT, 0, 4,
                         &clear_ds->depth);

   v3dv_CmdBindPipeline(cmd_buffer_handle,
                        VK_PIPELINE_BIND_POINT_GRAPHICS,
                        pipeline->pipeline);

   uint32_t dynamic_states = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR;
   if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
      v3dv_CmdSetStencilReference(cmd_buffer_handle,
                                  VK_STENCIL_FACE_FRONT_AND_BACK,
                                  clear_ds->stencil);
      v3dv_CmdSetStencilWriteMask(cmd_buffer_handle,
                                  VK_STENCIL_FACE_FRONT_AND_BACK, 0xff);
      v3dv_CmdSetStencilCompareMask(cmd_buffer_handle,
                                    VK_STENCIL_FACE_FRONT_AND_BACK, 0xff);
      dynamic_states |= VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK |
                        VK_DYNAMIC_STATE_STENCIL_WRITE_MASK |
                        VK_DYNAMIC_STATE_STENCIL_REFERENCE;
   }

   for (uint32_t i = 0; i < rect_count; i++) {
      const VkViewport viewport = {
         .x = rects[i].rect.offset.x,
         .y = rects[i].rect.offset.y,
         .width = rects[i].rect.extent.width,
         .height = rects[i].rect.extent.height,
         .minDepth = 0.0f,
         .maxDepth = 1.0f
      };
      v3dv_CmdSetViewport(cmd_buffer_handle, 0, 1, &viewport);
      v3dv_CmdSetScissor(cmd_buffer_handle, 0, 1, &rects[i].rect);
      if (is_layered) {
         for (uint32_t layer_offset = 0; layer_offset < rects[i].layerCount;
              layer_offset++) {
            uint32_t layer = rects[i].baseArrayLayer + layer_offset;
            v3dv_CmdPushConstants(cmd_buffer_handle,
                                  cmd_buffer->device->meta.depth_clear.p_layout,
                                  VK_SHADER_STAGE_GEOMETRY_BIT, 4, 4, &layer);
            v3dv_CmdDraw(cmd_buffer_handle, 4, 1, 0, 0);
         }
      } else {
         assert(rects[i].baseArrayLayer == 0 && rects[i].layerCount == 1);
         v3dv_CmdDraw(cmd_buffer_handle, 4, 1, 0, 0);
      }
   }

   v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dynamic_states, false);
}

static void
gather_layering_info(uint32_t rect_count, const VkClearRect *rects,
                     bool *is_layered, bool *all_rects_same_layers)
{
   *all_rects_same_layers = true;

   uint32_t min_layer = rects[0].baseArrayLayer;
   uint32_t max_layer = rects[0].baseArrayLayer + rects[0].layerCount - 1;
   for (uint32_t i = 1; i < rect_count; i++) {
      if (rects[i].baseArrayLayer != rects[i - 1].baseArrayLayer ||
          rects[i].layerCount != rects[i - 1].layerCount) {
         *all_rects_same_layers = false;
         min_layer = MIN2(min_layer, rects[i].baseArrayLayer);
         max_layer = MAX2(max_layer, rects[i].baseArrayLayer +
                                     rects[i].layerCount - 1);
      }
   }

   *is_layered = !(min_layer == 0 && max_layer == 0);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer,
                         uint32_t attachmentCount,
                         const VkClearAttachment *pAttachments,
                         uint32_t rectCount,
                         const VkClearRect *pRects)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

   /* We can only clear attachments in the current subpass */
   assert(attachmentCount <= 5); /* 4 color + D/S */

   struct v3dv_render_pass *pass = cmd_buffer->state.pass;

   assert(cmd_buffer->state.subpass_idx < pass->subpass_count);
   struct v3dv_subpass *subpass =
      &cmd_buffer->state.pass->subpasses[cmd_buffer->state.subpass_idx];

   /* Emit a clear rect inside the current job for this subpass. For layered
    * framebuffers, we use a geometry shader to redirect clears to the
    * appropriate layers.
    */
   bool is_layered, all_rects_same_layers;
   gather_layering_info(rectCount, pRects, &is_layered, &all_rects_same_layers);
   for (uint32_t i = 0; i < attachmentCount; i++) {
      if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
         emit_subpass_color_clear_rects(cmd_buffer, pass, subpass,
                                        pAttachments[i].colorAttachment,
                                        &pAttachments[i].clearValue.color,
                                        is_layered, all_rects_same_layers,
                                        rectCount, pRects);
      } else {
         emit_subpass_ds_clear_rects(cmd_buffer, pass, subpass,
                                     pAttachments[i].aspectMask,
                                     &pAttachments[i].clearValue.depthStencil,
                                     is_layered, all_rects_same_layers,
                                     rectCount, pRects);
      }
   }
}
