Update on "[ET-VK] Used hashed layout instead of axis map UBO"

SS-JIA · SS-JIA · commit b43aa880daf4 · 2024-10-28T14:40:53.000-07:00
## Context #6358 showed that passing in the axis map of a tensor via a specialization constant allows shaders to utilize the axis map in indexing calculations with minimal impact on latency. This diff extends that idea and introduces the concept of a hashed layout. The hashed layout is a 32-bit integer where: 1. Bits 28-31: `axis_map[0]` 2. Bits 24-27: `axis_map[1]` 3. Bits 20-23: `axis_map[2]` 4. Bits 16-19: `axis_map[3]` 5. Bits 12-15: `packed_dim` 6. Bits 0-11: unused Essentially, the integer is divided into chunks of 4 bits, and each chunk is used to represent a value from the `axis_map` + `packed_dim`. This way, the entire description of how the tensor is represented as a texture can be passed into a compute shader with a single specialization constant. Within the compute shader, the axis map and packed dim can be extracted like so: ``` ${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); const lowp int in_packed_dim = unhash_packed_dim(in_layout); ``` Note that `lowp` can be used because the expected values are limited by the dimensionality of the tensor, so only small values are expected. ## Changes 1. Introduce `hashed_layout` 2. Replace all uses of `axis_map_ubo` with `hashed_layout` 3. Remove `axis_map_ubo` from `vTensor`. This also reduces the size of the class. Differential Revision: [D65085141](https://our.internmc.facebook.com/intern/diff/D65085141/) [ghstack-poisoned]
diff --git a/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.glsl
@@ -21,12 +21,13 @@ layout(std430) buffer;
 ${layout_declare_buffer(B, "w", "nchw_out", "int")}
 ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
 ${layout_declare_ubo(B, "ivec4", "tensor_sizes")}
-${layout_declare_ubo(B, "ivec4", "axis_map")}
 ${layout_declare_ubo(B, "int", "out_numel")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
+${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")}
+const lowp ivec4 axis_map = unhash_axis_map(t_layout);
+const lowp int packed_dim = unhash_packed_dim(t_layout);
 
 void main() {
   const int out_buf_idx = int(gl_GlobalInvocationID.x);
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl
@@ -23,11 +23,12 @@ layout(std430) buffer;
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_buffer(B, "r", "nchw_in", "int")}
 ${layout_declare_ubo(B, "ivec4", "sizes")}
-${layout_declare_ubo(B, "ivec4", "axis_map")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
+${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")}
+const lowp ivec4 axis_map = unhash_axis_map(t_layout);
+const lowp int packed_dim = unhash_packed_dim(t_layout);
 
 /*
  * Extends sign of int8
diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp
@@ -126,13 +126,12 @@ void record_bitw8_image_to_nchw_nobitw8buffer_op(
       pipeline_barrier,
       global_wg_size,
       adaptive_work_group_size(global_wg_size),
-      {v_src.packed_dim()},
+      {v_src.hashed_layout()},
       VK_NULL_HANDLE,
       0,
       dst_buffer.buffer(),
       v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
       v_src.sizes_ubo(),
-      v_src.axis_map_ubo(),
       v_src.numel_ubo());
 }
 
@@ -335,7 +334,7 @@ void record_matmul_texture3d(
       pipeline_barrier,
       global_wg_size,
       {8, 8, 1},
-      {out.packed_dim(), mat1.packed_dim(), mat2.packed_dim()},
+      {out.hashed_layout(), mat1.hashed_layout(), mat2.hashed_layout()},
       VK_NULL_HANDLE,
       0,
       out.image(
@@ -346,11 +345,8 @@ void record_matmul_texture3d(
       mat2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
       out.sizes_ubo(),
       out.logical_limits_ubo(),
-      out.axis_map_ubo(),
       mat1.sizes_ubo(),
-      mat1.axis_map_ubo(),
-      mat2.sizes_ubo(),
-      mat2.axis_map_ubo());
+      mat2.sizes_ubo());
 }
 
 //
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -1602,8 +1602,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
 
   // +2: alpha UBO, broadcast UBO for arithmetic shader
   // +1: t.sizes_ubo() for arithmetic shader output c
-  // +1: t.axis_map_ubo() for arithmetic shader output c
-  expected_vma_allocation_count += 4;
+  expected_vma_allocation_count += 3;
   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
 
   IOValueRef d = graph.add_input_tensor(
@@ -1612,9 +1611,8 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
       /*shared_object_idx = */ 2);
 
   // +1: t.sizes_ubo() uniform buffer for staging shader
-  // +1: t.axis_map_ubo() uniform buffer for staging shader
   // +1: staging buffer for the input tensor
-  expected_vma_allocation_count += 3;
+  expected_vma_allocation_count += 2;
   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
 
   ValueRef e = graph.add_tensor(
@@ -1627,8 +1625,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
 
   // +2: alpha UBO, broadcast UBO for arithmetic shader
   // +1: t.sizes_ubo() for arithmetic shader output e
-  // +1: t.axis_map_ubo() for arithmetic shader output e
-  expected_vma_allocation_count += 4;
+  expected_vma_allocation_count += 3;
   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
 
   IOValueRef out = {};