Commit e62a4ef

[ET-VK] Use dim order when converting buffer index to tensor index (#11622)
## Changes

* Update callsites to `bufi_to_tidx` to account for the tensor dim order.
* Remove existing functions which do not accept the dim order as an argument.

## Motivation

As mentioned in the below diff, the dim order is required to properly convert a linear buffer index into an N-dimensional tensor index using a tensor's strides. Technically the dim order can be inferred from the strides array by performing an index sort, but for the sake of efficiency it is better to pass the dim order directly into the compute shader.

Currently, the `bufi_to_tidx` function that performs the conversion between buffer index and tensor index assumes that the dim order follows a specific pattern derived from the packed dim it receives as an input. However, the actual dim order is not guaranteed to match this assumption. Furthermore, there is an existing bug when calling `bufi_to_tidx` without providing `packed_dim`: in that case the function infers the packed dim by finding the first dim with a stride of 1. This breaks when multiple dims have a stride of 1, which can happen when some dims have a size of 1; the wrong packed dim may then be inferred, and the assumed dim order is completely wrong.

To address these issues, make it standard to either account for the tensor's dim order when converting a buffer index to a tensor index, or to explicitly call out an assumption about the tensor's dim order (e.g. that the tensor is contiguous).

## Performance Impact

* None expected.

Differential Revision: [D76393428](https://our.internmc.facebook.com/intern/diff/D76393428/)
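
To make the failure mode concrete, here is a hypothetical C++ model of the indexing logic described above. It is not the shader code or the ExecuTorch API; `ivec4`, the WHCN dim numbering (0 = W, 1 = H, 2 = C, 3 = N), and the example sizes are assumptions for illustration. `find_packed_dim` mirrors the removed stride-based inference, and `bufi_to_tidx` mirrors the new dim-order-based conversion.

```cpp
#include <array>
#include <cassert>

using ivec4 = std::array<int, 4>;

// Old helper: infer the packed dim as the first dim with stride == 1.
int find_packed_dim(const ivec4& strides) {
  for (int i = 0; i < 4; ++i) {
    if (strides[i] == 1) return i;
  }
  return 0;
}

// New conversion: walk dims from the slowest-moving (dim_order[3]) to the
// fastest (dim_order[0]), peeling the buffer index apart with per-dim strides.
ivec4 bufi_to_tidx(int bufi, const ivec4& strides, const ivec4& dim_order) {
  ivec4 idx{};
  for (int i = 3; i >= 0; --i) {
    const int dim = dim_order[i];
    idx[dim] = bufi / strides[dim];
    bufi %= strides[dim];
  }
  return idx;
}

int main() {
  // H-packed tensor with sizes (W, H, C, N) = (3, 1, 4, 2). Because H has
  // size 1, both W and H end up with a stride of 1, so stride-based inference
  // picks W (dim 0) even though the true packed dim is H (dim 1). The old
  // code path would then put the residual index into H (size 1) instead of W.
  const ivec4 strides = {1, 1, 3, 12};
  const ivec4 dim_order = {1, 0, 2, 3};  // fastest -> slowest: H, W, C, N

  assert(find_packed_dim(strides) == 0);  // wrong guess: W instead of H

  // With the dim order passed in explicitly, buffer index 7 decodes to the
  // in-bounds tensor index (W=1, H=0, C=2, N=0).
  const ivec4 tidx = bufi_to_tidx(7, strides, dim_order);
  assert((tidx == ivec4{1, 0, 2, 0}));
  return 0;
}
```
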
1 parent cef2094 commit e62a4ef

File tree

14 files changed: +82 / -92 lines


backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl

Lines changed: 6 additions & 7 deletions
@@ -48,19 +48,18 @@ $else:

 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

+${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
+${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
+${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")}
+
 $if STORAGE == "buffer":
-  ${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")}
-  ${layout_declare_spec_const(C, "int", "in_packed_dim", "DEFAULT_LAYOUT")}
-  ${layout_declare_spec_const(C, "int", "other_packed_dim", "DEFAULT_LAYOUT")}
+  const lowp ivec4 out_dim_order = unhash_dim_order(out_layout);
 $else:
-  ${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
   const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
   const lowp int packed_dim = unhash_packed_dim(out_layout);

-  ${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
   const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);

-  ${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")}
   const lowp ivec4 other_axis_map = unhash_axis_map(other_layout);

 #ifdef USING_BUFFER
@@ -77,7 +76,7 @@ void main() {
     return;
   }

-  const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim);
+  const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order);
   const ivec4 in_tidx = min(out_tidx, in_sizes - 1);
   const ivec4 other_tidx = min(out_tidx, other_sizes - 1);

backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h

Lines changed: 25 additions & 31 deletions
@@ -68,21 +68,6 @@
  */
 #define mod4(x) ((x) & 3)

-/*
- * Find the packed dimension of a tensor given its strides. The packed dimension
- * is the "fastest moving" dimension which will have a stride of 1.
- */
-int find_packed_dim(const ivec4 strides) {
-  int packed_dim = 0;
-  for (int i = 0; i <= 3; i++) {
-    if (strides[i] == 1) {
-      packed_dim = i;
-      break;
-    }
-  }
-  return packed_dim;
-}
-
 /*
  * Get the staging buffer indices that contain the data of the texel that
  * corresponds to the provided tensor index. Since the texel have 4 elements,
@@ -129,27 +114,26 @@ int tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes) {
       tidx.x;
 }

-// TODO(ssjia): make this function use dim order so that it can work with any
-// dim order. Currently it assumes that the dim order is contiguous, except for
-// the packed dim.
-ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const int packed_dim) {
+ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const ivec4 dim_order) {
   ivec4 idx;
   for (int i = 3; i >= 0; i--) {
-    if (i != packed_dim) {
-      idx[i] = bufi / strides[i];
-      bufi %= strides[i];
-    }
+    int dim = dim_order[i];
+    idx[dim] = bufi / strides[dim];
+    bufi %= strides[dim];
   }
-  idx[packed_dim] = bufi;
   return idx;
 }

-// Convenience overload of the above function, which will determine the packed
-// dim from the strides automatically so it doesn't have to be passed in as a
-// function argument.
-ivec4 bufi_to_tidx(const int bufi, const ivec4 strides) {
-  int packed_dim = find_packed_dim(strides);
-  return bufi_to_tidx(bufi, strides, packed_dim);
+/*
+ * bufi_to_tidx but assumes that the tensor is contiguous
+ */
+ivec4 contiguous_bufi_to_tidx(int bufi, const ivec4 strides) {
+  ivec4 idx;
+  for (int i = 3; i >= 0; i--) {
+    idx[i] = bufi / strides[i];
+    bufi %= strides[i];
+  }
+  return idx;
 }

 int tidx_to_bufi(const ivec4 tidx, ivec4 strides) {
@@ -269,12 +253,22 @@ ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) {
  * e.g. 0x11021, 1 -> ivec4(1, 2, 0, 1)
  */
 #define unhash_axis_map(hash) \
-  ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf))
+  (ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf)))
+
+/*
+ *
+ */
+#define unhash_dim_order(hash) \
+  (ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf)))

 #define unhash_packed_dim(hash) int(hash >> 16 & 0xf)

 #define DEFAULT_LAYOUT 0x02210

+#define DEFAULT_DIM_ORDER 0x03210
+
+#define DEFAULT_DIM_ORDER_IVEC4 ivec4(0, 1, 2, 3)
+
 /************************
  * Deprecated Functions *
  ************************/
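
For reference, the new `unhash_dim_order` macro and `DEFAULT_DIM_ORDER` constant pack one dim index into each 4-bit nibble of the hashed layout. Below is a minimal C++ sketch of that unpacking; the function is an illustrative stand-in, not the backend's C++ API.

```cpp
#include <array>
#include <cassert>
#include <cstdint>

using ivec4 = std::array<int, 4>;

// Mirror of the GLSL unhash_dim_order macro: nibble i of the hash holds
// dim_order[i].
ivec4 unhash_dim_order(uint32_t hash) {
  return {static_cast<int>(hash & 0xf),
          static_cast<int>((hash >> 4) & 0xf),
          static_cast<int>((hash >> 8) & 0xf),
          static_cast<int>((hash >> 12) & 0xf)};
}

int main() {
  // DEFAULT_DIM_ORDER (0x03210) decodes to the same value as
  // DEFAULT_DIM_ORDER_IVEC4, i.e. ivec4(0, 1, 2, 3).
  assert((unhash_dim_order(0x03210) == ivec4{0, 1, 2, 3}));
  return 0;
}
```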

backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ void main() {
     return;
   }

-  const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, 0);
+  const ivec4 out_tidx = contiguous_bufi_to_tidx(out_bufi, out_strides);

   const FLOAT_T scale = t_scales[out_tidx.x];

backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl

Lines changed: 8 additions & 9 deletions
@@ -10,8 +10,8 @@ ${define_required_extensions(DTYPE)}

 layout(std430) buffer;

-${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
-${layout_declare_tensor(1, "r", "nchw_in", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "nchw_in", DTYPE, STORAGE)}

 $if USE_PUSH_CONST:
   layout(push_constant) uniform restrict Block {
@@ -20,15 +20,14 @@ $if USE_PUSH_CONST:
     int numel;
   };
 $else:
-  ${layout_declare_ubo(2, "ivec4", "out_sizes")}
-  ${layout_declare_ubo(3, "ivec4", "out_strides")}
-  ${layout_declare_ubo(4, "int", "numel")}
+  ${layout_declare_ubo(B, "ivec4", "out_sizes")}
+  ${layout_declare_ubo(B, "ivec4", "out_strides")}
+  ${layout_declare_ubo(B, "int", "numel")}

 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

-// This constant is unused in this shader but is kept so that the signature is
-// consistent with nchw_to_image.
-${layout_declare_spec_const(C, "int", "UNUSED_layout", "0")}
+${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_DIM_ORDER")}
+const lowp ivec4 out_dim_order = unhash_dim_order(out_layout);
 ${layout_declare_spec_const(C, "int", "transpose_hw", "0")}

 void main() {
@@ -37,7 +36,7 @@ void main() {
     return;
   }

-  ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides);
+  ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order);

   ivec4 sizes = out_sizes;
   if (transpose_hw == 1) {

backends/vulkan/runtime/graph/ops/glsl/select.glslh

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,8 @@
 #ifndef SELECT_GLSLH
 #define SELECT_GLSLH

+#ifndef USING_BUFFER
+
 /*
  * Enable the fast path if a texel loaded from the input texture can be used as
  * is to store to the output texture. The following conditions must be met:
@@ -29,6 +31,8 @@ bool can_use_fast_path() {
   return true;
 }

+#endif // USING_BUFFER
+
 /*
  * Given an output tensor index, return the corresponding input tensor index for
  * the select operator. This is done by "inserting" the select index at the

backends/vulkan/runtime/graph/ops/glsl/slice.glslh

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,8 @@
 #ifndef SLICE_GLSLH
 #define SLICE_GLSLH

+#ifndef USING_BUFFER
+
 /**
  * Enable the fast path if a texel loaded from the input texture can be used as
  * is to store to the output texture. The following conditions must be met:
@@ -26,6 +28,8 @@ bool can_use_fast_path() {
   return true;
 }

+#endif // USING_BUFFER
+
 /*
  * Converts output tensor indices to input tensor indices for the slice operation.
  * This function maps the output indices to the corresponding input indices based on

backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl

Lines changed: 5 additions & 3 deletions
@@ -37,8 +37,10 @@ layout(push_constant) uniform restrict Block {
   int selected_dim;
 };

-${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")}
-${layout_declare_spec_const(C, "int", "in_packed_dim", "DEFAULT_LAYOUT")}
+${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
+${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
+
+const lowp ivec4 out_dim_order = unhash_dim_order(out_layout);

 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

@@ -50,7 +52,7 @@ void main() {
     return;
   }

-  const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim);
+  const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order);
   ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);

   const int in_bufi = tidx_to_bufi(in_tidx, in_strides);

backends/vulkan/runtime/graph/ops/glsl/where.glsl

Lines changed: 8 additions & 20 deletions
@@ -37,40 +37,28 @@ $if STORAGE == "buffer":
   ${layout_declare_ubo(B, "ivec4", "cond_strides")}
   ${layout_declare_ubo(B, "ivec4", "self_strides")}
   ${layout_declare_ubo(B, "ivec4", "other_strides")}
-
-  ${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")}
-  ${layout_declare_spec_const(C, "int", "cond_packed_dim", "DEFAULT_LAYOUT")}
-  ${layout_declare_spec_const(C, "int", "self_packed_dim", "DEFAULT_LAYOUT")}
-  ${layout_declare_spec_const(C, "int", "other_packed_dim", "DEFAULT_LAYOUT")}
 $else:
   ${layout_declare_ubo(B, "ivec3", "out_limits")}

+${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_DIM_ORDER")}
+
+const lowp ivec4 out_dim_order = unhash_dim_order(out_layout);
+
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

 #ifdef USING_BUFFER

 void main() {
   int out_bufi = int(gl_GlobalInvocationID.x);
-  // ivec4 tidx = ivec4(gl_GlobalInvocationID, 0);
-  // int out_bufi = tidx_to_bufi(tidx, out_strides);
-  // int cond_bufi = tidx_to_bufi(tidx, cond_strides);
-  // int self_bufi = tidx_to_bufi(tidx, self_strides);
-  // int other_bufi = tidx_to_bufi(tidx, other_strides);
   if (out_bufi >= out_numl) {
     return;
   }

-  const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim);
-  out_bufi = tidx_to_bufi(out_tidx, out_strides);
-
-  const ivec4 cond_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim);
-  const int cond_bufi = tidx_to_bufi(cond_tidx, cond_strides);
-
-  const ivec4 self_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim);
-  const int self_bufi = tidx_to_bufi(self_tidx, self_strides);
+  const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order);

-  const ivec4 other_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim);
-  const int other_bufi = tidx_to_bufi(other_tidx, other_strides);
+  const int cond_bufi = tidx_to_bufi(out_tidx, cond_strides);
+  const int self_bufi = tidx_to_bufi(out_tidx, self_strides);
+  const int other_bufi = tidx_to_bufi(out_tidx, other_strides);

   COND_T cond = t_condition[cond_bufi] ;
   T v_self = t_self[self_bufi];
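
The rewritten buffer path computes a single `out_tidx` from the output's dim order and then maps it into each operand with that operand's own strides. The sketch below is a C++ stand-in (not the GLSL header) for how `tidx_to_bufi` enables that reuse; the struct and function names are illustrative.

```cpp
#include <array>

using ivec4 = std::array<int, 4>;

// Inner product of a tensor index with a strides vector, mirroring the role
// of tidx_to_bufi in indexing_utils.h.
int tidx_to_bufi(const ivec4& tidx, const ivec4& strides) {
  return tidx[0] * strides[0] + tidx[1] * strides[1] +
         tidx[2] * strides[2] + tidx[3] * strides[3];
}

// Usage mirroring the shader body: one out_tidx, three independent buffer
// indices, each computed with that tensor's own strides.
struct WhereIndices {
  int cond_bufi;
  int self_bufi;
  int other_bufi;
};

WhereIndices gather_indices(
    const ivec4& out_tidx,
    const ivec4& cond_strides,
    const ivec4& self_strides,
    const ivec4& other_strides) {
  return {
      tidx_to_bufi(out_tidx, cond_strides),
      tidx_to_bufi(out_tidx, self_strides),
      tidx_to_bufi(out_tidx, other_strides)};
}
```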

backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp

Lines changed: 3 additions & 3 deletions
@@ -143,9 +143,9 @@ void add_binary_op_buffer_node(
           PushConstantDataInfo(&alpha_val, sizeof(float)),
       }},
       // Specialization Constants
-      {graph.packed_dim_of(out),
-       graph.packed_dim_of(in1),
-       graph.packed_dim_of(in2)},
+      {graph.hashed_layout_of(out),
+       graph.hashed_layout_of(in1),
+       graph.hashed_layout_of(in2)},
       // Resize Args
       {},
       // Resizing Logic

backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp

Lines changed: 4 additions & 0 deletions
@@ -43,6 +43,10 @@ void check_linear_qcsnw_args(
     VK_CHECK_COND(
         utils::val_at(-1, scales_sizes) == utils::val_at(-2, qmat2_sizes));
   }
+
+  if (graph.is_buffer_storage(out)) {
+    VK_CHECK_COND(graph.is_contiguous(out));
+  }
 }

 void resize_linear_qcsnw_node(
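
This check pairs with the switch to `contiguous_bufi_to_tidx` in linear_qcsnw.glsl: that helper is only valid when the output's strides are the contiguous ones for its sizes. The sketch below illustrates the condition under the WHCN convention used above; the helper names and the stride formula are my own illustration, not the `ComputeGraph` implementation.

```cpp
#include <array>

using ivec4 = std::array<int, 4>;

// Strides of a contiguous WHCN buffer tensor: the running product of the
// sizes of the faster-moving dims, with dim 0 (W) fastest.
ivec4 contiguous_strides(const ivec4& sizes) {
  ivec4 strides{};
  int running = 1;
  for (int dim = 0; dim < 4; ++dim) {
    strides[dim] = running;
    running *= sizes[dim];
  }
  return strides;
}

// A tensor is "contiguous" in this sense exactly when its strides match the
// ones derived from its sizes, which is what lets the shader assume the
// implicit dim order (0, 1, 2, 3).
bool is_contiguous(const ivec4& sizes, const ivec4& strides) {
  return strides == contiguous_strides(sizes);
}
```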
