 
 #define PRECISION ${PRECISION}
 
-${define_required_extensions("uint8")}
-${define_required_extensions("int8")}
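+// NO_INT8_BUFFERS is presumably defined for devices that do not support
+// 8-bit storage buffers; the uint8 extension is then skipped and the input
+// buffer is read as packed 32-bit uints instead (see extract_comp below).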
+$if not NO_INT8_BUFFERS:
+  ${define_required_extensions("uint8")}
+$if STORAGE == "buffer":
+  ${define_required_extensions("int8")}
 
 layout(std430) buffer;
 
 ${layout_declare_tensor(B, "w", "t_qmat2", "uint8", STORAGE, is_scalar_array=False)}
-${layout_declare_tensor(B, "r", "nchw_4x2", "uint8", "buffer")}
+$if NO_INT8_BUFFERS:
+  ${layout_declare_tensor(B, "r", "nchw_4x2", "uint", "buffer")}
+$else:
+  ${layout_declare_tensor(B, "r", "nchw_4x2", "uint8", "buffer")}
 
 layout(push_constant) uniform restrict Block {
   ivec4 qmat2_sizes;
 };
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-uint8_t get_first(const uint8_t packed) {
-  return uint8_t((packed & 0xF0) >> 4);
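+// BUF_T is the scalar type of the input buffer (uint when int8 buffers are
+// unavailable) and UVEC4_T the 4-component type used to assemble output
+// texels.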
+$if NO_INT8_BUFFERS:
+  #define BUF_T uint
+$else:
+  #define BUF_T uint8_t
+
+$if STORAGE == "buffer":
+  #define UVEC4_T u8vec4
+$else:
+  #define UVEC4_T uvec4
+
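+// get_first/get_second return the high and low nibble of a packed byte;
+// combine packs two 4-bit values back into a single byte.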
+uint get_first(const BUF_T packed) {
+  return (packed & 0xF0) >> 4;
 }
 
-uint8_t get_second(const uint8_t packed) {
-  return uint8_t(packed & 0x0F);
+uint get_second(const BUF_T packed) {
+  return packed & 0x0F;
 }
 
-uint8_t combine(const uint8_t first, const uint8_t second) {
-  return uint8_t(first << 4 | second);
+uint combine(const uint first, const uint second) {
+  return (first << 4 | second);
 }
 
-/*
- * This shader packs the weight tensor into a texture.
- *
- * The original tensor has a (W, H) shape of (K / 2, N) and each scalar element
- * is a uint8_t, which contains 2 packed 4-bit uint values.
- *
- * The transform performed by this shader is to first transpose the tensor, so
- * the shape of the packed tensor becomes (N / 2, K). Then, the 4-bit integers
- * are re-packed in groups of 8. Within each group of 4 uint8_t values, the
- * "left" 4 bits of each value contain the 0, 1, 2, 3 4-bit values, and the
- * "right" 4 bits of each value contain the 4, 5, 6, 7 4-bit values.
- *
- * As a concrete example, consider the following weight tensor. The | demarks
- * the packing boundary, so 1|2 represents a single uint8_t value with 1 in the
- * leftmost 4 bits and 2 in the rightmost 4 bits.
- *
- *  1| 2,  3| 4,  5| 6,  7| 8,
- *  9|10, 11|12, 13|14, 15|16,
- * 17|18, 19|20, 21|22, 23|24,
- * 25|26, 27|28, 29|30, 31|32,
- * 33|34, 35|36, 37|38, 39|40,
- * 41|42, 43|44, 45|46, 47|48,
- * 49|50, 51|52, 53|54, 55|56,
- * 57|58, 59|60, 61|62, 63|64,
- *
- * After packing, the packed tensor would contain
- *
- * 1|33,  9|41, 17|49, 25|57,
- * 2|34, 10|42, 18|50, 26|58,
- * 3|35, 11|43, 19|51, 27|59,
- * 4|36, 12|44, 20|52, 28|60,
- * 5|37, 13|45, 21|53, 29|61,
- * 6|38, 14|46, 22|54, 30|62,
- * 7|39, 15|47, 23|55, 31|63,
- * 8|40, 16|48, 24|56, 32|64,
- *
- * The purpose of interleaving is to make it easier to extract the unpacked
- * values in order using the u8vec4 vectorized type. With the packing in place,
- * the 4-bit values can be extracted via
- *
- * u8vec4 packed;
- * u8vec4 vals_0123 = (packed & 0xF0) >> 4;
- * u8vec4 vals_4567 = packed & 0x0F;
- */
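+// extract_comp returns the idx-th byte of a uint that holds four packed
+// bytes, assuming little-endian byte order within the word.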
+$if NO_INT8_BUFFERS:
+  uint extract_comp(const uint packed4, const uint idx) {
+    return (packed4 >> (idx * 8)) & 0xFF;
+  }
+
 void main() {
   // Each thread writes 2 output texels along the height axis
   ivec2 packed_pos = ivec2(
@@ -102,25 +78,32 @@ void main() {
   int in_numcols = qmat2_sizes.y;
   int in_num_int8_cols = qmat2_sizes.y >> 1;
 
-  uint8_t in_vals[8][2];
+  uint in_vals[8][2];
   for (int r = 0; r < 8; ++r) {
     if (in_row + r < in_numrows) {
-      uint8_t in_val_packed = nchw_4x2[(in_row + r) * in_num_int8_cols + in_int8_col];
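+      // scalar_idx is the flat index of the byte to read; with uint-packed
+      // input it is split into a word index (>> 2) and a byte index (% 4).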
+      uint scalar_idx = (in_row + r) * in_num_int8_cols + in_int8_col;
+      $if NO_INT8_BUFFERS:
+        BUF_T in_val_packed_texel = nchw_4x2[scalar_idx >> 2];
+        const uint packed_idx = scalar_idx % 4;
+        uint in_val_packed = extract_comp(in_val_packed_texel, packed_idx);
+      $else:
+        BUF_T in_val_packed = nchw_4x2[scalar_idx];
+
       in_vals[r][0] = get_first(in_val_packed);
       in_vals[r][1] = get_second(in_val_packed);
     } else {
-      in_vals[r][0] = uint8_t(0);
-      in_vals[r][1] = uint8_t(0);
+      in_vals[r][0] = uint(0);
+      in_vals[r][1] = uint(0);
     }
   }
 
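+  // Each output texel packs row r of this 8-row block with row r + 4: high
+  // nibbles hold rows 0-3, low nibbles rows 4-7, matching the interleaving
+  // described in the (removed) header comment.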
-  u8vec4 out_tex_1 = u8vec4(
+  UVEC4_T out_tex_1 = UVEC4_T(
     combine(in_vals[0][0], in_vals[4][0]),
     combine(in_vals[1][0], in_vals[5][0]),
     combine(in_vals[2][0], in_vals[6][0]),
     combine(in_vals[3][0], in_vals[7][0]));
 
-  u8vec4 out_tex_2 = u8vec4(
+  UVEC4_T out_tex_2 = UVEC4_T(
     combine(in_vals[0][1], in_vals[4][1]),
     combine(in_vals[1][1], in_vals[5][1]),
     combine(in_vals[2][1], in_vals[6][1]),
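For reference, a minimal CPU sketch (Python) of the repacking described in the
removed header comment, assuming a row-major (N, K/2) array of bytes with the
high nibble stored first; the function and variable names are illustrative,
not taken from the codebase:

def repack_int4_weights(src, N, K):
    # Unpack each (N, K/2) byte row into K 4-bit values.
    nib = [[0] * K for _ in range(N)]
    for n in range(N):
        for b in range(K // 2):
            nib[n][2 * b] = (src[n][b] & 0xF0) >> 4
            nib[n][2 * b + 1] = src[n][b] & 0x0F

    # Transpose to (K, N/2) bytes, interleaving each 8-row group: row r of
    # the group goes to the high nibble and row r + 4 to the low nibble, so
    # (packed & 0xF0) >> 4 and packed & 0x0F recover rows 0-3 and 4-7.
    dst = [[0] * (N // 2) for _ in range(K)]
    for k in range(K):
        for j in range(N // 2):
            hi = (j // 4) * 8 + (j % 4)
            dst[k][j] = (nib[hi][k] << 4) | nib[hi + 4][k]
    return dst

# Example: 8 rows of one byte each, i.e. nibbles 0..15 in row-major order.
src = [[0x01], [0x23], [0x45], [0x67], [0x89], [0xAB], [0xCD], [0xEF]]
assert repack_int4_weights(src, N=8, K=2) == [
    [0x08, 0x2A, 0x4C, 0x6E],  # high nibbles: rows 0-3; low nibbles: rows 4-7
    [0x19, 0x3B, 0x5D, 0x7F],
]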