Update on "[ET-VK] Add coop shader for int8 linear"

SS-JIA · SS-JIA · commit ba31c57fa62b · 2025-04-23T11:02:18.000-07:00
Title says it all! ## Changes * Apply the co-operative shader to vector-matrix (`vector * matrix`) computations. Differential Revision: [D73279548](https://our.internmc.facebook.com/intern/diff/D73279548/) [ghstack-poisoned]
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -179,6 +179,11 @@ utils::GPUMemoryLayout ComputeGraph::suggested_memory_layout(
   return utils::kChannelsPacked;
 }
 
+bool ComputeGraph::device_name_contains(const char* substr) {
+  return substr != nullptr && // find(nullptr) is undefined; treat as no match
+      context_->adapter_ptr()->device_name().find(substr) != std::string::npos;
+}
+
 void ComputeGraph::check_no_active_value_ptrs() {
   VK_CHECK_COND(
       values_in_use_ == 0,
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -443,6 +443,15 @@ class ComputeGraph final {
   utils::GPUMemoryLayout suggested_memory_layout(
       const std::vector<int64_t>& sizes);
 
+  inline bool device_is_adreno() { // true iff the adapter's type is ADRENO
+    return context_->adapter_ptr()->device_type() == vkapi::DeviceType::ADRENO;
+  }
+  const std::string& device_name() {
+    return context()->adapter_ptr()->device_name(); // forwarded by reference
+  }
+  // Returns whether the adapter's device name contains `substr`.
+  bool device_name_contains(const char* substr);
+
   //
   // Graph Building
   //
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_coop.glsl
@@ -60,7 +60,7 @@ void main() {
   $if SCALES_STORAGE == "buffer":
     const VEC4_T scales = VEC4_T(t_scales[out_col >> 2]);
   $else:
-    const VEC4_T scales = VEC4_T(texelFetch(t_scales, ivec3(out_col >> 2, 0, 0), 0));
+    const VEC4_T scales = VEC4_T(texelFetch(t_scales, ivec2(out_col >> 2, 0), 0));
 
   [[unroll]] for (int i = 0; i < TILE_ROWS; ++i) {
     partial_c[gid][wid][i] = VEC4_T(0.0);
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_coop.yaml b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_coop.yaml
@@ -10,18 +10,19 @@ q_8w_linear_coop:
     IN_STORAGE: texture3d
     OUT_STORAGE: texture3d
     WEIGHT_STORAGE: texture2d
-    SCALES_STORAGE: buffer
+    SCALES_STORAGE: texture2d
     TILE_ROWS: 4
   generate_variant_forall:
     TILE_ROWS:
       - VALUE: 1
         SUFFIX: o4x1
   shader_variants:
-    - NAME: q_8w_linear_coop_texture3d_texture3d_texture2d_float
-    - NAME: q_8w_linear_coop_buffer_buffer_texture2d_float
+    - NAME: q_8w_linear_coop_texture3d_texture3d_texture2d_texture2d_float
+    - NAME: q_8w_linear_coop_buffer_buffer_texture2d_texture2d_float
       IN_STORAGE: buffer
       OUT_STORAGE: buffer
-    - NAME: q_8w_linear_coop_buffer_buffer_buffer_float
+    - NAME: q_8w_linear_coop_buffer_buffer_buffer_buffer_float
       IN_STORAGE: buffer
       OUT_STORAGE: buffer
       WEIGHT_STORAGE: buffer
+      SCALES_STORAGE: buffer
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.glsl
@@ -53,7 +53,7 @@ void main() {
   $if SCALES_STORAGE == "buffer":
     const VEC4_T scales = VEC4_T(t_scales[out_col >> 2]);
   $else:
-    const VEC4_T scales = VEC4_T(texelFetch(t_scales, ivec3(out_col >> 2, 0, 0), 0));
+    const VEC4_T scales = VEC4_T(texelFetch(t_scales, ivec2(out_col >> 2, 0), 0));
 
   [[unroll]] for (int i = 0; i < TILE_ROWS; ++i) {
     c[i] = VEC4_T(0.0);
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.yaml
@@ -10,7 +10,7 @@ q_8w_linear_tiled:
     IN_STORAGE: texture3d
     OUT_STORAGE: texture3d
     WEIGHT_STORAGE: texture2d
-    SCALES_STORAGE: buffer
+    SCALES_STORAGE: texture2d
     TILE_ROWS: 4
   generate_variant_forall:
     TILE_ROWS:
@@ -21,11 +21,12 @@ q_8w_linear_tiled:
       - VALUE: 6
         SUFFIX: o4x6
   shader_variants:
-    - NAME: q_8w_linear_tiled_texture3d_texture3d_texture2d_float
-    - NAME: q_8w_linear_tiled_buffer_buffer_texture2d_float
+    - NAME: q_8w_linear_tiled_texture3d_texture3d_texture2d_texture2d_float
+    - NAME: q_8w_linear_tiled_buffer_buffer_texture2d_texture2d_float
       IN_STORAGE: buffer
       OUT_STORAGE: buffer
-    - NAME: q_8w_linear_tiled_buffer_buffer_buffer_float
+    - NAME: q_8w_linear_tiled_buffer_buffer_buffer_buffer_float
       IN_STORAGE: buffer
       OUT_STORAGE: buffer
       WEIGHT_STORAGE: buffer
+      SCALES_STORAGE: buffer
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp
@@ -162,15 +162,20 @@ void add_q_8w_linear_tiled_node(
   ValueRef q_mat2 = prepack_standard_hw_transposed(
       graph, q_mat2_data, q_mat2_storage, utils::kWidthPacked);
 
+  utils::StorageType scales_storage = utils::kTexture2D;
+  if (N > max_extent) {
+    scales_storage = utils::kBuffer;
+  }
   ValueRef scales =
-      prepack_standard(graph, scales_data, utils::kBuffer, utils::kWidthPacked);
+      prepack_standard(graph, scales_data, scales_storage, utils::kWidthPacked);
 
   std::string kernel_name =
       use_coop_algorithm ? "q_8w_linear_coop" : "q_8w_linear_tiled";
   kernel_name.reserve(kShaderNameReserve);
   add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
   add_storage_type_suffix(kernel_name, graph.storage_type_of(mat1));
   add_storage_type_suffix(kernel_name, graph.storage_type_of(q_mat2));
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(scales));
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
 
   std::vector<int64_t> mat1_sizes = graph.sizes_of(mat1);
@@ -179,6 +184,9 @@ void add_q_8w_linear_tiled_node(
   if (M % 6 == 0) {
     kernel_name += "_o4x6";
     out_tile_nrows = 6;
+  } else if (M % 4 == 0) {
+    kernel_name += "_o4x4";
+    out_tile_nrows = 4;
   } else if (M % 1 == 0) {
     kernel_name += "_o4x1";
     out_tile_nrows = 1;
@@ -255,6 +263,13 @@ bool can_use_tiled_impl(
 }
 
 bool can_use_coop_impl(ComputeGraph& graph, const ValueRef mat1) {
+  // Skip the coop algorithm on Adreno 702, where manual experimentation shows
+  // it performs worse than the tiled algorithm. The check matches any Adreno
+  // device whose name contains "702".
+  // TODO(ssjia): Use a more robust heuristic than specific device identity.
+  if (graph.device_is_adreno() && graph.device_name_contains("702")) {
+    return false;
+  }
   // Check that the computation is vector * matrix
   return (graph.size_at<int>(-2, mat1) == 1);
 }
diff --git a/backends/vulkan/runtime/vk_api/Adapter.h b/backends/vulkan/runtime/vk_api/Adapter.h
@@ -122,6 +122,15 @@ class Adapter final {
     return physical_device_.timestamp_period;
   }
 
+  // Device Identity
+  inline const std::string& device_name() const {
+    return physical_device_.device_name; // by reference, no copy made
+  }
+
+  inline vkapi::DeviceType device_type() const {
+    return physical_device_.device_type; // returned by value
+  }
+
   // Queue Management
 
   Queue request_queue();