
Commit cc302d9

Author: ssjia (committed)

Update base for Update on "[ET-VK][ez] Enable max_pool2d.default"

max_pool2d_with_indices is already implemented; this diff enables max_pool2d as well by just re-using the same implementation.

Differential Revision: [D81513446](https://our.internmc.facebook.com/intern/diff/D81513446/)

[ghstack-poisoned]

1 parent bd98a93 · commit cc302d9
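
For context on the commit title (a sketch, not part of this change): max_pool2d_with_indices computes the same pooled values as max_pool2d and additionally returns the argmax indices, which is why the existing implementation can simply be reused.

import torch

# Sanity check: max_pool2d with return_indices=True yields the same pooled
# values as plain max_pool2d, plus the argmax indices.
x = torch.randn(1, 3, 8, 8)
out = torch.nn.functional.max_pool2d(x, kernel_size=2)
out_wi, idx = torch.nn.functional.max_pool2d(x, kernel_size=2, return_indices=True)
assert torch.equal(out, out_wi)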

File tree

7 files changed: +132 additions, -65 deletions


backends/vulkan/patterns/quantized_linear.py

Lines changed: 75 additions & 31 deletions
@@ -130,6 +130,40 @@ def __init__(self, mm_node: torch.fx.Node) -> None:

         self.match_found = True

+    def is_weight_only_quantized(self) -> bool:
+        return self.quantize_input_node is None
+
+    def is_weight_pergroup_quantized(self) -> bool:
+        weight_shape = self.weight_node.meta["val"].shape
+        scales_shape = self.weight_scales_node.meta["val"].shape
+        if len(scales_shape) != 2:
+            return False
+
+        # Check that:
+        # height dim of scales is same as height dim of weight (N / output channels dim)
+        # width dim of weight (K / in channels dim) is divisible by width dim of scales
+        # (number of quantization groups)
+        return scales_shape[-2] == weight_shape[-2] and (
+            weight_shape[-1] % scales_shape[-1] == 0
+        )
+
+    def is_weight_perchannel_quantized(self) -> bool:
+        weight_shape = self.weight_node.meta["val"].shape
+        scales_shape = self.weight_scales_node.meta["val"].shape
+        if len(scales_shape) != 1:
+            return False
+
+        # scales should have same size as weight's output channels dim
+        return scales_shape[0] == weight_shape[-2]
+
+    def is_input_static_per_tensor_quantized(self) -> bool:
+        if self.quantize_input_node is None:
+            return False
+
+        # For static per tensor quantization, the scales and zeros
+        # are scalars.
+        return isinstance(self.input_scales_node, float)
+

 linear_anchor_nodes = {
     exir_ops.edge.aten.linear.default,
@@ -227,18 +261,10 @@ def make_linear_q4ga_op(
     ep: ExportedProgram,
     graph_module: torch.fx.GraphModule,
     match: QuantizedLinearMatch,
+    weight_tensor: torch.Tensor,
+    weight_scales_tensor: torch.Tensor,
+    weight_zeros_tensor: torch.Tensor,
 ):
-    weight_tensor = get_param_tensor(ep, match.weight_node)
-    assert weight_tensor is not None
-
-    assert match.weight_scales_node is not None
-    weight_scales_tensor = get_param_tensor(ep, match.weight_scales_node)
-    assert weight_scales_tensor is not None
-
-    assert match.weight_zeros_node is not None
-    weight_zeros_tensor = get_param_tensor(ep, match.weight_zeros_node)
-    assert weight_zeros_tensor is not None
-
     packed_quantized_weight_tensor = pack_4bit_weight_tensor(weight_tensor)
     utils.update_program_state_dict(
         ep, match.weight_node.name, packed_quantized_weight_tensor
@@ -281,23 +307,8 @@ def make_linear_q8ta_q8csw_custom_op(
     ep: ExportedProgram,
     graph_module: torch.fx.GraphModule,
     match: QuantizedLinearMatch,
+    weight_tensor: torch.Tensor,
 ):
-    weight_tensor = get_param_tensor(ep, match.weight_node)
-    assert weight_tensor is not None
-
-    assert match.weight_scales_node is not None
-    weight_scales_tensor = get_param_tensor(ep, match.weight_scales_node)
-    assert weight_scales_tensor is not None
-
-    assert match.weight_zeros_node is not None
-    weight_zeros_tensor = get_param_tensor(ep, match.weight_zeros_node)
-    assert weight_zeros_tensor is not None
-
-    bias_tensor = None
-    if match.bias_node is not None:
-        bias_tensor = get_param_tensor(ep, match.bias_node)
-        assert bias_tensor is not None
-
     first_graph_node = list(graph_module.graph.nodes)[0]
     with graph_module.graph.inserting_before(first_graph_node):
         weight_tensor_name = utils.get_tensor_name(ep, match.weight_node)
@@ -340,7 +351,40 @@ def replace_quantized_linear_patterns(
     graph_module: torch.fx.GraphModule,
     match: QuantizedLinearMatch,
 ):
-    if match.quantize_input_node is None:
-        make_linear_q4ga_op(ep, graph_module, match)
-    else:
-        make_linear_q8ta_q8csw_custom_op(ep, graph_module, match)
+    # Extract relevant tensors
+    weight_tensor = get_param_tensor(ep, match.weight_node)
+    assert weight_tensor is not None
+
+    assert match.weight_scales_node is not None
+    weight_scales_tensor = get_param_tensor(ep, match.weight_scales_node)
+    assert weight_scales_tensor is not None
+
+    assert match.weight_zeros_node is not None
+    weight_zeros_tensor = get_param_tensor(ep, match.weight_zeros_node)
+    assert weight_zeros_tensor is not None
+
+    # Biases not supported at the moment
+    if match.bias_node is not None:
+        return
+
+    # Route to appropriate custom op
+    if (
+        match.is_weight_only_quantized()
+        and match.is_weight_pergroup_quantized()
+        and utils.is_in_4bit_range(weight_tensor)
+    ):
+        make_linear_q4ga_op(
+            ep,
+            graph_module,
+            match,
+            weight_tensor,
+            weight_scales_tensor,
+            weight_zeros_tensor,
+        )
+    elif (
+        match.is_input_static_per_tensor_quantized()
+        and match.is_weight_perchannel_quantized()
+    ):
+        make_linear_q8ta_q8csw_custom_op(ep, graph_module, match, weight_tensor)
+
+    # No-op for unsupported quant patterns
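
For reference, a minimal sketch (not part of this diff) of the weight/scales shapes that the routing above distinguishes, using hypothetical sizes N=32, K=64 and a group size of 16:

import torch

# Hypothetical shapes: weight is [N, K]; per-group scales are
# [N, K // group_size] (2D), per-channel scales are [N] (1D).
N, K, group_size = 32, 64, 16
weight = torch.randint(-8, 8, (N, K), dtype=torch.int8)
pergroup_scales = torch.rand(N, K // group_size)
perchannel_scales = torch.rand(N)

# Mirrors is_weight_pergroup_quantized: scales height matches output channels,
# and K is divisible by the number of quantization groups.
assert pergroup_scales.shape[-2] == weight.shape[-2]
assert weight.shape[-1] % pergroup_scales.shape[-1] == 0

# Mirrors is_weight_perchannel_quantized: one scale per output channel.
assert perchannel_scales.shape[0] == weight.shape[-2]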

backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_int8_compute.glslh

Lines changed: 13 additions & 14 deletions
@@ -42,21 +42,20 @@ void fp_accumulate_with_int8_weight(
   // Weight tile is indexed as w_tile.data[k4][n4][n4i]
   // -> gives packed integer containing the 4x 8-bit quantized values at index
   // (n, k), (n, k + 1), (n, k + 2), (n, k + 3)
+  VEC4_T weight_texel;
 #if TILE_K4 == 1 && TILE_N4 == 1
-  [[unroll]] for (int m = 0; m < TILE_M; ++m) {
-    VEC4_T unpacked_weight_k_row;
-    // n = 0
-    unpacked_weight_k_row = unpack_packed_4xint8(w_tile.data[0][0][0]);
-    accum.data[m][0][0] += dot(in_tile.data[m][0], unpacked_weight_k_row);
-    // n = 1
-    unpacked_weight_k_row = unpack_packed_4xint8(w_tile.data[0][0][1]);
-    accum.data[m][0][1] += dot(in_tile.data[m][0], unpacked_weight_k_row);
-    // n = 2
-    unpacked_weight_k_row = unpack_packed_4xint8(w_tile.data[0][0][2]);
-    accum.data[m][0][2] += dot(in_tile.data[m][0], unpacked_weight_k_row);
-    // n = 3
-    unpacked_weight_k_row = unpack_packed_4xint8(w_tile.data[0][0][3]);
-    accum.data[m][0][3] += dot(in_tile.data[m][0], unpacked_weight_k_row);
+  [[unroll]] for (int k = 0; k < 4; ++k) {
+    // Unpack one column of weights
+    weight_texel = VEC4_T(
+        extract_8bit_from_packed_int_le(w_tile.data[0][0][0], k),
+        extract_8bit_from_packed_int_le(w_tile.data[0][0][1], k),
+        extract_8bit_from_packed_int_le(w_tile.data[0][0][2], k),
+        extract_8bit_from_packed_int_le(w_tile.data[0][0][3], k));
+
+    for (int m = 0; m < TILE_M; ++m) {
+      accum.data[m][0] =
+          fma(VEC4_T(in_tile.data[m][0][k]), weight_texel, accum.data[m][0]);
+    }
   }

 #else
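
For intuition, here is a small Python model of the byte extraction the rewritten loop relies on (an assumption about the behavior of extract_8bit_from_packed_int_le, not code from this repo): byte k of a packed 32-bit integer, taken little-endian and reinterpreted as a signed 8-bit value.

def extract_8bit_le(packed: int, k: int) -> int:
    # Take byte k (little-endian) and reinterpret it as signed int8.
    byte = (packed >> (8 * k)) & 0xFF
    return byte - 256 if byte >= 128 else byte

# 0x7F8001FE packs the signed bytes [-2, 1, -128, 127] in little-endian order.
packed = 0x7F8001FE
assert [extract_8bit_le(packed, k) for k in range(4)] == [-2, 1, -128, 127]

With that, the new loop builds one VEC4_T of weights per k and accumulates via fma, so each input scalar scales a whole texel of weights rather than re-unpacking a weight row for every output column.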

backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp

Lines changed: 3 additions & 2 deletions
@@ -128,8 +128,9 @@ std::vector<int64_t> calculate_input_im2col_sizes(

   // K -> flattened convolution window (adjusted)
   const int64_t K = flattened_kernel_len * groups_val;
-  // M -> number of elements in 2D output plane
-  const int64_t M = out_height * out_width * batches;
+  // M -> number of elements in 2D output plane. This is aligned to the next
+  // multiple of 4 since the im2col shader operates on 4x4 blocks.
+  const int64_t M = utils::align_up_4(out_height * out_width * batches);

   return {M, K};
 }
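
A minimal sketch of the M alignment described in the new comment, assuming utils::align_up_4 rounds its argument up to the next multiple of 4:

def align_up_4(n: int) -> int:
    # Round n up to the next multiple of 4 (unchanged if already aligned).
    return (n + 3) // 4 * 4

# e.g. out_height * out_width * batches = 5 * 5 * 1 = 25 -> M = 28,
# so the im2col shader's 4x4 blocks evenly cover the output rows.
assert align_up_4(25) == 28
assert align_up_4(16) == 16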

backends/vulkan/runtime/graph/ops/impl/utils/QuantizationConfig.h

Lines changed: 2 additions & 2 deletions
@@ -16,7 +16,7 @@ enum class QuantizationGranularity {
   PerChannel,
   PerTensor,
   PerGroup,
-  None,
+  NoQuantization,
 };

 static constexpr QuantizationGranularity kPerChannel =
@@ -26,7 +26,7 @@ static constexpr QuantizationGranularity kPerTensor =
 static constexpr QuantizationGranularity kPerGroup =
     QuantizationGranularity::PerGroup;
 static constexpr QuantizationGranularity kNoQuantization =
-    QuantizationGranularity::None;
+    QuantizationGranularity::NoQuantization;

 struct QuantizationConfig {
   int nbits;

backends/vulkan/test/custom_ops/q8csw_conv2d.cpp

Lines changed: 9 additions & 9 deletions
@@ -395,19 +395,19 @@ std::vector<TestCase> generate_quantized_conv2d_test_cases() {
          std::to_string(config.kernel.w);

       config.test_case_name = prefix + suffix;
-      test_cases.push_back(
-          create_test_case_from_config(config, storage_type, vkapi::kFloat));
+      // The default operator tested is activation + weight quantized conv2d;
+      // however, only test this if the int8 dot product extension is supported
+      if (vkcompute::api::context()
+              ->adapter_ptr()
+              ->supports_int8_dot_product()) {
+        test_cases.push_back(
+            create_test_case_from_config(config, storage_type, vkapi::kFloat));
+      }

       Conv2dConfig wo_quant_config = config;
       wo_quant_config.op_name = "conv2d_q8csw";
       test_cases.push_back(create_test_case_from_config(
           wo_quant_config, storage_type, vkapi::kFloat));
-      // Conv2dConfig config2 = config;
-      // config2.shader_variant_name = "conv2d_q8csw_linear_tiled";
-      // config2.name_suffix = prefix + suffix;
-      // test_cases.push_back(
-      //     create_test_case_from_config(config2, storage_type,
-      //     vkapi::kFloat));
     }
   }

@@ -778,7 +778,7 @@ int main(int argc, char* argv[]) {
       quantized_conv2d_flop_calculator,
       "QuantizedConv2d",
       0,
-      1,
+      10,
       ref_fn);

   return 0;

backends/vulkan/test/custom_ops/q8csw_linear.cpp

Lines changed: 10 additions & 7 deletions
@@ -151,9 +151,9 @@ std::vector<TestCase> generate_quantized_linear_easy_cases() {
   std::vector<TestCase> test_cases;

   // Single simple configuration for debugging
-  int M = 16;
-  int K = 64;
-  int N = 32;
+  int M = 4;
+  int K = 4;
+  int N = 4;

   LinearConfig config = {
       M, // Batch size
@@ -217,9 +217,13 @@ std::vector<TestCase> generate_quantized_linear_test_cases() {
     config.test_case_name = generated_test_case_name;

     for (const auto& storage_type : storage_types) {
-      // Test both activation+weight quantized and weight only quantized
-      test_cases.push_back(
-          create_test_case_from_config(config, storage_type, vkapi::kFloat));
+      if (vkcompute::api::context()
+              ->adapter_ptr()
+              ->supports_int8_dot_product()) {
+        // Test both activation+weight quantized and weight only quantized
+        test_cases.push_back(
+            create_test_case_from_config(config, storage_type, vkapi::kFloat));
+      }

       LinearConfig wo_quant_config = config;
       wo_quant_config.op_name = "linear_q8csw";
@@ -462,7 +466,6 @@ int main(int argc, char* argv[]) {

   ReferenceComputeFunc ref_fn = reference_impl;

-  // Execute easy test cases using the new framework with custom FLOP calculator
   auto results = execute_test_cases(
       generate_quantized_linear_test_cases,
       quantized_linear_flop_calculator,

backends/vulkan/utils.py

Lines changed: 20 additions & 0 deletions
@@ -1172,6 +1172,26 @@ def trace_args_until_placeholder(
     return cur_node, traversed


+def is_in_4bit_range(tensor: torch.Tensor) -> bool:
+    """
+    Check if the given tensor is in the range of 4-bit quantization and is of integer type.
+    """
+    if tensor.dtype not in (torch.int8, torch.uint8):
+        return False
+
+    return tensor.min().item() >= -8 and tensor.max().item() <= 7
+
+
+def is_in_8bit_range(tensor: torch.Tensor) -> bool:
+    """
+    Check if the given tensor is in the range of 8-bit quantization and is of integer type.
+    """
+    if tensor.dtype not in (torch.int8, torch.uint8):
+        return False
+
+    return tensor.min().item() >= -128 and tensor.max().item() <= 127
+
+
 ##
 ## Misc
 ##
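
An illustrative check (not from this diff) of what the new range helpers accept: an int8 tensor with values in [-8, 7] satisfies the 4-bit condition, while a full-range int8 tensor only satisfies the 8-bit one.

import torch

w_4bit = torch.tensor([-8, -1, 0, 7], dtype=torch.int8)
w_8bit = torch.tensor([-128, 0, 127], dtype=torch.int8)

# Mirrors the added helpers: dtype must be int8/uint8 and values must fall in range.
assert w_4bit.min().item() >= -8 and w_4bit.max().item() <= 7        # 4-bit range
assert not (w_8bit.min().item() >= -8 and w_8bit.max().item() <= 7)  # outside 4-bit
assert w_8bit.min().item() >= -128 and w_8bit.max().item() <= 127    # 8-bit range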
