
Commit 1aa9a15

Author: ssjia
Update on "[ET-VK] Quantized Int8 Convolution"

This diff implements int8 quantized conv2d using the quantized linear layer introduced in the diff below. Note that the current implementation does not yet support depthwise convs; a specialized implementation will need to be added for that.

Differential Revision: [D81330809](https://our.internmc.facebook.com/intern/diff/D81330809/)

[ghstack-poisoned]
2 parents: 86581a6 + 6aa4b42
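For context on the overall approach: the convolution is lowered onto a matmul (linear) kernel by first rewriting the input with im2col. Below is a minimal, illustrative C++ sketch of that lowering, assuming stride 1, no padding, and groups = 1; the `im2col` helper here is hypothetical and not the ExecuTorch Vulkan implementation.

```cpp
#include <cstdint>
#include <vector>

// Gather each Kh x Kw x C input window into one row of an [M, K] matrix,
// where M = out_h * out_w and K = C * Kh * Kw.
std::vector<float> im2col(
    const std::vector<float>& input, // [C, H, W]
    int64_t C, int64_t H, int64_t W, int64_t Kh, int64_t Kw) {
  const int64_t out_h = H - Kh + 1;
  const int64_t out_w = W - Kw + 1;
  std::vector<float> out(out_h * out_w * C * Kh * Kw);
  for (int64_t oy = 0; oy < out_h; ++oy) {
    for (int64_t ox = 0; ox < out_w; ++ox) {
      const int64_t row = oy * out_w + ox;
      int64_t col = 0;
      for (int64_t c = 0; c < C; ++c) {
        for (int64_t ky = 0; ky < Kh; ++ky) {
          for (int64_t kx = 0; kx < Kw; ++kx) {
            out[row * (C * Kh * Kw) + col++] =
                input[(c * H + oy + ky) * W + (ox + kx)];
          }
        }
      }
    }
  }
  return out;
}
```

The conv2d output is then `matmul(im2col(input), weight.reshape(OC, C*Kh*Kw)^T)`, which is why a quantized linear kernel can be reused for quantized conv2d.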

5 files changed: +37 additions, −34 deletions

backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_int8_compute.glslh

Lines changed: 13 additions & 14 deletions

```diff
@@ -42,21 +42,20 @@ void fp_accumulate_with_int8_weight(
   // Weight tile is indexed as w_tile.data[k4][n4][n4i]
   // -> gives packed integer containing the 4x 8-bit quantized values at index
   // (n, k), (n, k + 1), (n, k + 2), (n, k + 3)
+  VEC4_T weight_texel;
 #if TILE_K4 == 1 && TILE_N4 == 1
-  [[unroll]] for (int m = 0; m < TILE_M; ++m) {
-    VEC4_T unpacked_weight_k_row;
-    // n = 0
-    unpacked_weight_k_row = unpack_packed_4xint8(w_tile.data[0][0][0]);
-    accum.data[m][0][0] += dot(in_tile.data[m][0], unpacked_weight_k_row);
-    // n = 1
-    unpacked_weight_k_row = unpack_packed_4xint8(w_tile.data[0][0][1]);
-    accum.data[m][0][1] += dot(in_tile.data[m][0], unpacked_weight_k_row);
-    // n = 2
-    unpacked_weight_k_row = unpack_packed_4xint8(w_tile.data[0][0][2]);
-    accum.data[m][0][2] += dot(in_tile.data[m][0], unpacked_weight_k_row);
-    // n = 3
-    unpacked_weight_k_row = unpack_packed_4xint8(w_tile.data[0][0][3]);
-    accum.data[m][0][3] += dot(in_tile.data[m][0], unpacked_weight_k_row);
+  [[unroll]] for (int k = 0; k < 4; ++k) {
+    // Unpack one column of weights
+    weight_texel = VEC4_T(
+        extract_8bit_from_packed_int_le(w_tile.data[0][0][0], k),
+        extract_8bit_from_packed_int_le(w_tile.data[0][0][1], k),
+        extract_8bit_from_packed_int_le(w_tile.data[0][0][2], k),
+        extract_8bit_from_packed_int_le(w_tile.data[0][0][3], k));
+
+    for (int m = 0; m < TILE_M; ++m) {
+      accum.data[m][0] =
+          fma(VEC4_T(in_tile.data[m][0][k]), weight_texel, accum.data[m][0]);
+    }
   }
 
 #else
```
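The old code unpacked, for each output channel n, a full row of four k-values and reduced it with `dot`; the restructured loop instead extracts, for each k, one weight from each of the four output channels and folds it into all four accumulator lanes with a single `fma`, so each iteration should update a whole row of the accumulator tile at once. A plausible C++ model of `extract_8bit_from_packed_int_le` (an assumption from its name: byte `i` of the packed little-endian word, sign-extended; not the shader's actual definition):

```cpp
#include <cstdint>

// Assumed behavior: a 32-bit word packs 4x int8 quantized values
// little-endian; lane i (0..3) is byte i, sign-extended to a float.
float extract_8bit_from_packed_int_le(uint32_t packed, int i) {
  return static_cast<float>(static_cast<int8_t>((packed >> (8 * i)) & 0xFF));
}
```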

backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp

Lines changed: 3 additions & 2 deletions

```diff
@@ -128,8 +128,9 @@ std::vector<int64_t> calculate_input_im2col_sizes(
 
   // K -> flattened convolution window (adjusted)
   const int64_t K = flattened_kernel_len * groups_val;
-  // M -> number of elements in 2D output plane
-  const int64_t M = out_height * out_width * batches;
+  // M -> number of elements in 2D output plane. This is aligned to the next
+  // multiple of 4 since the im2col shader operates on 4x4 blocks.
+  const int64_t M = utils::align_up_4(out_height * out_width * batches);
 
   return {M, K};
 }
```
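Since the im2col shader operates on 4x4 blocks, `M` must be padded so no block is partial; for example, out_height = 3, out_width = 3, batches = 1 gives 9 rows, padded up to 12. A minimal sketch of what `utils::align_up_4` presumably computes (an assumption from the name; the real helper lives in the backend's utils):

```cpp
#include <cstdint>

// Round n up to the next multiple of 4 (e.g. 9 -> 12, 8 -> 8).
// Works because 4 is a power of two: add 3, then clear the two low bits.
int64_t align_up_4(int64_t n) {
  return (n + 3) & ~static_cast<int64_t>(3);
}
```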

backends/vulkan/runtime/graph/ops/impl/utils/QuantizationConfig.h

Lines changed: 2 additions & 2 deletions

```diff
@@ -16,7 +16,7 @@ enum class QuantizationGranularity {
   PerChannel,
   PerTensor,
   PerGroup,
-  None,
+  NoQuantization,
 };
 
 static constexpr QuantizationGranularity kPerChannel =
@@ -26,7 +26,7 @@ static constexpr QuantizationGranularity kPerTensor =
 static constexpr QuantizationGranularity kPerGroup =
     QuantizationGranularity::PerGroup;
 static constexpr QuantizationGranularity kNoQuantization =
-    QuantizationGranularity::None;
+    QuantizationGranularity::NoQuantization;
 
 struct QuantizationConfig {
   int nbits;
```

backends/vulkan/test/custom_ops/q8csw_conv2d.cpp

Lines changed: 9 additions & 9 deletions

```diff
@@ -395,19 +395,19 @@ std::vector<TestCase> generate_quantized_conv2d_test_cases() {
           std::to_string(config.kernel.w);
 
       config.test_case_name = prefix + suffix;
-      test_cases.push_back(
-          create_test_case_from_config(config, storage_type, vkapi::kFloat));
+      // The default operator tested is activation + weight quantized conv2d;
+      // however, only test this if the int8 dot product extension is supported
+      if (vkcompute::api::context()
+              ->adapter_ptr()
+              ->supports_int8_dot_product()) {
+        test_cases.push_back(
+            create_test_case_from_config(config, storage_type, vkapi::kFloat));
+      }
 
       Conv2dConfig wo_quant_config = config;
       wo_quant_config.op_name = "conv2d_q8csw";
      test_cases.push_back(create_test_case_from_config(
          wo_quant_config, storage_type, vkapi::kFloat));
-      // Conv2dConfig config2 = config;
-      // config2.shader_variant_name = "conv2d_q8csw_linear_tiled";
-      // config2.name_suffix = prefix + suffix;
-      // test_cases.push_back(
-      //     create_test_case_from_config(config2, storage_type,
-      //     vkapi::kFloat));
     }
   }
 
@@ -778,7 +778,7 @@ int main(int argc, char* argv[]) {
       quantized_conv2d_flop_calculator,
       "QuantizedConv2d",
       0,
-      1,
+      10,
       ref_fn);
 
   return 0;
```
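The new guard skips activation+weight quantized test cases on devices that cannot execute them. A hedged sketch of how such a capability check is typically implemented against the Vulkan API (the real `supports_int8_dot_product()` lives on the backend's adapter; this standalone version queries the standard feature struct, which is core in Vulkan 1.3 and otherwise provided by VK_KHR_shader_integer_dot_product):

```cpp
#include <vulkan/vulkan.h>

// Query whether a physical device supports shader integer dot product.
// Requires an instance created with API version >= 1.1 so that
// vkGetPhysicalDeviceFeatures2 is available.
bool supports_int8_dot_product(VkPhysicalDevice physical_device) {
  VkPhysicalDeviceShaderIntegerDotProductFeatures dot_product_features{};
  dot_product_features.sType =
      VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES;

  VkPhysicalDeviceFeatures2 features2{};
  features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
  features2.pNext = &dot_product_features;

  vkGetPhysicalDeviceFeatures2(physical_device, &features2);
  return dot_product_features.shaderIntegerDotProduct == VK_TRUE;
}
```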

backends/vulkan/test/custom_ops/q8csw_linear.cpp

Lines changed: 10 additions & 7 deletions

```diff
@@ -151,9 +151,9 @@ std::vector<TestCase> generate_quantized_linear_easy_cases() {
   std::vector<TestCase> test_cases;
 
   // Single simple configuration for debugging
-  int M = 16;
-  int K = 64;
-  int N = 32;
+  int M = 4;
+  int K = 4;
+  int N = 4;
 
   LinearConfig config = {
       M, // Batch size
@@ -217,9 +217,13 @@ std::vector<TestCase> generate_quantized_linear_test_cases() {
     config.test_case_name = generated_test_case_name;
 
     for (const auto& storage_type : storage_types) {
-      // Test both activation+weight quantized and weight only quantized
-      test_cases.push_back(
-          create_test_case_from_config(config, storage_type, vkapi::kFloat));
+      if (vkcompute::api::context()
+              ->adapter_ptr()
+              ->supports_int8_dot_product()) {
+        // Test both activation+weight quantized and weight only quantized
+        test_cases.push_back(
+            create_test_case_from_config(config, storage_type, vkapi::kFloat));
+      }
 
       LinearConfig wo_quant_config = config;
       wo_quant_config.op_name = "linear_q8csw";
@@ -462,7 +466,6 @@ int main(int argc, char* argv[]) {
 
   ReferenceComputeFunc ref_fn = reference_impl;
 
-  // Execute easy test cases using the new framework with custom FLOP calculator
   auto results = execute_test_cases(
       generate_quantized_linear_test_cases,
       quantized_linear_flop_calculator,
```
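For orientation, "q8csw" appears to denote 8-bit channelwise symmetric weight quantization, where each weight is dequantized with a per-output-channel scale and a zero point of 0. A minimal reference sketch of weight-only quantized linear under that assumption (names are illustrative; this is not the test's actual `reference_impl`):

```cpp
#include <cstdint>
#include <vector>

// Reference weight-only quantized linear:
//   out[m][n] = (sum_k in[m][k] * w_q[n][k]) * scale[n] + bias[n]
// Symmetric per-channel quantization, so no zero point term.
std::vector<float> linear_q8csw_reference(
    const std::vector<float>& in, // [M, K]
    const std::vector<int8_t>& w_q, // [N, K]
    const std::vector<float>& scale, // [N]
    const std::vector<float>& bias, // [N]
    int64_t M, int64_t K, int64_t N) {
  std::vector<float> out(M * N, 0.f);
  for (int64_t m = 0; m < M; ++m) {
    for (int64_t n = 0; n < N; ++n) {
      float acc = 0.f;
      for (int64_t k = 0; k < K; ++k) {
        acc += in[m * K + k] * static_cast<float>(w_q[n * K + k]);
      }
      out[m * N + n] = acc * scale[n] + bias[n];
    }
  }
  return out;
}
```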
