Commit 84e097f

Author: morelos

Update on "[ET-VK][Ops] dequantize_per_channel reference impl and testing"
# Context

In order to properly enable dynamic quantization, we create the dequantize_per_channel operator, as it's seemingly useful to have for the pipeline.

# Changes

This creates the wrapper for the CPU reference implementation, along with a dummy reference implementation created just to test against it.

Differential Revision: [D77746138](https://our.internmc.facebook.com/intern/diff/D77746138/)

[ghstack-poisoned]
2 parents 4d697b7 + 999d28b commit 84e097f
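For reference, per-channel dequantization applies one (scale, zero_point) pair to each slice along the quantized axis: dequant = (q - zero_point[c]) * scale[c]. A minimal sketch of the idea, assuming a contiguous [channels, elems_per_channel] int8 layout; the names and signature here are illustrative, not the operator's actual interface:

```cpp
#include <cstdint>
#include <vector>

// Illustrative sketch only: the real operator handles arbitrary axes,
// input dtypes, and output dtypes.
std::vector<float> dequantize_per_channel(
    const std::vector<int8_t>& q,
    const std::vector<float>& scales,        // one scale per channel
    const std::vector<int32_t>& zero_points, // one zero point per channel
    int64_t channels,
    int64_t elems_per_channel) {
  std::vector<float> out(q.size());
  for (int64_t c = 0; c < channels; ++c) {
    for (int64_t i = 0; i < elems_per_channel; ++i) {
      const int64_t idx = c * elems_per_channel + i;
      // CPU reference pattern: (input - zero_point) * scale
      out[idx] = static_cast<float>(q[idx] - zero_points[c]) * scales[c];
    }
  }
  return out;
}
```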

File tree

- backends/vulkan/runtime/VulkanBackend.cpp
- backends/vulkan/runtime/graph/ops/impl/Quantize.cpp
- backends/vulkan/test/op_tests/dequantize_test.cpp
- backends/vulkan/test/op_tests/quantize_test.cpp
- extension/aten_util/make_aten_functor_from_et_functor.h
- kernels/quantized/cpu/op_quantize.cpp

6 files changed: +107 -61 lines


backends/vulkan/runtime/VulkanBackend.cpp

Lines changed: 8 additions & 4 deletions
```diff
@@ -616,22 +616,26 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
       // Handle dtype conversion between Vulkan and ExecutorTorch (in-place)
       if (vulkan_dtype == vkapi::kFloat &&
           et_dtype == executorch::aten::ScalarType::Double) {
-        // Convert float32 to float64 in-place (backwards to avoid overwriting)
+        // Convert float32 to float64 in-place (backwards to avoid
+        // overwriting)
         double* data_64 = args[o]->toTensor().mutable_data_ptr<double>();
         const float* data_32 = args[o]->toTensor().const_data_ptr<float>();
         for (size_t j = args[o]->toTensor().numel() - 1; j >= 0; --j) {
           data_64[j] = static_cast<double>(data_32[j]);
-          if (j == 0) break; // Prevent underflow for size_t
+          if (j == 0)
+            break; // Prevent underflow for size_t
         }
       } else if (
           vulkan_dtype == vkapi::kInt &&
           et_dtype == executorch::aten::ScalarType::Long) {
         // Convert int32 to int64 in-place (backwards to avoid overwriting)
         int64_t* data_64 = args[o]->toTensor().mutable_data_ptr<int64_t>();
-        const int32_t* data_32 = args[o]->toTensor().const_data_ptr<int32_t>();
+        const int32_t* data_32 =
+            args[o]->toTensor().const_data_ptr<int32_t>();
         for (size_t j = args[o]->toTensor().numel() - 1; j >= 0; --j) {
           data_64[j] = static_cast<int64_t>(data_32[j]);
-          if (j == 0) break; // Prevent underflow for size_t
+          if (j == 0)
+            break; // Prevent underflow for size_t
         }
       }
     }
```
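The loops above widen a buffer in place: the same allocation is read as the narrow type and written as the wide type, so the walk must go back to front or the wider writes would clobber narrow values not yet read, and since `size_t` cannot go negative the loop breaks explicitly at zero. A minimal standalone sketch of the same pattern, assuming a toy byte buffer rather than the backend's tensor storage:

```cpp
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  constexpr size_t n = 4;
  // One allocation sized for the wider type; the float32 payload sits at
  // the front, as in the tensor storage the diff converts in place.
  std::vector<unsigned char> buf(n * sizeof(double));
  for (size_t i = 0; i < n; ++i) {
    float v = 0.5f * static_cast<float>(i);
    std::memcpy(buf.data() + i * sizeof(float), &v, sizeof(float));
  }
  // Walk back to front so each narrow read happens before the wider,
  // overlapping write to the same region.
  for (size_t j = n - 1;; --j) {
    float v;
    std::memcpy(&v, buf.data() + j * sizeof(float), sizeof(float));
    double d = static_cast<double>(v);
    std::memcpy(buf.data() + j * sizeof(double), &d, sizeof(double));
    if (j == 0)
      break; // guard: size_t cannot go below zero
  }
  for (size_t i = 0; i < n; ++i) {
    double d;
    std::memcpy(&d, buf.data() + i * sizeof(double), sizeof(double));
    std::printf("%f\n", d);
  }
}
```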

backends/vulkan/runtime/graph/ops/impl/Quantize.cpp

Lines changed: 14 additions & 12 deletions
```diff
@@ -51,17 +51,19 @@ utils::uvec3 quantize_per_channel_local_wg_size(
 
   const ValueRef input = args.at(1).refs.at(0);
 
-  utils::uvec3 local_wg_size = graph->create_local_wg_size(global_workgroup_size);
-
-  // WORKAROUND: The CommandBuffer::dispatch function divides global_workgroup_size
-  // by local_workgroup_size to get the number of workgroups to dispatch.
-  // For per-channel quantization along the batch axis, we need to ensure that
-  // we dispatch the correct number of workgroups in the Z dimension to cover
-  // all batch-channel combinations.
+  utils::uvec3 local_wg_size =
+      graph->create_local_wg_size(global_workgroup_size);
+
+  // WORKAROUND: The CommandBuffer::dispatch function divides
+  // global_workgroup_size by local_workgroup_size to get the number of
+  // workgroups to dispatch. For per-channel quantization along the batch axis,
+  // we need to ensure that we dispatch the correct number of workgroups in the
+  // Z dimension to cover all batch-channel combinations.
   //
-  // If local_wg_size[2] > 1, then div_up(global_workgroup_size[2], local_wg_size[2])
-  // might reduce the number of workgroups dispatched. To ensure we dispatch
-  // global_workgroup_size[2] workgroups in the Z dimension, we set local_wg_size[2] = 1.
+  // If local_wg_size[2] > 1, then div_up(global_workgroup_size[2],
+  // local_wg_size[2]) might reduce the number of workgroups dispatched. To
+  // ensure we dispatch global_workgroup_size[2] workgroups in the Z dimension,
+  // we set local_wg_size[2] = 1.
   const auto input_sizes = graph->sizes_of(input);
   if (global_workgroup_size[2] > 1 && input_sizes[3] > 0) {
     local_wg_size[2] = 1;
@@ -241,8 +243,8 @@ void add_quantize_per_channel_node(
 
   int num_channels;
   if (axis_val == 0 && ndim == 4 && !graph.is_buffer_storage(input)) {
-    // For batch dimension quantization in 4D tensors, pass the actual number of channels
-    // so the shader can correctly unfold the batch-channel folding
+    // For batch dimension quantization in 4D tensors, pass the actual number of
+    // channels so the shader can correctly unfold the batch-channel folding
    num_channels = static_cast<int>(input_sizes[1]); // Channel dimension
   } else {
     num_channels = static_cast<int>(input_sizes[axis_val]);
```
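The WORKAROUND comment above is easiest to see with concrete numbers. Below, `div_up` is the usual ceiling division (the comment references it by name; this local definition and the sizes are illustrative):

```cpp
#include <cstdint>
#include <cstdio>

// Ceiling division, as div_up is commonly defined.
static uint32_t div_up(uint32_t a, uint32_t b) {
  return (a + b - 1) / b;
}

int main() {
  // Say the op needs one workgroup per batch-channel slice in Z:
  const uint32_t global_z = 6;

  // An auto-picked local Z size of 4 would dispatch only
  // div_up(6, 4) = 2 workgroups in Z instead of the 6 the shader expects.
  std::printf("local_z = 4 -> %u workgroups in Z\n", div_up(global_z, 4));

  // Pinning local_z to 1 dispatches exactly global_z workgroups in Z.
  std::printf("local_z = 1 -> %u workgroups in Z\n", div_up(global_z, 1));
}
```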

backends/vulkan/test/op_tests/dequantize_test.cpp

Lines changed: 25 additions & 9 deletions
```diff
@@ -100,7 +100,15 @@ Tensor& dequantize_per_channel_out_no_context(
     executorch::aten::optional<ScalarType> out_dtype,
     Tensor& out) {
   return torch::executor::native::dequantize_per_channel_out(
-      input, scale, zero_points, axis, quant_min, quant_max, dtype, out_dtype, out);
+      input,
+      scale,
+      zero_points,
+      axis,
+      quant_min,
+      quant_max,
+      dtype,
+      out_dtype,
+      out);
 }
 
 // ATen wrapper for dequantize_per_tensor
@@ -480,7 +488,8 @@ at::Tensor dequantize_per_channel_reference_impl(
     }
 
     // Store casted values to avoid repeated casting
-    const int32_t channel_zero_point_int32 = static_cast<int32_t>(channel_zero_point);
+    const int32_t channel_zero_point_int32 =
+        static_cast<int32_t>(channel_zero_point);
     const float channel_scale_float = static_cast<float>(channel_scale);
 
     // Get the input value and dequantize
@@ -490,19 +499,24 @@ at::Tensor dequantize_per_channel_reference_impl(
     // Following the CPU implementation pattern: (input - zero_point) * scale
     if (dtype == at::kByte) {
       uint8_t qvalue = input.flatten()[flat_idx].item<uint8_t>();
-      dequantized_value = (qvalue - channel_zero_point_int32) * channel_scale_float;
+      dequantized_value =
+          (qvalue - channel_zero_point_int32) * channel_scale_float;
     } else if (dtype == at::kChar) {
       int8_t qvalue = input.flatten()[flat_idx].item<int8_t>();
-      dequantized_value = (qvalue - channel_zero_point_int32) * channel_scale_float;
+      dequantized_value =
+          (qvalue - channel_zero_point_int32) * channel_scale_float;
     } else if (dtype == at::kShort) {
       int16_t qvalue = input.flatten()[flat_idx].item<int16_t>();
-      dequantized_value = (qvalue - channel_zero_point_int32) * channel_scale_float;
+      dequantized_value =
+          (qvalue - channel_zero_point_int32) * channel_scale_float;
     } else if (dtype == at::kInt) {
       int32_t qvalue = input.flatten()[flat_idx].item<int32_t>();
-      dequantized_value = (qvalue - channel_zero_point_int32) * channel_scale_float;
+      dequantized_value =
+          (qvalue - channel_zero_point_int32) * channel_scale_float;
     } else if (dtype == at::kLong) {
       int64_t qvalue = input.flatten()[flat_idx].item<int64_t>();
-      dequantized_value = (qvalue - channel_zero_point_int32) * channel_scale_float;
+      dequantized_value =
+          (qvalue - channel_zero_point_int32) * channel_scale_float;
     } else {
       throw std::runtime_error("Unsupported input dtype");
     }
@@ -878,7 +892,8 @@ void test_vulkan_dequantize_per_tensor_impl(
     output_correct =
         at::allclose(reference_out, vk_out, /*rtol=*/1e-2, /*atol=*/1e-2);
   } else {
-    output_correct = at::allclose(reference_out, vk_out, /*rtol=*/1e-5, /*atol=*/1e-5);
+    output_correct =
+        at::allclose(reference_out, vk_out, /*rtol=*/1e-5, /*atol=*/1e-5);
   }
   if (!output_correct) {
     std::cout << "\n"
@@ -1358,7 +1373,8 @@ void test_vulkan_dequantize_per_token_impl(
     output_correct =
         at::allclose(reference_out, vk_out, /*rtol=*/1e-2, /*atol=*/1e-2);
   } else {
-    output_correct = at::allclose(reference_out, vk_out, /*rtol=*/1e-5, /*atol=*/1e-5);
+    output_correct =
+        at::allclose(reference_out, vk_out, /*rtol=*/1e-5, /*atol=*/1e-5);
   }
   if (!output_correct) {
     std::cout << "\n"
```
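One detail worth noting in the reference implementation above: the zero point is cast to `int32_t` before the subtraction, so integer promotion keeps `(qvalue - zero_point)` signed even for `uint8_t` inputs. A tiny self-contained illustration (the values are made up):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Integer promotion makes (uint8 - int32) a signed operation, so inputs
  // below the zero point dequantize to negative numbers instead of
  // wrapping around to a huge unsigned value.
  uint8_t qvalue = 3;
  const int32_t zero_point = 5; // already cast, as in the reference impl
  const float scale = 0.1f;
  float dequantized = (qvalue - zero_point) * scale; // (3 - 5) * 0.1f = -0.2
  std::printf("%f\n", dequantized);
}
```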

backends/vulkan/test/op_tests/quantize_test.cpp

Lines changed: 34 additions & 16 deletions
```diff
@@ -746,8 +746,10 @@ void test_vulkan_quantize_per_tensor_impl(
   at::Tensor reference_int = reference_out.to(at::kInt);
   at::Tensor vk_int = vk_out.to(at::kInt);
 
-  // Tolerance is 1 to address rounding errors and fp math differences between CPU/GPU
-  const bool output_correct = at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1);
+  // Tolerance is 1 to address rounding errors and fp math differences between
+  // CPU/GPU
+  const bool output_correct =
+      at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1);
   if (!output_correct) {
     at::Tensor diffs = at::abs(reference_int - vk_int);
 
@@ -1123,8 +1125,10 @@ void test_vulkan_quantize_per_token_impl(
   at::Tensor reference_int = reference_out.to(at::kInt);
   at::Tensor vk_int = vk_out.to(at::kInt);
 
-  // Tolerance is 1 to address rounding errors and fp math differences between CPU/GPU
-  const bool output_correct = at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1);
+  // Tolerance is 1 to address rounding errors and fp math differences between
+  // CPU/GPU
+  const bool output_correct =
+      at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1);
   if (!output_correct) {
     at::Tensor diffs = at::abs(reference_int - vk_int);
 
@@ -1244,9 +1248,7 @@ TEST(
       at::kByte);
 }
 
-TEST(
-    VulkanQuantizePerTokenTest,
-    test_vulkan_quantize_per_token_float_to_int8) {
+TEST(VulkanQuantizePerTokenTest, test_vulkan_quantize_per_token_float_to_int8) {
   if (!vkcompute::api::context()
            ->adapter_ptr()
            ->has_full_int8_buffers_support()) {
@@ -1606,8 +1608,10 @@ void test_vulkan_quantize_per_channel_impl(
   at::Tensor reference_int = reference_out.to(at::kInt);
   at::Tensor vk_int = vk_out.to(at::kInt);
 
-  // Tolerance is 1 to address rounding errors and fp math differences between CPU/GPU
-  const bool output_correct = at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1);
+  // Tolerance is 1 to address rounding errors and fp math differences between
+  // CPU/GPU
+  const bool output_correct =
+      at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1);
   if (!output_correct) {
     at::Tensor diffs = at::abs(reference_int - vk_int);
 
@@ -1717,7 +1721,9 @@ TEST(
 
 // END OF REFERENCE TESTS
 
-TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_int8_axis0) {
+TEST(
+    VulkanQuantizePerChannelTest,
+    test_vulkan_quantize_per_channel_float_to_int8_axis0) {
   std::vector<float> scales(9, 0.1f);
   std::vector<int> zero_points(9, 2);
 
@@ -1777,7 +1783,9 @@ TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_int
       at::kChar);
 }
 
-TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_int8_axis1) {
+TEST(
+    VulkanQuantizePerChannelTest,
+    test_vulkan_quantize_per_channel_float_to_int8_axis1) {
   std::vector<float> scales(14, 0.001f);
   std::vector<int> zero_points(14, -5);
 
@@ -1826,7 +1834,9 @@ TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_int
       at::kChar);
 }
 
-TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_int8_axis2) {
+TEST(
+    VulkanQuantizePerChannelTest,
+    test_vulkan_quantize_per_channel_float_to_int8_axis2) {
   std::vector<float> scales(11, 0.5f);
   std::vector<int> zero_points(11, 12);
 
@@ -1864,7 +1874,9 @@ TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_int
       at::kChar);
 }
 
-TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_int8_axis3) {
+TEST(
+    VulkanQuantizePerChannelTest,
+    test_vulkan_quantize_per_channel_float_to_int8_axis3) {
   std::vector<float> scales(7, 0.5f);
   std::vector<int> zero_points(7, 12);
 
@@ -1891,7 +1903,9 @@ TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_int
       at::kChar);
 }
 
-TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_uint8_comprehensive) {
+TEST(
+    VulkanQuantizePerChannelTest,
+    test_vulkan_quantize_per_channel_float_to_uint8_comprehensive) {
   std::vector<float> scales = {0.1, 0.2, 0.0001, 0.5, 0.02};
   std::vector<int> zero_points = {0, 5, -5, 1, 12};
 
@@ -1951,7 +1965,9 @@ TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_uin
       at::kByte);
 }
 
-TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_half_to_8bit) {
+TEST(
+    VulkanQuantizePerChannelTest,
+    test_vulkan_quantize_per_channel_half_to_8bit) {
   std::vector<float> scales = {0.1, 0.2, 0.01, 0.5, 0.02};
   std::vector<int> zero_points = {0, 5, 5, 1, 12};
 
@@ -2011,7 +2027,9 @@ TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_half_to_8bit
       at::kByte);
 }
 
-TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_double_to_8bit) {
+TEST(
+    VulkanQuantizePerChannelTest,
+    test_vulkan_quantize_per_channel_double_to_8bit) {
   std::vector<float> scales = {0.1, 0.2, 0.01, 0.5, 0.02};
   std::vector<int> zero_points = {0, 5, 5, 1, 12};
```
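A note on the tolerance checks above: `at::allclose` passes when |a - b| <= atol + rtol * |b| elementwise, so with rtol = 1 and atol = 1 the permitted difference grows with the magnitude of the value being compared. A small sketch of that semantics (the tensor values are illustrative):

```cpp
#include <torch/torch.h>
#include <cstdio>

int main() {
  // With rtol = 1 and atol = 1, a compared value of 30 tolerates a
  // difference of up to 1 + 1 * 30 = 31 and still passes.
  at::Tensor reference_int = torch::tensor({10, 20, 30}, at::kInt);
  at::Tensor vk_int = torch::tensor({11, 19, 30}, at::kInt);
  const bool ok = at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1);
  std::printf("allclose: %s\n", ok ? "true" : "false");
}
```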
extension/aten_util/make_aten_functor_from_et_functor.h

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -172,18 +172,28 @@ struct type_convert<torch::executor::optional<F>, std::optional<T>> final {
172172
}
173173
};
174174

175-
// Specific specialization for optional tensor conversion: std::optional<at::Tensor> to std::optional<executorch::runtime::etensor::Tensor>
175+
// Specific specialization for optional tensor conversion:
176+
// std::optional<at::Tensor> to
177+
// std::optional<executorch::runtime::etensor::Tensor>
176178
template <>
177-
struct type_convert<const std::optional<at::Tensor>&, const std::optional<torch::executor::Tensor>&> final {
179+
struct type_convert<
180+
const std::optional<at::Tensor>&,
181+
const std::optional<torch::executor::Tensor>&>
182+
final {
178183
public:
179184
const std::optional<at::Tensor>& val;
180-
std::unique_ptr<struct type_convert<const at::Tensor&, const torch::executor::Tensor&>> convert_struct;
185+
std::unique_ptr<
186+
struct type_convert<const at::Tensor&, const torch::executor::Tensor&>>
187+
convert_struct;
181188
explicit type_convert(const std::optional<at::Tensor>& value) : val(value) {}
182189
const std::optional<torch::executor::Tensor>& call() {
183190
static std::optional<torch::executor::Tensor> result;
184191
if (val.has_value()) {
185-
convert_struct = std::make_unique<struct type_convert<const at::Tensor&, const torch::executor::Tensor&>>(
186-
type_convert<const at::Tensor&, const torch::executor::Tensor&>(val.value()));
192+
convert_struct = std::make_unique<struct type_convert<
193+
const at::Tensor&,
194+
const torch::executor::Tensor&>>(
195+
type_convert<const at::Tensor&, const torch::executor::Tensor&>(
196+
val.value()));
187197
result = std::optional<torch::executor::Tensor>(convert_struct->call());
188198
} else {
189199
result = std::optional<torch::executor::Tensor>();
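The specialization above follows the header's `type_convert` pattern: a struct that captures the source value and exposes `call()` to produce the converted one, delegating to the payload converter when the optional is engaged. A simplified sketch of the same shape, with plain stand-in types instead of `at::Tensor`/`torch::executor::Tensor` (the real code's `static` result local is replaced by a member here):

```cpp
#include <memory>
#include <optional>
#include <string>

// Primary template, specialized per (From, To) pair as in the header.
template <typename From, typename To>
struct type_convert;

// Payload conversion, with int -> std::string standing in for
// at::Tensor -> torch::executor::Tensor.
template <>
struct type_convert<const int&, const std::string&> {
  const int& val;
  std::string result;
  explicit type_convert(const int& v) : val(v) {}
  const std::string& call() {
    result = std::to_string(val);
    return result;
  }
};

// Optional-wrapping specialization mirroring the diff: when the optional
// is engaged, delegate to the payload converter kept alive in a member;
// otherwise produce an empty optional.
template <>
struct type_convert<
    const std::optional<int>&,
    const std::optional<std::string>&> {
  const std::optional<int>& val;
  std::unique_ptr<type_convert<const int&, const std::string&>> convert_struct;
  std::optional<std::string> result; // member instead of a static local
  explicit type_convert(const std::optional<int>& v) : val(v) {}
  const std::optional<std::string>& call() {
    if (val.has_value()) {
      convert_struct =
          std::make_unique<type_convert<const int&, const std::string&>>(
              val.value());
      result = convert_struct->call();
    } else {
      result = std::nullopt;
    }
    return result;
  }
};

int main() {
  std::optional<int> x = 42;
  type_convert<const std::optional<int>&, const std::optional<std::string>&>
      conv(x);
  return conv.call().value() == "42" ? 0 : 1;
}
```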

kernels/quantized/cpu/op_quantize.cpp

Lines changed: 11 additions & 15 deletions
```diff
@@ -292,29 +292,25 @@ Tensor& quantize_per_channel_out(
     const auto* input_data_ptr = input.const_data_ptr<CTYPE_IN>();             \
     const int64_t input_numel = input.numel();                                 \
     const int64_t axis_size = input.size(axis);                                \
-                                                                             \
+                                                                               \
     /* Calculate the stride pattern for efficient channel index calculation */ \
-    int64_t axis_block_size = 1;                                             \
-    for (int64_t i = axis + 1; i < input.dim(); i++) {                       \
-      axis_block_size *= input.size(i);                                      \
+    int64_t axis_block_size = 1;                                               \
+    for (int64_t i = axis + 1; i < input.dim(); i++) {                         \
+      axis_block_size *= input.size(i);                                        \
     }                                                                          \
-                                                                             \
+                                                                               \
     /* Single loop over all elements */                                        \
-    for (int64_t i = 0; i < input_numel; i++) {                              \
+    for (int64_t i = 0; i < input_numel; i++) {                                \
      /* Calculate which channel this element belongs to */                    \
-      int64_t channel_idx = (i / axis_block_size) % axis_size;               \
-                                                                             \
+      int64_t channel_idx = (i / axis_block_size) % axis_size;                 \
+                                                                               \
       /* Get quantization parameters for this channel */                       \
       double _scale = scale_data[channel_idx];                                 \
       int64_t _zero_point = zero_point_data[channel_idx];                      \
-                                                                             \
+                                                                               \
       /* Apply quantization */                                                 \
-      out_data_ptr[i] = quantize_val<CTYPE_OUT, CTYPE_IN>(                   \
-          _scale,                                                            \
-          _zero_point,                                                       \
-          input_data_ptr[i],                                                 \
-          quant_min,                                                         \
-          quant_max);                                                        \
+      out_data_ptr[i] = quantize_val<CTYPE_OUT, CTYPE_IN>(                     \
+          _scale, _zero_point, input_data_ptr[i], quant_min, quant_max);       \
     }                                                                          \
   } break;
```
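The macro's index math recovers the channel from a flat element index as `(i / axis_block_size) % axis_size`, where `axis_block_size` is the product of the dimensions after the axis. A worked example for a hypothetical contiguous [2, 3, 4] tensor quantized along axis 1:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t sizes[3] = {2, 3, 4};
  const int64_t axis = 1;

  // axis_block_size = product of dims after the axis = 4.
  int64_t axis_block_size = 1;
  for (int64_t i = axis + 1; i < 3; i++) {
    axis_block_size *= sizes[i];
  }

  // Flat index 17 corresponds to coordinate (1, 1, 1) with strides
  // (12, 4, 1), so the channel is (17 / 4) % 3 = 4 % 3 = 1.
  const int64_t flat_idx = 17;
  const int64_t channel_idx = (flat_idx / axis_block_size) % sizes[axis];
  std::printf("channel_idx = %lld\n", static_cast<long long>(channel_idx));
}
```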
