Skip to content

Commit 999d28b

Browse files
author
morelos
committed
Update base for Update on "[ET-VK][Ops] dequantize_per_channel reference impl and testing"
# Context In order to properly enable dynamic quantization, we create the dequantize_per_channel operator, as it's seemingly useful to have for the pipeline. # Changes This creates the wrapper for the CPU reference implementation, along with a dummy reference implementation I created just to test against it. Differential Revision: [D77746138](https://our.internmc.facebook.com/intern/diff/D77746138/) [ghstack-poisoned]
1 parent ee33a73 commit 999d28b

File tree

4 files changed

+67
-47
lines changed

4 files changed

+67
-47
lines changed

backends/vulkan/runtime/VulkanBackend.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -616,22 +616,26 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
616616
// Handle dtype conversion between Vulkan and ExecutorTorch (in-place)
617617
if (vulkan_dtype == vkapi::kFloat &&
618618
et_dtype == executorch::aten::ScalarType::Double) {
619-
// Convert float32 to float64 in-place (backwards to avoid overwriting)
619+
// Convert float32 to float64 in-place (backwards to avoid
620+
// overwriting)
620621
double* data_64 = args[o]->toTensor().mutable_data_ptr<double>();
621622
const float* data_32 = args[o]->toTensor().const_data_ptr<float>();
622623
for (size_t j = args[o]->toTensor().numel() - 1; j >= 0; --j) {
623624
data_64[j] = static_cast<double>(data_32[j]);
624-
if (j == 0) break; // Prevent underflow for size_t
625+
if (j == 0)
626+
break; // Prevent underflow for size_t
625627
}
626628
} else if (
627629
vulkan_dtype == vkapi::kInt &&
628630
et_dtype == executorch::aten::ScalarType::Long) {
629631
// Convert int32 to int64 in-place (backwards to avoid overwriting)
630632
int64_t* data_64 = args[o]->toTensor().mutable_data_ptr<int64_t>();
631-
const int32_t* data_32 = args[o]->toTensor().const_data_ptr<int32_t>();
633+
const int32_t* data_32 =
634+
args[o]->toTensor().const_data_ptr<int32_t>();
632635
for (size_t j = args[o]->toTensor().numel() - 1; j >= 0; --j) {
633636
data_64[j] = static_cast<int64_t>(data_32[j]);
634-
if (j == 0) break; // Prevent underflow for size_t
637+
if (j == 0)
638+
break; // Prevent underflow for size_t
635639
}
636640
}
637641
}

backends/vulkan/runtime/graph/ops/impl/Quantize.cpp

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -51,17 +51,19 @@ utils::uvec3 quantize_per_channel_local_wg_size(
5151

5252
const ValueRef input = args.at(1).refs.at(0);
5353

54-
utils::uvec3 local_wg_size = graph->create_local_wg_size(global_workgroup_size);
55-
56-
// WORKAROUND: The CommandBuffer::dispatch function divides global_workgroup_size
57-
// by local_workgroup_size to get the number of workgroups to dispatch.
58-
// For per-channel quantization along the batch axis, we need to ensure that
59-
// we dispatch the correct number of workgroups in the Z dimension to cover
60-
// all batch-channel combinations.
54+
utils::uvec3 local_wg_size =
55+
graph->create_local_wg_size(global_workgroup_size);
56+
57+
// WORKAROUND: The CommandBuffer::dispatch function divides
58+
// global_workgroup_size by local_workgroup_size to get the number of
59+
// workgroups to dispatch. For per-channel quantization along the batch axis,
60+
// we need to ensure that we dispatch the correct number of workgroups in the
61+
// Z dimension to cover all batch-channel combinations.
6162
//
62-
// If local_wg_size[2] > 1, then div_up(global_workgroup_size[2], local_wg_size[2])
63-
// might reduce the number of workgroups dispatched. To ensure we dispatch
64-
// global_workgroup_size[2] workgroups in the Z dimension, we set local_wg_size[2] = 1.
63+
// If local_wg_size[2] > 1, then div_up(global_workgroup_size[2],
64+
// local_wg_size[2]) might reduce the number of workgroups dispatched. To
65+
// ensure we dispatch global_workgroup_size[2] workgroups in the Z dimension,
66+
// we set local_wg_size[2] = 1.
6567
const auto input_sizes = graph->sizes_of(input);
6668
if (global_workgroup_size[2] > 1 && input_sizes[3] > 0) {
6769
local_wg_size[2] = 1;
@@ -241,8 +243,8 @@ void add_quantize_per_channel_node(
241243

242244
int num_channels;
243245
if (axis_val == 0 && ndim == 4 && !graph.is_buffer_storage(input)) {
244-
// For batch dimension quantization in 4D tensors, pass the actual number of channels
245-
// so the shader can correctly unfold the batch-channel folding
246+
// For batch dimension quantization in 4D tensors, pass the actual number of
247+
// channels so the shader can correctly unfold the batch-channel folding
246248
num_channels = static_cast<int>(input_sizes[1]); // Channel dimension
247249
} else {
248250
num_channels = static_cast<int>(input_sizes[axis_val]);

backends/vulkan/test/op_tests/quantize_test.cpp

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -746,8 +746,10 @@ void test_vulkan_quantize_per_tensor_impl(
746746
at::Tensor reference_int = reference_out.to(at::kInt);
747747
at::Tensor vk_int = vk_out.to(at::kInt);
748748

749-
// Tolerance is 1 to address rounding errors and fp math differences between CPU/GPU
750-
const bool output_correct = at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1);
749+
// Tolerance is 1 to address rounding errors and fp math differences between
750+
// CPU/GPU
751+
const bool output_correct =
752+
at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1);
751753
if (!output_correct) {
752754
at::Tensor diffs = at::abs(reference_int - vk_int);
753755

@@ -1123,8 +1125,10 @@ void test_vulkan_quantize_per_token_impl(
11231125
at::Tensor reference_int = reference_out.to(at::kInt);
11241126
at::Tensor vk_int = vk_out.to(at::kInt);
11251127

1126-
// Tolerance is 1 to address rounding errors and fp math differences between CPU/GPU
1127-
const bool output_correct = at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1);
1128+
// Tolerance is 1 to address rounding errors and fp math differences between
1129+
// CPU/GPU
1130+
const bool output_correct =
1131+
at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1);
11281132
if (!output_correct) {
11291133
at::Tensor diffs = at::abs(reference_int - vk_int);
11301134

@@ -1244,9 +1248,7 @@ TEST(
12441248
at::kByte);
12451249
}
12461250

1247-
TEST(
1248-
VulkanQuantizePerTokenTest,
1249-
test_vulkan_quantize_per_token_float_to_int8) {
1251+
TEST(VulkanQuantizePerTokenTest, test_vulkan_quantize_per_token_float_to_int8) {
12501252
if (!vkcompute::api::context()
12511253
->adapter_ptr()
12521254
->has_full_int8_buffers_support()) {
@@ -1606,8 +1608,10 @@ void test_vulkan_quantize_per_channel_impl(
16061608
at::Tensor reference_int = reference_out.to(at::kInt);
16071609
at::Tensor vk_int = vk_out.to(at::kInt);
16081610

1609-
// Tolerance is 1 to address rounding errors and fp math differences between CPU/GPU
1610-
const bool output_correct = at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1);
1611+
// Tolerance is 1 to address rounding errors and fp math differences between
1612+
// CPU/GPU
1613+
const bool output_correct =
1614+
at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1);
16111615
if (!output_correct) {
16121616
at::Tensor diffs = at::abs(reference_int - vk_int);
16131617

@@ -1717,7 +1721,9 @@ TEST(
17171721

17181722
// END OF REFERENCE TESTS
17191723

1720-
TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_int8_axis0) {
1724+
TEST(
1725+
VulkanQuantizePerChannelTest,
1726+
test_vulkan_quantize_per_channel_float_to_int8_axis0) {
17211727
std::vector<float> scales(9, 0.1f);
17221728
std::vector<int> zero_points(9, 2);
17231729

@@ -1777,7 +1783,9 @@ TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_int
17771783
at::kChar);
17781784
}
17791785

1780-
TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_int8_axis1) {
1786+
TEST(
1787+
VulkanQuantizePerChannelTest,
1788+
test_vulkan_quantize_per_channel_float_to_int8_axis1) {
17811789
std::vector<float> scales(14, 0.001f);
17821790
std::vector<int> zero_points(14, -5);
17831791

@@ -1826,7 +1834,9 @@ TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_int
18261834
at::kChar);
18271835
}
18281836

1829-
TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_int8_axis2) {
1837+
TEST(
1838+
VulkanQuantizePerChannelTest,
1839+
test_vulkan_quantize_per_channel_float_to_int8_axis2) {
18301840
std::vector<float> scales(11, 0.5f);
18311841
std::vector<int> zero_points(11, 12);
18321842

@@ -1864,7 +1874,9 @@ TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_int
18641874
at::kChar);
18651875
}
18661876

1867-
TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_int8_axis3) {
1877+
TEST(
1878+
VulkanQuantizePerChannelTest,
1879+
test_vulkan_quantize_per_channel_float_to_int8_axis3) {
18681880
std::vector<float> scales(7, 0.5f);
18691881
std::vector<int> zero_points(7, 12);
18701882

@@ -1891,7 +1903,9 @@ TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_int
18911903
at::kChar);
18921904
}
18931905

1894-
TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_uint8_comprehensive) {
1906+
TEST(
1907+
VulkanQuantizePerChannelTest,
1908+
test_vulkan_quantize_per_channel_float_to_uint8_comprehensive) {
18951909
std::vector<float> scales = {0.1, 0.2, 0.0001, 0.5, 0.02};
18961910
std::vector<int> zero_points = {0, 5, -5, 1, 12};
18971911

@@ -1951,7 +1965,9 @@ TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_float_to_uin
19511965
at::kByte);
19521966
}
19531967

1954-
TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_half_to_8bit) {
1968+
TEST(
1969+
VulkanQuantizePerChannelTest,
1970+
test_vulkan_quantize_per_channel_half_to_8bit) {
19551971
std::vector<float> scales = {0.1, 0.2, 0.01, 0.5, 0.02};
19561972
std::vector<int> zero_points = {0, 5, 5, 1, 12};
19571973

@@ -2011,7 +2027,9 @@ TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_half_to_8bit
20112027
at::kByte);
20122028
}
20132029

2014-
TEST(VulkanQuantizePerChannelTest, test_vulkan_quantize_per_channel_double_to_8bit) {
2030+
TEST(
2031+
VulkanQuantizePerChannelTest,
2032+
test_vulkan_quantize_per_channel_double_to_8bit) {
20152033
std::vector<float> scales = {0.1, 0.2, 0.01, 0.5, 0.02};
20162034
std::vector<int> zero_points = {0, 5, 5, 1, 12};
20172035

kernels/quantized/cpu/op_quantize.cpp

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -292,29 +292,25 @@ Tensor& quantize_per_channel_out(
292292
const auto* input_data_ptr = input.const_data_ptr<CTYPE_IN>(); \
293293
const int64_t input_numel = input.numel(); \
294294
const int64_t axis_size = input.size(axis); \
295-
\
295+
\
296296
/* Calculate the stride pattern for efficient channel index calculation */ \
297-
int64_t axis_block_size = 1; \
298-
for (int64_t i = axis + 1; i < input.dim(); i++) { \
299-
axis_block_size *= input.size(i); \
297+
int64_t axis_block_size = 1; \
298+
for (int64_t i = axis + 1; i < input.dim(); i++) { \
299+
axis_block_size *= input.size(i); \
300300
} \
301-
\
301+
\
302302
/* Single loop over all elements */ \
303-
for (int64_t i = 0; i < input_numel; i++) { \
303+
for (int64_t i = 0; i < input_numel; i++) { \
304304
/* Calculate which channel this element belongs to */ \
305-
int64_t channel_idx = (i / axis_block_size) % axis_size; \
306-
\
305+
int64_t channel_idx = (i / axis_block_size) % axis_size; \
306+
\
307307
/* Get quantization parameters for this channel */ \
308308
double _scale = scale_data[channel_idx]; \
309309
int64_t _zero_point = zero_point_data[channel_idx]; \
310-
\
310+
\
311311
/* Apply quantization */ \
312-
out_data_ptr[i] = quantize_val<CTYPE_OUT, CTYPE_IN>( \
313-
_scale, \
314-
_zero_point, \
315-
input_data_ptr[i], \
316-
quant_min, \
317-
quant_max); \
312+
out_data_ptr[i] = quantize_val<CTYPE_OUT, CTYPE_IN>( \
313+
_scale, _zero_point, input_data_ptr[i], quant_min, quant_max); \
318314
} \
319315
} break;
320316

0 commit comments

Comments
 (0)