Commit 53675c5

morelos committed:

Update on "[ET-VK][Ops] quantize_per_channel reference impl and testing"
# Context

To properly enable dynamic quantization, we create the quantize_per_channel operator, since it seems useful to have in the pipeline.

# Changes

This adds the wrapper for the CPU reference implementation, along with a dummy reference implementation written purely to test against it.

Differential Revision: [D77746132](https://our.internmc.facebook.com/intern/diff/D77746132/)

[ghstack-poisoned]
2 parents 2bf6da6 + 08ed085 commit 53675c5

File tree: 3 files changed (+28 / -26 lines)


backends/vulkan/runtime/VulkanBackend.cpp

Lines changed: 8 additions & 4 deletions

@@ -616,22 +616,26 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
       // Handle dtype conversion between Vulkan and ExecutorTorch (in-place)
       if (vulkan_dtype == vkapi::kFloat &&
           et_dtype == executorch::aten::ScalarType::Double) {
-        // Convert float32 to float64 in-place (backwards to avoid overwriting)
+        // Convert float32 to float64 in-place (backwards to avoid
+        // overwriting)
         double* data_64 = args[o]->toTensor().mutable_data_ptr<double>();
         const float* data_32 = args[o]->toTensor().const_data_ptr<float>();
         for (size_t j = args[o]->toTensor().numel() - 1; j >= 0; --j) {
           data_64[j] = static_cast<double>(data_32[j]);
-          if (j == 0) break; // Prevent underflow for size_t
+          if (j == 0)
+            break; // Prevent underflow for size_t
         }
       } else if (
           vulkan_dtype == vkapi::kInt &&
           et_dtype == executorch::aten::ScalarType::Long) {
         // Convert int32 to int64 in-place (backwards to avoid overwriting)
         int64_t* data_64 = args[o]->toTensor().mutable_data_ptr<int64_t>();
-        const int32_t* data_32 = args[o]->toTensor().const_data_ptr<int32_t>();
+        const int32_t* data_32 =
+            args[o]->toTensor().const_data_ptr<int32_t>();
         for (size_t j = args[o]->toTensor().numel() - 1; j >= 0; --j) {
           data_64[j] = static_cast<int64_t>(data_32[j]);
-          if (j == 0) break; // Prevent underflow for size_t
+          if (j == 0)
+            break; // Prevent underflow for size_t
         }
       }
     }
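Both branches above share one pattern worth spelling out. The sketch below is a minimal, standalone illustration of it (plain C++ over a made-up byte buffer, not the ExecuTorch tensor API): a buffer allocated for the wider type initially holds packed narrow values, and converting back-to-front guarantees each narrow value is read before the wider write can clobber its bytes. Because the loop counter is an unsigned size_t, the condition j >= 0 is always true, so the explicit `if (j == 0) break;` is what actually terminates the loop.

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  // Buffer sized for the wide type; its front currently holds packed int32
  // values (mirroring a tensor whose storage was allocated for int64 but
  // filled with int32 data by the backend).
  constexpr size_t n = 4;
  std::vector<unsigned char> bytes(n * sizeof(int64_t));
  for (size_t i = 0; i < n; ++i) {
    const int32_t v = static_cast<int32_t>(10 * i);
    std::memcpy(bytes.data() + i * sizeof(int32_t), &v, sizeof(v));
  }

  // Widen back-to-front so each int32 is read before the int64 write
  // covering its bytes happens.
  for (size_t j = n - 1;; --j) {
    int32_t narrow;
    std::memcpy(&narrow, bytes.data() + j * sizeof(int32_t), sizeof(narrow));
    const int64_t wide = static_cast<int64_t>(narrow);
    std::memcpy(bytes.data() + j * sizeof(int64_t), &wide, sizeof(wide));
    if (j == 0)
      break; // Prevent underflow for size_t
  }

  // Print the widened values: 0, 10, 20, 30.
  for (size_t i = 0; i < n; ++i) {
    int64_t wide;
    std::memcpy(&wide, bytes.data() + i * sizeof(int64_t), sizeof(wide));
    std::printf("%lld\n", static_cast<long long>(wide));
  }
  return 0;
}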

backends/vulkan/test/op_tests/quantize_test.cpp

Lines changed: 9 additions & 7 deletions

@@ -746,8 +746,10 @@ void test_vulkan_quantize_per_tensor_impl(
   at::Tensor reference_int = reference_out.to(at::kInt);
   at::Tensor vk_int = vk_out.to(at::kInt);

-  // Tolerance is 1 to address rounding errors and fp math differences between CPU/GPU
-  const bool output_correct = at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1);
+  // Tolerance is 1 to address rounding errors and fp math differences between
+  // CPU/GPU
+  const bool output_correct =
+      at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1);
   if (!output_correct) {
     at::Tensor diffs = at::abs(reference_int - vk_int);

@@ -1123,8 +1125,10 @@ void test_vulkan_quantize_per_token_impl(
   at::Tensor reference_int = reference_out.to(at::kInt);
   at::Tensor vk_int = vk_out.to(at::kInt);

-  // Tolerance is 1 to address rounding errors and fp math differences between CPU/GPU
-  const bool output_correct = at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1);
+  // Tolerance is 1 to address rounding errors and fp math differences between
+  // CPU/GPU
+  const bool output_correct =
+      at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1);
   if (!output_correct) {
     at::Tensor diffs = at::abs(reference_int - vk_int);

@@ -1244,9 +1248,7 @@ TEST(
       at::kByte);
 }

-TEST(
-    VulkanQuantizePerTokenTest,
-    test_vulkan_quantize_per_token_float_to_int8) {
+TEST(VulkanQuantizePerTokenTest, test_vulkan_quantize_per_token_float_to_int8) {
   if (!vkcompute::api::context()
           ->adapter_ptr()
           ->has_full_int8_buffers_support()) {
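For a sense of what that tolerance buys, here is a tiny standalone check (illustrative only; the tensors are made up, not taken from the test). at::allclose accepts an element when |actual - expected| <= atol + rtol * |expected|, so with atol = 1 and rtol = 1 quantized outputs that land one step away from the reference still compare equal.

#include <ATen/ATen.h>
#include <iostream>

int main() {
  // Stand-ins for the CPU reference output and the Vulkan output.
  at::Tensor reference_int = at::arange(6, at::kInt);
  at::Tensor vk_int = reference_int + 1; // off by one everywhere

  // |vk - reference| <= atol + rtol * |reference| holds for every element,
  // so this reports a match despite the off-by-one differences.
  const bool output_correct =
      at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1);
  std::cout << (output_correct ? "match" : "mismatch") << std::endl;
  return 0;
}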

kernels/quantized/cpu/op_quantize.cpp

Lines changed: 11 additions & 15 deletions

@@ -292,29 +292,25 @@ Tensor& quantize_per_channel_out(
     const auto* input_data_ptr = input.const_data_ptr<CTYPE_IN>(); \
     const int64_t input_numel = input.numel(); \
     const int64_t axis_size = input.size(axis); \
-    \
+                                                                    \
     /* Calculate the stride pattern for efficient channel index calculation */ \
-    int64_t axis_block_size = 1; \
-    for (int64_t i = axis + 1; i < input.dim(); i++) { \
-      axis_block_size *= input.size(i); \
+    int64_t axis_block_size = 1;                                    \
+    for (int64_t i = axis + 1; i < input.dim(); i++) {              \
+      axis_block_size *= input.size(i);                             \
     } \
-    \
+                                                                    \
     /* Single loop over all elements */ \
-    for (int64_t i = 0; i < input_numel; i++) { \
+    for (int64_t i = 0; i < input_numel; i++) {                     \
       /* Calculate which channel this element belongs to */ \
-      int64_t channel_idx = (i / axis_block_size) % axis_size; \
-      \
+      int64_t channel_idx = (i / axis_block_size) % axis_size;      \
+                                                                    \
       /* Get quantization parameters for this channel */ \
       double _scale = scale_data[channel_idx]; \
       int64_t _zero_point = zero_point_data[channel_idx]; \
-      \
+                                                                    \
       /* Apply quantization */ \
-      out_data_ptr[i] = quantize_val<CTYPE_OUT, CTYPE_IN>( \
-          _scale, \
-          _zero_point, \
-          input_data_ptr[i], \
-          quant_min, \
-          quant_max); \
+      out_data_ptr[i] = quantize_val<CTYPE_OUT, CTYPE_IN>(          \
+          _scale, _zero_point, input_data_ptr[i], quant_min, quant_max); \
     } \
   } break;
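A quick way to see the index arithmetic in the macro above: for a contiguous, row-major tensor, every run of axis_block_size consecutive flat elements shares one channel, where axis_block_size is the product of the dimensions after `axis`, so the channel of flat element i is (i / axis_block_size) % axis_size. The following is a minimal standalone sketch of that mapping, not the ExecuTorch kernel; the helper function, container types, and round-half-away-from-zero rounding are stand-ins and may differ from quantize_val.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Hypothetical helper (not ExecuTorch's quantize_val): divide by the scale,
// round, add the zero point, then clamp into [quant_min, quant_max].
int8_t quantize_value(double scale, int64_t zero_point, float value,
                      int64_t quant_min, int64_t quant_max) {
  int64_t q = zero_point + static_cast<int64_t>(std::round(value / scale));
  q = std::min(std::max(q, quant_min), quant_max);
  return static_cast<int8_t>(q);
}

std::vector<int8_t> quantize_per_channel_ref(
    const std::vector<float>& input,         // contiguous, row-major data
    const std::vector<int64_t>& sizes,       // tensor shape
    int64_t axis,                            // channel dimension
    const std::vector<double>& scale,        // one scale per channel
    const std::vector<int64_t>& zero_point,  // one zero point per channel
    int64_t quant_min,
    int64_t quant_max) {
  const int64_t axis_size = sizes[axis];

  // Product of the dimensions after `axis`: the number of consecutive flat
  // elements that share a channel before the channel index advances.
  int64_t axis_block_size = 1;
  for (int64_t d = axis + 1; d < static_cast<int64_t>(sizes.size()); ++d) {
    axis_block_size *= sizes[d];
  }

  std::vector<int8_t> out(input.size());
  for (size_t i = 0; i < input.size(); ++i) {
    // Which channel along `axis` does flat element i belong to?
    const int64_t channel_idx =
        (static_cast<int64_t>(i) / axis_block_size) % axis_size;
    out[i] = quantize_value(scale[channel_idx], zero_point[channel_idx],
                            input[i], quant_min, quant_max);
  }
  return out;
}

For a 2x3x4 input quantized along axis 1, for example, axis_block_size is 4: flat elements 0-3, 4-7, and 8-11 land in channels 0, 1, and 2, and the pattern then repeats for the second batch.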
