Skip to content

Commit 4e8c1f3

Browse files
author
morelos
committed
Update on "[ET-VK][Ops] dequantize_per_channel shaders and impl"
# Context

We need to enable the core logic for dequantize_per_channel in the Vulkan shader. This implements the shader itself and its C++ header. TODO: add a fuller description of the operator.

# Changes

This creates an extension of the existing files for dequantize_per_channel.

Differential Revision: [D77746141](https://our.internmc.facebook.com/intern/diff/D77746141/)

[ghstack-poisoned]
2 parents 014e327 + 57964dc commit 4e8c1f3

File tree

7 files changed

+176
-105
lines changed

7 files changed

+176
-105
lines changed

backends/vulkan/runtime/VulkanBackend.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -616,22 +616,26 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
616616
// Handle dtype conversion between Vulkan and ExecutorTorch (in-place)
617617
if (vulkan_dtype == vkapi::kFloat &&
618618
et_dtype == executorch::aten::ScalarType::Double) {
619-
// Convert float32 to float64 in-place (backwards to avoid overwriting)
619+
// Convert float32 to float64 in-place (backwards to avoid
620+
// overwriting)
620621
double* data_64 = args[o]->toTensor().mutable_data_ptr<double>();
621622
const float* data_32 = args[o]->toTensor().const_data_ptr<float>();
622623
for (size_t j = args[o]->toTensor().numel() - 1; j >= 0; --j) {
623624
data_64[j] = static_cast<double>(data_32[j]);
624-
if (j == 0) break; // Prevent underflow for size_t
625+
if (j == 0)
626+
break; // Prevent underflow for size_t
625627
}
626628
} else if (
627629
vulkan_dtype == vkapi::kInt &&
628630
et_dtype == executorch::aten::ScalarType::Long) {
629631
// Convert int32 to int64 in-place (backwards to avoid overwriting)
630632
int64_t* data_64 = args[o]->toTensor().mutable_data_ptr<int64_t>();
631-
const int32_t* data_32 = args[o]->toTensor().const_data_ptr<int32_t>();
633+
const int32_t* data_32 =
634+
args[o]->toTensor().const_data_ptr<int32_t>();
632635
for (size_t j = args[o]->toTensor().numel() - 1; j >= 0; --j) {
633636
data_64[j] = static_cast<int64_t>(data_32[j]);
634-
if (j == 0) break; // Prevent underflow for size_t
637+
if (j == 0)
638+
break; // Prevent underflow for size_t
635639
}
636640
}
637641
}

backends/vulkan/runtime/graph/ops/impl/Dequantize.cpp

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -51,17 +51,19 @@ utils::uvec3 dequantize_per_channel_local_wg_size(
5151

5252
const ValueRef input = args.at(1).refs.at(0);
5353

54-
utils::uvec3 local_wg_size = graph->create_local_wg_size(global_workgroup_size);
55-
56-
// WORKAROUND: The CommandBuffer::dispatch function divides global_workgroup_size
57-
// by local_workgroup_size to get the number of workgroups to dispatch.
58-
// For per-channel dequantization along the batch axis, we need to ensure that
59-
// we dispatch the correct number of workgroups in the Z dimension to cover
60-
// all batch-channel combinations.
54+
utils::uvec3 local_wg_size =
55+
graph->create_local_wg_size(global_workgroup_size);
56+
57+
// WORKAROUND: The CommandBuffer::dispatch function divides
58+
// global_workgroup_size by local_workgroup_size to get the number of
59+
// workgroups to dispatch. For per-channel dequantization along the batch
60+
// axis, we need to ensure that we dispatch the correct number of workgroups
61+
// in the Z dimension to cover all batch-channel combinations.
6162
//
62-
// If local_wg_size[2] > 1, then div_up(global_workgroup_size[2], local_wg_size[2])
63-
// might reduce the number of workgroups dispatched. To ensure we dispatch
64-
// global_workgroup_size[2] workgroups in the Z dimension, we set local_wg_size[2] = 1.
63+
// If local_wg_size[2] > 1, then div_up(global_workgroup_size[2],
64+
// local_wg_size[2]) might reduce the number of workgroups dispatched. To
65+
// ensure we dispatch global_workgroup_size[2] workgroups in the Z dimension,
66+
// we set local_wg_size[2] = 1.
6567
const auto input_sizes = graph->sizes_of(input);
6668
if (global_workgroup_size[2] > 1 && input_sizes[3] > 0) {
6769
local_wg_size[2] = 1;
@@ -241,8 +243,8 @@ void add_dequantize_per_channel_node(
241243

242244
int num_channels;
243245
if (axis_val == 0 && ndim == 4 && !graph.is_buffer_storage(input)) {
244-
// For batch dimension dequantization in 4D tensors, pass the actual number of channels
245-
// so the shader can correctly unfold the batch-channel folding
246+
// For batch dimension dequantization in 4D tensors, pass the actual number
247+
// of channels so the shader can correctly unfold the batch-channel folding
246248
num_channels = static_cast<int>(input_sizes[1]); // Channel dimension
247249
} else {
248250
num_channels = static_cast<int>(input_sizes[axis_val]);

backends/vulkan/runtime/graph/ops/impl/Quantize.cpp

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -51,17 +51,19 @@ utils::uvec3 quantize_per_channel_local_wg_size(
5151

5252
const ValueRef input = args.at(1).refs.at(0);
5353

54-
utils::uvec3 local_wg_size = graph->create_local_wg_size(global_workgroup_size);
55-
56-
// WORKAROUND: The CommandBuffer::dispatch function divides global_workgroup_size
57-
// by local_workgroup_size to get the number of workgroups to dispatch.
58-
// For per-channel quantization along the batch axis, we need to ensure that
59-
// we dispatch the correct number of workgroups in the Z dimension to cover
60-
// all batch-channel combinations.
54+
utils::uvec3 local_wg_size =
55+
graph->create_local_wg_size(global_workgroup_size);
56+
57+
// WORKAROUND: The CommandBuffer::dispatch function divides
58+
// global_workgroup_size by local_workgroup_size to get the number of
59+
// workgroups to dispatch. For per-channel quantization along the batch axis,
60+
// we need to ensure that we dispatch the correct number of workgroups in the
61+
// Z dimension to cover all batch-channel combinations.
6162
//
62-
// If local_wg_size[2] > 1, then div_up(global_workgroup_size[2], local_wg_size[2])
63-
// might reduce the number of workgroups dispatched. To ensure we dispatch
64-
// global_workgroup_size[2] workgroups in the Z dimension, we set local_wg_size[2] = 1.
63+
// If local_wg_size[2] > 1, then div_up(global_workgroup_size[2],
64+
// local_wg_size[2]) might reduce the number of workgroups dispatched. To
65+
// ensure we dispatch global_workgroup_size[2] workgroups in the Z dimension,
66+
// we set local_wg_size[2] = 1.
6567
const auto input_sizes = graph->sizes_of(input);
6668
if (global_workgroup_size[2] > 1 && input_sizes[3] > 0) {
6769
local_wg_size[2] = 1;
@@ -241,8 +243,8 @@ void add_quantize_per_channel_node(
241243

242244
int num_channels;
243245
if (axis_val == 0 && ndim == 4 && !graph.is_buffer_storage(input)) {
244-
// For batch dimension quantization in 4D tensors, pass the actual number of channels
245-
// so the shader can correctly unfold the batch-channel folding
246+
// For batch dimension quantization in 4D tensors, pass the actual number of
247+
// channels so the shader can correctly unfold the batch-channel folding
246248
num_channels = static_cast<int>(input_sizes[1]); // Channel dimension
247249
} else {
248250
num_channels = static_cast<int>(input_sizes[axis_val]);

backends/vulkan/test/op_tests/dequantize_test.cpp

Lines changed: 80 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,15 @@ Tensor& dequantize_per_channel_out_no_context(
100100
executorch::aten::optional<ScalarType> out_dtype,
101101
Tensor& out) {
102102
return torch::executor::native::dequantize_per_channel_out(
103-
input, scale, zero_points, axis, quant_min, quant_max, dtype, out_dtype, out);
103+
input,
104+
scale,
105+
zero_points,
106+
axis,
107+
quant_min,
108+
quant_max,
109+
dtype,
110+
out_dtype,
111+
out);
104112
}
105113

106114
// ATen wrapper for dequantize_per_tensor
@@ -480,7 +488,8 @@ at::Tensor dequantize_per_channel_reference_impl(
480488
}
481489

482490
// Store casted values to avoid repeated casting
483-
const int32_t channel_zero_point_int32 = static_cast<int32_t>(channel_zero_point);
491+
const int32_t channel_zero_point_int32 =
492+
static_cast<int32_t>(channel_zero_point);
484493
const float channel_scale_float = static_cast<float>(channel_scale);
485494

486495
// Get the input value and dequantize
@@ -490,19 +499,24 @@ at::Tensor dequantize_per_channel_reference_impl(
490499
// Following the CPU implementation pattern: (input - zero_point) * scale
491500
if (dtype == at::kByte) {
492501
uint8_t qvalue = input.flatten()[flat_idx].item<uint8_t>();
493-
dequantized_value = (qvalue - channel_zero_point_int32) * channel_scale_float;
502+
dequantized_value =
503+
(qvalue - channel_zero_point_int32) * channel_scale_float;
494504
} else if (dtype == at::kChar) {
495505
int8_t qvalue = input.flatten()[flat_idx].item<int8_t>();
496-
dequantized_value = (qvalue - channel_zero_point_int32) * channel_scale_float;
506+
dequantized_value =
507+
(qvalue - channel_zero_point_int32) * channel_scale_float;
497508
} else if (dtype == at::kShort) {
498509
int16_t qvalue = input.flatten()[flat_idx].item<int16_t>();
499-
dequantized_value = (qvalue - channel_zero_point_int32) * channel_scale_float;
510+
dequantized_value =
511+
(qvalue - channel_zero_point_int32) * channel_scale_float;
500512
} else if (dtype == at::kInt) {
501513
int32_t qvalue = input.flatten()[flat_idx].item<int32_t>();
502-
dequantized_value = (qvalue - channel_zero_point_int32) * channel_scale_float;
514+
dequantized_value =
515+
(qvalue - channel_zero_point_int32) * channel_scale_float;
503516
} else if (dtype == at::kLong) {
504517
int64_t qvalue = input.flatten()[flat_idx].item<int64_t>();
505-
dequantized_value = (qvalue - channel_zero_point_int32) * channel_scale_float;
518+
dequantized_value =
519+
(qvalue - channel_zero_point_int32) * channel_scale_float;
506520
} else {
507521
throw std::runtime_error("Unsupported input dtype");
508522
}
@@ -878,7 +892,8 @@ void test_vulkan_dequantize_per_tensor_impl(
878892
output_correct =
879893
at::allclose(reference_out, vk_out, /*rtol=*/1e-2, /*atol=*/1e-2);
880894
} else {
881-
output_correct = at::allclose(reference_out, vk_out, /*rtol=*/1e-5, /*atol=*/1e-5);
895+
output_correct =
896+
at::allclose(reference_out, vk_out, /*rtol=*/1e-5, /*atol=*/1e-5);
882897
}
883898
if (!output_correct) {
884899
std::cout << "\n"
@@ -1358,7 +1373,8 @@ void test_vulkan_dequantize_per_token_impl(
13581373
output_correct =
13591374
at::allclose(reference_out, vk_out, /*rtol=*/1e-2, /*atol=*/1e-2);
13601375
} else {
1361-
output_correct = at::allclose(reference_out, vk_out, /*rtol=*/1e-5, /*atol=*/1e-5);
1376+
output_correct =
1377+
at::allclose(reference_out, vk_out, /*rtol=*/1e-5, /*atol=*/1e-5);
13621378
}
13631379
if (!output_correct) {
13641380
std::cout << "\n"
@@ -1737,16 +1753,21 @@ void test_vulkan_dequantize_per_channel_impl(
17371753
check_dequantize_args(quant_min, quant_max, dtype, out_dtype);
17381754
check_dequantize_per_channel_args(input_sizes, scales, zero_points, axis);
17391755

1740-
std::vector<int64_t> input_sizes_int64(input_sizes.begin(), input_sizes.end());
1756+
std::vector<int64_t> input_sizes_int64(
1757+
input_sizes.begin(), input_sizes.end());
17411758

17421759
// Create random float tensor
1743-
at::Tensor float_x = at::rand(input_sizes_int64, at::device(at::kCPU).dtype(at::kFloat));
1760+
at::Tensor float_x =
1761+
at::rand(input_sizes_int64, at::device(at::kCPU).dtype(at::kFloat));
17441762

17451763
// Create scale and zero_point tensors
1746-
at::Tensor scale_tensor = at::tensor(scales, at::device(at::kCPU).dtype(at::kFloat));
1747-
at::Tensor zero_point_tensor = at::tensor(zero_points, at::device(at::kCPU).dtype(at::kInt));
1764+
at::Tensor scale_tensor =
1765+
at::tensor(scales, at::device(at::kCPU).dtype(at::kFloat));
1766+
at::Tensor zero_point_tensor =
1767+
at::tensor(zero_points, at::device(at::kCPU).dtype(at::kInt));
17481768

1749-
// Map the dtype to the corresponding quantized type and quantize the float tensor
1769+
// Map the dtype to the corresponding quantized type and quantize the float
1770+
// tensor
17501771
c10::ScalarType qtype;
17511772
at::Tensor adjusted_zero_points = zero_point_tensor;
17521773

@@ -1764,36 +1785,35 @@ void test_vulkan_dequantize_per_channel_impl(
17641785
qtype = c10::kQInt32;
17651786
}
17661787

1767-
// Normalize axis for ATen (ATen doesn't handle negative axes in quantize_per_channel)
1788+
// Normalize axis for ATen (ATen doesn't handle negative axes in
1789+
// quantize_per_channel)
17681790
int64_t normalized_axis = axis;
17691791
if (normalized_axis < 0) {
17701792
normalized_axis += input_sizes_int64.size();
17711793
}
17721794

17731795
// Quantize using ATen
17741796
at::Tensor quantized_aten = at::quantize_per_channel(
1775-
float_x,
1776-
scale_tensor,
1777-
adjusted_zero_points,
1778-
normalized_axis,
1779-
qtype);
1797+
float_x, scale_tensor, adjusted_zero_points, normalized_axis, qtype);
17801798

17811799
// Get ATen dequantized output
17821800
at::Tensor aten_out = at::dequantize(quantized_aten).to(out_dtype);
17831801

17841802
// Extract the quantized values (int_repr) to use with our implementations
17851803
at::Tensor quantized_input = quantized_aten.int_repr().to(dtype);
17861804

1787-
// Get reference output using torch::executor::native::dequantize_per_channel_aten
1788-
at::Tensor reference_out = torch::executor::native::dequantize_per_channel_aten(
1789-
quantized_input,
1790-
scale_tensor.to(at::kDouble),
1791-
zero_point_tensor.to(at::kLong),
1792-
axis,
1793-
quant_min,
1794-
quant_max,
1795-
dtype,
1796-
out_dtype);
1805+
// Get reference output using
1806+
// torch::executor::native::dequantize_per_channel_aten
1807+
at::Tensor reference_out =
1808+
torch::executor::native::dequantize_per_channel_aten(
1809+
quantized_input,
1810+
scale_tensor.to(at::kDouble),
1811+
zero_point_tensor.to(at::kLong),
1812+
axis,
1813+
quant_min,
1814+
quant_max,
1815+
dtype,
1816+
out_dtype);
17971817

17981818
// Build Vulkan dequantize_per_channel graph
17991819
using namespace vkcompute;
@@ -1828,8 +1848,10 @@ void test_vulkan_dequantize_per_channel_impl(
18281848
const ValueRef r_axis = graph.add_scalar<int64_t>(axis);
18291849
const ValueRef r_quant_min = graph.add_scalar<int64_t>(quant_min);
18301850
const ValueRef r_quant_max = graph.add_scalar<int64_t>(quant_max);
1831-
const ValueRef r_dtype = graph.add_scalar<int64_t>(static_cast<int64_t>(dtype));
1832-
const ValueRef r_output_dtype = graph.add_scalar<int64_t>(static_cast<int64_t>(out_dtype));
1851+
const ValueRef r_dtype =
1852+
graph.add_scalar<int64_t>(static_cast<int64_t>(dtype));
1853+
const ValueRef r_output_dtype =
1854+
graph.add_scalar<int64_t>(static_cast<int64_t>(out_dtype));
18331855

18341856
VK_GET_OP_FN("quantized_decomposed.dequantize_per_channel.default")
18351857
(graph,
@@ -1854,7 +1876,9 @@ void test_vulkan_dequantize_per_channel_impl(
18541876

18551877
// Copy input data to GPU
18561878
graph.copy_into_staging(
1857-
r_input.staging, quantized_input.const_data_ptr(), quantized_input.numel());
1879+
r_input.staging,
1880+
quantized_input.const_data_ptr(),
1881+
quantized_input.numel());
18581882

18591883
// copy scale tensor to GPU
18601884
graph.copy_into_staging(
@@ -1881,7 +1905,8 @@ void test_vulkan_dequantize_per_channel_impl(
18811905
output_correct =
18821906
at::allclose(reference_out, vk_out, /*rtol=*/1e-2, /*atol=*/1e-2);
18831907
} else {
1884-
output_correct = at::allclose(reference_out, vk_out, /*rtol=*/1e-5, /*atol=*/1e-5);
1908+
output_correct =
1909+
at::allclose(reference_out, vk_out, /*rtol=*/1e-5, /*atol=*/1e-5);
18851910
}
18861911
if (!output_correct) {
18871912
std::cout << "\n"
@@ -1992,7 +2017,9 @@ TEST(
19922017

19932018
// END OF REFERENCE TESTS
19942019

1995-
TEST(VulkanDequantizePerChannelTest, test_vulkan_dequantize_per_channel_int8_to_float_axis0) {
2020+
TEST(
2021+
VulkanDequantizePerChannelTest,
2022+
test_vulkan_dequantize_per_channel_int8_to_float_axis0) {
19962023
std::vector<float> scales(9, 0.1f);
19972024
std::vector<int> zero_points(9, 2);
19982025

@@ -2052,7 +2079,9 @@ TEST(VulkanDequantizePerChannelTest, test_vulkan_dequantize_per_channel_int8_to_
20522079
at::kFloat);
20532080
}
20542081

2055-
TEST(VulkanDequantizePerChannelTest, test_vulkan_dequantize_per_channel_int8_to_float_axis1) {
2082+
TEST(
2083+
VulkanDequantizePerChannelTest,
2084+
test_vulkan_dequantize_per_channel_int8_to_float_axis1) {
20562085
std::vector<float> scales(14, 0.001f);
20572086
std::vector<int> zero_points(14, -5);
20582087

@@ -2101,7 +2130,9 @@ TEST(VulkanDequantizePerChannelTest, test_vulkan_dequantize_per_channel_int8_to_
21012130
at::kFloat);
21022131
}
21032132

2104-
TEST(VulkanDequantizePerChannelTest, test_vulkan_dequantize_per_channel_int8_to_float_axis2) {
2133+
TEST(
2134+
VulkanDequantizePerChannelTest,
2135+
test_vulkan_dequantize_per_channel_int8_to_float_axis2) {
21052136
std::vector<float> scales(11, 0.5f);
21062137
std::vector<int> zero_points(11, 12);
21072138

@@ -2139,7 +2170,9 @@ TEST(VulkanDequantizePerChannelTest, test_vulkan_dequantize_per_channel_int8_to_
21392170
at::kFloat);
21402171
}
21412172

2142-
TEST(VulkanDequantizePerChannelTest, test_vulkan_dequantize_per_channel_int8_to_float_axis3) {
2173+
TEST(
2174+
VulkanDequantizePerChannelTest,
2175+
test_vulkan_dequantize_per_channel_int8_to_float_axis3) {
21432176
std::vector<float> scales(7, 0.5f);
21442177
std::vector<int> zero_points(7, 12);
21452178

@@ -2166,7 +2199,9 @@ TEST(VulkanDequantizePerChannelTest, test_vulkan_dequantize_per_channel_int8_to_
21662199
at::kFloat);
21672200
}
21682201

2169-
TEST(VulkanDequantizePerChannelTest, test_vulkan_dequantize_per_channel_uint8_to_float_comprehensive) {
2202+
TEST(
2203+
VulkanDequantizePerChannelTest,
2204+
test_vulkan_dequantize_per_channel_uint8_to_float_comprehensive) {
21702205
std::vector<float> scales = {0.1, 0.2, 0.0001, 0.5, 0.02};
21712206
std::vector<int> zero_points = {0, 5, -5, 1, 12};
21722207

@@ -2226,7 +2261,9 @@ TEST(VulkanDequantizePerChannelTest, test_vulkan_dequantize_per_channel_uint8_to
22262261
at::kFloat);
22272262
}
22282263

2229-
TEST(VulkanDequantizePerChannelTest, test_vulkan_dequantize_per_channel_8bit_to_half) {
2264+
TEST(
2265+
VulkanDequantizePerChannelTest,
2266+
test_vulkan_dequantize_per_channel_8bit_to_half) {
22302267
std::vector<float> scales = {0.1, 0.2, 0.01, 0.5, 0.02};
22312268
std::vector<int> zero_points = {0, 5, 5, 1, 12};
22322269

@@ -2286,7 +2323,9 @@ TEST(VulkanDequantizePerChannelTest, test_vulkan_dequantize_per_channel_8bit_to_
22862323
at::kHalf);
22872324
}
22882325

2289-
TEST(VulkanDequantizePerChannelTest, test_vulkan_dequantize_per_channel_8bit_to_double) {
2326+
TEST(
2327+
VulkanDequantizePerChannelTest,
2328+
test_vulkan_dequantize_per_channel_8bit_to_double) {
22902329
std::vector<float> scales = {0.1, 0.2, 0.01, 0.5, 0.02};
22912330
std::vector<int> zero_points = {0, 5, 5, 1, 12};
22922331

0 commit comments

Comments (0)