@@ -114,3 +114,277 @@ std::tuple<at::Tensor, at::Tensor> choose_qparams_per_token_asymmetric_aten(
} // namespace native
} // namespace executor
} // namespace torch

//
// Reference Implementation
//

/*
 * Reference implementation of choose_qparams_tensor
 */
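// Worked example: with int8 bounds (quant_min = -128, quant_max = 127) and an
// input spanning [-1.0f, 1.0f], the scale computed below is 2.0 / 255
// ~ 0.00784, and the nudged zero point is 0 (std::nearbyint rounds the
// half-way value -0.5 to the nearest even integer).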
std::tuple<at::Tensor, at::Tensor> choose_qparams_tensor_reference_impl(
    const at::Tensor& input,
    int64_t quant_min,
    int64_t quant_max) {
  // Create output tensors
  at::Tensor scale_out =
      at::empty({}, at::device(at::kCPU).dtype(at::kDouble));
  at::Tensor zero_point_out =
      at::empty({}, at::device(at::kCPU).dtype(at::kLong));

  // Find min and max values in the input tensor
  float min_val = input.min().item<float>();
  float max_val = input.max().item<float>();

  // Extend the [min, max] interval to ensure it contains 0
  min_val = std::min(min_val, 0.f);
  max_val = std::max(max_val, 0.f);

  // Calculate scale
  double scale =
      (static_cast<double>(max_val) - min_val) / (quant_max - quant_min);

  // Handle small scale
  constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f;
  if (float(scale) == 0.0f || std::isinf(1.0f / float(scale))) {
    scale = 0.1;
  }

  if (scale < SMALL_SCALE_THRESHOLD) {
    float org_scale = scale;
    scale = SMALL_SCALE_THRESHOLD;
    // Adjust min and max based on new scale
    if (min_val == 0.0f) {
      max_val = SMALL_SCALE_THRESHOLD * (quant_max - quant_min);
    } else if (max_val == 0.0f) {
      min_val = -SMALL_SCALE_THRESHOLD * (quant_max - quant_min);
    } else {
      float amplifier = SMALL_SCALE_THRESHOLD / org_scale;
      min_val *= amplifier;
      max_val *= amplifier;
    }
  }

  // Calculate zero point
  double zero_point_from_min =
      quant_min - min_val / static_cast<double>(scale);
  double zero_point_from_max =
      quant_max - max_val / static_cast<double>(scale);
  double zero_point_from_min_error =
      std::abs(quant_min) - std::abs(min_val / static_cast<double>(scale));
  double zero_point_from_max_error =
      std::abs(quant_max) - std::abs(max_val / static_cast<double>(scale));
  double initial_zero_point =
      zero_point_from_min_error < zero_point_from_max_error
      ? zero_point_from_min
      : zero_point_from_max;

  // Nudge zero point to be an integer
  int64_t nudged_zero_point = 0;
  if (initial_zero_point < quant_min) {
    nudged_zero_point = quant_min;
  } else if (initial_zero_point > quant_max) {
    nudged_zero_point = quant_max;
  } else {
    nudged_zero_point = std::nearbyint(static_cast<float>(initial_zero_point));
  }

  // Set output values on the scalar (0-dim) output tensors
  scale_out.fill_(scale);
  zero_point_out.fill_(nudged_zero_point);

  return std::make_tuple(scale_out, zero_point_out);
}

// Forward declaration of implementation functions
void test_vulkan_choose_qparams_tensor_impl(
    const std::vector<int>& input_sizes,
    int64_t quant_min,
    int64_t quant_max,
    at::ScalarType dtype,
    const vkcompute::utils::StorageType in_storage,
    const vkcompute::utils::StorageType out_storage);

// Wrapper function to test both buffer and texture storage types
void test_vulkan_choose_qparams_tensor(
    const std::vector<int>& input_sizes,
    int64_t quant_min,
    int64_t quant_max,
    at::ScalarType dtype) {
  // Test with buffer storage
  test_vulkan_choose_qparams_tensor_impl(
      input_sizes,
      quant_min,
      quant_max,
      dtype,
      vkcompute::utils::kBuffer,
      vkcompute::utils::kBuffer);

  // Test with texture storage
  test_vulkan_choose_qparams_tensor_impl(
      input_sizes,
      quant_min,
      quant_max,
      dtype,
      vkcompute::utils::kTexture3D,
      vkcompute::utils::kTexture3D);
}

void test_reference_choose_qparams_tensor(
    const std::vector<int>& input_sizes,
    int64_t quant_min,
    int64_t quant_max,
    at::ScalarType dtype) {
  std::vector<int64_t> input_sizes_int64(
      input_sizes.begin(), input_sizes.end());
  at::Tensor input =
      at::rand(input_sizes_int64, at::device(at::kCPU).dtype(at::kFloat));

  // Get reference output
  auto [reference_scale, reference_zero_point] =
      choose_qparams_tensor_reference_impl(input, quant_min, quant_max);

  // Get implementation output
  auto [impl_scale, impl_zero_point] =
      torch::executor::native::choose_qparams_tensor_aten(
          input, quant_min, quant_max, dtype);

  // Compare outputs
  const bool scale_correct = at::allclose(reference_scale, impl_scale);
  const bool zero_point_correct =
      at::equal(reference_zero_point, impl_zero_point);

  if (!scale_correct || !zero_point_correct) {
    std::cout << "\n"
              << "Failed with parameters: " << std::endl;
    std::cout << "quant_min: " << quant_min << std::endl;
    std::cout << "quant_max: " << quant_max << std::endl;

    std::cout << "input:" << std::endl;
    std::cout << input << std::endl;
    std::cout << "reference scale:" << std::endl;
    std::cout << reference_scale << std::endl;
    std::cout << "implementation scale:" << std::endl;
    std::cout << impl_scale << std::endl;
    std::cout << "reference zero_point:" << std::endl;
    std::cout << reference_zero_point << std::endl;
    std::cout << "implementation zero_point:" << std::endl;
    std::cout << impl_zero_point << std::endl;
  }

  ASSERT_TRUE(scale_correct && zero_point_correct);
}

void test_vulkan_choose_qparams_tensor_impl(
    const std::vector<int>& input_sizes,
    int64_t quant_min,
    int64_t quant_max,
    at::ScalarType dtype,
    const vkcompute::utils::StorageType in_storage,
    const vkcompute::utils::StorageType out_storage) {
  std::vector<int64_t> input_sizes_int64(
      input_sizes.begin(), input_sizes.end());
  at::Tensor input =
      at::rand(input_sizes_int64, at::device(at::kCPU).dtype(at::kFloat));

  // Get reference output
  auto [reference_scale, reference_zero_point] =
      torch::executor::native::choose_qparams_tensor_aten(
          input, quant_min, quant_max, dtype);

  // Build Vulkan choose_qparams_tensor graph
  using namespace vkcompute;

  GraphConfig config;
  config.set_storage_type_override(in_storage);
  ComputeGraph graph(config);

  IOValueRef r_input = graph.add_input_tensor(
      input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage);

  const ValueRef r_quant_min = graph.add_scalar<int64_t>(quant_min);
  const ValueRef r_quant_max = graph.add_scalar<int64_t>(quant_max);

  // Output tensors
  const ValueRef r_scale = graph.add_tensor({}, vkapi::kFloat, out_storage);
  const ValueRef r_zero_point = graph.add_tensor({}, vkapi::kInt, out_storage);

  VK_GET_OP_FN("choose_qparams.tensor")
  (graph,
   {
       r_input.value,
       r_quant_min,
       r_quant_max,
       r_scale,
       r_zero_point,
   });

  ValueRef staging_scale = graph.set_output_tensor(r_scale);
  ValueRef staging_zero_point = graph.set_output_tensor(r_zero_point);

  graph.prepare();
  graph.encode_prepack();
  graph.prepack();
  graph.encode_execute();

  // Run Vulkan choose_qparams_tensor
  graph.copy_into_staging(
      r_input.staging, input.const_data_ptr(), input.numel());

  graph.execute();

  // Create output tensors to hold the results - use types that match GPU output
  at::Tensor vk_scale =
      at::empty({}, at::device(at::kCPU).dtype(at::kFloat)).contiguous();
  at::Tensor vk_zero_point =
      at::empty({}, at::device(at::kCPU).dtype(at::kInt)).contiguous();

  // Copy results from GPU to CPU
  graph.copy_from_staging(
      staging_scale, vk_scale.mutable_data_ptr(), vk_scale.numel());
  graph.copy_from_staging(
      staging_zero_point,
      vk_zero_point.mutable_data_ptr(),
      vk_zero_point.numel());

  // Convert reference values to match Vulkan output types for comparison
  at::Tensor reference_scale_float = reference_scale.to(at::kFloat);
  at::Tensor reference_zero_point_int = reference_zero_point.to(at::kInt);

  // Compare outputs
  const bool scale_correct = at::allclose(reference_scale_float, vk_scale);
  const bool zero_point_correct =
      at::equal(reference_zero_point_int, vk_zero_point);

  if (!scale_correct || !zero_point_correct) {
    std::cout << "\n"
              << "Failed with parameters: " << std::endl;
    std::cout << "quant_min: " << quant_min << std::endl;
    std::cout << "quant_max: " << quant_max << std::endl;
    std::cout << "storage type: "
              << (in_storage == vkcompute::utils::kBuffer ? "buffer"
                                                          : "texture")
              << std::endl;

    // Only dump the tensors when the input doesn't have a ton of elements
    if (input.numel() < 100) {
      std::cout << "input:" << std::endl;
      std::cout << input << "\n" << std::endl;
      std::cout << "reference scale:" << std::endl;
      std::cout << reference_scale << std::endl;
      std::cout << "vulkan scale:" << std::endl;
      std::cout << vk_scale << "\n" << std::endl;
      std::cout << "reference zero_point:" << std::endl;
      std::cout << reference_zero_point << std::endl;
      std::cout << "vulkan zero_point:" << std::endl;
      std::cout << vk_zero_point << std::endl;
    }
  }

  ASSERT_TRUE(scale_correct && zero_point_correct);
}

TEST(VulkanChooseQparamsTest, test_reference_choose_qparams_tensor_int8) {
  test_reference_choose_qparams_tensor(
      {2, 3, 4}, // input sizes
      -128, // quant_min
      127, // quant_max
      at::kChar);
}
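
// Sketch of a companion test that exercises the Vulkan path through the
// buffer/texture wrapper test_vulkan_choose_qparams_tensor defined above.
// The test name and input sizes here are illustrative only.
TEST(VulkanChooseQparamsTest, test_vulkan_choose_qparams_tensor_int8) {
  test_vulkan_choose_qparams_tensor(
      {2, 3, 4}, // input sizes
      -128, // quant_min
      127, // quant_max
      at::kChar);
}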