[ET-VK][Ops] quantize_per_tensor.tensor variant

morelos · morelos · commit 6ecf0d1e6f26 · 2025-07-03T11:17:28.000-07:00
# Context We need a tensor variant for dequantize/quantize operators since that is the expected output of choose_qparams. # Changes This extends the logic that currently exists to support a tensor variant for scales and zeros. Differential Revision: [D77746136](https://our.internmc.facebook.com/intern/diff/D77746136/) [ghstack-poisoned]
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl
@@ -27,9 +27,14 @@ ${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "buffer")}
 ${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "buffer")}
 
 $if MODE == "per_tensor":
+  $if SHAPE == "tensor":
+    ${layout_declare_tensor(B, "r", "t_scale", "float", "buffer")}
+    ${layout_declare_tensor(B, "r", "t_zero_point", "int", "buffer")}
+
   layout(push_constant) uniform restrict Block {
-    float scale;
-    int zero_point;
+    $if SHAPE == "scalar":
+      float scale;
+      int zero_point;
     int quant_min;
     int quant_max;
   };
@@ -142,7 +147,10 @@ void quantize_per_tensor() {
   const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides);
 
   IN_T value = t_in[in_bufi];
-  OUT_T qvalue = quantize_val(value, scale, zero_point);
+  $if SHAPE == "scalar":
+    OUT_T qvalue = quantize_val(value, scale, zero_point);
+  $if SHAPE == "tensor":
+    OUT_T qvalue = quantize_val(value, t_scale[0], t_zero_point[0]);
 
   t_out[out_bufi] = qvalue;
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml
@@ -3,6 +3,7 @@ quantize_buffer:
     IN_DTYPE: float
     OUT_DTYPE: int32
     MODE: per_tensor
+    SHAPE: tensor
   generate_variant_forall:
     IN_DTYPE:
       - VALUE: half
@@ -15,6 +16,9 @@ quantize_buffer:
   shader_variants:
     - NAME: quantize_per_tensor_buffer
       MODE: per_tensor
+      SHAPE: scalar
+    - NAME: quantize_per_tensor_tensor_buffer
+      MODE: per_tensor
     - NAME: quantize_per_token_buffer
       MODE: per_token
     - NAME: quantize_per_channel_buffer
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl
@@ -17,6 +17,7 @@
 #define IVEC4_T ${texel_load_type(OUT_DTYPE, "texture3d")}
 
 #define ${MODE}
+#define ${SHAPE}
 
 ${define_active_storage_type("texture3d")}
 ${define_required_extensions(IN_DTYPE)}
@@ -32,9 +33,14 @@ ${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "texture3d")}
 ${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "texture3d")}
 
 $if MODE == "per_tensor":
+  $if SHAPE == "tensor":
+    ${layout_declare_tensor(B, "r", "t_scale", "float", "buffer")}
+    ${layout_declare_tensor(B, "r", "t_zero_point", "int", "buffer")}
+
   layout(push_constant) uniform restrict Block {
-    float scale;
-    int zero_point;
+    $if SHAPE == "scalar":
+      float scale;
+      int zero_point;
     int quant_min;
     int quant_max;
   };
@@ -146,7 +152,10 @@ void quantize_per_tensor() {
 
   [[unroll]] for (int i = 0; i < 4; ++i) {
     IN_T value = IN_T(intex[i]);
-    OUT_T qvalue = quantize_val(value, scale, zero_point);
+    $if SHAPE == "scalar":
+      OUT_T qvalue = quantize_val(value, scale, zero_point);
+    $if SHAPE == "tensor":
+      OUT_T qvalue = quantize_val(value, t_scale[0], t_zero_point[0]);
     outtex[i] = qvalue;
   }
   write_texel(t_out, pos, outtex);
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml
@@ -3,6 +3,7 @@ quantize_texture:
     IN_DTYPE: float
     OUT_DTYPE: int32
     MODE: per_tensor
+    SHAPE: tensor
   generate_variant_forall:
     IN_DTYPE:
       - VALUE: half
@@ -15,6 +16,9 @@ quantize_texture:
   shader_variants:
     - NAME: quantize_per_tensor_texture3d
       MODE: per_tensor
+      SHAPE: scalar
+    - NAME: quantize_per_tensor_tensor_texture3d
+      MODE: per_tensor
     - NAME: quantize_per_token_texture3d
       MODE: per_token
     - NAME: quantize_per_channel_texture3d
diff --git a/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp b/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp
@@ -51,17 +51,19 @@ utils::uvec3 quantize_per_channel_local_wg_size(
 
   const ValueRef input = args.at(1).refs.at(0);
 
-  utils::uvec3 local_wg_size = graph->create_local_wg_size(global_workgroup_size);
-
-  // WORKAROUND: The CommandBuffer::dispatch function divides global_workgroup_size
-  // by local_workgroup_size to get the number of workgroups to dispatch.
-  // For per-channel quantization along the batch axis, we need to ensure that
-  // we dispatch the correct number of workgroups in the Z dimension to cover
-  // all batch-channel combinations.
+  utils::uvec3 local_wg_size =
+      graph->create_local_wg_size(global_workgroup_size);
+
+  // WORKAROUND: The CommandBuffer::dispatch function divides
+  // global_workgroup_size by local_workgroup_size to get the number of
+  // workgroups to dispatch. For per-channel quantization along the batch axis,
+  // we need to ensure that we dispatch the correct number of workgroups in the
+  // Z dimension to cover all batch-channel combinations.
   //
-  // If local_wg_size[2] > 1, then div_up(global_workgroup_size[2], local_wg_size[2])
-  // might reduce the number of workgroups dispatched. To ensure we dispatch
-  // global_workgroup_size[2] workgroups in the Z dimension, we set local_wg_size[2] = 1.
+  // If local_wg_size[2] > 1, then div_up(global_workgroup_size[2],
+  // local_wg_size[2]) might reduce the number of workgroups dispatched. To
+  // ensure we dispatch global_workgroup_size[2] workgroups in the Z dimension,
+  // we set local_wg_size[2] = 1.
   const auto input_sizes = graph->sizes_of(input);
   if (global_workgroup_size[2] > 1 && input_sizes[3] > 0) {
     local_wg_size[2] = 1;
@@ -78,13 +80,23 @@ void add_quantize_per_tensor_node(
     const ValueRef& quant_min,
     const ValueRef& quant_max,
     const ValueRef& output) {
+  const bool is_tensor_scale_zp =
+      graph.val_is_tensor(scale) && graph.val_is_tensor(zero_point);
+
   std::string kernel_name("quantize_per_tensor");
+  if (is_tensor_scale_zp) {
+    kernel_name += "_tensor";
+  }
   add_storage_type_suffix(kernel_name, graph.storage_type_of(input));
   add_dtype_suffix(kernel_name, graph.dtype_of(input));
   add_dtype_suffix(kernel_name, graph.dtype_of(output));
 
-  float scale_val = static_cast<float>(graph.get_double(scale));
-  int zero_point_val = static_cast<int>(graph.get_int(zero_point));
+  float scale_val = 1.0;
+  int zero_point_val = 0;
+  if (!is_tensor_scale_zp) {
+    scale_val = static_cast<float>(graph.get_double(scale));
+    zero_point_val = static_cast<int>(graph.get_int(zero_point));
+  }
   int quant_min_val = static_cast<int>(graph.get_int(quant_min));
   int quant_max_val = static_cast<int>(graph.get_int(quant_max));
 
@@ -98,15 +110,17 @@ void add_quantize_per_tensor_node(
         graph.strides_ubo(input),
         graph.sizes_ubo(output),
         graph.strides_ubo(output)};
+  } else {
+    param_ubos = {
+        graph.logical_limits_ubo(input), graph.logical_limits_ubo(output)};
+  }
+
+  if (is_tensor_scale_zp) {
     push_constants = {
-        PushConstantDataInfo(&scale_val, sizeof(float)),
-        PushConstantDataInfo(&zero_point_val, sizeof(int)),
         PushConstantDataInfo(&quant_min_val, sizeof(int)),
         PushConstantDataInfo(&quant_max_val, sizeof(int)),
     };
   } else {
-    param_ubos = {
-        graph.logical_limits_ubo(input), graph.logical_limits_ubo(output)};
     push_constants = {
         PushConstantDataInfo(&scale_val, sizeof(float)),
         PushConstantDataInfo(&zero_point_val, sizeof(int)),
@@ -120,13 +134,20 @@ void add_quantize_per_tensor_node(
       graph.hashed_layout_of(input),
   };
 
+  std::vector<ArgGroup> inputs_and_outputs = {
+      {output, vkapi::kWrite}, {input, vkapi::kRead}};
+  if (is_tensor_scale_zp) {
+    inputs_and_outputs.emplace_back(
+        ArgGroup{{scale, zero_point}, vkapi::kRead});
+  }
+
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
       default_pick_global_wg_size,
       default_pick_local_wg_size,
       // Inputs and Outputs
-      {{output, vkapi::kWrite}, {input, vkapi::kRead}},
+      inputs_and_outputs,
       // Shader param buffers
       param_ubos,
       // Push Constants
@@ -241,8 +262,8 @@ void add_quantize_per_channel_node(
 
   int num_channels;
   if (axis_val == 0 && ndim == 4 && !graph.is_buffer_storage(input)) {
-    // For batch dimension quantization in 4D tensors, pass the actual number of channels
-    // so the shader can correctly unfold the batch-channel folding
+    // For batch dimension quantization in 4D tensors, pass the actual number of
+    // channels so the shader can correctly unfold the batch-channel folding
     num_channels = static_cast<int>(input_sizes[1]); // Channel dimension
   } else {
     num_channels = static_cast<int>(input_sizes[axis_val]);
@@ -487,6 +508,9 @@ REGISTER_OPERATORS {
   VK_REGISTER_OP(
       quantized_decomposed.quantize_per_tensor.default,
       quantize_per_tensor_impl);
+  VK_REGISTER_OP(
+      quantized_decomposed.quantize_per_tensor.tensor,
+      quantize_per_tensor_impl);
   VK_REGISTER_OP(
       quantized_decomposed.quantize_per_token.default, quantize_per_token_impl);
   VK_REGISTER_OP(
diff --git a/backends/vulkan/test/op_tests/quantize_test.cpp b/backends/vulkan/test/op_tests/quantize_test.cpp