diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams.glslh b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams.glslh
new file mode 100644
index 00000000000..66620e9b174
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams.glslh
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef CHOOSE_QPARAMS_GLSLH
+#define CHOOSE_QPARAMS_GLSLH
+
+// equivalent of the eps defined in the cpu implementation
+#define SMALL_SCALE_THRESHOLD 6.1e-5
+
+// Calculate scale and zero point from min and max values.
+//
+// Maps the float range [min_val, max_val] onto the integer range [qmin, qmax].
+// The range is widened to contain 0.0, a degenerate scale falls back to 0.1,
+// and scales below SMALL_SCALE_THRESHOLD are raised to it. Outputs use `out`.
+void calculate_scale_and_zero_point(
+    float min_val,
+    float max_val,
+    int qmin,
+    int qmax,
+    out float scale_val,
+    out int zero_point_val) {
+  // ensure we have zero included in our range
+  min_val = min(min_val, 0.0);
+  max_val = max(max_val, 0.0);
+
+  float q_range = float(qmax - qmin);
+  scale_val = (max_val - min_val) / q_range;
+
+  // A zero scale (or one whose reciprocal overflows) cannot be divided by.
+  if (scale_val == 0.0 || isinf(1.0 / scale_val)) {
+    scale_val = 0.1;
+  }
+
+  // Cut off small scale and stretch the range so it stays consistent with it
+  if (scale_val < SMALL_SCALE_THRESHOLD) {
+    float org_scale = scale_val;
+    scale_val = SMALL_SCALE_THRESHOLD;
+
+    if (min_val == 0.0) {
+      max_val = SMALL_SCALE_THRESHOLD * q_range;
+    } else if (max_val == 0.0) {
+      min_val = -SMALL_SCALE_THRESHOLD * q_range;
+    } else {
+      float amplifier = SMALL_SCALE_THRESHOLD / org_scale;
+      min_val *= amplifier;
+      max_val *= amplifier;
+    }
+  }
+
+  // Candidate zero points anchored at either end of the range; keep the one
+  // with the smaller error term.
+  float scaled_min = min_val / scale_val;
+  float scaled_max = max_val / scale_val;
+  float min_error = abs(float(qmin)) - abs(scaled_min);
+  float max_error = abs(float(qmax)) - abs(scaled_max);
+  float initial_zero_point = min_error < max_error
+      ? float(qmin) - scaled_min
+      : float(qmax) - scaled_max;
+
+  // Clamp to [qmin, qmax], then round. roundEven() is used because GLSL
+  // leaves the direction round() takes for exact .5 ties up to the driver.
+  zero_point_val = int(roundEven(clamp(initial_zero_point, float(qmin), float(qmax))));
+}
+
+#endif // CHOOSE_QPARAMS_GLSLH
diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_buffer.glsl
new file mode 100644
index 00000000000..dcbfe493f34
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_buffer.glsl
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define IN_T ${buffer_scalar_type(IN_DTYPE)}
+
+#define ${MODE}
+
+${define_active_storage_type("buffer")}
+${define_required_extensions(IN_DTYPE)}
+
+#extension GL_EXT_control_flow_attributes : require
+
+layout(std430) buffer;
+
+${layout_declare_tensor(B, "w", "t_scale", "float", "buffer")}
+${layout_declare_tensor(B, "w", "t_zero_point", "int", "buffer")}
+${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "buffer")}
+
+$if MODE == "per_tensor":
+  layout(push_constant) uniform restrict Block {
+    int quant_min;
+    int quant_max;
+  };
+$else:
+  layout(push_constant) uniform restrict Block {
+    int num_tokens;
+    int quant_min;
+    int quant_max;
+  };
+
+${layout_declare_ubo(B, "ivec4", "t_in_sizes")}
+${layout_declare_ubo(B, "ivec4", "t_in_strides")}
+${layout_declare_ubo(B, "ivec4", "t_scale_sizes")}
+${layout_declare_ubo(B, "ivec4", "t_scale_strides")}
+${layout_declare_ubo(B, "ivec4", "t_zero_point_sizes")}
+${layout_declare_ubo(B, "ivec4", "t_zero_point_strides")}
+
+#include "indexing_utils.h"
+#include "choose_qparams.glslh"
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+#define NWORKERS 64
+
+// Shared memory for reduction, indexed by gl_LocalInvocationID.x - the local work group size must not exceed NWORKERS
+shared float shared_min[NWORKERS];
+shared float shared_max[NWORKERS];
+
+/*
+ * QUANTIZATION PARAMETER COMPUTATION SHADER (BUFFER STORAGE)
+ *
+ * This shader computes quantization parameters (scale and zero_point) for converting
+ * floating-point tensors to n-bit integer representations while preserving the
+ * original data range as much as possible.
+ *
+ * ALGORITHM:
+ * 1. Find global min/max values across tensor elements using parallel reduction
+ * 2. Use tree reduction with shared memory for efficient min/max computation
+ * 3. Calculate scale = (max - min) / (quant_max - quant_min)
+ * 4. Calculate zero_point to map floating-point zero to integer value
+ *
+ * WORKGROUP CONFIGURATION:
+ * - Per-Tensor Mode:
+ *   - Global WG Size: {1, 1, 1} (single workgroup processes entire tensor)
+ *   - Local WG Size: {64, 1, 1} (matches NWORKERS for shared memory)
+ * - Per-Token Mode:
+ *   - Global WG Size: {num_tokens, 1, 1} (one workgroup per token)
+ *   - Local WG Size: {64, 1, 1} (matches NWORKERS for shared memory)
+ *
+ * SUPPORTED CONFIGURATIONS:
+ * - Buffer Storage: Uses simple linear indexing through buffer elements
+ * - No axis mapping or packing considerations - processes elements sequentially
+ * - Works with any tensor layout since it accesses buffer data linearly
+ *
+ * TREE REDUCTION VISUALIZATION FOR MIN/MAX FINDING:
+ * For 8 threads processing elements [10, 1, 8, 1, 0, 2, 3, 5]:
+ *
+ * Initial shared_min/shared_max arrays populated by each thread:
+ * shared_min:  | 10 | 1 | 8 | 1 | 0 | 2 | 3 | 5 |
+ * shared_max:  | 10 | 1 | 8 | 1 | 0 | 2 | 3 | 5 |
+ * Thread:      |  0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ *
+ * Stride 4 (thread i < 4 combines slot i with slot i + 4):
+ * shared_min:  |  0 | 1 | 3 | 1 |   |   |   |   |  (min(10,0), min(1,2), min(8,3), min(1,5))
+ * shared_max:  | 10 | 2 | 8 | 5 |   |   |   |   |  (max(10,0), max(1,2), max(8,3), max(1,5))
+ * Active:      |  0 | 1 | 2 | 3 |   |   |   |   |
+ *
+ * Stride 2 (thread i < 2 combines slot i with slot i + 2):
+ * shared_min:  |  0 | 1 |   |   |   |   |   |   |  (min(0,3), min(1,1))
+ * shared_max:  | 10 | 5 |   |   |   |   |   |   |  (max(10,8), max(2,5))
+ * Active:      |  0 | 1 |   |   |   |   |   |   |
+ *
+ * Stride 1 (final comparison, thread 0 combines slot 0 with slot 1):
+ * shared_min:  |  0 |   |   |   |   |   |   |   |  (min(0,1) = 0)
+ * shared_max:  | 10 |   |   |   |   |   |   |   |  (max(10,5) = 10)
+ * Active:      |  0 |   |   |   |   |   |   |   |
+ *
+ * Final result: global_min = 0, global_max = 10 (stored in shared_min[0], shared_max[0])
+ *
+ * PER-TENSOR QUANTIZATION:
+ * - Single workgroup processes entire tensor with strided access
+ * - Each thread processes elements [thread_id, thread_id + 64, thread_id + 128, ...]
+ * - Tree reduction combines all thread results into global min/max
+ * - Output: Single scale and zero_point values
+ *
+ * PER-TOKEN QUANTIZATION:
+ * - Multiple workgroups, each processing one token
+ * - Token = all elements except last dimension (e.g., for [B,S,H]: B*S tokens of H elements)
+ * - Each workgroup finds min/max within its assigned token
+ * - Output: Array of scale and zero_point values (one per token)
+ */
+
+#ifdef per_tensor
+
+// Computes a single (scale, zero_point) pair for the whole input buffer.
+//
+// Every invocation scans a strided slice of t_in, ignoring NaN/Inf values, and
+// the per-invocation extrema are combined through shared memory. Only
+// workgroup 0 takes part: a shared-memory reduction cannot combine results
+// across workgroups, so letting other workgroups run would leave t_scale[0] /
+// t_zero_point[0] holding whichever partial result happened to be written
+// last. With the documented {1, 1, 1} dispatch nothing changes.
+//
+// Requires gl_WorkGroupSize.x <= NWORKERS (size of the shared arrays); the
+// halving reduction additionally expects it to be a power of two.
+void choose_qparams_per_tensor() {
+  // Uniform per workgroup, so no invocation is left waiting at a barrier.
+  if (gl_WorkGroupID.x != 0) {
+    return;
+  }
+
+  uint local_id = gl_LocalInvocationID.x;
+  uint group_size = gl_WorkGroupSize.x;
+  uint total_elements =
+      uint(t_in_sizes.x * t_in_sizes.y * t_in_sizes.z * t_in_sizes.w);
+
+  // +/-infinity are the identities for min/max and mark "no finite value seen".
+  float thread_min = 1.0/0.0;
+  float thread_max = -1.0/0.0;
+
+  for (uint i = local_id; i < total_elements; i += group_size) {
+    float val = t_in[i];
+    if (!isnan(val) && !isinf(val)) {
+      thread_min = min(thread_min, val);
+      thread_max = max(thread_max, val);
+    }
+  }
+
+  // Publish this invocation's partial result for the reduction.
+  shared_min[local_id] = thread_min;
+  shared_max[local_id] = thread_max;
+  barrier();
+
+  // Halving tree reduction: slot i absorbs slot i + stride.
+  for (uint stride = group_size / 2; stride > 0; stride >>= 1) {
+    if (local_id < stride) {
+      shared_min[local_id] = min(shared_min[local_id], shared_min[local_id + stride]);
+      shared_max[local_id] = max(shared_max[local_id], shared_max[local_id + stride]);
+    }
+    barrier();
+  }
+
+  if (local_id == 0) {
+    float scale_val;
+    int zero_point_val;
+    // An all-invalid or empty input leaves (+inf, -inf) here; the helper
+    // clamps the range to include zero first, so that case stays finite.
+    calculate_scale_and_zero_point(
+        shared_min[0], shared_max[0], quant_min, quant_max, scale_val, zero_point_val);
+
+    t_scale[0] = scale_val;
+    t_zero_point[0] = zero_point_val;
+  }
+}
+
+#else
+
+// Computes one (scale, zero_point) pair per token of the input buffer.
+//
+// The buffer is treated as num_tokens contiguous runs of equal length. Tokens
+// are split into contiguous chunks, one chunk per workgroup; for each token
+// the workgroup's invocations take strided slices, skip NaN/Inf values, and
+// combine their extrema through shared memory before invocation 0 writes the
+// result at index token_id of t_scale / t_zero_point.
+//
+// Requires gl_WorkGroupSize.x <= NWORKERS (size of the shared arrays); the
+// halving reduction additionally expects it to be a power of two.
+void choose_qparams_per_token() {
+  uint lane = gl_LocalInvocationID.x;
+  uint lane_count = gl_WorkGroupSize.x;
+  uint token_count = uint(num_tokens);
+
+  // Integer division: assumes the element count is a multiple of num_tokens
+  // - TODO confirm with the caller.
+  uint element_count =
+      uint(t_in_sizes.x * t_in_sizes.y * t_in_sizes.z * t_in_sizes.w);
+  uint token_size = element_count / token_count;
+
+  // Ceil-divide so every token is owned by some workgroup even when there are
+  // more tokens than workgroups.
+  uint chunk =
+      (token_count + gl_NumWorkGroups.x - 1) / gl_NumWorkGroups.x;
+  uint first_token = gl_WorkGroupID.x * chunk;
+  uint last_token = min(first_token + chunk, token_count);
+
+  // Surplus workgroups own nothing. The test is uniform across a workgroup,
+  // so leaving here cannot strand other invocations at a barrier.
+  if (first_token >= token_count) {
+    return;
+  }
+
+  for (uint token_id = first_token; token_id < last_token; ++token_id) {
+    // Half-open element range [range_begin, range_end) of this token.
+    uint range_begin = token_id * token_size;
+    uint range_end = range_begin + token_size;
+
+    // +inf / -inf double as min/max identities and as "nothing finite seen".
+    float lo = 1.0/0.0;
+    float hi = -1.0/0.0;
+
+    // Strided scan: range_begin + lane, then + lane_count, ...
+    for (uint i = range_begin + lane; i < range_end; i += lane_count) {
+      float val = t_in[i];
+      // Non-finite inputs must not influence the range.
+      if (isnan(val) || isinf(val)) {
+        continue;
+      }
+      lo = min(lo, val);
+      hi = max(hi, val);
+    }
+
+    // Publish per-lane partials for the workgroup-wide reduction.
+    shared_min[lane] = lo;
+    shared_max[lane] = hi;
+    barrier();
+
+    // Halving tree reduction: slot i absorbs slot i + stride. A slot still
+    // at +/-inf never wins against a finite one.
+    for (uint stride = lane_count / 2; stride > 0; stride >>= 1) {
+      if (lane < stride) {
+        float other_min = shared_min[lane + stride];
+        float other_max = shared_max[lane + stride];
+        shared_min[lane] = min(shared_min[lane], other_min);
+        shared_max[lane] = max(shared_max[lane], other_max);
+      }
+      barrier();
+    }
+
+    // Slot 0 now holds the token-wide extrema.
+    if (lane == 0) {
+      float scale_val;
+      int zero_point_val;
+      calculate_scale_and_zero_point(
+          shared_min[0], shared_max[0], quant_min, quant_max, scale_val, zero_point_val);
+
+      t_scale[token_id] = scale_val;
+      t_zero_point[token_id] = zero_point_val;
+    }
+
+    // Shared slots are reused by the next token; wait until slot 0 is read.
+    barrier();
+  }
+}
+
+#endif
+
+void main() {
+  choose_qparams_${MODE}(); // MODE is substituted by shader codegen; the yaml variants use per_tensor and per_token
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_buffer.yaml
new file mode 100644
index 00000000000..c37039f68e9
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_buffer.yaml
@@ -0,0 +1,12 @@
+choose_qparams_buffer:
+  parameter_names_with_default_values:
+    IN_DTYPE: float
+    MODE: per_tensor  # the shader turns MODE into a macro and branches on "#ifdef per_tensor"
+  generate_variant_forall:
+    IN_DTYPE:
+      - VALUE: float  # float is the only input dtype listed
+  shader_variants:
+    - NAME: choose_qparams_tensor_buffer
+      MODE: per_tensor  # main() calls choose_qparams_per_tensor()
+    - NAME: choose_qparams_per_token_asymmetric_buffer
+      MODE: per_token  # main() calls choose_qparams_per_token()
diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_texture.glsl
new file mode 100644
index 00000000000..282f1de170a
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_texture.glsl
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define IN_T ${buffer_scalar_type(IN_DTYPE)}
+#define FVEC4_T ${texel_load_type(IN_DTYPE, "texture3d")}
+
+#define ${MODE}
+
+${define_active_storage_type("texture3d")}
+${define_required_extensions(IN_DTYPE)}
+
+#extension GL_EXT_control_flow_attributes : require
+
+layout(std430) buffer;
+
+${layout_declare_tensor(B, "w", "t_scale", "float", "texture3d")}
+${layout_declare_tensor(B, "w", "t_zero_point", "int", "texture3d")}
+${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "texture3d")}
+
+$if MODE == "per_tensor":
+  layout(push_constant) uniform restrict Block {
+    int quant_min;
+    int quant_max;
+  };
+$else:
+  layout(push_constant) uniform restrict Block {
+    int num_tokens;
+    int quant_min;
+    int quant_max;
+  };
+
+${layout_declare_ubo(B, "ivec3", "t_in_limits")}
+${layout_declare_ubo(B, "ivec3", "t_scale_limits")}
+${layout_declare_ubo(B, "ivec3", "t_zero_point_limits")}
+
+#include "indexing_utils.h"
+#include "choose_qparams.glslh"
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+#define NWORKERS 64
+
+// Shared memory for reduction, indexed by gl_LocalInvocationID.x - the local work group size must not exceed NWORKERS
+shared float shared_min[NWORKERS];
+shared float shared_max[NWORKERS];
+
+/*
+ * QUANTIZATION PARAMETER COMPUTATION SHADER (TEXTURE STORAGE)
+ *
+ * This shader computes quantization parameters (scale and zero_point) for converting
+ * floating-point tensors to n-bit integer representations while preserving the
+ * original data range as much as possible.
+ *
+ * ALGORITHM:
+ * 1. Find global min/max values across tensor elements using parallel reduction
+ * 2. Use tree reduction with shared memory for efficient min/max computation
+ * 3. Calculate scale = (max - min) / (quant_max - quant_min)
+ * 4. Calculate zero_point to map floating-point zero to integer value
+ *
+ * WORKGROUP CONFIGURATION:
+ * - Per-Tensor Mode:
+ *   - Global WG Size: Default (typically {num_elements, 1, 1})
+ *   - Local WG Size: Default (typically {64, 1, 1})
+ * - Per-Token Mode:
+ *   - Global WG Size: Default (typically based on tensor dimensions)
+ *   - Local WG Size: Default (typically {64, 1, 1}, or based on global WG size)
+ *
+ * SUPPORTED CONFIGURATIONS:
+ * - Texture Storage: Uses 3D texture indexing with linear texel iteration
+ * - Assumes width-packed layout (packed_dim = 0) in current implementation
+ * - Handles texel padding for non-multiple-of-4 tensor dimensions
+ * - Note: Axis mapping support depends on indexing utilities
+ *
+ * TREE REDUCTION VISUALIZATION FOR MIN/MAX FINDING:
+ * For 8 threads processing elements [10, 1, 8, 1, 0, 2, 3, 5]:
+ *
+ * Initial shared_min/shared_max arrays populated by each thread:
+ * shared_min:  | 10 | 1 | 8 | 1 | 0 | 2 | 3 | 5 |
+ * shared_max:  | 10 | 1 | 8 | 1 | 0 | 2 | 3 | 5 |
+ * Thread:      |  0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ *
+ * Stride 4 (thread i < 4 combines slot i with slot i + 4):
+ * shared_min:  |  0 | 1 | 3 | 1 |   |   |   |   |  (min(10,0), min(1,2), min(8,3), min(1,5))
+ * shared_max:  | 10 | 2 | 8 | 5 |   |   |   |   |  (max(10,0), max(1,2), max(8,3), max(1,5))
+ * Active:      |  0 | 1 | 2 | 3 |   |   |   |   |
+ *
+ * Stride 2 (thread i < 2 combines slot i with slot i + 2):
+ * shared_min:  |  0 | 1 |   |   |   |   |   |   |  (min(0,3), min(1,1))
+ * shared_max:  | 10 | 5 |   |   |   |   |   |   |  (max(10,8), max(2,5))
+ * Active:      |  0 | 1 |   |   |   |   |   |   |
+ *
+ * Stride 1 (final comparison, thread 0 combines slot 0 with slot 1):
+ * shared_min:  |  0 |   |   |   |   |   |   |   |  (min(0,1) = 0)
+ * shared_max:  | 10 |   |   |   |   |   |   |   |  (max(10,5) = 10)
+ * Active:      |  0 |   |   |   |   |   |   |   |
+ *
+ * Final result: global_min = 0, global_max = 10 (stored in shared_min[0], shared_max[0])
+ *
+ * PER-TENSOR QUANTIZATION:
+ * - Single workgroup processes entire tensor
+ * - Each thread processes multiple texels with stride
+ * - Thread 0: texels [0, 64, 128, ...] -> elements [0-3, 256-259, 512-515, ...]
+ * - Thread 1: texels [1, 65, 129, ...] -> elements [4-7, 260-263, 516-519, ...]
+ * - Tree reduction combines all thread results into global min/max
+ * - Output: Single scale and zero_point values
+ *
+ * PER-TOKEN QUANTIZATION:
+ * - Multiple workgroups, each processing subset of tokens
+ * - Token = all elements except last dimension (e.g., for [B,S,H]: B*S tokens of H elements)
+ * - Each workgroup processes multiple tokens if num_tokens > num_workgroups
+ * - Within each token, threads process texels containing token elements
+ * - Output: Array of scale and zero_point values (one per token)
+ */
+
+#ifdef per_tensor
+
+// Computes a single (scale, zero_point) pair for the whole input texture.
+//
+// Every invocation visits a strided subset of texels (linear index unflattened
+// into x/y/z using t_in_limits), folds the finite components into running
+// extrema, and the workgroup combines them through shared memory.
+//
+// Only workgroup 0 takes part and it covers all texels by itself. A
+// shared-memory reduction cannot merge results from different workgroups, so
+// striding by the global invocation count would let workgroup 0 publish
+// extrema for just its own share of the texture whenever more than one
+// workgroup is dispatched. For a single-workgroup dispatch nothing changes.
+//
+// Output: the first component of texel (0, 0, 0) of t_scale / t_zero_point.
+//
+// Requires gl_WorkGroupSize.x <= NWORKERS (size of the shared arrays); the
+// halving reduction additionally expects it to be a power of two.
+void choose_qparams_per_tensor() {
+  // Uniform per workgroup, so nobody is left waiting at a barrier.
+  if (gl_WorkGroupID.x != 0) {
+    return;
+  }
+
+  uint local_id = gl_LocalInvocationID.x;
+  uint group_size = gl_WorkGroupSize.x;
+
+  // Texel counts used to unflatten a linear texel index below.
+  uint texels_per_slice = uint(t_in_limits.x * t_in_limits.y);
+  uint total_texels = uint(t_in_limits.x * t_in_limits.y * t_in_limits.z);
+
+  // +/-infinity are the identities for min/max and mark "no finite value seen".
+  float thread_min = 1.0/0.0;  // +infinity
+  float thread_max = -1.0/0.0; // -infinity
+
+  // Strided walk over every texel: local_id, local_id + group_size, ...
+  for (uint texel_idx = local_id;
+       texel_idx < total_texels;
+       texel_idx += group_size) {
+    // Convert linear texel index to 3D coordinates
+    uint z = texel_idx / texels_per_slice;
+    uint remainder = texel_idx % texels_per_slice;
+    uint y = remainder / uint(t_in_limits.x);
+    uint x = remainder % uint(t_in_limits.x);
+    ivec3 texel_pos = ivec3(int(x), int(y), int(z));
+
+    FVEC4_T texel_data = load_texel(t_in, texel_pos);
+
+    // For texture storage, we assume width-packed (packed_dim = 0)
+    // Calculate number of valid elements in this texel (handle padding)
+    int packed_dim = 0; // Width dimension is packed
+    ivec4 sizes = ivec4(t_in_limits, 1); // Convert limits to sizes format
+    ivec4 tensor_coord = to_tensor_idx(texel_pos, sizes, packed_dim);
+
+    // NOTE(review): total_elements is derived from the texel limits, so it
+    // already counts padding components - confirm that valid_elements can
+    // ever come out below 4 with this formula.
+    int total_elements = t_in_limits.x * t_in_limits.y * t_in_limits.z * 4;
+    int linear_tensor_idx = tensor_coord.x + tensor_coord.y * sizes.x +
+                            tensor_coord.z * sizes.x * sizes.y;
+    int remaining_elements = total_elements - (linear_tensor_idx);
+    int valid_elements = min(4, remaining_elements);
+
+    // Fold each counted component into the running extrema. NaN/Inf values
+    // are skipped so they cannot influence the quantization range.
+    if (valid_elements >= 1 && !isnan(texel_data.x) && !isinf(texel_data.x)) {
+      thread_min = min(thread_min, texel_data.x);
+      thread_max = max(thread_max, texel_data.x);
+    }
+
+    if (valid_elements >= 2 && !isnan(texel_data.y) && !isinf(texel_data.y)) {
+      thread_min = min(thread_min, texel_data.y);
+      thread_max = max(thread_max, texel_data.y);
+    }
+
+    if (valid_elements >= 3 && !isnan(texel_data.z) && !isinf(texel_data.z)) {
+      thread_min = min(thread_min, texel_data.z);
+      thread_max = max(thread_max, texel_data.z);
+    }
+
+    if (valid_elements >= 4 && !isnan(texel_data.w) && !isinf(texel_data.w)) {
+      thread_min = min(thread_min, texel_data.w);
+      thread_max = max(thread_max, texel_data.w);
+    }
+  }
+
+  // Intra-workgroup reduction using shared memory
+  shared_min[local_id] = thread_min;
+  shared_max[local_id] = thread_max;
+  barrier();
+
+  // Halving tree reduction: slot i absorbs slot i + stride. A slot that is
+  // still +/-infinity never wins against a finite one.
+  for (uint stride = group_size / 2; stride > 0; stride >>= 1) {
+    if (local_id < stride) {
+      float other_min = shared_min[local_id + stride];
+      float other_max = shared_max[local_id + stride];
+
+      shared_min[local_id] = min(shared_min[local_id], other_min);
+      shared_max[local_id] = max(shared_max[local_id], other_max);
+    }
+    barrier();
+  }
+
+  // Slot 0 now holds the tensor-wide extrema; one invocation publishes them.
+  if (local_id == 0) {
+    float global_min = shared_min[0];
+    float global_max = shared_max[0];
+
+    float scale_val;
+    int zero_point_val;
+    // An all-invalid input leaves (+inf, -inf); the helper widens to include 0.
+    calculate_scale_and_zero_point(
+        global_min, global_max, quant_min, quant_max, scale_val, zero_point_val);
+
+    write_texel(t_scale, ivec3(0, 0, 0), vec4(scale_val, 0.0, 0.0, 0.0));
+    write_texel(t_zero_point, ivec3(0, 0, 0), ivec4(zero_point_val, 0, 0, 0));
+  }
+}
+
+#else
+
+void choose_qparams_per_token() {
+  // Computes one (scale, zero_point) per token; each workgroup handles a contiguous chunk of tokens, one token at a time
+  uint local_id = gl_LocalInvocationID.x;
+  uint group_id = gl_WorkGroupID.x;
+  uint total_workgroups = gl_NumWorkGroups.x;
+
+  uint total_texels = uint(t_in_limits.x * t_in_limits.y * t_in_limits.z);
+
+  // Texels per token via integer division; assumes every token spans the same whole number of texels - TODO confirm
+  // Token t then covers linear texel indices [t * texels_per_token, (t + 1) * texels_per_token)
+  uint texels_per_token = total_texels / uint(num_tokens);
+
+  // Ceil-divide tokens over workgroups so every token is owned by exactly one workgroup
+  uint tokens_per_workgroup = (uint(num_tokens) + total_workgroups - 1) / total_workgroups;
+
+  // Calculate which tokens this workgroup is responsible for (half-open range [start_token, end_token))
+  uint start_token = group_id * tokens_per_workgroup;
+  uint end_token = min(start_token + tokens_per_workgroup, uint(num_tokens));
+
+  // Process each token assigned to this workgroup; an out-of-range start_token yields zero iterations
+  for (uint token_id = start_token; token_id < end_token; token_id++) {
+    // Calculate the texel range for this token
+    uint token_start_texel = token_id * texels_per_token;
+    uint token_end_texel = token_start_texel + texels_per_token;
+
+    // Each thread processes multiple texels within the token
+    float thread_min = 1.0/0.0;  // +infinity
+    float thread_max = -1.0/0.0; // -infinity
+    bool found_valid = false;
+
+    // Process texels within this token only; each invocation starts at its local id and strides by gl_WorkGroupSize.x
+    for (uint texel_idx = token_start_texel + local_id; texel_idx < token_end_texel; texel_idx += gl_WorkGroupSize.x) {
+      // Convert linear texel index to 3D coordinates
+      uint z = texel_idx / uint(t_in_limits.x * t_in_limits.y);
+      uint remainder = texel_idx % uint(t_in_limits.x * t_in_limits.y);
+      uint y = remainder / uint(t_in_limits.x);
+      uint x = remainder % uint(t_in_limits.x);
+      ivec3 texel_pos = ivec3(int(x), int(y), int(z));
+
+      FVEC4_T texel_data = load_texel(t_in, texel_pos);
+
+      // For texture storage, we assume width-packed (packed_dim = 0)
+      // Calculate number of valid elements in this texel (handle padding)
+      int packed_dim = 0; // Width dimension is packed
+      ivec4 sizes = ivec4(t_in_limits, 1); // Convert limits to sizes format
+      ivec4 tensor_coord = to_tensor_idx(texel_pos, sizes, packed_dim);
+
+      // NOTE(review): this total is computed from texel limits (padding included), not logical sizes - confirm valid_elements can be < 4
+      int total_elements = t_in_limits.x * t_in_limits.y * t_in_limits.z * 4;
+      int linear_tensor_idx = tensor_coord.x + tensor_coord.y * sizes.x +
+                              tensor_coord.z * sizes.x * sizes.y;
+      int remaining_elements = total_elements - (linear_tensor_idx);
+      int valid_elements = min(4, remaining_elements);
+
+      // Find min/max within this texel, considering only valid elements and skipping NaN/Inf components
+      if (valid_elements >= 1 && !isnan(texel_data.x) && !isinf(texel_data.x)) {
+        if (!found_valid) {
+          thread_min = texel_data.x;
+          thread_max = texel_data.x;
+          found_valid = true;
+        } else {
+          thread_min = min(thread_min, texel_data.x);
+          thread_max = max(thread_max, texel_data.x);
+        }
+      }
+
+      if (valid_elements >= 2 && !isnan(texel_data.y) && !isinf(texel_data.y)) {
+        if (!found_valid) {
+          thread_min = texel_data.y;
+          thread_max = texel_data.y;
+          found_valid = true;
+        } else {
+          thread_min = min(thread_min, texel_data.y);
+          thread_max = max(thread_max, texel_data.y);
+        }
+      }
+
+      if (valid_elements >= 3 && !isnan(texel_data.z) && !isinf(texel_data.z)) {
+        if (!found_valid) {
+          thread_min = texel_data.z;
+          thread_max = texel_data.z;
+          found_valid = true;
+        } else {
+          thread_min = min(thread_min, texel_data.z);
+          thread_max = max(thread_max, texel_data.z);
+        }
+      }
+
+      if (valid_elements >= 4 && !isnan(texel_data.w) && !isinf(texel_data.w)) {
+        if (!found_valid) {
+          thread_min = texel_data.w;
+          thread_max = texel_data.w;
+          found_valid = true;
+        } else {
+          thread_min = min(thread_min, texel_data.w);
+          thread_max = max(thread_max, texel_data.w);
+        }
+      }
+    }
+
+    // Intra-workgroup reduction using shared memory; the arrays hold NWORKERS slots, so gl_WorkGroupSize.x must not exceed NWORKERS
+    shared_min[local_id] = thread_min;
+    shared_max[local_id] = thread_max;
+    barrier();
+
+    // Tree reduction within work group: slot i absorbs slot i + stride while stride halves (expects a power-of-two group size)
+    for (uint stride = gl_WorkGroupSize.x / 2; stride > 0; stride >>= 1) {
+      if (local_id < stride) {
+        float other_min = shared_min[local_id + stride];
+        float other_max = shared_max[local_id + stride];
+
+        // +/-infinity marks a slot with no finite value; it never replaces a finite one
+        if (!isinf(other_min) && (isinf(shared_min[local_id]) || other_min < shared_min[local_id])) {
+          shared_min[local_id] = other_min;
+        }
+        if (!isinf(other_max) && (isinf(shared_max[local_id]) || other_max > shared_max[local_id])) {
+          shared_max[local_id] = other_max;
+        }
+      }
+      barrier();
+    }
+
+    // Final calculation for this token; shared_min[0] / shared_max[0] hold the token-wide extrema
+    if (local_id == 0) {
+      float token_min = shared_min[0];
+      float token_max = shared_max[0];
+
+      float scale_val;
+      int zero_point_val;
+      calculate_scale_and_zero_point(token_min, token_max, quant_min, quant_max, scale_val, zero_point_val);
+
+      // Convert token_id to 3D coordinates for output texture
+      // Unflattened with t_scale_limits; the same position is reused for t_zero_point (assumes equal limits - TODO confirm)
+      uint out_z = token_id / uint(t_scale_limits.x * t_scale_limits.y);
+      uint out_remainder = token_id % uint(t_scale_limits.x * t_scale_limits.y);
+      uint out_y = out_remainder / uint(t_scale_limits.x);
+      uint out_x = out_remainder % uint(t_scale_limits.x);
+      ivec3 out_pos = ivec3(int(out_x), int(out_y), int(out_z));
+
+      write_texel(t_scale, out_pos, vec4(scale_val, 0.0, 0.0, 0.0));
+      write_texel(t_zero_point, out_pos, ivec4(zero_point_val, 0, 0, 0));
+    }
+
+    // Synchronize before processing next token: the shared slots are rewritten by the next iteration
+    barrier();
+  }
+}
+
+#endif
+
+void main() {
+  choose_qparams_${MODE}(); // MODE is substituted by shader codegen; the yaml variants use per_tensor and per_token
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_texture.yaml
new file mode 100644
index 00000000000..f3961b87a0f
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_texture.yaml
@@ -0,0 +1,12 @@
+choose_qparams_texture:
+  parameter_names_with_default_values:
+    IN_DTYPE: float
+    MODE: per_tensor  # the shader turns MODE into a macro and branches on "#ifdef per_tensor"
+  generate_variant_forall:
+    IN_DTYPE:
+      - VALUE: float  # float is the only input dtype listed
+  shader_variants:
+    - NAME: choose_qparams_tensor_texture3d
+      MODE: per_tensor  # main() calls choose_qparams_per_tensor()
+    - NAME: choose_qparams_per_token_asymmetric_texture3d
+      MODE: per_token  # main() calls choose_qparams_per_token()
diff --git a/backends/vulkan/runtime/graph/ops/impl/ChooseQParams.cpp b/backends/vulkan/runtime/graph/ops/impl/ChooseQParams.cpp
new file mode 100644
index 00000000000..1dc2d34afbf
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/ChooseQParams.cpp
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/ScalarUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+
+namespace vkcompute {
+
+namespace {
+
+// Resize hook for per-tensor choose_qparams: outputs get an empty size list.
+void resize_choose_qparams_tensor_output(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  (void)extra_args;
+  const auto& outputs = args.at(0).refs; // expected: {scale, zero_point}
+  const ValueRef scale_ref = outputs.at(0);
+  const ValueRef zero_point_ref = outputs.at(1);
+  // One scale / zero point for the whole tensor, hence no dimensions.
+  graph->virtual_resize(scale_ref, {});
+  graph->virtual_resize(zero_point_ref, {});
+}
+
+// Resize hook for per-token choose_qparams (outputs: leading dims, then 1).
+void resize_choose_qparams_per_token_output(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  (void)extra_args;
+  const ValueRef scale_out = args.at(0).refs.at(0);
+  const ValueRef zero_point_out = args.at(0).refs.at(1);
+  const ValueRef input = args.at(1).refs.at(0);
+  // Copy every input dim except the last, then append a trailing 1.
+  const auto input_sizes = graph->sizes_of(input);
+  std::vector<int64_t> output_sizes;
+  output_sizes.reserve(input_sizes.size()); // leading dims + the trailing 1
+  for (size_t i = 0; i + 1 < input_sizes.size(); i++) { // no wrap at rank 0
+    output_sizes.push_back(input_sizes[i]);
+  }
+  output_sizes.push_back(1);
+
+  graph->virtual_resize(scale_out, output_sizes);
+  graph->virtual_resize(zero_point_out, output_sizes);
+}
+
+// Custom workgroup size pickers for ChooseQParams operations
+utils::uvec3 choose_qparams_pick_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  // Global size picker for the per-tensor op; only graph and args are used.
+  (void)shader;
+  (void)resize_args;
+
+  // The input is read from args[1]; the first output from args[0].
+  const ValueRef in_tensor = args.at(1).refs.at(0);
+  if (!graph->is_buffer_storage(in_tensor)) {
+    // Texture path: defer to the graph's default sizing for that output.
+    return graph->create_global_wg_size(args.at(0).refs.at(0));
+  }
+
+  // Buffer path: a 1x1x1 global size, so a single workgroup does the whole
+  // reduction; the buffer shader (not shown here) is described as striding
+  // over all elements with NWORKERS=64 threads.
+  return {1u, 1u, 1u};
+}
+
+utils::uvec3 choose_qparams_pick_local_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const utils::uvec3& global_workgroup_size,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  // Local size picker paired with choose_qparams_pick_global_wg_size.
+  (void)shader;
+  (void)resize_args;
+
+  const ValueRef in_tensor = args.at(1).refs.at(0);
+  if (!graph->is_buffer_storage(in_tensor)) {
+    // Texture path: let the graph derive a local size from the global one.
+    return graph->create_local_wg_size(global_workgroup_size);
+  }
+
+  // Buffer path: 64 invocations along X, matching the NWORKERS value the
+  // shader's shared-memory arrays are sized for (per the original note).
+  return {64u, 1u, 1u};
+}
+
+// Global size picker for per-token choose_qparams (input read from args[1]).
+utils::uvec3 choose_qparams_per_token_pick_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+  (void)resize_args;
+  const ValueRef input = args.at(1).refs.at(0);
+
+  if (graph->is_buffer_storage(input)) {
+    // Buffer path: the X extent is the token count, i.e. the product of all
+    // dimensions except the last one.
+    const auto input_sizes = graph->sizes_of(input);
+    int64_t num_tokens = 1;
+    // `i + 1 < size()` rather than `i < size() - 1`: size() is unsigned, so
+    // the subtraction would wrap around for a 0-dim input.
+    for (size_t i = 0; i + 1 < input_sizes.size(); i++) {
+      num_tokens *= input_sizes[i];
+    }
+    return {static_cast<uint32_t>(num_tokens), 1u, 1u};
+  } else {
+    // Texture path: use the graph's default sizing for the first output.
+    return graph->create_global_wg_size(args.at(0).refs.at(0));
+  }
+}
+
+utils::uvec3 choose_qparams_per_token_pick_local_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const utils::uvec3& global_workgroup_size,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  // Local size picker for the per-token op; shader/resize_args are unused.
+  (void)shader;
+  (void)resize_args;
+
+  const ValueRef in_tensor = args.at(1).refs.at(0);
+  if (graph->is_buffer_storage(in_tensor)) {
+    // Buffer path: fixed 64x1x1 local size (NWORKERS per the original note).
+    return {64u, 1u, 1u};
+  }
+
+  // Texture path: fall back to the graph's default local size.
+  return graph->create_local_wg_size(global_workgroup_size);
+}
+
+} // namespace
+
+// Appends the per-tensor choose_qparams dispatch node to the graph.
+void add_choose_qparams_tensor_node(
+    ComputeGraph& graph,
+    const ValueRef& input,
+    const ValueRef& quant_min,
+    const ValueRef& quant_max,
+    const ValueRef& scale_out,
+    const ValueRef& zero_point_out) {
+  std::string kernel_name("choose_qparams_tensor");
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(input));
+  add_dtype_suffix(kernel_name, graph.dtype_of(input));
+
+  int quant_min_val = static_cast<int>(graph.get_int(quant_min));
+  int quant_max_val = static_cast<int>(graph.get_int(quant_max));
+  // Buffer inputs bind sizes+strides per tensor; textures bind logical limits.
+  vkapi::ParamsBindList param_ubos;
+  if (graph.is_buffer_storage(input)) {
+    param_ubos = {
+        graph.sizes_ubo(input),
+        graph.strides_ubo(input),
+        graph.sizes_ubo(scale_out),
+        graph.strides_ubo(scale_out),
+        graph.sizes_ubo(zero_point_out),
+        graph.strides_ubo(zero_point_out)};
+  } else {
+    param_ubos = {
+        graph.logical_limits_ubo(input),
+        graph.logical_limits_ubo(scale_out),
+        graph.logical_limits_ubo(zero_point_out)};
+  }
+
+  std::vector<PushConstantDataInfo> push_constants;
+  push_constants = {
+      PushConstantDataInfo(&quant_min_val, sizeof(int)),
+      PushConstantDataInfo(&quant_max_val, sizeof(int)),
+  };
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      choose_qparams_pick_global_wg_size,
+      choose_qparams_pick_local_wg_size,
+      // Inputs and Outputs. Both outputs share one write group because the
+      // resize hook and the workgroup-size pickers read scale/zero_point from
+      // args[0].refs[0..1] and the input from args[1].refs[0].
+      {{{scale_out, zero_point_out}, vkapi::kWrite}, {input, vkapi::kRead}},
+      // Shader param buffers
+      param_ubos,
+      // Push Constants
+      push_constants,
+      // Specialization Constants
+      {},
+      // Resize Args
+      {},
+      // Resizing Logic
+      resize_choose_qparams_tensor_output));
+}
+
+// Appends the per-token asymmetric choose_qparams dispatch node to the graph.
+void add_choose_qparams_per_token_asymmetric_node(
+    ComputeGraph& graph,
+    const ValueRef& input,
+    const ValueRef& scale_out,
+    const ValueRef& zero_point_out) {
+  std::string kernel_name("choose_qparams_per_token_asymmetric");
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(input));
+  add_dtype_suffix(kernel_name, graph.dtype_of(input));
+
+  // Token count = product of all dims but the last; i + 1 avoids size_t wrap.
+  int64_t num_tokens = 1;
+  const auto input_sizes = graph.sizes_of(input);
+  for (size_t i = 0; i + 1 < input_sizes.size(); i++) {
+    num_tokens *= input_sizes[i];
+  }
+
+  int num_tokens_val = static_cast<int>(num_tokens);
+  int quant_min_val = -128; // Fixed for asymmetric quantization
+  int quant_max_val = 127; // Fixed for asymmetric quantization
+  // Buffer inputs bind sizes+strides per tensor; textures bind logical limits.
+  vkapi::ParamsBindList param_ubos;
+  if (graph.is_buffer_storage(input)) {
+    param_ubos = {
+        graph.sizes_ubo(input),
+        graph.strides_ubo(input),
+        graph.sizes_ubo(scale_out),
+        graph.strides_ubo(scale_out),
+        graph.sizes_ubo(zero_point_out),
+        graph.strides_ubo(zero_point_out)};
+  } else {
+    param_ubos = {
+        graph.logical_limits_ubo(input),
+        graph.logical_limits_ubo(scale_out),
+        graph.logical_limits_ubo(zero_point_out)};
+  }
+
+  std::vector<PushConstantDataInfo> push_constants;
+  push_constants = {
+      PushConstantDataInfo(&num_tokens_val, sizeof(int)),
+      PushConstantDataInfo(&quant_min_val, sizeof(int)),
+      PushConstantDataInfo(&quant_max_val, sizeof(int)),
+  };
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      choose_qparams_per_token_pick_global_wg_size,
+      choose_qparams_per_token_pick_local_wg_size,
+      // Inputs and Outputs. Both outputs share one write group because the
+      // resize hook and the workgroup-size pickers read scale/zero_point from
+      // args[0].refs[0..1] and the input from args[1].refs[0].
+      {{{scale_out, zero_point_out}, vkapi::kWrite}, {input, vkapi::kRead}},
+      // Shader param buffers
+      param_ubos,
+      // Push Constants
+      push_constants,
+      // Specialization Constants
+      {},
+      // Resize Args
+      {},
+      // Resizing Logic
+      resize_choose_qparams_per_token_output));
+}
+
+// Entry point for choose_qparams.tensor: validates args, then adds the node.
+void choose_qparams_tensor_impl(
+    ComputeGraph& graph,
+    const std::vector<ValueRef>& args) {
+  // Positional args: input, quant_min, quant_max, scale_out, zero_point_out.
+  const ValueRef input = args[0];
+  const ValueRef quant_min = args[1];
+  const ValueRef quant_max = args[2];
+  const ValueRef scale_out = args[3];
+  const ValueRef zero_point_out = args[4];
+
+  // The input and both outputs must be tensors.
+  VK_CHECK_COND(graph.val_is_tensor(input));
+  VK_CHECK_COND(graph.val_is_tensor(scale_out));
+  VK_CHECK_COND(graph.val_is_tensor(zero_point_out));
+
+  // Accepted input dtypes: float, half, double.
+  VK_CHECK_COND(
+      graph.dtype_of(input) == vkapi::kFloat ||
+      graph.dtype_of(input) == vkapi::kHalf ||
+      graph.dtype_of(input) == vkapi::kDouble);
+
+  // Accepted output dtypes: float/double for scale, int/long for zero point.
+  VK_CHECK_COND(
+      graph.dtype_of(scale_out) == vkapi::kFloat ||
+      graph.dtype_of(scale_out) == vkapi::kDouble);
+  VK_CHECK_COND(
+      graph.dtype_of(zero_point_out) == vkapi::kInt ||
+      graph.dtype_of(zero_point_out) == vkapi::kLong);
+
+  // A texture-backed input is only accepted when it is width packed.
+  if (!graph.is_buffer_storage(input)) {
+    VK_CHECK_COND(graph.packed_dim_of(input) == WHCN::kWidthDim);
+  }
+  add_choose_qparams_tensor_node(
+      graph, input, quant_min, quant_max, scale_out, zero_point_out);
+}
+
+// Entry point for choose_qparams_per_token_asymmetric.default.
+void choose_qparams_per_token_asymmetric_impl(
+    ComputeGraph& graph,
+    const std::vector<ValueRef>& args) {
+  // Positional args: input, scale_out, zero_point_out.
+  const ValueRef input = args[0];
+  const ValueRef scale_out = args[1];
+  const ValueRef zero_point_out = args[2];
+
+  // The input and both outputs must be tensors.
+  VK_CHECK_COND(graph.val_is_tensor(input));
+  VK_CHECK_COND(graph.val_is_tensor(scale_out));
+  VK_CHECK_COND(graph.val_is_tensor(zero_point_out));
+
+  // Accepted input dtypes: float, half, double.
+  VK_CHECK_COND(
+      graph.dtype_of(input) == vkapi::kFloat ||
+      graph.dtype_of(input) == vkapi::kHalf ||
+      graph.dtype_of(input) == vkapi::kDouble);
+
+  // Accepted output dtypes: float/double for scale, int/long for zero point.
+  VK_CHECK_COND(
+      graph.dtype_of(scale_out) == vkapi::kFloat ||
+      graph.dtype_of(scale_out) == vkapi::kDouble);
+  VK_CHECK_COND(
+      graph.dtype_of(zero_point_out) == vkapi::kInt ||
+      graph.dtype_of(zero_point_out) == vkapi::kLong);
+  add_choose_qparams_per_token_asymmetric_node(
+      graph, input, scale_out, zero_point_out);
+}
+
+REGISTER_OPERATORS { // Binds op names to their impl functions.
+  VK_REGISTER_OP(choose_qparams.tensor, choose_qparams_tensor_impl);
+  VK_REGISTER_OP(
+      choose_qparams_per_token_asymmetric.default,
+      choose_qparams_per_token_asymmetric_impl);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/op_tests/choose_qparams_test.cpp b/backends/vulkan/test/op_tests/choose_qparams_test.cpp
index 24c856e9d46..55e96151387 100644
--- a/backends/vulkan/test/op_tests/choose_qparams_test.cpp
+++ b/backends/vulkan/test/op_tests/choose_qparams_test.cpp
@@ -516,6 +516,58 @@ TEST(VulkanChooseQparamsTest, test_reference_choose_qparams_tensor_int8) {
       at::kChar);
 }
 
+TEST(VulkanChooseQparamsTest, test_vulkan_choose_qparams_tensor_uint8_4D) {
+  if (!vkcompute::api::context()
+           ->adapter_ptr()
+           ->has_full_int8_buffers_support()) {
+    GTEST_SKIP(); // adapter reports no full int8 buffer support
+  }
+  test_vulkan_choose_qparams_tensor(
+      {5, 3, 2, 4}, // input sizes (4-D)
+      0, // quant_min (uint8 lower bound)
+      255, // quant_max (uint8 upper bound)
+      at::kByte);
+}
+
+TEST(VulkanChooseQparamsTest, test_vulkan_choose_qparams_tensor_int8_2D) {
+  if (!vkcompute::api::context()
+           ->adapter_ptr()
+           ->has_full_int8_buffers_support()) {
+    GTEST_SKIP(); // adapter reports no full int8 buffer support
+  }
+  test_vulkan_choose_qparams_tensor(
+      {5, 5}, // input sizes (2-D)
+      -128, // quant_min (int8 lower bound)
+      127, // quant_max (int8 upper bound)
+      at::kChar);
+}
+
+TEST(VulkanChooseQparamsTest, test_vulkan_choose_qparams_tensor_int8_3D) {
+  if (!vkcompute::api::context()
+           ->adapter_ptr()
+           ->has_full_int8_buffers_support()) {
+    GTEST_SKIP(); // adapter reports no full int8 buffer support
+  }
+  test_vulkan_choose_qparams_tensor(
+      {12, 8, 2}, // input sizes (3-D)
+      -128, // quant_min (int8 lower bound)
+      127, // quant_max (int8 upper bound)
+      at::kChar);
+}
+
+TEST(VulkanChooseQparamsTest, test_vulkan_choose_qparams_tensor_int8_4D) {
+  if (!vkcompute::api::context()
+           ->adapter_ptr()
+           ->has_full_int8_buffers_support()) {
+    GTEST_SKIP(); // adapter reports no full int8 buffer support
+  }
+  test_vulkan_choose_qparams_tensor(
+      {10, 10, 6, 4}, // input sizes (4-D)
+      -128, // quant_min (int8 lower bound)
+      127, // quant_max (int8 upper bound)
+      at::kChar);
+}
+
 void test_reference_choose_qparams_per_token_asymmetric(
     const std::vector<int>& input_sizes,
     at::ScalarType dtype) {
@@ -673,3 +725,47 @@ TEST(
       {2, 3, 4}, // input sizes (2*3=6 tokens)
       at::kChar);
 }
+
+TEST(
+    VulkanChooseQparamsTest,
+    test_vulkan_choose_qparams_per_token_asymmetric_int8_1D) {
+  if (!vkcompute::api::context()
+           ->adapter_ptr()
+           ->has_full_int8_buffers_support()) {
+    GTEST_SKIP(); // adapter reports no full int8 buffer support
+  }
+  test_vulkan_choose_qparams_per_token_asymmetric({7}, at::kChar); // 1-D
+}
+
+TEST(
+    VulkanChooseQparamsTest,
+    test_vulkan_choose_qparams_per_token_asymmetric_int8_2D) {
+  if (!vkcompute::api::context()
+           ->adapter_ptr()
+           ->has_full_int8_buffers_support()) {
+    GTEST_SKIP(); // adapter reports no full int8 buffer support
+  }
+  test_vulkan_choose_qparams_per_token_asymmetric({2, 2}, at::kChar); // 2-D
+}
+
+TEST(
+    VulkanChooseQparamsTest,
+    test_vulkan_choose_qparams_per_token_asymmetric_int8_3D) {
+  if (!vkcompute::api::context()
+           ->adapter_ptr()
+           ->has_full_int8_buffers_support()) {
+    GTEST_SKIP(); // adapter reports no full int8 buffer support
+  }
+  test_vulkan_choose_qparams_per_token_asymmetric({3, 6, 4}, at::kChar); // 3-D
+}
+
+TEST(
+    VulkanChooseQparamsTest,
+    test_vulkan_choose_qparams_per_token_asymmetric_int8_4D) {
+  if (!vkcompute::api::context()
+           ->adapter_ptr()
+           ->has_full_int8_buffers_support()) {
+    GTEST_SKIP(); // adapter reports no full int8 buffer support
+  }
+  test_vulkan_choose_qparams_per_token_asymmetric({128, 2, 16, 3}, at::kChar);
+}