
Commit 8b67c74

Author: morelos (committed)
Update on "[ET-VK] linear_qta8a_qga4w graph pass"
# Changes

* Introduce `linear_qta8a_qga4w` custom operator in `custom_ops_lib.py` to handle dynamic activation + grouped weight quantized linear operations
* Add pattern matching and fusion logic in `FuseQuantizedOpsTransform` to detect and replace dequant + dequant + linear sequences with the new fused operator
* Implement comprehensive test coverage in `test_vulkan_passes.py` for the QTA8A_QGA4W fusion pattern validation
* Add 4-bit weight packing utilities and grouped quantization support for efficient memory usage (see the packing sketch below)

# Motivation

The existing quantization workflow in the Vulkan backend processes dynamic activation + grouped weight quantized linear operations as separate quantize/dequantize/linear steps, which creates performance overhead through:

* Multiple kernel dispatches instead of a single fused operation
* Intermediate tensor allocations for dequantized weights and activations
* Suboptimal memory bandwidth utilization

The new `linear_qta8a_qga4w` operator fuses the entire sequence into a single operation that:

* Directly processes 8-bit quantized activations with per-token scales/zero-points
* Handles 4-bit grouped quantized weights with configurable group sizes
* Eliminates intermediate dequantization steps by performing dequantization inline
* Reduces memory footprint through packed 4-bit weight storage

This aligns with the broader goal of optimizing quantized model inference in the Vulkan backend by leveraging graph-level transformations to improve computational efficiency while maintaining numerical accuracy.

Differential Revision: [D78291269](https://our.internmc.facebook.com/intern/diff/D78291269/)

[ghstack-poisoned]
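The last bullet above mentions 4-bit weight packing utilities, but the packing helper itself is not shown in this diff. The following is only a minimal sketch of one common convention, assuming two signed 4-bit values per byte with an offset into [0, 15] and the even column stored in the low nibble; the function name `pack_int4_weights` and the layout are assumptions for illustration, not the helper added by this change.

```python
import torch


def pack_int4_weights(weight_4bit: torch.Tensor) -> torch.Tensor:
    """Hypothetical packing: two 4-bit values (each in [-8, 7]) per uint8 byte."""
    assert weight_4bit.shape[-1] % 2 == 0, "need an even number of columns to pack pairs"
    # Shift the signed range [-8, 7] into the unsigned nibble range [0, 15].
    w = (weight_4bit + 8).to(torch.uint8)
    low = w[..., 0::2]   # even columns -> low nibble
    high = w[..., 1::2]  # odd columns  -> high nibble
    return (high << 4) | low


# Example: a [4, 8] 4-bit weight tensor packs into a [4, 4] uint8 tensor.
w = torch.randint(-8, 8, (4, 8), dtype=torch.int8)
assert pack_int4_weights(w).shape == (4, 4)
```

Halving the last dimension in this way is what provides the memory-footprint reduction the motivation section refers to.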
2 parents 1b7f43b + c5e7b1a commit 8b67c74

File tree

2 files changed: +97 -58 lines changed

backends/vulkan/_passes/fuse_quantized_ops.py

Lines changed: 97 additions & 57 deletions
@@ -215,57 +215,20 @@ def fuse_into_linear_qcnw_node(
 #########################


-def matches_linear_qta8a_qga4w_pattern(
-    program: ExportedProgram, node: torch.fx.Node
-) -> Optional[Tuple[int, int]]:
-    """
-    Checks if the nodes surrounding a linear node matches the pattern for dynamic
-    activation + grouped weight quantized linear (QTA8A_QGA4W).
-
-    This pattern involves:
-    1. Dynamic quantization of input activations (8-bit)
-    2. Grouped quantization of weights (4-bit with group size)
-
-    The expected pattern from Int8DynActInt4WeightQuantizer is:
-        scale, zero_point = choose_qparams_affine(input)
-        quantized_input = quantize_affine(input, scale, zero_point)
-        dequantized_input = dequantize_affine(quantized_input, ...)
-        dequantized_weight = dequantize_affine(weight, weight_scales, weight_zeros)
-        output = linear(dequantized_input, dequantized_weight)
-
-    If the pattern matches, return (group_size, weight_bits), otherwise None.
-    """
-    if not utils.is_linear_node(node):
-        return None
-
-    input_node = node.args[0]
-    weight_node = node.args[1]
-
-    # Type checking - ensure we have torch.fx.Node objects
-    if not isinstance(weight_node, torch.fx.Node):
-        return None
-    if not isinstance(input_node, torch.fx.Node):
-        return None
-
-    # Check if input is dequantized with dequantize_affine (from dynamic quantization)
-    if not (
-        input_node.op == "call_function"
-        and input_node.target is not None
-        and hasattr(input_node.target, "__name__")
-        and "dequantize_affine" in getattr(input_node.target, "__name__", "")
-    ):
-        return None
+def _is_dequantize_affine_node(node: torch.fx.Node) -> bool:
+    """Check if a node is a dequantize_affine function call."""
+    return (
+        node.op == "call_function"
+        and node.target is not None
+        and hasattr(node.target, "__name__")
+        and "dequantize_affine" in getattr(node.target, "__name__", "")
+    )

-    # Check if weight is dequantized with dequantize_affine
-    if not (
-        weight_node.op == "call_function"
-        and weight_node.target is not None
-        and hasattr(weight_node.target, "__name__")
-        and "dequantize_affine" in getattr(weight_node.target, "__name__", "")
-    ):
-        return None

-    # Get the original quantized weight and quantization parameters
+def _validate_qta8a_qga4w_nodes(
+    program: ExportedProgram, weight_node: torch.fx.Node
+) -> Optional[Tuple[torch.fx.Node, torch.fx.Node, torch.fx.Node]]:
+    """Validate and extract weight quantization nodes for QTA8A_QGA4W pattern."""
     if len(weight_node.args) < 4:
         return None

@@ -287,7 +250,16 @@ def matches_linear_qta8a_qga4w_pattern(
     if not is_param_node(program, weight_zeros):
         return None

-    # Get tensors to analyze the quantization scheme
+    return orig_weight, weight_scales, weight_zeros
+
+
+def _validate_qta8a_qga4w_tensors(
+    program: ExportedProgram,
+    orig_weight: torch.fx.Node,
+    weight_scales: torch.fx.Node,
+    weight_zeros: torch.fx.Node,
+) -> Optional[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
+    """Validate and extract weight tensors for QTA8A_QGA4W pattern."""
     orig_weight_tensor = get_param_tensor(program, orig_weight)
     weight_scales_tensor = get_param_tensor(program, weight_scales)
     weight_zeros_tensor = get_param_tensor(program, weight_zeros)
@@ -299,20 +271,24 @@ def matches_linear_qta8a_qga4w_pattern(
     if not isinstance(weight_zeros_tensor, torch.Tensor):
         return None

-    # Check if weight is quantized to 4 bits (values should be in [-8, 7] range)
+    return orig_weight_tensor, weight_scales_tensor, weight_zeros_tensor
+
+
+def _validate_4bit_quantization(orig_weight_tensor: torch.Tensor) -> bool:
+    """Check if weight tensor is quantized to 4 bits."""
     quant_min = orig_weight_tensor.min().item()
     quant_max = orig_weight_tensor.max().item()
+    return quant_min >= -8 and quant_max <= 7

-    if not (quant_min >= -8 and quant_max <= 7):
-        return None
-
-    # Determine group size from the scales tensor shape
-    # For grouped quantization, scales shape should be [out_features, in_features // group_size]
-    out_features, in_features = orig_weight_tensor.shape

+def _calculate_group_size(
+    orig_weight_tensor: torch.Tensor, weight_scales_tensor: torch.Tensor
+) -> Optional[int]:
+    """Calculate and validate group size from tensor shapes."""
     if len(weight_scales_tensor.shape) != 2:
         return None

+    out_features, in_features = orig_weight_tensor.shape
     scales_out_features, num_groups = weight_scales_tensor.shape

     if scales_out_features != out_features:
@@ -322,6 +298,70 @@ def matches_linear_qta8a_qga4w_pattern(
     if in_features % group_size != 0:
         return None

+    return group_size
+
+
+def matches_linear_qta8a_qga4w_pattern(
+    program: ExportedProgram, node: torch.fx.Node
+) -> Optional[Tuple[int, int]]:
+    """
+    Checks if the nodes surrounding a linear node matches the pattern for dynamic
+    activation + grouped weight quantized linear (QTA8A_QGA4W).
+
+    This pattern involves:
+    1. Dynamic quantization of input activations (8-bit)
+    2. Grouped quantization of weights (4-bit with group size)
+
+    The expected pattern from Int8DynActInt4WeightQuantizer is:
+        scale, zero_point = choose_qparams_affine(input)
+        quantized_input = quantize_affine(input, scale, zero_point)
+        dequantized_input = dequantize_affine(quantized_input, ...)
+        dequantized_weight = dequantize_affine(weight, weight_scales, weight_zeros)
+        output = linear(dequantized_input, dequantized_weight)
+
+    If the pattern matches, return (group_size, weight_bits), otherwise None.
+    """
+    if not utils.is_linear_node(node):
+        return None
+
+    input_node = node.args[0]
+    weight_node = node.args[1]
+
+    # Type checking - ensure we have torch.fx.Node objects
+    if not isinstance(weight_node, torch.fx.Node):
+        return None
+    if not isinstance(input_node, torch.fx.Node):
+        return None
+
+    # Check if input and weight are dequantized with dequantize_affine
+    if not _is_dequantize_affine_node(input_node):
+        return None
+    if not _is_dequantize_affine_node(weight_node):
+        return None
+
+    # Validate and extract weight quantization nodes
+    weight_nodes = _validate_qta8a_qga4w_nodes(program, weight_node)
+    if weight_nodes is None:
+        return None
+    orig_weight, weight_scales, weight_zeros = weight_nodes
+
+    # Validate and extract weight tensors
+    weight_tensors = _validate_qta8a_qga4w_tensors(
+        program, orig_weight, weight_scales, weight_zeros
+    )
+    if weight_tensors is None:
+        return None
+    orig_weight_tensor, weight_scales_tensor, weight_zeros_tensor = weight_tensors
+
+    # Check if weight is quantized to 4 bits
+    if not _validate_4bit_quantization(orig_weight_tensor):
+        return None
+
+    # Calculate and validate group size
+    group_size = _calculate_group_size(orig_weight_tensor, weight_scales_tensor)
+    if group_size is None:
+        return None
+
     # Verify this is 4-bit grouped quantization
     weight_bits = 4

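A note on the refactored helpers above: `_calculate_group_size` recovers the group size purely from tensor shapes, since grouped quantization stores one scale per (output row, input group). The line that performs the division falls between the hunks shown, but from the surrounding checks it is presumably `group_size = in_features // num_groups`. A small illustration with hypothetical shapes:

```python
import torch

# Hypothetical 4-bit weight of shape [64, 256] quantized with group size 32.
orig_weight_tensor = torch.randint(-8, 8, (64, 256), dtype=torch.int8)
weight_scales_tensor = torch.rand(64, 256 // 32)  # one scale per group -> shape [64, 8]

out_features, in_features = orig_weight_tensor.shape          # 64, 256
scales_out_features, num_groups = weight_scales_tensor.shape  # 64, 8

assert scales_out_features == out_features
group_size = in_features // num_groups  # 256 // 8 == 32
assert in_features % group_size == 0
```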
backends/vulkan/custom_ops_lib.py

Lines changed: 0 additions & 1 deletion
@@ -258,7 +258,6 @@ def linear_qta8a_qga4w(
         weight_zeros: Per-group zero points for weights
     """
     original_x_shape = x_quantized.shape
-    batch_size = original_x_shape[0]
     feature_dim = original_x_shape[-1]

     # Reshape for processing
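Only a few lines of the `linear_qta8a_qga4w` reference implementation are visible in the hunk above. As a rough mental model of what the fused operator computes, here is a minimal sketch assuming per-token activation scales/zero-points and per-group weight scales/zeros; apart from `x_quantized`, `weight_scales`, and `weight_zeros`, which appear in the diff, the argument names, layouts, and the flattened output shape are assumptions rather than the operator's actual signature.

```python
import torch


def linear_qta8a_qga4w_reference(
    x_quantized: torch.Tensor,    # int8 activations, shape [..., in_features]
    x_scale: torch.Tensor,        # per-token scales, shape [num_tokens]
    x_zero_point: torch.Tensor,   # per-token zero points, shape [num_tokens]
    weight_4bit: torch.Tensor,    # values in [-8, 7], shape [out_features, in_features]
    weight_scales: torch.Tensor,  # per-group scales, shape [out_features, in_features // group_size]
    weight_zeros: torch.Tensor,   # per-group zero points, same shape as weight_scales
    group_size: int,
) -> torch.Tensor:
    out_features, in_features = weight_4bit.shape
    num_groups = in_features // group_size

    # Dequantize activations per token: (q - zero_point) * scale.
    x = x_quantized.reshape(-1, in_features).to(torch.float32)
    x = (x - x_zero_point.reshape(-1, 1).to(torch.float32)) * x_scale.reshape(-1, 1)

    # Dequantize weights per group: (q - zero_point) * scale, broadcast within each group.
    w = weight_4bit.to(torch.float32).reshape(out_features, num_groups, group_size)
    w = (w - weight_zeros.reshape(out_features, num_groups, 1)) * weight_scales.reshape(
        out_features, num_groups, 1
    )

    return torch.nn.functional.linear(x, w.reshape(out_features, in_features))
```

The fused Vulkan kernel performs this dequantization inline rather than materializing `x` and `w`, which is where the kernel-dispatch and memory-bandwidth savings described in the commit message come from.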
