Commit 5b8fdb9 (2 parents: 58f5ebd + 066f34b)
Author: ssjia

Update on "[ET-VK][AOT] Enable exporting Q8 Quantized Linear + Convolution"
As title. Introduce fusion patterns to enable fusing quantized convolution and linear graph patterns into a custom op.

## Changes

Introduce the concept of using custom pattern detection functions to detect graph patterns, rather than relying solely on SubgraphMatcher. The issue with SubgraphMatcher is that a large number of graph patterns may need to be exported to cover variants for different combinations of decompositions and quantization workflows; a custom detection function improves maintainability.

Implement detection + replacement functions for quantized linear and quantized conv2d.

Differential Revision: [D81323425](https://our.internmc.facebook.com/intern/diff/D81323425/)

[ghstack-poisoned]
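As a rough sketch of what a custom detection + replacement function can look like (every name below — the match container, the `is_dequant_node` predicate, and the `torch.ops.et_vk.linear_q8csw` handle — is a hypothetical stand-in, not the actual implementation):

```python
# Hypothetical sketch: detect a dequantize -> linear pattern by inspecting
# nodes directly, then splice in a single fused custom op. This avoids
# exporting one reference graph per decomposition/quantization variant,
# which is what SubgraphMatcher-based matching would require.
from dataclasses import dataclass
from typing import Optional

import torch


def is_dequant_node(node: torch.fx.Node) -> bool:
    # Hypothetical predicate; the real pass uses its own utils helpers.
    return node.op == "call_function" and "dequantize" in str(node.target)


@dataclass
class QuantizedLinearMatch:
    # Anchor nodes of a detected quantized-linear pattern.
    dq_input: torch.fx.Node
    dq_weight: torch.fx.Node
    linear: torch.fx.Node


def detect_quantized_linear(node: torch.fx.Node) -> Optional[QuantizedLinearMatch]:
    # Detection works off the node and its inputs, not a reference graph.
    if node.op != "call_function" or node.target != torch.ops.aten.linear.default:
        return None
    dq_input, dq_weight = node.args[0], node.args[1]
    for dq in (dq_input, dq_weight):
        if not (isinstance(dq, torch.fx.Node) and is_dequant_node(dq)):
            return None
    return QuantizedLinearMatch(dq_input, dq_weight, node)


def replace_quantized_linear(graph: torch.fx.Graph, m: QuantizedLinearMatch) -> None:
    # `torch.ops.et_vk.linear_q8csw` is a placeholder for the fused custom op.
    with graph.inserting_before(m.linear):
        fused = graph.call_function(
            torch.ops.et_vk.linear_q8csw,
            args=(m.dq_input.args[0], m.dq_weight.args[0]),
        )
    m.linear.replace_all_uses_with(fused)
```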

File tree: 6 files changed (+125 −22 lines)

backends/vulkan/_passes/fold_qdq.py

Lines changed: 6 additions & 13 deletions

```diff
@@ -23,26 +23,19 @@ def __init__(self, edge_program: torch.export.ExportedProgram):
 
     def call(self, graph_module: torch.fx.GraphModule):
         for node in graph_module.graph.nodes:
-            # Criteria for a foldable Q/DQ node:
-            # - only one user (dequantize)
             if utils.is_quant_node(node):
-                if len(node.users) > 1:
-                    continue
-
-                dq_node = None
+                original_node = node.args[0]
+                assert isinstance(original_node, torch.fx.Node)
+                # For each direct user that is a dequant node, connect the original
+                # node to the users of the dequant node.
                 for user in node.users:
                     if utils.is_dequant_node(user):
                         dq_node = user
-
-                if dq_node is None:
-                    continue
-
-                original_node = node.args[0]
-                assert isinstance(original_node, torch.fx.Node)
-                dq_node.replace_all_uses_with(original_node)
+                        dq_node.replace_all_uses_with(original_node)
 
         graph_module.recompile()
         dead_code_elimination_pass(graph_module)
         # Re-trace to validate everything is ok
         graph_module = super().call(graph_module).graph_module
+
        return PassResult(graph_module, True)
```
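Read as a graph rewrite, the new behavior condenses to a few lines (a standalone sketch; `utils` stands for the backend's helper module with the same predicates used in the diff):

```python
import torch

def fold_qdq_pairs(graph_module: torch.fx.GraphModule, utils) -> None:
    # Before:  original -> quantize -> dequantize -> consumer(s)
    # After:   original -> consumer(s); the orphaned Q/DQ nodes become dead
    # code and are removed by the follow-up DCE pass.
    for node in graph_module.graph.nodes:
        if utils.is_quant_node(node):
            original_node = node.args[0]
            for user in list(node.users):
                if utils.is_dequant_node(user):
                    user.replace_all_uses_with(original_node)
    graph_module.recompile()
```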

backends/vulkan/custom_ops_lib.py

Lines changed: 8 additions & 1 deletion

```diff
@@ -391,6 +391,7 @@ def conv2d_q8ta_q8csw(
     padding: list,
     dilation: list,
     groups: int,
+    out_channels: int,
 ):
     weight_zeros = torch.zeros_like(weight_scales, dtype=torch.int32)
 
@@ -409,6 +410,10 @@ def conv2d_q8ta_q8csw(
     # Reshape to original 4D format (OC, IC, H, W)
     qweights_4d = qweights_transposed.view(OC, IC, H, W)
 
+    # Remove any padding added to output channels dim
+    if out_channels != OC:
+        qweights_4d = qweights_4d[:out_channels, :, :, :]
+
     # Dequantize weights
     weights = torch.ops.quantized_decomposed.dequantize_per_channel(
         qweights_4d,
@@ -443,11 +448,13 @@ def conv2d_q8ta_q8csw(
         SymInt[] stride,
         SymInt[] padding,
         SymInt[] dilation,
-        SymInt groups) -> Tensor
+        SymInt groups,
+        SymInt out_channels) -> Tensor
     """
 )
 lib.impl(name, conv2d_q8ta_q8csw, "CompositeExplicitAutograd")
 conv2d_q8ta_q8csw_op = getattr(getattr(torch.ops, namespace), name)
+
 ######################
 ## apply_rotary_emb ##
 ######################
```
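To illustrate what the new `out_channels` argument buys (the shapes here are invented for the example): if the serialized weight was padded from 30 to 32 output channels at export time, the reference implementation slices the padding back off before dequantizing, so the result matches the original convolution:

```python
import torch

# Hypothetical shapes: weights exported with OC padded 30 -> 32 so that
# OC is a multiple of 4 (texel alignment); IC=8, 3x3 kernel.
OC, IC, H, W = 32, 8, 3, 3
out_channels = 30  # the original, unpadded output channel count

qweights_4d = torch.randint(-128, 127, (OC, IC, H, W), dtype=torch.int8)

# Mirrors the new branch in conv2d_q8ta_q8csw: drop the zero-padding rows
# along the output-channel dimension before dequantization.
if out_channels != OC:
    qweights_4d = qweights_4d[:out_channels, :, :, :]

assert qweights_4d.shape == (30, 8, 3, 3)
```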

backends/vulkan/op_registry.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -490,6 +490,7 @@ def register_quantized_conv_op():
             utils.NO_STORAGE,  # padding (non tensor)
             utils.NO_STORAGE,  # dilation (non tensor)
             utils.NO_STORAGE,  # groups (non tensor)
+            utils.NO_STORAGE,  # original OC count (non tensor)
         ],
         supports_resize=False,
         supports_prepacking=True,
```

backends/vulkan/patterns/quantized_convolution.py

Lines changed: 27 additions & 7 deletions

```diff
@@ -151,19 +151,38 @@ def make_conv2d_q8ta_q8csw_custom_op(
 
     # Reshape weight tensor from (OC, IC, H, W) to (IC * H * W, OC) for matrix multiplication
     # This prepares the weights for Im2Col-based convolution computation
-    OC, IC, H, W = weight_tensor.shape
+    orig_OC, IC, H, W = weight_tensor.shape
+    OC = orig_OC
+    fake_weight = match.weight_node.meta["val"]
+
+    # The implementation requires that for grouped convolutions, a group does not cross
+    # any texel boundary.
+    if match.groups > 1:
+        assert (OC / match.groups) % 4 == 0
+
+    # The implementation requires that OC is a multiple of 4 so that data load/stores
+    # are well aligned with texel boundaries. If the original output channel count is
+    # not a multiple of 4, then add padding.
+    if OC % 4 != 0:
+        num_padding = 4 - (OC % 4)
+        # Pad the OC (output channel) dimension at the end with zeros
+        weight_tensor = torch.nn.functional.pad(
+            weight_tensor, (0, 0, 0, 0, 0, 0, 0, num_padding)
+        )
+        fake_weight = torch.nn.functional.pad(
+            fake_weight, (0, 0, 0, 0, 0, 0, 0, num_padding)
+        )
+        OC, IC, H, W = weight_tensor.shape
 
     weight_tensor_reshaped = (
         weight_tensor.permute(2, 3, 1, 0).contiguous().view(IC * H * W, OC)
     )
+    fake_weight_reshaped = (
+        fake_weight.permute(2, 3, 1, 0).contiguous().view(IC * H * W, OC)
+    )
     utils.update_program_state_dict(ep, match.weight_node.name, weight_tensor_reshaped)
     # Need to make sure the fake tensor matches the updated tensor's properties
-    match.weight_node.meta["val"] = (
-        match.weight_node.meta["val"]
-        .permute(1, 2, 3, 0)
-        .contiguous()
-        .view(IC * H * W, OC)
-    )
+    match.weight_node.meta["val"] = fake_weight_reshaped
 
     first_graph_node = list(graph_module.graph.nodes)[0]
     with graph_module.graph.inserting_before(first_graph_node):
@@ -200,6 +219,7 @@ def make_conv2d_q8ta_q8csw_custom_op(
             match.padding,
             match.dilation,
             match.groups,
+            orig_OC,
         ),
     )
 
```
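One subtlety worth spelling out: in the pad spec `(0, 0, 0, 0, 0, 0, 0, num_padding)`, `torch.nn.functional.pad` consumes (low, high) pairs starting from the *last* dimension, so only the final pair touches dim 0, i.e. OC. A standalone sketch with invented shapes:

```python
import torch
import torch.nn.functional as F

# Made-up weight shape with OC=30, not a multiple of 4.
weight = torch.randn(30, 8, 3, 3)  # (OC, IC, H, W)

OC = weight.shape[0]
if OC % 4 != 0:
    num_padding = 4 - (OC % 4)  # 2 in this example
    # F.pad pairs run from the last dim backwards: (W_lo, W_hi, H_lo, H_hi,
    # IC_lo, IC_hi, OC_lo, OC_hi) -- only the OC tail is padded, with zeros.
    weight = F.pad(weight, (0, 0, 0, 0, 0, 0, 0, num_padding))

assert weight.shape == (32, 8, 3, 3)

# The padded tensor is then laid out for im2col-style matmul, as in the diff:
IC, H, W = weight.shape[1:]
weight_reshaped = weight.permute(2, 3, 1, 0).contiguous().view(IC * H * W, 32)
```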

backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp

Lines changed: 4 additions & 0 deletions

```diff
@@ -478,6 +478,8 @@ void conv2d_q8csw_linear_tiled_impl(
   const ValueRef padding = args.at(idx++);
   const ValueRef dilation = args.at(idx++);
   const ValueRef groups = args.at(idx++);
+  const ValueRef orig_OC = args.at(idx++);
+  (void)orig_OC;
   const ValueRef output = args.at(idx++);
 
   const ValueRef packed_weight = prepack_q8_linear_weight(graph, weight);
@@ -552,6 +554,8 @@ void conv2d_q8ta_q8csw_linear_tiled_impl(
   const ValueRef padding = args.at(idx++);
   const ValueRef dilation = args.at(idx++);
   const ValueRef groups = args.at(idx++);
+  const ValueRef orig_OC = args.at(idx++);
+  (void)orig_OC;
   const ValueRef output = args.at(idx++);
 
   const ValueRef packed_weight = prepack_q8_linear_weight(graph, weight);
```

backends/vulkan/test/test_vulkan_delegate.py

Lines changed: 79 additions & 1 deletion

```diff
@@ -2544,7 +2544,85 @@ def forward(self, x):
             0.75,
         )
 
-        input_tensor = torch.ones((1, 3, 32, 32), dtype=torch.float32)
+        # Create sample inputs
+        sample_inputs = (input_tensor,)
+
+        # Create XNNPACK quantizer with symmetric quantization config
+        quantizer = XNNPACKQuantizer()
+        operator_config = get_symmetric_quantization_config(
+            is_per_channel=True,
+            is_dynamic=False,
+        )
+        quantizer.set_global(operator_config)
+
+        # Test the quantized module using the existing quantize_and_lower_module function
+        # Use higher tolerance since quantization introduces some error
+        edge_program = quantize_and_lower_module(
+            conv_sequence_module, sample_inputs, quantizer
+        )
+
+        et_program = edge_program.to_executorch()
+        self.check_vk_delegation(et_program)
+
+        self.run_delegated_model_and_check_output(
+            et_program,
+            conv_sequence_module,
+            sample_inputs,
+            atol=1e-2,
+            rtol=1e-1,
+        )
+
+    def test_vulkan_backend_xnnpack_pt2e_quantized_conv_sequence_all_reduced(self):
+        """
+        Test a sequence of convolution layers quantized with PT2E quantization.
+        This test creates a module with multiple Conv2d layers in sequence and applies
+        XNNPACK symmetric quantization to test the quantized model execution.
+        Similar to the linear sequence test but using convolution layers.
+        """
+
+        import executorch.backends.vulkan.test.utils as test_utils
+
+        class ConvSequenceModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv1 = torch.nn.Conv2d(
+                    in_channels=3,
+                    out_channels=32,
+                    kernel_size=3,
+                    padding=1,
+                    bias=False,
+                )
+                self.conv2 = torch.nn.Conv2d(
+                    in_channels=32,
+                    out_channels=1,
+                    kernel_size=3,
+                    padding=1,
+                    bias=False,
+                )
+
+                MAX = 0.75
+                MIN = -0.25
+                self.conv1.weight.data = test_utils.random_uniform_tensor(
+                    self.conv1.weight.shape, MIN, MAX
+                )
+                self.conv2.weight.data = test_utils.random_uniform_tensor(
+                    self.conv2.weight.shape, MIN, MAX
+                )
+
+            def forward(self, x):
+                x = self.conv1(x)
+                x = self.conv2(x)
+                return x
+
+        # Create the module
+        conv_sequence_module = ConvSequenceModule()
+
+        input_tensor = test_utils.random_uniform_tensor(
+            (1, 3, 32, 32),
+            -0.25,
+            0.75,
+        )
+
         # Create sample inputs
         sample_inputs = (input_tensor,)
```
