
Commit 10689f5

Marco Giordano authored and facebook-github-bot committed
Adding mixed quantization support (#14134)
Summary:

# Context

This diff adds support for mixed quantization operators in ExecuTorch: weights and biases can be quantized while inputs and activations are kept in floating point.

# In this diff

1. The matched op node is now returned from each pattern's `get_anchors`.
2. Dequantize nodes are bypassed when they are not needed in the final graph.

Reviewed By: skrtskrtfb

Differential Revision: D81519735
1 parent dc87d22 commit 10689f5
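To make the mechanics concrete, here is a minimal sketch, not the ExecuTorch source: it assumes the `quantized_decomposed` ops are registered (as they are in the quantizer flow), and the helper names are illustrative. It shows how the op node returned by a pattern lets the fusion pass tell a fully quantized output (a quantize node follows the op) from a mixed, float-activation one.

```python
# A minimal sketch, not the ExecuTorch implementation. Helper names are
# illustrative; it assumes torch.ops.quantized_decomposed is registered.
from typing import Optional

import torch
from torch import fx


def trailing_quantize_node(op_node: fx.Node) -> Optional[fx.Node]:
    """Return the quantize_per_tensor node consuming op_node, if it is the sole user."""
    if len(op_node.users) != 1:
        return None
    user = next(iter(op_node.users))
    if user.target == torch.ops.quantized_decomposed.quantize_per_tensor.default:
        return user
    return None


def propagate_meta(fused: fx.Node, op_node: fx.Node) -> None:
    """Copy metadata from the trailing quantize node when present, else from the op."""
    quant_node = trailing_quantize_node(op_node)
    fused.meta = quant_node.meta if quant_node is not None else op_node.meta
```

When no trailing quantize node is found, the fused op simply takes the matched op's place and keeps its metadata, which is what the fusion-pass diff below implements.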

File tree

3 files changed: +62 −50 lines


backends/cadence/aot/quantizer/fusion_pass.py

Lines changed: 25 additions & 13 deletions
@@ -402,7 +402,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901
                 pattern.partition_types(),
             )
             for fused_partition in fused_partitions:
-                anchors = pattern.get_anchors(graph_module, fused_partition)
+                anchors, op_node = pattern.get_anchors(graph_module, fused_partition)
                 if not anchors or anchors.empty:
                     continue
                 if any(self.is_fused(p.nodes) for p in fused_partition):
@@ -443,13 +443,17 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901
                 bias_inputs = [node.args[0] for node in dequants_biases]
                 other_inputs = [node.args[idx] for node, idx in anchors.others]

-                # The node is the first index of the list and first of the tuple
-                anchor_output_node = anchors.output[0][0]
-
-                assert len(anchor_output_node.users) == 1
-                quant_node = list(anchor_output_node.users.keys())[0]
-
-                with graph_module.graph.inserting_after(anchor_output_node):
+                # Check if there's a quantization node after the operation
+                quant_node = None
+                if len(anchors.output) == 1:
+                    # Check if it's actually a quantization node
+                    if hasattr(op_node, 'users') and len(op_node.users) == 1:
+                        potential_quant_node = list(op_node.users.keys())[0]
+                        if (potential_quant_node.target ==
+                                torch.ops.quantized_decomposed.quantize_per_tensor.default):
+                            quant_node = potential_quant_node
+
+                with graph_module.graph.inserting_after(op_node):
                     args = tuple(
                         inputs_inputs + weights_inputs + other_inputs + bias_inputs
                     )
@@ -463,7 +467,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901
                         )
                     elif isinstance(pattern, CatPattern):
                         args, kwargs = get_args_and_kwargs_cat(
-                            inputs_inputs, other_inputs, anchor_output_node
+                            inputs_inputs, other_inputs, op_node
                         )
                     elif isinstance(pattern, ConvReluPatterns):
                         # For ConvReLU, we are fusing Conv+ReLU
@@ -494,7 +498,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901
                             dequants_weights,
                             bias_inputs,
                             quant_node,
-                            anchor_output_node,
+                            op_node,
                         )
                     elif isinstance(pattern, LinearPattern):
                         args, kwargs = get_args_and_kwargs_linear(
@@ -543,18 +547,26 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901
                             dequants_inputs,
                             quant_node,
                         )
+
                     fused = graph_module.graph.call_function(
                         pattern.replacement_op(),
                         args,
                         kwargs,
                     )
-                    fused.meta = quant_node.meta
-                    quant_node.replace_all_uses_with(fused)
+
+                    if quant_node:
+                        fused.meta = quant_node.meta
+                    else:
+                        fused.meta = op_node.meta
+                    op_node.replace_all_uses_with(fused)
+                    if op_node.op == "output":
+                        _ = graph_module.graph.output((fused,))

         legalize_graph(graph_module)
         graph_module.graph.eliminate_dead_code()
-        # pyre-fixme[7]: Incompatible return type
         graph_module.recompile()
+        return PassResult(graph_module, True)
+

     @classmethod
     # pyre-ignore[2]: Parameter `nodes` has no type specified

backends/cadence/aot/quantizer/patterns.py

Lines changed: 36 additions & 36 deletions
@@ -67,7 +67,7 @@ def partition_types(self) -> list[OpOverload]:
     @abstractmethod
     def get_anchors(
         self, gm: torch.fx.GraphModule, fused_partition: List[fx.GraphModule]
-    ) -> Optional[PartitionAnchors]:
+    ) -> Tuple[PartitionAnchors, fx.Node]:
         pass

     @abstractmethod
@@ -85,7 +85,7 @@ def partition_types(self) -> List[OpOverload]:

     def get_anchors(
         self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
-    ) -> PartitionAnchors:
+    ) -> Tuple[PartitionAnchors, fx.Node]:
         # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
         addmm_node = fused_partition[0].nodes[-1]

@@ -101,12 +101,12 @@ def get_anchors(
             qscheme=torch.per_tensor_affine,
         )

-        return PartitionAnchors(
+        return (PartitionAnchors(
             inputs=[(addmm_node, 1)],
             weights=[(addmm_node, 2)],
             biases=[(addmm_node, 0, bias_qspec)],
             output=[(addmm_node,)],
-        )
+        ), addmm_node)

     def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_linear.default
@@ -118,7 +118,7 @@ def partition_types(self) -> List[OpOverload]:

     def get_anchors(
         self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
-    ) -> PartitionAnchors:
+    ) -> Tuple[PartitionAnchors, fx.Node]:
         # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
         add_node = fused_partition[0].nodes[-1]

@@ -129,16 +129,16 @@ def get_anchors(
             add_node.args[1], fx.Node
         )
         if not is_tensor_add or len(add_node.kwargs) > 0:
-            return PartitionAnchors(
+            return (PartitionAnchors(
                 empty=True,
-            )
+            ), add_node)

-        return PartitionAnchors(
+        return (PartitionAnchors(
             inputs=[(add_node, 0), (add_node, 1)],
             weights=[],
             biases=[],
             output=[(add_node,)],
-        )
+        ), add_node)

     def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_add.default
@@ -150,16 +150,16 @@ def partition_types(self) -> List[OpOverload]:

     def get_anchors(
         self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
-    ) -> PartitionAnchors:
+    ) -> Tuple[PartitionAnchors, fx.Node]:
         # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
         bmm_node = fused_partition[0].nodes[-1]

-        return PartitionAnchors(
+        return (PartitionAnchors(
             inputs=[(bmm_node, 0), (bmm_node, 1)],
             weights=[],
             biases=[],
             output=[(bmm_node,)],
-        )
+        ), bmm_node)

     def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_matmul.default
@@ -171,7 +171,7 @@ def partition_types(self) -> List[OpOverload]:

     def get_anchors(
         self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
-    ) -> PartitionAnchors:
+    ) -> Tuple[PartitionAnchors, fx.Node]:
         # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
         cat_node = fused_partition[0].nodes[-1]

@@ -198,14 +198,14 @@ def get_anchors(
                 )
             )

-        return PartitionAnchors(
+        return (PartitionAnchors(
             inputs=args,
             weights=[],
             biases=[],
             output=[
                 (cat_node, SharedQuantizationSpec((cat_node.args[0][0], cat_node)))
             ],
-        )
+        ), cat_node)

     def replacement_op(self) -> OpOverload:
         return torch.ops.aten.cat.default
@@ -217,7 +217,7 @@ def partition_types(self) -> List[OpOverload]:

     def get_anchors(
         self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
-    ) -> PartitionAnchors:
+    ) -> Tuple[PartitionAnchors, fx.Node]:
         # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
         conv1d_node = fused_partition[0].nodes[-1]

@@ -238,13 +238,13 @@ def get_anchors(
         if len(conv1d_node.args) > 2 and conv1d_node.args[2] is not None:
             bias = [(conv1d_node, 2, bias_qspec)]

-        return PartitionAnchors(
+        return (PartitionAnchors(
             inputs=[(conv1d_node, 0)],
             weights=[(conv1d_node, 1)],
             # pyre-fixme[6]: Incompatible parameter type
             biases=bias,
             output=[(conv1d_node,)],
-        )
+        ), conv1d_node)

     def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_conv2d_nchw.default
@@ -256,7 +256,7 @@ def partition_types(self) -> List[OpOverload]:

     def get_anchors(
         self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
-    ) -> PartitionAnchors:
+    ) -> Tuple[PartitionAnchors, fx.Node]:
         # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
         conv2d_node = fused_partition[0].nodes[-1]

@@ -277,13 +277,13 @@ def get_anchors(
         if len(conv2d_node.args) > 2 and conv2d_node.args[2] is not None:
             bias = [(conv2d_node, 2, bias_qspec)]

-        return PartitionAnchors(
+        return (PartitionAnchors(
             inputs=[(conv2d_node, 0)],
             weights=[(conv2d_node, 1)],
             # pyre-fixme[6]: Incompatible parameter type
             biases=bias,
             output=[(conv2d_node,)],
-        )
+        ), conv2d_node)

     def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_conv2d_nchw.default
@@ -295,7 +295,7 @@ def partition_types(self) -> List[OpOverload]:

     def get_anchors(
         self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
-    ) -> PartitionAnchors:
+    ) -> Tuple[PartitionAnchors, fx.Node]:
         # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
         layer_norm_node = fused_partition[0].nodes[-1]

@@ -311,14 +311,14 @@ def get_anchors(

         # Weights are used in quantized mode by our kernel, so they are
         # passed in as others here along with the normalized shape.
-        return PartitionAnchors(
+        return (PartitionAnchors(
             inputs=[(layer_norm_node, 0)],
             weights=[],
             biases=[],
             # Ordering: normalized_shape, weights, bias
             others=others,
             output=[(layer_norm_node,)],
-        )
+        ), layer_norm_node)

     def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_layer_norm.default
@@ -330,7 +330,7 @@ def partition_types(self) -> List[OpOverload]:

     def get_anchors(
         self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
-    ) -> PartitionAnchors:
+    ) -> Tuple[PartitionAnchors, fx.Node]:
         # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
         linear_node = fused_partition[0].nodes[-1]

@@ -351,13 +351,13 @@ def get_anchors(
         if len(linear_node.args) > 2:
             bias = [(linear_node, 2, bias_qspec)]

-        return PartitionAnchors(
+        return (PartitionAnchors(
             inputs=[(linear_node, 0)],
             weights=[(linear_node, 1)],
             # pyre-fixme[6]: Incompatible parameter type
             biases=bias,
             output=[(linear_node,)],
-        )
+        ), linear_node)

     def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_linear.default
@@ -369,16 +369,16 @@ def partition_types(self) -> List[OpOverload]:

     def get_anchors(
         self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
-    ) -> PartitionAnchors:
+    ) -> Tuple[PartitionAnchors, fx.Node]:
         # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
         matmul_node = fused_partition[0].nodes[-1]

-        return PartitionAnchors(
+        return (PartitionAnchors(
             inputs=[(matmul_node, 0), (matmul_node, 1)],
             weights=[],
             biases=[],
             output=[(matmul_node,)],
-        )
+        ), matmul_node)

     def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_matmul.default
@@ -392,16 +392,16 @@ def partition_types(self) -> List[OpOverload]:

     def get_anchors(
         self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
-    ) -> PartitionAnchors:
+    ) -> Tuple[PartitionAnchors, fx.Node]:
         # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
         relu_node = fused_partition[0].nodes[-1]

-        return PartitionAnchors(
+        return (PartitionAnchors(
             inputs=[(relu_node, 0)],
             weights=[],
             biases=[],
             output=[(relu_node,)],
-        )
+        ), relu_node)

     def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_relu.default
@@ -427,7 +427,7 @@ def partition_types(self) -> List[OpOverload]:

     def get_anchors(
         self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
-    ) -> PartitionAnchors:
+    ) -> Tuple[PartitionAnchors, fx.Node]:
         # The first node should be conv, the second should be relu
         # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
         conv_node = fused_partition[0].nodes[-1]  # Second to last node
@@ -451,13 +451,13 @@ def get_anchors(
         if len(conv_node.args) > 2 and conv_node.args[2] is not None:
             bias = [(conv_node, 2, bias_qspec)]

-        return PartitionAnchors(
+        return (PartitionAnchors(
             inputs=[(conv_node, 0)],
             weights=[(conv_node, 1)],
             # pyre-fixme[6]: Incompatible parameter type
             biases=bias,
             output=[(relu_node,)],  # Output is from the relu node
-        )
+        ), relu_node)

     def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_conv2d_nchw.default
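For anyone writing a new pattern against the updated contract, the shape of the return value is the only change: `get_anchors` now pairs the `PartitionAnchors` with the matched op node. Below is a hypothetical sketch under that contract; the pattern is not part of this commit, and the base class name and import path are assumed from the file above.

```python
# Hypothetical example only: a pattern returning (PartitionAnchors, op_node).
# Import path and QuantizationPattern base class are assumed from patterns.py.
from typing import List, Tuple

import torch
from torch import fx
from torch._ops import OpOverload

from executorch.backends.cadence.aot.quantizer.patterns import (
    PartitionAnchors,
    QuantizationPattern,
)


class MulPattern(QuantizationPattern):
    def partition_types(self) -> List[OpOverload]:
        return [torch.ops.aten.mul.Tensor]

    def get_anchors(
        self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
    ) -> Tuple[PartitionAnchors, fx.Node]:
        mul_node = fused_partition[0].nodes[-1]
        # Anchors describe what may be quantized; the node itself is returned so
        # the fusion pass can rewire it even when no quantize node follows it.
        return (
            PartitionAnchors(
                inputs=[(mul_node, 0), (mul_node, 1)],
                weights=[],
                biases=[],
                output=[(mul_node,)],
            ),
            mul_node,
        )

    def replacement_op(self) -> OpOverload:
        # Illustrative placeholder; a real pattern would return its fused kernel op.
        return torch.ops.aten.mul.Tensor
```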

backends/cadence/aot/quantizer/quantizer.py

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
             if not no_outside_users(fused_partition):
                 continue

-            anchors = self.pattern.get_anchors(model, fused_partition)
+            anchors, _ = self.pattern.get_anchors(model, fused_partition)
             if not anchors or anchors.empty:
                 continue
             if is_annotated(
