
Commit ef9f31f

Andrew Grebenisan authored and facebook-github-bot committed
Utility function for numerical correctness of edge dialect graphs and reference implementations (pytorch#14036)
Summary: Created two utility functions:

1. Converts an edge dialect graph into one where custom cadence op nodes are replaced with python references.
2. Validates the outputs (and optionally intermediates) of the graphs.

Updated two tests in test_replace_ops_passes to utilize these utility functions.

Differential Revision: D81843001
1 parent 54d5b0f commit ef9f31f
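
A minimal usage sketch of the two utilities added in this commit (hedged: `gm` and `example_inputs` are hypothetical placeholders standing in for an edge-dialect GraphModule and its inputs, not part of the commit):

    from executorch.backends.cadence.aot.pass_utils import (
        construct_reference_graph_module,
        numerically_equivalent,
    )

    # 1. Swap cadence custom op nodes for their python reference implementations.
    ref_gm = construct_reference_graph_module(gm)  # gm: edge-dialect GraphModule

    # 2. Compare outputs (and optionally intermediates) of the two graphs.
    assert numerically_equivalent(
        gm, example_inputs, exact_match=False, validate_intermediates=True
    )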

File tree: 4 files changed (+204, −33 lines)

backends/cadence/aot/TARGETS
backends/cadence/aot/pass_utils.py
backends/cadence/aot/replace_ops.py
backends/cadence/aot/tests/test_replace_ops_passes.py


backends/cadence/aot/TARGETS

Lines changed: 2 additions & 0 deletions
@@ -82,6 +82,8 @@ python_library(
     ],
     deps = [
         ":utils",
+        ":ops_registrations",
+        ":ref_implementations",
         "//caffe2:torch",
         "//executorch/exir:pass_base",
         "//executorch/exir/dialects:lib",

backends/cadence/aot/pass_utils.py

Lines changed: 155 additions & 1 deletion
@@ -5,17 +5,22 @@
 # LICENSE file in the root directory of this source tree.
 
 # pyre-strict
-
 from dataclasses import dataclass
+from functools import partial
+from operator import attrgetter
 from typing import Callable, List, Optional, Set, Type, Union
 
+import executorch.backends.cadence.aot.ops_registrations  # noqa
+import executorch.backends.cadence.aot.ref_implementations  # noqa
+
 import torch
 from executorch.backends.cadence.aot.utils import get_edge_overload_packet
 
 from executorch.exir.dialects.edge._ops import EdgeOpOverload, EdgeOpOverloadPacket
 from executorch.exir.pass_base import PassBase, PassResult
 
 from torch._ops import OpOverloadPacket
+from torch.utils._pytree import PyTree
 
 
 # Is an overlap in tensor lifetime and storage allowed at the current opt level?
@@ -115,6 +120,155 @@ def op_counts_match(
     return True
 
 
+def construct_reference_graph_module(
+    graph_module: torch.fx.GraphModule,
+) -> torch.fx.GraphModule:
+    """
+    Given a graph module in edge dialect, construct a new graph module with the same
+    structure as the input graph module, but with all cadence custom op nodes
+    replaced with their corresponding reference implementations in torch.ops.cadence.<name>.
+    """
+    new_graph = torch.fx.Graph()
+    val_map = {}
+
+    def _get_cadence_op_with_overload(node: torch.fx.Node) -> Optional[str]:
+        """Get full cadence operation name with overload."""
+        if not (node.op == "call_function" and isinstance(node.target, EdgeOpOverload)):
+            return None
+
+        schema_name = node.target._schema.name
+        if not schema_name.startswith("cadence::"):
+            return None
+
+        base_op_name = schema_name.split("::", 1)[1]
+        prefix = f"cadence_{base_op_name}_"
+
+        return (
+            f"{base_op_name}.{node.name[len(prefix):]}"
+            if node.name.startswith(prefix)
+            else base_op_name
+        )
+
+    for node in graph_module.graph.nodes:
+        if node.op == "call_function" and isinstance(node.target, EdgeOpOverload):
+            # Schema name format: "namespace::operation_name"
+            op = _get_cadence_op_with_overload(node)
+            if op is None:  # Copy the nodes as-is
+                new_node = new_graph.node_copy(node, lambda n: val_map[n])
+                val_map[node] = new_node
+                continue
+
+            try:
+                ref_op = attrgetter(op)(torch.ops.cadence)
+            except AttributeError:
+                raise RuntimeError(
+                    f"Could not find reference implementation for {op} in {torch.ops.cadence}"
+                )
+            new_node = new_graph.create_node(
+                node.op,
+                ref_op,
+                args=tuple(
+                    val_map[arg] if isinstance(arg, torch.fx.Node) else arg
+                    for arg in node.args
+                ),
+                kwargs={
+                    k: val_map[v] if isinstance(v, torch.fx.Node) else v
+                    for k, v in node.kwargs.items()
+                },
+                name=node.name,
+            )
+            val_map[node] = new_node
+        else:
+            # Copy all other nodes as-is
+            new_node = new_graph.node_copy(node, lambda n: val_map[n])
+            val_map[node] = new_node
+
+    # Create a new GraphModule with the new graph and the same code as the original
+    return torch.fx.GraphModule(graph_module, new_graph)
+
+
+def numerically_equivalent(
+    graph_module: torch.fx.GraphModule,
+    example_inputs: tuple[torch.Tensor, ...],
+    exact_match: bool,
+    rtol: float = 1e-3,
+    atol: float = 1e-3,
+    validate_intermediates: bool = False,
+) -> Union[bool, tuple[bool, dict[str, torch.Tensor], dict[str, torch.Tensor]]]:
+    """
+    Constructs a new GraphModule from the input graph_module, replacing all cadence EdgeOpOverload
+    nodes with their corresponding reference implementations in
+    executorch.backends.cadence.aot.ref_implementations (i.e., torch.ops.cadence.<name>).
+    All aten nodes are left unchanged.
+
+    Args:
+        graph_module: The input graph module to be checked for numerical equivalence.
+        example_inputs: Example inputs to the graph module.
+        exact_match: If True, the outputs of the original and transformed graph modules must be exactly equal.
+        rtol: Relative tolerance for torch.allclose. Unused if exact_match is True.
+        atol: Absolute tolerance for torch.allclose. Unused if exact_match is True.
+        validate_intermediates: If True, also check that the intermediate values of the original and transformed
+            graph modules are numerically equivalent. If False, only check that the final outputs are equivalent.
+
+    Returns:
+        True if the original and transformed graph modules are numerically equivalent, False otherwise. Raises
+        an error if the cadence reference implementation does not exist.
+    """
+
+    # Create a new GraphModule with the new graph and the same code as the original
+    new_graph_module = construct_reference_graph_module(graph_module)
+
+    # Add forward hooks to capture all intermediates from both original and new GraphModules
+    orig_intermediates: list[PyTree] = []
+    ref_intermediates: list[PyTree] = []
+
+    def get_orig_intermediate(
+        module: torch.fx.GraphModule, input: PyTree, output: PyTree
+    ) -> None:
+        nonlocal orig_intermediates
+        orig_intermediates.append(output)
+
+    def get_new_intermediate(
+        module: torch.fx.GraphModule, input: PyTree, output: PyTree
+    ) -> None:
+        nonlocal ref_intermediates
+        ref_intermediates.append(output)
+
+    hooks = []
+    if validate_intermediates:
+        for module in graph_module.modules():
+            hooks.append(module.register_forward_hook(get_orig_intermediate))
+
+        for module in new_graph_module.modules():
+            # Don't bother saving hooks for the new graph module since we're
+            # throwing out the new graph after this function call
+            module.register_forward_hook(get_new_intermediate)
+
+    orig_outs = graph_module(*example_inputs)
+    new_outs = new_graph_module(*example_inputs)
+    for hook in hooks:
+        hook.remove()
+
+    if not validate_intermediates:
+        orig_intermediates = [orig_outs]
+        ref_intermediates = [new_outs]
+
+    assert (
+        len(orig_intermediates) == len(ref_intermediates)
+        and len(orig_intermediates) > 0
+    )
+    if exact_match:
+        comparison_func = torch.equal
+    else:
+        comparison_func = partial(torch.allclose, rtol=rtol, atol=atol, equal_nan=False)
+
+    close_tree = torch.utils._pytree.tree_map(
+        comparison_func, orig_intermediates, ref_intermediates
+    )
+    close_leaves, _ = torch.utils._pytree.tree_flatten(close_tree)
+    return all(close_leaves)
+
+
 # Testing utils
 # Return the compute/function nodes in the graph
 def get_compute_nodes_in_gm(graph_module: torch.fx.GraphModule) -> List[torch.fx.Node]:
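
For reference, the leaf-by-leaf check that `numerically_equivalent` performs via `tree_map` looks like this in isolation (a standalone sketch with toy tensors, not part of the commit):

    from functools import partial

    import torch
    from torch.utils import _pytree

    # Toy nested outputs standing in for the captured intermediates.
    a = {"out": torch.tensor([1.0, 2.0]), "aux": (torch.tensor(3.0),)}
    b = {"out": torch.tensor([1.0, 2.0001]), "aux": (torch.tensor(3.0),)}

    # exact_match=False selects torch.allclose with the given tolerances.
    cmp = partial(torch.allclose, rtol=1e-3, atol=1e-3, equal_nan=False)
    close_tree = _pytree.tree_map(cmp, a, b)  # same structure, bool leaves
    close_leaves, _ = _pytree.tree_flatten(close_tree)
    print(all(close_leaves))  # True: 2.0001 is within rtol/atol of 2.0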

backends/cadence/aot/replace_ops.py

Lines changed: 3 additions & 3 deletions
@@ -979,18 +979,18 @@ def call_operator(
     ) -> ProxyValue:
         if op not in {
             exir_ops.edge.cadence.convolution.default,
-            exir_ops.edge.cadence.quantized_conv_nchw.default,
+            exir_ops.edge.cadence.quantized_conv_nchw.per_tensor,
         }:
             return super().call_operator(op, args, kwargs, meta)
 
-        quantized_op = op == exir_ops.edge.cadence.quantized_conv_nchw.default
+        quantized_op = op == exir_ops.edge.cadence.quantized_conv_nchw.per_tensor
 
         if not quantized_op and len(args) == 8 and args[-1] is True:
            # Already in NHWC layout.
            return super().call_operator(op, args, kwargs, meta)
 
        new_op = (
-            exir_ops.edge.cadence.quantized_conv_nhwc.default
+            exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor
            if quantized_op
            else exir_ops.edge.cadence.convolution.default
        )
backends/cadence/aot/tests/test_replace_ops_passes.py

Lines changed: 44 additions & 29 deletions
@@ -15,7 +15,11 @@
     GraphBuilder,
     single_op_builder,
 )
-from executorch.backends.cadence.aot.pass_utils import count_node, op_counts_match
+from executorch.backends.cadence.aot.pass_utils import (
+    count_node,
+    numerically_equivalent,
+    op_counts_match,
+)
 from executorch.backends.cadence.aot.replace_ops import (
     MakeSliceAndCatDimOutermostPass,
     ReplaceAdaptiveAvgPoolWithAtenAvgPoolPass,
@@ -1611,7 +1615,7 @@ def test_no_transpose_if_already_channel_last(self) -> None:
 
     def create_quantized_convolution_graph_module(
         self, channels_last: Optional[bool] = None
-    ) -> torch.fx.GraphModule:
+    ) -> tuple[torch.fx.GraphModule, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
         """Helper to create a quantized conv node.
 
         quantized_conv(
@@ -1621,23 +1625,32 @@ def create_quantized_convolution_graph_module(
             Tensor out_shift, bool channel_last=False) -> (Tensor Z)"
         """
         if channels_last:
-            x = torch.randn(1, 224, 56, 3)
-            w = torch.randn(16, 16, 16, 3)
+            x = torch.randint(
+                low=-128, high=127, size=(1, 224, 56, 3), dtype=torch.int8
+            )
+            w = torch.randint(
+                low=-128, high=127, size=(16, 16, 16, 3), dtype=torch.int8
+            )
         else:
-            x = torch.randn(1, 3, 224, 56)
-            w = torch.randn(16, 3, 16, 16)
-        b = torch.randn(16)
+            x = torch.randint(
+                low=-128, high=127, size=(1, 3, 224, 56), dtype=torch.int8
+            )
+            w = torch.randint(
+                low=-128, high=127, size=(16, 3, 16, 16), dtype=torch.int8
+            )
+
+        b = torch.randint(low=-128, high=127, size=(16,), dtype=torch.int32)
         stride = (2, 2)
         padding = (0, 0)
         dilation = (1, 1)
         groups = 1
         input_zero_point = 0
-        w_zero_point = torch.randn(1)
-        b_scale = torch.randn(1)
+        w_zero_point = 1
+        b_scale = 0.8
         out_scale = 1
         out_zero_point = 0
-        out_multiplier = torch.randn(1)
-        out_shift = torch.randn(1)
+        out_multiplier = 0
+        out_shift = 0
         args = (
             x,
             w,
@@ -1660,44 +1673,39 @@ def create_quantized_convolution_graph_module(
                     x,
                     w,
                     b,
-                    w_zero_point,
-                    b_scale,
-                    out_multiplier,
-                    out_shift,
                 ),
-                op=exir_ops.edge.cadence.quantized_conv_nhwc.default,
+                op=exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor,
                 args=args,
-            )
+            ), (x, w, b)
         else:
             return single_op_builder(
                 placeholders=(
                     x,
                     w,
                     b,
-                    w_zero_point,
-                    b_scale,
-                    out_multiplier,
-                    out_shift,
                 ),
-                op=exir_ops.edge.cadence.quantized_conv_nchw.default,
+                op=exir_ops.edge.cadence.quantized_conv_nchw.per_tensor,
                 args=args,
-            )
+            ), (x, w, b)
 
     def test_quantized_convolution_default_channel_last(self) -> None:
         # Create a graph with a single convolution node.
-        gm = self.create_quantized_convolution_graph_module()
+        gm, (x, w, b) = self.create_quantized_convolution_graph_module()
         self.assertEqual(
-            count_node(gm, exir_ops.edge.cadence.quantized_conv_nchw.default), 1
+            count_node(gm, exir_ops.edge.cadence.quantized_conv_nchw.per_tensor), 1
         )
         self.assertEqual(count_node(gm, exir_ops.edge.aten.permute_copy.default), 0)
 
+        self.assertTrue(numerically_equivalent(gm, (x, w, b), True))
+
         # Apply replacement pass.
         p = ReplaceConvWithChannelLastConvPass()
         gm_after_replacement = p.call(gm).graph_module
         # Check that no replacement was made.
         self.assertEqual(
             count_node(
-                gm_after_replacement, exir_ops.edge.cadence.quantized_conv_nhwc.default
+                gm_after_replacement,
+                exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor,
             ),
             1,
         )
@@ -1707,26 +1715,33 @@ def test_quantized_convolution_default_channel_last(self) -> None:
             3,
         )
 
+        self.assertTrue(numerically_equivalent(gm_after_replacement, (x, w, b), True))
+
     def test_no_transpose_if_already_quantized_conv_channel_last(self) -> None:
         # Create a graph with a single quantized conv node.
-        gm = self.create_quantized_convolution_graph_module(channels_last=True)
+        gm, (x, w, b) = self.create_quantized_convolution_graph_module(
+            channels_last=True
+        )
         # Check if graph module is valid by running exportpass on it.
         gm = ExportPass().call(gm).graph_module
         self.assertEqual(
-            count_node(gm, exir_ops.edge.cadence.quantized_conv_nhwc.default), 1
+            count_node(gm, exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor), 1
         )
+        self.assertTrue(numerically_equivalent(gm, (x, w, b), True))
 
         # Apply replacement pass.
         p = ReplaceConvWithChannelLastConvPass()
         gm_after_replacement = p.call(gm).graph_module
         # Check that no replacement was made.
         self.assertEqual(
             count_node(
-                gm_after_replacement, exir_ops.edge.cadence.quantized_conv_nhwc.default
+                gm_after_replacement,
+                exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor,
             ),
             1,
         )
         self.assertEqual(count_node(gm, exir_ops.edge.aten.permute_copy.default), 0)
+        self.assertTrue(numerically_equivalent(gm_after_replacement, (x, w, b), True))
 
 
 class TestMakeSliceAndCatDimOutermostPass(unittest.TestCase):
