[Minimizer] Gracefully exit when there is no discrepancy in block mode (pytorch#154076)

AutinMitra · pytorchmergebot · commit 5623d3022840 · 2025-05-23T06:42:07.000Z
Summary: Previously, when there was no discrepancy in results in block mode, net_min_base would throw an out-of-bounds (OOB) error. This occurred because _block_traverse_impl returned an out-of-bounds index after exhausting subgraphs all the way down to a single node. There was also an issue where we could get an unsound subgraph (i.e. mark an earlier node as the "end" even if the correct end is later). This was due to an incorrect check (start_idx == mid), which can be true while two values are still left, so the program returned prematurely. Test Plan: Buck UI: https://www.internalfb.com/buck2/52524c26-ace5-4593-8a4b-843a54eb206a Test UI: https://www.internalfb.com/intern/testinfra/testrun/3096224973363310 Network: Up: 0B Down: 15MiB (reSessionID-cd404e97-395f-49fc-8381-373e90a1378f) Executing actions. Remaining 0/1 Command: test. Time elapsed: 53.7s Tests finished: Pass 7. Fail 0. Fatal 0. Skip 0. Build failure 0 Differential Revision: D75143242 Pull Request resolved: pytorch#154076 Approved by: https://github.com/jfix71
diff --git a/test/fx/test_net_min_base.py b/test/fx/test_net_min_base.py
@@ -0,0 +1,102 @@
+# Owner(s): ["module: fx"]
+
+from unittest import mock
+
+import torch
+from torch.fx.passes.net_min_base import (
+    _MinimizerBase,
+    _MinimizerSettingBase,
+    FxNetMinimizerResultMismatchError,
+)
+from torch.fx.passes.tools_common import Names
+from torch.testing._internal.common_utils import TestCase
+
+
+class TestNetMinBaseBlock(TestCase):
+    def setUp(self) -> None:
+        # Build a fresh traced module and a block-mode minimizer for each test method
+
+        class SimpleModule(torch.nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.linear = torch.nn.Linear(10, 5)
+                self.linear2 = torch.nn.Linear(5, 5)
+                self.relu = torch.nn.ReLU()
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                x = self.linear(x)
+                x = self.linear2(x)
+                x = self.relu(x)
+                return x
+
+        self.compare_fn = mock.MagicMock()
+
+        self.module = torch.fx.symbolic_trace(SimpleModule())
+        self.sample_input = (torch.randn(2, 10),)
+        self.settings = _MinimizerSettingBase(traverse_method="block")
+        self.minimizer = _MinimizerBase(
+            module=self.module,
+            sample_input=self.sample_input,
+            settings=self.settings,
+            compare_fn=self.compare_fn,
+        )
+        self.report = []
+
+    def assert_problematic_nodes(self, culprit_names: Names) -> None:
+        """
+        Quick helper function to assert that a set of nodes (when present together in a subgraph) cause a discrepancy
+        """
+        with mock.patch("torch.fx.passes.net_min_base._MinimizerBase._run_and_compare"):
+
+            def run_and_compare_side_effect(
+                split_module: torch.fx.GraphModule,
+                submod_name: str,
+                output_names: Names,
+                report_idx: int = -1,
+            ) -> None:
+                submodule = getattr(split_module, submod_name)
+
+                # Drop the first and last graph nodes (presumably the placeholder and output nodes)
+                names = set([node.name for node in submodule.graph.nodes][1:-1])
+                if set(culprit_names) <= names:
+                    raise FxNetMinimizerResultMismatchError
+
+            self.minimizer._run_and_compare.side_effect = run_and_compare_side_effect
+
+            # minimize() should report exactly the nodes named in culprit_names
+            culprits = self.minimizer.minimize()
+            self.assertEqual({node.name for node in culprits}, set(culprit_names))
+
+    def test_no_discrepancy(self) -> None:
+        # With no discrepancy, minimize() should return an empty set gracefully
+        with (
+            mock.patch("torch.fx.passes.net_min_base._MinimizerBase.run_a"),
+            mock.patch("torch.fx.passes.net_min_base._MinimizerBase.run_b"),
+        ):
+            # Have both run_a and run_b return the same result
+            return_value = torch.zeros((2, 5))
+            self.minimizer.run_a.return_value = return_value
+            self.minimizer.run_b.return_value = return_value
+            self.compare_fn.return_value = (0, True)
+
+            # There should be no discrepancy between the two, and thus we should receive an empty set
+            culprits = self.minimizer.minimize()
+            self.assertEqual(culprits, set())
+
+    def test_all_nodes_discrepancy(self) -> None:
+        self.assert_problematic_nodes(["linear", "linear2", "relu"])
+
+    def test_first_node_discrepancy(self) -> None:
+        self.assert_problematic_nodes(["linear"])
+
+    def test_last_node_discrepancy(self) -> None:
+        self.assert_problematic_nodes(["relu"])
+
+    def test_middle_node_discrepancy(self) -> None:
+        self.assert_problematic_nodes(["linear2"])
+
+    def test_contiguous_partial_discrepancy_end(self) -> None:
+        self.assert_problematic_nodes(["linear2", "relu"])
+
+    def test_continugous_partial_discrepancy_beginning(self) -> None:
+        self.assert_problematic_nodes(["linear", "linear2"])
diff --git a/torch/fx/passes/net_min_base.py b/torch/fx/passes/net_min_base.py
@@ -1,7 +1,7 @@
 # mypy: allow-untyped-defs
 import logging
 from dataclasses import dataclass
-from typing import Any, Callable, Optional
+from typing import Any, Callable, cast, Optional
 
 import torch
 import torch.fx
@@ -539,7 +539,7 @@ def _sequential_traverse(self, nodes: NodeList) -> NodeSet:
 
     def _block_traverse_impl(
         self, nodes: NodeList, start_idx: int, end_idx: int, find_last_node: bool
-    ) -> int:
+    ) -> Optional[int]:
         """
         Recursive block search implementation.
         find_last_node: If True, search for the last node which result in numerics difference
@@ -588,7 +588,7 @@ def _block_traverse_impl(
                 f"Culprits found from node {first_node_name} to {last_node_name}."
             )
 
-            if start_idx == mid:
+            if start_idx == mid == end_idx:
                 report.extend(
                     [
                         "This is the last node in the sub-module. ",
@@ -616,16 +616,19 @@ def _block_traverse_impl(
                 f"Culprits not found from node start to {mid}:{nodes[mid].name}."
             )
 
-            if start_idx == mid:
-                report.extend(
-                    [
-                        "This is the last node in the sub-module. ",
-                        "Search in the current branch is successful with node",
-                        f"{start_idx}, node name: {nodes[start_idx].name}.",
-                    ]
-                )
-                self.print_report(report)
-                return start_idx + 1 if find_last_node else start_idx - 1
+            if start_idx == mid == end_idx:
+                # We did not find anything if the pointers have not moved
+                if (start_idx == 0 and not find_last_node) or (
+                    start_idx == len(nodes) - 1 and find_last_node
+                ):
+                    report.append(
+                        f"At {'last' if find_last_node else 'first'} node, no culprits found."
+                    )
+                    self.print_report(report)
+                    return None
+
+                # Otherwise, we have converged on the border between discrepancy and valid
+                return start_idx + (1 if find_last_node else -1)
 
             report.append(
                 "Proceed to split and lower the halves of the current "
@@ -661,39 +664,59 @@ def _block_traverse(
 
         start_idx = 0
         end_idx = len(nodes) - 1
+
+        final_start_idx: Optional[int] = start_idx
+        final_end_idx: Optional[int] = end_idx
+
         run_both = True if find_last_node is None else False
 
         # step 1: find (0, end_idx) of culprit block
         if run_both or find_last_node:
             last_node_report.append("Start searching for last node in culprit")
             self.print_report(last_node_report)
-            end_idx = self._block_traverse_impl(nodes, start_idx, end_idx, True)
+            final_end_idx = self._block_traverse_impl(nodes, start_idx, end_idx, True)
+
+            if final_end_idx is None:
+                last_node_report.append("No culprits found")
+                self.print_report(last_node_report)
+                return culprits
+
             last_node_report.extend(
-                ["Finish Pass 1", f"Find end_idx = {end_idx}:{nodes[end_idx].name}"]
+                [
+                    "Finish Pass 1",
+                    f"Find end_idx = {final_end_idx}:{nodes[final_end_idx].name}",
+                ]
             )
             self.print_report(last_node_report)
 
         # step 2: reduce culprit block to (start_idx, end_idx)
         if run_both or not find_last_node:
             first_node_report = ["Start searching for first node in culprit"]
             self.print_report(first_node_report)
-            start_idx = self._block_traverse_impl(
-                nodes[0 : end_idx + 1], start_idx, end_idx, False
+            final_start_idx = self._block_traverse_impl(
+                nodes[0 : end_idx + 1], start_idx, final_end_idx or end_idx, False
             )
+
+            if final_start_idx is None:
+                last_node_report.append("No culprits found")
+                self.print_report(last_node_report)
+                return culprits
+
             first_node_report.append("*" * 50)
             self.reports.append(first_node_report)
             first_node_report.extend(
                 [
                     "Finish Pass 2",
-                    f"Find start_idx = {start_idx}:{nodes[start_idx].name}",
+                    f"Find start_idx = {final_start_idx}:{nodes[final_start_idx].name}",
                 ]
             )
             self.print_report(first_node_report)
 
-        # step 3: form module with minimum culprits
-        culprits.update(nodes[start_idx : end_idx + 1])
+        # step 3: form module with minimum culprits. These indexes are guaranteed to exist
+        range_start, range_end = cast(int, final_start_idx), cast(int, final_end_idx)
+        culprits.update(nodes[range_start : range_end + 1])
         result_report = [
-            f"Finish searching, found minimum block ({nodes[start_idx]},{nodes[end_idx]})"
+            f"Finish searching, found minimum block ({nodes[range_start]},{nodes[range_end]})"
         ]
         self.reports.append(result_report)
         self.print_report(result_report)