
Commit bc5111c

andyanwang authored and pytorchmergebot committed
[Inductor] Prevent kernel fusion with too many unique inputs and outputs (pytorch#166275)
MTIA Triton currently cannot handle kernels with too many input/output buffers. This PR adds a configurable limit so the scheduler prevents large fusions that would produce too many unique input/output buffers.

Differential Revision: [D85509351](https://our.internmc.facebook.com/intern/diff/D85509351/)

Pull Request resolved: pytorch#166275
Approved by: https://github.com/eellison
ghstack dependencies: pytorch#166274
1 parent 398fdd3 commit bc5111c
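
For context, a minimal sketch of how the new limit could be enabled from user code; the threshold value (8) and the toy function are illustrative only and not part of this PR:

import torch
import torch._inductor.config as inductor_config

# The new option defaults to None, which disables the check entirely.
# Any integer threshold turns it on for all scheduler fusion decisions.
inductor_config.max_fusion_unique_io_buffers = 8

def f(a, b, c):
    # A small pointwise graph; with the limit set, the scheduler refuses to
    # fuse kernels whose unique input/output buffers would exceed 8.
    return (a + b) * c + c.sin()

compiled = torch.compile(f)
out = compiled(torch.randn(32), torch.randn(32), torch.randn(32))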

File tree

test/inductor/test_inductor_scheduler.py
torch/_inductor/choices.py
torch/_inductor/config.py
torch/_inductor/scheduler.py

4 files changed: +140 −0 lines changed

test/inductor/test_inductor_scheduler.py

Lines changed: 77 additions & 0 deletions
@@ -1,11 +1,14 @@
 # Owner(s): ["module: inductor"]

 from unittest import skipIf
+from unittest.mock import Mock

 import torch
 import torch._inductor.metrics as metrics
 import torch.utils.flop_counter
 from torch._dynamo.utils import counters
+from torch._inductor.dependencies import Dep, ReadWrites
+from torch._inductor.scheduler import BaseSchedulerNode, Scheduler
 from torch._inductor.utils import fresh_inductor_cache
 from torch.testing._internal.common_cuda import SM70OrLater
 from torch.testing._internal.common_device_type import (
@@ -15,6 +18,7 @@
 )
 from torch.testing._internal.common_utils import parametrize, run_tests, TestCase
 from torch.testing._internal.inductor_utils import IS_BIG_GPU
+from torch.utils._ordered_set import OrderedSet


 def FlopCounterMode(*args, **kwargs):
@@ -132,6 +136,79 @@ def test_flop_counter_op(self, device, dtype, options):
         counters["inductor"]["flop_count"] = 0
         torch._logging.set_logs()

+    def test_fusion_prevent_too_many_reads_and_writes_prevents_fusion(self):
+        """Test that fusion is prevented when unique I/O buffers exceed threshold"""
+        # Setup: Create nodes with many unique I/O buffers
+        # node1: reads [A, B, C], writes [D]
+        # node2: reads [D, E, F], writes [G]
+        # D becomes internal (node2 reads node1's write)
+        # After fusion: unique I/O = {A, B, C, E, F, G} = 6 buffers
+        scheduler = Mock(spec=Scheduler)
+        scheduler.can_buffer_be_removed_through_fusion = Mock(return_value=False)
+
+        node1 = self._create_mock_node(
+            name="node1", reads=["A", "B", "C"], writes=["D"]
+        )
+        node2 = self._create_mock_node(
+            name="node2", reads=["D", "E", "F"], writes=["G"]
+        )
+
+        # Execute: Check with threshold of 5 (should prevent fusion since 6 > 5)
+        result = Scheduler.fusion_prevent_too_many_reads_and_writes(
+            scheduler, node1, node2, threshold=5
+        )
+
+        # Assert: Fusion should be prevented (6 unique buffers > 5 threshold)
+        self.assertTrue(result)
+
+    def test_fusion_prevent_too_many_reads_and_writes_allows_fusion(self):
+        """Test that fusion is allowed when intermediate buffers are removed"""
+        # Setup: Create nodes where node2 reads node1's output
+        # node1: reads [A, B], writes [C]
+        # node2: reads [C, D], writes [E]
+        # C becomes internal (node2 reads node1's write)
+        # After fusion: unique I/O = {A, B, D, E} = 4 buffers
+        scheduler = Mock(spec=Scheduler)
+        scheduler.can_buffer_be_removed_through_fusion = Mock(return_value=False)
+
+        node1 = self._create_mock_node(name="node1", reads=["A", "B"], writes=["C"])
+        node2 = self._create_mock_node(name="node2", reads=["C", "D"], writes=["E"])
+
+        # Execute: Check with threshold of 5 (should allow fusion since 4 <= 5)
+        result = Scheduler.fusion_prevent_too_many_reads_and_writes(
+            scheduler, node1, node2, threshold=5
+        )
+
+        # Assert: Fusion should be allowed (4 unique buffers <= 5 threshold)
+        self.assertFalse(result)
+
+    def _create_mock_node(self, name: str, reads: list[str], writes: list[str]) -> Mock:
+        """Helper method to create a mock scheduler node with specified reads/writes"""
+        node = Mock(spec=BaseSchedulerNode)
+        node.get_name = Mock(return_value=name)
+        node.get_nodes = Mock(return_value=[node])
+
+        # Create mock Dep objects for reads and writes
+        read_deps = OrderedSet()
+        for read_name in reads:
+            dep = Mock(spec=Dep)
+            dep.name = read_name
+            read_deps.add(dep)
+
+        write_deps = OrderedSet()
+        for write_name in writes:
+            dep = Mock(spec=Dep)
+            dep.name = write_name
+            write_deps.add(dep)
+
+        # Create mock ReadWrites object
+        read_writes = Mock(spec=ReadWrites)
+        read_writes.reads = read_deps
+        read_writes.writes = write_deps
+
+        node.read_writes = read_writes
+        return node
+

 instantiate_device_type_tests(TestScheduler, globals())
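The new tests drive the scheduler entirely through mocks. One way to run just them locally (assuming a PyTorch source checkout) might be:

python test/inductor/test_inductor_scheduler.py -k fusion_prevent_too_many_reads_and_writes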

torch/_inductor/choices.py

Lines changed: 11 additions & 0 deletions
@@ -530,6 +530,17 @@ def can_fuse(
            WhyNoFuse(node1, node2)("Fusion will increase peak memory")
            return False

+        if (
+            config.max_fusion_unique_io_buffers is not None
+            and scheduler.fusion_prevent_too_many_reads_and_writes(
+                node1,
+                node2,
+                config.max_fusion_unique_io_buffers,
+            )
+        ):
+            WhyNoFuse(node1, node2)("fusion_prevent_too_many_reads_and_writes")
+            return False
+
        return True

    @staticmethod
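When this guard fires, the WhyNoFuse message above records the rejection reason. If memory serves, those messages go through Inductor's "fusion" log artifact, so something like the following should surface them (the script name is a placeholder):

TORCH_LOGS="fusion" python my_script.py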

torch/_inductor/config.py

Lines changed: 4 additions & 0 deletions
@@ -688,6 +688,10 @@ def use_autoheuristic(name: str) -> bool:
# how many nodes to attempt pairwise fusion with in a buffer group
max_fusion_buffer_group_pairwise_attempts = 64

+# maximum number of unique input/output buffers allowed in fused kernels.
+# The check is disabled if set to None.
+max_fusion_unique_io_buffers: Optional[int] = None
+
# max number of inputs to generate cat as a pointwise op with masked loads
max_pointwise_cat_inputs = 8

torch/_inductor/scheduler.py

Lines changed: 48 additions & 0 deletions
@@ -4113,6 +4113,54 @@ def _find_single_user_inputs(
                return True
        return False

+    def fusion_prevent_too_many_reads_and_writes(
+        self, node1: BaseSchedulerNode, node2: BaseSchedulerNode, threshold: int
+    ) -> bool:
+        # After fusion, we need to calculate the unique I/O buffers
+        # accounting for buffers that become internal (removed through fusion)
+
+        # Get all nodes that will be in the fused node
+        fused_node_names = OrderedSet(
+            [node.get_name() for node in node1.get_nodes()]
+            + [node.get_name() for node in node2.get_nodes()]
+        )
+
+        # Calculate node2 reads that can be removed through fusion,
+        # i.e. node2 reads that are outputs of node1
+        node1_write_names = OrderedSet(dep.name for dep in node1.read_writes.writes)
+        node2_read_names = OrderedSet(dep.name for dep in node2.read_writes.reads)
+        reads_removed_through_fusion = node2_read_names & node1_write_names
+
+        # Calculate node1 writes that can be removed through fusion,
+        # i.e. node1 writes that are only read by node2
+        writes_removed_through_fusion: OrderedSet[str] = OrderedSet()
+        for write_dep in node1.read_writes.writes:
+            if self.can_buffer_be_removed_through_fusion(
+                write_dep.name, fused_node_names
+            ):
+                writes_removed_through_fusion.add(write_dep.name)
+
+        # Get all unique reads (union of both nodes' reads)
+        all_read_names = OrderedSet(
+            dep.name for dep in node1.read_writes.reads
+        ) | OrderedSet(dep.name for dep in node2.read_writes.reads)
+
+        # Get all unique writes (union of both nodes' writes)
+        all_write_names = OrderedSet(
+            dep.name for dep in node1.read_writes.writes
+        ) | OrderedSet(dep.name for dep in node2.read_writes.writes)
+
+        # Remove reads that become internal
+        unique_reads = all_read_names - reads_removed_through_fusion
+
+        # Remove writes that become internal
+        unique_writes = all_write_names - writes_removed_through_fusion
+
+        # Get all unique buffer names (reads and writes combined, but no double counting)
+        unique_io_buffers = unique_reads | unique_writes
+
+        return len(unique_io_buffers) > threshold
+
    def are_long_distant_nodes(
        self, node1: BaseSchedulerNode, node2: BaseSchedulerNode
    ) -> bool:
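To make the bookkeeping above concrete, here is a small standalone sketch using plain Python sets. It assumes the simplest case, where every node1 output consumed by node2 becomes internal to the fused kernel (the real method defers that decision per buffer to can_buffer_be_removed_through_fusion); the buffer names mirror the unit tests added in this PR:

def fused_unique_io(node1_reads, node1_writes, node2_reads, node2_writes):
    # node1 outputs that node2 consumes become internal to the fused kernel
    internal = set(node1_writes) & set(node2_reads)
    reads = (set(node1_reads) | set(node2_reads)) - internal
    writes = (set(node1_writes) - internal) | set(node2_writes)
    return reads | writes

# node1: reads {A, B, C}, writes {D}; node2: reads {D, E, F}, writes {G}
# D is internal, leaving 6 unique I/O buffers -> fusion prevented at threshold 5
assert len(fused_unique_io({"A", "B", "C"}, {"D"}, {"D", "E", "F"}, {"G"})) == 6

# node1: reads {A, B}, writes {C}; node2: reads {C, D}, writes {E}
# C is internal, leaving 4 unique I/O buffers -> fusion allowed at threshold 5
assert len(fused_unique_io({"A", "B"}, {"C"}, {"C", "D"}, {"E"})) == 4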
