Commit be16f21

BoyuanFeng authored and pytorchmergebot committed

[Graph Partition] add symints to get_graph_inputs (pytorch#154679)
During `codegen_inputs`, we check whether there are undefined symbols:
https://github.com/pytorch/pytorch/blob/65b1aedd09e98fcafcdd893ca4924f4fa598fd18/torch/_inductor/codegen/wrapper.py#L1668-L1674

Previously, graph partition inputs did not explicitly include symints:
https://github.com/pytorch/pytorch/blob/65b1aedd09e98fcafcdd893ca4924f4fa598fd18/torch/_inductor/codegen/wrapper.py#L3265-L3272

Instead, we relied on the sizes/strides of each `TensorBox` input to codegen symint inputs. For example, a tensor with shape `[s0, 2]` implicitly codegens `s0` as an input here. This works in most cases, since a backed symint has to come from some tensor shape:
https://github.com/pytorch/pytorch/blob/65b1aedd09e98fcafcdd893ca4924f4fa598fd18/torch/_inductor/codegen/wrapper.py#L1624-L1632

In rare cases it does not work. One example is tensors saved for backward, which may have a shape such as `[2*s0, 2]`. Since `2*s0` is an expression rather than a symbol, `codegen_input_symbol_assignment` never binds `s0`, and `_verify_input_symbol_assignment` later raises an error.

The fix is to add symints to `get_graph_inputs`. An alternative would be to update `codegen_input_symbol_assignment`, but I want to keep the change scoped to graph partition.

Pull Request resolved: pytorch#154679
Approved by: https://github.com/eellison
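
As an illustration of the failure mode, here is a minimal plain-sympy sketch (not Inductor's actual `codegen_input_symbol_assignment`): discovery that only binds bare symbols misses `s0` inside `2*s0`, while `free_symbols` recovers it.

```python
# Minimal sketch (plain sympy, not Inductor code) of why a shape like [2*s0, 2]
# defeats symbol discovery that only looks at bare symbols in tensor sizes.
import sympy

s0 = sympy.Symbol("s0", integer=True, positive=True)

# Sizes of a tensor saved for backward after cat((x, x), dim=0): [2*s0, 2]
sizes = [2 * s0, sympy.Integer(2)]

# Symbol-only discovery: a size is bound only if it *is* a symbol.
bound = {size for size in sizes if isinstance(size, sympy.Symbol)}
print(bound)  # set() -- s0 is never bound, so it later shows up as undefined

# Discovery via free_symbols: digs s0 out of the expression 2*s0.
recovered = set().union(*(size.free_symbols for size in sizes))
print(recovered)  # {s0}
```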
Parent: d3c8f36

4 files changed, +51 −8 lines


test/distributed/tensor/test_dtensor_compile.py (18 additions, 6 deletions)

```diff
@@ -801,12 +801,7 @@ def fn(x):
         out_dt = torch.matmul(tmp_dt, y_dt)
         out_dt.sum().backward()
 
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
-    @skip_if_lt_x_gpu(1)
-    # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
-    @patch.object(torch._inductor.config, "compile_threads", 1)
-    @patch.object(torch._inductor.config, "reorder_for_compute_comm_overlap", True)
-    def test_tp_compile_comm_reordering(self):
+    def _test_tp_compile_comm_reordering(self):
         class FakeAttention(nn.Module):
             def __init__(self) -> None:
                 super().__init__()
@@ -876,6 +871,23 @@ def forward(self, input):
             code
         )
 
+    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @skip_if_lt_x_gpu(1)
+    # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
+    @patch.object(torch._inductor.config, "compile_threads", 1)
+    @patch.object(torch._inductor.config, "reorder_for_compute_comm_overlap", True)
+    def test_tp_compile_comm_reordering(self):
+        self._test_tp_compile_comm_reordering()
+
+    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @skip_if_lt_x_gpu(1)
+    # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
+    @patch.object(torch._inductor.config, "compile_threads", 1)
+    @patch.object(torch._inductor.config, "reorder_for_compute_comm_overlap", True)
+    @torch._inductor.config.patch("graph_partition", True)
+    def test_tp_compile_comm_reordering_graph_partition(self):
+        self._test_tp_compile_comm_reordering()
+
 
 @instantiate_parametrized_tests
 class TestDTensorCompileE2E(DTensorTestBase):
```

test/inductor/test_torchinductor.py (21 additions, 0 deletions)

```diff
@@ -14968,6 +14968,27 @@ def f(x, y):
         compiled_out = f_compiled(x, y)
         self.assertEqual(compiled_out, f(x, y))
 
+    @torch._inductor.config.patch("graph_partition", True)
+    def test_graph_partition_symint_cat_backward(self):
+        def f(x, w):
+            y = torch.cat((x, x), dim=0)
+            z = y @ w
+            return z @ z.T
+
+        compiled_f = torch.compile(f)
+
+        for shape in (2, 3):
+            torch.manual_seed(42)
+            eager_x = torch.randn(shape, 2, device=self.device)
+            eager_w = torch.randn(2, 2, device=self.device, requires_grad=True)
+            torch.manual_seed(42)
+            compiled_x = torch.randn(shape, 2, device=self.device)
+            compiled_w = torch.randn(2, 2, device=self.device, requires_grad=True)
+
+            f(eager_x, eager_w).sum().backward()
+            compiled_f(compiled_x, compiled_w).sum().backward()
+            self.assertEqual(eager_w.grad, compiled_w.grad)
+
     @dynamo_config.patch("capture_dynamic_output_shape_ops", True)
     @config.patch(implicit_fallbacks=True)
     @torch._inductor.config.patch("graph_partition", True)
```

torch/_inductor/codegen/wrapper.py (3 additions, 1 deletion)

```diff
@@ -3282,7 +3282,9 @@ def get_graph_inputs(
         self,
     ) -> dict[str, Union[ir.TensorBox, ir.TorchBindObject, sympy.Expr]]:
         if signature := self.partition_signatures:
-            inputs = signature.input_nodes
+            inputs = signature.input_nodes | {
+                str(s): s for s in signature.symbol_inputs
+            }
         else:
             inputs = V.graph.graph_inputs
         return inputs
```
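
For reference, a hypothetical standalone sketch of the dict-union pattern used above; the string values stand in for `ir.TensorBox` nodes and `symbol_inputs` is a hand-made stand-in for `signature.symbol_inputs`.

```python
# Standalone sketch of merging named tensor inputs with explicit symint inputs.
# The strings stand in for ir.TensorBox nodes; symbol_inputs mimics the ordered
# symbol set produced by the scheduler's graph-partition signature.
import sympy

s0 = sympy.Symbol("s0")

input_nodes = {"primals_1": "<TensorBox [s0, 2]>", "primals_2": "<TensorBox [2, 2]>"}
symbol_inputs = [s0]

# dict union (Python 3.9+): each symint becomes a graph input keyed by its name.
inputs = input_nodes | {str(s): s for s in symbol_inputs}
print(list(inputs))  # ['primals_1', 'primals_2', 's0']
```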

torch/_inductor/scheduler.py (9 additions, 1 deletion)

```diff
@@ -4251,7 +4251,15 @@ def filter_symbols(
             *(get_input_node_symbols(node) for _, node in input_nodes.items())
         )
 
-        return filter_symbols(candidate_symbols)
+        candidate_symbols = filter_symbols(candidate_symbols)
+
+        res: OrderedSet[sympy.Symbol] = OrderedSet()
+        for s in candidate_symbols:
+            symplified_s = V.graph.sizevars.simplify(s)
+            # use free_symbols only when s is simplified to an Integer or expr
+            res.update(symplified_s.free_symbols)
+
+        return OrderedSet(sorted(res, key=operator.attrgetter("name")))
 
     def get_graph_partition_signature(
         self, partitions: list[PartitionType], skip_cudagraphs: list[bool]
```
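
A self-contained sketch of what the new symbol-collection loop computes, with `sympy.simplify` standing in for `V.graph.sizevars.simplify` and a plain `set` plus an explicit sort standing in for the name-sorted `OrderedSet`.

```python
# Self-contained sketch of the symbol-collection step added above.
import operator
import sympy

s0, s1 = sympy.symbols("s0 s1")

# Candidate expressions gathered from partition input sizes/strides.
candidate_symbols = [2 * s0, s1, sympy.Integer(8)]

res: set[sympy.Symbol] = set()
for s in candidate_symbols:
    simplified_s = sympy.simplify(s)
    # free_symbols pulls s0 out of 2*s0 and drops the constant 8 entirely.
    res.update(simplified_s.free_symbols)

# Deterministic, name-sorted order, as in the patch.
print(sorted(res, key=operator.attrgetter("name")))  # [s0, s1]
```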
