Commit ba86395

[Graph Partition] fix graph partition input signature for fallback kernels (pytorch#166985)
[Graph Partition] fix graph partition input signature for fallback kernels (pytorch#165815)

The scheduler relies on `node.last_usage` to free buffers. `last_usage` may contain a buffer that is allocated in a previous graph partition AND not directly accessed in the current graph partition.

## Example

```python
def f(x):
    y = x + 1
    z = torch.ops.aten.view.dtype(y, torch.float8_e4m3fn)
    z_cpu = z.cpu()
    u_cuda = z_cpu.cuda()
    return u_cuda
```

In the generated code, we have:

```
def partition_0(args):
    ...
    # Topologically Sorted Source Nodes: [y, z], Original ATen: [aten.add, aten.view]
    buf1 = torch.ops.aten.view.dtype(buf0, torch.float8_e4m3fn)  # <------ buf1 is a view of buf0
    buf2 = buf1  # <------- buf2 is buf1
    assert_size_stride(buf2, (8, ), (1, ), 'torch.ops.aten.view.dtype')
    assert_alignment(buf2, 16, 'torch.ops.aten.view.dtype')
    return (buf2, )

def call(self, args):
    ...
    (buf2,) = self.partitions[0](partition0_args)
    ...
    buf3.copy_(buf2, False)
    del buf0
    del buf1
    del buf2  # <---- `del buf2` leads to `del buf0`. BUT `buf0` is not returned from partition_0.
    ...
```

Note: `view` is treated as a fallback kernel due to its special dtype:
https://github.com/pytorch/pytorch/blob/de09bab4b66002a8a9a2195f50f96a78868a3d39/torch/_inductor/lowering.py#L841-L843

## Fix

This PR fixes the issue by also returning these buffers so they can be freed later.

Pull Request resolved: pytorch#165815
Approved by: https://github.com/eellison

(cherry picked from commit 1891239)

Co-authored-by: Boyuan Feng <[email protected]>
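For intuition, here is a minimal standalone sketch (illustrative only; it reuses the `buf0`/`buf1`/`buf2` names from the generated code above but is not actual Inductor output) of why deleting `buf2` is what ultimately releases `buf0`'s allocation: `aten.view.dtype` produces an alias, so the storage stays alive until the last alias is dropped.

```python
import torch

buf0 = torch.ones(8, dtype=torch.int32)
# aten.view.dtype reinterprets the same bytes: buf1 aliases buf0's storage.
buf1 = torch.ops.aten.view.dtype(buf0, torch.float8_e4m3fn)
buf2 = buf1  # buf2 is just another alias of the same storage

assert buf2.untyped_storage().data_ptr() == buf0.untyped_storage().data_ptr()

del buf0, buf1
# The allocation is still alive because buf2 references it, so whoever holds
# buf2 must also be able to release the original buf0 storage; this is why
# the partition has to expose that buffer instead of keeping it internal.
print(buf2.untyped_storage().nbytes())  # 32 bytes (8 * int32) still allocated
```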
1 parent f190bda · commit ba86395

2 files changed: +26 -0 lines changed

test/inductor/test_cudagraph_trees.py

Lines changed: 16 additions & 0 deletions
```diff
@@ -2805,6 +2805,22 @@ def f(x, y):
         # 2 graph partitions lead to 2 cudagraph
         self.assertEqual(self.get_manager().new_graph_id().id, 2)
 
+    def test_graph_partition_view_fallback(self):
+        def f(x):
+            y = x + 1
+            z = torch.ops.aten.view.dtype(y, torch.float8_e4m3fn)
+            z_cpu = z.cpu()
+            u_cuda = z_cpu.cuda()
+            return u_cuda
+
+        compiled_f = torch.compile(f, mode="reduce-overhead")
+
+        for _ in range(3):
+            x = torch.ones(2, dtype=torch.int32, device="cuda")
+            eager_out = f(x)
+            compiled_out = compiled_f(x)
+            self.assertEqual(eager_out, compiled_out)
+
     @torch._inductor.config.patch("graph_partition", True)
     def test_graph_partition_log_message(self):
         def foo(x, y):
```

torch/_inductor/scheduler.py

Lines changed: 10 additions & 0 deletions
```diff
@@ -4926,6 +4926,16 @@ def is_none_layout(buf_name: str) -> bool:
         for node in partition:
             buffer_names_to_free.update(node.last_usage)
 
+        # buffer_names_to_free may contain buffers allocated in previous
+        # graph partitions. These buffers should also be a partition
+        # input.
+        extra_input_names = [
+            name
+            for name in (buffer_names_to_free - output_names)
+            if name in name_to_node
+        ]
+        partition_input_names.update(extra_input_names)
+
         input_nodes = {
             name: name_to_node[name]
             for name in partition_input_names
```
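As a rough illustration of the set arithmetic in the hunk above, with hypothetical buffer names (the real sets are built by the scheduler from `node.last_usage`, the partition outputs, and the known buffer nodes):

```python
# Hypothetical buffer names, mirroring the example in the commit message.
buffer_names_to_free = {"buf0", "buf2"}  # buffers whose last use is in this partition
output_names = {"buf2"}                  # buffers this partition already returns
name_to_node = {"buf0": "add_node", "buf2": "view_node"}  # buffers with a producing node

# A buffer scheduled to be freed here that is neither an output nor unknown
# must be threaded through as an extra partition input so it can be deleted.
extra_input_names = [
    name
    for name in (buffer_names_to_free - output_names)
    if name in name_to_node
]

partition_input_names = {"arg0_1"}  # hypothetical existing inputs
partition_input_names.update(extra_input_names)
print(sorted(partition_input_names))  # ['arg0_1', 'buf0']
```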
