Skip to content

Commit 34bb9c4

Browse files
sevenEng authored and pytorchmergebot committed
[AOTI] Fix unknown constant type for device-moved constants (pytorch#168138)
### Issue When we have the flag `use_runtime_constant_folding=False`, if we move a constant (buffer or parameter) to a different device, we'll generate a new buf/param during compilation time with a new name where the new device (+counter) will be appended, e.g.: ``` # normalised name orig buf: model_x_submodule_y_buf0_name moved buf: model_x_submodule_y_buf0_name_cpu0 ``` However, these new names are not registered in `V.graph.constants`. During cpp wrapper code generation, they won't be recognised, hence will get the `ConstantType::Unknown`. It'll cause issues for model loading during runtime. https://github.com/pytorch/pytorch/blob/b8a3165d28b672ac6d84128e66265bf471b92a55/torch/_inductor/codegen/cpp_wrapper_cpu.py#L851-L862 ### Fix After we do the new const name allocation following device movement, check if the original constant is any recognised buffer or parameter; if so, register the new ones with the graph as well. ### Failed Unittest before the patch ``` =========================================================================== short test summary info ============================================================================ FAILED [3.9054s] test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleCpu::test_device_moved_constant_cpu - RuntimeError: Expected to not find "torch::aot_inductor::ConstantType::Unknown" but found it FAILED [3.1852s] test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleGpu::test_device_moved_constant_cuda - RuntimeError: Expected to not find "torch::aot_inductor::ConstantType::Unknown" but found it ================================================================ 2 failed, 1 skipped, 916 deselected in 11.81s ================================================================= ``` cc. @muchulee8 @desertfire Pull Request resolved: pytorch#168138 Approved by: https://github.com/muchulee8
1 parent 7a064ed commit 34bb9c4

File tree

2 files changed

+68
-1
lines changed

2 files changed

+68
-1
lines changed

test/inductor/test_aot_inductor.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -671,6 +671,49 @@ def forward(self, x):
671671
code
672672
)
673673

674+
@requires_gpu
675+
def test_device_moved_constant(self):
676+
# testing both directions
677+
device_movements = [
678+
(torch.device(type=GPU_TYPE, index=0), torch.device("cpu")),
679+
(torch.device("cpu"), torch.device(type=GPU_TYPE, index=0)),
680+
]
681+
682+
class Model(torch.nn.Module):
683+
def __init__(self, from_device):
684+
super().__init__()
685+
self.register_buffer("_buf", torch.randn(6, 7, device=from_device))
686+
self._param = torch.nn.Parameter(
687+
torch.rand(6, 7, device=from_device), requires_grad=False
688+
)
689+
690+
def forward(self, x):
691+
to_device = x.device
692+
moved_buf = self._buf.to(to_device)
693+
moved_param = self._param.to(to_device)
694+
return moved_buf, moved_param
695+
696+
with config.patch(
697+
{
698+
"aot_inductor.use_runtime_constant_folding": False,
699+
}
700+
):
701+
for from_device, to_device in device_movements:
702+
model = Model(from_device)
703+
example_inputs = (torch.randn(6, 7, device=to_device),)
704+
_, code = run_and_get_cpp_code(
705+
AOTIRunnerUtil.compile, model, example_inputs
706+
)
707+
FileCheck().check_not("torch::aot_inductor::ConstantType::Unknown").run(
708+
code
709+
)
710+
FileCheck().check_count(
711+
"torch::aot_inductor::ConstantType::Buffer", 2, exactly=True
712+
).run(code)
713+
FileCheck().check_count(
714+
"torch::aot_inductor::ConstantType::Parameter", 2, exactly=True
715+
).run(code)
716+
674717
def test_subclasses(self):
675718
device_to_init = self.device
676719

torch/_inductor/graph.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1114,11 +1114,35 @@ def constant_name(self, name: str, device_override: Optional[torch.device]) -> s
11141114
with torch.utils._python_dispatch._disable_current_modes():
11151115
# caller might have OrderedSet fake tensor mode which will create a fake tensor
11161116
# when calling .to, so unset modes here
1117-
return self.allocate_non_dup_const_name(
1117+
non_dup_const_name = self.allocate_non_dup_const_name(
11181118
f"{name}_{device_override.type}{device_override.index or 0}",
11191119
self.constants[name].to(device_override),
11201120
)
11211121

1122+
assert non_dup_const_name in self.constants, (
1123+
f"{non_dup_const_name} should be in V.graph.constants already"
1124+
)
1125+
1126+
# register device-copied buffers and parameters to graph as well
1127+
# to codegen correct torch::aot_inductor::ConstantType for them rather than `Unknown`
1128+
if any(
1129+
name == normalize_name(buffer_name)
1130+
for buffer_name in self.named_buffers
1131+
):
1132+
self.named_buffers[non_dup_const_name] = self.constants[
1133+
non_dup_const_name
1134+
]
1135+
1136+
if any(
1137+
name == normalize_name(param_name)
1138+
for param_name in self.named_parameters
1139+
):
1140+
self.named_parameters[non_dup_const_name] = self.constants[
1141+
non_dup_const_name
1142+
]
1143+
1144+
return non_dup_const_name
1145+
11221146
# pyrefly: ignore [bad-override]
11231147
def placeholder(
11241148
self,

0 commit comments

Comments (0)