
Commit b657061

zhxchen17 authored and pytorchmergebot committed
[precompile] Integrate AOTI as a backend. (pytorch#167338)
Fixes #ISSUE_NUMBER
Pull Request resolved: pytorch#167338
Approved by: https://github.com/jamesjwu
1 parent 226850c commit b657061

File tree

8 files changed: +225 -11 lines changed

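In short, this PR teaches torch.compile to use AOTInductor as a precompile backend. A minimal end-to-end sketch, distilled from the new tests below (CUDA assumed; the save path is illustrative):

import torch

def fn(x, y):
    return x + y

inputs = (torch.randn(3, 4, device="cuda"), torch.randn(3, 4, device="cuda"))

# "use_aoti" is the option added by this PR; it swaps the Inductor backend
# wrapper for the new AOTInductor one.
compiled_fn = torch.compile(
    fn, fullgraph=True, options={"use_aoti": True}
).aot_compile((inputs, {}))

# The result round-trips through a file like any other precompile artifact.
compiled_fn.save_compiled_function("/tmp/fn.pt")
with open("/tmp/fn.pt", "rb") as f:
    compiled_fn = torch.compiler.load_compiled_function(f)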

test/dynamo/test_aot_compile.py

Lines changed: 93 additions & 2 deletions
@@ -1,9 +1,11 @@
 # Owner(s): ["module: dynamo"]
 
+import copy
 import functools
 import inspect
 import os
 import pickle
+import unittest
 from contextlib import contextmanager
 from unittest.mock import patch
 
@@ -13,13 +15,16 @@
 import torch._inductor.test_case
 import torch.onnx.operators
 import torch.utils.cpp_extension
-from torch._dynamo.aot_compile import ModelInput, SerializableCallable
+from torch._dynamo.aot_compile import AOTCompiledModel, ModelInput, SerializableCallable
 from torch._dynamo.exc import PackageError, Unsupported
 from torch._dynamo.package import DynamoCache
 from torch._dynamo.precompile_context import PrecompileContext
 from torch._inductor.runtime.runtime_utils import cache_dir
 from torch.fx._graph_pickler import GraphPickler
-from torch.testing._internal.common_utils import instantiate_parametrized_tests
+from torch.testing._internal.common_utils import (
+    instantiate_parametrized_tests,
+    TEST_CUDA,
+)
 
 
 MY_LAMBDA = lambda x: x + 1  # noqa: E731
@@ -599,6 +604,92 @@ def fn(x, y=1):
         actual = compiled_fn(*inputs)
         self.assertEqual(expected, actual)
 
+    @unittest.skipIf(not TEST_CUDA, "requires cuda")
+    def test_aot_compile_with_aoti(self):
+        with torch.device("cuda"):
+            from torch._dynamo.hooks import Hooks
+
+            def fn(x, y):
+                return x + y
+
+            def make_inputs():
+                return (torch.randn(3, 4), torch.randn(3, 4))
+
+            compiled_fn = torch._dynamo.aot_compile.aot_compile_fullgraph(
+                fn,
+                (make_inputs(), {}),
+                Hooks(),
+                torch._TorchCompileAOTInductorWrapper(None, None, None),
+            )
+
+            test_inputs = make_inputs()
+            expected = fn(*test_inputs)
+            actual = compiled_fn(*test_inputs)
+            self.assertEqual(expected, actual)
+            compiled_fn.save_compiled_function(self.path())
+            with open(self.path(), "rb") as f:
+                compiled_fn = torch.compiler.load_compiled_function(f)
+            actual = compiled_fn(*test_inputs)
+            self.assertEqual(expected, actual)
+
+    @unittest.skipIf(not TEST_CUDA, "requires cuda")
+    def test_aot_compile_with_aoti_module(self):
+        with torch.device("cuda"):
+            from torch._dynamo.hooks import Hooks
+
+            mod = SimpleLinearModule()
+
+            def make_inputs():
+                return (torch.randn(4, 3),)
+
+            compiled_mod = torch._dynamo.aot_compile.aot_compile_module(
+                mod,
+                [ModelInput(make_inputs(), {}, [])],
+                Hooks(),
+                torch._TorchCompileAOTInductorWrapper(None, None, None),
+            )
+
+            def get_grads(m: torch.nn.Module):
+                return {name: p.grad for name, p in m.named_parameters()}
+
+            original_mod = copy.deepcopy(mod)
+            test_inputs = make_inputs()
+            expected = mod(*test_inputs)
+            expected.sum().backward()
+            expected_grads = get_grads(mod)
+
+            actual = compiled_mod(*test_inputs)
+            self.assertEqual(expected, actual)
+            serialized = compiled_mod.serialize()
+            compiled_fn = AOTCompiledModel.deserialize(original_mod, serialized)
+            actual = compiled_fn(*test_inputs)
+            actual.sum().backward()
+            self.assertEqual(get_grads(original_mod), expected_grads)
+
+    @unittest.skipIf(not TEST_CUDA, "requires cuda")
+    def test_aot_compile_with_aoti_torch_compile(self):
+        with torch.device("cuda"):
+
+            def fn(x, y):
+                return x + y
+
+            def make_inputs():
+                return (torch.randn(3, 4), torch.randn(3, 4))
+
+            compiled_fn = torch.compile(
+                fn, fullgraph=True, options={"use_aoti": True}
+            ).aot_compile((make_inputs(), {}))
+            test_inputs = make_inputs()
+            expected = fn(*test_inputs)
+            actual = compiled_fn(*test_inputs)
+            self.assertEqual(expected, actual)
+            compiled_fn.save_compiled_function(self.path())
+            with open(self.path(), "rb") as f:
+                compiled_fn = torch.compiler.load_compiled_function(f)
+            actual = compiled_fn(*test_inputs)
+            self.assertEqual(compiled_fn._artifacts.backend_name, "aotinductor")
+            self.assertEqual(expected, actual)
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests

torch/__init__.py

Lines changed: 35 additions & 1 deletion
@@ -2439,6 +2439,35 @@ def reset(self):
         reset_cudagraph_trees()
 
 
+class _TorchCompileAOTInductorWrapper(_TorchCompileInductorWrapper):
+    compiler_name = "aotinductor"
+
+    def __init__(self, mode, options, dynamic):
+        super().__init__(mode, options, dynamic)
+        self.apply_options({"cpp_wrapper": True})
+        self.apply_options({"aot_inductor.package": True})
+
+    def __call__(self, model_, inputs_):
+        from contextlib import nullcontext
+        from unittest import mock
+
+        from torch._guards import detect_fake_mode
+        from torch._inductor.virtualized import V
+
+        fake_mode = detect_fake_mode(inputs_)
+        ctx = (
+            mock.patch.object(fake_mode, "allow_non_fake_inputs", True)
+            if fake_mode
+            else nullcontext()
+        )
+        with (
+            V.set_aot_compilation(True),
+            ctx,
+            torch._inductor.config.patch("enable_autograd_for_aot", True),
+        ):
+            return super().__call__(model_, inputs_)
+
+
 class _TorchCompileWrapper:
     def __init__(self, backend, mode, options, dynamic):
         from torch._dynamo.backends.registry import lookup_backend
@@ -2672,8 +2701,10 @@ def fn(model: _Callable[_InputT, _RetT]) -> _Callable[_InputT, _RetT]:
         backend = bisect_backend
 
     guard_filter_fn = None
+    use_aoti = False
     if options and isinstance(options, dict):
         guard_filter_fn = options.pop("guard_filter_fn", None)
+        use_aoti = options.pop("use_aoti", False)
 
     if torch.compiler.is_exporting():
         warnings.warn(
@@ -2700,7 +2731,10 @@ def export_wrapped_fn(*args, **kwargs):
         return export_wrapped_fn
 
     if backend == "inductor":
-        backend = _TorchCompileInductorWrapper(mode, options, dynamic)
+        if use_aoti:
+            backend = _TorchCompileAOTInductorWrapper(mode, options, dynamic)
+        else:
+            backend = _TorchCompileInductorWrapper(mode, options, dynamic)
     else:
         backend = _TorchCompileWrapper(backend, mode, options, dynamic)
 
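
The wrapper above subclasses _TorchCompileInductorWrapper: it forces cpp_wrapper and aot_inductor.package on, and patches enable_autograd_for_aot for the duration of the compile call. Note that "use_aoti" is popped before the options reach Inductor. A sketch of the dispatch this hunk adds (private API; the argument values are illustrative):

options = {"use_aoti": True}
use_aoti = options.pop("use_aoti", False)  # consumed here, never forwarded to Inductor
wrapper_cls = (
    torch._TorchCompileAOTInductorWrapper
    if use_aoti
    else torch._TorchCompileInductorWrapper
)
backend = wrapper_cls(None, options, None)  # (mode, options, dynamic)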

torch/_dynamo/aot_compile.py

Lines changed: 4 additions & 1 deletion
@@ -53,6 +53,7 @@ class CompileArtifacts:
     argdefs: Optional[tuple[Any, ...]]
     source_info: "SourceInfo"
     device_type: str
+    backend_name: str
     system_info: SystemInfo = dataclasses.field(default_factory=SystemInfo.current)
 
     def check_compatibility(self) -> None:
@@ -166,7 +167,8 @@ def deserialize(cls, data: bytes) -> "AOTCompiledFunction":
         state = pickle.loads(data)
         state["bytecode"] = SerializedCode.to_code_object(state["bytecode"])
         deserializer, compiled_fn_state = state["compiled_fn"]
-        state["compiled_fn"] = deserializer(compiled_fn_state)
+        with torch._inductor.config.patch(enable_autograd_for_aot=True):
+            state["compiled_fn"] = deserializer(compiled_fn_state)
         state["original_code"] = SerializedCode.to_code_object(state["original_code"])
 
         artifacts = CompileArtifacts(**state)
@@ -273,6 +275,7 @@ def new_guard_filter_fn(
         argdefs=fn.__defaults__,
         source_info=source_info,
         device_type=device_type,
+        backend_name=getattr(backend, "compiler_name", "unknown"),
     )
     aot_compiled_fn = AOTCompiledFunction(_artifacts=artifacts)
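
CompileArtifacts now records which backend produced the artifact, taken from the wrapper's compiler_name attribute ("aotinductor" for the new wrapper, "unknown" as the fallback). A sketch of checking it after a reload, mirroring the assertion in test_aot_compile_with_aoti_torch_compile (_artifacts is a private field; the path is illustrative):

with open("/tmp/fn.pt", "rb") as f:
    loaded = torch.compiler.load_compiled_function(f)
assert loaded._artifacts.backend_name == "aotinductor"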

torch/_functorch/_aot_autograd/aot_autograd_result.py

Lines changed: 1 addition & 0 deletions
@@ -511,6 +511,7 @@ def wrap_post_compile(
         ).post_compile(
             compiled_fw_func, aot_config, runtime_metadata=self.runtime_metadata
         )
+        compiled_fw_func._boxed_call = True
         disable_amp = torch._C._is_any_autocast_enabled()
 
         if needs_autograd:
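
Setting _boxed_call opts the compiled forward into Inductor's boxed calling convention, where the caller passes one list of inputs instead of unpacked positional arguments. A minimal sketch of how a caller is assumed to dispatch on the flag (illustrative; not the exact call site):

def call_compiled(compiled_fn, args):
    if getattr(compiled_fn, "_boxed_call", False):
        return compiled_fn(list(args))  # boxed: a single mutable list argument
    return compiled_fn(*args)  # unboxed: plain positional call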

torch/_inductor/compile_fx.py

Lines changed: 4 additions & 2 deletions
@@ -1640,7 +1640,9 @@ def codegen_and_compile(
                     # pyrefly: ignore [unbound-name]
                     (str, list, torch.fx.GraphModule),
                 ), type(compiled_fn)
-                return CompiledAOTI(compiled_fn)
+                return CompiledAOTI(
+                    filename=compiled_fn, device_type=graph.device_type
+                )
 
             # TODO: Hoist this above V.aot_compilation
             # pyrefly: ignore [unbound-name]
@@ -2713,7 +2715,7 @@ def bw_compiler(
             or torch._guards.TracingContext(fake_mode)
         )
 
-        if V.aot_compilation:
+        if V.aot_compilation and not config.enable_autograd_for_aot:
            from .utils import is_valid_aoti_model_name

            is_valid_aoti_model_name()

torch/_inductor/config.py

Lines changed: 2 additions & 0 deletions
@@ -1190,6 +1190,8 @@ def decide_compile_threads() -> int:
 
 file_lock_timeout: int = int(os.environ.get("TORCHINDUCTOR_FILE_LOCK_TIMEOUT", "600"))
 
+enable_autograd_for_aot: bool = False
+
 
 def get_worker_log_path() -> Optional[str]:
     log_loc = None
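
The flag defaults to off, so existing AOTI paths are unchanged; the AOTInductor wrapper and AOTCompiledFunction.deserialize enable it only for the duration of a compile or load, e.g.:

with torch._inductor.config.patch(enable_autograd_for_aot=True):
    ...  # compile or deserialize here, as in the aot_compile.py hunk above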

torch/_inductor/output_code.py

Lines changed: 80 additions & 5 deletions
@@ -773,20 +773,95 @@ class CompiledAOTI(OutputCode):
     """
 
     filename: Union[str, list[Union[str, Weights]], torch.fx.GraphModule]
+    device_type: str
+    current_callable: Optional[Callable[..., Any]] = None
+    _cached_files: dict[str, bytes] = dataclasses.field(default_factory=dict)
+
+    def __post_init__(self):
+        if not config.aot_inductor.link_libtorch:
+            return
+
+        if (
+            torch._inductor.cpp_builder._IS_MACOS
+            or torch._inductor.cpp_builder._IS_WINDOWS
+        ):
+            return
+
+        if config.aot_inductor.cross_target_platform == "windows":
+            return
+
+        if config.aot_inductor.package_cpp_only:
+            return
+
+        if not config.enable_autograd_for_aot:
+            return
+
+        if isinstance(self.filename, list):
+            current_callable = next(
+                fn for fn in self.filename if isinstance(fn, str) and fn.endswith(".so")
+            )
+        else:
+            current_callable = self.filename
+
+        if isinstance(current_callable, torch.fx.GraphModule):
+            self.current_callable = current_callable
+            return
+
+        if self.device_type.startswith("cuda"):
+            current_callable = (
+                torch._C._aoti.AOTIModelContainerRunnerCuda(  # type: ignore[call-arg]
+                    current_callable,
+                    1,
+                    self.device_type,
+                    "",
+                    True,
+                ).run  # type: ignore[attr-defined]
+            )  # type: ignore[attr-defined]
+        elif self.device_type == "cpu":
+            current_callable = (
+                torch._C._aoti.AOTIModelContainerRunnerCpu(  # type: ignore[call-arg]
+                    current_callable, 1
+                ).run  # type: ignore[attr-defined]
+            )  # type: ignore[attr-defined]
+        else:
+            raise RuntimeError(f"unsupported device type {self.device_type}")
+        self.current_callable = current_callable
+        self._boxed_call = True
+        for file in self._cached_files:
+            if not os.path.exists(file):
+                with open(file, "wb") as f:
+                    f.write(self._cached_files[file])
 
     def __call__(self, inputs: Sequence[Any]) -> Any:
-        raise NotImplementedError("NYI")
+        if self.current_callable is None:
+            raise RuntimeError("AOTInductor compiled so is not loaded")
+        return self.current_callable(inputs)
+
+    def prepare_for_serialization(self) -> None:
+        self.current_callable = None
+        self._cached_files = {}
+        filenames: list[str] = []
+        if isinstance(self.filename, list):
+            filenames = self.filename  # type: ignore[assignment]
+        elif isinstance(self.filename, str):
+            filenames = [self.filename]
+        for name in filenames:
+            with open(name, "rb") as f:
+                self._cached_files[name] = f.read()
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["current_callable"] = None
+        return state
 
     def post_compile(
         self,
         example_inputs: Sequence[InputType],
         constants: CompiledFxGraphConstants,
         graph_kwargs: _CompileFxKwargs,
     ) -> None:
-        pass
-
-    def prepare_for_serialization(self) -> None:
-        pass
+        if self.current_callable is None:
+            self.__post_init__()
 
     def set_triton_bundle(self, triton_bundle: Any) -> None:
         pass
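
Taken together, these methods give CompiledAOTI a serialization story: prepare_for_serialization caches the bytes of the compiled files in _cached_files and drops the live runner, __getstate__ keeps the unpicklable callable out of the pickle, and post_compile re-runs __post_init__ after a load, which rewrites any missing files and reloads the runner. A hedged sketch of the intended round trip (the post_compile arguments here are placeholders):

import pickle

aoti.prepare_for_serialization()  # cache file bytes, drop the runner
restored = pickle.loads(pickle.dumps(aoti))
restored.post_compile([], constants, {})  # reloads the runner via __post_init__
outputs = restored(list(example_inputs))  # boxed call: inputs as a single list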

torch/csrc/inductor/aoti_runner/pybind.cpp

Lines changed: 6 additions & 0 deletions
@@ -66,6 +66,12 @@ void initAOTIRunnerBindings(PyObject* module) {
               int,
               const std::string&,
               const std::string&>())
+      .def(py::init<
+          const std::string&,
+          int,
+          const std::string&,
+          const std::string&,
+          const bool>())
       .def(
           "run",
           &AOTIModelContainerRunnerCuda::run,
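
This adds a five-argument constructor overload to the Python binding, matching the call in output_code.py above. A sketch of instantiating it directly (the path is a placeholder; the meaning of the final bool is inferred from the call site, which passes True):

runner = torch._C._aoti.AOTIModelContainerRunnerCuda(
    "/tmp/model.so",  # compiled AOTI shared library
    1,                # number of model instances
    "cuda",           # device string
    "",               # cubin directory ("" keeps the default)
    True,             # the new boolean flag exposed by this binding
)
outputs = runner.run(inputs)  # inputs: a list of tensors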
