@@ -1060,6 +1060,8 @@ def maybe_mark_profile(*args, **kwargs):
10601060 frozen_model_iter_fn = export_nativert (model , example_inputs )
10611061 elif args .torchscript_jit_trace :
10621062 frozen_model_iter_fn = torchscript_jit_trace (model , example_inputs )
1063+ elif args .aot_precompile :
1064+ frozen_model_iter_fn = aot_precompile (model , example_inputs )
10631065 else :
10641066 if kwargs ["hf_llm" ]:
10651067 # If it's an llm, we want to optimize model.forward, and use
@@ -1495,6 +1497,37 @@ def opt_export(_, example_inputs):
14951497 return opt_export
14961498
14971499
def aot_precompile(model, example_inputs):
    """AOT-precompile ``model.forward``, round-trip the artifact through disk,
    and return a benchmark iteration function running the reloaded artifact.

    Steps:
      1. Compile ``model.forward`` with ``torch.compile`` under
         ``enable_aot_compile`` and serialize the result to a temp ``.pt`` file.
      2. ``torch._dynamo.reset()`` so the reload below measures a cold start
         rather than reusing in-memory compilation state.
      3. Load the artifact back with ``torch.compiler.load_compiled_function``
         (load latency is printed) and close over it in the returned iter fn.

    Args:
        model: the eager model whose ``forward`` is compiled.
        example_inputs: benchmark inputs; normalized to (args, kwargs) via
            ``_normalize_bench_inputs``.

    Returns:
        ``opt_aot_precompile(_, example_inputs, collect_outputs=False)`` — a
        callable matching the benchmark iter-fn signature that invokes the
        loaded compiled function.
    """
    import os  # local import: file-level import block is outside this view

    example_args, example_kwargs = _normalize_bench_inputs(example_inputs)

    # delete=False so the path outlives the `with`; we are responsible for
    # removing the file ourselves (the original version leaked it every run).
    with tempfile.NamedTemporaryFile(suffix=".pt", delete=False) as f:
        save_path = f.name

    try:
        with fresh_cache(), torch._dynamo.config.patch("enable_aot_compile", True):
            # Drop every guard: the benchmark replays identical inputs, so
            # guard evaluation would only add per-iteration overhead.
            compiled_fn = torch.compile(
                model,
                fullgraph=True,
                options={"guard_filter_fn": lambda guards: [False for _ in guards]},
            ).forward.aot_compile((example_args, example_kwargs))

            compiled_fn.save_compiled_function(save_path)

        torch._dynamo.reset()
        with open(save_path, "rb") as f:
            load_start_time = time.perf_counter()
            loaded_fn = torch.compiler.load_compiled_function(f)
            load_end_time = time.perf_counter()
            print(
                f"AOT Precompile loading time: {load_end_time - load_start_time} seconds"
            )
    finally:
        # Fix: always remove the delete=False temp artifact, including on
        # compile/save/load failure. Best-effort — ignore races/missing file.
        try:
            os.remove(save_path)
        except OSError:
            pass

    def opt_aot_precompile(_, example_inputs, collect_outputs=False):
        # collect_outputs is accepted for signature parity with the other
        # benchmark iter fns; the loaded fn's result is returned either way.
        example_args, example_kwargs = _normalize_bench_inputs(example_inputs)
        return loaded_fn(model, *example_args, **example_kwargs)

    return opt_aot_precompile
1529+
1530+
14981531def export_nativert (model , example_inputs ):
14991532 optimized = NativeRTCache .load (model , example_inputs )
15001533
@@ -2274,6 +2307,7 @@ def record_status(accuracy_status, dynamo_start_stats):
22742307 or self .args .export_aot_inductor
22752308 or self .args .export_nativert
22762309 or self .args .torchscript_jit_trace
2310+ or self .args .aot_precompile
22772311 ):
22782312 # apply export on module directly
22792313 # no need for n iterations
@@ -2729,6 +2763,7 @@ def warmup(fn, model, example_inputs, mode, niters=5):
27292763 self .args .export_aot_inductor
27302764 or self .args .export_nativert
27312765 or self .args .torchscript_jit_trace
2766+ or self .args .aot_precompile
27322767 ):
27332768 optimized_model_iter_fn = optimize_ctx
27342769 else :
@@ -3505,6 +3540,11 @@ def get_example_inputs(self):
35053540 action = "store_true" ,
35063541 help = "Measure pass rate with Export+AOTInductor" ,
35073542 )
3543+ group .add_argument (
3544+ "--aot-precompile" ,
3545+ action = "store_true" ,
3546+ help = "Measure pass rate with AOT Precompile" ,
3547+ )
35083548 group .add_argument (
35093549 "--export-nativert" ,
35103550 action = "store_true" ,
@@ -3935,6 +3975,10 @@ def run(runner, args, original_dir=None):
39353975 optimize_ctx = export
39363976 experiment = speedup_experiment
39373977 output_filename = "export.csv"
3978+ elif args .aot_precompile :
3979+ optimize_ctx = aot_precompile
3980+ experiment = speedup_experiment
3981+ output_filename = "aot_precompile.csv"
39383982 elif args .export_nativert :
39393983 optimize_ctx = export_nativert
39403984 experiment = speedup_experiment
0 commit comments