Commit 481f97a

add simplefsdp's autobucketing pass entry
1 parent 714cc5b commit 481f97a

File tree

2 files changed: +37 -19 lines changed


torchtitan/experiments/auto_parallel/parallelize_llama.py

Lines changed: 21 additions & 0 deletions
@@ -20,6 +20,27 @@
 from torchtitan.tools.logging import logger
 
 
+class simplefsdp_autobucketing_config:
+    """
+    Config for simplefsdp's autobucketing pass, which gives good performance by default.
+    To make the results tunable, we expose the following parameters:
+    - relax_ratio: relax comp time to include more comm in one bucket;
+      with this config, comp is updated as comp * (1 + relax_ratio)
+    - peak_memory_offset: relax peak_memory to include more comm in one bucket;
+      with this config, peak_memory is updated as (peak_memory + peak_memory_offset)
+    - load_cache: set to True to load cache from save_estimation_path
+    - enable_bucket_ir: set to True to bucket all_gather/reduce_scatter
+    - enable_reorder_ir: set to True to reorder all_gather/reduce_scatter
+    """
+
+    relax_ratio = 0
+    peak_memory_offset = 0
+    load_cache = False
+    save_estimation_path = "/mnt/mffuse/cache_ruisi/estimation_mast.pkl"
+    enable_bucket_ir = True
+    enable_reorder_ir = True
+
+
 def parallelize_llama(
     model,
     world_mesh: DeviceMesh,
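For context (not part of this diff): the attributes above are plain class-level values, so a training script can override them before compilation. A minimal, hypothetical tuning sketch; the override values and the cache path below are illustrative, not recommendations from this commit:

    # Hypothetical tuning of simplefsdp's autobucketing config (illustrative values only).
    from torchtitan.experiments.auto_parallel.parallelize_llama import (
        simplefsdp_autobucketing_config,
    )

    # Allow ~10% extra compute time per bucket so more collectives fit into one bucket.
    simplefsdp_autobucketing_config.relax_ratio = 0.1
    # Leave the peak-memory budget unchanged.
    simplefsdp_autobucketing_config.peak_memory_offset = 0
    # After a first run has saved communication estimates, reuse them to shorten
    # bucketing decision time (the path below is a placeholder).
    simplefsdp_autobucketing_config.save_estimation_path = "/tmp/simplefsdp_estimation.pkl"
    simplefsdp_autobucketing_config.load_cache = True
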

torchtitan/train.py

Lines changed: 16 additions & 19 deletions
@@ -8,6 +8,7 @@
 import os
 import time
 from datetime import timedelta
+from functools import partial
 from typing import Any, Generator, Iterable, Optional
 
 import torch
@@ -125,32 +126,28 @@ def __init__(self, job_config: JobConfig):
 
         # allow configuring inductor comms optimizations from torchtitan commandline
         if job_config.experimental.enable_simplefsdp_passes:
-            try:
-                from torch._inductor.simple_fsdp.bucket import bucket_fsdp_all_gather_concat_on_scheduler_ir
-            except ImportError:
-                print("Must use pytorch from unlanded https://github.com/pytorch/pytorch/pull/160282, e.g. torchtitan_conda_prod:5e4101faa448c2ee6b62ddd76ee08e8c")
-                raise
+            # enable simplefsdp's autobucketing and reorder passes (original code in https://github.com/pytorch/pytorch/pull/160282)
+            from autoparallel.auto_bucketing import (
+                simple_fsdp_autobucketing_reordering_pass,
+            )
 
-            # Configs from Ruisi
+            from torchtitan.experiments.auto_parallel.parallelize_llama import (
+                simplefsdp_autobucketing_config,
+            )
 
-            # set to 0.1 if you want to make bucketing more efficient with mixed dtype collectives
-            torch._inductor.config.simplefsdp.relax_ratio = 0
             torch._inductor.config.allow_buffer_reuse = False
-            torch._inductor.config.simplefsdp.estimate_ir = False
-            torch._inductor.config.simplefsdp.estimate_verbose = False
-            torch._inductor.config.simplefsdp.save_estimation_path = "/mnt/mffuse/cache_ruisi/estimation_mast_"+job_config.model.flavor+".pkl"
-            # set to True after the first communication estimation results are saved. This would reduce decision making time.
-            torch._inductor.config.simplefsdp.load_cache = False
-            torch._inductor.config.simplefsdp.enable_bucket_ir = True
-            torch._inductor.config.simplefsdp.enable_reorder_ir = True
-            torch._inductor.config.simplefsdp.simplefsdp_only = False  # False for 2d True for 1d
-            torch._inductor.config.simplefsdp.peak_memory_offset = 0
-            torch._inductor.config.simplefsdp.bucketing_type = "auto"
+            torch._inductor.config.reorder_for_compute_comm_overlap = True
+            simple_fsdp_autobucketing_reordering_pass = partial(
+                simple_fsdp_autobucketing_reordering_pass,
+                configs=simplefsdp_autobucketing_config,
+            )
+            torch._inductor.config.reorder_for_compute_comm_overlap_passes = [
+                simple_fsdp_autobucketing_reordering_pass
+            ]
 
             # Don't use both sets of passes at the same time!
             torch._inductor.config.bucket_all_gathers_fx = "none"
             torch._inductor.config.bucket_reduce_scatters_fx = "none"
-            torch._inductor.config.reorder_for_compute_comm_overlap = False
         else:
             torch._inductor.config.bucket_all_gathers_fx = (
                 job_config.experimental.bucket_all_gathers_fx
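Taken together, the train.py changes register simplefsdp's autobucketing pass with inductor's compute/communication overlap scheduler. A condensed sketch of the resulting wiring, assuming the autoparallel package providing auto_bucketing (from the PR linked above) is installed:

    from functools import partial

    import torch
    from autoparallel.auto_bucketing import simple_fsdp_autobucketing_reordering_pass

    from torchtitan.experiments.auto_parallel.parallelize_llama import (
        simplefsdp_autobucketing_config,
    )

    # Bind torchtitan's config to the pass and hand it to inductor as a
    # compute/communication reordering pass.
    torch._inductor.config.allow_buffer_reuse = False
    torch._inductor.config.reorder_for_compute_comm_overlap = True
    torch._inductor.config.reorder_for_compute_comm_overlap_passes = [
        partial(
            simple_fsdp_autobucketing_reordering_pass,
            configs=simplefsdp_autobucketing_config,
        )
    ]

    # Disable the fx-level bucketing passes so both sets of passes are not
    # used at the same time.
    torch._inductor.config.bucket_all_gathers_fx = "none"
    torch._inductor.config.bucket_reduce_scatters_fx = "none"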
