Commit 2025abb

Async TP minor fix (#1629)
Follow-up of #1619 to fix the remaining errors; also fixes a TODO.
1 parent cd337db · commit 2025abb

5 files changed: +22 −30 lines
torchtitan/experiments/llama4/infra/parallelize.py

Lines changed: 4 additions & 4 deletions
@@ -63,9 +63,6 @@ def parallelize_llama(
     ):
         raise NotImplementedError("CP support for FlexAttention is still in progress.")
 
-    model_compile_enabled = (
-        job_config.compile.enable and "model" in job_config.compile.components
-    )
     if parallel_dims.tp_enabled:
         enable_float8_linear = "float8" in job_config.model.converters
         float8_is_rowwise = job_config.float8.recipe_name in (
@@ -104,6 +101,9 @@ def parallelize_llama(
     if job_config.activation_checkpoint.mode != "none":
         apply_ac(model, job_config.activation_checkpoint)
 
+    model_compile_enabled = (
+        job_config.compile.enable and "model" in job_config.compile.components
+    )
     # turn on per-TransformerBlock compile after AC wrapping and before FSDP
     if model_compile_enabled:
         # NOTE: needed for torch.compile to work with dynamic shapes in token-choice MoE
@@ -160,7 +160,7 @@ def parallelize_llama(
         apply_ddp(
             model,
             dp_mesh,
-            enable_compile=job_config.training.compile,
+            enable_compile=model_compile_enabled,
             enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd,
         )
 
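For readers skimming the hunks above: the moved check is just a boolean derived from the compile section of the job config, computed once after AC wrapping so that per-TransformerBlock compile and DDP's enable_compile read the same flag. Below is a minimal, self-contained sketch of that logic; the Compile and JobConfig dataclasses are simplified stand-ins for torchtitan's config objects, not the real classes.

from dataclasses import dataclass, field


@dataclass
class Compile:  # simplified stand-in for torchtitan's compile config section
    enable: bool = True
    components: list = field(default_factory=lambda: ["model", "loss"])


@dataclass
class JobConfig:  # simplified stand-in with only the field this diff touches
    compile: Compile = field(default_factory=Compile)


job_config = JobConfig()
model_compile_enabled = (
    job_config.compile.enable and "model" in job_config.compile.components
)
print(model_compile_enabled)  # True -> both per-block compile and DDP compile are on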

torchtitan/experiments/simple_fsdp/parallelize.py

Lines changed: 3 additions & 12 deletions
@@ -9,6 +9,7 @@
 
 from torchtitan.config import JobConfig, TORCH_DTYPE_MAP
 from torchtitan.distributed import ParallelDims
+from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp
 from torchtitan.models.llama3.infra.parallelize import apply_ac, apply_tp
 from torchtitan.tools.logging import logger
 
@@ -37,16 +38,7 @@ def parallelize_llama(
         ({parallel_dims.tp}) and 2 * CP degree ({parallel_dims.cp}).
     """
 
-    model_compile_enabled = (
-        job_config.compile.enable and "model" in job_config.compile.components
-    )
     if parallel_dims.tp_enabled:
-        if (
-            job_config.parallelism.enable_async_tensor_parallel
-            and not model_compile_enabled
-        ):
-            raise RuntimeError("Async TP requires torch.compile")
-
         enable_float8_linear = "float8" in job_config.model.converters
         float8_is_rowwise = job_config.float8.recipe_name in (
             "rowwise",
@@ -64,8 +56,8 @@ def parallelize_llama(
             tp_mesh,
             loss_parallel=not job_config.parallelism.disable_loss_parallel,
             enable_float8_tensorwise_tp=enable_float8_tensorwise_tp,
-            enable_async_tp=job_config.parallelism.enable_async_tensor_parallel,
         )
+        maybe_enable_async_tp(job_config, tp_mesh)
 
     if job_config.activation_checkpoint.mode != "none":
         apply_ac(model, job_config.activation_checkpoint)
@@ -98,11 +90,10 @@ def parallelize_llama(
             mode=dp_mode,
             ac_mode=job_config.activation_checkpoint.mode,
             mp_policy=mp_policy,
-            tp_mesh=tp_mesh if parallel_dims.tp_enabled else None,
         )
         logger.info("Applied Data Parallel (dp mode=%s) to the model", dp_mode)
 
-    if model_compile_enabled:
+    if job_config.compile.enable and "model" in job_config.compile.components:
         torch._inductor.config.reorder_for_peak_memory = False
         model = torch.compile(model, fullgraph=True)
 
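The removed call-site check ("Async TP requires torch.compile") now lives inside maybe_enable_async_tp, which is called once right after apply_tp. The sketch below mirrors the guard logic removed in this diff and is only a hedged illustration of what such a helper plausibly does, not torchtitan's actual implementation; the SymmetricMemory-based enablement step is indicated only by a comment.

def maybe_enable_async_tp(job_config, tp_mesh):
    """Illustrative guard only; not the torchtitan implementation."""
    if not job_config.parallelism.enable_async_tensor_parallel:
        return

    model_compile_enabled = (
        job_config.compile.enable and "model" in job_config.compile.components
    )
    if not model_compile_enabled:
        # same invariant the removed inline check enforced
        raise RuntimeError("Async TP requires torch.compile")

    # ... enable SymmetricMemory-based async TP on tp_mesh's process group here ...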

torchtitan/experiments/simple_fsdp/simple_fsdp.py

Lines changed: 7 additions & 9 deletions
@@ -185,7 +185,12 @@ def _custom_policy(ctx, func, *args, **kwargs):
 
 class ReplicateComputation(torch.nn.Module):
     def __init__(
-        self, device_mesh, param_sharding, mode, regional_ac, mp_policy, tp_mesh
+        self,
+        device_mesh,
+        param_sharding,
+        mode,
+        regional_ac,
+        mp_policy,
     ):
         super().__init__()
         self.device_mesh = device_mesh
@@ -197,7 +202,6 @@ def __init__(
         mp_policy = mp_policy or MixedPrecisionPolicy()
         self.param_dtype = mp_policy.param_dtype
         self.reduce_dtype = mp_policy.reduce_dtype
-        self.tp_mesh = tp_mesh
 
     def replicate_compute(self, x):
         # data parallel runtime replicate parameters and do local compute
@@ -207,10 +211,7 @@ def replicate_compute(self, x):
         # support for FSDP/DDP/HSDP + TP (assuming TP shards the inner-most dim)
         if x._spec.mesh.mesh_dim_names[-1] == "tp":
             tp_placement = x._spec.placements[-1]
-            # TODO: remove tp_mesh as an input arg to data_parallel API and use x._spec.mesh["tp"]
-            # after DeviceMesh supports slicing a non-root mesh
-            # dp_mesh, tp_mesh = self.device_mesh, x._spec.mesh["tp"]
-            dp_mesh, tp_mesh = self.device_mesh, self.tp_mesh
+            dp_mesh, tp_mesh = self.device_mesh, x._spec.mesh["tp"]
 
             # re-wrap 2D DTensor to 1D DTensor on dp_mesh for efficient FSDP all-gather
             sharded_local_tensor = x.to_local()
@@ -270,7 +271,6 @@ def data_parallel(
    mode="replicate",
    ac_mode: str = "none",
    mp_policy: Optional[MixedPrecisionPolicy] = None,
-    tp_mesh: Optional[DeviceMesh] = None,
 ):
     if mode == "replicate":
         param_sharding = (Replicate(),)
@@ -314,7 +314,6 @@ def data_parallel(
     #             mode,
     #             regional_ac,
     #             mp_policy=mp_policy,
-    #             tp_mesh=tp_mesh,
     #         ),
     #         unsafe=True,
     #     )
@@ -328,7 +327,6 @@ def data_parallel(
             mode,
             regional_ac,
             mp_policy=mp_policy,
-            tp_mesh=tp_mesh,
         ),
     )
     return model
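The resolved TODO hinges on DeviceMesh now supporting slicing a non-root mesh, so replicate_compute can take the TP submesh straight from the DTensor's own spec (x._spec.mesh["tp"]) instead of a tp_mesh argument threaded through data_parallel. Below is a small illustrative sketch of that named slicing on a plain 2D mesh, run under torchrun (e.g. --nproc_per_node=4); it is not torchtitan code.

import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh

# 2D ("dp", "tp") mesh on CPU/gloo; slicing by dimension name returns the submesh
# directly, which is what x._spec.mesh["tp"] does for the mesh stored on a DTensor.
mesh_2d = init_device_mesh("cpu", (2, 2), mesh_dim_names=("dp", "tp"))
dp_mesh, tp_mesh = mesh_2d["dp"], mesh_2d["tp"]
print(f"rank {dist.get_rank()}: dp mesh {dp_mesh}, tp mesh {tp_mesh}")
dist.destroy_process_group()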

torchtitan/models/deepseek_v3/infra/parallelize.py

Lines changed: 5 additions & 2 deletions
@@ -92,7 +92,10 @@ def parallelize_deepseekv3(
     if job_config.activation_checkpoint.mode != "none":
         apply_ac(model, job_config.activation_checkpoint)
 
-    if job_config.compile.enable and "model" in job_config.compile.components:
+    model_compile_enabled = (
+        job_config.compile.enable and "model" in job_config.compile.components
+    )
+    if model_compile_enabled:
         # NOTE: needed for torch.compile to work with dynamic shapes in token-choice MoE
         torch._dynamo.config.capture_scalar_outputs = True
         apply_compile(model)
@@ -147,7 +150,7 @@ def parallelize_deepseekv3(
         apply_ddp(
             model,
             dp_mesh,
-            enable_compile=job_config.training.compile,
+            enable_compile=model_compile_enabled,
             enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd,
         )
 
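The NOTE about capture_scalar_outputs is carried over unchanged. For context, here is a hedged, illustrative sketch of why token-choice MoE routing benefits from it (data-dependent scalars read via .item() inside a compiled region); it is not the deepseek_v3 routing code.

import torch

# Without this flag, .item() on a data-dependent value forces a graph break under
# torch.compile; token-choice MoE routing reads such scalars (e.g. token counts).
torch._dynamo.config.capture_scalar_outputs = True


@torch.compile
def num_routed_tokens(router_scores: torch.Tensor, threshold: float = 0.5) -> int:
    return int((router_scores > threshold).sum().item())


print(num_routed_tokens(torch.rand(16)))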

torchtitan/models/llama3/infra/parallelize.py

Lines changed: 3 additions & 3 deletions
@@ -64,9 +64,6 @@ def parallelize_llama(
     ):
         raise NotImplementedError("CP support for FlexAttention is still in progress.")
 
-    model_compile_enabled = (
-        job_config.compile.enable and "model" in job_config.compile.components
-    )
     if parallel_dims.tp_enabled:
         enable_float8_linear = "float8" in job_config.model.converters
         float8_is_rowwise = job_config.float8.recipe_name in (
@@ -90,6 +87,9 @@ def parallelize_llama(
     if job_config.activation_checkpoint.mode != "none":
         apply_ac(model, job_config.activation_checkpoint)
 
+    model_compile_enabled = (
+        job_config.compile.enable and "model" in job_config.compile.components
+    )
     # turn on per-TransformerBlock compile after AC wrapping and before FSDP
     if model_compile_enabled:
         apply_compile(model)
