
Commit 9e24689

[DeepSeek] add torch.compile + async TP (#1588)
Verified that torch.compile works. However, I didn't see async TP in the trace. cc @danielvegamyhre @fegin, could you help take a look?
1 parent: a54725c

File tree

3 files changed (+20, -13 lines)


torchtitan/experiments/llama4/infra/parallelize.py

Lines changed: 2 additions & 3 deletions
@@ -108,10 +108,9 @@ def parallelize_llama(
 
     # turn on per-TransformerBlock compile after AC wrapping and before FSDP
     if job_config.training.compile:
-        apply_compile(model)
-
         # NOTE: needed for torch.compile to work with dynamic shapes in token-choice MoE
         torch._dynamo.config.capture_scalar_outputs = True
+        apply_compile(model)
 
     dp_mesh: DeviceMesh | None = None
     if parallel_dims.fsdp_enabled or parallel_dims.ep_enabled:

@@ -503,7 +502,7 @@ def apply_compile(model: nn.Module):
     repeated structure. Alternatively one can compile the whole model (after applying DP).
     """
     for layer_id, transformer_block in model.layers.named_children():
-        # TODO: remove when torch.compile supports fullgraph=True for llama4 moe
+        # TODO: remove when torch.compile supports fullgraph=True for MoE
         fullgraph = True
         if transformer_block.moe_enabled:
             fullgraph = False
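
For context, a minimal sketch of the per-TransformerBlock compile pattern that `apply_compile` implements. Only the loop header, the TODO, and the `fullgraph` toggle appear in the hunk above; the `torch.compile` call and the `register_module` re-attachment are assumptions about the rest of the function:

    import torch
    import torch.nn as nn

    def apply_compile(model: nn.Module) -> None:
        # Compile each TransformerBlock individually, so the compiled
        # artifact is reused across the model's repeated layer structure.
        for layer_id, transformer_block in model.layers.named_children():
            # Per the TODO above: MoE blocks still contain graph breaks,
            # so fullgraph=True is only asserted for dense blocks.
            fullgraph = not transformer_block.moe_enabled
            compiled = torch.compile(transformer_block, fullgraph=fullgraph)
            # Assumed re-attachment: swap the compiled wrapper in under
            # the original child name.
            model.layers.register_module(layer_id, compiled)

The reordering in the first hunk keeps the dynamo flag assignment ahead of the `apply_compile` call, so the configuration is guaranteed to be in effect however eagerly the blocks are traced.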

torchtitan/models/deepseek_v3/README.md

Lines changed: 4 additions & 3 deletions
@@ -47,6 +47,7 @@ CONFIG_FILE="./torchtitan/models/deepseek_v3/train_configs/deepseek_v3_671b.toml
 - Tensor Parallel (TP)
 - Expert Parallel (EP)
 - Pipeline Parallel (PP)
+- torch.compile
 
 
 ## HuggingFace -> DCP Checkpoint Conversion

@@ -65,8 +66,8 @@ Some limitations:
 ## To be added
 - Parallelism
     - Context Parallel support for DeepSeek V3
-    - torch.compile
 - Quantization
 - Testing
-    - performance and loss convergence tests
-    - CI integration
+    - loss convergence tests (verified)
+    - performance (WIP)
+    - CI integration (WIP)

torchtitan/models/deepseek_v3/infra/parallelize.py

Lines changed: 14 additions & 7 deletions
@@ -47,12 +47,11 @@ def parallelize_deepseekv3(
         raise NotImplementedError("CP support for FlexAttention is still in progress.")
 
     if parallel_dims.tp_enabled:
-        if job_config.parallelism.enable_async_tensor_parallel:
-            # TODO(jianiw): This branch needs to be tested and enabled
-            raise NotImplementedError(
-                "Currently, async TP is not tested for deepseekv3. \
-                torch.compile is not supported yet, which is required for async TP."
-            )
+        if (
+            job_config.parallelism.enable_async_tensor_parallel
+            and not job_config.training.compile
+        ):
+            raise RuntimeError("Async TP requires --training.compile")
 
     enable_float8_linear = "float8" in job_config.model.converters
     float8_is_rowwise = job_config.float8.recipe_name in (
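
Why the guard changed shape: as the `apply_non_moe_tp` hunk below shows, async TP is driven by an inductor option (`torch._inductor.config._micro_pipeline_tp`), so it can only take effect when the model actually runs under torch.compile. The old branch rejected async TP for DeepSeek-V3 unconditionally; the new check fails fast only on the inconsistent combination of async TP without `--training.compile`.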
@@ -94,7 +93,9 @@ def parallelize_deepseekv3(
     apply_ac(model, job_config.activation_checkpoint)
 
     if job_config.training.compile:
-        raise NotImplementedError("torch.compile is not supported yet for deepseekv3")
+        # NOTE: needed for torch.compile to work with dynamic shapes in token-choice MoE
+        torch._dynamo.config.capture_scalar_outputs = True
+        apply_compile(model)
 
     dp_mesh: DeviceMesh | None = None
     if parallel_dims.fsdp_enabled or parallel_dims.ep_enabled:
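
The `capture_scalar_outputs` flag matters here because token-choice routing derives tensor sizes from runtime values. A self-contained illustration, not from the commit (the `route_to_expert` toy below is invented), of the class of code this setting unlocks:

    import torch

    # By default torch.compile graph-breaks on .item(); with this flag the
    # scalar can instead be traced as an unbacked symbolic int.
    torch._dynamo.config.capture_scalar_outputs = True

    @torch.compile
    def route_to_expert(scores: torch.Tensor) -> torch.Tensor:
        # Token-choice MoE: how many tokens an expert receives is known
        # only at runtime, so the output size is data-dependent.
        num_tokens = (scores > 0.5).sum().item()
        return torch.zeros(num_tokens, 16)

    print(route_to_expert(torch.rand(128)).shape)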
@@ -251,6 +252,12 @@ def apply_non_moe_tp(
         parallelize_plan=layer_plan,
     )
 
+    if enable_async_tp:
+        from torch.distributed._symmetric_memory import enable_symm_mem_for_group
+
+        torch._inductor.config._micro_pipeline_tp = True
+        enable_symm_mem_for_group(tp_mesh.get_group().group_name)
+
     logger.info(
         f"Applied {'Float8 tensorwise ' if enable_float8_tensorwise_tp else ''}{'Async ' if enable_async_tp else ''}"
         "Tensor Parallelism to the model"
