|
26 | 26 | "model.layers.*.mlp.down_proj": RowwiseParallel(), |
27 | 27 | "lm_head": ColwiseParallel(output_layouts=Shard(-1), use_local_output=False), |
28 | 28 | } |
| 29 | + |
| 30 | +""" |
| 31 | +Note on numerical stability: |
| 32 | +
|
- Default plans that keep the attention output projection and the MLP down projection as
  RowwiseParallel are numerically unstable, and the instability tends to grow with larger TP (e.g., TP >= 4).
| 35 | +
|
| 36 | +Enable this custom plan via: |
| 37 | +
|
| 38 | +- policy.dtensor_cfg.custom_parallel_plan=examples.custom_parallel.qwen_model_tp_plan_stable |
| 39 | +
|
| 40 | +Based on https://github.com/NVIDIA-NeMo/Automodel/blob/d79ccb94b0eca94a4c479313db2f9eee80db0139/nemo_automodel/components/distributed/optimized_tp_plans.py#L205-L217 |
| 41 | +""" |
| 42 | +qwen_model_tp_plan_stable = { |
| 43 | + "lm_head": ColwiseParallel(output_layouts=Shard(-1), use_local_output=False), |
| 44 | + "model.embed_tokens": RowwiseParallel( |
| 45 | + input_layouts=Replicate(), |
| 46 | + ), |
| 47 | + "model.layers.*.self_attn.q_proj": ColwiseParallel(), |
| 48 | + "model.layers.*.self_attn.k_proj": ColwiseParallel(), |
| 49 | + "model.layers.*.self_attn.v_proj": ColwiseParallel(), |
| 50 | + "model.layers.*.self_attn.o_proj": ColwiseParallel( |
| 51 | + input_layouts=Shard(-1), |
| 52 | + output_layouts=Replicate(), |
| 53 | + use_local_output=True, |
| 54 | + ), |
| 55 | + "model.layers.*.mlp.up_proj": ColwiseParallel(), |
| 56 | + "model.layers.*.mlp.gate_proj": ColwiseParallel(), |
| 57 | + "model.layers.*.mlp.down_proj": ColwiseParallel( |
| 58 | + input_layouts=Shard(-1), |
| 59 | + output_layouts=Replicate(), |
| 60 | + use_local_output=True, |
| 61 | + ), |
| 62 | +} |
0 commit comments