
Commit cdce68c

Njuappchzblych and Yanchao Lu authored
[TRTLLM-6741][fix] Add heuristics for lm head tp size when enable_lm_head_tp_in_adp=True (#7891)
Signed-off-by: Cheng Hang <[email protected]>
Co-authored-by: Yanchao Lu <[email protected]>
1 parent 6396cb9 commit cdce68c

8 files changed: +72 -33 lines changed

jenkins/L0_Test.groovy

Lines changed: 3 additions & 2 deletions
@@ -2205,8 +2205,9 @@ def launchTestJobs(pipeline, testFilter)
         // "GB200-8_GPUs-2_Nodes-PyTorch-5": ["gb200-multi-node", "l0_gb200_multi_nodes", 5, 5, 8, 2],
         // ]
         multiNodesSBSAConfigs = [:]
-        multiNodesSBSAConfigs += (1..7).collectEntries { i ->
-            ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-multi-node", "l0_gb200_multi_nodes", i, 7, 8, 2]]
+        def numMultiNodeTests = 9
+        multiNodesSBSAConfigs += (1..numMultiNodeTests).collectEntries { i ->
+            ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-multi-node", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]]
         }
         fullSet += multiNodesSBSAConfigs.keySet()
 

tensorrt_llm/_torch/models/modeling_deepseekv3.py

Lines changed: 8 additions & 8 deletions
@@ -404,12 +404,8 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig]):
         self.norm = RMSNorm(hidden_size=config.hidden_size,
                             eps=config.rms_norm_eps,
                             dtype=config.torch_dtype)
-        if self.model_config.mapping.enable_attention_dp and \
-                getattr(self.model_config.mapping, 'enable_lm_head_tp_in_adp', False):
-            self.mapping_lm_head_tp = create_lm_head_tp_mapping(
-                self.model_config.mapping)
-        else:
-            self.mapping_lm_head_tp = self.model_config.mapping
+
+        self.mapping_lm_head_tp = None
 
     @torch.compile(options={"max-autotune": True})
     def get_last_token_states(self, hidden_states, attn_metadata):
@@ -433,19 +429,23 @@ def forward(self,
             hidden_states = hidden_states[-1].unsqueeze(0)
 
         enable_attention_dp = self.model_config.mapping.enable_attention_dp
-        enable_lm_head_tp_in_adp = self.model_config.mapping.enable_lm_head_tp_in_adp
+        enable_lm_head_tp_in_adp = enable_attention_dp and self.model_config.mapping.enable_lm_head_tp_in_adp
 
         # Add pre-lm gather logic
         if enable_lm_head_tp_in_adp:
             # ADP + LM TP mode: perform All-Gather before LM_head
+            self.mapping_lm_head_tp = create_lm_head_tp_mapping(
+                self.model_config.mapping, hidden_states.shape[0])
             hidden_states = allgather(hidden_states,
                                       self.mapping_lm_head_tp,
                                       dim=0)
 
         # Temporarily disable gather_output when not in ADP mode or (in ADP mode and LM TP is enabled)
        if not enable_attention_dp or enable_lm_head_tp_in_adp:
             lm_head.gather_output = False
-        logits = lm_head(hidden_states, is_spec_decoding_head=True)
+        logits = lm_head(hidden_states,
+                         mapping_lm_head_tp=self.mapping_lm_head_tp,
+                         is_spec_decoding_head=True)
         if not enable_attention_dp or enable_lm_head_tp_in_adp:
             lm_head.gather_output = True
         return logits
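The hunk above stops building the LM-head TP mapping eagerly in __init__ and instead creates it in forward from the runtime token count, passing it into lm_head explicitly. As a sanity check of the math rather than of this code, here is a minimal single-process sketch (plain torch, no distributed collectives, illustrative sizes) of why gathering tokens across ADP ranks and letting each rank score only a vocab slice reproduces the plain per-rank, full-vocab logits once the slices are concatenated.

import torch

torch.manual_seed(0)
tp_size, tokens_per_rank, hidden, vocab = 4, 2, 16, 32    # illustrative sizes
weight = torch.randn(vocab, hidden)                        # full lm_head weight
rank_tokens = [torch.randn(tokens_per_rank, hidden) for _ in range(tp_size)]

# Plain ADP path: every rank scores its own tokens against the full vocab.
reference = torch.cat([t @ weight.T for t in rank_tokens], dim=0)

# ADP + LM-head-TP path: all-gather tokens, score one vocab slice per rank,
# then gather the logit slices along the vocab dimension.
gathered = torch.cat(rank_tokens, dim=0)                   # stands in for allgather(..., dim=0)
slice_width = -(-vocab // tp_size)                         # ceil_div(out_features, tp_size)
slices = []
for tp_rank in range(tp_size):
    start = tp_rank * slice_width
    end = min((tp_rank + 1) * slice_width, vocab)
    slices.append(gathered @ weight[start:end, :].T)       # F.linear on the weight slice
sharded = torch.cat(slices, dim=-1)

assert torch.allclose(reference, sharded, atol=1e-5)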

tensorrt_llm/_torch/modules/embedding.py

Lines changed: 10 additions & 16 deletions
@@ -7,9 +7,9 @@
 
 from tensorrt_llm.functional import AllReduceParams
 from tensorrt_llm.mapping import Mapping
+from tensorrt_llm.math_utils import ceil_div
 
 from ..distributed import allgather
-from ..utils import create_lm_head_tp_mapping
 from .linear import Linear, TensorParallelMode
 
 
@@ -38,8 +38,6 @@ def __init__(
         mapping = mapping or Mapping()
         self.enable_lm_head_tp_in_adp = mapping.enable_attention_dp and \
             getattr(mapping, 'enable_lm_head_tp_in_adp', False)
-        if self.enable_lm_head_tp_in_adp:
-            mapping = create_lm_head_tp_mapping(mapping)
 
         tp_size = mapping.tp_size
 
@@ -78,18 +76,6 @@ def __init__(
         self.weight = Parameter(torch.empty(weight_shape, dtype=dtype))
         self.register_parameter("bias", None)
 
-        # For LM head TP in ADP, we need to slice the weight for the LM head
-        self.lm_head_slice_obj = None
-        if self.enable_lm_head_tp_in_adp:
-            tp_rank = self.mapping.tp_rank
-            tp_size = self.mapping.tp_size
-            slice_width = math.ceil(self.out_features / tp_size)
-            slice_start = tp_rank * slice_width
-            slice_end = min((tp_rank + 1) * slice_width, self.out_features)
-            slice_obj = [slice(None)] * len(self.weight.shape)
-            slice_obj[0] = slice(slice_start, slice_end)
-            self.lm_head_slice_obj = tuple(slice_obj)
-
     @property
     def vocab_size_padded(self) -> int:
         if self.tp_mode == TensorParallelMode.COLUMN and self.gather_output:
@@ -102,10 +88,18 @@ def forward(
         input: torch.Tensor,
         *,
         all_reduce_params: Optional[AllReduceParams] = None,
+        mapping_lm_head_tp: Optional[Mapping] = None,
         is_spec_decoding_head: bool = False,
     ) -> torch.Tensor:
         if is_spec_decoding_head and self.enable_lm_head_tp_in_adp:
-            output = F.linear(input, self.weight[self.lm_head_slice_obj], None)
+            # For LM head TP in ADP, we need to slice the weight for the LM head
+            tp_rank = mapping_lm_head_tp.tp_rank
+            tp_size = mapping_lm_head_tp.tp_size
+            slice_width = ceil_div(self.out_features, tp_size)
+            slice_start = tp_rank * slice_width
+            slice_end = min((tp_rank + 1) * slice_width, self.out_features)
+            output = F.linear(input, self.weight[slice_start:slice_end, :],
+                              None)
         else:
             output = super().forward(input, all_reduce_params=all_reduce_params)
         if (self.tp_mode == TensorParallelMode.COLUMN and self.gather_output
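The forward change above derives the per-rank weight slice on the fly from mapping_lm_head_tp instead of the precomputed lm_head_slice_obj removed in __init__. A tiny standalone sketch of those slice bounds follows; ceil_div comes from tensorrt_llm.math_utils in the diff, so the local definition and the vocab size 50257 here are purely illustrative.

def ceil_div(a: int, b: int) -> int:          # local stand-in for tensorrt_llm.math_utils.ceil_div
    return -(-a // b)

out_features, tp_size = 50257, 4               # illustrative vocab size and LM-head TP size
slice_width = ceil_div(out_features, tp_size)
for tp_rank in range(tp_size):
    slice_start = tp_rank * slice_width
    slice_end = min((tp_rank + 1) * slice_width, out_features)
    print(tp_rank, slice_start, slice_end, slice_end - slice_start)
# Ranks 0-2 each own 12565 vocab rows; rank 3 owns the remaining 12562,
# which is why forward clamps slice_end with min(..., self.out_features).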

tensorrt_llm/_torch/utils.py

Lines changed: 7 additions & 3 deletions
@@ -1,5 +1,4 @@
 import contextlib
-import os
 import threading
 from dataclasses import dataclass
 from enum import Enum
@@ -288,8 +287,13 @@ def get_per_request_piecewise_cuda_graph_flag() -> bool:
     return getattr(_global_attrs, 'per_request_piecewise_cuda_graph_flag', True)
 
 
-def create_lm_head_tp_mapping(mapping: Mapping) -> Mapping:
-    lm_head_tp_size = int(os.getenv('LM_HEAD_TP_SIZE', 2))
+def create_lm_head_tp_mapping(mapping: Mapping, token_count: int) -> Mapping:
+    # Use a heuristic to pick lm_head_tp_size: around token_count=256 the
+    # LM-head GEMM becomes math-bound, so 256 // token_count gives the raw
+    # TP size before snapping it into the allowed buckets.
+    lm_head_tp_size_raw = 256 // token_count
+    lm_head_tp_size = nearest_in_buckets(lm_head_tp_size_raw,
+                                         [1, mapping.gpus_per_node])
     assert mapping.tp_size % lm_head_tp_size == 0
     lm_head_pp_size = mapping.pp_size * mapping.tp_size // lm_head_tp_size
 
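A rough feel for the heuristic: fewer tokens mean a thinner GEMM, so a larger LM-head TP size pays off, while around 256 tokens the problem is already math-bound and TP 1 suffices. nearest_in_buckets is not shown in this diff, so the stand-in below (pick whichever bucket value is closer to the raw estimate) is only an assumed approximation of its behavior, and gpus_per_node=4 is an illustrative value.

def nearest_in_buckets(value: int, buckets: list[int]) -> int:
    # Assumed behavior: snap the raw estimate to the closest listed bucket.
    return min(buckets, key=lambda b: abs(b - value))

gpus_per_node = 4                                   # illustrative
for token_count in (1, 8, 32, 64, 128, 256, 512):
    lm_head_tp_size_raw = 256 // token_count
    lm_head_tp_size = nearest_in_buckets(lm_head_tp_size_raw, [1, gpus_per_node])
    print(token_count, lm_head_tp_size_raw, lm_head_tp_size)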

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 38 additions & 4 deletions
@@ -1903,7 +1903,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
 
     @skip_pre_blackwell
     @pytest.mark.parametrize(
-        "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend",
+        "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,enable_lm_head_tp_in_adp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend",
         [
             # Use a larger batch_size to speed up the tests
             pytest.param(8,
@@ -1912,6 +1912,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
                          3,
                          False,
                          False,
+                         False,
                          True,
                          True,
                          32,
@@ -1923,6 +1924,31 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
                          3,
                          False,
                          False,
+                         False,
+                         True,
+                         True,
+                         32,
+                         "TRTLLM",
+                         marks=pytest.mark.skip_less_mpi_world_size(8)),
+            pytest.param(8,
+                         1,
+                         4,
+                         3,
+                         False,
+                         True,
+                         True,
+                         True,
+                         True,
+                         32,
+                         "CUTLASS",
+                         marks=pytest.mark.skip_less_mpi_world_size(8)),
+            pytest.param(8,
+                         1,
+                         4,
+                         3,
+                         False,
+                         True,
+                         True,
                          True,
                          True,
                          32,
@@ -1934,6 +1960,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
                          0,
                          True,
                          True,
+                         False,
                          True,
                          True,
                          32,
@@ -1945,6 +1972,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
                          0,
                          True,
                          True,
+                         False,
                          True,
                          True,
                          32,
@@ -1956,6 +1984,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
                          0,
                          True,
                          True,
+                         False,
                          True,
                          True,
                          16,
@@ -1967,6 +1996,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
                          1,
                          True,
                          True,
+                         False,
                          True,
                          True,
                          32,
@@ -1978,19 +2008,22 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
                          1,
                          True,
                          True,
+                         False,
                          True,
                          True,
                          8,
                          "CUTLASS",
                          marks=pytest.mark.skip_less_mpi_world_size(8)),
         ],
         ids=[
-            "latency", "latency_trtllmgen", "throughput", "throughput_tp8",
+            "latency", "latency_trtllmgen", "latency_adp_lmtp",
+            "latency_trtllmgen_adp_lmtp", "throughput", "throughput_tp8",
             "throughput_tp4", "throughput_mtp", "throughput_bs8_mtp"
         ])
     def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
-                              attention_dp, cuda_graph, overlap_scheduler,
-                              max_batch_size, moe_backend):
+                              attention_dp, enable_lm_head_tp_in_adp,
+                              cuda_graph, overlap_scheduler, max_batch_size,
+                              moe_backend):
         if moe_backend == "TRTLLM" and (get_sm_version() == 120
                                         or get_sm_version() == 121):
             pytest.skip(
@@ -2016,6 +2049,7 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                     kv_cache_config=kv_cache_config,
                     **pytorch_config,
                     enable_attention_dp=attention_dp,
+                    enable_lm_head_tp_in_adp=enable_lm_head_tp_in_adp,
                     speculative_config=mtp_config) as llm:
 
             assert llm.args.moe_config.backend == moe_backend
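Outside the parametrization, the new flag simply flows into the LLM constructor next to enable_attention_dp, as the last hunk shows. Below is a hedged sketch of what the new latency_adp_lmtp configuration amounts to at the API level; the model path and the omission of the test's other kwargs (kv_cache_config, speculative_config, MoE backend selection) are illustrative rather than copied from the test.

from tensorrt_llm import LLM

# Illustrative: roughly what the latency_adp_lmtp test case configures.
llm = LLM(
    model="deepseek-ai/DeepSeek-R1",        # placeholder model path
    tensor_parallel_size=8,
    moe_expert_parallel_size=4,
    enable_attention_dp=True,               # attention data parallelism (ADP)
    enable_lm_head_tp_in_adp=True,          # the flag exercised by the new *_adp_lmtp ids
)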

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 2 additions & 0 deletions
@@ -483,6 +483,8 @@ accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
 accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4]

tests/integration/test_lists/qa/llm_function_core_sanity.txt

Lines changed: 2 additions & 0 deletions
@@ -34,6 +34,8 @@ accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput]

tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml

Lines changed: 2 additions & 0 deletions
@@ -34,6 +34,8 @@ l0_gb200_multi_nodes:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] TIMEOUT (90)
