
Commit 7fe7dab

[TRTLLM-10060][feat] Enable attention dp for Nemotron Super v3.
Signed-off-by: nv-guomingz <[email protected]>
1 parent 74832a1 commit 7fe7dab

File tree (5 files changed: 91 additions, 36 deletions)

    tensorrt_llm/_torch/models/checkpoints/hf/nemotron_h_weight_mapper.py
    tensorrt_llm/_torch/models/modeling_nemotron_h.py
    tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py
    tensorrt_llm/_torch/modules/mlp.py
    tensorrt_llm/_torch/pyexecutor/mamba_cache_manager.py

tensorrt_llm/_torch/models/checkpoints/hf/nemotron_h_weight_mapper.py
20 additions, 14 deletions

@@ -13,6 +13,12 @@ def preprocess_weights(self, weights: dict) -> dict:
         config = self.config.pretrained_config
         tp_size = self.config.mapping.tp_size
         tp_rank = self.config.mapping.tp_rank
+        enable_attention_dp = self.config.mapping.enable_attention_dp
+
+        # For Mamba2 layers, use tp_size=1 when attention DP is enabled
+        mamba_tp_size = 1 if enable_attention_dp else tp_size
+        mamba_tp_rank = 0 if enable_attention_dp else tp_rank
+
         d_inner = config.mamba_head_dim * config.mamba_num_heads
 
         def _split_mamba2_mixer_in_proj(w: torch.Tensor) -> torch.Tensor:
@@ -24,12 +30,12 @@ def _split_mamba2_mixer_in_proj(w: torch.Tensor) -> torch.Tensor:
                 ],
                 dim=0)
             w = []
-            for rank in range(tp_size):
-                in_proj_z_rank = split(in_proj_z, tp_size, rank)
-                in_proj_x_rank = split(in_proj_x, tp_size, rank)
-                in_proj_b_rank = split(in_proj_b, tp_size, rank)
-                in_proj_c_rank = split(in_proj_c, tp_size, rank)
-                in_proj_dt_rank = split(in_proj_dt, tp_size, rank)
+            for rank in range(mamba_tp_size):
+                in_proj_z_rank = split(in_proj_z, mamba_tp_size, rank)
+                in_proj_x_rank = split(in_proj_x, mamba_tp_size, rank)
+                in_proj_b_rank = split(in_proj_b, mamba_tp_size, rank)
+                in_proj_c_rank = split(in_proj_c, mamba_tp_size, rank)
+                in_proj_dt_rank = split(in_proj_dt, mamba_tp_size, rank)
                 y = torch.concat([
                     in_proj_z_rank, in_proj_x_rank, in_proj_b_rank,
                     in_proj_c_rank, in_proj_dt_rank
@@ -67,16 +73,16 @@ def _split_mamba2_mixer_in_proj(w: torch.Tensor) -> torch.Tensor:
                 else:
                     new_weights[key] = weights[name]
             elif "A" in key:
-                w = split(weights[name], tp_size, tp_rank)
+                w = split(weights[name], mamba_tp_size, mamba_tp_rank)
                 w = w.to(torch.float32)
                 w = -torch.exp(w)
                 new_weights[key] = w
             elif "D" in key:
-                w = split(weights[name], tp_size, tp_rank)
+                w = split(weights[name], mamba_tp_size, mamba_tp_rank)
                 w = w.to(torch.float32)
                 new_weights[key] = w
             elif "dt_bias" in key:
-                w = split(weights[name], tp_size, tp_rank)
+                w = split(weights[name], mamba_tp_size, mamba_tp_rank)
                 w = w.to(torch.float32)
                 new_weights[key] = w
             elif "mixer.in_proj" in key:
@@ -91,16 +97,16 @@ def _split_mamba2_mixer_in_proj(w: torch.Tensor) -> torch.Tensor:
                     w, [d_inner, n_groups * d_state, n_groups * d_state], dim=0)
 
                 w = []
-                for rank in range(tp_size):
-                    conv_x_rank = split(conv_x, tp_size, rank)
-                    conv_b_rank = split(conv_b, tp_size, rank)
-                    conv_c_rank = split(conv_c, tp_size, rank)
+                for rank in range(mamba_tp_size):
+                    conv_x_rank = split(conv_x, mamba_tp_size, rank)
+                    conv_b_rank = split(conv_b, mamba_tp_size, rank)
+                    conv_c_rank = split(conv_c, mamba_tp_size, rank)
                     y = torch.concat([conv_x_rank, conv_b_rank, conv_c_rank])
                     w.append(y)
                 w = torch.concat(w).contiguous()
                 new_weights[key] = w
             elif "mixer.norm.weight" in key:
-                w = split(weights[name], tp_size, tp_rank)
+                w = split(weights[name], mamba_tp_size, mamba_tp_rank)
                 new_weights[key] = w
             # Remap MoE expert weights.
             elif "mixer.experts." in key:

tensorrt_llm/_torch/models/modeling_nemotron_h.py
36 additions, 13 deletions

@@ -32,7 +32,7 @@
 from ..modules.decoder_layer import DecoderLayer
 from ..modules.embedding import Embedding
 from ..modules.fused_moe import MoEWeightLoadingMode, create_moe
-from ..modules.linear import Linear
+from ..modules.linear import Linear, TensorParallelMode
 from ..modules.mamba.mamba2_mixer import Mamba2Mixer
 from ..modules.mlp import MLP
 from ..modules.multi_stream_utils import maybe_execute_in_parallel
@@ -85,8 +85,10 @@ def __init__(
         self,
         model_config: ModelConfig[NemotronHConfig],
         layer_idx: int,
+        reduce_output: bool = False,
     ):
         config = model_config.pretrained_config
+
         super().__init__(
             hidden_size=config.hidden_size,
             num_attention_heads=config.num_attention_heads,
@@ -97,6 +99,7 @@ def __init__(
             layer_idx=layer_idx,
             dtype=config.torch_dtype,
             config=model_config,
+            reduce_output=reduce_output,
         )
 
     def forward(
@@ -154,6 +157,7 @@ def __init__(
         shared_expert_intermediate_size = (
             config.moe_shared_expert_intermediate_size *
             config.n_shared_experts)
+
         self.shared_experts = MLP(
             hidden_size=config.hidden_size,
             intermediate_size=shared_expert_intermediate_size,
@@ -193,11 +197,14 @@
             activation_type=self.activation_type,
         )
 
-        # AllReduce for combining shared and routed expert outputs in multi-GPU settings.
-        self.allreduce = AllReduce(
-            mapping=model_config.mapping,
-            strategy=model_config.allreduce_strategy,
-        )
+        if not model_config.mapping.enable_attention_dp:
+            # AllReduce for combining shared and routed expert outputs in multi-GPU settings.
+            self.allreduce = AllReduce(
+                mapping=model_config.mapping,
+                strategy=model_config.allreduce_strategy,
+            )
+        else:
+            self.allreduce = None
 
         # Setup latent projection layers.
         if self.use_latent_moe:
@@ -314,7 +321,11 @@ def __init__(
         elif layer_type == "-":
             self.mixer = MLPLayer(model_config, layer_idx)
         elif layer_type == "*":
-            self.mixer = TransformerLayer(model_config, layer_idx)
+            self.mixer = TransformerLayer(
+                model_config,
+                layer_idx,
+                reduce_output=not model_config.mapping.enable_attention_dp
+                and model_config.mapping.tp_size > 1)
         elif layer_type == "E":
             self.mixer = NemotronHMOE(model_config,
                                       layer_idx=layer_idx,
@@ -357,12 +368,24 @@ def __init__(self, model_config: ModelConfig[NemotronHConfig]):
             aux_stream_list[2],
         }
 
-        # calculate embeddings
-        self.embed_tokens = Embedding(
-            config.vocab_size,
-            config.hidden_size,
-            dtype=config.torch_dtype,
-        )
+        if model_config.mapping.enable_attention_dp:
+            # When attention_dp is enabled, we cannot do all_reduce since
+            # the problem size of different ranks are different.
+            # So, we don't do parallelism here.
+            self.embed_tokens = Embedding(
+                config.vocab_size,
+                config.hidden_size,
+                dtype=config.torch_dtype,
+            )
+        else:
+            self.embed_tokens = Embedding(
+                config.vocab_size,
+                config.hidden_size,
+                dtype=config.torch_dtype,
+                mapping=model_config.mapping,
+                tensor_parallel_mode=TensorParallelMode.COLUMN,
+                gather_output=True,
+            )
 
         # create layers
         layers = []
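The pattern throughout this file is: only reduce across ranks when tensor parallelism actually split the computation, and never when attention DP is on, because each rank then carries a different number of tokens. The toy module below illustrates just that decision; the class and field names are invented for the example and are not the TensorRT-LLM API.

import torch
from torch import nn


class ToyParallelBlock(nn.Module):
    """Illustrative only: reduce the output only when TP actually shards the math."""

    def __init__(self, hidden: int, tp_size: int, enable_attention_dp: bool):
        super().__init__()
        self.proj = nn.Linear(hidden, hidden)
        # Mirrors reduce_output / self.allreduce above: no cross-rank reduction
        # under attention DP, since ranks hold different-sized batches.
        self.needs_allreduce = tp_size > 1 and not enable_attention_dp

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        y = self.proj(x)
        if self.needs_allreduce and torch.distributed.is_initialized():
            torch.distributed.all_reduce(y)
        return y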

tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py
17 additions, 2 deletions

@@ -57,8 +57,23 @@ def __init__(
 
         config = config or ModelConfig()
         self.mapping = config.mapping
-        tp_rank = config.mapping.tp_rank
-        tp_size = config.mapping.tp_size
+
+        if config.mapping.enable_attention_dp:
+            from tensorrt_llm.mapping import Mapping
+            self.mapping = Mapping(
+                world_size=config.mapping.pp_size,
+                tp_size=1,
+                pp_size=config.mapping.pp_size,
+                rank=config.mapping.rank,
+                gpus_per_node=config.mapping.gpus_per_node,
+                enable_attention_dp=True,
+            )
+            tp_size = 1
+            tp_rank = 0
+        else:
+            self.mapping = config.mapping
+            tp_size = config.mapping.tp_size
+            tp_rank = config.mapping.tp_rank
 
         d_inner = head_dim * nheads
         d_in_proj = 2 * d_inner + 2 * n_groups * d_state + nheads
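A back-of-the-envelope view of what the tp_size=1 override means for this module, using made-up sizes rather than values from the commit. The d_in_proj formula is the one visible in the context lines above; the real code splits each component (z, x, B, C, dt) separately, but when every component divides evenly the per-rank width is simply d_in_proj // tp.

def in_proj_width_per_rank(head_dim: int, nheads: int, n_groups: int,
                           d_state: int, tp_size: int,
                           enable_attention_dp: bool) -> int:
    # Same formula as above; attention DP forces the effective TP to 1,
    # so the whole in_proj lives on every rank.
    d_inner = head_dim * nheads
    d_in_proj = 2 * d_inner + 2 * n_groups * d_state + nheads
    effective_tp = 1 if enable_attention_dp else tp_size
    return d_in_proj // effective_tp


# Made-up example sizes:
print(in_proj_width_per_rank(64, 128, 8, 128, tp_size=4, enable_attention_dp=False))  # 4640
print(in_proj_width_per_rank(64, 128, 8, 128, tp_size=4, enable_attention_dp=True))   # 18560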

tensorrt_llm/_torch/modules/mlp.py
17 additions, 6 deletions

@@ -4,6 +4,8 @@
 import torch
 from torch import nn
 
+from tensorrt_llm.mapping import Mapping
+
 from ..model_config import ModelConfig
 from ..peft.lora.layer import LoraLayer, LoraModuleType
 from .linear import Linear, TensorParallelMode, WeightMode, WeightsLoadingConfig
@@ -21,24 +23,33 @@ def __init__(self,
                  config: Optional[ModelConfig] = None,
                  layer_idx: Optional[int] = None,
                  reduce_output: bool = True):
-
+        if config.mapping.enable_attention_dp:
+            mapping = Mapping(
+                world_size=config.mapping.pp_size,
+                tp_size=1,
+                pp_size=config.mapping.pp_size,
+                rank=config.mapping.rank,
+                gpus_per_node=config.mapping.gpus_per_node,
+                enable_attention_dp=True,
+            )
+        else:
+            mapping = config.mapping
         super().__init__()
         self.layer_idx = layer_idx
         self.hidden_size = hidden_size
         self.intermediate_size = intermediate_size
         self.activation = activation
 
         config = config or ModelConfig()
-        self.up_lora = LoraLayer(
-            [LoraModuleType.MLP_H_TO_4H],
-            [self.intermediate_size // config.mapping.tp_size])
+        self.up_lora = LoraLayer([LoraModuleType.MLP_H_TO_4H],
+                                 [self.intermediate_size // mapping.tp_size])
 
         self.up_proj = Linear(
             self.hidden_size,
             self.intermediate_size,
             bias=bias,
             dtype=dtype,
-            mapping=config.mapping,
+            mapping=mapping,
             tensor_parallel_mode=TensorParallelMode.COLUMN,
             weights_loading_config=WeightsLoadingConfig(
                 weight_mode=WeightMode.VANILLA),
@@ -55,7 +66,7 @@ def __init__(self,
             self.hidden_size,
             bias=bias,
             dtype=dtype,
-            mapping=config.mapping,
+            mapping=mapping,
             tensor_parallel_mode=TensorParallelMode.ROW,
             quant_config=config.get_quant_config(),
             skip_create_weights_in_init=config.skip_create_weights_in_init,
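The MLP applies the same override at module granularity: pick an effective mapping once, then let every size that used to read config.mapping.tp_size read the override instead. Below is a plain-Python sketch with a dataclass standing in for tensorrt_llm.mapping.Mapping; the names are illustrative, not the library API.

from dataclasses import dataclass


@dataclass
class ToyMapping:
    tp_size: int = 1
    pp_size: int = 1
    rank: int = 0
    enable_attention_dp: bool = False


def effective_mlp_mapping(mapping: ToyMapping) -> ToyMapping:
    # Under attention DP the MLP builds a fresh mapping with tp_size=1,
    # so its column/row-parallel linears are not sharded.
    if mapping.enable_attention_dp:
        return ToyMapping(tp_size=1, pp_size=mapping.pp_size, rank=mapping.rank,
                          enable_attention_dp=True)
    return mapping


def lora_up_out_features(intermediate_size: int, mapping: ToyMapping) -> int:
    # The LoRA H->4H slice must follow the same effective tp_size as up_proj.
    return intermediate_size // effective_mlp_mapping(mapping).tp_size


print(lora_up_out_features(14336, ToyMapping(tp_size=4)))                            # 3584
print(lora_up_out_features(14336, ToyMapping(tp_size=4, enable_attention_dp=True)))  # 14336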

tensorrt_llm/_torch/pyexecutor/mamba_cache_manager.py
1 addition, 1 deletion

@@ -45,7 +45,7 @@ def __init__(
         self.mamba_ssm_cache_dtype = ssm_cache_dtype
 
         # get tp size
-        tp_size = mapping.tp_size
+        tp_size = mapping.tp_size if not mapping.enable_attention_dp else 1
 
         # derive mamba parameters for conv and ssm states
         d_inner = head_dim * num_heads
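The one-line change above propagates the same rule into cache sizing. A minimal, illustrative sketch (the helper name is hypothetical, not the cache manager's code): under attention DP each rank caches state for the full set of Mamba heads rather than a 1/tp shard.

def cached_mamba_heads_per_rank(num_heads: int, tp_size: int,
                                enable_attention_dp: bool) -> int:
    # Same fallback as in the diff: size per-rank state with tp_size=1
    # when attention DP is enabled.
    tp = tp_size if not enable_attention_dp else 1
    return num_heads // tp


assert cached_mamba_heads_per_rank(128, tp_size=8, enable_attention_dp=False) == 16
assert cached_mamba_heads_per_rank(128, tp_size=8, enable_attention_dp=True) == 128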
