 from ..modules.decoder_layer import DecoderLayer
 from ..modules.embedding import Embedding
 from ..modules.fused_moe import MoEWeightLoadingMode, create_moe
-from ..modules.linear import Linear
+from ..modules.linear import Linear, TensorParallelMode
 from ..modules.mamba.mamba2_mixer import Mamba2Mixer
 from ..modules.mlp import MLP
 from ..modules.multi_stream_utils import maybe_execute_in_parallel
@@ -85,8 +85,10 @@ def __init__(
         self,
         model_config: ModelConfig[NemotronHConfig],
         layer_idx: int,
+        reduce_output: bool = False,
     ):
         config = model_config.pretrained_config
+
         super().__init__(
             hidden_size=config.hidden_size,
             num_attention_heads=config.num_attention_heads,
@@ -97,6 +99,7 @@ def __init__(
             layer_idx=layer_idx,
             dtype=config.torch_dtype,
             config=model_config,
+            reduce_output=reduce_output,
         )

     def forward(
@@ -154,6 +157,7 @@ def __init__(
         shared_expert_intermediate_size = (
             config.moe_shared_expert_intermediate_size *
             config.n_shared_experts)
+
         self.shared_experts = MLP(
             hidden_size=config.hidden_size,
             intermediate_size=shared_expert_intermediate_size,
@@ -163,7 +167,8 @@ def __init__(
             config=model_config,
             layer_idx=self.layer_idx,
             reduce_output=False,
-        )
+            overridden_tp_size=1
+            if model_config.mapping.enable_attention_dp else None)
         # Setup MoE gate.
         self.gate = DeepseekV3Gate(
             self.hidden_size,
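
The shared-expert MLP above now passes overridden_tp_size=1 when attention DP is enabled, i.e. every rank keeps a full, unsharded copy of the shared expert instead of splitting its intermediate dimension across TP ranks. A rough single-process sketch of what that effective TP size means for the per-rank weight shape (shared_expert_shard is a toy helper, not part of TensorRT-LLM):

def shared_expert_shard(hidden: int, intermediate: int, tp_size: int,
                        enable_attention_dp: bool) -> tuple:
    """Per-rank up-projection shape of the shared-expert MLP (toy model).

    With attention DP the override forces an effective TP size of 1, so the
    weight is replicated; otherwise the intermediate dim is split over ranks.
    """
    effective_tp = 1 if enable_attention_dp else tp_size
    return (hidden, intermediate // effective_tp)


# tp_size=4: sharded in the plain TP case, replicated under attention DP.
assert shared_expert_shard(4096, 16384, 4, enable_attention_dp=False) == (4096, 4096)
assert shared_expert_shard(4096, 16384, 4, enable_attention_dp=True) == (4096, 16384)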
@@ -193,11 +198,14 @@ def __init__(
             activation_type=self.activation_type,
         )

-        # AllReduce for combining shared and routed expert outputs in multi-GPU settings.
-        self.allreduce = AllReduce(
-            mapping=model_config.mapping,
-            strategy=model_config.allreduce_strategy,
-        )
+        if not model_config.mapping.enable_attention_dp:
+            # AllReduce for combining shared and routed expert outputs in multi-GPU settings.
+            self.allreduce = AllReduce(
+                mapping=model_config.mapping,
+                strategy=model_config.allreduce_strategy,
+            )
+        else:
+            self.allreduce = None

         # Setup latent projection layers.
         # These layers should NOT be TP-sharded to ensure MoE receives
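
With this hunk the all-reduce that merges the shared-expert and routed-expert outputs only exists when attention DP is off; under attention DP each rank processes a different number of tokens, so the collective is skipped and results stay rank-local. A minimal sketch of how a forward path can consume such an optional handle (toy class, not the actual NemotronHMOE.forward):

import torch


class ToyMoECombine:
    """Toy illustration of an optional all-reduce handle on the MoE output."""

    def __init__(self, enable_attention_dp: bool):
        # Mirror the diff: only create the reducer when it can be used.
        self.allreduce = None if enable_attention_dp else self._tp_allreduce

    @staticmethod
    def _tp_allreduce(t: torch.Tensor) -> torch.Tensor:
        # Stand-in for a real tensor-parallel all-reduce.
        if torch.distributed.is_available() and torch.distributed.is_initialized():
            torch.distributed.all_reduce(t)
        return t

    def combine(self, shared_out: torch.Tensor, routed_out: torch.Tensor) -> torch.Tensor:
        out = shared_out + routed_out
        if self.allreduce is not None:
            out = self.allreduce(out)
        return out


block = ToyMoECombine(enable_attention_dp=True)
x = torch.randn(3, 8)
assert block.combine(x, x).shape == (3, 8)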
@@ -322,7 +330,11 @@ def __init__(
         elif layer_type == "-":
             self.mixer = MLPLayer(model_config, layer_idx)
         elif layer_type == "*":
-            self.mixer = TransformerLayer(model_config, layer_idx)
+            self.mixer = TransformerLayer(
+                model_config,
+                layer_idx,
+                reduce_output=not model_config.mapping.enable_attention_dp
+                and model_config.mapping.tp_size > 1)
         elif layer_type == "E":
             self.mixer = NemotronHMOE(model_config,
                                       layer_idx=layer_idx,
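
The transformer (attention) layers now receive reduce_output only when tensor parallelism is actually active and attention DP is off; otherwise the output projection keeps its local result and no collective is issued. A minimal sketch of that pattern under the stated assumption (ToyAttention and make_layer are illustrative stand-ins, not the TensorRT-LLM Attention base class):

import torch
import torch.nn as nn


class ToyAttention(nn.Module):
    """Toy attention whose output all-reduce the caller can switch off."""

    def __init__(self, hidden_size: int, reduce_output: bool = False):
        super().__init__()
        self.o_proj = nn.Linear(hidden_size, hidden_size, bias=False)
        self.reduce_output = reduce_output

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = self.o_proj(x)
        if (self.reduce_output and torch.distributed.is_available()
                and torch.distributed.is_initialized()):
            # Sum the TP-partial projection; skipped for attention DP or tp_size == 1.
            torch.distributed.all_reduce(out)
        return out


def make_layer(hidden_size: int, tp_size: int, enable_attention_dp: bool) -> ToyAttention:
    # Same selection as in the diff: reduce only when TP > 1 and DP is off.
    return ToyAttention(hidden_size,
                        reduce_output=not enable_attention_dp and tp_size > 1)


assert make_layer(16, tp_size=4, enable_attention_dp=False).reduce_output
assert not make_layer(16, tp_size=4, enable_attention_dp=True).reduce_output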
@@ -365,12 +377,24 @@ def __init__(self, model_config: ModelConfig[NemotronHConfig]):
             aux_stream_list[2],
         }

-        # calculate embeddings
-        self.embed_tokens = Embedding(
-            config.vocab_size,
-            config.hidden_size,
-            dtype=config.torch_dtype,
-        )
+        if model_config.mapping.enable_attention_dp:
+            # When attention_dp is enabled, we cannot do an all_reduce since
+            # the problem size differs across ranks, so the embedding is
+            # not parallelized here.
+            self.embed_tokens = Embedding(
+                config.vocab_size,
+                config.hidden_size,
+                dtype=config.torch_dtype,
+            )
+        else:
+            self.embed_tokens = Embedding(
+                config.vocab_size,
+                config.hidden_size,
+                dtype=config.torch_dtype,
+                mapping=model_config.mapping,
+                tensor_parallel_mode=TensorParallelMode.COLUMN,
+                gather_output=True,
+            )

         # create layers
         layers = []
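
The comment spells out the constraint behind the branch: a tensor-parallel embedding ends in a collective (all-reduce or all-gather), which requires every rank to contribute a tensor of the same shape, and under attention DP each rank holds a different number of tokens. A rough single-process sketch of the column-parallel lookup selected in the non-DP branch (column_parallel_embed is a hypothetical helper, not the TensorRT-LLM Embedding/TensorParallelMode implementation):

import torch


def column_parallel_embed(weight: torch.Tensor, ids: torch.Tensor,
                          tp_size: int, tp_rank: int,
                          gather_output: bool = True) -> torch.Tensor:
    """Toy column-parallel embedding: the hidden dimension is sharded.

    Each rank looks up only its slice of every embedding vector; with
    gather_output=True the slices are concatenated back (single-process
    stand-in for an all-gather over TP ranks). That all-gather needs the
    same token count on every rank, which attention DP does not guarantee,
    hence the plain, unsharded Embedding in the attention-DP branch.
    """
    shard = weight.shape[1] // tp_size
    if not gather_output:
        return weight[:, tp_rank * shard:(tp_rank + 1) * shard][ids]
    slices = [weight[:, r * shard:(r + 1) * shard][ids] for r in range(tp_size)]
    return torch.cat(slices, dim=-1)


vocab, hidden = 16, 8
w = torch.randn(vocab, hidden)
ids = torch.tensor([1, 3, 5])
# With the gather, the sharded lookup reproduces the full embedding rows.
assert torch.equal(column_parallel_embed(w, ids, tp_size=2, tp_rank=0), w[ids])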