@@ -272,22 +272,15 @@ def __init__(self,
             model_config=model_config,
             aux_stream=aux_stream_dict[AuxStreamType.MoeChunkingOverlap])

-        self.shared_output_scale = None
-        # The block scale size is 128, which requires shared_expert_intermediate_size to be divisible by 128.
-        assert shared_expert_intermediate_size % 128 == 0
-        if self.use_dp:
-            # If using attention DP, the shared experts also use DP instead of TP.
-            shared_tp_size = 1
-        else:
-            # Due to the restriction of block scale size (i.e., 128), the supported TP sizes only include 1, 2, 4, 8, and 16.
-            # The math.gcd operation ensures that shared_tp_size falls in the supported TP sizes.
-            shared_tp_size = math.gcd(
-                shared_expert_intermediate_size // 128,
-                model_config.mapping.tp_size,
-            )
-        # If shared_tp_size has been overridden, the output of shared experts needs to be scaled down accordingly before all-reduce.
-        if shared_tp_size != model_config.mapping.tp_size:
-            self.shared_output_scale = shared_tp_size / model_config.mapping.tp_size
+        self.mapping = model_config.mapping
+
+        # FIXME: incompatible with mixed quantization mode (including excluding modules from quantization)
+        block_size = 1
+        if model_config.quant_config and model_config.quant_config.group_size is not None:
+            block_size = model_config.quant_config.group_size
+
+        shared_tp_size, self.shared_output_scale = self._compute_shared_expert_tp_size(
+            shared_expert_intermediate_size, block_size)

         self.shared_experts = GatedMLP(
             hidden_size=hidden_size,
@@ -298,14 +291,49 @@ def __init__(self,
             overridden_tp_size=shared_tp_size,
             reduce_output=False)

-        self.mapping = model_config.mapping
         self.all_reduce = AllReduce(self.mapping)
         self.aux_stream = aux_stream_dict[AuxStreamType.MoeShared]
         self.event_dict = {
             key: torch.cuda.Event()
             for key in [EventType.Main, EventType.MoeShared]
         }

+    def _compute_shared_expert_tp_size(self, intermediate_size: int,
+                                       block_size: int) -> tuple:
+        """
+        In the case of DeepSeek-R1, the TP size of the shared experts is capped by
+        intermediate_size // block_size. For example, when intermediate_size is 2048 and the
+        block scale size is 128, the supported TP sizes are limited to {1, 2, 4, 8, 16} because 2048 / 128 = 16.
+
+        Args:
+            intermediate_size (int): The intermediate size of the shared experts.
+            block_size (int): The quantization block scale size. For the DeepSeek FP8 recipe
+                it is 128; for NVFP4 it is 16.
+
+        Returns:
+            tuple: The computed TP size and the scale to apply to the shared-expert output
+                before the all-reduce (None if no scaling is needed).
+        """
+
+        assert intermediate_size % block_size == 0, "intermediate_size must be divisible by block_size."
+
+        shared_output_scale = None
+        if self.use_dp:
+            # If using attention DP, the shared experts also use DP instead of TP.
+            shared_tp_size = 1
+        else:
+            # The TP size is capped by intermediate_size // block_size (e.g., only 1, 2, 4, 8, and 16 when the quotient is 16).
+            # The math.gcd operation ensures that shared_tp_size falls in the supported TP sizes.
+            shared_tp_size = math.gcd(
+                intermediate_size // block_size,
+                self.mapping.tp_size,
+            )
+        # If shared_tp_size has been overridden, the output of the shared experts needs to be scaled down accordingly before the all-reduce.
+        if shared_tp_size != self.mapping.tp_size:
+            shared_output_scale = shared_tp_size / self.mapping.tp_size
+
+        return shared_tp_size, shared_output_scale
+
     def compute_routed_output(self, hidden_states, hidden_states_fp4,
                               all_rank_num_tokens, min_latency_mode):
         # max-throughput
@@ -405,6 +433,7 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig],
405433 "0" ) == "0"
406434 self .enable_fusion = enable_fusion and not self .enable_attention_dp
407435
436+ # FIXME: incompatible with mixed quantization mode (including excluding modules from quantization)
408437 self .is_nvfp4 = model_config .quant_config .layer_quant_mode .has_nvfp4 ()
409438 has_tp = mapping .has_tp ()
410439 has_pp = mapping .has_pp ()
@@ -427,22 +456,11 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig],
                 model_config=model_config,
                 aux_stream_dict=aux_stream_dict)
         else:
-            # The block scale size is 128, which requires intermediate_size to be divisible by 128.
-            assert config.intermediate_size % 128 == 0
-            if self.enable_attention_dp:
-                # If using attention DP, the MLP also uses DP instead of TP.
-                self.mlp_tp_size = 1
-            else:
-                # Due to the restriction of block scale size (i.e., 128), the supported TP sizes only include 1, 2, 4, 8, and 16.
-                # To avoid the costly inter-node all-reduce, we further restrict TP size to be divisible by gpus_per_node.
-                # The two math.gcd operations ensure that mlp_tp_size falls in the candidate TP sizes.
-                self.mlp_tp_size = math.gcd(
-                    math.gcd(
-                        config.intermediate_size // 128,
-                        mapping.tp_size,
-                    ),
-                    mapping.gpus_per_node,  # Avoid costly inter-node TP
-                )
+            block_size = 1
+            if model_config.quant_config and model_config.quant_config.group_size is not None:
+                block_size = model_config.quant_config.group_size
+            self.mlp_tp_size = self._compute_mlp_tp_size(
+                config.intermediate_size, block_size)

         self.fusion_config.PRE_MLP_FUSION = self.enable_fusion and has_tp and self.is_nvfp4
         self.fusion_config.POST_MLP_FUSION = self.enable_fusion and self.mlp_tp_size > 1 and not has_pp
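
The `block_size` handed to these helpers is derived from the quantization config. Below is a minimal sketch of that fallback, assuming only that `quant_config` exposes an optional `group_size` (128 for the DeepSeek FP8 recipe, 16 for NVFP4, per the docstrings added in this commit); the helper name is hypothetical:

```python
def resolve_block_size(quant_config) -> int:
    # Hypothetical helper mirroring the inline lookup in the diff: default to 1
    # (i.e., no block-scale constraint) when no group size is configured.
    if quant_config is not None and getattr(quant_config, "group_size", None) is not None:
        return quant_config.group_size
    return 1
```

As the FIXME comments note, a single global group size does not cover mixed quantization, where some modules may be excluded from quantization.
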
@@ -479,6 +497,37 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig],
         if not self.deepseek_allreduce_disabled:
             self.deepseek_allreduce = DeepseekAllReduce(self.mapping)

+    def _compute_mlp_tp_size(self, intermediate_size: int,
+                             block_size: int) -> int:
+        """
+        For DeepSeek-R1, the MLP TP size is capped by intermediate_size // block_size and is further
+        capped by gpus_per_node, so that TP stays within a single node and avoids the expensive inter-node all-reduce.
+
+        Args:
+            intermediate_size (int): MLP intermediate size.
+            block_size (int): The quantization block scale size. For the DeepSeek FP8 recipe
+                it is 128; for NVFP4 it is 16.
+
+        Returns:
+            int: The computed TP size.
+        """
+
+        assert intermediate_size % block_size == 0, "intermediate_size must be divisible by block_size."
+
+        if self.enable_attention_dp:
+            # If using attention DP, the MLP also uses DP instead of TP.
+            mlp_tp_size = 1
+        else:
+            # The two math.gcd operations ensure that mlp_tp_size falls in the candidate TP sizes.
+            mlp_tp_size = math.gcd(
+                math.gcd(
+                    intermediate_size // block_size,
+                    self.mapping.tp_size,
+                ),
+                self.mapping.gpus_per_node,  # Avoid costly inter-node TP
+            )
+        return mlp_tp_size
+
     def forward(
         self,
         position_ids: torch.LongTensor,
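
Similarly, a worked example of the gcd chain in `_compute_mlp_tp_size`, with illustrative DeepSeek-R1-style numbers (the 18432 intermediate size and the 16-way TP group spanning two 8-GPU nodes are assumptions made for the example, not values asserted by the diff):

```python
import math

# Assumed, illustrative values.
intermediate_size = 18432   # dense MLP intermediate size
block_size = 128            # FP8 block scale size
tp_size = 16                # TP group spanning two nodes
gpus_per_node = 8

mlp_tp_size = math.gcd(
    math.gcd(intermediate_size // block_size, tp_size),  # gcd(144, 16) = 16
    gpus_per_node,                                        # gcd(16, 8)   = 8
)

# The result divides gpus_per_node, so the MLP all-reduce stays within one node.
print(mlp_tp_size)  # 8
```
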