Commit 17eba98

Refactor Deepseek tp_size calculation (NVIDIA#3695)
Signed-off-by: Hao Lu <[email protected]>
1 parent d51ae53 commit 17eba98

File tree

2 files changed: +88, -33 lines

tensorrt_llm/_torch/model_config.py

Lines changed: 6 additions & 0 deletions

@@ -126,6 +126,12 @@ def _load_json_quant_config(key: str):
             quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
             quant_config.exclude_modules = ["*eh_proj"]
 
+            block_size = hf_quant_config.get("weight_block_size", [])
+            assert tuple(block_size) == (
+                128,
+                128), "FP8_BLOCK_SCALES only supports block_size=(128,128)"
+            quant_config.group_size = block_size[0]
+
         return cls(pretrained_config=pretrained_config,
                    quant_config=quant_config,
                    quant_config_dict=layer_quant_config,
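
As a rough illustration of what the added lines do (the dict below is a made-up stand-in for a HF quantization config, not part of the commit), the block shape is read from `weight_block_size`, validated, and its first dimension becomes `group_size`:

```python
# Hypothetical hf_quant_config, mimicking a DeepSeek FP8 block-scale checkpoint.
hf_quant_config = {"quant_method": "fp8", "weight_block_size": [128, 128]}

block_size = hf_quant_config.get("weight_block_size", [])
assert tuple(block_size) == (128, 128), \
    "FP8_BLOCK_SCALES only supports block_size=(128,128)"
group_size = block_size[0]  # stored on the quant config; consumed later as quant_config.group_size
print(group_size)  # 128
```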

tensorrt_llm/_torch/models/modeling_deepseekv3.py

Lines changed: 82 additions & 33 deletions

@@ -272,22 +272,15 @@ def __init__(self,
             model_config=model_config,
             aux_stream=aux_stream_dict[AuxStreamType.MoeChunkingOverlap])
 
-        self.shared_output_scale = None
-        # The block scale size is 128, which requires shared_expert_intermediate_size to be divisible by 128.
-        assert shared_expert_intermediate_size % 128 == 0
-        if self.use_dp:
-            # If using attention DP, the shared experts also use DP instead of TP.
-            shared_tp_size = 1
-        else:
-            # Due to the restriction of block scale size (i.e., 128), the supported TP sizes only include 1, 2, 4, 8, and 16.
-            # The math.gcd operation ensures that shared_tp_size falls in the supported TP sizes.
-            shared_tp_size = math.gcd(
-                shared_expert_intermediate_size // 128,
-                model_config.mapping.tp_size,
-            )
-        # If shared_tp_size has been overridden, the output of shared experts needs to be scaled down accordingly before all-reduce.
-        if shared_tp_size != model_config.mapping.tp_size:
-            self.shared_output_scale = shared_tp_size / model_config.mapping.tp_size
+        self.mapping = model_config.mapping
+
+        # FIXME: incompatible with mixed quantization mode (including excluding modules from quantization)
+        block_size = 1
+        if model_config.quant_config and model_config.quant_config.group_size is not None:
+            block_size = model_config.quant_config.group_size
+
+        shared_tp_size, self.shared_output_scale = self._compute_shared_expert_tp_size(
+            shared_expert_intermediate_size, block_size)
 
         self.shared_experts = GatedMLP(
             hidden_size=hidden_size,
@@ -298,14 +291,49 @@ def __init__(self,
             overridden_tp_size=shared_tp_size,
             reduce_output=False)
 
-        self.mapping = model_config.mapping
         self.all_reduce = AllReduce(self.mapping)
         self.aux_stream = aux_stream_dict[AuxStreamType.MoeShared]
         self.event_dict = {
            key: torch.cuda.Event()
            for key in [EventType.Main, EventType.MoeShared]
         }
 
+    def _compute_shared_expert_tp_size(self, intermediate_size: int,
+                                       block_size: int) -> int:
+        """
+        In the case of Deepseek-R1, the TP size of MLP is capped by intermediate_size // block_size.
+        For example, when the intermediate_size is 2048 and block scaling size is 128,
+        TP sizes are limited to {1, 2, 4, 8, 16} because of 2048/128 = 16.
+
+        Args:
+            intermediate_size (int): MLP intermediate size.
+            block_size (int): The quantization block scale size. In the case of Deepseek FP8 recipe,
+                it's 128. For NVFP4, it's 16.
+
+        Returns:
+            int: The computed tp_size.
+        """
+
+        assert intermediate_size % block_size == 0, "intermediate_size must be divisible by block_size."
+
+        shared_output_scale = None
+        # The block scale size is 128, which requires shared_expert_intermediate_size to be divisible by 128.
+        if self.use_dp:
+            # If using attention DP, the shared experts also use DP instead of TP.
+            shared_tp_size = 1
+        else:
+            # Due to the restriction of block scale size (i.e., 128), the supported TP sizes only include 1, 2, 4, 8, and 16.
+            # The math.gcd operation ensures that shared_tp_size falls in the supported TP sizes.
+            shared_tp_size = math.gcd(
+                intermediate_size // block_size,
+                self.mapping.tp_size,
+            )
+        # If shared_tp_size has been overridden, the output of shared experts needs to be scaled down accordingly before all-reduce.
+        if shared_tp_size != self.mapping.tp_size:
+            shared_output_scale = shared_tp_size / self.mapping.tp_size
+
+        return shared_tp_size, shared_output_scale
+
     def compute_routed_output(self, hidden_states, hidden_states_fp4,
                               all_rank_num_tokens, min_latency_mode):
         # max-throughput
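
For a concrete feel of the helper's arithmetic, here is a small standalone sketch (not part of the commit) that mirrors the gcd-based capping of the shared-expert TP size, using the docstring's example of intermediate_size 2048 with the FP8 block scale of 128; the TP sizes below are illustrative:

```python
import math

def shared_expert_tp_size(intermediate_size: int, block_size: int, tp_size: int):
    """Mirror of the commit's logic: cap TP by intermediate_size // block_size."""
    shared_tp = math.gcd(intermediate_size // block_size, tp_size)
    # When TP is capped, the shared-expert output must be scaled down before all-reduce.
    scale = None if shared_tp == tp_size else shared_tp / tp_size
    return shared_tp, scale

# intermediate_size=2048, block_size=128 -> at most 16 TP ranks.
for tp in (4, 16, 32):
    print(tp, shared_expert_tp_size(2048, 128, tp))
# 4  -> (4, None)   TP fits, no rescale
# 16 -> (16, None)  exactly at the cap
# 32 -> (16, 0.5)   capped to 16; outputs scaled by 16/32 before all-reduce
```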
@@ -405,6 +433,7 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig],
             "0") == "0"
         self.enable_fusion = enable_fusion and not self.enable_attention_dp
 
+        # FIXME: incompatible with mixed quantization mode (including excluding modules from quantization)
         self.is_nvfp4 = model_config.quant_config.layer_quant_mode.has_nvfp4()
         has_tp = mapping.has_tp()
         has_pp = mapping.has_pp()
@@ -427,22 +456,11 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig],
                 model_config=model_config,
                 aux_stream_dict=aux_stream_dict)
         else:
-            # The block scale size is 128, which requires intermediate_size to be divisible by 128.
-            assert config.intermediate_size % 128 == 0
-            if self.enable_attention_dp:
-                # If using attention DP, the MLP also uses DP instead of TP.
-                self.mlp_tp_size = 1
-            else:
-                # Due to the restriction of block scale size (i.e., 128), the supported TP sizes only include 1, 2, 4, 8, and 16.
-                # To avoid the costly inter-node all-reduce, we further restrict TP size to be divisible by gpus_per_node.
-                # The two math.gcd operations ensure that mlp_tp_size falls in the candidate TP sizes.
-                self.mlp_tp_size = math.gcd(
-                    math.gcd(
-                        config.intermediate_size // 128,
-                        mapping.tp_size,
-                    ),
-                    mapping.gpus_per_node,  # Avoid costly inter-node TP
-                )
+            block_size = 1
+            if model_config.quant_config and model_config.quant_config.group_size is not None:
+                block_size = model_config.quant_config.group_size
+            self.mlp_tp_size = self._compute_mlp_tp_size(
+                config.intermediate_size, block_size)
 
         self.fusion_config.PRE_MLP_FUSION = self.enable_fusion and has_tp and self.is_nvfp4
         self.fusion_config.POST_MLP_FUSION = self.enable_fusion and self.mlp_tp_size > 1 and not has_pp
@@ -479,6 +497,37 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig],
         if not self.deepseek_allreduce_disabled:
             self.deepseek_allreduce = DeepseekAllReduce(self.mapping)
 
+    def _compute_mlp_tp_size(self, intermediate_size: int,
+                             block_size: int) -> int:
+        """
+        For DeepSeek-R1, MLP TP size is limited by intermediate_size // block_size
+        and must also be a multiple of gpus_per_node to avoid expensive inter-node allreduce.
+
+        Args:
+            intermediate_size (int): MLP intermediate size.
+            block_size (int): The quantization block scale size. In the case of Deepseek FP8 recipe,
+                it's 128. For NVFP4, it's 16.
+
+        Returns:
+            int: The computed tp_size.
+        """
+
+        assert intermediate_size % block_size == 0, "intermediate_size must be divisible by block_size."
+
+        if self.enable_attention_dp:
+            # If using attention DP, the MLP also uses DP instead of TP.
+            mlp_tp_size = 1
+        else:
+            # The two math.gcd operations ensure that mlp_tp_size falls in the candidate TP sizes.
+            mlp_tp_size = math.gcd(
+                math.gcd(
+                    intermediate_size // block_size,
+                    self.mapping.tp_size,
+                ),
+                self.mapping.gpus_per_node,  # Avoid costly inter-node TP
+            )
+        return mlp_tp_size
+
     def forward(
         self,
         position_ids: torch.LongTensor,
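
As with the shared experts, a small standalone sketch (not part of the commit) shows how the double gcd keeps the MLP TP size within both the block-scale cap and a single node; the intermediate_size, tp_size, and gpus_per_node values below are illustrative:

```python
import math

def mlp_tp_size(intermediate_size: int, block_size: int, tp_size: int,
                gpus_per_node: int) -> int:
    """Mirror of the commit's logic: cap TP by the block-scale limit and by one node."""
    return math.gcd(
        math.gcd(intermediate_size // block_size, tp_size),
        gpus_per_node,  # avoid costly inter-node TP for the MLP
    )

# Example with block_size=128 and 8 GPUs per node (illustrative values).
print(mlp_tp_size(18432, 128, 8, 8))   # 8  (fits on one node)
print(mlp_tp_size(18432, 128, 16, 8))  # 8  (16 ranks span two nodes; MLP TP capped to 8)
print(mlp_tp_size(2048, 128, 32, 8))   # 8  (block-scale cap is 16, then node cap gives 8)
```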
