Commit 17df5ca

Updated moe names in tests

Signed-off-by: Kinjal Patel <[email protected]>
1 parent: 1c821d8

3 files changed (+41, -44 lines)

modelopt/torch/quantization/mode.py
Lines changed: 1 addition & 4 deletions

@@ -293,10 +293,7 @@ def convert(self) -> ConvertEntrypoint:
         def wrapped_func(model, config, forward_loop=None):
             # Access _calib_func as a class attribute to avoid binding
             # Check if _calib_func is defined as a class attribute
-            calib_results = wrapped_calib_func(
-                model, config, forward_loop, func=self.__class__._calib_func
-            )
-            return calib_results
+            return wrapped_calib_func(model, config, forward_loop, func=self.__class__._calib_func)
 
         return wrapped_func
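Note on the refactor: the preserved comment says _calib_func must be looked up on the class to avoid binding. For readers unfamiliar with the pitfall, here is a minimal, self-contained sketch (names are illustrative, not taken from mode.py) of why instance lookup of a function stored as a class attribute would inject self into the first argument slot:

def calibrate(model, config):
    """Stand-in for a calibration routine stored on a descriptor class."""
    return f"calibrated {model} with {config}"


class QuantDescriptor:
    # A plain function assigned as a class attribute.
    _calib_func = calibrate

    def make_wrapped(self):
        # self._calib_func would be a bound method, so `self` would be passed
        # as the first positional argument (the `model` slot). Looking the
        # attribute up on the class returns the plain function instead.
        func = self.__class__._calib_func
        return lambda model, config: func(model, config)


wrapped = QuantDescriptor().make_wrapped()
print(wrapped("toy-model", "FP8"))  # -> calibrated toy-model with FP8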

tests/_test_utils/torch_dist/plugins/megatron_common.py
Lines changed: 18 additions & 18 deletions

@@ -497,15 +497,15 @@ def convert_maybe_fp8(v):
     )
 
 
-def copy_weights_from_grouped_to_non_grouped(grouped_model, non_grouped_model):
-    """Copy weights from grouped MoE model to non-grouped MoE model."""
-    grouped_state = grouped_model.state_dict()
-    non_grouped_state = non_grouped_model.state_dict()
+def copy_weights_from_grouped_to_non_grouped(te_grouped_moe_model, sequential_moe_model):
+    """Copy weights from TEGrouped MoE model to sequential MoE model."""
+    te_grouped_state = te_grouped_moe_model.state_dict()
+    sequential_state = sequential_moe_model.state_dict()
 
-    # Map grouped weights to non-grouped weights
+    # Map grouped weights to sequential weights
     weight_mapping = {}
-    non_grouped_key_template = "decoder.layers.{}.mlp.experts.local_experts.{}.linear_fc{}.weight"
-    for key, value in grouped_state.items():
+    sequential_key_template = "decoder.layers.{}.mlp.experts.local_experts.{}.linear_fc{}.weight"
+    for key, value in te_grouped_state.items():
         if "experts.linear_fc" in key and "weight" in key:
             # Extract expert index from grouped weight name
             # Format: decoder.layers.X.mlp.experts.linear_fcY.weightZ
@@ -514,19 +514,19 @@ def copy_weights_from_grouped_to_non_grouped(grouped_model, non_grouped_model):
             fc_idx = parts[5]  # Y (linear_fc1 or linear_fc2)
             weight_idx = parts[6]  # Z (weight0, weight1, etc.)
 
-            # Map to non-grouped format: decoder.layers.X.mlp.experts.local_experts.Y.linear_fcZ.weight
+            # Map to sequential format: decoder.layers.X.mlp.experts.local_experts.Y.linear_fcZ.weight
             expert_idx = weight_idx.replace("weight", "")
-            non_grouped_key = non_grouped_key_template.format(layer_idx, expert_idx, fc_idx[-1])
-            weight_mapping[non_grouped_key] = value
+            sequential_key = sequential_key_template.format(layer_idx, expert_idx, fc_idx[-1])
+            weight_mapping[sequential_key] = value
         elif isinstance(value, torch.Tensor):
             weight_mapping[key] = value
 
-    # Copy weights to non-grouped model
-    for non_grouped_key in non_grouped_state:
-        if non_grouped_key in weight_mapping:
-            non_grouped_state[non_grouped_key] = weight_mapping[non_grouped_key].clone()
+    # Copy weights to sequential model
+    for sequential_key in sequential_state:
+        if sequential_key in weight_mapping:
+            sequential_state[sequential_key] = weight_mapping[sequential_key].clone()
 
-    non_grouped_model.load_state_dict(non_grouped_state)
+    sequential_moe_model.load_state_dict(sequential_state)
 
 
 def compare_amax_sync_across_expert_parallel(model):
@@ -549,7 +549,7 @@ def compare_amax_sync_across_expert_parallel(model):
     expert_amax_values = {}
     for name, module in model.named_modules():
         if isinstance(module, mtq.nn.TensorQuantizer) and hasattr(module, "_amax"):
-            # Check for both grouped and non-grouped MoE patterns
+            # Check for both TEGrouped and sequential MoE patterns
             if "local_experts" in name or ("experts" in name and "linear_fc" in name):
                 expert_amax_values[name] = (
                     module.amax.item() if hasattr(module.amax, "item") else module.amax
@@ -569,10 +569,10 @@ def compare_amax_sync_across_expert_parallel(model):
         for name, amax_val in rank_amax.items():
             # Create quantizer type key by normalizing the name
             if "local_experts" in name:
-                # Non-grouped MoE: replace expert index with wildcard
+                # sequential MoE: replace expert index with wildcard
                 quantizer_type = re.sub(r"local_experts\.\d+", "local_experts.*", name)
             else:
-                # Grouped MoE: use the name as-is since experts are grouped
+                # TEGrouped MoE: use the name as-is since experts are grouped
                 quantizer_type = name
 
             if quantizer_type not in expert_quantizers:
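For reference, the name translation performed by copy_weights_from_grouped_to_non_grouped can be exercised on its own. The sketch below is a hypothetical standalone helper (no Megatron required); the layer-index extraction is inferred from the format comment, since that line falls outside the shown hunks. It also demonstrates the wildcard normalization used in compare_amax_sync_across_expert_parallel on an illustrative quantizer name:

import re

# TEGrouped layout:  decoder.layers.X.mlp.experts.linear_fcY.weightZ
# Sequential layout: decoder.layers.X.mlp.experts.local_experts.Z.linear_fcY.weight
SEQUENTIAL_KEY_TEMPLATE = "decoder.layers.{}.mlp.experts.local_experts.{}.linear_fc{}.weight"


def te_grouped_to_sequential_key(key: str) -> str:
    parts = key.split(".")
    layer_idx = parts[2]   # X (assumed; the extraction line is not shown in the diff)
    fc_idx = parts[5]      # "linear_fc1" or "linear_fc2"
    weight_idx = parts[6]  # "weight0", "weight1", ...
    expert_idx = weight_idx.replace("weight", "")
    return SEQUENTIAL_KEY_TEMPLATE.format(layer_idx, expert_idx, fc_idx[-1])


assert (
    te_grouped_to_sequential_key("decoder.layers.0.mlp.experts.linear_fc1.weight2")
    == "decoder.layers.0.mlp.experts.local_experts.2.linear_fc1.weight"
)

# Wildcard normalization for a sequential-MoE quantizer name (name is illustrative):
name = "decoder.layers.0.mlp.experts.local_experts.3.linear_fc1.weight_quantizer"
assert re.sub(r"local_experts\.\d+", "local_experts.*", name) == (
    "decoder.layers.0.mlp.experts.local_experts.*.linear_fc1.weight_quantizer"
)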

tests/gpu/torch/quantization/plugins/test_megatron.py
Lines changed: 22 additions & 22 deletions

@@ -544,8 +544,8 @@ def test_moe_sharded_state_dict(need_8_gpus, tmp_path, config):
     )
 
 
-def _test_grouped_vs_non_grouped_quantize_helper(tp_size, ep_size, etp_size, rank, size):
-    """Test that grouped and non-grouped MoE models produce similar amax values."""
+def _test_te_grouped_vs_sequential_quantize_helper(tp_size, ep_size, etp_size, rank, size):
+    """Test that TEGrouped and sequential MoE models produce similar amax values."""
     initialize_for_megatron(
         tensor_model_parallel_size=tp_size,
         expert_model_parallel_size=ep_size,
@@ -559,8 +559,8 @@ def _test_grouped_vs_non_grouped_quantize_helper(tp_size, ep_size, etp_size, ran
     def forward_fn(model):
         return megatron_prefill(model, prompt_tokens)
 
-    # Create grouped MoE model
-    grouped_moe_model = _gpt_model_provider(
+    # Create TEGrouped MoE model
+    te_grouped_moe_model = _gpt_model_provider(
         tp_size=tp_size,
         ep_size=ep_size,
         etp_size=etp_size,
@@ -569,14 +569,14 @@ def forward_fn(model):
         use_te=True,
         num_moe_experts=4,
     )
-    num_grouped_mlp = sum(
-        isinstance(module, TEGroupedMLP) for module in grouped_moe_model.modules()
+    num_te_grouped_mlp = sum(
+        isinstance(module, TEGroupedMLP) for module in te_grouped_moe_model.modules()
     )
-    assert num_grouped_mlp == 4, (
-        f"TEGrupedMoEModel has {num_grouped_mlp} TEGroupedMLP modules, it should have 4"
+    assert num_te_grouped_mlp == 4, (
+        f"TEGrupedMoEModel has {num_te_grouped_mlp} TEGroupedMLP modules, it should have 4"
     )
 
-    # Create non-grouped MoE model
+    # Create sequential MoE model
     sequential_moe_model = _gpt_model_provider(
         tp_size=tp_size,
         ep_size=ep_size,
@@ -592,37 +592,37 @@ def forward_fn(model):
         f"SequentialMoEModel has {num_sequential_mlp} SequentialMLP modules, it should have 4"
     )
     # Copy weights from grouped to non-grouped model
-    copy_weights_from_grouped_to_non_grouped(grouped_moe_model, sequential_moe_model)
+    copy_weights_from_grouped_to_non_grouped(te_grouped_moe_model, sequential_moe_model)
 
     # Compare model outputs before quantization
-    grouped_moe_output = forward_fn(grouped_moe_model)
-    non_grouped_moe_output = forward_fn(sequential_moe_model)
-    assert torch.allclose(grouped_moe_output, non_grouped_moe_output, atol=1e-6, rtol=1e-6)
+    te_grouped_moe_output = forward_fn(te_grouped_moe_model)
+    sequential_moe_output = forward_fn(sequential_moe_model)
+    assert torch.allclose(te_grouped_moe_output, sequential_moe_output, atol=1e-6, rtol=1e-6)
 
     # Quantize grouped model
-    mtq.quantize(grouped_moe_model, mtq.FP8_DEFAULT_CFG, forward_fn)
+    mtq.quantize(te_grouped_moe_model, mtq.FP8_DEFAULT_CFG, forward_fn)
 
     # Quantize non-grouped model
     mtq.quantize(sequential_moe_model, mtq.FP8_DEFAULT_CFG, forward_fn)
 
     # Compare model outputs after quantization
-    grouped_moe_quant_output = forward_fn(grouped_moe_model)
-    non_grouped_moe_quant_output = forward_fn(sequential_moe_model)
+    te_grouped_moe_quant_output = forward_fn(te_grouped_moe_model)
+    sequential_moe_quant_output = forward_fn(sequential_moe_model)
     assert torch.allclose(
-        grouped_moe_quant_output, non_grouped_moe_quant_output, atol=1e-6, rtol=1e-6
+        te_grouped_moe_quant_output, sequential_moe_quant_output, atol=1e-6, rtol=1e-6
     )
 
 
-def test_grouped_vs_non_grouped_quantize():
-    """Test that grouped and non-grouped MoE models produce similar quantized models."""
+def test_te_grouped_vs_sequential_quantize():
+    """Test that TEGrouped and sequential MoE models produce similar quantized models."""
 
     size = torch.cuda.device_count()
     if size < 4:
         pytest.skip("Requires at least 4 GPUs for expert parallel test")
 
     spawn_multiprocess_job(
         size=size,
-        job=partial(_test_grouped_vs_non_grouped_quantize_helper, 1, 2, 2),
+        job=partial(_test_te_grouped_vs_sequential_quantize_helper, 1, 2, 2),
         backend="nccl",
     )
 
@@ -666,8 +666,8 @@ def forward_fn(model):
         if isinstance(module, mtq.nn.TensorQuantizer):
             # Check if this is an expert quantizer
             is_expert_quantizer = (
-                "local_experts" in name  # Non-grouped MoE
-                or ("experts" in name and "linear_fc" in name)  # Grouped MoE
+                "local_experts" in name  # sequential MoE
+                or ("experts" in name and "linear_fc" in name)  # TEGrouped MoE
            )
 
             if is_expert_quantizer and hasattr(module, "_amax"):