Commit 050e1a5

Add GPT MoE pruning unit test
Signed-off-by: Keval Morabia <[email protected]>
1 parent: dec5105 · commit: 050e1a5

File tree

5 files changed: +129 additions, -10 deletions


CHANGELOG.rst

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ Model Optimizer Changelog (Linux)

 **New Features**

-- Add MoE pruning support for ``num_moe_experts`` and ``moe_shared_expert_intermediate_size`` in Minitron pruning (``mcore_minitron``).
+- Add MoE (e.g. Qwen3-30B-A3B) pruning support for ``num_moe_experts`` and ``moe_shared_expert_intermediate_size`` parameters in Minitron pruning (``mcore_minitron``).

 0.39 (2025-11-07)
 ^^^^^^^^^^^^^^^^^

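For quick reference, a minimal sketch of how the new MoE dimensions are driven through the pruning API, mirroring the unit test added in this commit. The `model` and `forward_loop` objects are assumed to be set up elsewhere (an MCore GPT model with MoE layers and a short calibration loop), and the target values are illustrative.

import modelopt.torch.prune as mtp

# Sketch only: `model` and `forward_loop` are assumed to exist already.
export_config = {
    "num_moe_experts": 4,  # e.g. prune 8 routed experts down to 4
    "moe_shared_expert_intermediate_size": 64,  # e.g. shrink the shared-expert FFN width
}
mtp.prune(
    model,
    mode="mcore_minitron",
    constraints={"export_config": export_config},
    dummy_input=None,  # not used by mcore_minitron
    config={"scores_path": "minitron_scores.pth", "forward_loop": forward_loop},
)
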
examples/megatron-lm/README.md

Lines changed: 4 additions & 2 deletions
@@ -20,7 +20,7 @@
 | Model | Quantization | EAGLE3 | Q-LoRA | Pruning (PP only) | Distillation |
 | :---: | :---: | :---: | :---: | :---: | :---: |
 | `moonshotai/Kimi-K2-Instruct` || **Online** | | | |
-| `Qwen/Qwen3-{30B-A3B, 235B-A22B}` | **WAR** | **Online** | | | |
+| `Qwen/Qwen3-{30B-A3B, 235B-A22B}` | **WAR** | **Online** | | | |
 | `Qwen/Qwen3-{0.6B, 8B}` || **Online** | |||
 | `deepseek-ai/DeepSeek-R1` || **Online** | | | |
 | `meta-llama/Llama-{3.1-8B, 3.1-405B, 3.2-1B}-Instruct` || **Online** | |||
@@ -112,14 +112,16 @@ Coming soon ...

 Checkout pruning [getting started section](../pruning/README.md#getting-started) and [guidelines](../pruning/README.md#pruning-guidelines) for configuring pruning parameters in the pruning README.

-Pruning is supported for GPT and Mamba models in Pipeline Parallel mode. Available pruning options are:
+Pruning is supported for GPT and Mamba models in Pipeline Parallel mode. Available pruning dimensions are:

 - `TARGET_FFN_HIDDEN_SIZE`
 - `TARGET_HIDDEN_SIZE`
 - `TARGET_NUM_ATTENTION_HEADS`
 - `TARGET_NUM_QUERY_GROUPS`
 - `TARGET_MAMBA_NUM_HEADS`
 - `TARGET_MAMBA_HEAD_DIM`
+- `TARGET_NUM_MOE_EXPERTS`
+- `TARGET_MOE_SHARED_EXPERT_INTERMEDIATE_SIZE`
 - `TARGET_NUM_LAYERS`
 - `LAYERS_TO_DROP` (comma separated, 1-indexed list of layer numbers to directly drop)

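The two new MoE dimensions plug into the same Minitron export_config mechanism as the existing ones. Below is a small sketch under the assumption that each TARGET_* knob above corresponds one-to-one to an export_config key consumed by mode="mcore_minitron"; the helper function is illustrative and not part of the example scripts.

import os

def _target(name: str) -> int | None:
    # Read an optional TARGET_* knob from the environment (illustrative helper).
    val = os.environ.get(name)
    return int(val) if val else None

export_config = {
    "ffn_hidden_size": _target("TARGET_FFN_HIDDEN_SIZE"),
    "hidden_size": _target("TARGET_HIDDEN_SIZE"),
    "num_moe_experts": _target("TARGET_NUM_MOE_EXPERTS"),
    "moe_shared_expert_intermediate_size": _target("TARGET_MOE_SHARED_EXPERT_INTERMEDIATE_SIZE"),
}
# Keep only the dimensions that were actually requested.
export_config = {k: v for k, v in export_config.items() if v is not None}
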
modelopt/torch/nas/plugins/megatron.py

Lines changed: 2 additions & 2 deletions
@@ -263,7 +263,7 @@ def set_hidden_size_hp(self, hidden_size: TracedHp) -> None:
         self.linear_fc1.input_size = hidden_size
         self.linear_fc2.output_size = hidden_size

-    def modify(self, ffn_hidden_size_divisor: int) -> None:
+    def modify(self, ffn_hidden_size_divisor: int, **kwargs) -> None:
         """Modify the ffn_hidden_size hparam choices based on search space config."""
         hp_mlp = self.get_hparam(self.hparam_name)
         choices = {int(make_divisible(c, ffn_hidden_size_divisor)) for c in hp_mlp.choices}  # type: ignore[arg-type]
@@ -937,7 +937,7 @@ def modify(
             hp.choices = list(set(hp.choices) & choices | {hp.original})

         # Modify MLP hparam (regular or MoE)
-        elif isinstance(self.mlp, (MLP, MoELayer)):
+        if isinstance(self.mlp, (MLP, MoELayer)):
             self.mlp.modify(
                 ffn_hidden_size_divisor=ffn_hidden_size_divisor,
                 num_moe_experts_divisor=num_moe_experts_divisor,

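The `**kwargs` addition above lets the transformer layer pass the full set of divisors to whichever MLP hparam module it owns, while the dense MLP variant simply ignores the MoE-specific ones. A toy sketch of that calling pattern follows; the class names are illustrative, not the actual modelopt classes.

class DenseMLPSearchSpace:
    def modify(self, ffn_hidden_size_divisor: int, **kwargs) -> None:
        # A dense MLP only cares about the FFN divisor; extra MoE kwargs are ignored.
        print(f"dense MLP: round ffn_hidden_size choices to multiples of {ffn_hidden_size_divisor}")


class MoESearchSpace:
    def modify(self, ffn_hidden_size_divisor: int, num_moe_experts_divisor: int, **kwargs) -> None:
        print(f"MoE: ffn divisor {ffn_hidden_size_divisor}, expert-count divisor {num_moe_experts_divisor}")


# The caller no longer needs to special-case dense vs. MoE MLPs:
for mlp in (DenseMLPSearchSpace(), MoESearchSpace()):
    mlp.modify(ffn_hidden_size_divisor=64, num_moe_experts_divisor=2)
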
tests/_test_utils/torch/megatron/models.py

Lines changed: 9 additions & 4 deletions
@@ -142,10 +142,12 @@ def get_mcore_gpt_model(
     normalization: str = "LayerNorm",
     transformer_impl: str = "modelopt" if HAS_TE else "local",
     use_cpu_initialization: bool = False,
-    num_moe_experts: int | None = None,
-    moe_grouped_gemm: bool = False,
     bf16: bool = True,
     use_te: bool = False,
+    # MoE-specific parameters
+    moe_grouped_gemm: bool = False,
+    moe_shared_expert_intermediate_size: int | None = None,
+    num_moe_experts: int | None = None,
 ) -> GPTModel:
     assert activation_func in ["swiglu", "squared_relu"]
     assert normalization in ["LayerNorm", "RMSNorm"]
@@ -169,22 +171,25 @@ def squared_relu(x):
         expert_model_parallel_size=expert_model_parallel_size,
         expert_tensor_parallel_size=expert_tensor_parallel_size,
         sequence_parallel=False,
-        moe_grouped_gemm=moe_grouped_gemm,
         num_layers=num_layers,
         num_layers_in_first_pipeline_stage=num_layers_in_first_pipeline_stage,
         num_layers_in_last_pipeline_stage=num_layers_in_last_pipeline_stage,
         hidden_size=hidden_size,
         num_attention_heads=num_attention_heads,
         num_query_groups=num_query_groups,
         ffn_hidden_size=ffn_hidden_size,
-        num_moe_experts=num_moe_experts,
         activation_func=squared_relu if activation_func == "squared_relu" else F.silu,
         normalization=normalization,
         gated_linear_unit=(activation_func == "swiglu"),
         add_bias_linear=False,
         use_cpu_initialization=use_cpu_initialization,
         pipeline_dtype=torch.bfloat16 if bf16 else torch.float32,
         bf16=bf16,
+        # MoE-specific parameters
+        moe_grouped_gemm=moe_grouped_gemm,
+        moe_router_dtype="fp32",
+        moe_shared_expert_intermediate_size=moe_shared_expert_intermediate_size,
+        num_moe_experts=num_moe_experts,
     )

     if transformer_impl == "local":

tests/gpu/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py

Lines changed: 113 additions & 1 deletion
@@ -134,7 +134,6 @@ def forward_loop(m):

     # Assert weights are pruned correctly
     for layer in model.decoder.layers:
-        print(rank, layer.mlp)
         assert layer.mlp.linear_fc1.weight.shape == (
             pruned_ffn * (2 if activation_func == "swiglu" else 1),
             pruned_hidden_size,
@@ -238,3 +237,116 @@ def test_mcore_gpt_pruning(
         ),
         backend="nccl",
     )
+
+
+def _test_mcore_gpt_pruning_moe(ckpt_path, rank, size):
+    num_layers = size
+    hidden_size = 256
+    ffn_hidden_size = 256
+    num_moe_experts = 8
+    moe_shared_expert_intermediate_size = 128
+    max_sequence_length = 16
+    vocab_size = 64
+    batch_size = 2
+
+    def _get_model(initialize_megatron=True):
+        model = get_mcore_gpt_model(
+            tensor_model_parallel_size=1,
+            pipeline_model_parallel_size=size,
+            initialize_megatron=initialize_megatron,
+            num_layers=num_layers,
+            hidden_size=hidden_size,
+            ffn_hidden_size=ffn_hidden_size,
+            max_sequence_length=max_sequence_length,
+            vocab_size=vocab_size,
+            activation_func="squared_relu",
+            num_moe_experts=num_moe_experts,
+            moe_shared_expert_intermediate_size=moe_shared_expert_intermediate_size,
+        ).cuda()
+        return model
+
+    model = _get_model()
+    sd = model.state_dict()
+
+    def forward_loop(m):
+        for _ in range(5):
+            run_mcore_inference_with_dummy_input(m, batch_size, hidden_size)
+
+    pruned_ffn = ffn_hidden_size // 2
+    pruned_hidden_size = hidden_size // 2
+    pruned_num_moe_experts = num_moe_experts // 2
+    pruned_moe_ffn = moe_shared_expert_intermediate_size // 2
+
+    export_config = {
+        "ffn_hidden_size": pruned_ffn,
+        "hidden_size": pruned_hidden_size,
+        "num_moe_experts": pruned_num_moe_experts,
+        "moe_shared_expert_intermediate_size": pruned_moe_ffn,
+    }
+
+    mtp.prune(
+        model,
+        mode="mcore_minitron",
+        constraints={"export_config": export_config},
+        dummy_input=None,  # Not used
+        config={"scores_path": ckpt_path, "forward_loop": forward_loop},
+    )
+
+    # Assert weights are pruned correctly
+    for layer in model.decoder.layers:
+        moe = layer.mlp
+        assert moe.router.num_experts == pruned_num_moe_experts
+        assert moe.router.weight.shape == (pruned_num_moe_experts, pruned_hidden_size)
+        assert moe.experts.num_local_experts == pruned_num_moe_experts
+        assert len(moe.experts.local_experts) == pruned_num_moe_experts
+        for expert in moe.experts.local_experts:
+            assert expert.linear_fc1.weight.shape == (pruned_ffn, pruned_hidden_size), (
+                expert.linear_fc1.weight.shape,
+                pruned_ffn,
+                pruned_hidden_size,
+            )
+            assert expert.linear_fc2.weight.shape == (pruned_hidden_size, pruned_ffn), (
+                expert.linear_fc2.weight.shape,
+                pruned_hidden_size,
+                pruned_ffn,
+            )
+        assert moe.shared_experts.linear_fc1.weight.shape == (
+            pruned_moe_ffn,
+            pruned_hidden_size,
+        )
+        assert moe.shared_experts.linear_fc2.weight.shape == (
+            pruned_hidden_size,
+            pruned_moe_ffn,
+        )
+
+    # Assert model.config is updated for correct save/restoring
+    assert model.config.ffn_hidden_size == pruned_ffn
+    assert model.config.hidden_size == pruned_hidden_size
+    assert model.config.num_moe_experts == pruned_num_moe_experts
+    assert model.config.moe_shared_expert_intermediate_size == pruned_moe_ffn
+
+    # Assert forward pass works on the pruned model
+    prompt_tokens = torch.randint(0, vocab_size, (batch_size, max_sequence_length)).cuda()
+    output = run_mcore_inference(model, prompt_tokens, pruned_hidden_size)
+
+    # Assert re-pruning from scores_path works without running the forward loop again
+    model_rerun = _get_model(initialize_megatron=False)
+    model_rerun.load_state_dict(sd)
+    mtp.prune(
+        model_rerun,
+        mode="mcore_minitron",
+        constraints={"export_config": export_config},
+        dummy_input=None,  # Not used
+        config={"scores_path": ckpt_path},
+    )
+
+    output_rerun = run_mcore_inference(model_rerun, prompt_tokens, pruned_hidden_size)
+    assert torch.allclose(output, output_rerun, atol=1e-5)
+
+
+def test_mcore_gpt_pruning_moe(tmp_path):
+    spawn_multiprocess_job(
+        size=torch.cuda.device_count(),
+        job=partial(_test_mcore_gpt_pruning_moe, tmp_path / "minitron_scores.pth"),
+        backend="nccl",
+    )
