1616"""Plugin to add NAS/Pruning support for megatron-core Language models like GPT and Mamba."""
1717
1818import types
19+ from abc import ABC
1920from collections .abc import Callable , Sequence
2021from typing import Any
2122
5253from megatron .core .transformer .moe .shared_experts import SharedExpertMLP
5354from megatron .core .transformer .transformer_layer import TransformerLayer
5455
56+ from modelopt .torch .nas .modules import DynamicModuleList
5557from modelopt .torch .opt .dynamic import DynamicModule
5658from modelopt .torch .opt .hparam import HPType
5759from modelopt .torch .opt .searcher import ConstraintsDict
58- from modelopt .torch .opt .utils import named_hparams
5960from modelopt .torch .trace import Symbol
6061from modelopt .torch .utils import distributed as dist
6162from modelopt .torch .utils import (
@@ -201,6 +202,8 @@ def _setup(self):
         )
         if isinstance(self, SharedExpertMLP):
             self.hparam_name = "moe_shared_expert_intermediate_size"
+        elif self.config.num_moe_experts is not None:
+            self.hparam_name = "moe_ffn_hidden_size"
         else:
             self.hparam_name = "ffn_hidden_size"
         self.linear_fc1 = DMRegistry.convert(self.linear_fc1)
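For orientation, a hedged sketch of how the name selected above can be looked up afterwards, assuming (as the surrounding code suggests) that the converted module registers its FFN-width hparam under `hparam_name`; `mlp` and `dyn_mlp` are hypothetical:

    # Hypothetical sketch, not part of this change.
    dyn_mlp = DMRegistry.convert(mlp)             # mlp: a megatron-core MLP used as an MoE expert
    hp = dyn_mlp.get_hparam(dyn_mlp.hparam_name)  # e.g. "moe_ffn_hidden_size" for per-expert MLPs
    print(hp.max, hp.active)                      # the FFN width this plugin can search/prune over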
@@ -650,8 +653,9 @@ def export(self) -> torch.nn.Module:


 # MoE DynamicModules ###############################################################################
+# Add ABC to avoid TypeError: object layout differs (because the parent TopKRouter inherits from ABC)
 @DMRegistry.register({TopKRouter: "megatron.core.transformer.moe.router.TopKRouter"})
-class _DynamicTopKRouter(DynamicModule):
+class _DynamicTopKRouter(DynamicModule, ABC):
     """A TopKRouter with dynamic hyperparams."""

     def _setup(self):
@@ -660,11 +664,11 @@ def _setup(self):
         # Register num_moe_experts hparam name to match TransformerConfig's name.
         # Will be overridden by _DynamicSequentialMLP's hp.
         self._register_hparam("num_moe_experts", TracedHp(list(range(1, self.weight.shape[0] + 1))))
-        self._register_dynamic_attribute("num_experts", lambda mod, val: mod.num_moe_experts)
         # Register hidden_size reference (will be overridden by _DynamicMoELayer's hidden_size)
         self._register_hparam("hidden_size", TracedHp(list(range(1, self.weight.shape[1] + 1))))

         # Register dynamic attributes
+        self._register_dynamic_attribute("num_experts", lambda mod, val: mod.num_moe_experts)
         self._register_dynamic_attribute("weight", self._get_router_weight)
         if self.enable_expert_bias:
             self._register_dynamic_attribute("expert_bias", self._get_slice_by_num_moe_experts)
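A minimal illustration of what the `num_experts` dynamic attribute above provides, assuming a converted router `dyn_router` (hypothetical name) with at least 4 experts:

    # Hypothetical sketch, not part of this change.
    dyn_router.get_hparam("num_moe_experts").active = 4
    assert dyn_router.num_experts == 4  # the plain attribute mirrors the active hparam value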
@@ -702,7 +706,9 @@ def _setup(self):
             lambda mod, val: mod.num_moe_experts,  # EP = 1
         )

-        # Convert each individual expert MLP to dynamic
+        # Convert local_experts list and each individual expert MLP to dynamic modules
+        self.local_experts = DynamicModuleList.convert(self.local_experts)
+        self.local_experts.depth = num_moe_experts  # Reuse same hparam for depth
         for i in range(len(self.local_experts)):
             self.local_experts[i] = DMRegistry.convert(self.local_experts[i])

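A brief sketch of the intent behind reusing the hparam for the list depth, assuming a converted `_DynamicSequentialMLP` named `seq_mlp` (hypothetical) with at least 2 experts:

    # Hypothetical sketch, not part of this change.
    hp = seq_mlp.get_hparam("num_moe_experts")
    hp.active = 2  # shrinking the expert count...
    # ...also shortens the dynamic expert list, since its depth reuses the same hparam object.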
@@ -725,6 +731,11 @@ def set_hidden_size_hp(self, hidden_size: TracedHp) -> None:

     def _expert_l2_imp_forward_hook(self, module, input, output):
         """Track expert importance based on L2 norms of expert outputs."""
+        # Don't aggregate activations from non-max subnets (e.g. from profiling)
+        num_moe_experts = self.get_hparam("num_moe_experts")
+        if num_moe_experts.active != num_moe_experts.max:
+            return
+
         # Split output back to per-expert outputs using torch.split
         tokens_per_expert_list = input[1].tolist()
         # use full precision to avoid overflow
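For reference, a standalone sketch of the torch.split pattern the comment above refers to, with made-up shapes (not code from this change):

    import torch

    output = torch.randn(10, 8)    # concatenated expert outputs, 10 tokens total
    tokens_per_expert = [4, 3, 3]  # tokens routed to each of 3 experts
    per_expert = torch.split(output, tokens_per_expert, dim=0)
    l2_norms = [o.float().norm(dim=-1).sum() for o in per_expert]  # full-precision per-expert L2 sums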
@@ -757,26 +768,11 @@ def _estimate_expert_importance(self) -> TracedHp.Importance:
             self._activations["expert_sample_counts"] + 1e-8
         )

-    def _export_drop_experts(self) -> None:
-        """Drop experts during export based on active hyperparameter value."""
-        # Get sorted + trimmed order of experts to keep
-        active_slice = self.get_hparam("num_moe_experts").active_slice
-
-        # Trim experts based on active hparam value
-        if isinstance(active_slice, slice):
-            kept_experts = self.local_experts[: active_slice.stop]
-        else:
-            kept_experts = [self.local_experts[i] for i in active_slice]
-
-        # Replace the ModuleList with pruned experts
-        self.local_experts = nn.ModuleList(kept_experts)
-
     def export(self) -> torch.nn.Module:
         """Export the dynamic module to a standard SequentialMLP."""
         self.hook_handle.remove()

-        # Drop experts based on active hparam value and export remaining experts
-        self._export_drop_experts()
+        self.local_experts.export()
         for expert in self.local_experts:
             expert.export()

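The deleted `_export_drop_experts` logic is presumably subsumed by `DynamicModuleList.export()`; a hedged sketch of the equivalent behaviour, reconstructed from the removed code above (`experts` is a hypothetical `_DynamicSequentialMLP`, and this is not the actual `DynamicModuleList` implementation):

    # Assumed-equivalent behaviour for this use case, not the library's code.
    active_slice = experts.get_hparam("num_moe_experts").active_slice
    if isinstance(active_slice, slice):
        kept = list(experts.local_experts)[: active_slice.stop]
    else:
        kept = [experts.local_experts[i] for i in active_slice]
    experts.local_experts = nn.ModuleList(kept)  # relies on the file's existing torch.nn import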
@@ -789,9 +785,6 @@ class _DynamicMoELayer(DynamicModule):
789785 """A MoELayer with dynamic hyperparams."""
790786
791787 def _setup (self ):
792- # TODO: Add DynamicTokenDispatcher for moe_shared_expert_overlap support
793- assert not self .shared_expert_overlap , "moe_shared_expert_overlap is not supported yet!"
794-
795788 # Convert to dynamic modules
796789 # Reuse _DynamicSequentialMLP's num_moe_experts hparam for _DynamicTopKRouter's hparam so
797790 # importance estimator is not lost.
@@ -810,7 +803,11 @@ def _setup(self):

     def _get_local_expert_indices(self, mod: "_DynamicMoELayer", val: list[int]) -> list[int]:
         """Get local expert indices for the current active hparam value."""
-        return list(range(mod.num_local_experts))
+        active_slice = self.experts.get_hparam("num_moe_experts").active_slice
+        if isinstance(active_slice, slice):
+            return list(range(active_slice.stop))
+        else:
+            return active_slice.tolist()

     def set_hidden_size_hp(self, hidden_size: TracedHp) -> None:
         """Set hidden size for all MoE components from global hidden_size hparam."""
@@ -955,17 +952,6 @@ def export(self):
         super().export()
         return self

-    def freeze(self):
-        """Freeze the dynamic module."""
-        super().freeze()
-        if isinstance(self.self_attention, SelfAttention):
-            self.input_layernorm.freeze()
-            self.self_attention.freeze()
-
-        if isinstance(self.mlp, (MLP, MoELayer)):
-            self.pre_mlp_layernorm.freeze()
-            self.mlp.freeze()
-

 # Mamba DynamicModules #############################################################################
 class MambaNumHeadsHp(TracedHp):
@@ -1356,11 +1342,6 @@ def export(self):
         super().export()
         return self

-    def freeze(self):
-        """Freeze the hyperparameters."""
-        self.mixer.freeze()
-        super().freeze()
-

 if HAS_MAMBA:
     DMRegistry.register({ExtendedRMSNorm: "megatron.core.ssm.mamba_mixer.ExtendedRMSNorm"})(
@@ -1559,12 +1540,6 @@ def _export_drop_layers(self) -> None:

     def export(self) -> torch.nn.Module:
         """Export the dynamic module to a torch.nn.Module."""
-        # TODO: Improve this!
-        # Slice order needs to be reset before exporting since weights are already
-        # force assigned and we dont want to sort them again (losing the correct order)
-        for n, hp in named_hparams(self, configurable=True):
-            hp.enforce_order(None)
-
         for handle in self.hook_handles:
             handle.remove()
         self._export_drop_layers()
@@ -1578,12 +1553,6 @@ def export(self) -> torch.nn.Module:
         super().export()
         return self

-    def freeze(self) -> None:
-        """Freeze the dynamic module."""
-        super().freeze()
-        for layer in self.decoder.layers:
-            layer.freeze()
-
     def get_activations_and_layer_scores(
         self,
     ) -> tuple[list[dict[str, torch.Tensor]], dict[int, torch.Tensor]]: