
Commit 3d7f8af

Support checkpointing Minitron scores

Signed-off-by: Keval Morabia <[email protected]>
1 parent 115b145 commit 3d7f8af

File tree

10 files changed, +153 -70 lines changed


CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ Model Optimizer Changelog (Linux)
 - ``high_precision_dtype`` default to fp16 in ONNX quantization, i.e. quantized output model weights are now FP16 by default.
 - Upgrade TensorRT-LLM dependency to 1.1.0rc2.
 - Support Phi-4-multimodal and Qwen2.5-VL quantized HF checkpoint export in ``examples/vlm_ptq``.
+- Support storing and restoring Minitron pruning activations and scores for re-pruning without running the forward loop again.
 - Add Minitron pruning example for Megatron-LM framework. See ``examples/megatron-lm`` for more details.

 0.35 (2025-09-04)

docs/source/guides/3_pruning.rst

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ Pruning

 .. tip::

-    Checkout `Llama 3.1 NeMo Minitron Pruning <https://github.com/NVIDIA-NeMo/NeMo/tree/main/tutorials/llm/llama/pruning-distillation>`_ and
+    Checkout `Qwen 3 NeMo Minitron Pruning & Distillation <https://github.com/NVIDIA-NeMo/NeMo/tree/main/tutorials/llm/qwen/pruning-distillation>`_ and
     `ResNet20 on CIFAR-10 Notebook <https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/examples/pruning/cifar_resnet.ipynb>`_
     for an end-to-end example of pruning.

examples/llm_distill/README.md

Lines changed: 1 addition & 1 deletion
@@ -144,7 +144,7 @@ Loss balancers:

 Checkout the stand-alone distillation script in the [NVIDIA NeMo repository](https://docs.nvidia.com/nemo-framework/user-guide/latest/model-optimization/distillation/distillation.html).

-You can also look at the tutorial notebooks [here](https://github.com/NVIDIA-NeMo/NeMo/tree/main/tutorials/llm/llama/pruning-distillation) which showcase the usage of Minitron pruning followed by distillation for Llama 3.1 8B step-by-step in NeMo framework.
+You can also look at the NeMo tutorial notebooks [here](https://github.com/NVIDIA-NeMo/NeMo/tree/main/tutorials/llm/qwen/pruning-distillation) which showcase the usage of Minitron pruning followed by distillation for Qwen 3 8B step-by-step in NeMo framework. Hugging Face models can also be converted to NeMo format and used subsequently as shown in the tutorial.

 ## Knowledge Distillation (KD) for HuggingFace Models

examples/pruning/README.md

Lines changed: 5 additions & 3 deletions
@@ -67,12 +67,14 @@ export_config = {


 # Run the pruning process
+# Save minitron scores so we can re-run pruning with different export configs without running the forward loop again
+# NOTE: Skip checkpoint path on re-running if you want to change the dataset
 mtp.prune(
     model,
     mode="mcore_minitron",
     constraints={"export_config": export_config},
     dummy_input=None,  # Not used
-    config={"forward_loop": forward_loop},
+    config={"forward_loop": forward_loop, "checkpoint": "modelopt_minitron_scores.pth"},
 )
 ```

@@ -91,11 +93,11 @@ mtp.prune(

 ## Examples

-### Minitron Pruning for Megatron-LM / NeMo Framework LLMs (e.g. Llama 3.1, Nemotron Nano)
+### Minitron Pruning for Megatron-LM / NeMo Framework LLMs (e.g. Qwen 3, Nemotron Nano)

 Checkout the Minitron pruning example for the [Megatron-LM Framework](../megatron-lm/README.md#-pruning) and [NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/model-optimization/pruning/pruning.html) which showcases the usage of the powerful Minitron pruning algorithm developed by NVIDIA Research for pruning LLMs like Llama 3.1 8B, Qwen 3 8B, Nemotron Nano 12B v2, etc.

-You can also look at the NeMo tutorial notebooks [here](https://github.com/NVIDIA-NeMo/NeMo/tree/main/tutorials/llm/llama/pruning-distillation) which showcase the usage of Minitron pruning followed by distillation for Llama 3.1 8B step-by-step in NeMo framework. Hugging Face models can also be converted to NeMo format and used subsequently as shown in the tutorial.
+You can also look at the NeMo tutorial notebooks [here](https://github.com/NVIDIA-NeMo/NeMo/tree/main/tutorials/llm/qwen/pruning-distillation) which showcase the usage of Minitron pruning followed by distillation for Qwen 3 8B step-by-step in NeMo framework. Hugging Face models can also be converted to NeMo format and used subsequently as shown in the tutorial.

 Some of the models pruned using Minitron method followed by distillation and post-training are:

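The README hunk above only shows the first pruning call. As a rough sketch of the two-step flow this commit enables, assuming `model`, `forward_loop`, and `export_config` are set up as in the README, and with `fresh_model`/`smaller_export_config` as hypothetical stand-ins for a second run (only `mtp.prune`, the `mcore_minitron` mode, and the `checkpoint` config key come from the diff):

```python
import modelopt.torch.prune as mtp

# First run: the forward loop is executed and the collected Minitron activations
# and scores are saved to the checkpoint path in addition to pruning the model.
mtp.prune(
    model,
    mode="mcore_minitron",
    constraints={"export_config": export_config},
    dummy_input=None,  # Not used
    config={"forward_loop": forward_loop, "checkpoint": "modelopt_minitron_scores.pth"},
)

# Later run on a freshly built model: pass the same checkpoint with a different
# export config; the stored scores are restored, so no forward_loop is needed.
mtp.prune(
    fresh_model,
    mode="mcore_minitron",
    constraints={"export_config": smaller_export_config},  # hypothetical second target
    dummy_input=None,  # Not used
    config={"checkpoint": "modelopt_minitron_scores.pth"},
)
```

Per the note added in the hunk, skip the checkpoint on re-runs that change the calibration dataset, since the stored scores would no longer reflect it.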
modelopt/torch/nas/plugins/megatron.py

Lines changed: 5 additions & 5 deletions
@@ -1234,10 +1234,10 @@ def _emb_layernorm_forward_hook(self, module, input, output) -> None:
         output = output.to(torch.float32)  # use full precision to avoid overflow
         activations = output.abs().mean(dim=0)  # [batch_size, hidden_size]
         activations = activations.pow(2).sum(dim=0)  # [hidden_size]
-        if module not in self._activations:
-            self._activations[module] = activations
+        if id(module) not in self._activations:
+            self._activations[id(module)] = activations
         else:
-            self._activations[module] += activations
+            self._activations[id(module)] += activations

     def _estimate_hidden_size_importance(self) -> TracedHp.Importance:
         """Return the activation magnitude-based importance of the hidden_size."""

@@ -1293,7 +1293,7 @@ def _export_drop_layers(self) -> None:
         for layer in self.decoder.layers:
             assert layer._scores > 0, "No scores collected for importance estimation."

-        # gather layer scores from all TP regions
+        # gather layer scores from all PP ranks
         layer_scores = {}
         for layer in self.decoder.layers:
             layer_scores[layer.layer_number] = layer._scores

@@ -1302,7 +1302,7 @@ def _export_drop_layers(self) -> None:
             all_pp_layer_scores, layer_scores, group=get_pipeline_model_parallel_group()
         )
         layer_scores = {k: v for d in all_pp_layer_scores for k, v in d.items()}  # type: ignore[attr-defined]
-        print_rank_0(f"Layerwise scores for depth pruning: {layer_scores}")
+        print_rank_0(f"Layerwise scores (1-indexed) for depth pruning: {layer_scores}")
         assert sorted(layer_scores.keys()) == list(range(1, num_layers_hp.max + 1))  # type: ignore[arg-type]

         # sort layers by scores and drop the lowest ones

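For context on the hunks above: `_emb_layernorm_forward_hook` accumulates per-module activation statistics during the calibration forward loop, and the change keys that accumulator by `id(module)` instead of the module object, presumably so the saved state holds plain integer keys rather than whole module objects. A simplified, self-contained sketch of the accumulation pattern on a toy model (illustrative only, not the modelopt implementation):

```python
import torch
import torch.nn as nn

# Toy model with two LayerNorms to hook; inputs are [seq, batch, hidden] as a stand-in.
model = nn.Sequential(nn.Linear(8, 8), nn.LayerNorm(8), nn.Linear(8, 8), nn.LayerNorm(8))
activations: dict[int, torch.Tensor] = {}

def hook(module, inputs, output):
    output = output.to(torch.float32)  # use full precision to avoid overflow
    act = output.abs().mean(dim=0)     # reduce over the leading (sequence) dimension
    act = act.pow(2).sum(dim=0)        # -> one value per hidden channel
    if id(module) not in activations:  # key by id(module), as in the diff
        activations[id(module)] = act
    else:
        activations[id(module)] += act

handles = [
    m.register_forward_hook(hook) for m in model.modules() if isinstance(m, nn.LayerNorm)
]

with torch.no_grad():
    for _ in range(3):  # stand-in for the calibration forward loop
        model(torch.randn(5, 2, 8))

for h in handles:
    h.remove()

print({k: v.shape for k, v in activations.items()})  # two entries, each of shape [8]
```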
modelopt/torch/opt/searcher.py

Lines changed: 9 additions & 2 deletions
@@ -27,6 +27,7 @@
 from collections.abc import Callable
 from contextlib import nullcontext
 from typing import Any, final
+from warnings import warn

 import numpy as np
 import pulp

@@ -239,7 +240,11 @@ def load_search_checkpoint(self) -> bool:
         """Load function for search checkpoint returning indicator whether checkpoint was loaded."""
         # check if checkpoint exists
         checkpoint: str | None = self.config["checkpoint"]
-        if checkpoint is None or not os.path.exists(checkpoint):
+        if checkpoint is None:
+            return False
+        if not os.path.exists(checkpoint):
+            if dist.is_master():
+                warn(f"Checkpoint {checkpoint} does not exist! Initializing from scratch.")
             return False

         # iterate through state dict and load keys

@@ -250,14 +255,16 @@ def load_search_checkpoint(self) -> bool:
             setattr(self, key, state)
         return True

-    def save_search_checkpoint(self) -> None:
+    def save_search_checkpoint(self, verbose=False) -> None:
         """Save function for search checkpoint."""
         # check if save requirements are satisfied
         checkpoint: str | None = self.config["checkpoint"]
         if checkpoint is None or not dist.is_master():
             return

         # save state dict
+        if verbose:
+            print(f"Saving searcher state to {checkpoint}...")
         save_dirname, _ = os.path.split(checkpoint)
         if save_dirname:
             os.makedirs(save_dirname, exist_ok=True)

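The two hunks above split the former `checkpoint is None or not os.path.exists(checkpoint)` check so that a configured-but-missing path now warns on the master rank before initializing from scratch, and `save_search_checkpoint` gains an optional `verbose` flag. A rough single-process sketch of the resulting save/load behavior with the modelopt and distributed plumbing stripped out (the class name, state keys, and path below are hypothetical):

```python
import os
from warnings import warn

import torch

class TinySearcher:
    """Single-process stand-in for a searcher that checkpoints its state dict."""

    def __init__(self, checkpoint: str | None):
        self.checkpoint = checkpoint
        self.scores: dict = {}
        self.activations: dict = {}

    def load_search_checkpoint(self) -> bool:
        if self.checkpoint is None:
            return False  # nothing configured -> silently skip
        if not os.path.exists(self.checkpoint):
            warn(f"Checkpoint {self.checkpoint} does not exist! Initializing from scratch.")
            return False  # configured but missing -> warn, then start fresh
        for key, state in torch.load(self.checkpoint).items():
            setattr(self, key, state)  # restore each saved attribute
        return True

    def save_search_checkpoint(self, verbose: bool = False) -> None:
        if self.checkpoint is None:
            return
        if verbose:
            print(f"Saving searcher state to {self.checkpoint}...")
        dirname = os.path.dirname(self.checkpoint)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        torch.save({"scores": self.scores, "activations": self.activations}, self.checkpoint)

searcher = TinySearcher("scratch/minitron_scores.pth")
searcher.scores = {"layer_1": torch.tensor(0.7)}
searcher.save_search_checkpoint(verbose=True)
assert TinySearcher("scratch/minitron_scores.pth").load_search_checkpoint()
```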
modelopt/torch/prune/plugins/mcore_minitron.py

Lines changed: 51 additions & 18 deletions
@@ -24,6 +24,8 @@
 Actual dynamic module implementations are at :mod:`modelopt.torch.nas.plugins.megatron`.
 """

+import copy
+
 import torch
 from pydantic import create_model

@@ -39,6 +41,7 @@
 from modelopt.torch.opt.config import ModeloptBaseConfig, get_kwargs_for_create_model_with_rules
 from modelopt.torch.opt.searcher import BaseSearcher, SearchConfig, SearchStateDict
 from modelopt.torch.opt.utils import named_hparams
+from modelopt.torch.utils import distributed as dist
 from modelopt.torch.utils import print_rank_0

 from ..fastnas import FastNASModeDescriptor

@@ -60,23 +63,19 @@
 class MCoreMinitronSearcher(BaseSearcher):
     """Searcher for Minitron pruning algorithm."""

+    activations: dict[int, dict[str, torch.Tensor]]
+    scores: dict[int, dict[str, torch.Tensor]]
+    ckpt_world_size: int
+
     @property
     def default_search_config(self) -> SearchConfig:
         """Get the default config for the searcher."""
         return {**super().default_search_config, "max_iter_data_loader": 1024}

     @property
     def default_state_dict(self) -> SearchStateDict:
-        """Return default state dict."""
-        return {}  # Not used
-
-    def sanitize_search_config(self, config: SearchConfig | None) -> SearchConfig:
-        """Sanitize the search config dict."""
-        config = super().sanitize_search_config(config)
-        assert config["data_loader"] or config["forward_loop"], (
-            "Data loader or forward loop must be provided for importance estimation!"
-        )
-        return config
+        """Return default state dict for importance scores and activations from forward loop."""
+        return {"activations": {}, "scores": {}, "ckpt_world_size": dist.size()}

     def before_search(self) -> None:
         """Optional pre-processing steps before the search."""

@@ -87,10 +86,11 @@ def before_search(self) -> None:
             "Only `export_config` constraint is supported for pruning!"
         )

+        self.constraints["export_config"] = copy.deepcopy(self.constraints["export_config"])
         export_config = self.constraints["export_config"]
         assert isinstance(export_config, dict)  # to keep mypy happy
         assert export_config.keys() <= SUPPORTED_HPARAMS, (
-            f"Only {SUPPORTED_HPARAMS} are supported for pruning!"
+            f"Only {SUPPORTED_HPARAMS} are supported for pruning! Received: {export_config.keys()}"
         )

         assert ("num_attention_heads" in export_config and "num_query_groups" in export_config) or (

@@ -124,14 +124,47 @@ def before_search(self) -> None:
     def run_search(self) -> None:
         """Run actual search."""
         # Run forward loop to collect activations and sort parameters
-        assert self.forward_loop is not None
-        is_training = self.model.training
-        self.model.eval()
-        print_rank_0("Running forward loop...")
-        with torch.no_grad():
-            self.forward_loop(self.model)
+        if self.scores and self.activations:  # Available from checkpoint
+            print_rank_0("Loading activations and scores per rank from checkpoint...")
+            assert self.ckpt_world_size == dist.size(), "World size mismatch!"
+            rank = dist.rank()
+            for n, m in self.model.named_modules():
+                if hasattr(m, "_scores"):
+                    m._scores = self.scores[rank][n]
+                if hasattr(m, "_activations"):
+                    m._activations = self.activations[rank][n]
+        else:
+            print_rank_0("Running forward loop...")
+            assert self.forward_loop is not None
+            is_training = self.model.training
+            self.model.eval()
+            with torch.no_grad():
+                self.forward_loop(self.model)
+            self.model.train(is_training)
+
+            # Store activations and layer scores for re-pruning with different export configs
+            rank = dist.rank()
+            rank_scores = {}
+            rank_activations = {}
+            for n, m in self.model.named_modules():
+                if hasattr(m, "_scores"):
+                    rank_scores[n] = m._scores
+                if hasattr(m, "_activations"):
+                    rank_activations[n] = m._activations
+
+            # Gather scores and activations from all ranks to rank 0
+            all_scores = dist.allgather(rank_scores)
+            all_activations = dist.allgather(rank_activations)
+
+            # Store all ranks' data in the searcher's state
+            for r in range(dist.size()):
+                self.scores[r] = all_scores[r]
+                self.activations[r] = all_activations[r]
+
+            self.save_search_checkpoint(verbose=True)
+            dist.barrier()
+
         sort_parameters(self.model, self.hps_to_sort, verbose=True)
-        self.model.train(is_training)

         # Prune homogeneously
         export_config = self.constraints["export_config"]

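In short, `run_search` now has two paths: if the searcher state already holds per-rank `scores` and `activations` (restored from the checkpoint by `load_search_checkpoint`), they are written back onto the modules by name; otherwise the forward loop runs, each rank harvests its modules' `_scores`/`_activations`, the results are allgathered, and the checkpoint is saved before sorting and pruning. A rough single-process sketch of the collect/restore half (toy module and a fixed `rank`; only the attribute names and the named-module traversal follow the diff):

```python
import torch
import torch.nn as nn

class Block(nn.Module):
    """Toy module carrying the Minitron-style buffers the searcher looks for."""

    def __init__(self):
        super().__init__()
        self._scores = torch.zeros(1)
        self._activations = torch.zeros(4)

def collect(model: nn.Module) -> tuple[dict, dict]:
    """Harvest per-module scores/activations after the forward loop (one rank's view)."""
    scores, activations = {}, {}
    for name, module in model.named_modules():
        if hasattr(module, "_scores"):
            scores[name] = module._scores
        if hasattr(module, "_activations"):
            activations[name] = module._activations
    return scores, activations

def restore(model: nn.Module, scores: dict, activations: dict) -> None:
    """Push checkpointed scores/activations back onto the modules by name."""
    for name, module in model.named_modules():
        if hasattr(module, "_scores"):
            module._scores = scores[name]
        if hasattr(module, "_activations"):
            module._activations = activations[name]

model = nn.Sequential(Block(), Block())
rank = 0  # single-process stand-in for dist.rank()

scores, activations = collect(model)  # what this rank would contribute to the checkpoint
ckpt = {"scores": {rank: scores}, "activations": {rank: activations}, "ckpt_world_size": 1}

restore(model, ckpt["scores"][rank], ckpt["activations"][rank])  # the re-pruning path
```

The per-rank keying is also why the diff asserts `ckpt_world_size == dist.size()`: a saved checkpoint is only reusable under the same parallel layout.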
tests/gpu/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py

Lines changed: 42 additions & 22 deletions
@@ -41,6 +41,7 @@ def _test_mcore_gpt_pruning(
     pruned_hidden_size_div,
     pruned_num_layers_div,
     uneven_pp,
+    ckpt_path,
     rank,
     size,
 ):

@@ -66,22 +67,26 @@
     else:
         raise ValueError(f"Unsupported size {size}")

-    model = get_mcore_gpt_model(
-        tensor_model_parallel_size=1,
-        pipeline_model_parallel_size=size,
-        initialize_megatron=True,
-        num_layers=num_layers,
-        hidden_size=hidden_size,
-        num_attention_heads=num_attention_heads,
-        num_query_groups=num_query_groups,
-        ffn_hidden_size=ffn_hidden_size,
-        max_sequence_length=max_sequence_length,
-        vocab_size=vocab_size,
-        activation_func=activation_func,
-        normalization=normalization,
-        num_layers_in_first_pipeline_stage=num_layers_in_first_pipeline_stage,
-        num_layers_in_last_pipeline_stage=num_layers_in_last_pipeline_stage,
-    )
+    def _get_model(initialize_megatron=True):
+        model = get_mcore_gpt_model(
+            tensor_model_parallel_size=1,
+            pipeline_model_parallel_size=size,
+            initialize_megatron=initialize_megatron,
+            num_layers=num_layers,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            num_query_groups=num_query_groups,
+            ffn_hidden_size=ffn_hidden_size,
+            max_sequence_length=max_sequence_length,
+            vocab_size=vocab_size,
+            activation_func=activation_func,
+            normalization=normalization,
+            num_layers_in_first_pipeline_stage=num_layers_in_first_pipeline_stage,
+            num_layers_in_last_pipeline_stage=num_layers_in_last_pipeline_stage,
+        )
+        return model
+
+    model = _get_model()

     def forward_loop(m):
         for _ in range(5):

@@ -110,7 +115,7 @@ def forward_loop(m):
         mode="mcore_minitron",
         constraints={"export_config": export_config},
         dummy_input=None,  # Not used
-        config={"forward_loop": forward_loop},
+        config={"forward_loop": forward_loop, "checkpoint": ckpt_path},
     )

     # Assert weights are pruned correctly

@@ -139,6 +144,17 @@ def forward_loop(m):
     # Assert forward pass works on the pruned model
     run_mcore_inference_with_dummy_input(model, batch_size, pruned_hidden_size)

+    # Assert re-pruning from checkpoint works without running the forward loop again
+    if ckpt_path:
+        model = _get_model(initialize_megatron=False)
+        mtp.prune(
+            model,
+            mode="mcore_minitron",
+            constraints={"export_config": export_config},
+            dummy_input=None,  # Not used
+            config={"checkpoint": ckpt_path},
+        )
+

 @pytest.mark.parametrize(
     (

@@ -152,16 +168,18 @@ def forward_loop(m):
         "hidden_size_div",
         "num_layers_div",
         "uneven_pp",
+        "test_ckpt",
     ),
     [
-        (8, 8, "squared_relu", "LayerNorm", 4, 1, 1, 1, 1, False),  # MHA - pruned ffn/4
-        (8, 4, "squared_relu", "RMSNorm", 1, 2, 2, 1, 1, False),  # GQA - pruned attention/2
-        (8, 4, "swiglu", "RMSNorm", 1, 1, 1, 4, 1, False),  # GQA - pruned hidden_size/4
-        (8, 8, "swiglu", "LayerNorm", 1, 1, 1, 1, 2, False),  # MHA - pruned num_layers/2
-        (8, 4, "swiglu", "RMSNorm", 2, 2, 2, 2, 2, True),  # GQA - pruned all/2, uneven pp
+        (8, 8, "squared_relu", "LayerNorm", 4, 1, 1, 1, 1, False, False),  # MHA - pruned ffn/4
+        (8, 4, "squared_relu", "RMSNorm", 1, 2, 2, 1, 1, False, False),  # GQA - pruned attention/2
+        (8, 4, "swiglu", "RMSNorm", 1, 1, 1, 4, 1, False, False),  # GQA - pruned hidden_size/4
+        (8, 8, "swiglu", "LayerNorm", 1, 1, 1, 1, 2, False, False),  # MHA - pruned num_layers/2
+        (8, 4, "swiglu", "RMSNorm", 2, 2, 2, 2, 2, True, True),  # GQA - pruned all/2, uneven pp
     ],
 )
 def test_mcore_gpt_pruning(
+    tmp_path,
     num_attention_heads,
     num_query_groups,
     activation_func,

@@ -172,6 +190,7 @@ def test_mcore_gpt_pruning(
     hidden_size_div,
     num_layers_div,
     uneven_pp,
+    test_ckpt,
 ):
     spawn_multiprocess_job(
         size=torch.cuda.device_count(),

@@ -187,6 +206,7 @@ def test_mcore_gpt_pruning(
             hidden_size_div,
             num_layers_div,
             uneven_pp,
+            tmp_path / "modelopt_minitron_scores.pth" if test_ckpt else None,
         ),
         backend="nccl",
     )
