Commit 7cbe5b9

Merge branch 'main' into jennifchen/cp_amax_sync
2 parents: 264adbb + b4d6ced

22 files changed: +297 additions, -288 deletions

.gitlab/tests.yml

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ unit:
 ##### GPU Tests #####
 .multi-gpu-tests-default:
   extends: .tests-default
-  timeout: 60m
+  timeout: 90m
   image: nvcr.io/nvidia/pytorch:25.06-py3
   variables:
     GIT_DEPTH: 1000 # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py

CHANGELOG.rst

Lines changed: 2 additions & 1 deletion
@@ -8,7 +8,7 @@ Model Optimizer Changelog (Linux)
 
 - Deprecated ModelOpt's custom docker images. Please use the PyTorch, TensorRT-LLM or TensorRT docker image directly or refer to the `installation guide <https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html>`_ for more details.
 - Deprecated ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` to support strongly typing. Use ``engine_precision`` instead.
-- Deprecated TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``. Tasks ``build`` and ``benchmark`` support are removed and replaced with ``quant``. For performance evaluation, please use ``trtllm-bench`` directly.
+- Deprecated TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``. Tasks ``build`` and ``benchmark`` support are removed and replaced with ``quant``. ``engine_dir`` is replaced with ``checkpoint_dir`` in ``examples/llm_ptq`` and ``examples/vlm_ptq``. For performance evaluation, please use ``trtllm-bench`` directly.
 - ``--export_fmt`` flag in ``examples/llm_ptq`` is removed. By default we export to the unified Hugging Face checkpoint format.
 - Deprecated ``examples/vlm_eval`` as it depends on the deprecated TRT-LLM's TRT backend.
 
@@ -17,6 +17,7 @@ Model Optimizer Changelog (Linux)
 - ``high_precision_dtype`` default to fp16 in ONNX quantization, i.e. quantized output model weights are now FP16 by default.
 - Upgrade TensorRT-LLM dependency to 1.1.0rc2.
 - Support Phi-4-multimodal and Qwen2.5-VL quantized HF checkpoint export in ``examples/vlm_ptq``.
+- Support storing and restoring Minitron pruning activations and scores for re-pruning without running the forward loop again.
 - Add Minitron pruning example for Megatron-LM framework. See ``examples/megatron-lm`` for more details.
 
 0.35 (2025-09-04)

docs/source/guides/3_pruning.rst

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ Pruning
 
 .. tip::
 
-    Checkout `Llama 3.1 NeMo Minitron Pruning <https://github.com/NVIDIA-NeMo/NeMo/tree/main/tutorials/llm/llama/pruning-distillation>`_ and
+    Checkout `Qwen 3 NeMo Minitron Pruning & Distillation <https://github.com/NVIDIA-NeMo/NeMo/tree/main/tutorials/llm/qwen/pruning-distillation>`_ and
     `ResNet20 on CIFAR-10 Notebook <https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/examples/pruning/cifar_resnet.ipynb>`_
     for an end-to-end example of pruning.

examples/llm_distill/README.md

Lines changed: 1 addition & 1 deletion
@@ -144,7 +144,7 @@ Loss balancers:
 
 Checkout the stand-alone distillation script in the [NVIDIA NeMo repository](https://docs.nvidia.com/nemo-framework/user-guide/latest/model-optimization/distillation/distillation.html).
 
-You can also look at the tutorial notebooks [here](https://github.com/NVIDIA-NeMo/NeMo/tree/main/tutorials/llm/llama/pruning-distillation) which showcase the usage of Minitron pruning followed by distillation for Llama 3.1 8B step-by-step in NeMo framework.
+You can also look at the NeMo tutorial notebooks [here](https://github.com/NVIDIA-NeMo/NeMo/tree/main/tutorials/llm/qwen/pruning-distillation) which showcase the usage of Minitron pruning followed by distillation for Qwen 3 8B step-by-step in NeMo framework. Hugging Face models can also be converted to NeMo format and used subsequently as shown in the tutorial.
 
 ## Knowledge Distillation (KD) for HuggingFace Models

examples/llm_eval/README.md

Lines changed: 4 additions & 4 deletions
@@ -93,7 +93,7 @@ If `trust_remote_code` needs to be true, please append the command with the `--t
 ### TensorRT-LLM
 
 ```sh
-python lm_eval_tensorrt_llm.py --model trt-llm --model_args tokenizer=<HF model folder>,engine_dir=<Quantized checkpoint dir> --tasks <comma separated tasks> --batch_size <engine batch size>
+python lm_eval_tensorrt_llm.py --model trt-llm --model_args tokenizer=<HF model folder>,checkpoint_dir=<Quantized checkpoint dir> --tasks <comma separated tasks> --batch_size <max batch size>
 ```
 
 ## MMLU
@@ -137,10 +137,10 @@ python mmlu.py --model_name causal --model_path <HF model folder or model card>
 python mmlu.py --model_name causal --model_path <HF model folder or model card> --quant_cfg $MODELOPT_QUANT_CFG_TO_SEARCH --auto_quantize_bits $EFFECTIVE_BITS --batch_size 4
 ```
 
-### Evaluate the TensorRT-LLM engine
+### Evaluate with TensorRT-LLM
 
 ```bash
-python mmlu.py --model_name causal --model_path <HF model folder or model card> --engine_dir <Quantized checkpoint dir>
+python mmlu.py --model_name causal --model_path <HF model folder or model card> --checkpoint_dir <Quantized checkpoint dir>
 ```
 
 ## MT-Bench
@@ -160,7 +160,7 @@ bash run_fastchat.sh -h <HF model folder or model card>
 bash run_fastchat.sh -h <HF model folder or model card> --quant_cfg MODELOPT_QUANT_CFG
 ```
 
-### Evaluate the TensorRT-LLM engine
+### Evaluate with TensorRT-LLM
 
 ```bash
 bash run_fastchat.sh -h <HF model folder or model card> <Quantized checkpoint dir>

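The `--model_args` value in the lm-eval command above is a comma-separated list of `key=value` pairs that lm-evaluation-harness forwards as keyword arguments to the model adapter's constructor, which is why the key rename from `engine_dir` to `checkpoint_dir` has to match the updated `TRTLLM.__init__` signature in `lm_eval_tensorrt_llm.py` below. A minimal sketch of that mapping with hypothetical paths (`parse_model_args` is an illustrative stand-in, not a library API):

```python
# Illustrative only: lm-evaluation-harness performs this parsing internally.
def parse_model_args(model_args: str) -> dict[str, str]:
    """Turn 'tokenizer=...,checkpoint_dir=...' into constructor kwargs."""
    return dict(pair.split("=", 1) for pair in model_args.split(",") if pair)


kwargs = parse_model_args("tokenizer=/models/llama3-8b,checkpoint_dir=/ckpts/llama_fp8")
assert kwargs["checkpoint_dir"] == "/ckpts/llama_fp8"  # ends up as TRTLLM(checkpoint_dir=...)
```
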
examples/llm_eval/gen_model_answer.py

Lines changed: 19 additions & 23 deletions
@@ -118,7 +118,7 @@ def run_eval(
     max_gpu_memory,
     dtype,
     revision,
-    engine_dir,
+    checkpoint_dir,
     nim_model,
     args,
 ):
@@ -150,7 +150,7 @@ def run_eval(
             revision=revision,
             top_p=top_p,
             temperature=temperature,
-            engine_dir=engine_dir,
+            checkpoint_dir=checkpoint_dir,
             nim_model=nim_model,
         )
         for i in range(0, len(questions), chunk_size)
@@ -174,25 +174,22 @@ def get_model_answers(
     revision,
     top_p=None,
     temperature=None,
-    engine_dir=None,
+    checkpoint_dir=None,
     nim_model=None,
 ):
     # Model Optimizer modification
-    if engine_dir:
-        tokenizer = get_tokenizer(model_path, trust_remote_code=args.trust_remote_code)
-        if engine_dir:
-            # get model type
-            last_part = os.path.basename(engine_dir)
-            model_type = last_part.split("_")[0]
-            # Some models require to set pad_token and eos_token based on external config (e.g., qwen)
-            if model_type == "qwen":
-                tokenizer.pad_token = tokenizer.convert_ids_to_tokens(151643)
-                tokenizer.eos_token = tokenizer.convert_ids_to_tokens(151643)
-
-            assert LLM is not None, "tensorrt_llm APIs could not be imported."
-            model = LLM(engine_dir, tokenizer=tokenizer)
-        else:
-            raise ValueError("engine_dir is required for TensorRT LLM inference.")
+    tokenizer = get_tokenizer(model_path, trust_remote_code=args.trust_remote_code)
+    if checkpoint_dir:
+        # get model type
+        last_part = os.path.basename(checkpoint_dir)
+        model_type = last_part.split("_")[0]
+        # Some models require to set pad_token and eos_token based on external config (e.g., qwen)
+        if model_type == "qwen":
+            tokenizer.pad_token = tokenizer.convert_ids_to_tokens(151643)
+            tokenizer.eos_token = tokenizer.convert_ids_to_tokens(151643)
+
+        assert LLM is not None, "tensorrt_llm APIs could not be imported."
+        model = LLM(checkpoint_dir, tokenizer=tokenizer)
     elif not nim_model:
         model, _ = load_model(
             model_path,
@@ -205,7 +202,6 @@ def get_model_answers(
             cpu_offloading=False,
             debug=False,
         )
-        tokenizer = get_tokenizer(model_path, trust_remote_code=args.trust_remote_code)
         if args.quant_cfg:
             quantize_model(
                 model,
@@ -259,7 +255,7 @@ def get_model_answers(
 
                 # some models may error out when generating long outputs
                 try:
-                    if not engine_dir:
+                    if not checkpoint_dir:
                         output_ids = model.generate(
                             torch.as_tensor(input_ids).cuda(),
                             do_sample=do_sample,
@@ -427,9 +423,9 @@ def reorg_answer_file(answer_file):
         help="The model revision to load.",
     )
    parser.add_argument(
-        "--engine-dir",
+        "--checkpoint-dir",
        type=str,
-        help="The path to the TensorRT LLM engine directory.",
+        help="The path to the model checkpoint directory.",
    )
    parser.add_argument(
        "--nim-model",
@@ -502,7 +498,7 @@ def reorg_answer_file(answer_file):
        max_gpu_memory=args.max_gpu_memory,
        dtype=str_to_torch_dtype(args.dtype),
        revision=args.revision,
-        engine_dir=args.engine_dir,
+        checkpoint_dir=args.checkpoint_dir,
        nim_model=args.nim_model,
        args=args,
    )

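Note that both the old and new code infer the model type from the last path component of the directory, so the quantized TRT-LLM checkpoint directory is still expected to follow a `<model_type>_<suffix>` naming convention; the refactor only hoists the tokenizer loading out of the branch and keys the TRT-LLM path on `checkpoint_dir`. A small sketch of that convention, using a hypothetical directory name:

```python
import os

# Hypothetical quantized checkpoint directory named <model_type>_<suffix>
checkpoint_dir = "/ckpts/qwen_fp8_tp1"

model_type = os.path.basename(checkpoint_dir).split("_")[0]
assert model_type == "qwen"
# For qwen-style checkpoints the script then pins pad/eos to token id 151643
# before handing the tokenizer to tensorrt_llm's LLM(checkpoint_dir, tokenizer=...).
```
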
examples/llm_eval/lm_eval_tensorrt_llm.py

Lines changed: 4 additions & 4 deletions
@@ -42,7 +42,7 @@ class TRTLLM(TemplateAPI):
     def __init__(
         self,
         tokenizer: str,
-        engine_dir: str,
+        checkpoint_dir: str,
         batch_size: int = 1,
         **kwargs,
     ):
@@ -56,11 +56,11 @@ def __init__(
         if self.tokenizer.pad_token_id is None:
             self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
 
-        assert isinstance(engine_dir, str)
+        assert isinstance(checkpoint_dir, str)
 
-        self.llm = LLM(checkpoint_dir=engine_dir, tokenizer=self.tokenizer)
+        self.llm = LLM(checkpoint_dir=checkpoint_dir, tokenizer=self.tokenizer)
         self.max_length = self.llm.max_seq_len - 1
-        logger.info("Loaded TRT-LLM engine")
+        logger.info("Loaded TRT-LLM")
 
     def model_call(
         self,

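With the rename, constructing the adapter directly mirrors the CLI `--model_args` shown in the README above. A minimal sketch, assuming TensorRT-LLM is installed and that the tokenizer folder and quantized checkpoint directory below (both placeholders) exist:

```python
from lm_eval_tensorrt_llm import TRTLLM

# Placeholder paths: an HF tokenizer folder and a quantized TRT-LLM checkpoint directory.
llm = TRTLLM(
    tokenizer="/models/Meta-Llama-3-8B-Instruct",
    checkpoint_dir="/ckpts/llama_fp8",
    batch_size=8,
)
print(llm.max_length)  # set in __init__ from the loaded model's max_seq_len - 1
```
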
examples/llm_eval/mmlu.py

Lines changed: 5 additions & 3 deletions
@@ -252,9 +252,9 @@ def main(
     mto.enable_huggingface_checkpointing()
     model_path = kwargs["model_path"]
     tokenizer = get_tokenizer(model_path, trust_remote_code=kwargs.get("trust_remote_code", False))
-    if kwargs.get("engine_dir"):
+    if kwargs.get("checkpoint_dir"):
         # get model type
-        last_part = os.path.basename(kwargs["engine_dir"])
+        last_part = os.path.basename(kwargs["checkpoint_dir"])
         model_type = last_part.split("_")[0]
         # Some models require to set pad_token and eos_token based on external config (e.g., qwen)
         if model_type == "qwen":
@@ -264,7 +264,9 @@ def main(
         assert LLM is not None, "tensorrt_llm APIs could not be imported."
         medusa_choices = kwargs.get("medusa_choices")
         model = LLM(
-            checkpoint_dir=kwargs["engine_dir"], tokenizer=tokenizer, medusa_choices=medusa_choices
+            checkpoint_dir=kwargs["checkpoint_dir"],
+            tokenizer=tokenizer,
+            medusa_choices=medusa_choices,
         )
     else:
         model = select_model(

examples/llm_eval/run_fastchat.sh

Lines changed: 10 additions & 10 deletions
@@ -20,18 +20,18 @@
 # If you are using NIM, ensure that you export the NIM API key using:
 # export OPENAI_API_KEY=<NIM_API_KEY>
 #
-# Usage: bash run_fastchat.sh -h <HF model folder or model card> -e <engine_dir> -n <NIM model model card>
+# Usage: bash run_fastchat.sh -h <HF model folder or model card> -e <checkpoint_dir> -n <NIM model model card>
 # model_name: The HuggingFace handle or folder of the model to evaluate.
-# engine_dir: The directory where the TRT-LLM engine is stored.
+# checkpoint_dir: The directory where the checkpoint is stored.
 # nim_model_name: The handle of the NIM model to be used for evaluation.
 #
 # Example commands:
 #
 # Evaluate "meta-llama/Meta-Llama-3-8B-Instruct" HF model:
 # bash run_fastchat.sh -h meta-llama/Meta-Llama-3-8B-Instruct
 #
-# Evaluate "meta-llama/Meta-Llama-3-8B-Instruct" HF model with TRT-LLM engine:
-# bash run_fastchat.sh -h meta-llama/Meta-Llama-3-8B-Instruct -e /path/to/engine_dir
+# Evaluate "meta-llama/Meta-Llama-3-8B-Instruct" HF model with TRT-LLM:
+# bash run_fastchat.sh -h meta-llama/Meta-Llama-3-8B-Instruct -e /path/to/checkpoint_dir
 #
 # Evaluate "meta-llama/Meta-Llama-3-8B-Instruct" HF model with NIM:
 # bash run_fastchat.sh -h meta-llama/Meta-Llama-3-8B-Instruct -n meta-llama/Meta-Llama-3-8B-Instruct
@@ -41,7 +41,7 @@ set -e
 set -x
 
 hf_model_name=""
-engine_dir=""
+checkpoint_dir=""
 nim_model_name=""
 answer_file=""
 quant_cfg=""
@@ -56,9 +56,9 @@ while [[ "$1" != "" ]]; do
             shift
             hf_model_name=$1
             ;;
-        -e | --engine_dir )
+        -e | --checkpoint_dir )
             shift
-            engine_dir=$1
+            checkpoint_dir=$1
             ;;
         -n | --nim_model_name )
             shift
@@ -96,8 +96,8 @@ if [ "$hf_model_name" == "" ]; then
     exit 1
 fi
 
-if [ "$engine_dir" != "" ]; then
-    engine_dir=" --engine-dir $engine_dir "
+if [ "$checkpoint_dir" != "" ]; then
+    checkpoint_dir=" --checkpoint-dir $checkpoint_dir "
 fi
 
 if [ "$nim_model_name" != "" ]; then
@@ -143,7 +143,7 @@ PYTHONPATH=FastChat:$PYTHONPATH python gen_model_answer.py \
     --model-id $hf_model_name \
     --temperature 0.0001 \
     --top-p 0.0001 \
-    $engine_dir \
+    $checkpoint_dir \
     $nim_model_name \
     $answer_file \
     $quant_args

examples/llm_ptq/example_utils.py

Lines changed: 0 additions & 10 deletions
@@ -36,16 +36,6 @@ def is_speculative(hf_config):
     )
 
 
-def get_mode_type_from_engine_dir(engine_dir_str):
-    # Split the path by '/' and get the last part
-    last_part = os.path.basename(engine_dir_str)
-
-    # Split the last part by '_' and get the first segment
-    model_type = last_part.split("_")[0]
-
-    return model_type
-
-
 def get_tokenizer(ckpt_path, trust_remote_code=False, **kwargs):
     print(f"Initializing tokenizer from {ckpt_path}")
5141
