
Commit 10e3e2b

Merge branch 'main' into jennifchen/cp_amax_sync

2 parents 34c11ef + 17439e6

24 files changed: +211 -573 lines changed

.github/workflows/example_tests.yml

Lines changed: 1 addition & 0 deletions

@@ -69,6 +69,7 @@ jobs:
       image: nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2
       env:
         PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps: &example_steps
       - uses: actions/checkout@v4
       - uses: nv-gha-runners/setup-proxy-cache@main
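For context, here is a minimal sketch of how a test step might consume the newly added `HF_TOKEN` secret at runtime. The `huggingface_hub` login call is an assumption layered on top of this diff, not part of it:

import os

from huggingface_hub import login

# Hypothetical consumer of the HF_TOKEN env var injected above.
token = os.environ.get("HF_TOKEN")
if token:
    # Authenticate so gated Hugging Face repos can be downloaded in CI.
    login(token=token)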

.vscode/settings.json

Lines changed: 3 additions & 1 deletion

@@ -42,5 +42,7 @@
   "evenBetterToml.schema.enabled": false, // disable toml/json schema since we have custom fields
   "python.analysis.extraPaths": [
     "./tests/" // add tests to python path just like pytest does in pyproject.toml
-  ]
+  ],
+  "git.alwaysSignOff": true,
+  "git.enableCommitSigning": true,
 }

README.md

Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@
 ______________________________________________________________________

-The **NVIDIA TensorRT Model Optimizer** (referred to as **Model Optimizer**, or **ModelOpt**) is a library comprising state-of-the-art model optimization [techniques](#techniques) including quantization, distillation, pruning, speculative decoding and sparsity to accelerate models.
+**NVIDIA TensorRT Model Optimizer** (referred to as **Model Optimizer**, or **ModelOpt**) is a library comprising state-of-the-art model optimization [techniques](#techniques) including quantization, distillation, pruning, speculative decoding and sparsity to accelerate models.

 **[Input]** Model Optimizer currently supports inputs of a [Hugging Face](https://huggingface.co/), [PyTorch](https://github.com/pytorch/pytorch) or [ONNX](https://github.com/onnx/onnx) model.

docs/source/guides/3_pruning.rst

Lines changed: 2 additions & 2 deletions

@@ -190,7 +190,7 @@ Following info will be printed before the pruning process is started:
 ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
 ┃ Constraint   ┃ min          ┃ centroid     ┃ max          ┃ max/min ratio ┃
 ┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
-│ flops        │ 274.34M      │ 1.28G        │ 4.59G        │ 16.73         │
+│ flops        │ 548.68M      │ 2.56G        │ 9.18G        │ 16.73         │
 │ params       │ 2.70M        │ 9.75M        │ 25.50M       │ 9.43          │
 └──────────────┴──────────────┴──────────────┴──────────────┴───────────────┘

@@ -199,7 +199,7 @@ Following info will be printed before the pruning process is started:
 ┃              ┃              ┃ Satisfiable  ┃
 ┃ Constraint   ┃ Upper Bound  ┃ Upper Bound  ┃
 ┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩
-│ flops        │ 2.75G        │ True         │
+│ flops        │ 5.50G        │ True         │
 └──────────────┴──────────────┴──────────────┘
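For orientation, a minimal sketch of the pruning call that prints tables like these, modeled on the `mtp.prune` usage in the cifar_resnet notebook later in this commit; `model`, `train_loader`, and the input shape are placeholders, and the 5.50G bound mirrors the updated table:

import modelopt.torch.prune as mtp
import torch

dummy_input = torch.randn(1, 3, 224, 224)  # assumed input shape

pruned_model, _ = mtp.prune(
    model=model,  # placeholder: your trained network
    mode="fastnas",
    constraints={"flops": 5.5e9},  # the 5.50G upper bound from the table
    dummy_input=dummy_input,
    config={"data_loader": train_loader},  # placeholder calibration loader
)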

docs/source/guides/7_nas.rst

Lines changed: 6 additions & 6 deletions

@@ -109,8 +109,8 @@ the search space together with your deployment constraints using

 import torch

-# Looking for a subnet with at most 2 GFLOPs
-constraints = {"flops": 2.0e9}
+# Looking for a subnet with at most 4 GFLOPs
+constraints = {"flops": 4.0e9}

 # Measure FLOPs against dummy_input
 # Can be provided as a single tensor or tuple of input args to the model.

@@ -129,7 +129,7 @@ Following info will be printed:
 ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
 ┃ Constraint   ┃ min          ┃ centroid     ┃ max          ┃ max/min ratio ┃
 ┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
-│ flops        │ 487.92M      │ 1.84G        │ 4.59G        │ 9.40          │
+│ flops        │ 975.84M      │ 3.68G        │ 9.18G        │ 9.40          │
 │ params       │ 4.84M        │ 12.33M       │ 25.50M       │ 5.27          │
 └──────────────┴──────────────┴──────────────┴──────────────┴───────────────┘

@@ -138,7 +138,7 @@ Following info will be printed:
 ┃              ┃              ┃ Satisfiable  ┃
 ┃ Constraint   ┃ Upper Bound  ┃ Upper Bound  ┃
 ┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩
-│ flops        │ 2.00G        │ True         │
+│ flops        │ 4.00G        │ True         │
 └──────────────┴──────────────┴──────────────┘

 Search Space Summary:

@@ -242,8 +242,8 @@ Below is an example of running search on an AutoNAS converted and trained model.
 # Specify the sample input including target data shape for FLOPs calculation.
 dummy_input = torch.randn(1, 3, 224, 224)

-# Looking for a subnet with at most 2 GFLOPs
-search_constraints = {"flops": 2.0e9}
+# Looking for a subnet with at most 4 GFLOPs
+search_constraints = {"flops": 4.0e9}

 # search_res (dict) contains state_dict / stats of the searcher
 searched_model, search_res = mtn.search(
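A hedged sketch of how these pieces fit together, assuming `mtn.profile` is the profiling entry point this guide introduces and that `model` is already an AutoNAS-converted network:

import modelopt.torch.nas as mtn
import torch

dummy_input = torch.randn(1, 3, 224, 224)
constraints = {"flops": 4.0e9}  # at most 4 GFLOPs, matching the updated hunks

# Prints constraint and search-space tables like the ones shown above.
mtn.profile(model, dummy_input, constraints=constraints)  # model: placeholder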

examples/llm_ptq/example_utils.py

Lines changed: 1 addition & 3 deletions

@@ -204,9 +204,7 @@ def get_model(
         if auto_model_module != AutoModelForCausalLM:
             model_kwargs2.pop("trust_remote_code", None)
         model_kwargs2["torch_dtype"] = torch_dtype
-        # DeciLMForCausalLM does not support max_memory argument
-        if "architectures" in hf_config and "DeciLMForCausalLM" in hf_config.architectures:
-            model_kwargs2.pop("max_memory", None)
+        model_kwargs2.pop("max_memory", None)
         model = from_config(hf_config, **model_kwargs2)

     max_memory = get_max_memory()

examples/llm_ptq/hf_ptq.py

Lines changed: 3 additions & 3 deletions

@@ -328,6 +328,9 @@ def main(args):
         model = model.language_model
         model_type = get_model_type(model)

+    if model_type == "phi4mm":
+        warnings.warn("Please set the default input_mode to InputMode.LANGUAGE before quantizing.")
+
     if args.sparsity_fmt != "dense":
         if args.batch_size == 0:
             # Sparse algorithm takes more GPU memory so we reduce the batch_size by 4.

@@ -478,9 +481,6 @@ def main(args):
         quant_cfg["quant_cfg"]["*audio*"] = {"enable": False}
         quant_cfg["quant_cfg"]["*image*"] = {"enable": False}
         quant_cfg["quant_cfg"]["*vision*"] = {"enable": False}
-        warnings.warn(
-            "Please set the default input_mode to InputMode.LANGUAGE before quantizing."
-        )

     if not model_is_already_quantized or calibration_only:
         # Only run single sample for preview
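For context, a minimal sketch of the wildcard-disable pattern these lines rely on; the FP8 base config and the `calibrate_loop` name are illustrative, not taken from this file:

import copy

import modelopt.torch.quantization as mtq

# Start from a stock config and disable quantizers for multimodal towers,
# matching the "*audio*"/"*image*"/"*vision*" patterns in the hunk above.
quant_cfg = copy.deepcopy(mtq.FP8_DEFAULT_CFG)
for pattern in ("*audio*", "*image*", "*vision*"):
    quant_cfg["quant_cfg"][pattern] = {"enable": False}

model = mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)  # placeholders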

examples/pruning/cifar_resnet.ipynb

Lines changed: 8 additions & 8 deletions

@@ -489,7 +489,7 @@
 "* prune the model;\n",
 "* obtain a valid pytorch model that can be used for fine-tuning.\n",
 "\n",
-"Let's say you have the ResNet20 model as our base model to prune from and we are looking for a model with at most 30M FLOPs. We can provide search constraints for `flops` and/or `params` by an upper bound. The values can either be absolute numbers (e.g. `30e6`) or a string percentage (e.g. `\"75%\"`). In addition, we should also provide our training data loader to [mtp.prune](../reference/generated/modelopt.torch.prune.pruning.rst#modelopt.torch.prune.pruning.prune). The training data loader will be used to calibrate the normalization layers in the model. Finally, we will also specify a custom config for configuring the pruning search space to get a more fine-grained selection of pruned nets.\n",
+"Let's say you have the ResNet20 model as our base model to prune from and we are looking for a model with at most 60M FLOPs. We can provide search constraints for `flops` and/or `params` by an upper bound. The values can either be absolute numbers (e.g. `60e6`) or a string percentage (e.g. `\"75%\"`). In addition, we should also provide our training data loader to [mtp.prune](../reference/generated/modelopt.torch.prune.pruning.rst#modelopt.torch.prune.pruning.prune). The training data loader will be used to calibrate the normalization layers in the model. Finally, we will also specify a custom config for configuring the pruning search space to get a more fine-grained selection of pruned nets.\n",
 "\n",
 "Finally, we can store the pruned architecture and weights using [mto.save](../reference/generated/modelopt.torch.opt.conversion.rst#modelopt.torch.opt.conversion.save).\n",
 "\n",

@@ -529,7 +529,7 @@
 "┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
 "┃ Constraint   ┃ min          ┃ centroid     ┃ max          ┃ max/min ratio ┃\n",
 "┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
-"│ flops        │ 24.33M       │ 27.57M       │ 40.55M       │ 1.67          │\n",
+"│ flops        │ 48.66M       │ 55.14M       │ 81.10M       │ 1.67          │\n",
 "│ params       │ 90.94K       │ 141.63K      │ 268.35K      │ 2.95          │\n",
 "└──────────────┴──────────────┴──────────────┴──────────────┴───────────────┘\n",
 " \n",

@@ -538,7 +538,7 @@
 "┃              ┃              ┃ Satisfiable  ┃\n",
 "┃ Constraint   ┃ Upper Bound  ┃ Upper Bound  ┃\n",
 "┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
-"│ flops        │ 30.00M       │ True         │\n",
+"│ flops        │ 60.00M       │ True         │\n",
 "└──────────────┴──────────────┴──────────────┘\n",
 "\n",
 "\n",

@@ -618,7 +618,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[best_subnet_constraints] = {'params': '173.88K', 'flops': '29.64M'}\n"
+"[best_subnet_constraints] = {'params': '173.88K', 'flops': '59.28M'}\n"
 ]
 },
 {

@@ -656,7 +656,7 @@
 "pruned_model, _ = mtp.prune(\n",
 "    model=resnet20(ckpt=\"resnet20.pth\"),\n",
 "    mode=[(\"fastnas\", config)],\n",
-"    constraints={\"flops\": 30e6},\n",
+"    constraints={\"flops\": 60e6},\n",
 "    dummy_input=dummy_input,\n",
 "    config={\n",
 "        \"data_loader\": train_loader,\n",

@@ -676,7 +676,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"As we can see, the best subnet (29.6M FLOPs) has fitted our constraint of 30M FLOPs. We can also see a drop in validation accuracy for the searched model. This is very common after pruning and fine-tuning is necessary for this model.\n",
+"As we can see, the best subnet (59.3M FLOPs) has fitted our constraint of 60M FLOPs. We can also see a drop in validation accuracy for the searched model. This is very common after pruning and fine-tuning is necessary for this model.\n",
 "\n",
 "#### Restore the pruned subnet using [mto.restore](../reference/generated/modelopt.torch.opt.conversion.rst#modelopt.torch.opt.conversion.restore)"
 ]

@@ -795,8 +795,8 @@
 "\n",
 "| Model           | FLOPs      | Params     | Test Accuracy     |\n",
 "| --------------- | ---------- | ---------- | ----------------- |\n",
-"| ResNet20        | 40.6M      | 268k       | 90.9%             |\n",
-"| FastNAS subnet  | 29.6M      | 174k       | 90.3%             |\n",
+"| ResNet20        | 81.2M      | 268k       | 90.9%             |\n",
+"| FastNAS subnet  | 59.2M      | 174k       | 90.3%             |\n",
 "\n",
 "As we see here, we have reduced the FLOPs and number of parameters which would also result in a improvement in latency with very little loss in accuracy. Good job!\n",
 "\n",

examples/speculative_decoding/eagle_utils.py

Lines changed: 31 additions & 0 deletions

@@ -19,11 +19,21 @@

 import torch
 import transformers
+from ar_validate import validate_ar
+from datasets import load_dataset
 from torch.utils.data import Dataset
+from transformers import TrainerCallback
 from transformers.trainer_pt_utils import LabelSmoother

 from modelopt.torch.utils import print_rank_0

+try:
+    import wandb
+
+    wandb.init()
+except ImportError:
+    wandb = None
+
 IGNORE_TOKEN_ID = LabelSmoother.ignore_index

 REMOVE_THINK_CHAT_TEMPLATE = (

@@ -382,3 +392,24 @@ def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]:
         }

         return batch
+
+
+class ARValidationCallback(TrainerCallback):
+    def __init__(self, ar_validate_steps: int = 1000):
+        self.ar_validate_steps = ar_validate_steps
+
+    def on_step_end(self, args, state, control, **kwargs):
+        if self.ar_validate_steps <= 0:
+            return control
+        if state.global_step % self.ar_validate_steps == 0 and state.global_step > 0:
+            print_rank_0("Running AR validation...")
+            ars = validate_ar(
+                model=kwargs["model"],
+                tokenizer=kwargs["processing_class"],
+                ds=load_dataset("HuggingFaceH4/mt_bench_prompts")["train"],
+                device=kwargs["model"].device,
+            )
+            print_rank_0(f"Step {state.global_step} AR: {sum(ars) / len(ars):.4f}")
+            if wandb:
+                wandb.log({"validate_ar": sum(ars) / len(ars)}, step=state.global_step)
+        return control
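For context, the wiring that consumes this relocated callback, mirroring the Trainer setup in the main.py hunk below; `model`, `tokenizer`, and `training_args` are assumed to be built as in main.py's train():

from eagle_utils import ARValidationCallback
from transformers import Trainer

trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    callbacks=[ARValidationCallback(training_args.ar_validate_steps)],
)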

examples/speculative_decoding/main.py

Lines changed: 3 additions & 34 deletions

@@ -36,24 +36,15 @@

 import torch
 import transformers
-from ar_validate import validate_ar
-from datasets import load_dataset
-from eagle_utils import make_eagle_supervised_data_module
+from eagle_utils import ARValidationCallback, make_eagle_supervised_data_module
 from medusa_utils import make_medusa_supervised_data_module
-from transformers import Trainer, TrainerCallback
+from transformers import Trainer
 from transformers.trainer_utils import get_last_checkpoint

 import modelopt.torch.opt as mto
 import modelopt.torch.speculative as mtsp
 from modelopt.torch.utils import print_rank_0

-try:
-    import wandb
-
-    wandb.init()
-except ImportError:
-    wandb = None
-
 torch.manual_seed(0)
 mto.enable_huggingface_checkpointing()

@@ -147,9 +138,8 @@ def train():
         model = transformers.AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto")
         tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
     else:
-        model_kwargs = {"num_hidden_layers": 0} if use_offline_training else {}
         model = transformers.AutoModelForCausalLM.from_pretrained(
-            model_args.model_name_or_path, torch_dtype="auto", **model_kwargs
+            model_args.model_name_or_path, torch_dtype="auto", device_map="cpu"
         )
         if use_offline_training:
             # When doing offline training, we need to set num_hidden_layers

@@ -231,34 +221,13 @@ def train():
         tokenizer, data_args, use_offline_training, max_length=training_args.training_seq_len
     )

-    class ARValidationCallback(TrainerCallback):
-        def __init__(self, ar_validate_steps: int = 500):
-            self.ar_validate_steps = ar_validate_steps
-
-        def on_step_end(self, args, state, control, **kwargs):
-            if self.ar_validate_steps <= 0:
-                return control
-            if state.global_step % self.ar_validate_steps == 0 and state.global_step > 0:
-                print_rank_0("Running AR validation...")
-                ars = validate_ar(
-                    model=kwargs["model"],
-                    tokenizer=kwargs["processing_class"],
-                    ds=load_dataset("HuggingFaceH4/mt_bench_prompts")["train"],
-                    device=kwargs["model"].device,
-                )
-                print_rank_0(f"Step {state.global_step} AR: {sum(ars) / len(ars):.4f}")
-                if wandb:
-                    wandb.log({"validate_ar": sum(ars) / len(ars)}, step=state.global_step)
-            return control
-
     trainer = Trainer(
         model=model,
         processing_class=tokenizer,
         args=training_args,
         callbacks=[ARValidationCallback(training_args.ar_validate_steps)],
         **data_module,
     )
-    trainer._move_model_to_device(model, trainer.args.device)

     # Manually enable this to return loss in eval
     trainer.can_return_loss = True
