
Commit 5e51553

Merge branch 'main' into hthadicherla/int4_onnx_fix
2 parents: 58bfca0 + ff8a1ed

File tree: 40 files changed (+1126, -226 lines)

.github/workflows/gpu_tests.yml

Lines changed: 1 addition & 2 deletions
@@ -73,8 +73,7 @@ jobs:
       - uses: nv-gha-runners/setup-proxy-cache@main
       - name: Setup environment variables
         run: |
-          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib" >> $GITHUB_ENV
-          echo "PATH=${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin" >> $GITHUB_ENV
+          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
       - name: Run gpu tests
         run: pip install tox-current-env && tox -e py312-cuda12-gpu --current-env
   gpu-tests-non-pr:

.gitlab/tests.yml

Lines changed: 3 additions & 4 deletions
@@ -6,7 +6,8 @@
   rules:
     - if: $CI_PIPELINE_SOURCE == "schedule"
     - if: $CI_COMMIT_TAG =~ /^\d+\.\d+\.\d+$/
-    - when: manual
+    - if: $CI_PIPELINE_SOURCE == "web" || $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED == "true"
+      when: manual

 ##### Unit Tests #####
 unit:
@@ -34,9 +35,7 @@ unit:
   tags: [docker, linux, 2-gpu]
   before_script:
     # Add libcudnn*.so and libnv*.so to path
-    - export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
-    # Add trtexec to path
-    - export PATH="${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin"
+    - export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu"
     # Install git-lfs for Daring-Anteater dataset
     - apt-get update && apt-get install -y git-lfs
     - git lfs install --system

CHANGELOG.rst

Lines changed: 3 additions & 4 deletions
@@ -1,18 +1,17 @@
 Model Optimizer Changelog (Linux)
 =================================

-0.39 (2025-11-xx)
+0.39 (2025-11-07)
 ^^^^^^^^^^^^^^^^^

-**Deprecations**
-
 **New Features**

 - Add flag ``op_types_to_exclude_fp16`` in ONNX quantization to exclude ops from being converted to FP16/BF16. Alternatively, for custom TensorRT ops, this can also be done by indicating ``'fp32'`` precision in ``trt_plugins_precision``.
 - Add LoRA mode support for MCore in a new peft submodule: ``modelopt.torch.peft.update_model(model, LORA_CFG)``.
 - Support PTQ and fakequant in vLLM for fast evaluation of arbitrary quantization formats. See ``examples/vllm_serve`` for more details.
-- Add support for ``nemotron-post-training-dataset-v2`` and ``nemotron-post-training-dataset-v1`` in ``examples/llm_ptq``. Default to a mix of ``cnn_dailymail`` and ``nemotron-post-training-dataset-v2`` if no dataset is specified.
+- Add support for ``nemotron-post-training-dataset-v2`` and ``nemotron-post-training-dataset-v1`` in ``examples/llm_ptq``. Default to a mix of ``cnn_dailymail`` and ``nemotron-post-training-dataset-v2`` (gated dataset accessed using ``HF_TOKEN`` environment variable) if no dataset is specified.
 - Allow specifying ``calib_seq`` in ``examples/llm_ptq`` to set the maximum sequence length for calibration.
+- Add support for MCore MoE PTQ/QAT/QAD.

 **Documentation**

docs/source/getting_started/_installation_for_Linux.rst

Lines changed: 1 addition & 2 deletions
@@ -41,8 +41,7 @@ Environment setup
 .. code-block:: shell

     export PIP_CONSTRAINT=""
-    export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
-    export PATH="${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin"
+    export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu"

 You may need to install additional dependencies from the respective examples's `requirements.txt` file.
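The GitHub and GitLab CI diffs above make the same LD_LIBRARY_PATH trim as this docs snippet. As an illustrative sanity check (not part of the docs), one can confirm the export took effect before running the examples:

    # Illustrative check, not from the docs: confirm the trimmed
    # LD_LIBRARY_PATH from the snippet above is visible to this process.
    import os

    lib_dirs = os.environ.get("LD_LIBRARY_PATH", "").split(":")
    assert "/usr/lib/x86_64-linux-gnu" in lib_dirs, "re-run the export above first"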

examples/diffusers/quantization/requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -4,3 +4,6 @@ nvtx
 onnx_graphsurgeon
 opencv-python>=4.8.1.78,<4.12.0.88
 sentencepiece
+# TODO: Fix for torch 2.9
+torch<2.9
+torchvision<0.24.0

examples/llm_sparsity/launch_finetune.sh

Lines changed: 1 addition & 1 deletion
@@ -91,7 +91,7 @@ CMD="accelerate launch --multi_gpu --mixed_precision bf16 finetune.py \
     --warmup_ratio 0.0 \
     --lr_scheduler_type cosine \
     --logging_steps 1 \
-    --fsdp full_shard auto_wrap \
+    --fsdp 'full_shard auto_wrap' \
     --fsdp_transformer_layer_cls_to_wrap LlamaDecoderLayer \
     --tf32 True \
     --modelopt_restore_path $MODELOPT_RESTORE_PATH \
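The fix quotes the two FSDP options so they reach the launcher as a single argument value. A minimal sketch of the difference, using Python's shlex to mimic the word splitting the shell performs when the CMD string is executed:

    import shlex

    # Unquoted: "auto_wrap" becomes a separate token, which the launcher
    # would treat as a stray positional argument.
    print(shlex.split("--fsdp full_shard auto_wrap"))
    # ['--fsdp', 'full_shard', 'auto_wrap']

    # Quoted: both options reach --fsdp as one value.
    print(shlex.split("--fsdp 'full_shard auto_wrap'"))
    # ['--fsdp', 'full_shard auto_wrap']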
examples/llm_sparsity/requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
 flash-attn
 sentencepiece>=0.2.0
 tensorboardX
-transformers>=4.57.0

examples/onnx_ptq/evaluate.py

Lines changed: 8 additions & 1 deletion
@@ -38,9 +38,15 @@ def main():
     parser.add_argument(
         "--engine_path",
         type=str,
-        required=True,
+        default=None,
         help="Path to the TensorRT engine",
     )
+    parser.add_argument(
+        "--timing_cache_path",
+        type=str,
+        default=None,
+        help="Path to the TensorRT timing cache",
+    )
     parser.add_argument(
         "--imagenet_path", type=str, default=None, help="Path to the imagenet dataset"
     )
@@ -81,6 +87,7 @@ def main():
     # Compile the ONNX model to TRT engine and create the device model
     compilation_args = {
         "engine_path": args.engine_path,
+        "timing_cache_path": args.timing_cache_path,
     }
     compiled_model = client.ir_to_compiled(onnx_bytes, compilation_args)
     device_model = DeviceModel(client, compiled_model, metadata={})
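With --engine_path now optional and the new --timing_cache_path, repeated runs can rebuild engines while reusing TensorRT kernel timings. A hypothetical invocation, using only the flags shown in the diff (file paths are placeholders; any other flags evaluate.py requires are omitted):

    import subprocess

    # Placeholder paths; --timing_cache_path lets subsequent engine builds
    # skip kernel re-timing by reusing the cache file.
    subprocess.run(
        [
            "python", "evaluate.py",
            "--engine_path", "model.engine",
            "--timing_cache_path", "model.timing.cache",
            "--imagenet_path", "/data/imagenet",
        ],
        check=True,
    )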

examples/speculative_decoding/README.md

Lines changed: 1 addition & 1 deletion
@@ -312,7 +312,7 @@ trainer.save_model("<path to the output directory>")
 | LLAMA 3, 3.1 | ✅ | ✅ | ✅ |
 | Mistral | ✅ | ✅ | ✅ |
 | Phi 3 | ✅ | ✅ | ✅ |
-| QWen 1.5,2,2.5 | ✅ | ✅ | ✅ |
+| QWen 1.5,2,2.5,3 | ✅ | ✅ | ✅ |

 ## Speculation Module Checkpoints

examples/speculative_decoding/collect_hidden_states/compute_hidden_states_trtllm.py

Lines changed: 14 additions & 11 deletions
@@ -208,13 +208,16 @@ def keep_conversation(entry):
     num_success = 0
     pbar = tqdm(total=len(dataset), desc=f"DP#{args.dp_rank} Processing conversations")

-    def _post_process_trtllm_dumped(trtllm_dumped_file: str, conversation_id: int):
-        """Post-process the TRTLLM dumped file to same format as HF dumped:
+    async def _post_process_trtllm_dumped(trtllm_dumped_file: str, conversation_id: int):
+        """
+        Post-process the TRTLLM dumped file to same format as HF dumped:
         1. Remove id field, replace it with conversation_id
         2. Rename hidden_state field to hidden_states
         3. From list of length 1 to dict
         4. Rename file to conversation_id.pt
         """
+        if not trtllm_dumped_file.exists():
+            return False
         with open(trtllm_dumped_file, "rb") as f:
             trtllm_dumped = torch.load(f)
         assert isinstance(trtllm_dumped, list) and len(trtllm_dumped) == 1, (
@@ -232,35 +235,33 @@ def _post_process_trtllm_dumped(trtllm_dumped_file: str, conversation_id: int):
         output_file = args.output_dir / f"{conversation_id}.pt"
         with open(output_file, "wb") as f:
             torch.save(trtllm_dumped, f)
-
-        if trtllm_dumped_file.exists():
-            trtllm_dumped_file.unlink()
+        trtllm_dumped_file.unlink()
+        return True

     async def dump_hidden_states(idx: int, conversation_id: int, input_ids: list[int]):
         nonlocal num_success
         await llm_spec.generate_async(input_ids, sampling_params)
         # TRTLLM API name files starts from 1
         # ref:https://github.com/NVIDIA/TensorRT-LLM/pull/7012
         trtllm_dumped_file = args.output_dir / f"{spec_config['file_prefix']}_{idx + 1}.pt"
-        _post_process_trtllm_dumped(trtllm_dumped_file, conversation_id)
-        num_success += 1
+        dump_success = await _post_process_trtllm_dumped(trtllm_dumped_file, conversation_id)
+        num_success += int(dump_success)
         pbar.update(1)

     async def submit_generates():
         nonlocal num_skipped_too_long
         nonlocal num_invalid
         tasks = []
-        for idx, entry in enumerate(dataset):
+        idx = 0
+        for entry in dataset:
             conversation_id = entry.get("conversation_id", entry.get("uuid"))

             conversations = entry["conversations"]
             if not conversations or not isinstance(conversations, list):
                 num_invalid += 1
                 continue

-            input_ids = tokenizer.apply_chat_template(conversations, add_generation_template=False)[
-                :256
-            ]
+            input_ids = tokenizer.apply_chat_template(conversations, add_generation_template=False)
             num_input_tokens = (
                 input_ids.shape[1] if isinstance(input_ids, torch.Tensor) else len(input_ids)
             )
@@ -269,6 +270,8 @@ async def submit_generates():
                 continue

             tasks.append(dump_hidden_states(idx, conversation_id, input_ids))
+            # Increment only for valid conversations to match dump file index
+            idx += 1
         await asyncio.gather(*tasks)

     asyncio.run(submit_generates())
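For reference, a synchronous sketch of the conversion _post_process_trtllm_dumped performs, assuming the TRT-LLM dump is a single-element list whose entry carries "id" and "hidden_state" fields (per the docstring in the diff); the function name and exact dict layout here are assumptions:

    from pathlib import Path

    import torch

    def post_process(trtllm_dumped_file: Path, conversation_id: int, output_dir: Path) -> bool:
        if not trtllm_dumped_file.exists():
            return False  # generation produced no dump for this conversation
        record = torch.load(trtllm_dumped_file)[0]  # list of length 1 -> dict
        record.pop("id", None)  # drop TRT-LLM's running id in favor of the dataset's
        record["conversation_id"] = conversation_id
        record["hidden_states"] = record.pop("hidden_state")  # match HF field name
        torch.save(record, output_dir / f"{conversation_id}.pt")  # rename file
        trtllm_dumped_file.unlink()  # remove the original <file_prefix>_<n>.pt dump
        return True

Returning False instead of raising when the dump is missing is what lets the caller count num_success accurately, since TRT-LLM names its dumps by a 1-based submission index that must stay aligned with the loop's idx.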
