
Commit 5e51553

Merge branch 'main' into hthadicherla/int4_onnx_fix
2 parents: 58bfca0 + ff8a1ed

File tree: 40 files changed (+1126, -226 lines)

.github/workflows/gpu_tests.yml

Lines changed: 1 addition & 2 deletions
@@ -73,8 +73,7 @@ jobs:
       - uses: nv-gha-runners/setup-proxy-cache@main
       - name: Setup environment variables
         run: |
-          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib" >> $GITHUB_ENV
-          echo "PATH=${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin" >> $GITHUB_ENV
+          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
       - name: Run gpu tests
         run: pip install tox-current-env && tox -e py312-cuda12-gpu --current-env
   gpu-tests-non-pr:

.gitlab/tests.yml

Lines changed: 3 additions & 4 deletions
@@ -6,7 +6,8 @@
   rules:
     - if: $CI_PIPELINE_SOURCE == "schedule"
     - if: $CI_COMMIT_TAG =~ /^\d+\.\d+\.\d+$/
-    - when: manual
+    - if: $CI_PIPELINE_SOURCE == "web" || $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED == "true"
+      when: manual

 ##### Unit Tests #####
 unit:
@@ -34,9 +35,7 @@ unit:
   tags: [docker, linux, 2-gpu]
   before_script:
     # Add libcudnn*.so and libnv*.so to path
-    - export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
-    # Add trtexec to path
-    - export PATH="${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin"
+    - export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu"
     # Install git-lfs for Daring-Anteater dataset
     - apt-get update && apt-get install -y git-lfs
     - git lfs install --system

CHANGELOG.rst

Lines changed: 3 additions & 4 deletions
@@ -1,18 +1,17 @@
 Model Optimizer Changelog (Linux)
 =================================

-0.39 (2025-11-xx)
+0.39 (2025-11-07)
 ^^^^^^^^^^^^^^^^^

-**Deprecations**
-
 **New Features**

 - Add flag ``op_types_to_exclude_fp16`` in ONNX quantization to exclude ops from being converted to FP16/BF16. Alternatively, for custom TensorRT ops, this can also be done by indicating ``'fp32'`` precision in ``trt_plugins_precision``.
 - Add LoRA mode support for MCore in a new peft submodule: ``modelopt.torch.peft.update_model(model, LORA_CFG)``.
 - Support PTQ and fakequant in vLLM for fast evaluation of arbitrary quantization formats. See ``examples/vllm_serve`` for more details.
-- Add support for ``nemotron-post-training-dataset-v2`` and ``nemotron-post-training-dataset-v1`` in ``examples/llm_ptq``. Default to a mix of ``cnn_dailymail`` and ``nemotron-post-training-dataset-v2`` if no dataset is specified.
+- Add support for ``nemotron-post-training-dataset-v2`` and ``nemotron-post-training-dataset-v1`` in ``examples/llm_ptq``. Default to a mix of ``cnn_dailymail`` and ``nemotron-post-training-dataset-v2`` (gated dataset accessed using ``HF_TOKEN`` environment variable) if no dataset is specified.
 - Allow specifying ``calib_seq`` in ``examples/llm_ptq`` to set the maximum sequence length for calibration.
+- Add support for MCore MoE PTQ/QAT/QAD.

 **Documentation**

docs/source/getting_started/_installation_for_Linux.rst

Lines changed: 1 addition & 2 deletions
@@ -41,8 +41,7 @@ Environment setup
 .. code-block:: shell

     export PIP_CONSTRAINT=""
-    export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
-    export PATH="${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin"
+    export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu"

 You may need to install additional dependencies from the respective examples's `requirements.txt` file.
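The GitHub and GitLab CI diffs above make the same LD_LIBRARY_PATH trim as this docs snippet. As an illustrative sanity check (not part of the docs), one can confirm the export took effect before running the examples:

    # Illustrative check, not from the docs: confirm the trimmed
    # LD_LIBRARY_PATH from the snippet above is visible to this process.
    import os

    lib_dirs = os.environ.get("LD_LIBRARY_PATH", "").split(":")
    assert "/usr/lib/x86_64-linux-gnu" in lib_dirs, "re-run the export above first"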

examples/diffusers/quantization/requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -4,3 +4,6 @@ nvtx
 onnx_graphsurgeon
 opencv-python>=4.8.1.78,<4.12.0.88
 sentencepiece
+# TODO: Fix for torch 2.9
+torch<2.9
+torchvision<0.24.0

examples/llm_sparsity/launch_finetune.sh

Lines changed: 1 addition & 1 deletion
@@ -91,7 +91,7 @@ CMD="accelerate launch --multi_gpu --mixed_precision bf16 finetune.py \
     --warmup_ratio 0.0 \
     --lr_scheduler_type cosine \
     --logging_steps 1 \
-    --fsdp full_shard auto_wrap \
+    --fsdp 'full_shard auto_wrap' \
     --fsdp_transformer_layer_cls_to_wrap LlamaDecoderLayer \
     --tf32 True \
     --modelopt_restore_path $MODELOPT_RESTORE_PATH \
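The fix quotes the two FSDP options so they reach the launcher as a single argument value. A minimal sketch of the difference, using Python's shlex to mimic the word splitting the shell performs when the CMD string is executed:

    import shlex

    # Unquoted: "auto_wrap" becomes a separate token, which the launcher
    # would treat as a stray positional argument.
    print(shlex.split("--fsdp full_shard auto_wrap"))
    # ['--fsdp', 'full_shard', 'auto_wrap']

    # Quoted: both options reach --fsdp as one value.
    print(shlex.split("--fsdp 'full_shard auto_wrap'"))
    # ['--fsdp', 'full_shard auto_wrap']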
examples/llm_sparsity/requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
 flash-attn
 sentencepiece>=0.2.0
 tensorboardX
-transformers>=4.57.0

examples/onnx_ptq/evaluate.py

Lines changed: 8 additions & 1 deletion
@@ -38,9 +38,15 @@ def main():
     parser.add_argument(
         "--engine_path",
         type=str,
-        required=True,
+        default=None,
         help="Path to the TensorRT engine",
     )
+    parser.add_argument(
+        "--timing_cache_path",
+        type=str,
+        default=None,
+        help="Path to the TensorRT timing cache",
+    )
     parser.add_argument(
         "--imagenet_path", type=str, default=None, help="Path to the imagenet dataset"
     )
@@ -81,6 +87,7 @@ def main():
     # Compile the ONNX model to TRT engine and create the device model
     compilation_args = {
         "engine_path": args.engine_path,
+        "timing_cache_path": args.timing_cache_path,
     }
     compiled_model = client.ir_to_compiled(onnx_bytes, compilation_args)
     device_model = DeviceModel(client, compiled_model, metadata={})
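With --engine_path now optional and the new --timing_cache_path, repeated runs can rebuild engines while reusing TensorRT kernel timings. A hypothetical invocation, using only the flags shown in the diff (file paths are placeholders; any other flags evaluate.py requires are omitted):

    import subprocess

    # Placeholder paths; --timing_cache_path lets subsequent engine builds
    # skip kernel re-timing by reusing the cache file.
    subprocess.run(
        [
            "python", "evaluate.py",
            "--engine_path", "model.engine",
            "--timing_cache_path", "model.timing.cache",
            "--imagenet_path", "/data/imagenet",
        ],
        check=True,
    )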

examples/speculative_decoding/README.md

Lines changed: 1 addition & 1 deletion
@@ -312,7 +312,7 @@ trainer.save_model("<path to the output directory>")
 | LLAMA 3, 3.1 | ✅ | ✅ | ✅ |
 | Mistral | ✅ | ✅ | ✅ |
 | Phi 3 | ✅ | ✅ | ✅ |
-| QWen 1.5,2,2.5 | ✅ | ✅ | ✅ |
+| QWen 1.5,2,2.5,3 | ✅ | ✅ | ✅ |

 ## Speculation Module Checkpoints

examples/speculative_decoding/collect_hidden_states/compute_hidden_states_trtllm.py

Lines changed: 14 additions & 11 deletions
@@ -208,13 +208,16 @@ def keep_conversation(entry):
     num_success = 0
     pbar = tqdm(total=len(dataset), desc=f"DP#{args.dp_rank} Processing conversations")

-    def _post_process_trtllm_dumped(trtllm_dumped_file: str, conversation_id: int):
-        """Post-process the TRTLLM dumped file to same format as HF dumped:
+    async def _post_process_trtllm_dumped(trtllm_dumped_file: str, conversation_id: int):
+        """
+        Post-process the TRTLLM dumped file to same format as HF dumped:
         1. Remove id field, replace it with conversation_id
         2. Rename hidden_state field to hidden_states
         3. From list of length 1 to dict
         4. Rename file to conversation_id.pt
         """
+        if not trtllm_dumped_file.exists():
+            return False
         with open(trtllm_dumped_file, "rb") as f:
             trtllm_dumped = torch.load(f)
         assert isinstance(trtllm_dumped, list) and len(trtllm_dumped) == 1, (
@@ -232,35 +235,33 @@ def _post_process_trtllm_dumped(trtllm_dumped_file: str, conversation_id: int):
         output_file = args.output_dir / f"{conversation_id}.pt"
         with open(output_file, "wb") as f:
             torch.save(trtllm_dumped, f)
-
-        if trtllm_dumped_file.exists():
-            trtllm_dumped_file.unlink()
+        trtllm_dumped_file.unlink()
+        return True

     async def dump_hidden_states(idx: int, conversation_id: int, input_ids: list[int]):
         nonlocal num_success
         await llm_spec.generate_async(input_ids, sampling_params)
         # TRTLLM API name files starts from 1
         # ref:https://github.com/NVIDIA/TensorRT-LLM/pull/7012
         trtllm_dumped_file = args.output_dir / f"{spec_config['file_prefix']}_{idx + 1}.pt"
-        _post_process_trtllm_dumped(trtllm_dumped_file, conversation_id)
-        num_success += 1
+        dump_success = await _post_process_trtllm_dumped(trtllm_dumped_file, conversation_id)
+        num_success += int(dump_success)
         pbar.update(1)

     async def submit_generates():
         nonlocal num_skipped_too_long
         nonlocal num_invalid
         tasks = []
-        for idx, entry in enumerate(dataset):
+        idx = 0
+        for entry in dataset:
             conversation_id = entry.get("conversation_id", entry.get("uuid"))

             conversations = entry["conversations"]
             if not conversations or not isinstance(conversations, list):
                 num_invalid += 1
                 continue

-            input_ids = tokenizer.apply_chat_template(conversations, add_generation_template=False)[
-                :256
-            ]
+            input_ids = tokenizer.apply_chat_template(conversations, add_generation_template=False)
             num_input_tokens = (
                 input_ids.shape[1] if isinstance(input_ids, torch.Tensor) else len(input_ids)
             )
@@ -269,6 +270,8 @@ async def submit_generates():
                 continue

             tasks.append(dump_hidden_states(idx, conversation_id, input_ids))
+            # Increment only for valid conversations to match dump file index
+            idx += 1
         await asyncio.gather(*tasks)

     asyncio.run(submit_generates())
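For reference, a synchronous sketch of the conversion _post_process_trtllm_dumped performs, assuming the TRT-LLM dump is a single-element list whose entry carries "id" and "hidden_state" fields (per the docstring in the diff); the function name and exact dict layout here are assumptions:

    from pathlib import Path

    import torch

    def post_process(trtllm_dumped_file: Path, conversation_id: int, output_dir: Path) -> bool:
        if not trtllm_dumped_file.exists():
            return False  # generation produced no dump for this conversation
        record = torch.load(trtllm_dumped_file)[0]  # list of length 1 -> dict
        record.pop("id", None)  # drop TRT-LLM's running id in favor of the dataset's
        record["conversation_id"] = conversation_id
        record["hidden_states"] = record.pop("hidden_state")  # match HF field name
        torch.save(record, output_dir / f"{conversation_id}.pt")  # rename file
        trtllm_dumped_file.unlink()  # remove the original <file_prefix>_<n>.pt dump
        return True

Returning False instead of raising when the dump is missing is what lets the caller count num_success accurately, since TRT-LLM names its dumps by a 1-based submission index that must stay aligned with the loop's idx.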
