Commit 85f8795

Merge branch 'main' into rislam/import-fix
2 parents 009865b + 3524732 commit 85f8795

92 files changed: +2767 -1105 lines changed


.github/CODEOWNERS

Lines changed: 21 additions & 21 deletions
@@ -32,24 +32,24 @@ modelopt/torch/utils @NVIDIA/modelopt-torch-utils-codeowners
 # Examples
 /docker @NVIDIA/modelopt-docker-codeowners
 /README.md @NVIDIA/modelopt-examples-codeowners
-examples @NVIDIA/modelopt-examples-codeowners
-examples/chained_optimizations @NVIDIA/modelopt-torch-nas-prune-codeowners
-examples/cnn_qat @NVIDIA/modelopt-examples-cnn_qat-codeowners
-examples/deepseek @NVIDIA/modelopt-deploy-codeowners
-examples/diffusers @NVIDIA/modelopt-examples-diffusers-codeowners
-examples/gpt-oss @NVIDIA/modelopt-examples-gpt-oss-codeowners
-examples/llm_autodeploy @NVIDIA/modelopt-deploy-codeowners
-examples/llm_distill @NVIDIA/modelopt-torch-distill-codeowners
-examples/llm_eval @NVIDIA/modelopt-examples-llm_ptq-codeowners
-examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
-examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
-examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
-examples/megatron-lm @NVIDIA/modelopt-examples-megatron-codeowners
-examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
-examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
-examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners
-examples/pruning @NVIDIA/modelopt-torch-nas-prune-codeowners
-examples/speculative_decoding @NVIDIA/modelopt-torch-speculative-codeowners
-examples/vlm_eval @NVIDIA/modelopt-examples-vlm-codeowners
-examples/vlm_ptq @NVIDIA/modelopt-examples-vlm-codeowners
-examples/windows @NVIDIA/modelopt-windows-codeowners
+/examples @NVIDIA/modelopt-examples-codeowners
+/examples/chained_optimizations @NVIDIA/modelopt-torch-nas-prune-codeowners
+/examples/cnn_qat @NVIDIA/modelopt-examples-cnn_qat-codeowners
+/examples/deepseek @NVIDIA/modelopt-deploy-codeowners
+/examples/diffusers @NVIDIA/modelopt-examples-diffusers-codeowners
+/examples/gpt-oss @NVIDIA/modelopt-examples-gpt-oss-codeowners
+/examples/llm_autodeploy @NVIDIA/modelopt-deploy-codeowners
+/examples/llm_distill @NVIDIA/modelopt-torch-distill-codeowners
+/examples/llm_eval @NVIDIA/modelopt-examples-llm_ptq-codeowners
+/examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
+/examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
+/examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
+/examples/megatron-lm @NVIDIA/modelopt-examples-megatron-codeowners
+/examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
+/examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
+/examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners
+/examples/pruning @NVIDIA/modelopt-torch-nas-prune-codeowners
+/examples/speculative_decoding @NVIDIA/modelopt-torch-speculative-codeowners
+/examples/vlm_eval @NVIDIA/modelopt-examples-vlm-codeowners
+/examples/vlm_ptq @NVIDIA/modelopt-examples-vlm-codeowners
+/examples/windows @NVIDIA/modelopt-windows-codeowners

.github/workflows/gpu_tests.yml

Lines changed: 21 additions & 1 deletion
@@ -22,20 +22,31 @@ jobs:
       any_changed: ${{ steps.changed-tests.outputs.any_changed }}
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
      - id: get-pr-info
        uses: nv-gha-runners/get-pr-info@main
+      # Get commit from main branch that is present in the PR to use as base for changed files
+      - id: calculate-merge-base
+        env:
+          PR_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
+          BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
+        run: |
+          (echo -n "merge-base="; git merge-base "$BASE_SHA" "$PR_SHA") | tee --append "${GITHUB_OUTPUT}"
      - name: Check for changes in test-relevant directories
        id: changed-tests
        uses: step-security/[email protected]
        with:
+          base_sha: ${{ steps.calculate-merge-base.outputs.merge-base }}
+          sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
          files: |
            .github/workflows/gpu_tests.yml
            modelopt/**
            tests/gpu/**
            tox.ini
            pyproject.toml
            setup.py
-          base_sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}
+          fail_on_initial_diff_error: true
   wait-checks:
     needs: [check-file-changes]
     if: needs.check-file-changes.outputs.any_changed == 'true'
@@ -70,3 +81,12 @@ jobs:
     timeout-minutes: 90
     container: *gpu_container
     steps: *gpu_steps
+  gpu-pr-required-check:
+    # Run even if gpu-tests-pr is skipped
+    if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
+    needs: [check-file-changes, gpu-tests-pr]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Required GPU tests did not succeed
+        if: ${{ needs.check-file-changes.result != 'success' || (needs.check-file-changes.outputs.any_changed == 'true' && needs.gpu-tests-pr.result != 'success') }}
+        run: exit 1
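
The calculate-merge-base step above replaces the old comparison against the branch tip: changed files are now computed against the merge-base of the PR head and its target branch, so commits that landed on main after the PR branched off are not counted as PR changes. A rough Python equivalent of that comparison (the helper name and argument handling are illustrative, not part of the workflow; it assumes a full clone, i.e. fetch-depth: 0):

# Rough sketch of the merge-base comparison performed by the workflow above.
import subprocess

def pr_changed_files(base_sha: str, head_sha: str) -> list[str]:
    # Find the common ancestor of the PR head and the target branch.
    merge_base = subprocess.run(
        ["git", "merge-base", base_sha, head_sha],
        capture_output=True, text=True, check=True,
    ).stdout.strip()
    # Diff the PR head against that ancestor, listing file names only.
    diff = subprocess.run(
        ["git", "diff", "--name-only", merge_base, head_sha],
        capture_output=True, text=True, check=True,
    ).stdout
    return [name for name in diff.splitlines() if name]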

.github/workflows/unit_tests.yml

Lines changed: 6 additions & 7 deletions
@@ -4,13 +4,6 @@ name: Unit tests
 on:
   pull_request:
     branches: [main, release/*]
-    paths:
-      - ".github/workflows/unit_tests.yml"
-      - "modelopt/**"
-      - "tests/unit/**"
-      - "pyproject.toml"
-      - "setup.py"
-      - "tox.ini"
   push:
     branches: [main, release/*]
     paths:
@@ -126,3 +119,9 @@ jobs:
         python-version: "3.12"
     - name: Run unit tests
      run: pip install tox && tox -e py312-partial-unit-${{ matrix.test-env }}
+  unit-pr-required-check:
+    if: github.event_name == 'pull_request'
+    needs: [linux, windows, multi-py, multi-torch, multi-transformers, partial-install]
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "All PR unit test jobs completed"

CHANGELOG.rst

Lines changed: 3 additions & 0 deletions
@@ -5,10 +5,12 @@ Model Optimizer Changelog (Linux)
 ^^^^^^^^^^^^^^^^^

 **Deprecations**
+- Deprecated the ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` in favor of strong typing. Use ``engine_precision`` instead.

 **Bug Fixes**

 **New Features**
+- ``high_precision_dtype`` now defaults to fp16 in ONNX quantization, i.e. quantized output model weights are FP16 by default.

 0.35 (2025-09-04)
 ^^^^^^^^^^^^^^^^^
@@ -21,6 +23,7 @@ Model Optimizer Changelog (Linux)
 **Bug Fixes**

 - Fix attention head ranking logic for pruning Megatron Core GPT models.
+- Upgrade TensorRT-LLM dependency to 1.1.0rc2.

 **New Features**

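For the new fp16 default, callers that relied on the previous behavior can pass the dtype explicitly. A minimal sketch, assuming the ONNX PTQ entry point ``modelopt.onnx.quantization.quantize`` accepts ``high_precision_dtype`` as a keyword; ``model.onnx`` is a placeholder input path:

# Minimal sketch; keyword and entry point assumed from the changelog entry above.
from modelopt.onnx.quantization import quantize

quantize(
    onnx_path="model.onnx",       # placeholder input model
    high_precision_dtype="fp32",  # override the new fp16 default if needed
)
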
docker/Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/tensorrt-llm/release:1.0.0rc6
+FROM nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2

 ARG PIP_EXTRA_INDEX_URL="https://pypi.nvidia.com"
 ENV PIP_EXTRA_INDEX_URL=$PIP_EXTRA_INDEX_URL \

examples/diffusers/quantization/diffusion_trt.py

Lines changed: 5 additions & 1 deletion
@@ -105,7 +105,11 @@ def main():

     image_name = args.save_image_as if args.save_image_as else f"{args.model}.png"

-    pipe = PipelineManager.create_pipeline_from(MODEL_ID[args.model], dtype_map[args.model_dtype])
+    pipe = PipelineManager.create_pipeline_from(
+        MODEL_ID[args.model],
+        dtype_map[args.model_dtype],
+        override_model_path=args.override_model_path,
+    )

     # Save the backbone of the pipeline and move it to the GPU
     add_embedding = None

examples/diffusers/quantization/quantize.py

Lines changed: 6 additions & 2 deletions
@@ -309,7 +309,9 @@ def __init__(self, config: ModelConfig, logger: logging.Logger):

     @staticmethod
     def create_pipeline_from(
-        model_type: ModelType, torch_dtype: torch.dtype = torch.bfloat16
+        model_type: ModelType,
+        torch_dtype: torch.dtype = torch.bfloat16,
+        override_model_path: str | None = None,
     ) -> DiffusionPipeline:
         """
         Create and return an appropriate pipeline based on configuration.
@@ -321,7 +323,9 @@ def create_pipeline_from(
             ValueError: If model type is unsupported
         """
         try:
-            model_id = MODEL_REGISTRY[model_type]
+            model_id = (
+                MODEL_REGISTRY[model_type] if override_model_path is None else override_model_path
+            )
             if model_type == ModelType.SD3_MEDIUM:
                 pipe = StableDiffusion3Pipeline.from_pretrained(model_id, torch_dtype=torch_dtype)
             elif model_type in [ModelType.FLUX_DEV, ModelType.FLUX_SCHNELL]:
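
With the new ``override_model_path`` argument, ``create_pipeline_from`` can load a local or custom checkpoint instead of the registry entry for the model type. A hedged usage sketch (the checkpoint path is hypothetical, and the import assumes the example's quantize.py module is on the path):

# Hypothetical caller of the new signature; the checkpoint path is a placeholder.
import torch
from quantize import ModelType, PipelineManager

pipe = PipelineManager.create_pipeline_from(
    ModelType.FLUX_DEV,
    torch_dtype=torch.bfloat16,
    override_model_path="/ckpts/flux-dev-local",  # falls back to MODEL_REGISTRY when None
)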

examples/diffusers/quantization/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 cuda-python
+diffusers<=0.34.0
 nvtx
 onnx_graphsurgeon
 opencv-python>=4.8.1.78,<4.12.0.88

examples/llm_ptq/hf_ptq.py

Lines changed: 18 additions & 8 deletions
@@ -25,7 +25,9 @@
 from accelerate.hooks import remove_hook_from_module
 from example_utils import apply_kv_cache_quant, get_model, get_processor, get_tokenizer, is_enc_dec
 from transformers import (
+    AutoConfig,
     AutoModelForCausalLM,
+    AutoProcessor,
     PreTrainedTokenizer,
     PreTrainedTokenizerFast,
     WhisperProcessor,
@@ -39,6 +41,7 @@
     export_tensorrt_llm_checkpoint,
     get_model_type,
 )
+from modelopt.torch.export.model_utils import is_multimodal_model
 from modelopt.torch.quantization.config import need_calibration
 from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
 from modelopt.torch.quantization.utils import is_quantized
@@ -567,19 +570,26 @@ def output_decode(generated_ids, input_shape):

     export_path = args.export_path

-    if hasattr(full_model, "language_model"):
-        # Save original model config and the preprocessor config to the export path for VLMs.
-        from transformers import AutoConfig, AutoProcessor
+    # Check if the model is a multimodal/VLM model
+    is_vlm = is_multimodal_model(full_model)

-        print(f"Saving original model and processor configs to {export_path}")
+    if is_vlm:
+        # Save original model config and the processor config to the export path for VLMs.
+        print(f"Saving original model config to {export_path}")

         AutoConfig.from_pretrained(
             args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
         ).save_pretrained(export_path)

-        AutoProcessor.from_pretrained(
-            args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
-        ).save_pretrained(export_path)
+        # Try to save processor config if available
+        try:
+            print(f"Saving processor config to {export_path}")
+            AutoProcessor.from_pretrained(
+                args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
+            ).save_pretrained(export_path)
+        except Exception as e:
+            print(f"Warning: Could not save processor config: {e}")
+            print("This is normal for some VLM architectures that don't use AutoProcessor")

     if model_type == "mllama":
         full_model_config = model.config
@@ -732,7 +742,7 @@ def output_decode(generated_ids, input_shape):
     )
     parser.add_argument(
         "--verbose",
-        help="Print verbose output (e.g. quantization summary). Disable by --no_verbose.",
+        help="Print verbose output (e.g. quantization summary). Disable by --no-verbose.",
         default=True,
         action=argparse.BooleanOptionalAction,
     )
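
The help-text fix in the last hunk matches how ``argparse.BooleanOptionalAction`` actually spells the negated flag: it generates ``--no-verbose`` (with a dash), not ``--no_verbose``. A small standalone check:

# Standalone check of the flag spelling referenced in the help text above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--verbose", default=True, action=argparse.BooleanOptionalAction)
print(parser.parse_args(["--no-verbose"]).verbose)  # prints: False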

examples/llm_qat/README.md

Lines changed: 0 additions & 1 deletion
@@ -82,7 +82,6 @@ def forward_loop(model):


 # Quantize the model in-place; The model should be unwrapped from any distributed wrapper
-# The model may be wrapped in a DataParallel or DistributedDataParallel after `mtq.quantize`
 model = mtq.quantize(model, mtq.INT8_DEFAULT_CFG, forward_loop)

 # Save the modelopt quantizer states
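
The removed comment was misleading; the remaining one states the actual requirement: quantize the bare module first, then apply any distributed wrapper. A short sketch of that ordering, reusing ``model`` and ``forward_loop`` from the README snippet above (the DDP wrap is an illustrative choice, not prescribed by the README):

# Quantize the unwrapped module first, then wrap it for distributed training.
import torch
import modelopt.torch.quantization as mtq

model = mtq.quantize(model, mtq.INT8_DEFAULT_CFG, forward_loop)
model = torch.nn.parallel.DistributedDataParallel(model)  # illustrative wrapper choice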
