Commit cd6fce2

Merge branch 'main' into kaix/sparse_attention_core

2 parents 40c1b7d + d0b0c0f
File tree: 138 files changed, +6259 -2849 lines changed

.github/CODEOWNERS

Lines changed: 0 additions & 1 deletion

@@ -44,7 +44,6 @@ modelopt/torch/utils @NVIDIA/modelopt-torch-utils-codeowners
 /examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
 /examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
 /examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
-/examples/megatron-lm @NVIDIA/modelopt-examples-megatron-codeowners
 /examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
 /examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
 /examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners

.github/workflows/example_tests.yml

Lines changed: 2 additions & 4 deletions

@@ -54,12 +54,11 @@ jobs:
     checks: read
     secrets: inherit
     with:
-      match_pattern: '^DCO$|^linux$' # Wait for DCO and Unit tests / linux to pass
+      match_pattern: "^DCO$|^linux$" # Wait for DCO and Unit tests / linux to pass
       delay: 300s
   example-tests-pr:
     needs: [check-file-changes, wait-checks]
     if: needs.check-file-changes.outputs.any_changed == 'true'
-    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
     runs-on: linux-amd64-gpu-h100-latest-1
     timeout-minutes: 90
     strategy:
@@ -84,8 +83,7 @@ jobs:
         pytest -s tests/examples/${{ matrix.EXAMPLE }}
   example-tests-non-pr:
     if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
-    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
-    runs-on: linux-amd64-gpu-h100-latest-1
+    runs-on: linux-amd64-gpu-h100-latest-2
     timeout-minutes: 90
     strategy:
       matrix:
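The quoting change on `match_pattern` is stylistic only (the regex is unchanged); the pattern gates the job on checks whose names match the whole expression exactly. A quick illustrative check of the alternation, using `grep -E` as a stand-in for whatever matcher the reusable wait-checks workflow applies internally (an assumption here):

```bash
# Anchored alternation: a check name matches only if it is exactly "DCO" or exactly "linux".
for name in "DCO" "linux" "linux (py312)" "windows"; do
  if echo "$name" | grep -qE '^DCO$|^linux$'; then
    echo "wait for: $name"
  else
    echo "skip:     $name"
  fi
done
```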

.github/workflows/gpu_tests.yml

Lines changed: 2 additions & 4 deletions

@@ -54,12 +54,11 @@ jobs:
     checks: read
     secrets: inherit
     with:
-      match_pattern: '^DCO$|^linux$' # Wait for DCO and Unit tests / linux to pass
+      match_pattern: "^DCO$|^linux$" # Wait for DCO and Unit tests / linux to pass
       delay: 300s
   gpu-tests-pr:
     needs: [check-file-changes, wait-checks]
     if: needs.check-file-changes.outputs.any_changed == 'true'
-    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
     runs-on: linux-amd64-gpu-l4-latest-1
     timeout-minutes: 120
     container: &gpu_container
@@ -78,8 +77,7 @@ jobs:
       run: pip install tox-current-env && tox -e py312-cuda12-gpu --current-env
   gpu-tests-non-pr:
     if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
-    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
-    runs-on: linux-amd64-gpu-h100-latest-1
+    runs-on: linux-amd64-gpu-h100-latest-2
     timeout-minutes: 120
     container: *gpu_container
     steps: *gpu_steps

.github/workflows/unit_tests.yml

Lines changed: 17 additions & 8 deletions

@@ -28,7 +28,7 @@ jobs:
     checks: read
     secrets: inherit
     with:
-      match_pattern: '^DCO$'
+      match_pattern: "^DCO$"
  linux:
    needs: [check-dco]
    runs-on: ubuntu-latest
@@ -39,7 +39,7 @@ jobs:
        with:
          python-version: "3.12"
      - name: Run unit tests
-        run: pip install tox && COV_ARGS="--cov" tox -e py312-torch28-tf_latest-unit
+        run: pip install tox && COV_ARGS="--cov" tox -e py312-torch29-tf_latest-unit
      - name: Upload coverage reports to Codecov
        uses: codecov/codecov-action@v5
        with:
@@ -57,7 +57,7 @@ jobs:
        with:
          python-version: "3.12"
      - name: Run unit tests (without coverage)
-        run: pip install tox && tox -e py312-torch28-tf_latest-unit
+        run: pip install tox && tox -e py312-torch29-tf_latest-unit
  multi-py:
    if: github.event_name == 'pull_request'
    needs: [linux]
@@ -72,15 +72,15 @@ jobs:
        with:
          python-version: "3.${{ matrix.py }}"
      - name: Run unit tests
-        run: pip install tox && tox -e py3${{ matrix.py }}-torch28-tf_latest-unit
+        run: pip install tox && tox -e py3${{ matrix.py }}-torch29-tf_latest-unit
  multi-torch:
    if: github.event_name == 'pull_request'
    needs: [linux]
    runs-on: ubuntu-latest
    timeout-minutes: 30
    strategy:
      matrix:
-        torch: [26, 27]
+        torch: [26, 27, 28]
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
@@ -102,7 +102,7 @@ jobs:
        with:
          python-version: "3.12"
      - name: Run unit tests
-        run: pip install tox && tox -e py312-torch28-tf_${{ matrix.tf }}-unit
+        run: pip install tox && tox -e py312-torch29-tf_${{ matrix.tf }}-unit
  partial-install:
    if: github.event_name == 'pull_request'
    needs: [linux]
@@ -119,8 +119,17 @@ jobs:
      - name: Run unit tests
        run: pip install tox && tox -e py312-partial-unit-${{ matrix.test-env }}
  unit-pr-required-check:
-    if: github.event_name == 'pull_request'
+    # Run even if some jobs are skipped
+    if: ${{ github.event_name == 'pull_request' && always() }}
    needs: [linux, windows, multi-py, multi-torch, multi-transformers, partial-install]
    runs-on: ubuntu-latest
    steps:
-      - run: echo "All PR unit test jobs completed"
+      - name: Required unit tests did not succeed
+        if: >-
+          ${{ needs.linux.result != 'success' ||
+              needs.windows.result != 'success' ||
+              needs.multi-py.result != 'success' ||
+              needs.multi-torch.result != 'success' ||
+              needs.multi-transformers.result != 'success' ||
+              needs.partial-install.result != 'success' }}
+        run: exit 1
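The bump from `torch28` to `torch29` changes the default tox environment everywhere the workflow invokes it, and the same targets can be reproduced locally. A sketch, assuming a checkout of the repository with Python 3.12 on the PATH:

```bash
# Default unit-test environment the CI now runs on every push
pip install tox
tox -e py312-torch29-tf_latest-unit

# The PR-only multi-torch matrix now also exercises torch 2.8 as a back-version
tox -e py312-torch28-tf_latest-unit
```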

.gitlab/release.yml

Lines changed: 1 addition & 1 deletion

@@ -17,8 +17,8 @@ build-and-upload-wheels:
      TWINE_PASSWORD: $ARTIFACTORY_TOKEN # Configured in GitLab > Settings > CI/CD
      REPO_URL: https://urm.nvidia.com/artifactory/api/pypi/sw-dl-algo-ammo-pypi-local
  - if: $CI_PIPELINE_SOURCE == "schedule"
+    when: manual
    variables:
-      when: manual
      RELEASE: "false"
      TWINE_USERNAME: gitlab-ci-token
      TWINE_PASSWORD: $CI_JOB_TOKEN

.gitlab/tests.yml

Lines changed: 1 addition & 3 deletions

@@ -15,12 +15,10 @@ unit:
  timeout: 30m
  variables:
    PYTHON: 12
-    TORCH: 28
+    TORCH: 29
    TRANSFORMERS: latest
  image: python:3.$PYTHON
  before_script:
-    # Install cmake to build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
-    - if [ "$PYTHON" = "12" ]; then apt-get update && apt-get install -y cmake; fi
    - pip install tox
  script:
    - tox -e py3$PYTHON-torch$TORCH-tf_$TRANSFORMERS-unit
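The GitLab job composes its tox environment name from the `PYTHON`, `TORCH`, and `TRANSFORMERS` variables, so the `TORCH: 29` bump selects the same environment as the GitHub workflows above. Expanding the script line by hand (illustrative only):

```bash
# Mirror the job's variables after this change
PYTHON=12
TORCH=29
TRANSFORMERS=latest
echo "py3$PYTHON-torch$TORCH-tf_$TRANSFORMERS-unit"   # -> py312-torch29-tf_latest-unit
```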

CHANGELOG.rst

Lines changed: 19 additions & 15 deletions

@@ -1,31 +1,34 @@
 Model Optimizer Changelog (Linux)
 =================================
-0.41 (2025-12-xx)
-^^^^^^^^^^^^^^^^^
-
-**Deprecations**
-
-**New Features**
-
-- Add support for PyTorch Geometric quantization.
-
-**Misc**
-
-- Bump minimum recommended transformers version to 4.53.
-
 
-0.40 (2025-12-xx)
+0.40 (2025-12-11)
 ^^^^^^^^^^^^^^^^^
 
 **Bug Fixes**
 
 - Fix a bug in FastNAS pruning (computer vision models) where the model parameters were sorted twice messing up the ordering.
+- Fix Q/DQ/Cast node placements in 'FP32 required' tensors in custom ops in the ONNX quantization workflow.
 
 **New Features**
 
 - Add MoE (e.g. Qwen3-30B-A3B, gpt-oss-20b) pruning support for ``num_moe_experts``, ``moe_ffn_hidden_size`` and ``moe_shared_expert_intermediate_size`` parameters in Minitron pruning (``mcore_minitron``).
 - Add ``specdec_bench`` example to benchmark speculative decoding performance. See `examples/specdec_bench/README.md <https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/specdec_bench#speculative-decoding-benchmark>`_ for more details.
 - Add FP8/NVFP4 KV cache quantization support for Megatron Core models.
+- Add KL Divergence loss based auto_quantize method. See `auto_quantize API docs <https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize>`_ for more details.
+- Add support for saving and resuming auto_quantize search state. This speeds up the auto_quantize process by skipping the score estimation step if the search state is provided.
+- Add flag ``trt_plugins_precision`` in ONNX autocast to indicate custom ops precision. This is similar to the flag already existing in the quantization workflow.
+- Add support for PyTorch Geometric quantization.
+- Add per tensor and per channel MSE calibrator support.
+- Added support for PTQ/QAT checkpoint export and loading for running fakequant evaluation in vLLM. See ``examples/vllm_serve/README.md#load-qatptq-model-and-serve-in-vllm-wip`` for more details.
+
+**Documentation**
+
+- Deprecate ``examples/megatron-lm`` in favor of more detailed documentation in `Megatron-LM/examples/post_training/modelopt <https://github.com/NVIDIA/Megatron-LM/tree/main/examples/post_training/modelopt>`_.
+
+**Misc**
+
+- Bump minimum recommended transformers version to 4.53.
+- Replace ONNX simplification package from ``onnxsim`` to ``onnxslim``.
 
 0.39 (2025-11-11)
 ^^^^^^^^^^^^^^^^^
@@ -49,6 +52,7 @@ Model Optimizer Changelog (Linux)
 - Enabled native Modelopt quantization support for FP8 and NVFP4 formats in SGLang. See `SGLang quantization documentation <https://github.com/sgl-project/sglang/blob/main/docs/advanced_features/quantization.md#using-nvidia-modelopt>`_ for more details.
 - Added modelopt quantized checkpoints in vLLM/SGLang CI/CD pipelines (PRs are under review).
 - Add support for exporting QLoRA checkpoint fintuned using ModelOpt.
+- Update NVFP4 AWQ checkpoint export. It now fuses scaling factors of o_proj and down_proj layers into the model when possible to facilitate deployment.
 
 **Documentation**
 
@@ -72,7 +76,7 @@ Model Optimizer Changelog (Linux)
 - Upgrade TensorRT-LLM dependency to 1.1.0rc2.
 - Support Phi-4-multimodal and Qwen2.5-VL quantized HF checkpoint export in ``examples/vlm_ptq``.
 - Support storing and restoring Minitron pruning activations and scores for re-pruning without running the forward loop again.
-- Add Minitron pruning example for Megatron-LM framework. See ``examples/megatron-lm`` for more details.
+- Add Minitron pruning example for Megatron-LM framework. See `Megatron-LM/examples/post_training/modelopt <https://github.com/NVIDIA/Megatron-LM/tree/main/examples/post_training/modelopt>`_ for more details.
 
 0.35 (2025-09-04)
 ^^^^^^^^^^^^^^^^^
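One Misc entry swaps the ONNX simplification dependency from ``onnxsim`` to ``onnxslim``, consistent with the cmake install step removed from `.gitlab/tests.yml` above (cmake was only needed to build ``onnxsim`` from sdists). A minimal usage sketch; the CLI form and the `model.onnx` path are assumptions based on onnxslim's documented interface, not something this commit shows:

```bash
pip install onnxslim
# Simplify a model, analogous to the old `onnxsim model.onnx model_sim.onnx` invocation
onnxslim model.onnx model_slim.onnx
```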

examples/deepseek/.gitignore

Lines changed: 1 addition & 0 deletions

@@ -1 +1,2 @@
 DeepSeek-V3/
+DeepSeek-V3.2-Exp/

examples/deepseek/README.md

Lines changed: 37 additions & 7 deletions

@@ -1,39 +1,69 @@
-# Quantize Deepseek R1 to FP4
+# Quantize Deepseek models to FP4
 
-This example will demonstrate the steps to quantize DeepSeek R1 model to FP4 and export a unified checkpoint that can be deployed with TRT-LLM.
+This example will demonstrate the steps to quantize DeepSeek models to FP4 and export a unified checkpoint that can be deployed with TRT-LLM.
 
 ## Setup
 
 Due to the model size, currently it requires 8xH200 or 16xH100 to quantize the FP8 model, we will use 8xH200 as example.
 
-### Convert the HF checkpoint for deepseek FP8 inference
+## Convert the HF checkpoint for deepseek FP8 inference
 
 ```bash
 # set up variables to run the example
 export HF_FP8_CKPT={path_to_downloaded_hf_checkpoint}
 export DS_CKPT={path_to_save_converted_checkpoint}
 export FP4_QUANT_PATH={path_to_save_quantization_results}
 export HF_FP4_PATH={path_to_save_the_final_FP4_checkpoint}
+```
+
+### DeepSeek V3 R1 V3.1
 
-# download the FP8 checkpoint from Hugginface
+```bash
+# download the FP8 checkpoint from Hugginface. This is an example of DeepSeek-R1
 huggingface-cli download deepseek-ai/DeepSeek-R1 --local-dir $HF_FP8_CKPT
 
 # clone DeepSeek-V3 (base model of R1) Github repository for FP8 inference,
 git clone https://github.com/deepseek-ai/DeepSeek-V3.git && cd DeepSeek-V3 && git checkout 1398800
+```
+
+### [Experimental] DeepSeek V3.2
 
+```bash
+# download the FP8 checkpoint from Hugginface.
+huggingface-cli download deepseek-ai/DeepSeek-V3.2-Exp --local-dir $HF_FP8_CKPT
+
+# clone DeepSeek-V3.2 Github repository for FP8 inference,
+git clone https://github.com/deepseek-ai/DeepSeek-V3.2-Exp.git && cd DeepSeek-V3.2-Exp && git checkout 3b99a53
+
+# Install requirements
+pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git
+pip install -r inference/requirements.txt
+```
+
+### Convert the Checkpoint
+
+```bash
 # convert the HF checkpoint to a specific format for Deepseek
 python inference/convert.py --hf-ckpt-path $HF_FP8_CKPT --save-path $DS_CKPT --n-experts 256 --model-parallel 8
 ```
 
-### Post-training quantization
+## Post-training quantization
+
+### Run the calibration scripts
 
-#### Run the calibration scripts
+DeepSeek V3, R1, V3.1
 
 ```bash
 torchrun --nproc-per-node 8 --master_port=12346 ptq.py --model_path $DS_CKPT --config DeepSeek-V3/inference/configs/config_671B.json --quant_cfg NVFP4_DEFAULT_CFG --output_path $FP4_QUANT_PATH
 ```
 
-#### Quantize the FP8 hf checkpoint to FP4
+DeepSeek V3.2
+
+```bash
+torchrun --nproc-per-node 8 --master_port=12346 ptq.py --model_path $DS_CKPT --config DeepSeek-V3.2-Exp/inference/config_671B_v3.2.json --quant_cfg NVFP4_DEFAULT_CFG --output_path $FP4_QUANT_PATH
+```
+
+### Quantize the FP8 hf checkpoint to FP4
 
 We provide a one-step-script which will:
