
Commit 1f9996a

committed: fix example
Signed-off-by: Jennifer Chen <[email protected]>

1 parent d46399c · commit 1f9996a


54 files changed (+1108 / -374 lines)

.github/CODEOWNERS

Lines changed: 21 additions & 21 deletions
```diff
@@ -32,24 +32,24 @@ modelopt/torch/utils @NVIDIA/modelopt-torch-utils-codeowners
 # Examples
 /docker @NVIDIA/modelopt-docker-codeowners
 /README.md @NVIDIA/modelopt-examples-codeowners
-examples @NVIDIA/modelopt-examples-codeowners
-examples/chained_optimizations @NVIDIA/modelopt-torch-nas-prune-codeowners
-examples/cnn_qat @NVIDIA/modelopt-examples-cnn_qat-codeowners
-examples/deepseek @NVIDIA/modelopt-deploy-codeowners
-examples/diffusers @NVIDIA/modelopt-examples-diffusers-codeowners
-examples/gpt-oss @NVIDIA/modelopt-examples-gpt-oss-codeowners
-examples/llm_autodeploy @NVIDIA/modelopt-deploy-codeowners
-examples/llm_distill @NVIDIA/modelopt-torch-distill-codeowners
-examples/llm_eval @NVIDIA/modelopt-examples-llm_ptq-codeowners
-examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
-examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
-examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
-examples/megatron-lm @NVIDIA/modelopt-examples-megatron-codeowners
-examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
-examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
-examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners
-examples/pruning @NVIDIA/modelopt-torch-nas-prune-codeowners
-examples/speculative_decoding @NVIDIA/modelopt-torch-speculative-codeowners
-examples/vlm_eval @NVIDIA/modelopt-examples-vlm-codeowners
-examples/vlm_ptq @NVIDIA/modelopt-examples-vlm-codeowners
-examples/windows @NVIDIA/modelopt-windows-codeowners
+/examples @NVIDIA/modelopt-examples-codeowners
+/examples/chained_optimizations @NVIDIA/modelopt-torch-nas-prune-codeowners
+/examples/cnn_qat @NVIDIA/modelopt-examples-cnn_qat-codeowners
+/examples/deepseek @NVIDIA/modelopt-deploy-codeowners
+/examples/diffusers @NVIDIA/modelopt-examples-diffusers-codeowners
+/examples/gpt-oss @NVIDIA/modelopt-examples-gpt-oss-codeowners
+/examples/llm_autodeploy @NVIDIA/modelopt-deploy-codeowners
+/examples/llm_distill @NVIDIA/modelopt-torch-distill-codeowners
+/examples/llm_eval @NVIDIA/modelopt-examples-llm_ptq-codeowners
+/examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
+/examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
+/examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
+/examples/megatron-lm @NVIDIA/modelopt-examples-megatron-codeowners
+/examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
+/examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
+/examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners
+/examples/pruning @NVIDIA/modelopt-torch-nas-prune-codeowners
+/examples/speculative_decoding @NVIDIA/modelopt-torch-speculative-codeowners
+/examples/vlm_eval @NVIDIA/modelopt-examples-vlm-codeowners
+/examples/vlm_ptq @NVIDIA/modelopt-examples-vlm-codeowners
+/examples/windows @NVIDIA/modelopt-windows-codeowners
```

.github/workflows/gpu_tests.yml

Lines changed: 21 additions & 1 deletion
```diff
@@ -22,20 +22,31 @@ jobs:
       any_changed: ${{ steps.changed-tests.outputs.any_changed }}
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
       - id: get-pr-info
         uses: nv-gha-runners/get-pr-info@main
+      # Get commit from main branch that is present in the PR to use as base for changed files
+      - id: calculate-merge-base
+        env:
+          PR_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
+          BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
+        run: |
+          (echo -n "merge-base="; git merge-base "$BASE_SHA" "$PR_SHA") | tee --append "${GITHUB_OUTPUT}"
       - name: Check for changes in test-relevant directories
         id: changed-tests
         uses: step-security/[email protected]
         with:
+          base_sha: ${{ steps.calculate-merge-base.outputs.merge-base }}
+          sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
           files: |
             .github/workflows/gpu_tests.yml
             modelopt/**
             tests/gpu/**
             tox.ini
             pyproject.toml
             setup.py
-          base_sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}
+          fail_on_initial_diff_error: true
   wait-checks:
     needs: [check-file-changes]
     if: needs.check-file-changes.outputs.any_changed == 'true'
@@ -70,3 +81,12 @@ jobs:
     timeout-minutes: 90
     container: *gpu_container
     steps: *gpu_steps
+  gpu-pr-required-check:
+    # Run even if gpu-tests-pr is skipped
+    if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
+    needs: [check-file-changes, gpu-tests-pr]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Required GPU tests did not succeed
+        if: ${{ needs.check-file-changes.result != 'success' || (needs.check-file-changes.outputs.any_changed == 'true' && needs.gpu-tests-pr.result != 'success') }}
+        run: exit 1
```
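For context, the new `calculate-merge-base` step diffs the PR head against its merge base with the base branch instead of against `base.ref`, so files that changed on `main` after the PR branched are not counted. Below is a minimal local sketch of the same computation, assuming a full clone (which is why `fetch-depth: 0` is added); the refs are placeholders, not values from this workflow run.

```python
# Minimal sketch of what the calculate-merge-base step computes, run locally.
# Assumes a full clone so both commits are reachable; the refs below are
# placeholders, not values taken from the workflow.
import subprocess


def merge_base(base_ref: str, head_ref: str, repo_dir: str = ".") -> str:
    """Return the common ancestor commit used as the changed-files diff base."""
    result = subprocess.run(
        ["git", "merge-base", base_ref, head_ref],
        cwd=repo_dir,
        capture_output=True,
        text=True,
        check=True,
    )
    return result.stdout.strip()


if __name__ == "__main__":
    print(merge_base("origin/main", "HEAD"))
```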

.github/workflows/unit_tests.yml

Lines changed: 6 additions & 7 deletions
```diff
@@ -4,13 +4,6 @@ name: Unit tests
 on:
   pull_request:
     branches: [main, release/*]
-    paths:
-      - ".github/workflows/unit_tests.yml"
-      - "modelopt/**"
-      - "tests/unit/**"
-      - "pyproject.toml"
-      - "setup.py"
-      - "tox.ini"
   push:
     branches: [main, release/*]
     paths:
@@ -126,3 +119,9 @@
           python-version: "3.12"
       - name: Run unit tests
         run: pip install tox && tox -e py312-partial-unit-${{ matrix.test-env }}
+  unit-pr-required-check:
+    if: github.event_name == 'pull_request'
+    needs: [linux, windows, multi-py, multi-torch, multi-transformers, partial-install]
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "All PR unit test jobs completed"
```

CHANGELOG.rst

Lines changed: 2 additions & 0 deletions
```diff
@@ -5,10 +5,12 @@ Model Optimizer Changelog (Linux)
 ^^^^^^^^^^^^^^^^^
 
 **Deprecations**
+- Deprecated ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` to support strongly typing. Use ``engine_precision`` instead.
 
 **Bug Fixes**
 
 **New Features**
+- ``high_precision_dtype`` default to fp16 in ONNX quantization, i.e. quantized output model weights are now FP16 by default.
 
 0.35 (2025-09-04)
 ^^^^^^^^^^^^^^^^^
```
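To illustrate the two changelog entries above, here is a hypothetical sketch of an ONNX PTQ call that pins the new default explicitly. Only the `high_precision_dtype` keyword comes from the entry above; the `quantize()` entry point and the other argument names are assumptions about the ModelOpt ONNX quantization API and may differ in your installed version.

```python
# Hypothetical sketch only: high_precision_dtype mirrors the new default described
# in the changelog; the quantize() entry point and other keyword names are
# assumptions about the ModelOpt ONNX API, not taken from this commit.
from modelopt.onnx.quantization import quantize

quantize(
    onnx_path="vit_base_patch16_224.onnx",  # placeholder input model
    quantize_mode="int8",
    high_precision_dtype="fp16",  # new default; pass "fp32" to keep FP32 weights
    output_path="vit_base_patch16_224.quant.onnx",  # placeholder output path
)
```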

examples/llm_ptq/hf_ptq.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -742,7 +742,7 @@ def output_decode(generated_ids, input_shape):
     )
     parser.add_argument(
         "--verbose",
-        help="Print verbose output (e.g. quantization summary). Disable by --no_verbose.",
+        help="Print verbose output (e.g. quantization summary). Disable by --no-verbose.",
         default=True,
         action=argparse.BooleanOptionalAction,
     )
```
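The help-text fix matters because `argparse.BooleanOptionalAction` derives the negative flag from the option name, so the off switch is spelled `--no-verbose`, not `--no_verbose`. A standalone sketch of that behavior (not part of `hf_ptq.py`):

```python
# Standalone sketch of the argparse behavior behind the corrected help text:
# BooleanOptionalAction auto-generates the negative form of the flag, so the
# switch that disables "--verbose" is "--no-verbose" (hyphen, not underscore).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--verbose",
    help="Print verbose output (e.g. quantization summary). Disable by --no-verbose.",
    default=True,
    action=argparse.BooleanOptionalAction,
)

print(parser.parse_args([]).verbose)                # True (default)
print(parser.parse_args(["--no-verbose"]).verbose)  # False
```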

examples/nemo_run/qat/ADVANCED.md

Lines changed: 4 additions & 4 deletions
````diff
@@ -8,15 +8,13 @@ To run the example on slurm, edit the `SLURM_CONFIG` at the bottom of `nemo_qat_
 
 To launch the Flow on a Slurm cluster, modify your Slurm credentials at the bottom of `nemo_qat_flow.py` and add the `--use-slurm` flag to the command. On a different server (e.g. your local server), launch the NeMo container as described in the [README](README.md) then run `python qat/nemo_qat_flow.py --use-slurm --log-dir /slurm/log/dir`, which will `ssh` into the Slurm cluster, `rsync` your files over, and launch the tasks. The log directory on the Slurm cluster should look like this after an experiment is run (assuming your experiment name is `qat_flow_ckpts`)
 
-**NOTE:** `rsync` may not currently be available in the NeMo container and will be added as a dependency.
-
-```
+```bash
 qat_flow_ckpts qat_flow_ckpts_1755708286
 ```
 
 If you `cd` into the experiment itself, e.g. `cd qat_flow_ckpts_1755708286`, you'll find a directory structure like the following. Each folder is for a stage of the Simplified Flow, and in each stage you can see the logs for that stage as well as the sbatch command that was run. You can `cd` into each stage and `tail -f` the log file to see the logs while the stage is running.
 
-```
+```bash
 ├── 00_openscience_data
 │   ├── code
 │   ├── configs
@@ -54,3 +52,5 @@ If you `cd` into the experiment itself, e.g. `cd qat_flow_ckpts_1755708286`, you
 │   ├── code
 │   └── configs
 ```
+
+**NOTE:** `rsync` may not currently be available in the NeMo container and will be added as a dependency.
````

examples/nemo_run/qat/nemo_qat_flow.py

Lines changed: 29 additions & 23 deletions
```diff
@@ -140,6 +140,8 @@ def get_args():
         action="store_true",
         default=False,
     )
+    parser.add_argument("--tensor_parallelism", type=int, default=1)
+    parser.add_argument("--pipeline_parallelism", type=int, default=1)
     return parser.parse_args()
 
 
@@ -243,6 +245,8 @@ def main(args):
     train.trainer.devices = args.train_gpus
     train.trainer.num_nodes = args.train_nodes
     train.trainer.limit_val_batches = 32
+    train.trainer.strategy.tensor_model_parallel_size = args.tensor_parallelism
+    train.trainer.strategy.pipeline_model_parallel_size = args.pipeline_parallelism
 
     # 5. Export
     export = run.Partial(
@@ -257,29 +261,33 @@ def main(args):
     mmlu_script_path = "examples/nemo_run/common/in_memory_mmlu.py"
     eval_ptq = run.Script(
         mmlu_script_path,
-        args=["--nemo_ckpt", ptq_model_out],
+        args=["--nemo_ckpt", ptq_model_out, "--tensor_parallelism", f"{args.ptq_gpus}"],
         entrypoint="python",
     )
     eval_bf16 = run.Script(
         mmlu_script_path,
-        args=["--nemo_ckpt", bf16_ckpt_path],
+        args=["--nemo_ckpt", bf16_ckpt_path, "--tensor_parallelism", f"{args.ptq_gpus}"],
         entrypoint="python",
     )
     eval_sft = run.Script(
         mmlu_script_path,
-        args=["--finetuned_ckpt_dir", exp_dir],
+        args=["--finetuned_ckpt_dir", exp_dir, "--tensor_parallelism", f"{args.ptq_gpus}"],
         entrypoint="python",
     )
 
     if args.use_slurm:
         cpu_executor = create_slurm_executor(SLURM_CONFIG)
-        gpu_executor = create_slurm_executor(
+        ptq_gpu_executor = create_slurm_executor(
             SLURM_CONFIG, num_gpus=args.ptq_gpus, ntasks_per_node=args.ptq_gpus
         )
+        train_gpu_executor = create_slurm_executor(
+            SLURM_CONFIG, num_gpus=args.train_gpus, ntasks_per_node=args.train_gpus
+        )
         single_gpu_executor = create_slurm_executor(SLURM_CONFIG, num_gpus=1, ntasks_per_node=1)
     else:
         cpu_executor = single_gpu_executor = run.LocalExecutor()
-        gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=args.ptq_gpus)
+        ptq_gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=args.ptq_gpus)
+        train_gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=args.train_gpus)
 
     with run.Experiment(exp_dir, log_level="INFO") as exp:
         if not args.data_path:
@@ -294,45 +302,46 @@ def main(args):
             eval_bf16,
             tail_logs=True,
             name="02_mmlu_bf16",
-            executor=single_gpu_executor,
+            executor=ptq_gpu_executor,
             dependencies=[s1],
         )
 
         # 2. PTQ model and evaluate PTQ model
-        s2 = exp.add(ptq, tail_logs=True, name="03_ptq", executor=gpu_executor, dependencies=[s1])
+        s2 = exp.add(
+            ptq, tail_logs=True, name="03_ptq", executor=ptq_gpu_executor, dependencies=[s1]
+        )
         s3 = exp.add(
             eval_ptq,
             tail_logs=True,
             name="04_mmlu_ptq",
-            executor=single_gpu_executor,
+            executor=ptq_gpu_executor,
             dependencies=[s2],
         )
         # 3. Train PTQ model (QAT or QAD)
-        if args.use_slurm:  # Set training arguments
-            gpu_executor.nodes = args.train_nodes
-            gpu_executor.gpus_per_node = gpu_executor.ntasks_per_node = args.train_gpus
-        else:
-            gpu_executor.ntasks_per_node = args.train_gpus
         train_dep = [s3]
         if not args.data_path:
             train_dep.append(s0)
         s4 = exp.add(
-            train, tail_logs=True, name="05_train", executor=gpu_executor, dependencies=train_dep
+            train,
+            tail_logs=True,
+            name="05_train",
+            executor=train_gpu_executor,
+            dependencies=train_dep,
         )
-
         s5 = exp.add(
             eval_sft,
             tail_logs=True,
             name="06_mmlu_sft",
-            executor=single_gpu_executor,
+            executor=ptq_gpu_executor,
             dependencies=[s4],
         )
-        gpu_executor.ntasks_per_node = 1  # will throw error if more than 1 task during export
+        # WAR: Export needs access to all GPUs but only 1 task due to bug in NeMo
+        train_gpu_executor.ntasks_per_node = 1  # will throw error if more than 1 task during export
        exp.add(
             export,
             tail_logs=True,
             name="07_export_hf",
-            executor=gpu_executor,
+            executor=train_gpu_executor,
             dependencies=[s5],
         )
         exp.run(detach=True)
@@ -356,10 +365,7 @@ def main(args):
         use_local_tunnel=False,
         host="",
         user="",
-        container_mounts=[
-            "/path/to/logs:/path/to/logs",
-            "/path/to/NeMo:/opt/NeMo",
-        ],
+        container_mounts=[],
         job_dir="/path/to/logs",
         identity=None,
     )
@@ -369,7 +375,7 @@ def main(args):
     SEQUENCE_LENGTH = 4096
     MBS = 1
     GBS = 512
-    TRAIN_STEPS = 200
+    TRAIN_STEPS = 400
     VAL_INTERVAL = 50
     # # # # # # # # # # # # # # # # # # # # # #
 
```
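A minimal stand-in sketch (plain Python, no NeMo or nemo_run imports) of how the two new CLI flags flow into the trainer strategy fields set in the diff; the `SimpleNamespace` objects are placeholders for the real `train.trainer` recipe configured in `nemo_qat_flow.py`.

```python
# Stand-in sketch of the new parallelism wiring; SimpleNamespace objects are
# placeholders for the real train.trainer recipe, and the attribute names mirror
# the ones set in the diff above.
import argparse
from types import SimpleNamespace

parser = argparse.ArgumentParser()
parser.add_argument("--tensor_parallelism", type=int, default=1)
parser.add_argument("--pipeline_parallelism", type=int, default=1)
args = parser.parse_args(["--tensor_parallelism", "4", "--pipeline_parallelism", "2"])

trainer = SimpleNamespace(strategy=SimpleNamespace())
trainer.strategy.tensor_model_parallel_size = args.tensor_parallelism
trainer.strategy.pipeline_model_parallel_size = args.pipeline_parallelism
print(trainer.strategy.tensor_model_parallel_size,
      trainer.strategy.pipeline_model_parallel_size)  # 4 2
```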
examples/onnx_ptq/README.md

Lines changed: 2 additions & 2 deletions
````diff
@@ -120,7 +120,7 @@ The following evaluation requires the `val` directory of the [ImageNet dataset](
 python evaluate.py \
     --onnx_path=<path to classification model> \
     --imagenet_path=<path to the ImageNet dataset> \
-    --quantize_mode=<fp8|int8|int4> \
+    --engine_precision=stronglyTyped \
     --model_name=vit_base_patch16_224
 ```
 
@@ -165,7 +165,7 @@ If the input model is of type image classification, use the following script to
 python evaluate.py \
     --onnx_path=<path to the exported ONNX model> \
     --imagenet_path=<path to the ImageNet dataset> \
-    --quantize_mode=stronglyTyped \
+    --engine_precision=stronglyTyped \
     --model_name=vit_base_patch16_224
 ```
 
````

examples/onnx_ptq/docker/Dockerfile

Lines changed: 5 additions & 4 deletions
```diff
@@ -12,10 +12,11 @@ RUN python -m pip install --upgrade pip \
 
 WORKDIR /workspace
 
-RUN pip install tensorrt==10.13.2.6 && \
-    export TRT_PATH=$(python -c "import tensorrt; import os; print(os.path.dirname(tensorrt.__file__))") && \
-    export LD_LIBRARY_PATH="$TRT_PATH/lib:/usr/include:${LD_LIBRARY_PATH}" && \
-    export PATH="$TRT_PATH/bin:${PATH}"
+RUN pip install tensorrt==10.13.2.6
+ENV TRT_PATH=/usr/local/lib/python3.12/dist-packages/tensorrt
+ENV CUDNN_LIB_DIR=/usr/local/lib/python3.12/dist-packages/nvidia/cudnn/lib
+ENV LD_LIBRARY_PATH="${CUDNN_LIB_DIR}:${TRT_PATH}/lib:/usr/include:${LD_LIBRARY_PATH}"
+ENV PATH="${TRT_PATH}/bin:${PATH}"
 
 # Copy application code and install requirements
 COPY modelopt modelopt/modelopt
```

examples/onnx_ptq/evaluate.py

Lines changed: 5 additions & 12 deletions
```diff
@@ -48,29 +48,22 @@ def main():
     parser.add_argument(
         "--eval_data_size", type=int, default=None, help="Number of examples to evaluate"
     )
-    # By default, TensorRT autotunes tensor types to generate the fastest engine. When you specify
-    # to TensorRT that a network is strongly typed, it infers a type for each intermediate and
-    # output tensor using the rules in the operator type specification. For networks quantized in
-    # INT4 or FP8 mode, stronglyTyped as the mode is recommended for TensorRT deployment. Though
-    # INT8 networks are generally compiled with int8 mode, certain INT8 ViT networks compiled with
-    # stronglyTyped precision have shown better performance.
     parser.add_argument(
-        "--quantize_mode",
+        "--engine_precision",
         type=str,
         default="stronglyTyped",
-        choices=["fp8", "fp16", "fp32", "int4", "int8", "int8_iq", "bf16", "best", "stronglyTyped"],
-        help="Quantization mode for the TensorRT engine. \
-            Supported options: fp8, fp16, fp32, int8, int8_iq(implicit quantization), bf16, best, stronglyTyped",
+        choices=["best", "fp16", "stronglyTyped"],
+        help="Precision mode for the TensorRT engine. \
+            stronglyTyped is recommended, all other modes have been deprecated in TensorRT",
     )
     parser.add_argument(
         "--results_path", type=str, default=None, help="Save the results to the specified path"
     )
 
     args = parser.parse_args()
-
     deployment = {
         "runtime": "TRT",
-        "precision": args.quantize_mode,
+        "precision": args.engine_precision,
     }
 
     # Create an ONNX bytes object with the specified path
```
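The net effect of the rename, sketched in isolation below: only the argument definition and the `deployment` dictionary shape mirror the diff above, while the rest of the evaluation script is omitted.

```python
# Isolated sketch of the renamed flag feeding the TensorRT deployment config;
# only the argument definition and the deployment dict mirror the diff above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--engine_precision",
    type=str,
    default="stronglyTyped",
    choices=["best", "fp16", "stronglyTyped"],
    help="Precision mode for the TensorRT engine; stronglyTyped is recommended.",
)
args = parser.parse_args([])

deployment = {
    "runtime": "TRT",
    "precision": args.engine_precision,  # previously args.quantize_mode
}
print(deployment)  # {'runtime': 'TRT', 'precision': 'stronglyTyped'}
```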
