
Commit 1f9996a

committed: fix example
Signed-off-by: Jennifer Chen <[email protected]>

1 parent d46399c · commit 1f9996a


54 files changed (+1108 / -374 lines)

.github/CODEOWNERS

Lines changed: 21 additions & 21 deletions
```diff
@@ -32,24 +32,24 @@ modelopt/torch/utils @NVIDIA/modelopt-torch-utils-codeowners
 # Examples
 /docker @NVIDIA/modelopt-docker-codeowners
 /README.md @NVIDIA/modelopt-examples-codeowners
-examples @NVIDIA/modelopt-examples-codeowners
-examples/chained_optimizations @NVIDIA/modelopt-torch-nas-prune-codeowners
-examples/cnn_qat @NVIDIA/modelopt-examples-cnn_qat-codeowners
-examples/deepseek @NVIDIA/modelopt-deploy-codeowners
-examples/diffusers @NVIDIA/modelopt-examples-diffusers-codeowners
-examples/gpt-oss @NVIDIA/modelopt-examples-gpt-oss-codeowners
-examples/llm_autodeploy @NVIDIA/modelopt-deploy-codeowners
-examples/llm_distill @NVIDIA/modelopt-torch-distill-codeowners
-examples/llm_eval @NVIDIA/modelopt-examples-llm_ptq-codeowners
-examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
-examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
-examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
-examples/megatron-lm @NVIDIA/modelopt-examples-megatron-codeowners
-examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
-examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
-examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners
-examples/pruning @NVIDIA/modelopt-torch-nas-prune-codeowners
-examples/speculative_decoding @NVIDIA/modelopt-torch-speculative-codeowners
-examples/vlm_eval @NVIDIA/modelopt-examples-vlm-codeowners
-examples/vlm_ptq @NVIDIA/modelopt-examples-vlm-codeowners
-examples/windows @NVIDIA/modelopt-windows-codeowners
+/examples @NVIDIA/modelopt-examples-codeowners
+/examples/chained_optimizations @NVIDIA/modelopt-torch-nas-prune-codeowners
+/examples/cnn_qat @NVIDIA/modelopt-examples-cnn_qat-codeowners
+/examples/deepseek @NVIDIA/modelopt-deploy-codeowners
+/examples/diffusers @NVIDIA/modelopt-examples-diffusers-codeowners
+/examples/gpt-oss @NVIDIA/modelopt-examples-gpt-oss-codeowners
+/examples/llm_autodeploy @NVIDIA/modelopt-deploy-codeowners
+/examples/llm_distill @NVIDIA/modelopt-torch-distill-codeowners
+/examples/llm_eval @NVIDIA/modelopt-examples-llm_ptq-codeowners
+/examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
+/examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
+/examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
+/examples/megatron-lm @NVIDIA/modelopt-examples-megatron-codeowners
+/examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
+/examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
+/examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners
+/examples/pruning @NVIDIA/modelopt-torch-nas-prune-codeowners
+/examples/speculative_decoding @NVIDIA/modelopt-torch-speculative-codeowners
+/examples/vlm_eval @NVIDIA/modelopt-examples-vlm-codeowners
+/examples/vlm_ptq @NVIDIA/modelopt-examples-vlm-codeowners
+/examples/windows @NVIDIA/modelopt-windows-codeowners
```

.github/workflows/gpu_tests.yml

Lines changed: 21 additions & 1 deletion
```diff
@@ -22,20 +22,31 @@ jobs:
       any_changed: ${{ steps.changed-tests.outputs.any_changed }}
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
       - id: get-pr-info
         uses: nv-gha-runners/get-pr-info@main
+      # Get commit from main branch that is present in the PR to use as base for changed files
+      - id: calculate-merge-base
+        env:
+          PR_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
+          BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
+        run: |
+          (echo -n "merge-base="; git merge-base "$BASE_SHA" "$PR_SHA") | tee --append "${GITHUB_OUTPUT}"
       - name: Check for changes in test-relevant directories
         id: changed-tests
         uses: step-security/[email protected]
         with:
+          base_sha: ${{ steps.calculate-merge-base.outputs.merge-base }}
+          sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
           files: |
             .github/workflows/gpu_tests.yml
             modelopt/**
             tests/gpu/**
             tox.ini
             pyproject.toml
             setup.py
-          base_sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}
+          fail_on_initial_diff_error: true
   wait-checks:
     needs: [check-file-changes]
     if: needs.check-file-changes.outputs.any_changed == 'true'
@@ -70,3 +81,12 @@ jobs:
     timeout-minutes: 90
     container: *gpu_container
     steps: *gpu_steps
+  gpu-pr-required-check:
+    # Run even if gpu-tests-pr is skipped
+    if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
+    needs: [check-file-changes, gpu-tests-pr]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Required GPU tests did not succeed
+        if: ${{ needs.check-file-changes.result != 'success' || (needs.check-file-changes.outputs.any_changed == 'true' && needs.gpu-tests-pr.result != 'success') }}
+        run: exit 1
```
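For context, the new `calculate-merge-base` step diffs the PR head against its merge base with the base branch instead of against `base.ref`, so files that changed on `main` after the PR branched are not counted. Below is a minimal local sketch of the same computation, assuming a full clone (which is why `fetch-depth: 0` is added); the refs are placeholders, not values from this workflow run.

```python
# Minimal sketch of what the calculate-merge-base step computes, run locally.
# Assumes a full clone so both commits are reachable; the refs below are
# placeholders, not values taken from the workflow.
import subprocess


def merge_base(base_ref: str, head_ref: str, repo_dir: str = ".") -> str:
    """Return the common ancestor commit used as the changed-files diff base."""
    result = subprocess.run(
        ["git", "merge-base", base_ref, head_ref],
        cwd=repo_dir,
        capture_output=True,
        text=True,
        check=True,
    )
    return result.stdout.strip()


if __name__ == "__main__":
    print(merge_base("origin/main", "HEAD"))
```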

.github/workflows/unit_tests.yml

Lines changed: 6 additions & 7 deletions
```diff
@@ -4,13 +4,6 @@ name: Unit tests
 on:
   pull_request:
     branches: [main, release/*]
-    paths:
-      - ".github/workflows/unit_tests.yml"
-      - "modelopt/**"
-      - "tests/unit/**"
-      - "pyproject.toml"
-      - "setup.py"
-      - "tox.ini"
   push:
     branches: [main, release/*]
     paths:
@@ -126,3 +119,9 @@
           python-version: "3.12"
       - name: Run unit tests
         run: pip install tox && tox -e py312-partial-unit-${{ matrix.test-env }}
+  unit-pr-required-check:
+    if: github.event_name == 'pull_request'
+    needs: [linux, windows, multi-py, multi-torch, multi-transformers, partial-install]
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "All PR unit test jobs completed"
```

CHANGELOG.rst

Lines changed: 2 additions & 0 deletions
```diff
@@ -5,10 +5,12 @@ Model Optimizer Changelog (Linux)
 ^^^^^^^^^^^^^^^^^
 
 **Deprecations**
+- Deprecated ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` to support strongly typing. Use ``engine_precision`` instead.
 
 **Bug Fixes**
 
 **New Features**
+- ``high_precision_dtype`` default to fp16 in ONNX quantization, i.e. quantized output model weights are now FP16 by default.
 
 0.35 (2025-09-04)
 ^^^^^^^^^^^^^^^^^
```
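To illustrate the two changelog entries above, here is a hypothetical sketch of an ONNX PTQ call that pins the new default explicitly. Only the `high_precision_dtype` keyword comes from the entry above; the `quantize()` entry point and the other argument names are assumptions about the ModelOpt ONNX quantization API and may differ in your installed version.

```python
# Hypothetical sketch only: high_precision_dtype mirrors the new default described
# in the changelog; the quantize() entry point and other keyword names are
# assumptions about the ModelOpt ONNX API, not taken from this commit.
from modelopt.onnx.quantization import quantize

quantize(
    onnx_path="vit_base_patch16_224.onnx",  # placeholder input model
    quantize_mode="int8",
    high_precision_dtype="fp16",  # new default; pass "fp32" to keep FP32 weights
    output_path="vit_base_patch16_224.quant.onnx",  # placeholder output path
)
```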

examples/llm_ptq/hf_ptq.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -742,7 +742,7 @@ def output_decode(generated_ids, input_shape):
     )
     parser.add_argument(
         "--verbose",
-        help="Print verbose output (e.g. quantization summary). Disable by --no_verbose.",
+        help="Print verbose output (e.g. quantization summary). Disable by --no-verbose.",
         default=True,
         action=argparse.BooleanOptionalAction,
     )
```
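The help-text fix matters because `argparse.BooleanOptionalAction` derives the negative flag from the option name, so the off switch is spelled `--no-verbose`, not `--no_verbose`. A standalone sketch of that behavior (not part of `hf_ptq.py`):

```python
# Standalone sketch of the argparse behavior behind the corrected help text:
# BooleanOptionalAction auto-generates the negative form of the flag, so the
# switch that disables "--verbose" is "--no-verbose" (hyphen, not underscore).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--verbose",
    help="Print verbose output (e.g. quantization summary). Disable by --no-verbose.",
    default=True,
    action=argparse.BooleanOptionalAction,
)

print(parser.parse_args([]).verbose)                # True (default)
print(parser.parse_args(["--no-verbose"]).verbose)  # False
```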

examples/nemo_run/qat/ADVANCED.md

Lines changed: 4 additions & 4 deletions
````diff
@@ -8,15 +8,13 @@ To run the example on slurm, edit the `SLURM_CONFIG` at the bottom of `nemo_qat_
 
 To launch the Flow on a Slurm cluster, modify your Slurm credentials at the bottom of `nemo_qat_flow.py` and add the `--use-slurm` flag to the command. On a different server (e.g. your local server), launch the NeMo container as described in the [README](README.md) then run `python qat/nemo_qat_flow.py --use-slurm --log-dir /slurm/log/dir`, which will `ssh` into the Slurm cluster, `rsync` your files over, and launch the tasks. The log directory on the Slurm cluster should look like this after an experiment is run (assuming your experiment name is `qat_flow_ckpts`)
 
-**NOTE:** `rsync` may not currently be available in the NeMo container and will be added as a dependency.
-
-```
+```bash
 qat_flow_ckpts qat_flow_ckpts_1755708286
 ```
 
 If you `cd` into the experiment itself, e.g. `cd qat_flow_ckpts_1755708286`, you'll find a directory structure like the following. Each folder is for a stage of the Simplified Flow, and in each stage you can see the logs for that stage as well as the sbatch command that was run. You can `cd` into each stage and `tail -f` the log file to see the logs while the stage is running.
 
-```
+```bash
 ├── 00_openscience_data
 │   ├── code
 │   ├── configs
@@ -54,3 +52,5 @@ If you `cd` into the experiment itself, e.g. `cd qat_flow_ckpts_1755708286`, you
 │   ├── code
 │   └── configs
 ```
+
+**NOTE:** `rsync` may not currently be available in the NeMo container and will be added as a dependency.
````

examples/nemo_run/qat/nemo_qat_flow.py

Lines changed: 29 additions & 23 deletions
```diff
@@ -140,6 +140,8 @@ def get_args():
         action="store_true",
         default=False,
     )
+    parser.add_argument("--tensor_parallelism", type=int, default=1)
+    parser.add_argument("--pipeline_parallelism", type=int, default=1)
     return parser.parse_args()
 
 
@@ -243,6 +245,8 @@ def main(args):
     train.trainer.devices = args.train_gpus
     train.trainer.num_nodes = args.train_nodes
     train.trainer.limit_val_batches = 32
+    train.trainer.strategy.tensor_model_parallel_size = args.tensor_parallelism
+    train.trainer.strategy.pipeline_model_parallel_size = args.pipeline_parallelism
 
     # 5. Export
     export = run.Partial(
@@ -257,29 +261,33 @@ def main(args):
     mmlu_script_path = "examples/nemo_run/common/in_memory_mmlu.py"
     eval_ptq = run.Script(
         mmlu_script_path,
-        args=["--nemo_ckpt", ptq_model_out],
+        args=["--nemo_ckpt", ptq_model_out, "--tensor_parallelism", f"{args.ptq_gpus}"],
         entrypoint="python",
     )
     eval_bf16 = run.Script(
         mmlu_script_path,
-        args=["--nemo_ckpt", bf16_ckpt_path],
+        args=["--nemo_ckpt", bf16_ckpt_path, "--tensor_parallelism", f"{args.ptq_gpus}"],
         entrypoint="python",
     )
     eval_sft = run.Script(
         mmlu_script_path,
-        args=["--finetuned_ckpt_dir", exp_dir],
+        args=["--finetuned_ckpt_dir", exp_dir, "--tensor_parallelism", f"{args.ptq_gpus}"],
         entrypoint="python",
     )
 
     if args.use_slurm:
         cpu_executor = create_slurm_executor(SLURM_CONFIG)
-        gpu_executor = create_slurm_executor(
+        ptq_gpu_executor = create_slurm_executor(
             SLURM_CONFIG, num_gpus=args.ptq_gpus, ntasks_per_node=args.ptq_gpus
         )
+        train_gpu_executor = create_slurm_executor(
+            SLURM_CONFIG, num_gpus=args.train_gpus, ntasks_per_node=args.train_gpus
+        )
         single_gpu_executor = create_slurm_executor(SLURM_CONFIG, num_gpus=1, ntasks_per_node=1)
     else:
         cpu_executor = single_gpu_executor = run.LocalExecutor()
-        gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=args.ptq_gpus)
+        ptq_gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=args.ptq_gpus)
+        train_gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=args.train_gpus)
 
     with run.Experiment(exp_dir, log_level="INFO") as exp:
         if not args.data_path:
@@ -294,45 +302,46 @@ def main(args):
             eval_bf16,
             tail_logs=True,
             name="02_mmlu_bf16",
-            executor=single_gpu_executor,
+            executor=ptq_gpu_executor,
             dependencies=[s1],
         )
 
         # 2. PTQ model and evaluate PTQ model
-        s2 = exp.add(ptq, tail_logs=True, name="03_ptq", executor=gpu_executor, dependencies=[s1])
+        s2 = exp.add(
+            ptq, tail_logs=True, name="03_ptq", executor=ptq_gpu_executor, dependencies=[s1]
+        )
         s3 = exp.add(
             eval_ptq,
             tail_logs=True,
             name="04_mmlu_ptq",
-            executor=single_gpu_executor,
+            executor=ptq_gpu_executor,
             dependencies=[s2],
         )
         # 3. Train PTQ model (QAT or QAD)
-        if args.use_slurm:  # Set training arguments
-            gpu_executor.nodes = args.train_nodes
-            gpu_executor.gpus_per_node = gpu_executor.ntasks_per_node = args.train_gpus
-        else:
-            gpu_executor.ntasks_per_node = args.train_gpus
         train_dep = [s3]
         if not args.data_path:
             train_dep.append(s0)
         s4 = exp.add(
-            train, tail_logs=True, name="05_train", executor=gpu_executor, dependencies=train_dep
+            train,
+            tail_logs=True,
+            name="05_train",
+            executor=train_gpu_executor,
+            dependencies=train_dep,
         )
-
         s5 = exp.add(
             eval_sft,
             tail_logs=True,
             name="06_mmlu_sft",
-            executor=single_gpu_executor,
+            executor=ptq_gpu_executor,
             dependencies=[s4],
         )
-        gpu_executor.ntasks_per_node = 1  # will throw error if more than 1 task during export
+        # WAR: Export needs access to all GPUs but only 1 task due to bug in NeMo
+        train_gpu_executor.ntasks_per_node = 1  # will throw error if more than 1 task during export
        exp.add(
             export,
             tail_logs=True,
             name="07_export_hf",
-            executor=gpu_executor,
+            executor=train_gpu_executor,
             dependencies=[s5],
         )
         exp.run(detach=True)
@@ -356,10 +365,7 @@ def main(args):
         use_local_tunnel=False,
         host="",
         user="",
-        container_mounts=[
-            "/path/to/logs:/path/to/logs",
-            "/path/to/NeMo:/opt/NeMo",
-        ],
+        container_mounts=[],
         job_dir="/path/to/logs",
         identity=None,
     )
@@ -369,7 +375,7 @@ def main(args):
     SEQUENCE_LENGTH = 4096
     MBS = 1
     GBS = 512
-    TRAIN_STEPS = 200
+    TRAIN_STEPS = 400
     VAL_INTERVAL = 50
     # # # # # # # # # # # # # # # # # # # # # #
 
```
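A minimal stand-in sketch (plain Python, no NeMo or nemo_run imports) of how the two new CLI flags flow into the trainer strategy fields set in the diff; the `SimpleNamespace` objects are placeholders for the real `train.trainer` recipe configured in `nemo_qat_flow.py`.

```python
# Stand-in sketch of the new parallelism wiring; SimpleNamespace objects are
# placeholders for the real train.trainer recipe, and the attribute names mirror
# the ones set in the diff above.
import argparse
from types import SimpleNamespace

parser = argparse.ArgumentParser()
parser.add_argument("--tensor_parallelism", type=int, default=1)
parser.add_argument("--pipeline_parallelism", type=int, default=1)
args = parser.parse_args(["--tensor_parallelism", "4", "--pipeline_parallelism", "2"])

trainer = SimpleNamespace(strategy=SimpleNamespace())
trainer.strategy.tensor_model_parallel_size = args.tensor_parallelism
trainer.strategy.pipeline_model_parallel_size = args.pipeline_parallelism
print(trainer.strategy.tensor_model_parallel_size,
      trainer.strategy.pipeline_model_parallel_size)  # 4 2
```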
examples/onnx_ptq/README.md

Lines changed: 2 additions & 2 deletions
````diff
@@ -120,7 +120,7 @@ The following evaluation requires the `val` directory of the [ImageNet dataset](
 python evaluate.py \
     --onnx_path=<path to classification model> \
     --imagenet_path=<path to the ImageNet dataset> \
-    --quantize_mode=<fp8|int8|int4> \
+    --engine_precision=stronglyTyped \
     --model_name=vit_base_patch16_224
 ```
 
@@ -165,7 +165,7 @@ If the input model is of type image classification, use the following script to
 python evaluate.py \
     --onnx_path=<path to the exported ONNX model> \
     --imagenet_path=<path to the ImageNet dataset> \
-    --quantize_mode=stronglyTyped \
+    --engine_precision=stronglyTyped \
     --model_name=vit_base_patch16_224
 ```
 
````

examples/onnx_ptq/docker/Dockerfile

Lines changed: 5 additions & 4 deletions
```diff
@@ -12,10 +12,11 @@ RUN python -m pip install --upgrade pip \
 
 WORKDIR /workspace
 
-RUN pip install tensorrt==10.13.2.6 && \
-    export TRT_PATH=$(python -c "import tensorrt; import os; print(os.path.dirname(tensorrt.__file__))") && \
-    export LD_LIBRARY_PATH="$TRT_PATH/lib:/usr/include:${LD_LIBRARY_PATH}" && \
-    export PATH="$TRT_PATH/bin:${PATH}"
+RUN pip install tensorrt==10.13.2.6
+ENV TRT_PATH=/usr/local/lib/python3.12/dist-packages/tensorrt
+ENV CUDNN_LIB_DIR=/usr/local/lib/python3.12/dist-packages/nvidia/cudnn/lib
+ENV LD_LIBRARY_PATH="${CUDNN_LIB_DIR}:${TRT_PATH}/lib:/usr/include:${LD_LIBRARY_PATH}"
+ENV PATH="${TRT_PATH}/bin:${PATH}"
 
 # Copy application code and install requirements
 COPY modelopt modelopt/modelopt
```

examples/onnx_ptq/evaluate.py

Lines changed: 5 additions & 12 deletions
```diff
@@ -48,29 +48,22 @@ def main():
     parser.add_argument(
         "--eval_data_size", type=int, default=None, help="Number of examples to evaluate"
     )
-    # By default, TensorRT autotunes tensor types to generate the fastest engine. When you specify
-    # to TensorRT that a network is strongly typed, it infers a type for each intermediate and
-    # output tensor using the rules in the operator type specification. For networks quantized in
-    # INT4 or FP8 mode, stronglyTyped as the mode is recommended for TensorRT deployment. Though
-    # INT8 networks are generally compiled with int8 mode, certain INT8 ViT networks compiled with
-    # stronglyTyped precision have shown better performance.
     parser.add_argument(
-        "--quantize_mode",
+        "--engine_precision",
         type=str,
         default="stronglyTyped",
-        choices=["fp8", "fp16", "fp32", "int4", "int8", "int8_iq", "bf16", "best", "stronglyTyped"],
-        help="Quantization mode for the TensorRT engine. \
-            Supported options: fp8, fp16, fp32, int8, int8_iq(implicit quantization), bf16, best, stronglyTyped",
+        choices=["best", "fp16", "stronglyTyped"],
+        help="Precision mode for the TensorRT engine. \
+            stronglyTyped is recommended, all other modes have been deprecated in TensorRT",
     )
     parser.add_argument(
         "--results_path", type=str, default=None, help="Save the results to the specified path"
     )
 
     args = parser.parse_args()
-
     deployment = {
         "runtime": "TRT",
-        "precision": args.quantize_mode,
+        "precision": args.engine_precision,
     }
 
     # Create an ONNX bytes object with the specified path
```
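The net effect of the rename, sketched in isolation below: only the argument definition and the `deployment` dictionary shape mirror the diff above, while the rest of the evaluation script is omitted.

```python
# Isolated sketch of the renamed flag feeding the TensorRT deployment config;
# only the argument definition and the deployment dict mirror the diff above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--engine_precision",
    type=str,
    default="stronglyTyped",
    choices=["best", "fp16", "stronglyTyped"],
    help="Precision mode for the TensorRT engine; stronglyTyped is recommended.",
)
args = parser.parse_args([])

deployment = {
    "runtime": "TRT",
    "precision": args.engine_precision,  # previously args.quantize_mode
}
print(deployment)  # {'runtime': 'TRT', 'precision': 'stronglyTyped'}
```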
