@@ -0,0 +1,30 @@
{
"input_model": {
"type": "HfModel",
"model_path": "Qwen/Qwen2.5-1.5B-Instruct",
"task": "text-classification"
},
"systems": {
"local_system": {
"type": "LocalSystem",
"accelerators": [ { "device": "gpu", "execution_providers": [ "NvTensorRTRTXExecutionProvider" ] } ]
}
},
"engine": { "target": "local_system" },
"passes": {
"builder": { "type": "ModelBuilder", "precision": "fp16" },
"quantization": {
"type": "NVModelOptQuantization",
"algorithm": "awq",
"int4_block_size": 32,
"tokenizer_dir": "Qwen/Qwen2.5-1.5B-Instruct",
"calibration_method": "awq_lite",
"enable_mixed_quant": true,
"calibration_providers": ["NvTensorRtRtx"],
"calibration_params": {
"add_position_ids": false
}
}
},
"log_severity_level": 0
}
57 changes: 57 additions & 0 deletions Qwen-Qwen2.5-1.5B-Instruct/NvTensorRtRtx/README.md
@@ -19,3 +19,60 @@ Use the following command to export the model using Olive with NvTensorRTRTXExecutionProvider
```bash
olive run --config Qwen2.5-1.5B-Instruct_model_builder_fp16.json
```

## NVMO PTQ Mixed Precision Quantization

The Olive recipe `Qwen2.5-1.5B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json` produces an INT4 + INT8 mixed-precision quantized model using NVIDIA's TensorRT Model Optimizer toolkit with the AWQ algorithm.

### Setup

1. Install Olive with the NVIDIA TensorRT Model Optimizer toolkit

- Run the following command to install Olive with TensorRT Model Optimizer.
```bash
pip install olive-ai[nvmo]
```

- If TensorRT Model Optimizer needs to be installed from a local wheel, follow the steps below.

```bash
pip install olive-ai
pip install <modelopt-wheel>[onnx]
```

- Make sure that TensorRT Model Optimizer is installed correctly.
```bash
python -c "from modelopt.onnx.quantization.int4 import quantize as quantize_int4"
```

- Refer to the TensorRT Model Optimizer [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/windows/_installation_with_olive.html) for detailed installation instructions and setup dependencies.

2. Install suitable onnxruntime and onnxruntime-genai packages

- Install onnxruntime and onnxruntime-genai packages that have NvTensorRTRTXExecutionProvider support. Refer to the [NvTensorRtRtx execution provider](https://onnxruntime.ai/docs/execution-providers/TensorRTRTX-ExecutionProvider) documentation to set up its dependencies and requirements.
- Note that, by default, TensorRT Model Optimizer comes with onnxruntime-directml, and the onnxruntime-genai-cuda package comes with onnxruntime-gpu. To use an onnxruntime package with NvTensorRTRTXExecutionProvider support, you may need to uninstall the other onnxruntime packages.
- Make sure that, in the end, only one onnxruntime package is installed. Use a command like the following to validate the onnxruntime installation (see also the optional check sketched after this setup list).
```bash
python -c "import onnxruntime as ort; print(ort.get_available_providers())"
```

3. Install additional requirements.

- Install the packages listed in the requirements file.
```bash
pip install -r requirements-nvmo.txt
```
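
As referenced in step 2 above, the following optional Python sketch (not part of the recipe) lists every installed `onnxruntime*` distribution together with the execution providers it exposes, so you can confirm that only one onnxruntime package is installed and that `NvTensorRTRTXExecutionProvider` is available:

```python
# Optional sanity check (assumes Python 3.8+ for importlib.metadata and that
# an onnxruntime package is already installed).
from importlib.metadata import distributions

import onnxruntime as ort

# Collect the names of all installed distributions whose name starts with
# "onnxruntime"; ideally this prints exactly one entry.
ort_packages = sorted(
    {(d.metadata["Name"] or "") for d in distributions()
     if (d.metadata["Name"] or "").lower().startswith("onnxruntime")}
)
print("Installed onnxruntime packages:", ort_packages)

# NvTensorRTRTXExecutionProvider should appear in this list.
print("Available execution providers:", ort.get_available_providers())
```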

### Steps to run

```bash
olive run --config Qwen2.5-1.5B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json
```

### Recipe details

The Olive recipe `Qwen2.5-1.5B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json` has two passes: (a) `ModelBuilder` and (b) `NVModelOptQuantization`. The `ModelBuilder` pass generates the FP16 model for `NvTensorRTRTXExecutionProvider` (aka the `NvTensorRtRtx` EP). The `NVModelOptQuantization` pass then performs INT4 + INT8 mixed-precision quantization using the AWQ algorithm with the AWQ Lite calibration method to produce the optimized model.
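
After the run completes, a quick way to inspect the result is to count the operator types in the quantized ONNX graph. The sketch below is illustrative only: the model path is a placeholder for the `model.onnx` that Olive writes to its output folder, and the expectation that INT4 block-quantized weights appear as `MatMulNBits` nodes (with `QuantizeLinear`/`DequantizeLinear` nodes for INT8 tensors) is an assumption based on common ONNX quantization conventions, not a documented guarantee of `NVModelOptQuantization`.

```python
# Illustrative sketch: summarize operator types in the quantized model.
from collections import Counter

import onnx

# Placeholder path -- point this at the model.onnx produced by the recipe.
model = onnx.load("path/to/olive/output/model.onnx", load_external_data=False)

op_counts = Counter(node.op_type for node in model.graph.node)
for op_type, count in op_counts.most_common():
    print(f"{op_type}: {count}")
```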

### Troubleshooting

If you run into issues with quantization using the TensorRT Model Optimizer toolkit, refer to its [FAQs](https://nvidia.github.io/TensorRT-Model-Optimizer/support/2_faqs.html) for potential help and suggestions.
4 changes: 4 additions & 0 deletions Qwen-Qwen2.5-1.5B-Instruct/NvTensorRtRtx/info.yml
@@ -4,3 +4,7 @@ recipes:
file: Qwen2.5-1.5B-Instruct_model_builder_fp16.json
devices: gpu
eps: NvTensorRTRTXExecutionProvider
- name: Qwen2.5_1.5B_Instruct_NVMO_PTQ_Mixed_Precision_AWQ_Lite
file: Qwen2.5-1.5B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json
devices: gpu
eps: NvTensorRTRTXExecutionProvider
@@ -0,0 +1,4 @@
datasets>=2.14.4
torch
transformers

@@ -0,0 +1,30 @@
{
"input_model": {
"type": "HfModel",
"model_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"task": "text-classification"
},
"systems": {
"local_system": {
"type": "LocalSystem",
"accelerators": [ { "device": "gpu", "execution_providers": [ "NvTensorRTRTXExecutionProvider" ] } ]
}
},
"engine": { "target": "local_system" },
"passes": {
"builder": { "type": "ModelBuilder", "precision": "fp16" },
"quantization": {
"type": "NVModelOptQuantization",
"algorithm": "awq",
"int4_block_size": 32,
"tokenizer_dir": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"calibration_method": "awq_lite",
"enable_mixed_quant": true,
"calibration_providers": ["NvTensorRtRtx"],
"calibration_params": {
"add_position_ids": false
}
}
},
"log_severity_level": 0
}
57 changes: 57 additions & 0 deletions deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/NvTensorRtRtx/README.md
@@ -19,3 +19,60 @@ Use the following command to export the model using Olive with NvTensorRTRTXExecutionProvider
```bash
olive run --config DeepSeek-R1-Distill-Qwen-1.5B_model_builder_fp16.json
```

## NVMO PTQ Mixed Precision Quantization

The Olive recipe `DeepSeek-R1-Distill-Qwen_1.5B_nvmo_ptq_mixed_precision_awq_lite.json` produces an INT4 + INT8 mixed-precision quantized model using NVIDIA's TensorRT Model Optimizer toolkit with the AWQ algorithm.

### Setup

1. Install Olive with the NVIDIA TensorRT Model Optimizer toolkit

- Run the following command to install Olive with TensorRT Model Optimizer.
```bash
pip install olive-ai[nvmo]
```

- If TensorRT Model Optimizer needs to be installed from a local wheel, follow the steps below.

```bash
pip install olive-ai
pip install <modelopt-wheel>[onnx]
```

- Make sure that TensorRT Model Optimizer is installed correctly.
```bash
python -c "from modelopt.onnx.quantization.int4 import quantize as quantize_int4"
```

- Refer to the TensorRT Model Optimizer [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/windows/_installation_with_olive.html) for detailed installation instructions and setup dependencies.

2. Install suitable onnxruntime and onnxruntime-genai packages

- Install onnxruntime and onnxruntime-genai packages that have NvTensorRTRTXExecutionProvider support. Refer to the [NvTensorRtRtx execution provider](https://onnxruntime.ai/docs/execution-providers/TensorRTRTX-ExecutionProvider) documentation to set up its dependencies and requirements.
- Note that, by default, TensorRT Model Optimizer comes with onnxruntime-directml, and the onnxruntime-genai-cuda package comes with onnxruntime-gpu. To use an onnxruntime package with NvTensorRTRTXExecutionProvider support, you may need to uninstall the other onnxruntime packages.
- Make sure that, in the end, only one onnxruntime package is installed. Use a command like the following to validate the onnxruntime installation (see also the optional check sketched after this setup list).
```bash
python -c "import onnxruntime as ort; print(ort.get_available_providers())"
```

3. Install additional requirements.

- Install the packages listed in the requirements file.
```bash
pip install -r requirements-nvmo.txt
```
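
As referenced in step 2 above, the following optional Python sketch (not part of the recipe) lists every installed `onnxruntime*` distribution together with the execution providers it exposes, so you can confirm that only one onnxruntime package is installed and that `NvTensorRTRTXExecutionProvider` is available:

```python
# Optional sanity check (assumes Python 3.8+ for importlib.metadata and that
# an onnxruntime package is already installed).
from importlib.metadata import distributions

import onnxruntime as ort

# Collect the names of all installed distributions whose name starts with
# "onnxruntime"; ideally this prints exactly one entry.
ort_packages = sorted(
    {(d.metadata["Name"] or "") for d in distributions()
     if (d.metadata["Name"] or "").lower().startswith("onnxruntime")}
)
print("Installed onnxruntime packages:", ort_packages)

# NvTensorRTRTXExecutionProvider should appear in this list.
print("Available execution providers:", ort.get_available_providers())
```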

### Steps to run

```bash
olive run --config DeepSeek-R1-Distill-Qwen_1.5B_nvmo_ptq_mixed_precision_awq_lite.json
```

### Recipe details

The Olive recipe `DeepSeek-R1-Distill-Qwen_1.5B_nvmo_ptq_mixed_precision_awq_lite.json` has two passes: (a) `ModelBuilder` and (b) `NVModelOptQuantization`. The `ModelBuilder` pass generates the FP16 model for `NvTensorRTRTXExecutionProvider` (aka the `NvTensorRtRtx` EP). The `NVModelOptQuantization` pass then performs INT4 + INT8 mixed-precision quantization using the AWQ algorithm with the AWQ Lite calibration method to produce the optimized model.
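
After the run completes, a quick way to inspect the result is to count the operator types in the quantized ONNX graph. The sketch below is illustrative only: the model path is a placeholder for the `model.onnx` that Olive writes to its output folder, and the expectation that INT4 block-quantized weights appear as `MatMulNBits` nodes (with `QuantizeLinear`/`DequantizeLinear` nodes for INT8 tensors) is an assumption based on common ONNX quantization conventions, not a documented guarantee of `NVModelOptQuantization`.

```python
# Illustrative sketch: summarize operator types in the quantized model.
from collections import Counter

import onnx

# Placeholder path -- point this at the model.onnx produced by the recipe.
model = onnx.load("path/to/olive/output/model.onnx", load_external_data=False)

op_counts = Counter(node.op_type for node in model.graph.node)
for op_type, count in op_counts.most_common():
    print(f"{op_type}: {count}")
```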

### Troubleshooting

If you run into issues with quantization using the TensorRT Model Optimizer toolkit, refer to its [FAQs](https://nvidia.github.io/TensorRT-Model-Optimizer/support/2_faqs.html) for potential help and suggestions.
@@ -4,3 +4,7 @@ recipes:
file: DeepSeek-R1-Distill-Qwen-1.5B_model_builder_fp16.json
devices: gpu
eps: NvTensorRTRTXExecutionProvider
- name: DeepSeek-R1-Distill-Qwen-1.5B_NVMO_PTQ_Mixed_Precision_AWQ_Lite
file: DeepSeek-R1-Distill-Qwen_1.5B_nvmo_ptq_mixed_precision_awq_lite.json
devices: gpu
eps: NvTensorRTRTXExecutionProvider
@@ -0,0 +1,4 @@
datasets>=2.14.4
torch
transformers

@@ -0,0 +1,30 @@
{
"input_model": {
"type": "HfModel",
"model_path": "meta-llama/Llama-3.2-1B-Instruct",
"task": "text-classification"
},
"systems": {
"local_system": {
"type": "LocalSystem",
"accelerators": [ { "device": "gpu", "execution_providers": [ "NvTensorRTRTXExecutionProvider" ] } ]
}
},
"engine": { "target": "local_system" },
"passes": {
"builder": { "type": "ModelBuilder", "precision": "fp16" },
"quantization": {
"type": "NVModelOptQuantization",
"algorithm": "awq",
"int4_block_size": 32,
"tokenizer_dir": "meta-llama/Llama-3.2-1B-Instruct",
"calibration_method": "awq_lite",
"enable_mixed_quant": true,
"calibration_providers": ["NvTensorRtRtx"],
"calibration_params": {
"add_position_ids": false
}
}
},
"log_severity_level": 0
}
57 changes: 57 additions & 0 deletions meta-llama-Llama-3.2-1B-Instruct/NvTensorRtRtx/README.md
@@ -19,3 +19,60 @@ Use the following command to export the model using Olive with NvTensorRTRTXExecutionProvider
```bash
olive run --config Llama-3.2-1B-Instruct_model_builder_fp16.json
```

## NVMO PTQ Mixed Precision Quantization

The Olive recipe `Llama-3.2-1B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json` produces an INT4 + INT8 mixed-precision quantized model using NVIDIA's TensorRT Model Optimizer toolkit with the AWQ algorithm.

### Setup

1. Install Olive with the NVIDIA TensorRT Model Optimizer toolkit

- Run the following command to install Olive with TensorRT Model Optimizer.
```bash
pip install olive-ai[nvmo]
```

- If TensorRT Model Optimizer needs to be installed from a local wheel, follow the steps below.

```bash
pip install olive-ai
pip install <modelopt-wheel>[onnx]
```

- Make sure that TensorRT Model Optimizer is installed correctly.
```bash
python -c "from modelopt.onnx.quantization.int4 import quantize as quantize_int4"
```

- Refer to the TensorRT Model Optimizer [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/windows/_installation_with_olive.html) for detailed installation instructions and setup dependencies.

2. Install suitable onnxruntime and onnxruntime-genai packages

- Install onnxruntime and onnxruntime-genai packages that have NvTensorRTRTXExecutionProvider support. Refer to the [NvTensorRtRtx execution provider](https://onnxruntime.ai/docs/execution-providers/TensorRTRTX-ExecutionProvider) documentation to set up its dependencies and requirements.
- Note that, by default, TensorRT Model Optimizer comes with onnxruntime-directml, and the onnxruntime-genai-cuda package comes with onnxruntime-gpu. To use an onnxruntime package with NvTensorRTRTXExecutionProvider support, you may need to uninstall the other onnxruntime packages.
- Make sure that, in the end, only one onnxruntime package is installed. Use a command like the following to validate the onnxruntime installation (see also the optional check sketched after this setup list).
```bash
python -c "import onnxruntime as ort; print(ort.get_available_providers())"
```

3. Install additional requirements.

- Install the packages listed in the requirements file.
```bash
pip install -r requirements-nvmo.txt
```
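
As referenced in step 2 above, the following optional Python sketch (not part of the recipe) lists every installed `onnxruntime*` distribution together with the execution providers it exposes, so you can confirm that only one onnxruntime package is installed and that `NvTensorRTRTXExecutionProvider` is available:

```python
# Optional sanity check (assumes Python 3.8+ for importlib.metadata and that
# an onnxruntime package is already installed).
from importlib.metadata import distributions

import onnxruntime as ort

# Collect the names of all installed distributions whose name starts with
# "onnxruntime"; ideally this prints exactly one entry.
ort_packages = sorted(
    {(d.metadata["Name"] or "") for d in distributions()
     if (d.metadata["Name"] or "").lower().startswith("onnxruntime")}
)
print("Installed onnxruntime packages:", ort_packages)

# NvTensorRTRTXExecutionProvider should appear in this list.
print("Available execution providers:", ort.get_available_providers())
```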

### Steps to run

```bash
olive run --config Llama-3.2-1B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json
```

### Recipe details

The Olive recipe `Llama-3.2-1B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json` has two passes: (a) `ModelBuilder` and (b) `NVModelOptQuantization`. The `ModelBuilder` pass generates the FP16 model for `NvTensorRTRTXExecutionProvider` (aka the `NvTensorRtRtx` EP). The `NVModelOptQuantization` pass then performs INT4 + INT8 mixed-precision quantization using the AWQ algorithm with the AWQ Lite calibration method to produce the optimized model.
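
After the run completes, a quick way to inspect the result is to count the operator types in the quantized ONNX graph. The sketch below is illustrative only: the model path is a placeholder for the `model.onnx` that Olive writes to its output folder, and the expectation that INT4 block-quantized weights appear as `MatMulNBits` nodes (with `QuantizeLinear`/`DequantizeLinear` nodes for INT8 tensors) is an assumption based on common ONNX quantization conventions, not a documented guarantee of `NVModelOptQuantization`.

```python
# Illustrative sketch: summarize operator types in the quantized model.
from collections import Counter

import onnx

# Placeholder path -- point this at the model.onnx produced by the recipe.
model = onnx.load("path/to/olive/output/model.onnx", load_external_data=False)

op_counts = Counter(node.op_type for node in model.graph.node)
for op_type, count in op_counts.most_common():
    print(f"{op_type}: {count}")
```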

### Troubleshooting

If you run into issues with quantization using the TensorRT Model Optimizer toolkit, refer to its [FAQs](https://nvidia.github.io/TensorRT-Model-Optimizer/support/2_faqs.html) for potential help and suggestions.
4 changes: 4 additions & 0 deletions meta-llama-Llama-3.2-1B-Instruct/NvTensorRtRtx/info.yml
@@ -4,3 +4,7 @@ recipes:
file: Llama-3.2-1B-Instruct_model_builder_fp16.json
devices: gpu
eps: NvTensorRTRTXExecutionProvider
- name: Llama-3.2-1B-Instruct_NVMO_PTQ_Mixed_Precision_AWQ_Lite
file: Llama-3.2-1B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json
devices: gpu
eps: NvTensorRTRTXExecutionProvider
@@ -0,0 +1,4 @@
datasets>=2.14.4
torch
transformers
