|
20 | 20 | import numpy as np |
21 | 21 | import torch |
22 | 22 | import wrapt |
| 23 | +from datasets import load_dataset |
23 | 24 | from transformers import AutoModel, AutoTokenizer |
24 | 25 |
|
25 | 26 | from nemo_deploy import ITritonDeployable |
|
30 | 31 | ) |
31 | 32 | from nemo_export_deploy_common.import_utils import ( |
32 | 33 | MISSING_MODELOPT_MSG, |
33 | | - MISSING_NEMO_MSG, |
34 | 34 | MISSING_TENSORRT_MSG, |
35 | 35 | UnavailableError, |
36 | 36 | ) |
|
63 | 63 | trt = MagicMock() |
64 | 64 | HAVE_TENSORRT = False |
65 | 65 |
|
66 | | -try: |
67 | | - from nemo.collections.llm.modelopt.quantization.quant_cfg_choices import ( |
68 | | - get_quant_cfg_choices, |
69 | | - ) |
70 | | - |
71 | | - QUANT_CFG_CHOICES = get_quant_cfg_choices() |
72 | | - |
73 | | - HAVE_NEMO = True |
74 | | -except (ImportError, ModuleNotFoundError): |
75 | | - HAVE_NEMO = False |
76 | | - |
77 | 66 |
|
78 | 67 | @wrapt.decorator |
79 | 68 | def noop_decorator(func): |
@@ -254,6 +243,7 @@ def _export_to_onnx( |
254 | 243 | dynamic_axes={**dynamic_axes_input, **dynamic_axes_output}, |
255 | 244 | verbose=verbose, |
256 | 245 | opset_version=opset, |
| 246 | + dynamo=False, |
257 | 247 | ) |
258 | 248 | logging.info(f"Successfully exported PyTorch model to ONNX model {self.onnx_model_path}") |
259 | 249 |
|
@@ -494,17 +484,15 @@ def quantize( |
494 | 484 | forward_loop (callable): A function that accepts the model as a single parameter |
495 | 485 | and runs sample data through it. This is used for calibration during quantization. |
496 | 486 | """ |
497 | | - if not HAVE_NEMO: |
498 | | - raise UnavailableError(MISSING_NEMO_MSG) |
499 | | - |
500 | 487 | if not HAVE_MODELOPT: |
501 | 488 | raise UnavailableError(MISSING_MODELOPT_MSG) |
502 | 489 |
|
| 490 | + quant_cfg_choices = get_quant_cfg_choices() |
503 | 491 | if isinstance(quant_cfg, str): |
504 | | - assert quant_cfg in QUANT_CFG_CHOICES, ( |
505 | | - f"Quantization config {quant_cfg} is not supported. Supported configs: {list(QUANT_CFG_CHOICES)}" |
| 492 | + assert quant_cfg in quant_cfg_choices, ( |
| 493 | + f"Quantization config {quant_cfg} is not supported. Supported configs: {list(quant_cfg_choices)}" |
506 | 494 | ) |
507 | | - quant_cfg = QUANT_CFG_CHOICES[quant_cfg] |
| 495 | + quant_cfg = quant_cfg_choices[quant_cfg] |
508 | 496 |
|
509 | 497 | logging.info("Starting quantization...") |
510 | 498 | mtq.quantize(self.model, quant_cfg, forward_loop=forward_loop) |
@@ -539,3 +527,57 @@ def get_triton_output(self): |
539 | 527 | def triton_infer_fn(self, **inputs: np.ndarray): |
540 | 528 | """PyTriton inference function.""" |
541 | 529 | raise NotImplementedError("This function will be implemented later.") |
| 530 | + |
| 531 | + |
def get_calib_data_iter(
    data: str = "cnn_dailymail", batch_size: int = 64, calib_size: int = 512, max_sequence_length: int = 512
):
    """Yield batches of truncated text samples for quantization calibration.

    Args:
        data: Dataset selector -- "wikitext", "cnn_dailymail", or a path to a
            local JSON file expected to contain a "text" column.
        batch_size: Number of samples per yielded batch.
        calib_size: Target total number of calibration samples.
        max_sequence_length: Maximum number of characters kept per sample.

    Yields:
        list[str]: Batches of text snippets, each truncated to
        ``max_sequence_length`` characters.
    """
    if data == "wikitext":
        dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")
        text_column = "text"
    elif data == "cnn_dailymail":
        dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train")
        text_column = "article"
    else:
        # Assume a local JSON dataset with a column named "text"
        dataset = load_dataset("json", data_files=data, split="train")
        text_column = "text"

    # Clamp to the dataset size, but never go below one full batch.
    effective_size = max(min(len(dataset), calib_size), batch_size)
    for batch_idx in range(effective_size // batch_size):
        start = batch_idx * batch_size
        rows = dataset[start : start + batch_size][text_column]
        # Character-level truncation keeps each calibration input bounded.
        yield [sample[:max_sequence_length] for sample in rows]
| 552 | + |
| 553 | + |
def get_quant_cfg_choices() -> Dict[str, Dict[str, Any]]:
    """
    Retrieve a dictionary of modelopt quantization configuration choices.

    Probes the modelopt.torch.quantization (mtq) module for a known set of
    configuration attributes and maps each one that exists to a short alias.
    Because the available attributes vary across modelopt library versions,
    any missing configuration is skipped rather than raising.

    Returns:
        dict: A dictionary where keys are short names (e.g., "fp8") and values
        are the corresponding modelopt quantization configuration objects.
    """
    # (short alias, attribute name on mtq); tuple order fixes dict ordering.
    alias_to_attr = (
        ("int8", "INT8_DEFAULT_CFG"),
        ("int8_sq", "INT8_SMOOTHQUANT_CFG"),
        ("fp8", "FP8_DEFAULT_CFG"),
        ("block_fp8", "FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG"),
        ("int4_awq", "INT4_AWQ_CFG"),
        ("w4a8_awq", "W4A8_AWQ_BETA_CFG"),
        ("int4", "INT4_BLOCKWISE_WEIGHT_ONLY_CFG"),
        ("nvfp4", "NVFP4_DEFAULT_CFG"),
    )
    # The walrus keeps the getattr result while filtering out absent (or
    # otherwise falsy) configurations, mirroring a guarded-append loop.
    return {alias: cfg for alias, attr in alias_to_attr if (cfg := getattr(mtq, attr, None))}
0 commit comments