Commit 5ceae23

cp: Re-enable onnx test (#597) in r0.4.0 (#598)
Signed-off-by: Charlie Truong <chtruong@nvidia.com>
1 parent 41ff021 commit 5ceae23

File tree: 11 files changed (+241, -110 lines)

.github/workflows/cicd-main.yml

Lines changed: 34 additions & 34 deletions
@@ -206,40 +206,40 @@ jobs:
           ngc-api-user: ${{ secrets.NGC_API_USER }}
           ngc-api-key: ${{ secrets.NGC_API_KEY }}

-  # cicd-e2e-tests-trt-onnx:
-  #   needs: [cicd-unit-tests-trtllm, pre-flight]
-  #   runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
-  #   name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
-  #   environment: nemo-ci
-  #   if: |
-  #     (
-  #       success()
-  #       || (
-  #         needs.cicd-wait-in-queue.result == 'skipped'
-  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
-  #       )
-  #     )
-  #     && !cancelled()
-  #   steps:
-  #     - name: Checkout
-  #       uses: actions/checkout@v4
-  #     - name: main
-  #       uses: ./.github/actions/test-template
-  #       with:
-  #         script: L2_ONNX_TRT
-  #         is_optional: ${{ matrix.is_optional || false }}
-  #         azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
-  #         azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-  #         azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
-  #         has-azure-credentials: true
-  #         is_unit_test: "false"
-  #         timeout: 60
-  #         PAT: ${{ secrets.PAT }}
-  #         inference-framework: trt-onnx
-  #         test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
-  #         runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
-  #         ngc-api-user: ${{ secrets.NGC_API_USER }}
-  #         ngc-api-key: ${{ secrets.NGC_API_KEY }}
+  cicd-e2e-tests-trt-onnx:
+    needs: [cicd-unit-tests-trtllm, pre-flight]
+    runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
+    name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
+    environment: nemo-ci
+    if: |
+      (
+        success()
+        || (
+          needs.cicd-wait-in-queue.result == 'skipped'
+          && needs.pre-flight.outputs.is_ci_workload == 'true'
+        )
+      )
+      && !cancelled()
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: main
+        uses: ./.github/actions/test-template
+        with:
+          script: L2_ONNX_TRT
+          is_optional: ${{ matrix.is_optional || false }}
+          azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
+          azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+          has-azure-credentials: true
+          is_unit_test: "false"
+          timeout: 60
+          PAT: ${{ secrets.PAT }}
+          inference-framework: trt-onnx
+          test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
+          runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
+          ngc-api-user: ${{ secrets.NGC_API_USER }}
+          ngc-api-key: ${{ secrets.NGC_API_KEY }}

   cicd-e2e-tests-vllm:
     needs: [cicd-unit-tests-vllm, pre-flight]

nemo_export/onnx_llm_exporter.py

Lines changed: 60 additions & 18 deletions
@@ -20,6 +20,7 @@
 import numpy as np
 import torch
 import wrapt
+from datasets import load_dataset
 from transformers import AutoModel, AutoTokenizer

 from nemo_deploy import ITritonDeployable
@@ -30,7 +31,6 @@
 )
 from nemo_export_deploy_common.import_utils import (
     MISSING_MODELOPT_MSG,
-    MISSING_NEMO_MSG,
     MISSING_TENSORRT_MSG,
     UnavailableError,
 )
@@ -63,17 +63,6 @@
     trt = MagicMock()
     HAVE_TENSORRT = False

-try:
-    from nemo.collections.llm.modelopt.quantization.quant_cfg_choices import (
-        get_quant_cfg_choices,
-    )
-
-    QUANT_CFG_CHOICES = get_quant_cfg_choices()
-
-    HAVE_NEMO = True
-except (ImportError, ModuleNotFoundError):
-    HAVE_NEMO = False
-

 @wrapt.decorator
 def noop_decorator(func):
@@ -254,6 +243,7 @@ def _export_to_onnx(
             dynamic_axes={**dynamic_axes_input, **dynamic_axes_output},
             verbose=verbose,
             opset_version=opset,
+            dynamo=False,
         )
         logging.info(f"Successfully exported PyTorch model to ONNX model {self.onnx_model_path}")

@@ -494,17 +484,15 @@ def quantize(
             forward_loop (callable): A function that accepts the model as a single parameter
                 and runs sample data through it. This is used for calibration during quantization.
         """
-        if not HAVE_NEMO:
-            raise UnavailableError(MISSING_NEMO_MSG)
-
         if not HAVE_MODELOPT:
             raise UnavailableError(MISSING_MODELOPT_MSG)

+        quant_cfg_choices = get_quant_cfg_choices()
         if isinstance(quant_cfg, str):
-            assert quant_cfg in QUANT_CFG_CHOICES, (
-                f"Quantization config {quant_cfg} is not supported. Supported configs: {list(QUANT_CFG_CHOICES)}"
+            assert quant_cfg in quant_cfg_choices, (
+                f"Quantization config {quant_cfg} is not supported. Supported configs: {list(quant_cfg_choices)}"
             )
-            quant_cfg = QUANT_CFG_CHOICES[quant_cfg]
+            quant_cfg = quant_cfg_choices[quant_cfg]

         logging.info("Starting quantization...")
         mtq.quantize(self.model, quant_cfg, forward_loop=forward_loop)
@@ -539,3 +527,57 @@ def get_triton_output(self):
     def triton_infer_fn(self, **inputs: np.ndarray):
         """PyTriton inference function."""
         raise NotImplementedError("This function will be implemented later.")
+
+
+def get_calib_data_iter(
+    data: str = "cnn_dailymail", batch_size: int = 64, calib_size: int = 512, max_sequence_length: int = 512
+):
+    """Creates a sample data iterator for calibration."""
+    if data == "wikitext":
+        dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")
+        text_column = "text"
+    elif data == "cnn_dailymail":
+        dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train")
+        text_column = "article"
+    else:
+        # Assume a local JSON dataset with a column named "text"
+        dataset = load_dataset("json", data_files=data, split="train")
+        text_column = "text"
+    calib_size = max(min(len(dataset), calib_size), batch_size)
+    for i in range(calib_size // batch_size):
+        batch = dataset[i * batch_size : (i + 1) * batch_size][text_column]
+        for j in range(len(batch)):
+            batch[j] = batch[j][:max_sequence_length]
+        yield batch
+
+
+def get_quant_cfg_choices() -> Dict[str, Dict[str, Any]]:
+    """
+    Retrieve a dictionary of modelopt quantization configuration choices.
+
+    This function checks for the availability of specific quantization configurations defined in
+    the modelopt.torch.quantization (mtq) module and returns a dictionary mapping short names to
+    their corresponding configurations. The function is intended to work for different modelopt
+    library versions that come with variable configuration choices.
+
+    Returns:
+        dict: A dictionary where keys are short names (e.g., "fp8") and values are the
+            corresponding modelopt quantization configuration objects.
+    """
+    quant_cfg_names = [
+        ("int8", "INT8_DEFAULT_CFG"),
+        ("int8_sq", "INT8_SMOOTHQUANT_CFG"),
+        ("fp8", "FP8_DEFAULT_CFG"),
+        ("block_fp8", "FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG"),
+        ("int4_awq", "INT4_AWQ_CFG"),
+        ("w4a8_awq", "W4A8_AWQ_BETA_CFG"),
+        ("int4", "INT4_BLOCKWISE_WEIGHT_ONLY_CFG"),
+        ("nvfp4", "NVFP4_DEFAULT_CFG"),
+    ]
+
+    quant_cfg_choices = {}
+    for short_name, full_name in quant_cfg_names:
+        if config := getattr(mtq, full_name, None):
+            quant_cfg_choices[short_name] = config
+
+    return quant_cfg_choices
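
For reference, the two helpers this commit moves to module level in nemo_export.onnx_llm_exporter can be exercised on their own. A minimal sketch (assuming the datasets package can download cnn_dailymail and a modelopt build is importable; the printed values are illustrative):

from nemo_export.onnx_llm_exporter import get_calib_data_iter, get_quant_cfg_choices

# Short names for whichever quantization configs this modelopt build provides,
# e.g. ["fp8", "int4_awq", "int8", "int8_sq", ...].
print(sorted(get_quant_cfg_choices()))

# Stream 512 calibration samples from cnn_dailymail in batches of 64,
# each sample truncated to 512 characters.
for batch in get_calib_data_iter(data="cnn_dailymail", batch_size=64, calib_size=512):
    print(len(batch), type(batch[0]))  # 64 <class 'str'>
    break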

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@ dependencies = [
 inframework = []
 vllm = ["vllm~=0.11.2", "pandas", "timm"]
 trtllm = ["tensorrt-llm~=1.1.0", "cuda-python~=13.0.0"]
-trt-onnx = ["tensorrt==10.14.1.48.post1", "onnx==1.18.0", "transformers==4.51.3"]
+trt-onnx = ["tensorrt==10.14.1.48.post1", "onnx==1.18.0", "onnxscript>=0.6.0", "transformers==4.51.3"]

 [dependency-groups]
 # This is a default group so that we install these even with bare `uv sync`
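
A quick way to confirm the trt-onnx extra resolved as pinned (a sketch; the package list mirrors the extra above, and the reported versions are whatever resolved locally):

from importlib.metadata import version

# Print the installed version of each package declared by the trt-onnx extra.
for pkg in ("tensorrt", "onnx", "onnxscript", "transformers"):
    print(pkg, version(pkg))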

tests/functional_tests/L2_ONNX_TRT.sh

Lines changed: 3 additions & 0 deletions
@@ -15,6 +15,9 @@
 #!/bin/bash
 set -xeuo pipefail # Exit immediately if a command exits with a non-zero status

+# onnx export only works with an older transformers version
+pushd .. && uv pip install transformers==4.51.3 && popd
+
 export CUDA_VISIBLE_DEVICES="0,1"

 coverage run \

tests/functional_tests/tests_onnx_trt/test_export.py

Lines changed: 0 additions & 1 deletion
@@ -34,7 +34,6 @@ def tmp_dir():
         logger.warning(f"Error removing temporary directory {tmp_dir}: {e}")


-@pytest.mark.skip(reason="Temporarily disabled")
 class TestONNXTRTExport:
     def test_export_onnx_trt_embedding(self):
         subprocess.run(

tests/functional_tests/utils/run_onnx_trt_embedding_export.py

Lines changed: 3 additions & 6 deletions
@@ -13,19 +13,16 @@
 # limitations under the License.

 import argparse
+import logging
 import os
 from functools import partial

 import tensorrt as trt
 import torch
-from nemo.collections.llm.gpt.model.hf_llama_embedding import (
-    get_llama_bidirectional_hf_model,
-)
-from nemo.collections.llm.modelopt.quantization.quantizer import get_calib_data_iter
-from nemo.utils import logging
 from tqdm import tqdm

-from nemo_export.onnx_llm_exporter import OnnxLLMExporter
+from nemo_export.model_adapters.embedding.embedding_adapter import get_llama_bidirectional_hf_model
+from nemo_export.onnx_llm_exporter import OnnxLLMExporter, get_calib_data_iter


 def get_args():

tests/functional_tests/utils/test_export_onnx.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,18 @@
1313
# limitations under the License.
1414

1515
import argparse
16+
import logging
1617
import os
1718
from functools import partial
1819

1920
import tensorrt as trt
2021
import torch
21-
from nemo.collections.llm.gpt.model.hf_llama_embedding import (
22-
get_llama_bidirectional_hf_model,
23-
)
24-
from nemo.collections.llm.modelopt.quantization.quantizer import get_calib_data_iter
25-
from nemo.utils import logging
2622
from tqdm import tqdm
2723

28-
from nemo_export.onnx_llm_exporter import OnnxLLMExporter
24+
from nemo_export.model_adapters.embedding.embedding_adapter import (
25+
get_llama_bidirectional_hf_model,
26+
)
27+
from nemo_export.onnx_llm_exporter import OnnxLLMExporter, get_calib_data_iter
2928

3029

3130
def get_args():

tests/unit_tests/export/test_onnx_llm_exporter.py

Lines changed: 0 additions & 8 deletions
@@ -99,14 +99,6 @@ def test__override_layernorm_precision_to_fp32_without_trt(self):
         ):
             OnnxLLMExporter()._override_layernorm_precision_to_fp32(network="")

-    def test_quantize_without_nemo(self):
-        with (
-            mock.patch.object(OnnxLLMExporter, "__init__", lambda self: None),
-            mock.patch("nemo_export.onnx_llm_exporter.HAVE_NEMO", False),
-            pytest.raises(UnavailableError),
-        ):
-            OnnxLLMExporter().quantize(quant_cfg="", forward_loop="")
-
     def test_quantize_without_modelopt(self):
         with (
             mock.patch.object(OnnxLLMExporter, "__init__", lambda self: None),

tutorials/onnx_tensorrt/embedding/llama_embedding.ipynb

Lines changed: 3 additions & 14 deletions
@@ -41,7 +41,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install onnxruntime"
+    "!pushd .. && uv pip install onnxruntime transformers==4.51.3 && popd"
    ]
   },
   {
@@ -154,9 +154,10 @@
     "    from functools import partial\n",
     "\n",
     "    import torch\n",
-    "    from nemo.collections.llm.modelopt.quantization.quantizer import get_calib_data_iter\n",
     "    from tqdm import tqdm\n",
     "\n",
+    "    from nemo_export.onnx_llm_exporter import get_calib_data_iter\n",
+    "\n",
     "    def forward_loop(model, data, tokenizer):\n",
     "        for inputs in tqdm(data):\n",
     "            batch = tokenizer(inputs, padding=True, truncation=True, return_tensors=\"pt\")\n",
@@ -250,18 +251,6 @@
    "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.3"
   }
  },
 "nbformat": 4,

tutorials/onnx_tensorrt/reranker/llama_reranker.ipynb

Lines changed: 1 addition & 13 deletions
@@ -41,7 +41,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install onnxruntime"
+    "!pushd .. && uv pip install onnxruntime transformers==4.51.3 && popd"
    ]
   },
   {
@@ -177,18 +177,6 @@
    "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.3"
   }
  },
 "nbformat": 4,
