Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 34 additions & 34 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -206,40 +206,40 @@ jobs:
ngc-api-user: ${{ secrets.NGC_API_USER }}
ngc-api-key: ${{ secrets.NGC_API_KEY }}

# cicd-e2e-tests-trt-onnx:
# needs: [cicd-unit-tests-trtllm, pre-flight]
# runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
# name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
# environment: nemo-ci
# if: |
# (
# success()
# || (
# needs.cicd-wait-in-queue.result == 'skipped'
# && needs.pre-flight.outputs.is_ci_workload == 'true'
# )
# )
# && !cancelled()
# steps:
# - name: Checkout
# uses: actions/checkout@v4
# - name: main
# uses: ./.github/actions/test-template
# with:
# script: L2_ONNX_TRT
# is_optional: ${{ matrix.is_optional || false }}
# azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
# azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
# azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
# has-azure-credentials: true
# is_unit_test: "false"
# timeout: 60
# PAT: ${{ secrets.PAT }}
# inference-framework: trt-onnx
# test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
# runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
# ngc-api-user: ${{ secrets.NGC_API_USER }}
# ngc-api-key: ${{ secrets.NGC_API_KEY }}
cicd-e2e-tests-trt-onnx:
needs: [cicd-unit-tests-trtllm, pre-flight]
runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
environment: nemo-ci
if: |
(
success()
|| (
needs.cicd-wait-in-queue.result == 'skipped'
&& needs.pre-flight.outputs.is_ci_workload == 'true'
)
)
&& !cancelled()
steps:
- name: Checkout
uses: actions/checkout@v4
- name: main
uses: ./.github/actions/test-template
with:
script: L2_ONNX_TRT
is_optional: ${{ matrix.is_optional || false }}
azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
has-azure-credentials: true
is_unit_test: "false"
timeout: 60
PAT: ${{ secrets.PAT }}
inference-framework: trt-onnx
test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
ngc-api-user: ${{ secrets.NGC_API_USER }}
ngc-api-key: ${{ secrets.NGC_API_KEY }}

cicd-e2e-tests-vllm:
needs: [cicd-unit-tests-vllm, pre-flight]
Expand Down
78 changes: 60 additions & 18 deletions nemo_export/onnx_llm_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import numpy as np
import torch
import wrapt
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer

from nemo_deploy import ITritonDeployable
Expand All @@ -30,7 +31,6 @@
)
from nemo_export_deploy_common.import_utils import (
MISSING_MODELOPT_MSG,
MISSING_NEMO_MSG,
MISSING_TENSORRT_MSG,
UnavailableError,
)
Expand Down Expand Up @@ -63,17 +63,6 @@
trt = MagicMock()
HAVE_TENSORRT = False

try:
from nemo.collections.llm.modelopt.quantization.quant_cfg_choices import (
get_quant_cfg_choices,
)

QUANT_CFG_CHOICES = get_quant_cfg_choices()

HAVE_NEMO = True
except (ImportError, ModuleNotFoundError):
HAVE_NEMO = False


@wrapt.decorator
def noop_decorator(func):
Expand Down Expand Up @@ -254,6 +243,7 @@ def _export_to_onnx(
dynamic_axes={**dynamic_axes_input, **dynamic_axes_output},
verbose=verbose,
opset_version=opset,
dynamo=False,
)
logging.info(f"Successfully exported PyTorch model to ONNX model {self.onnx_model_path}")

Expand Down Expand Up @@ -494,17 +484,15 @@ def quantize(
forward_loop (callable): A function that accepts the model as a single parameter
and runs sample data through it. This is used for calibration during quantization.
"""
if not HAVE_NEMO:
raise UnavailableError(MISSING_NEMO_MSG)

if not HAVE_MODELOPT:
raise UnavailableError(MISSING_MODELOPT_MSG)

quant_cfg_choices = get_quant_cfg_choices()
if isinstance(quant_cfg, str):
assert quant_cfg in QUANT_CFG_CHOICES, (
f"Quantization config {quant_cfg} is not supported. Supported configs: {list(QUANT_CFG_CHOICES)}"
assert quant_cfg in quant_cfg_choices, (
f"Quantization config {quant_cfg} is not supported. Supported configs: {list(quant_cfg_choices)}"
)
quant_cfg = QUANT_CFG_CHOICES[quant_cfg]
quant_cfg = quant_cfg_choices[quant_cfg]

logging.info("Starting quantization...")
mtq.quantize(self.model, quant_cfg, forward_loop=forward_loop)
Expand Down Expand Up @@ -539,3 +527,57 @@ def get_triton_output(self):
def triton_infer_fn(self, **inputs: np.ndarray):
    """PyTriton inference function (placeholder — not yet implemented)."""
    # Deliberate stub: raising keeps the ITritonDeployable interface explicit
    # until the PyTriton inference path lands.
    message = "This function will be implemented later."
    raise NotImplementedError(message)


def get_calib_data_iter(
    data: str = "cnn_dailymail", batch_size: int = 64, calib_size: int = 512, max_sequence_length: int = 512
):
    """Yield batches of text samples for calibration during quantization.

    Args:
        data: Either ``"wikitext"``, ``"cnn_dailymail"``, or a path to a local
            JSON dataset exposing a ``"text"`` column.
        batch_size: Number of samples per yielded batch.
        calib_size: Upper bound on the total number of calibration samples.
        max_sequence_length: Maximum number of characters kept per sample.

    Yields:
        Lists of strings, each truncated to ``max_sequence_length`` characters.
    """
    if data == "wikitext":
        dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")
        text_column = "text"
    elif data == "cnn_dailymail":
        dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train")
        text_column = "article"
    else:
        # Anything else is treated as a path to a local JSON dataset that
        # provides a "text" column.
        dataset = load_dataset("json", data_files=data, split="train")
        text_column = "text"

    # Clamp the sample budget: no more than the dataset holds, and never
    # fewer than one full batch.
    effective_size = max(min(len(dataset), calib_size), batch_size)
    num_batches = effective_size // batch_size
    for start in range(0, num_batches * batch_size, batch_size):
        samples = dataset[start : start + batch_size][text_column]
        yield [text[:max_sequence_length] for text in samples]


def get_quant_cfg_choices() -> Dict[str, Dict[str, Any]]:
    """
    Retrieve a dictionary of modelopt quantization configuration choices.

    Probes the modelopt.torch.quantization (mtq) module for a fixed set of
    known configuration attributes and keeps only those the installed modelopt
    version actually provides, so the mapping stays valid across library
    versions that ship different configuration sets.

    Returns:
        dict: A dictionary where keys are short names (e.g., "fp8") and values
        are the corresponding modelopt quantization configuration objects.
    """
    # (short alias, mtq attribute name) pairs; attributes may be absent
    # depending on the installed modelopt version.
    candidates = (
        ("int8", "INT8_DEFAULT_CFG"),
        ("int8_sq", "INT8_SMOOTHQUANT_CFG"),
        ("fp8", "FP8_DEFAULT_CFG"),
        ("block_fp8", "FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG"),
        ("int4_awq", "INT4_AWQ_CFG"),
        ("w4a8_awq", "W4A8_AWQ_BETA_CFG"),
        ("int4", "INT4_BLOCKWISE_WEIGHT_ONLY_CFG"),
        ("nvfp4", "NVFP4_DEFAULT_CFG"),
    )
    return {
        alias: cfg
        for alias, attr in candidates
        # Truthiness filter matches the original walrus check: missing
        # attributes resolve to None and are dropped.
        if (cfg := getattr(mtq, attr, None))
    }
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ dependencies = [
inframework = []
vllm = ["vllm~=0.11.2", "pandas", "timm"]
trtllm = ["tensorrt-llm~=1.1.0", "cuda-python~=13.0.0"]
trt-onnx = ["tensorrt==10.14.1.48.post1", "onnx==1.18.0", "transformers==4.51.3"]
trt-onnx = ["tensorrt==10.14.1.48.post1", "onnx==1.18.0", "onnxscript>=0.6.0", "transformers==4.51.3"]

[dependency-groups]
# This is a default group so that we install these even with bare `uv sync`
Expand Down
3 changes: 3 additions & 0 deletions tests/functional_tests/L2_ONNX_TRT.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
#!/bin/bash
set -xeuo pipefail # Exit immediately if a command exits with a non-zero status

# onnx export only works with an older transformers version
pushd .. && uv pip install transformers==4.51.3 && popd

export CUDA_VISIBLE_DEVICES="0,1"

coverage run \
Expand Down
1 change: 0 additions & 1 deletion tests/functional_tests/tests_onnx_trt/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ def tmp_dir():
logger.warning(f"Error removing temporary directory {tmp_dir}: {e}")


@pytest.mark.skip(reason="Temporarily disabled")
class TestONNXTRTExport:
def test_export_onnx_trt_embedding(self):
subprocess.run(
Expand Down
9 changes: 3 additions & 6 deletions tests/functional_tests/utils/run_onnx_trt_embedding_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,16 @@
# limitations under the License.

import argparse
import logging
import os
from functools import partial

import tensorrt as trt
import torch
from nemo.collections.llm.gpt.model.hf_llama_embedding import (
get_llama_bidirectional_hf_model,
)
from nemo.collections.llm.modelopt.quantization.quantizer import get_calib_data_iter
from nemo.utils import logging
from tqdm import tqdm

from nemo_export.onnx_llm_exporter import OnnxLLMExporter
from nemo_export.model_adapters.embedding.embedding_adapter import get_llama_bidirectional_hf_model
from nemo_export.onnx_llm_exporter import OnnxLLMExporter, get_calib_data_iter


def get_args():
Expand Down
11 changes: 5 additions & 6 deletions tests/functional_tests/utils/test_export_onnx.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,18 @@
# limitations under the License.

import argparse
import logging
import os
from functools import partial

import tensorrt as trt
import torch
from nemo.collections.llm.gpt.model.hf_llama_embedding import (
get_llama_bidirectional_hf_model,
)
from nemo.collections.llm.modelopt.quantization.quantizer import get_calib_data_iter
from nemo.utils import logging
from tqdm import tqdm

from nemo_export.onnx_llm_exporter import OnnxLLMExporter
from nemo_export.model_adapters.embedding.embedding_adapter import (
get_llama_bidirectional_hf_model,
)
from nemo_export.onnx_llm_exporter import OnnxLLMExporter, get_calib_data_iter


def get_args():
Expand Down
8 changes: 0 additions & 8 deletions tests/unit_tests/export/test_onnx_llm_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,14 +99,6 @@ def test__override_layernorm_precision_to_fp32_without_trt(self):
):
OnnxLLMExporter()._override_layernorm_precision_to_fp32(network="")

def test_quantize_without_nemo(self):
with (
mock.patch.object(OnnxLLMExporter, "__init__", lambda self: None),
mock.patch("nemo_export.onnx_llm_exporter.HAVE_NEMO", False),
pytest.raises(UnavailableError),
):
OnnxLLMExporter().quantize(quant_cfg="", forward_loop="")

def test_quantize_without_modelopt(self):
with (
mock.patch.object(OnnxLLMExporter, "__init__", lambda self: None),
Expand Down
17 changes: 3 additions & 14 deletions tutorials/onnx_tensorrt/embedding/llama_embedding.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
"metadata": {},
"outputs": [],
"source": [
"!pip install onnxruntime"
"!pushd .. && uv pip install onnxruntime transformers==4.51.3 && popd"
]
},
{
Expand Down Expand Up @@ -154,9 +154,10 @@
" from functools import partial\n",
"\n",
" import torch\n",
" from nemo.collections.llm.modelopt.quantization.quantizer import get_calib_data_iter\n",
" from tqdm import tqdm\n",
"\n",
" from nemo_export.onnx_llm_exporter import get_calib_data_iter\n",
"\n",
" def forward_loop(model, data, tokenizer):\n",
" for inputs in tqdm(data):\n",
" batch = tokenizer(inputs, padding=True, truncation=True, return_tensors=\"pt\")\n",
Expand Down Expand Up @@ -250,18 +251,6 @@
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
Expand Down
14 changes: 1 addition & 13 deletions tutorials/onnx_tensorrt/reranker/llama_reranker.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
"metadata": {},
"outputs": [],
"source": [
"!pip install onnxruntime"
"!pushd .. && uv pip install onnxruntime transformers==4.51.3 && popd"
]
},
{
Expand Down Expand Up @@ -177,18 +177,6 @@
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
Expand Down
Loading