
Commit d311eda

Add functional test cases for published checkpoints on HF (#455)
## What does this PR do?

**Type of change:** new tests

**Overview:** Add vLLM/SGLang/TRT LLM deployment tests for published checkpoints on HF. Add e2e test case for gpt-oss.

## Usage

```python
# pytest test_deploy.py -k "vllm"
```

## Testing

## Before your PR is "*Ready for review*"

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes
- **Did you write any new necessary tests?**: No
- **Did you add or update any necessary documentation?**: No
- **Did you update [Changelog](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CHANGELOG.rst)?**: No

## Additional Information

## Summary by CodeRabbit

* **Tests**
  * Added ModelDeployer and ModelDeployerList test utilities with COMMON_PROMPTS to run multi-backend deployment and inference scenarios.
  * Introduced an end-to-end GPT-OSS QAT pipeline test (SFT → QAT → MXFP4 conversion) with optional deployment/benchmarking.
  * Added a broad parametric LLM deployment test suite covering many models/backends with readable test IDs.
  * Added automatic Hugging Face cache cleanup after tests to keep runs isolated.

---------

Signed-off-by: neoyy-mino <[email protected]>
Signed-off-by: noeyy-mino <[email protected]>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
1 parent 72e2167 commit d311eda

File tree

6 files changed (+986, -11 lines)

pyproject.toml

Lines changed: 4 additions & 1 deletion
@@ -118,7 +118,10 @@ disable_error_code = ["attr-defined"]
 # print execution time for 20 slowest tests and generate coverage reports
 addopts = "-ra --cov-report=term-missing --cov-report=html --cov-report=xml:coverage.xml --cov-config=pyproject.toml --durations=20 --strict-markers"
 pythonpath = ["tests/"]
-markers = ["manual: Only run when --run-manual is given"]
+markers = [
+    "manual: Only run when --run-manual is given",
+    "release: Regression tests that should be run before every release"
+]


 [tool.coverage.run]
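As a minimal sketch of how the newly registered marker is meant to be consumed (the test name below is hypothetical; the `release` marker, `--strict-markers`, and the `--run-release` flag come from this PR, see `tests/conftest.py` below):

```python
import pytest


@pytest.mark.release  # registered in pyproject.toml, so --strict-markers accepts it
def test_published_checkpoint_smoke():  # hypothetical test name
    """Collected on every run, but skipped unless pytest is invoked with --run-release."""
    assert True


# Invocation:
#   pytest --run-release -m release
```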

tests/_test_utils/deploy_utils.py

Lines changed: 227 additions & 0 deletions
@@ -0,0 +1,227 @@
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
import subprocess

import pytest
import torch

# Common test prompts for all backends
COMMON_PROMPTS = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]


class ModelDeployer:
    def __init__(
        self,
        backend: str = "trtllm",
        model_id: str = "",
        tensor_parallel_size: int = 1,
        mini_sm: int = 89,
        attn_backend: str = "TRTLLM",
        base_model: str = "",
        eagle3_one_model: bool = True,
    ):
        """
        Initialize the ModelDeployer.

        Args:
            backend: The backend to use ('vllm', 'trtllm', or 'sglang')
            model_id: Path to the model
            tensor_parallel_size: Tensor parallel size for distributed inference
            mini_sm: Minimum SM (Streaming Multiprocessor) requirement for the model
        """
        self.backend = backend
        self.model_id = model_id
        self.tensor_parallel_size = tensor_parallel_size
        self.mini_sm = mini_sm
        self.attn_backend = attn_backend
        self.base_model = base_model
        self.eagle3_one_model = eagle3_one_model

    def run(self):
        """Run the deployment based on the specified backend."""
        if not torch.cuda.is_available() or torch.cuda.device_count() == 0:
            pytest.skip("CUDA is not available")
            return
        if torch.cuda.get_device_capability() < (
            self.mini_sm // 10,
            self.mini_sm % 10,
        ):
            pytest.skip(reason=f"Requires sm{self.mini_sm} or higher")
            return

        if torch.cuda.device_count() < self.tensor_parallel_size:
            pytest.skip(reason=f"Requires at least {self.tensor_parallel_size} GPUs")
            return
        if self.backend == "vllm":
            self._deploy_vllm()
        elif self.backend == "trtllm":
            self._deploy_trtllm()
        elif self.backend == "sglang":
            self._deploy_sglang()
        else:
            raise ValueError(f"Unknown backend: {self.backend}")
        # check gpu status
        gpu_status = subprocess.run(
            "nvidia-smi || true", shell=True, capture_output=True, text=True, check=True
        )
        print("\n=== GPU Status Before Test ===")
        print(gpu_status.stdout)
        print("=============================\n")

    def _deploy_trtllm(self):
        """Deploy a model using TensorRT-LLM."""
        try:
            from tensorrt_llm import LLM, SamplingParams
            from tensorrt_llm.llmapi import CudaGraphConfig, EagleDecodingConfig, KvCacheConfig
        except ImportError:
            pytest.skip("tensorrt_llm package not available")

        sampling_params = SamplingParams(max_tokens=32)
        spec_config = None
        llm = None
        kv_cache_config = KvCacheConfig(enable_block_reuse=True, free_gpu_memory_fraction=0.8)
        if "eagle" in self.model_id.lower():
            spec_config = EagleDecodingConfig(
                max_draft_len=3,
                speculative_model_dir=self.model_id,
                eagle3_one_model=self.eagle3_one_model,
            )
            cuda_graph = CudaGraphConfig(
                max_batch_size=1,
            )
            llm = LLM(
                model=self.base_model,
                tensor_parallel_size=self.tensor_parallel_size,
                enable_attention_dp=False,
                disable_overlap_scheduler=True,
                enable_autotuner=False,
                speculative_config=spec_config,
                cuda_graph_config=cuda_graph,
                kv_cache_config=kv_cache_config,
            )
        else:
            llm = LLM(
                model=self.model_id,
                tensor_parallel_size=self.tensor_parallel_size,
                enable_attention_dp=False,
                attn_backend=self.attn_backend,
                trust_remote_code=True,
                max_batch_size=8,
                kv_cache_config=kv_cache_config,
            )

        outputs = llm.generate(COMMON_PROMPTS, sampling_params)

        # Print outputs
        for output in outputs:
            prompt = output.prompt
            generated_text = output.outputs[0].text
            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    def _deploy_vllm(self):
        """Deploy a model using vLLM."""
        try:
            from vllm import LLM, SamplingParams
        except ImportError:
            pytest.skip("vllm package not available")

        quantization_method = "modelopt"
        if "FP4" in self.model_id:
            quantization_method = "modelopt_fp4"
        llm = LLM(
            model=self.model_id,
            quantization=quantization_method,
            tensor_parallel_size=self.tensor_parallel_size,
            trust_remote_code=True,
        )
        sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
        outputs = llm.generate(COMMON_PROMPTS, sampling_params)

        # Assertions and output
        assert len(outputs) == len(COMMON_PROMPTS), (
            f"Expected {len(COMMON_PROMPTS)} outputs, got {len(outputs)}"
        )

        for i, output in enumerate(outputs):
            assert output.prompt == COMMON_PROMPTS[i], f"Prompt mismatch at index {i}"
            assert hasattr(output, "outputs"), f"Output {i} missing 'outputs' attribute"
            assert len(output.outputs) > 0, f"Output {i} has no generated text"
            assert hasattr(output.outputs[0], "text"), f"Output {i} missing 'text' attribute"
            assert isinstance(output.outputs[0].text, str), f"Output {i} text is not a string"
            assert len(output.outputs[0].text) > 0, f"Output {i} generated empty text"

            print(f"Model: {self.model_id}")
            print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")
            print("-" * 50)

    def _deploy_sglang(self):
        """Deploy a model using SGLang."""
        try:
            import sglang as sgl
        except ImportError:
            pytest.skip("sglang package not available")
        quantization_method = "modelopt"
        if "FP4" in self.model_id:
            quantization_method = "modelopt_fp4"
        llm = sgl.Engine(
            model_path=self.model_id,
            quantization=quantization_method,
            tp_size=self.tensor_parallel_size,
            trust_remote_code=True,
        )
        print(llm.generate(["What's the age of the earth? "]))
        llm.shutdown()


class ModelDeployerList:
    def __init__(self, **params):
        self.params = {}
        for key, value in params.items():
            if isinstance(value, (list, tuple)):
                self.params[key] = list(value)
            else:
                self.params[key] = [value]

        # Pre-generate all deployers for pytest compatibility
        self._deployers = list(self._generate_deployers())

    def _generate_deployers(self):
        for values in itertools.product(*self.params.values()):
            deployer = ModelDeployer(**dict(zip(self.params.keys(), values)))
            # Set test case ID in format "model_id_backend"
            deployer.test_id = f"{deployer.model_id}_{deployer.backend}"
            yield deployer

    def __iter__(self):
        return iter(self._deployers)

    def __len__(self):
        return len(self._deployers)

    def __getitem__(self, index):
        return self._deployers[index]

    def __str__(self):
        return f"ModelDeployerList({len(self._deployers)} items)"

    def __repr__(self):
        return f"ModelDeployerList({len(self._deployers)} items)"
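A minimal sketch of how these utilities might back the parametric deployment suite; the test function and the checkpoint ID below are hypothetical, while `ModelDeployerList`, the `test_id` attribute, and the `release` marker come from this PR:

```python
import pytest

from _test_utils.deploy_utils import ModelDeployerList

# Expands to one ModelDeployer per (backend, model_id) combination via itertools.product.
DEPLOYERS = ModelDeployerList(
    backend=["vllm", "sglang", "trtllm"],
    model_id="nvidia/Llama-3.1-8B-Instruct-FP8",  # hypothetical HF checkpoint ID
)


@pytest.mark.release
@pytest.mark.parametrize("deployer", DEPLOYERS, ids=lambda d: d.test_id)
def test_llm_deployment(deployer):
    # run() skips itself when CUDA, compute capability, or GPU count is insufficient.
    deployer.run()
```

The `ids=lambda d: d.test_id` callback is what produces the readable `model_id_backend` test IDs mentioned in the summary.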

tests/conftest.py

Lines changed: 18 additions & 5 deletions
@@ -30,14 +30,27 @@ def pytest_addoption(parser):
         default=False,
         help="Run manual tests",
     )
+    parser.addoption(
+        "--run-release",
+        action="store_true",
+        default=False,
+        help="Run release tests",
+    )


 def pytest_collection_modifyitems(config, items):
-    if not config.getoption("--run-manual"):
-        skipper = pytest.mark.skip(reason="Only run when --run-manual is given")
-        for item in items:
-            if "manual" in item.keywords:
-                item.add_marker(skipper)
+    """Skip tests with specific markers unless their corresponding flag is provided."""
+    skip_marks = [
+        ("manual", "--run-manual"),
+        ("release", "--run-release"),
+    ]
+
+    for mark_name, option_name in skip_marks:
+        if not config.getoption(option_name):
+            skipper = pytest.mark.skip(reason=f"Only run when {option_name} is given")
+            for item in items:
+                if mark_name in item.keywords:
+                    item.add_marker(skipper)


 @pytest.fixture
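Illustrative invocations of the gate above, using the flags defined in this diff (command lines shown as comments, following the Usage block's convention):

```python
# pytest                           -> "manual"- and "release"-marked tests are collected but skipped
# pytest --run-manual              -> "manual"-marked tests run
# pytest --run-release -m release  -> only "release"-marked tests are selected, and they run
```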

tests/examples/cnn_qat/test_resnet50.py

Lines changed: 6 additions & 5 deletions
@@ -20,11 +20,10 @@
 from _test_utils.torch.misc import minimum_gpu

 imagenet_path = os.getenv("IMAGENET_PATH")
-if not imagenet_path or not os.path.isdir(imagenet_path):
-    pytest.skip(
-        "IMAGENET_PATH environment variable is not set or does not point to a valid directory",
-        allow_module_level=True,
-    )
+skip_no_imagenet = pytest.mark.skipif(
+    not imagenet_path or not os.path.isdir(imagenet_path),
+    reason="IMAGENET_PATH environment variable is not set or does not point to a valid directory",
+)


 def _build_common_command():

@@ -59,6 +58,7 @@ def _run_qat_command(base_cmd, common_args, output_dir, example_dir="cnn_qat"):
     run_example_command(full_command, example_dir)


+@skip_no_imagenet
 @minimum_gpu(1)
 def test_cnn_qat_single_gpu(tmp_path):
     """Test CNN QAT on single GPU."""

@@ -68,6 +68,7 @@ def test_cnn_qat_single_gpu(tmp_path):
     _run_qat_command(base_command, common_args, tmp_path)


+@skip_no_imagenet
 @minimum_gpu(2)
 def test_cnn_qat_multi_gpu(tmp_path):
     """Test CNN QAT on multiple GPUs."""