# Source: tests/_test_utils/deploy_utils.py (file added by this patch)
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Helpers that load a checkpoint on vLLM, TensorRT-LLM or SGLang and generate a few tokens."""

import itertools
import subprocess

import pytest
import torch

# Common test prompts for all backends
COMMON_PROMPTS = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]


class ModelDeployer:
    """Deploy a single model on a single inference backend as a smoke test.

    ``run()`` skips the calling pytest test when the machine does not satisfy
    the GPU requirements or when the backend package cannot be imported.
    """

    def __init__(
        self,
        backend: str = "trtllm",
        model_id: str = "",
        tensor_parallel_size: int = 1,
        mini_sm: int = 89,
        attn_backend: str = "TRTLLM",
        base_model: str = "",
        eagle3_one_model: bool = True,
    ):
        """Initialize the ModelDeployer.

        Args:
            backend: The backend to use ('vllm', 'trtllm', or 'sglang').
            model_id: Model identifier or path handed to the backend. On the
                TensorRT-LLM path, when it contains "eagle" (case-insensitive)
                it is used as the speculative model directory instead.
            tensor_parallel_size: Tensor parallel size for distributed inference.
            mini_sm: Minimum compute capability, encoded as ``major * 10 + minor``
                (89 -> (8, 9), 100 -> (10, 0)).
            attn_backend: Attention backend passed to TensorRT-LLM for
                non-Eagle models.
            base_model: Model loaded by TensorRT-LLM for Eagle runs; required
                when ``model_id`` contains "eagle".
            eagle3_one_model: Forwarded to ``EagleDecodingConfig`` for Eagle runs.
        """
        self.backend = backend
        self.model_id = model_id
        self.tensor_parallel_size = tensor_parallel_size
        self.mini_sm = mini_sm
        self.attn_backend = attn_backend
        self.base_model = base_model
        self.eagle3_one_model = eagle3_one_model

    def run(self):
        """Run the deployment based on the specified backend.

        Skips the current test if CUDA is unavailable, the GPU's compute
        capability is below ``mini_sm``, or fewer than ``tensor_parallel_size``
        GPUs are visible.

        Raises:
            ValueError: If ``backend`` is not one of the supported names.
        """
        # pytest.skip() raises, so nothing after it in a branch can execute.
        if not torch.cuda.is_available() or torch.cuda.device_count() == 0:
            pytest.skip("CUDA is not available")
        if torch.cuda.get_device_capability() < divmod(self.mini_sm, 10):
            pytest.skip(f"Requires sm{self.mini_sm} or higher")
        if torch.cuda.device_count() < self.tensor_parallel_size:
            pytest.skip(f"Requires at least {self.tensor_parallel_size} GPUs")

        if self.backend == "vllm":
            self._deploy_vllm()
        elif self.backend == "trtllm":
            self._deploy_trtllm()
        elif self.backend == "sglang":
            self._deploy_sglang()
        else:
            raise ValueError(f"Unknown backend: {self.backend}")
        self._print_gpu_status()

    @staticmethod
    def _print_gpu_status():
        """Print ``nvidia-smi`` output; best effort, never fails the test."""
        try:
            gpu_status = subprocess.run(
                ["nvidia-smi"], capture_output=True, text=True, check=False
            )
        except OSError as exc:
            # e.g. the binary is not on PATH; this report is diagnostic only.
            report = f"nvidia-smi could not be executed: {exc}"
        else:
            report = gpu_status.stdout
        print("\n=== GPU Status After Deployment ===")
        print(report)
        print("===================================\n")

    def _quantization_method(self):
        """Return the quantization name passed to vLLM/SGLang for ``model_id``."""
        return "modelopt_fp4" if "FP4" in self.model_id else "modelopt"

    def _deploy_trtllm(self):
        """Deploy a model using TensorRT-LLM.

        Raises:
            ValueError: If an Eagle model is requested without ``base_model``.
        """
        try:
            from tensorrt_llm import LLM, SamplingParams
            from tensorrt_llm.llmapi import CudaGraphConfig, EagleDecodingConfig, KvCacheConfig
        except ImportError:
            pytest.skip("tensorrt_llm package not available")

        sampling_params = SamplingParams(max_tokens=32)
        kv_cache_config = KvCacheConfig(enable_block_reuse=True, free_gpu_memory_fraction=0.8)
        if "eagle" in self.model_id.lower():
            # In the Eagle setup model_id is the speculative model and
            # base_model is what the engine actually loads.
            if not self.base_model:
                raise ValueError(f"base_model is required to deploy Eagle model {self.model_id!r}")
            spec_config = EagleDecodingConfig(
                max_draft_len=3,
                speculative_model_dir=self.model_id,
                eagle3_one_model=self.eagle3_one_model,
            )
            cuda_graph = CudaGraphConfig(
                max_batch_size=1,
            )
            llm = LLM(
                model=self.base_model,
                tensor_parallel_size=self.tensor_parallel_size,
                enable_attention_dp=False,
                disable_overlap_scheduler=True,
                enable_autotuner=False,
                speculative_config=spec_config,
                cuda_graph_config=cuda_graph,
                kv_cache_config=kv_cache_config,
            )
        else:
            llm = LLM(
                model=self.model_id,
                tensor_parallel_size=self.tensor_parallel_size,
                enable_attention_dp=False,
                attn_backend=self.attn_backend,
                trust_remote_code=True,
                max_batch_size=8,
                kv_cache_config=kv_cache_config,
            )

        outputs = llm.generate(COMMON_PROMPTS, sampling_params)

        # Print outputs
        for output in outputs:
            prompt = output.prompt
            generated_text = output.outputs[0].text
            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    def _deploy_vllm(self):
        """Deploy a model using vLLM and check one non-empty completion per prompt."""
        try:
            from vllm import LLM, SamplingParams
        except ImportError:
            pytest.skip("vllm package not available")

        llm = LLM(
            model=self.model_id,
            quantization=self._quantization_method(),
            tensor_parallel_size=self.tensor_parallel_size,
            trust_remote_code=True,
        )
        sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
        outputs = llm.generate(COMMON_PROMPTS, sampling_params)

        # Assertions and output
        assert len(outputs) == len(COMMON_PROMPTS), (
            f"Expected {len(COMMON_PROMPTS)} outputs, got {len(outputs)}"
        )

        for i, output in enumerate(outputs):
            assert output.prompt == COMMON_PROMPTS[i], f"Prompt mismatch at index {i}"
            assert hasattr(output, "outputs"), f"Output {i} missing 'outputs' attribute"
            assert len(output.outputs) > 0, f"Output {i} has no generated text"
            assert hasattr(output.outputs[0], "text"), f"Output {i} missing 'text' attribute"
            assert isinstance(output.outputs[0].text, str), f"Output {i} text is not a string"
            assert len(output.outputs[0].text) > 0, f"Output {i} generated empty text"

            print(f"Model: {self.model_id}")
            print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")
            print("-" * 50)

    def _deploy_sglang(self):
        """Deploy a model using SGLang and print one generation."""
        try:
            import sglang as sgl
        except ImportError:
            pytest.skip("sglang package not available")

        llm = sgl.Engine(
            model_path=self.model_id,
            quantization=self._quantization_method(),
            tp_size=self.tensor_parallel_size,
            trust_remote_code=True,
        )
        try:
            print(llm.generate(["What's the age of the earth? "]))
        finally:
            # Shut the engine down even when generation fails.
            llm.shutdown()


class ModelDeployerList:
    """Expand keyword arguments into the cartesian product of ``ModelDeployer`` objects.

    Each keyword is a ``ModelDeployer`` constructor argument. A list or tuple
    value contributes one deployer per element; any other value is treated as
    a single choice. Every generated deployer gets a ``test_id`` attribute of
    the form ``"<model_id>_<backend>"``.
    """

    def __init__(self, **params):
        self.params = {
            key: list(value) if isinstance(value, (list, tuple)) else [value]
            for key, value in params.items()
        }

        # Pre-generate all deployers for pytest compatibility
        self._deployers = list(self._generate_deployers())

    def _generate_deployers(self):
        """Yield one deployer per combination of the stored parameter values."""
        for values in itertools.product(*self.params.values()):
            deployer = ModelDeployer(**dict(zip(self.params, values)))
            # Set test case ID in format "model_id_backend"
            deployer.test_id = f"{deployer.model_id}_{deployer.backend}"
            yield deployer

    def __iter__(self):
        return iter(self._deployers)

    def __len__(self):
        return len(self._deployers)

    def __getitem__(self, index):
        return self._deployers[index]

    def __repr__(self):
        # str() falls back to __repr__, so both render identically.
        return f"ModelDeployerList({len(self._deployers)} items)"


# Patch hunk for tests/examples/cnn_qat/test_resnet50.py (@@ -20,11 +20,10 @@) follows.
from _test_utils.torch_misc import minimum_gpu imagenet_path = os.getenv("IMAGENET_PATH") -if not imagenet_path or not os.path.isdir(imagenet_path): - pytest.skip( - "IMAGENET_PATH environment variable is not set or does not point to a valid directory", - allow_module_level=True, - ) +skip_no_imagenet = pytest.mark.skipif( + not imagenet_path or not os.path.isdir(imagenet_path), + reason="IMAGENET_PATH environment variable is not set or does not point to a valid directory", +) def _build_common_command(): @@ -59,6 +58,7 @@ def _run_qat_command(base_cmd, common_args, output_dir, example_dir="cnn_qat"): run_example_command(full_command, example_dir) +@skip_no_imagenet @minimum_gpu(1) def test_cnn_qat_single_gpu(tmp_path): """Test CNN QAT on single GPU.""" @@ -68,6 +68,7 @@ def test_cnn_qat_single_gpu(tmp_path): _run_qat_command(base_command, common_args, tmp_path) +@skip_no_imagenet @minimum_gpu(2) def test_cnn_qat_multi_gpu(tmp_path): """Test CNN QAT on multiple GPUs.""" diff --git a/tests/examples/gpt_oss/test_gpt_oss_qat.py b/tests/examples/gpt_oss/test_gpt_oss_qat.py new file mode 100644 index 000000000..cbdfcf563 --- /dev/null +++ b/tests/examples/gpt_oss/test_gpt_oss_qat.py @@ -0,0 +1,242 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import json
import os
import subprocess

import pytest
from _test_utils.examples.run_command import run_example_command
from _test_utils.torch_misc import minimum_gpu

# Minimal config.json content written into placeholder model directories.
_MOCK_MODEL_CONFIG = {
    "model_type": "gpt_oss",
    "hidden_size": 5120,
    "num_attention_heads": 40,
    "num_hidden_layers": 44,
    "vocab_size": 100000,
    "torch_dtype": "bfloat16",
}


def _ensure_model_dir(model_dir):
    """Create ``model_dir`` holding only a mock ``config.json`` if it does not exist yet."""
    if model_dir.exists():
        return
    model_dir.mkdir()
    with open(model_dir / "config.json", "w") as f:
        json.dump(_MOCK_MODEL_CONFIG, f)


class GPTOSS:
    """Drive the GPT-OSS QAT (Quantization-Aware Training) example pipeline for one model.

    The steps, in pipeline order (``<name>`` is the last component of ``model_path``):

    Step 1: gpt_oss_sft_training - Supervised Fine-Tuning (SFT)
        Input: model_path
        Output: <name>-sft

    Step 2: gpt_oss_qat_training - Quantization-Aware Training (QAT)
        Input: <name>-sft (from Step 1)
        Output: <name>-qat

    Step 3: gpt_oss_mxfp4_conversion - MXFP4 Weight-Only Conversion
        Input: <name>-qat (from Step 2)
        Output: <name>-qat-real-mxfp4

    Steps 2 and 3 create a placeholder input directory (config.json only)
    when the previous step's output is missing.
    """

    def __init__(self, model_path):
        self.model_path = model_path

    def _model_name(self):
        """Return the last path component of ``model_path``."""
        return self.model_path.split("/")[-1]

    def gpt_oss_sft_training(self, tmp_path):
        """Run supervised fine-tuning (SFT) - Step 1 - and check its output directory exists."""
        output_dir = tmp_path / f"{self._model_name()}-sft"

        # Command for SFT training (Step 1)
        cmd_parts = [
            "accelerate",
            "launch",
            "--config_file",
            "configs/zero3.yaml",
            "sft.py",
            "--config",
            "configs/sft_full.yaml",
            "--model_name_or_path",
            self.model_path,
            "--output_dir",
            str(output_dir),
        ]

        run_example_command(cmd_parts, "gpt-oss")

        # Verify SFT output directory exists
        assert output_dir.exists(), "SFT output directory should exist after training"

    def gpt_oss_qat_training(self, tmp_path):
        """Run quantization-aware training (QAT) with the MXFP4 configuration - Step 2."""
        model_name = self._model_name()
        # Uses the SFT output from step 1, or a mock directory when run standalone.
        sft_dir = tmp_path / f"{model_name}-sft"
        _ensure_model_dir(sft_dir)

        qat_output_dir = tmp_path / f"{model_name}-qat"

        # Command for QAT training (Step 2)
        cmd_parts = [
            "accelerate",
            "launch",
            "--config_file",
            "configs/zero3.yaml",
            "sft.py",
            "--config",
            "configs/sft_full.yaml",
            "--model_name_or_path",
            str(sft_dir),
            "--quant_cfg",
            "MXFP4_MLP_WEIGHT_ONLY_CFG",
            "--output_dir",
            str(qat_output_dir),
        ]

        run_example_command(cmd_parts, "gpt-oss")

        # Verify QAT output directory exists
        assert qat_output_dir.exists(), "QAT output directory should exist after training"

    def gpt_oss_mxfp4_conversion(self, tmp_path):
        """Convert the QAT checkpoint to MXFP4 weight-only format - Step 3."""
        model_name = self._model_name()
        # Uses the QAT output from step 2, or a mock directory when run standalone.
        qat_dir = tmp_path / f"{model_name}-qat"
        _ensure_model_dir(qat_dir)

        conversion_output_dir = tmp_path / f"{model_name}-qat-real-mxfp4"

        # Command for MXFP4 conversion (Step 3)
        cmd_parts = [
            "python",
            "convert_oai_mxfp4_weight_only.py",
            "--model_path",
            str(qat_dir),
            "--output_path",
            str(conversion_output_dir),
        ]

        run_example_command(cmd_parts, "gpt-oss")

        # Verify conversion output directory exists
        assert conversion_output_dir.exists(), "MXFP4 conversion output directory should exist"

    def deploy_gpt_oss_trtllm(self, tmp_path):
        """Generate a synthetic dataset and run ``trtllm-bench throughput`` on ``model_path``."""
        # NOTE(review): the benchmark targets self.model_path, not the converted
        # checkpoint produced by step 3 - confirm this is intended.
        # Prepare benchmark data
        tensorrt_llm_workspace = "/app/tensorrt_llm"
        script = os.path.join(tensorrt_llm_workspace, "benchmarks", "cpp", "prepare_dataset.py")
        benchmark_file = str(tmp_path / f"{self._model_name()}_synthetic_128_128.txt")

        if not os.path.exists(benchmark_file) or os.path.getsize(benchmark_file) == 0:
            print(f"Creating dataset file '{benchmark_file}'...")
            # Argument list instead of a shell string: nothing is re-parsed by a shell.
            dataset_cmd = [
                "python",
                script,
                "--stdout",
                f"--tokenizer={self.model_path}",
                "token-norm-dist",
                "--input-mean",
                "128",
                "--output-mean",
                "128",
                "--input-stdev",
                "0",
                "--output-stdev",
                "0",
                "--num-requests",
                "1400",
            ]
            with open(benchmark_file, "w") as fp:
                subprocess.run(dataset_cmd, check=True, stdout=fp)
        else:
            print(f"Dataset file '{benchmark_file}' already exists.")

        assert os.path.isfile(benchmark_file), f"Benchmark file '{benchmark_file}' should exist"

        cmd_parts = [
            "trtllm-bench",
            "--model",
            self.model_path,
            "throughput",
            "--backend",
            "pytorch",
            "--dataset",
            benchmark_file,
            "--kv_cache_free_gpu_mem_fraction",
            "0.9",
            "--report_json",
            str(tmp_path / "low_latency_throughput.json"),
        ]
        run_example_command(cmd_parts, "gpt-oss")


@pytest.mark.parametrize(
    "model_path",
    [
        pytest.param("openai/gpt-oss-20b", id="gpt-oss-20b", marks=minimum_gpu(2)),
        pytest.param("openai/gpt-oss-120b", id="gpt-oss-120b", marks=minimum_gpu(8)),
    ],
)
def test_gpt_oss_complete_pipeline(model_path, tmp_path):
    """Test the complete GPT-OSS optimization pipeline by executing all 3 steps in sequence."""
    # Create GPTOSS instance with model path
    gpt_oss = GPTOSS(model_path)
    model_name = model_path.split("/")[-1]

    # Execute Step 1: SFT Training
    gpt_oss.gpt_oss_sft_training(tmp_path)

    # Execute Step 2: QAT Training
    gpt_oss.gpt_oss_qat_training(tmp_path)

    # Execute Step 3: MXFP4 Conversion
    gpt_oss.gpt_oss_mxfp4_conversion(tmp_path)

    # Verify all output directories exist
    sft_dir = tmp_path / f"{model_name}-sft"
    qat_dir = tmp_path / f"{model_name}-qat"
    conversion_dir = tmp_path / f"{model_name}-qat-real-mxfp4"

    assert sft_dir.exists(), "SFT output directory should exist after Step 1"
    assert qat_dir.exists(), "QAT output directory should exist after Step 2"
    assert conversion_dir.exists(), "MXFP4 conversion output directory should exist after Step 3"

    print(f"Complete pipeline executed successfully for {model_path}!")
    print(f"Step 1 output: {sft_dir}")
    print(f"Step 2 output: {qat_dir}")
    print(f"Step 3 output: {conversion_dir}")

    # Deploy with TensorRT-LLM
    gpt_oss.deploy_gpt_oss_trtllm(tmp_path)


# Source: tests/examples/llm_ptq/test_deploy.py (file added by this patch)
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil

import pytest
from _test_utils.deploy_utils import ModelDeployerList

# Backend orderings used below; the order determines the order of generated test cases.
_TRTLLM_VLLM_SGLANG = ("trtllm", "vllm", "sglang")
_VLLM_TRTLLM_SGLANG = ("vllm", "trtllm", "sglang")
_TRTLLM_SGLANG = ("trtllm", "sglang")


def _cases(model_id, backends, **deployer_kwargs):
    """Build the deployers for ``model_id``, one per entry of ``backends``."""
    return ModelDeployerList(model_id=model_id, backend=backends, **deployer_kwargs)


def idfn(val):
    """Return the pytest id for a parameter: its ``test_id`` if present, else ``str(val)``."""
    return val.test_id if hasattr(val, "test_id") else str(val)


# clean up hf cache
HF_CACHE_PATH = os.getenv("HF_HUB_CACHE", os.path.expanduser("~/.cache/huggingface/hub"))


def clear_hf_cache():
    """Remove cache sub-directories whose name contains "nvidia"; failures only print a warning."""
    try:
        if not os.path.exists(HF_CACHE_PATH):
            print(f"HF cache path does not exist: {HF_CACHE_PATH}")
            return
        print(f"Clearing HF cache at: {HF_CACHE_PATH}")
        for entry in os.listdir(HF_CACHE_PATH):
            entry_path = os.path.join(HF_CACHE_PATH, entry)
            if "nvidia" not in entry or not os.path.isdir(entry_path):
                continue
            shutil.rmtree(entry_path, ignore_errors=True)
            print(f"✓ Removed: {entry}")
        print("✓ HF cache cleared successfully")
    except Exception as e:
        print(f"⚠ Warning: Failed to clear HF cache: {e}")


@pytest.fixture(autouse=True)
def cleanup_after_test():
    """Clear the Hugging Face cache once each test has finished."""
    yield  # the test body runs here
    clear_hf_cache()


@pytest.mark.parametrize(
    "command",
    [
        *_cases("nvidia/DeepSeek-R1-FP4", _VLLM_TRTLLM_SGLANG, tensor_parallel_size=8, mini_sm=100),
        *_cases(
            "nvidia/DeepSeek-R1-FP4-v2", _VLLM_TRTLLM_SGLANG, tensor_parallel_size=8, mini_sm=100
        ),
        *_cases(
            "nvidia/DeepSeek-R1-0528-FP4", _VLLM_TRTLLM_SGLANG, tensor_parallel_size=8, mini_sm=100
        ),
        *_cases(
            "nvidia/DeepSeek-R1-0528-FP4-v2",
            _VLLM_TRTLLM_SGLANG,
            tensor_parallel_size=8,
            mini_sm=100,
        ),
        *_cases(
            "nvidia/DeepSeek-V3-0324-FP4", _VLLM_TRTLLM_SGLANG, tensor_parallel_size=8, mini_sm=100
        ),
    ],
    ids=idfn,
)
def test_deepseek(command):
    command.run()


@pytest.mark.parametrize(
    "command",
    [
        # Llama-3.1
        *_cases(
            "nvidia/Llama-3.1-8B-Instruct-FP8",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=1,
            mini_sm=89,
        ),
        *_cases(
            "nvidia/Llama-3.1-8B-Instruct-FP4",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=1,
            mini_sm=100,
        ),
        # ModelDeployer(model_id="nvidia/Llama-3.1-8B-Medusa-FP8", backend="vllm"),
        *_cases("nvidia/Llama-3.1-70B-Instruct-FP8", _TRTLLM_VLLM_SGLANG, tensor_parallel_size=4),
        *_cases(
            "nvidia/Llama-3.3-70B-Instruct-FP4",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=4,
            mini_sm=100,
        ),
        *_cases(
            "nvidia/Llama-3.3-70B-Instruct-FP8",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=4,
            mini_sm=89,
        ),
        *_cases("nvidia/Llama-3.1-405B-Instruct-FP8", _TRTLLM_VLLM_SGLANG, tensor_parallel_size=8),
        *_cases(
            "nvidia/Llama-3.1-405B-Instruct-FP4",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=8,
            mini_sm=100,
        ),
        # Llama-4
        *_cases(
            "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=8,
        ),
        *_cases(
            "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP4",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=8,
            mini_sm=100,
        ),
        *_cases(
            "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=8,
            mini_sm=89,
        ),
        *_cases(
            "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=8,
            mini_sm=100,
        ),
    ],
    ids=idfn,
)
def test_llama(command):
    command.run()


@pytest.mark.parametrize(
    "command",
    [
        *_cases("nvidia/Qwen3-8B-FP4", _TRTLLM_VLLM_SGLANG, tensor_parallel_size=1, mini_sm=100),
        *_cases("nvidia/Qwen3-8B-FP8", _TRTLLM_VLLM_SGLANG, tensor_parallel_size=1, mini_sm=89),
        *_cases("nvidia/Qwen3-14B-FP4", _TRTLLM_VLLM_SGLANG, tensor_parallel_size=1, mini_sm=100),
        *_cases("nvidia/Qwen3-14B-FP8", _TRTLLM_VLLM_SGLANG, tensor_parallel_size=1, mini_sm=89),
        *_cases(
            "nvidia/Qwen3-235B-A22B-FP4", _TRTLLM_VLLM_SGLANG, tensor_parallel_size=2, mini_sm=100
        ),
        *_cases(
            "nvidia/Qwen3-235B-A22B-FP8", _TRTLLM_VLLM_SGLANG, tensor_parallel_size=4, mini_sm=89
        ),
        *_cases("nvidia/QwQ-32B-FP4", _TRTLLM_VLLM_SGLANG, mini_sm=100),
        *_cases("nvidia/Qwen3-32B-FP4", _TRTLLM_VLLM_SGLANG, tensor_parallel_size=4, mini_sm=100),
        *_cases(
            "nvidia/Qwen2.5-VL-7B-Instruct-FP4",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=4,
            mini_sm=100,
        ),
        # NOTE(review): most other FP8 checkpoints in this file use mini_sm=89 -
        # confirm that 100 is intended here.
        *_cases(
            "nvidia/Qwen2.5-VL-7B-Instruct-FP8",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=4,
            mini_sm=100,
        ),
        *_cases(
            "nvidia/Qwen3-30B-A3B-FP4", _TRTLLM_VLLM_SGLANG, tensor_parallel_size=4, mini_sm=100
        ),
    ],
    ids=idfn,
)
def test_qwen(command):
    command.run()


@pytest.mark.parametrize(
    "command",
    [
        *_cases("nvidia/Mixtral-8x7B-Instruct-v0.1-FP8", _TRTLLM_VLLM_SGLANG),
        *_cases("nvidia/Mixtral-8x7B-Instruct-v0.1-FP4", _TRTLLM_VLLM_SGLANG, mini_sm=100),
        # ModelDeployer(model_id="nvidia/Mixtral-8x7B-Instruct-v0.1-FP8", backend="sglang"), unsupported
        # NOTE(review): the line above marks sglang as unsupported, yet the FP8
        # entry still includes the sglang backend - confirm which is correct.
    ],
    ids=idfn,
)
def test_mixtral(command):
    command.run()


@pytest.mark.parametrize(
    "command",
    [  # TRTLLM bug: https://nvbugs/5451286
        *_cases(
            "nvidia/gemma-3-12b-it-FP4",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=1,
            mini_sm=100,
            attn_backend="FLASHINFER",
        ),
        *_cases(
            "nvidia/gemma-3-12b-it-FP8",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=1,
            mini_sm=89,
            attn_backend="FLASHINFER",
        ),
        *_cases(
            "nvidia/gemma-3-27b-it-FP4",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=1,
            mini_sm=100,
            attn_backend="FLASHINFER",
        ),
        *_cases(
            "nvidia/gemma-3-27b-it-FP8",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=1,
            mini_sm=89,
            attn_backend="FLASHINFER",
        ),
    ],
    ids=idfn,
)
def test_gemma(command):
    command.run()


# test phi
@pytest.mark.parametrize(
    "command",
    [
        *_cases(
            "nvidia/Phi-4-multimodal-instruct-FP4",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=1,
            mini_sm=100,
        ),
        *_cases(
            "nvidia/Phi-4-multimodal-instruct-FP8",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=1,
            mini_sm=89,
        ),
        *_cases(
            "nvidia/Phi-4-reasoning-plus-FP4",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=1,
            mini_sm=100,
        ),
        *_cases(
            "nvidia/Phi-4-reasoning-plus-FP8",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=1,
            mini_sm=89,
        ),
    ],
    ids=idfn,
)
def test_phi(command):
    command.run()


@pytest.mark.parametrize(
    "command",
    [
        *_cases(
            "nvidia/Kimi-K2-Instruct-FP4", _TRTLLM_VLLM_SGLANG, tensor_parallel_size=8, mini_sm=100
        ),
    ],
    ids=idfn,
)
def test_kimi(command):
    command.run()


@pytest.mark.parametrize(
    "command",
    [
        *_cases(
            "nvidia/Llama-3_3-Nemotron-Super-49B-v1-FP8",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=1,
            mini_sm=89,
        ),
        *_cases(
            "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5-FP8",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=1,
            mini_sm=89,
        ),
        *_cases(
            "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8",
            _TRTLLM_VLLM_SGLANG,
            tensor_parallel_size=4,
            mini_sm=89,
        ),
    ],
    ids=idfn,
)
def test_llama_nemotron(command):
    command.run()


@pytest.mark.parametrize(
    "command",
    [
        *_cases(
            "nvidia/Llama-3.1-8B-Medusa-FP8", _TRTLLM_SGLANG, tensor_parallel_size=1, mini_sm=89
        ),
        # NOTE(review): the two FP8 entries below use mini_sm=100 while the 8B
        # FP8 entry uses 89 - confirm.
        *_cases(
            "nvidia/Llama-3.1-70B-Medusa-FP8", _TRTLLM_SGLANG, tensor_parallel_size=2, mini_sm=100
        ),
        *_cases(
            "nvidia/Llama-3.1-405B-Medusa-FP8", _TRTLLM_SGLANG, tensor_parallel_size=8, mini_sm=100
        ),
    ],
    ids=idfn,
)
@pytest.mark.skip(reason="Medusa is not supported yet")
def test_medusa(command):
    command.run()


@pytest.mark.parametrize(
    "command",
    [
        # model_id names the Eagle3 checkpoint; base_model names the model it is paired with.
        *_cases(
            "nvidia/Llama-4-Maverick-17B-128E-Eagle3",
            _TRTLLM_SGLANG,
            base_model="nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
            tensor_parallel_size=8,
            mini_sm=89,
        ),
        *_cases(
            "nvidia/Qwen3-235B-A22B-Eagle3",
            _TRTLLM_SGLANG,
            base_model="Qwen/Qwen3-235B-A22B",
            tensor_parallel_size=8,
            mini_sm=89,
        ),
        *_cases(
            "nvidia/Qwen3-235B-A22B-Thinking-2507-Eagle3",
            _TRTLLM_SGLANG,
            base_model="Qwen/Qwen3-235B-A22B-Thinking-2507",
            tensor_parallel_size=8,
            mini_sm=89,
            eagle3_one_model=False,
        ),
        *_cases(
            "nvidia/Qwen3-30B-A3B-Eagle3",
            _TRTLLM_SGLANG,
            base_model="Qwen/Qwen3-30B-A3B",
            tensor_parallel_size=1,
            mini_sm=89,
        ),
        *_cases(
            "nvidia/Qwen3-30B-A3B-Thinking-2507-Eagle3",
            _TRTLLM_SGLANG,
            base_model="Qwen/Qwen3-30B-A3B-Thinking-2507",
            tensor_parallel_size=1,
            mini_sm=89,
            eagle3_one_model=False,
        ),
        *_cases(
            "nvidia/gpt-oss-120b-Eagle3",
            _TRTLLM_SGLANG,
            base_model="openai/gpt-oss-120b",
            tensor_parallel_size=8,
            mini_sm=89,
        ),
    ],
    ids=idfn,
)
def test_eagle(command):
    command.run()