diff --git a/tests/_test_utils/deploy_utils.py b/tests/_test_utils/deploy_utils.py
new file mode 100644
index 000000000..85a97b616
--- /dev/null
+++ b/tests/_test_utils/deploy_utils.py
@@ -0,0 +1,227 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import itertools
+import subprocess
+
+import pytest
+import torch
+
+# Prompts shared by the vLLM and TensorRT-LLM deployments (SGLang uses its own prompt)
+COMMON_PROMPTS = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+
+class ModelDeployer:
+    """Run a generation smoke test for one checkpoint on one inference backend."""
+
+    def __init__(
+        self,
+        backend: str = "trtllm",
+        model_id: str = "",
+        tensor_parallel_size: int = 1,
+        mini_sm: int = 89,
+        attn_backend: str = "TRTLLM",
+        base_model: str = "",
+        eagle3_one_model: bool = True,
+    ):
+        """Initialize the ModelDeployer.
+
+        Args:
+            backend: The backend to use ('vllm', 'trtllm', or 'sglang')
+            model_id: Path to the model
+            tensor_parallel_size: Tensor parallel size for distributed inference
+            mini_sm: Minimum compute capability as an integer, e.g. 89 for sm89
+            attn_backend: Attention backend handed to TensorRT-LLM for non-Eagle models
+            base_model: Target model loaded by TensorRT-LLM when model_id is an Eagle draft
+            eagle3_one_model: Forwarded to TensorRT-LLM's EagleDecodingConfig
+        """
+        self.backend = backend
+        self.model_id = model_id
+        self.tensor_parallel_size = tensor_parallel_size
+        self.mini_sm = mini_sm
+        self.attn_backend = attn_backend
+        self.base_model = base_model
+        self.eagle3_one_model = eagle3_one_model
+
+    def run(self):
+        """Run the deployment based on the specified backend.
+
+        Skips the test when CUDA, the required SM version or enough GPUs are missing, and
+        raises ValueError for an unknown backend.
+        """
+        # pytest.skip() raises, so no explicit return is needed after it.
+        if not torch.cuda.is_available() or torch.cuda.device_count() == 0:
+            pytest.skip("CUDA is not available")
+        required_capability = (self.mini_sm // 10, self.mini_sm % 10)
+        if torch.cuda.get_device_capability() < required_capability:
+            pytest.skip(reason=f"Requires sm{self.mini_sm} or higher")
+        if torch.cuda.device_count() < self.tensor_parallel_size:
+            pytest.skip(reason=f"Requires at least {self.tensor_parallel_size} GPUs")
+        if self.backend == "vllm":
+            self._deploy_vllm()
+        elif self.backend == "trtllm":
+            self._deploy_trtllm()
+        elif self.backend == "sglang":
+            self._deploy_sglang()
+        else:
+            raise ValueError(f"Unknown backend: {self.backend}")
+        # Snapshot GPU state after the deployment; "|| true" tolerates a missing nvidia-smi.
+        gpu_status = subprocess.run(
+            "nvidia-smi || true", shell=True, capture_output=True, text=True, check=True
+        )
+        print("\n=== GPU Status Before Test ===")
+        print(gpu_status.stdout)
+        print("=============================\n")
+
+    def _deploy_trtllm(self):
+        """Deploy a model using TensorRT-LLM."""
+        try:
+            from tensorrt_llm import LLM, SamplingParams
+            from tensorrt_llm.llmapi import CudaGraphConfig, EagleDecodingConfig, KvCacheConfig
+        except ImportError:
+            pytest.skip("tensorrt_llm package not available")
+
+        sampling_params = SamplingParams(max_tokens=32)
+        kv_cache_config = KvCacheConfig(enable_block_reuse=True, free_gpu_memory_fraction=0.8)
+        if "eagle" in self.model_id.lower():
+            spec_config = EagleDecodingConfig(
+                max_draft_len=3,
+                speculative_model_dir=self.model_id,
+                eagle3_one_model=self.eagle3_one_model,
+            )
+            llm = LLM(
+                model=self.base_model,
+                tensor_parallel_size=self.tensor_parallel_size,
+                enable_attention_dp=False,
+                disable_overlap_scheduler=True,
+                enable_autotuner=False,
+                speculative_config=spec_config,
+                cuda_graph_config=CudaGraphConfig(max_batch_size=1),
+                kv_cache_config=kv_cache_config,
+            )
+        else:
+            llm = LLM(
+                model=self.model_id,
+                tensor_parallel_size=self.tensor_parallel_size,
+                enable_attention_dp=False,
+                attn_backend=self.attn_backend,
+                trust_remote_code=True,
+                max_batch_size=8,
+                kv_cache_config=kv_cache_config,
+            )
+
+        # Shut the engine down even if generation fails so it is released before the next test.
+        try:
+            outputs = llm.generate(COMMON_PROMPTS, sampling_params)
+        finally:
+            llm.shutdown()
+        assert len(outputs) == len(COMMON_PROMPTS), (
+            f"Expected {len(COMMON_PROMPTS)} outputs, got {len(outputs)}"
+        )
+        for output in outputs:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+    def _deploy_vllm(self):
+        """Deploy a model using vLLM."""
+        try:
+            from vllm import LLM, SamplingParams
+        except ImportError:
+            pytest.skip("vllm package not available")
+
+        quantization_method = "modelopt_fp4" if "FP4" in self.model_id else "modelopt"
+        llm = LLM(
+            model=self.model_id,
+            quantization=quantization_method,
+            tensor_parallel_size=self.tensor_parallel_size,
+            trust_remote_code=True,
+        )
+        sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
+        outputs = llm.generate(COMMON_PROMPTS, sampling_params)
+
+        # Assertions and output
+        assert len(outputs) == len(COMMON_PROMPTS), (
+            f"Expected {len(COMMON_PROMPTS)} outputs, got {len(outputs)}"
+        )
+        for i, output in enumerate(outputs):
+            assert output.prompt == COMMON_PROMPTS[i], f"Prompt mismatch at index {i}"
+            assert hasattr(output, "outputs"), f"Output {i} missing 'outputs' attribute"
+            assert len(output.outputs) > 0, f"Output {i} has no generated text"
+            assert hasattr(output.outputs[0], "text"), f"Output {i} missing 'text' attribute"
+            assert isinstance(output.outputs[0].text, str), f"Output {i} text is not a string"
+            assert len(output.outputs[0].text) > 0, f"Output {i} generated empty text"
+
+            print(f"Model: {self.model_id}")
+            print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")
+            print("-" * 50)
+
+    def _deploy_sglang(self):
+        """Deploy a model using SGLang (always loads model_id; base_model is not used)."""
+        try:
+            import sglang as sgl
+        except ImportError:
+            pytest.skip("sglang package not available")
+        quantization_method = "modelopt_fp4" if "FP4" in self.model_id else "modelopt"
+        llm = sgl.Engine(
+            model_path=self.model_id,
+            quantization=quantization_method,
+            tp_size=self.tensor_parallel_size,
+            trust_remote_code=True,
+        )
+        try:
+            print(llm.generate(["What's the age of the earth? "]))
+        finally:
+            llm.shutdown()  # also runs when generate() raises
+
+
+class ModelDeployerList:
+    """Cartesian product of ModelDeployer configurations, usable as pytest parameters."""
+
+    def __init__(self, **params):
+        """Store each keyword as a list of candidates; scalars become one-element lists."""
+        self.params = {
+            name: list(choice) if isinstance(choice, (list, tuple)) else [choice]
+            for name, choice in params.items()
+        }
+        # Pre-generate all deployers for pytest compatibility
+        self._deployers = list(self._generate_deployers())
+
+    def _generate_deployers(self):
+        """Yield one ModelDeployer per parameter combination, tagged with a ``test_id``."""
+        for combo in itertools.product(*self.params.values()):
+            deployer = ModelDeployer(**dict(zip(self.params, combo)))
+            # Set test case ID in format "model_id_backend"
+            deployer.test_id = f"{deployer.model_id}_{deployer.backend}"
+            yield deployer
+
+    def __iter__(self):
+        return iter(self._deployers)
+
+    def __len__(self):
+        return len(self._deployers)
+
+    def __getitem__(self, index):
+        return self._deployers[index]
+
+    def __repr__(self):
+        return f"ModelDeployerList({len(self._deployers)} items)"
+
+    __str__ = __repr__
diff --git a/tests/examples/cnn_qat/test_resnet50.py b/tests/examples/cnn_qat/test_resnet50.py
index 77da56c1f..503ac0d1a 100644
--- a/tests/examples/cnn_qat/test_resnet50.py
+++ b/tests/examples/cnn_qat/test_resnet50.py
@@ -20,11 +20,10 @@
 from _test_utils.torch_misc import minimum_gpu
 
 imagenet_path = os.getenv("IMAGENET_PATH")
-if not imagenet_path or not os.path.isdir(imagenet_path):
-    pytest.skip(
-        "IMAGENET_PATH environment variable is not set or does not point to a valid directory",
-        allow_module_level=True,
-    )
+skip_no_imagenet = pytest.mark.skipif(  # per-test skip marker, applied to each test below
+    not imagenet_path or not os.path.isdir(imagenet_path),
+    reason="IMAGENET_PATH environment variable is not set or does not point to a valid directory",
+)
 
 
 def _build_common_command():
@@ -59,6 +58,7 @@ def _run_qat_command(base_cmd, common_args, output_dir, example_dir="cnn_qat"):
     run_example_command(full_command, example_dir)
 
 
+@skip_no_imagenet
 @minimum_gpu(1)
 def test_cnn_qat_single_gpu(tmp_path):
     """Test CNN QAT on single GPU."""
@@ -68,6 +68,7 @@ def test_cnn_qat_single_gpu(tmp_path):
     _run_qat_command(base_command, common_args, tmp_path)
 
 
+@skip_no_imagenet
 @minimum_gpu(2)
 def test_cnn_qat_multi_gpu(tmp_path):
     """Test CNN QAT on multiple GPUs."""
diff --git a/tests/examples/gpt_oss/test_gpt_oss_qat.py b/tests/examples/gpt_oss/test_gpt_oss_qat.py
new file mode 100644
index 000000000..cbdfcf563
--- /dev/null
+++ b/tests/examples/gpt_oss/test_gpt_oss_qat.py
@@ -0,0 +1,242 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import subprocess
+
+import pytest
+from _test_utils.examples.run_command import run_example_command
+from _test_utils.torch_misc import minimum_gpu
+
+
+class GPTOSS:
+    """Drive the GPT-OSS QAT (Quantization-Aware Training) example pipeline for one model.
+
+    All steps read and write directories inside the ``tmp_path`` they are given:
+
+    Step 1: gpt_oss_sft_training - Supervised Fine-Tuning (SFT)
+           Input: model_path (e.g. openai/gpt-oss-20b)
+           Output: <model_name>-sft
+
+    Step 2: gpt_oss_qat_training - Quantization-Aware Training (QAT)
+           Input: <model_name>-sft (from Step 1)
+           Output: <model_name>-qat
+
+    Step 3: gpt_oss_mxfp4_conversion - MXFP4 Weight-Only Conversion
+           Input: <model_name>-qat (from Step 2)
+           Output: <model_name>-qat-real-mxfp4
+
+    Steps 2 and 3 fall back to a mock input directory (config.json only) when the previous
+    step's output is missing. deploy_gpt_oss_trtllm benchmarks model_path with trtllm-bench.
+    """
+
+    def __init__(self, model_path):
+        """Store the model identifier, e.g. "openai/gpt-oss-20b"."""
+        self.model_path = model_path
+
+    @property
+    def _model_name(self):
+        """Last "/"-separated component of model_path, e.g. "gpt-oss-20b"."""
+        return self.model_path.split("/")[-1]
+
+    @staticmethod
+    def _ensure_mock_model_dir(model_dir):
+        """Create model_dir holding only a minimal config.json, unless it already exists."""
+        if model_dir.exists():
+            return
+        import json
+
+        model_dir.mkdir()
+        config_content = {
+            "model_type": "gpt_oss",
+            "hidden_size": 5120,
+            "num_attention_heads": 40,
+            "num_hidden_layers": 44,
+            "vocab_size": 100000,
+            "torch_dtype": "bfloat16",
+        }
+        with open(model_dir / "config.json", "w") as f:
+            json.dump(config_content, f)
+
+    def gpt_oss_sft_training(self, tmp_path):
+        """Test supervised fine-tuning (SFT) of the GPT-OSS model - Step 1."""
+        output_dir = tmp_path / f"{self._model_name}-sft"
+
+        # Command for SFT training (Step 1)
+        cmd_parts = [
+            "accelerate",
+            "launch",
+            "--config_file",
+            "configs/zero3.yaml",
+            "sft.py",
+            "--config",
+            "configs/sft_full.yaml",
+            "--model_name_or_path",
+            self.model_path,
+            "--output_dir",
+            str(output_dir),
+        ]
+
+        run_example_command(cmd_parts, "gpt-oss")
+
+        # Verify SFT output directory exists
+        assert output_dir.exists(), "SFT output directory should exist after training"
+
+    def gpt_oss_qat_training(self, tmp_path):
+        """Test quantization-aware training (QAT) with MXFP4 configuration - Step 2."""
+        # Reuse the SFT output from Step 1, or a mock directory for standalone runs.
+        sft_dir = tmp_path / f"{self._model_name}-sft"
+        self._ensure_mock_model_dir(sft_dir)
+        qat_output_dir = tmp_path / f"{self._model_name}-qat"
+
+        # Command for QAT training (Step 2)
+        cmd_parts = [
+            "accelerate",
+            "launch",
+            "--config_file",
+            "configs/zero3.yaml",
+            "sft.py",
+            "--config",
+            "configs/sft_full.yaml",
+            "--model_name_or_path",
+            str(sft_dir),
+            "--quant_cfg",
+            "MXFP4_MLP_WEIGHT_ONLY_CFG",
+            "--output_dir",
+            str(qat_output_dir),
+        ]
+
+        run_example_command(cmd_parts, "gpt-oss")
+
+        # Verify QAT output directory exists
+        assert qat_output_dir.exists(), "QAT output directory should exist after training"
+
+    def gpt_oss_mxfp4_conversion(self, tmp_path):
+        """Test conversion to MXFP4 weight-only format - Step 3."""
+        # Reuse the QAT output from Step 2, or a mock directory for standalone runs.
+        qat_dir = tmp_path / f"{self._model_name}-qat"
+        self._ensure_mock_model_dir(qat_dir)
+        conversion_output_dir = tmp_path / f"{self._model_name}-qat-real-mxfp4"
+
+        # Command for MXFP4 conversion (Step 3)
+        cmd_parts = [
+            "python",
+            "convert_oai_mxfp4_weight_only.py",
+            "--model_path",
+            str(qat_dir),
+            "--output_path",
+            str(conversion_output_dir),
+        ]
+
+        run_example_command(cmd_parts, "gpt-oss")
+
+        # Verify conversion output directory exists
+        assert conversion_output_dir.exists(), "MXFP4 conversion output directory should exist"
+
+    def deploy_gpt_oss_trtllm(self, tmp_path):
+        """Deploy GPT-OSS model with TensorRT-LLM.
+
+        Generates a synthetic dataset with prepare_dataset.py unless a non-empty one already
+        exists in tmp_path, then runs a trtllm-bench throughput benchmark on it.
+        """
+        # Prepare benchmark data
+        tensorrt_llm_workspace = "/app/tensorrt_llm"
+        script = os.path.join(tensorrt_llm_workspace, "benchmarks", "cpp", "prepare_dataset.py")
+        benchmark_file = str(tmp_path / f"{self._model_name}_synthetic_128_128.txt")
+
+        if not os.path.exists(benchmark_file) or os.path.getsize(benchmark_file) == 0:
+            print(f"Creating dataset file '{benchmark_file}'...")
+            # Argument list instead of a shell string: no quoting issues with model_path.
+            dataset_cmd = [
+                "python",
+                script,
+                "--stdout",
+                f"--tokenizer={self.model_path}",
+                "token-norm-dist",
+                "--input-mean",
+                "128",
+                "--output-mean",
+                "128",
+                "--input-stdev",
+                "0",
+                "--output-stdev",
+                "0",
+                "--num-requests",
+                "1400",
+            ]
+            with open(benchmark_file, "w") as fp:
+                subprocess.run(dataset_cmd, check=True, stdout=fp)
+        else:
+            print(f"Dataset file '{benchmark_file}' already exists.")
+
+        assert os.path.isfile(benchmark_file), f"Benchmark file '{benchmark_file}' should exist"
+
+        # NOTE(review): this benchmarks the original model_path, not the converted checkpoint
+        # produced by Step 3 — confirm that is intended.
+        cmd_parts = [
+            "trtllm-bench",
+            "--model",
+            self.model_path,
+            "throughput",
+            "--backend",
+            "pytorch",
+            "--dataset",
+            benchmark_file,
+            "--kv_cache_free_gpu_mem_fraction",
+            "0.9",
+            "--report_json",
+            str(tmp_path / "low_latency_throughput.json"),
+        ]
+        run_example_command(cmd_parts, "gpt-oss")
+
+
+@pytest.mark.parametrize(
+    "model_path",
+    [
+        pytest.param("openai/gpt-oss-20b", id="gpt-oss-20b", marks=minimum_gpu(2)),
+        pytest.param("openai/gpt-oss-120b", id="gpt-oss-120b", marks=minimum_gpu(8)),
+    ],
+)
+def test_gpt_oss_complete_pipeline(model_path, tmp_path):
+    """Run the full GPT-OSS flow for one model.
+
+    SFT, QAT and MXFP4 conversion run back to back, then TensorRT-LLM benchmarks the model.
+    """
+    pipeline = GPTOSS(model_path)
+    model_name = model_path.split("/")[-1]
+    # Directories each step is expected to leave behind in tmp_path.
+    sft_dir = tmp_path / f"{model_name}-sft"
+    qat_dir = tmp_path / f"{model_name}-qat"
+    conversion_dir = tmp_path / f"{model_name}-qat-real-mxfp4"
+
+    # Steps 1-3 in pipeline order: SFT training, QAT training, MXFP4 conversion.
+    for step in (
+        pipeline.gpt_oss_sft_training,
+        pipeline.gpt_oss_qat_training,
+        pipeline.gpt_oss_mxfp4_conversion,
+    ):
+        step(tmp_path)
+
+    # Every stage must have produced its output directory.
+    assert sft_dir.exists(), "SFT output directory should exist after Step 1"
+    assert qat_dir.exists(), "QAT output directory should exist after Step 2"
+    assert conversion_dir.exists(), "MXFP4 conversion output directory should exist after Step 3"
+
+    print(f"Complete pipeline executed successfully for {model_path}!")
+    for number, produced in enumerate((sft_dir, qat_dir, conversion_dir), start=1):
+        print(f"Step {number} output: {produced}")
+
+    # Deploy with TensorRT-LLM
+    pipeline.deploy_gpt_oss_trtllm(tmp_path)
diff --git a/tests/examples/llm_ptq/test_deploy.py b/tests/examples/llm_ptq/test_deploy.py
new file mode 100644
index 000000000..d41622a2c
--- /dev/null
+++ b/tests/examples/llm_ptq/test_deploy.py
@@ -0,0 +1,462 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import shutil
+
+import pytest
+from _test_utils.deploy_utils import ModelDeployerList
+
+
+def idfn(val):
+    """Return the pytest ID for a parametrized value."""
+    # Deployers carry an explicit test_id; any other value falls back to its str() form.
+    return val.test_id if hasattr(val, "test_id") else str(val)
+
+
+# Hugging Face hub cache directory that clear_hf_cache() prunes after each test
+HF_CACHE_PATH = os.getenv("HF_HUB_CACHE", os.path.expanduser("~/.cache/huggingface/hub"))
+
+
+def clear_hf_cache():
+    """Best-effort removal of ``nvidia`` model directories from the Hugging Face cache."""
+    try:
+        if not os.path.exists(HF_CACHE_PATH):
+            print(f"HF cache path does not exist: {HF_CACHE_PATH}")
+            return
+        print(f"Clearing HF cache at: {HF_CACHE_PATH}")
+        for entry in os.listdir(HF_CACHE_PATH):
+            entry_path = os.path.join(HF_CACHE_PATH, entry)
+            if "nvidia" in entry and os.path.isdir(entry_path):
+                shutil.rmtree(entry_path, ignore_errors=True)
+                print(f"✓ Removed: {entry}")
+        print("✓ HF cache cleared successfully")
+    except Exception as e:
+        print(f"⚠ Warning: Failed to clear HF cache: {e}")
+
+
+@pytest.fixture(autouse=True)
+def cleanup_after_test():
+    """Autouse fixture that calls clear_hf_cache() after every test in this module."""
+    yield  # Run the test
+    clear_hf_cache()  # Clean up after test completes
+
+
+@pytest.mark.parametrize(
+    "command",
+    [
+        *ModelDeployerList(
+            model_id="nvidia/DeepSeek-R1-FP4",
+            backend=("vllm", "trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/DeepSeek-R1-FP4-v2",
+            backend=("vllm", "trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/DeepSeek-R1-0528-FP4",
+            backend=("vllm", "trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/DeepSeek-R1-0528-FP4-v2",
+            backend=("vllm", "trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/DeepSeek-V3-0324-FP4",
+            backend=("vllm", "trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+    ],
+    ids=idfn,
+)
+def test_deepseek(command):  # one DeepSeek FP4 checkpoint on one backend (8 GPUs, sm100)
+    command.run()
+
+
+@pytest.mark.parametrize(
+    "command",
+    [
+        # Llama-3.1
+        *ModelDeployerList(
+            model_id="nvidia/Llama-3.1-8B-Instruct-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=89,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Llama-3.1-8B-Instruct-FP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Llama-3.1-70B-Instruct-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=4,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Llama-3.3-70B-Instruct-FP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=4,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Llama-3.3-70B-Instruct-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=4,
+            mini_sm=89,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Llama-3.1-405B-Instruct-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Llama-3.1-405B-Instruct-FP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+        # Llama-4
+        *ModelDeployerList(
+            model_id="nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Llama-4-Maverick-17B-128E-Instruct-FP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=89,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+    ],
+    ids=idfn,
+)
+def test_llama(command):
+    """Deploy the Llama 3.1, 3.3 and 4 FP8/FP4 checkpoints on every listed backend."""
+    command.run()
+
+
+@pytest.mark.parametrize(
+    "command",
+    [
+        *ModelDeployerList(
+            model_id="nvidia/Qwen3-8B-FP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Qwen3-8B-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=89,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Qwen3-14B-FP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Qwen3-14B-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=89,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Qwen3-235B-A22B-FP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=2,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Qwen3-235B-A22B-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=4,
+            mini_sm=89,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/QwQ-32B-FP4", backend=("trtllm", "vllm", "sglang"), mini_sm=100
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Qwen3-32B-FP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=4,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Qwen2.5-VL-7B-Instruct-FP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=4,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Qwen2.5-VL-7B-Instruct-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=4,
+            mini_sm=100,  # NOTE(review): the other FP8 entries here use mini_sm=89 — confirm
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Qwen3-30B-A3B-FP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=4,
+            mini_sm=100,
+        ),
+    ],
+    ids=idfn,
+)
+def test_qwen(command):  # one Qwen/QwQ FP4 or FP8 checkpoint on one backend
+    command.run()
+
+
+@pytest.mark.parametrize(
+    "command",
+    [
+        *ModelDeployerList(
+            model_id="nvidia/Mixtral-8x7B-Instruct-v0.1-FP8", backend=("trtllm", "vllm", "sglang")
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Mixtral-8x7B-Instruct-v0.1-FP4",
+            backend=("trtllm", "vllm", "sglang"),
+            mini_sm=100,
+        ),
+        # NOTE(review): FP8 + sglang was marked unsupported, yet "sglang" is listed above — confirm
+    ],
+    ids=idfn,
+)
+def test_mixtral(command):
+    command.run()
+
+
+@pytest.mark.parametrize(
+    "command",
+    [  # TRTLLM bug: https://nvbugs/5451286 (presumably why attn_backend="FLASHINFER" — confirm)
+        *ModelDeployerList(
+            model_id="nvidia/gemma-3-12b-it-FP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=100,
+            attn_backend="FLASHINFER",
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/gemma-3-12b-it-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=89,
+            attn_backend="FLASHINFER",
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/gemma-3-27b-it-FP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=100,
+            attn_backend="FLASHINFER",
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/gemma-3-27b-it-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=89,
+            attn_backend="FLASHINFER",
+        ),
+    ],
+    ids=idfn,
+)
+def test_gemma(command):  # attn_backend is only consumed by the TensorRT-LLM path
+    command.run()
+
+
+@pytest.mark.parametrize(
+    "command",
+    [
+        *ModelDeployerList(
+            model_id="nvidia/Phi-4-multimodal-instruct-FP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Phi-4-multimodal-instruct-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=89,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Phi-4-reasoning-plus-FP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Phi-4-reasoning-plus-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=89,
+        ),
+    ],
+    ids=idfn,
+)
+def test_phi(command):
+    """Deploy the Phi-4 multimodal and reasoning FP4/FP8 checkpoints on every listed backend."""
+    command.run()
+
+
+@pytest.mark.parametrize(
+    "command",
+    [
+        *ModelDeployerList(
+            model_id="nvidia/Kimi-K2-Instruct-FP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+    ],
+    ids=idfn,
+)
+def test_kimi(command):  # Kimi-K2 FP4 on one backend (8 GPUs, sm100)
+    command.run()
+
+
+@pytest.mark.parametrize(
+    "command",
+    [
+        *ModelDeployerList(
+            model_id="nvidia/Llama-3_3-Nemotron-Super-49B-v1-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=89,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Llama-3_3-Nemotron-Super-49B-v1_5-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=89,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=4,
+            mini_sm=89,
+        ),
+    ],
+    ids=idfn,
+)
+def test_llama_nemotron(command):  # one Llama-Nemotron FP8 checkpoint on one backend
+    command.run()
+
+
+@pytest.mark.parametrize(
+    "command",
+    [
+        *ModelDeployerList(
+            model_id="nvidia/Llama-3.1-8B-Medusa-FP8",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=89,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Llama-3.1-70B-Medusa-FP8",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=2,
+            mini_sm=100,  # NOTE(review): the 8B Medusa FP8 entry uses 89 — confirm 100
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Llama-3.1-405B-Medusa-FP8",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,  # NOTE(review): the 8B Medusa FP8 entry uses 89 — confirm 100
+        ),
+    ],
+    ids=idfn,
+)
+@pytest.mark.skip(reason="Medusa is not supported yet")
+def test_medusa(command):
+    command.run()
+
+
+@pytest.mark.parametrize(
+    "command",
+    [
+        *ModelDeployerList(
+            base_model="nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
+            model_id="nvidia/Llama-4-Maverick-17B-128E-Eagle3",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=89,
+        ),
+        *ModelDeployerList(
+            base_model="Qwen/Qwen3-235B-A22B",
+            model_id="nvidia/Qwen3-235B-A22B-Eagle3",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=89,
+        ),
+        *ModelDeployerList(
+            base_model="Qwen/Qwen3-235B-A22B-Thinking-2507",
+            model_id="nvidia/Qwen3-235B-A22B-Thinking-2507-Eagle3",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=89,
+            eagle3_one_model=False,
+        ),
+        *ModelDeployerList(
+            base_model="Qwen/Qwen3-30B-A3B",
+            model_id="nvidia/Qwen3-30B-A3B-Eagle3",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=89,
+        ),
+        *ModelDeployerList(
+            base_model="Qwen/Qwen3-30B-A3B-Thinking-2507",
+            model_id="nvidia/Qwen3-30B-A3B-Thinking-2507-Eagle3",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=89,
+            eagle3_one_model=False,
+        ),
+        *ModelDeployerList(
+            base_model="openai/gpt-oss-120b",
+            model_id="nvidia/gpt-oss-120b-Eagle3",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=89,
+        ),
+    ],
+    ids=idfn,
+)
+def test_eagle(command):  # NOTE(review): the sglang path never reads base_model — confirm
+    command.run()