diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 1a32e333b5a..fa2d1e3a287 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -4096,6 +4096,34 @@ def test_w4_1gpu(self, kv_cache_dtype, moe_backend, cuda_graph, task.evaluate(llm, extra_evaluator_kwargs=self.extra_evaluator_kwargs) + # on spark 120b accuracy takes 2.2 hours, so we do 20b for now + def test_w4_1gpu_20b_spark(self, mocker): + mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192) + mocker.patch.dict(GSM8K.EVALUATE_KWARGS, + {"scores_filter": "exact_match,flexible-extract"}) + + pytorch_config = dict( + disable_overlap_scheduler=False, + cuda_graph_config=CudaGraphConfig()) + + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5, + dtype="auto") + + model_path = f"{llm_models_root()}/gpt_oss/gpt-oss-20b" + llm = LLM(model_path, + tensor_parallel_size=1, + pipeline_parallel_size=1, + moe_expert_parallel_size=1, + kv_cache_config=kv_cache_config, + **pytorch_config, + moe_config=MoeConfig(backend="CUTLASS")) + + with llm: + model_name = "GPT-OSS/20B-MXFP4" + task = GSM8K(model_name) + task.evaluate(llm, + extra_evaluator_kwargs=self.extra_evaluator_kwargs) + def test_dummy_load_format(self): llm = LLM( self.MODEL_PATH, diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index b55eeb8359d..4dacf110874 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -1902,11 +1902,43 @@ def test_ptp_quickstart(llm_root, llm_venv): marks=skip_pre_blackwell), pytest.param( 'GPT-OSS-120B', 'gpt_oss/gpt-oss-120b', marks=skip_pre_blackwell), + pytest.param( + 'Qwen3-8b-fp8', + 'Qwen3/nvidia-Qwen3-8B-FP8', + marks=skip_pre_blackwell), + pytest.param( + 'Qwen3-8b-nvfp4', + 'Qwen3/nvidia-Qwen3-8B-NVFP4', + marks=skip_pre_blackwell), + pytest.param( + 'Qwen3-14b-fp8', + 'Qwen3/nvidia-Qwen3-14B-FP8', + marks=skip_pre_blackwell), + pytest.param( + 'Qwen3-14b-nvfp4', + 'Qwen3/nvidia-Qwen3-14B-NVFP4', + marks=skip_pre_blackwell), + pytest.param( + 'Qwen3-32b-nvfp4', + 'Qwen3/nvidia-Qwen3-32B-NVFP4', + marks=skip_pre_blackwell), + pytest.param( + 'Phi4-Reasoning-Plus-fp8', + 'nvidia-Phi-4-reasoning-plus-FP8', + marks=skip_pre_blackwell), + pytest.param( + 'Phi4-Reasoning-Plus-nvfp4', + 'nvidia-Phi-4-reasoning-plus-NVFP4', + marks=skip_pre_blackwell), + pytest.param( + 'Nemotron-Nano-v2-nvfp4', + 'NVIDIA-Nemotron-Nano-9B-v2-NVFP4', + marks=skip_pre_blackwell), ]) def test_ptp_quickstart_advanced(llm_root, llm_venv, model_name, model_path): print(f"Testing {model_name}.") example_root = Path(os.path.join(llm_root, "examples", "llm-api")) - if model_name == "Nemotron-H-8B": + if model_name in ("Nemotron-H-8B", "Nemotron-Nano-v2-nvfp4"): llm_venv.run_cmd([ str(example_root / "quickstart_advanced.py"), "--disable_kv_cache_reuse", diff --git a/tests/integration/test_lists/test-db/l0_gb10.yml b/tests/integration/test_lists/test-db/l0_gb10.yml index a749f27a54e..133dda3d732 100644 --- a/tests/integration/test_lists/test-db/l0_gb10.yml +++ b/tests/integration/test_lists/test-db/l0_gb10.yml @@ -16,10 +16,13 @@ l0_gb10: backend: pytorch tests: # ------------- PyTorch tests --------------- - - unittest/_torch/attention/test_attention_mla.py - - test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] - - test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] - - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[latency] + # - unittest/_torch/attention/test_attention_mla.py + # - test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] + # - test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] + # - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[latency] + - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_auto_dtype + - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4 + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu_20b_spark - condition: ranges: system_gpu_count: @@ -35,8 +38,25 @@ l0_gb10: backend: pytorch tests: # ------------- PyTorch tests --------------- - # Below cases which are commented out due to they failed on gb10 - # - unittest/_torch/modeling -k "modeling_mllama" + - unittest/_torch/modeling -k "modeling_mllama" - unittest/_torch/modeling -k "modeling_out_of_tree" - # - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[CUTLASS-dtype0] - # - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[CUTLASS-dtype1] + - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] + - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8] + - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B] + - test_e2e.py::test_ptp_quickstart_advanced[GPT-OSS-20B-gpt_oss/gpt-oss-20b] + - test_e2e.py::test_ptp_quickstart_advanced[GPT-OSS-120B-gpt_oss/gpt-oss-120b] + - test_e2e.py::test_ptp_quickstart_advanced[Qwen3-8b-fp8-Qwen3/nvidia-Qwen3-8B-FP8] + - test_e2e.py::test_ptp_quickstart_advanced[Qwen3-8b-nvfp4-Qwen3/nvidia-Qwen3-8B-NVFP4] + - test_e2e.py::test_ptp_quickstart_advanced[Qwen3-14b-fp8-Qwen3/nvidia-Qwen3-14B-FP8] + - test_e2e.py::test_ptp_quickstart_advanced[Qwen3-14b-nvfp4-Qwen3/nvidia-Qwen3-14B-NVFP4] + - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-image_audio] + - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-image_audio] + - test_e2e.py::test_ptp_quickstart_advanced[Qwen3-32b-nvfp4-Qwen3/nvidia-Qwen3-32B-NVFP4] + - test_e2e.py::test_ptp_quickstart_advanced[Phi4-Reasoning-Plus-fp8-nvidia-Phi-4-reasoning-plus-FP8] + - test_e2e.py::test_ptp_quickstart_advanced[Phi4-Reasoning-Plus-nvfp4-nvidia-Phi-4-reasoning-plus-NVFP4] + - test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Nano-v2-nvfp4-NVIDIA-Nemotron-Nano-9B-v2-NVFP4] + - test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-FP8-nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8] + - test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B_fp8_hf-Qwen3/saved_models_Qwen3-30B-A3B_fp8_hf] + - test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B_nvfp4_hf-Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf] + - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-70B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-70B] + - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8] diff --git a/tests/unittest/_torch/modeling/test_modeling_mllama.py b/tests/unittest/_torch/modeling/test_modeling_mllama.py index a9423b86d35..50b327a28ce 100644 --- a/tests/unittest/_torch/modeling/test_modeling_mllama.py +++ b/tests/unittest/_torch/modeling/test_modeling_mllama.py @@ -10,6 +10,7 @@ from transformers import MllamaConfig from transformers import \ MllamaForConditionalGeneration as HFMllamaForConditionalGeneration +from utils.util import getSMVersion import tensorrt_llm from tensorrt_llm._torch.attention_backend.utils import get_attention_backend @@ -392,9 +393,10 @@ def test_mllama_allclose_to_hf_text_only(self, scenario: Scenario) -> None: position_ids=position_ids, use_cache=True) + atol = 0.35 if getSMVersion() >= 121 else 0.3 torch.testing.assert_close(logits, ref.logits[:, -1].float(), - atol=0.3, + atol=atol, rtol=0.3) # gen @@ -458,9 +460,10 @@ def run_forward(input_ids, position_ids, attn_metadata): past_key_values=ref.past_key_values, use_cache=True) + atol = 0.35 if getSMVersion() >= 121 else 0.3 torch.testing.assert_close(logits, ref.logits[:, -1].float(), - atol=0.3, + atol=atol, rtol=0.3) if graph_runner is not None: graph_runner.clear()