
Commit b0ce137

[test] add qa test mentioned in docs (#4248)
* add nemotron-h and llama_70b cases
* trial
* add llm decoder quick_start case
* update nemotron-h test case
* add qwen3 quickstart test
* add trtllm_decoder accuracy test
* remove quickstart test for llm_decoder

Signed-off-by: Ivy Zhang <[email protected]>
1 parent 3ea42e7 commit b0ce137

6 files changed (+59, −8 lines)


tests/integration/defs/accuracy/accuracy_core.py

Lines changed: 11 additions & 4 deletions
@@ -146,7 +146,8 @@ def get_num_samples_and_threshold(self, **acc_specs):
     def evaluate(self,
                  llm: Union[LLM, PyTorchLLM],
                  extra_acc_spec: Optional[str] = None,
-                 extra_evaluator_kwargs: Optional[dict] = None):
+                 extra_evaluator_kwargs: Optional[dict] = None,
+                 sampling_params: Optional[SamplingParams] = None):
         assert self.EVALUATOR_CLS is not None
 
         if llm.args.speculative_config is None:
@@ -175,9 +176,15 @@ def evaluate(self,
             spec_dec_algo=spec_dec_algo,
             extra_acc_spec=extra_acc_spec)
 
-        sampling_params = SamplingParams(
-            max_tokens=self.MAX_OUTPUT_LEN,
-            truncate_prompt_tokens=self.MAX_INPUT_LEN)
+        if sampling_params is None:
+            sampling_params = SamplingParams(
+                max_tokens=self.MAX_OUTPUT_LEN,
+                truncate_prompt_tokens=self.MAX_INPUT_LEN)
+        else:
+            if sampling_params.max_tokens is None:
+                sampling_params.max_tokens = self.MAX_OUTPUT_LEN
+            if sampling_params.truncate_prompt_tokens is None:
+                sampling_params.truncate_prompt_tokens = self.MAX_INPUT_LEN
 
         evaluator_kwargs = {}
         if self.EVALUATOR_KWARGS is not None:
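With this change, a test can supply its own SamplingParams while still inheriting the harness defaults for any field it leaves unset. A minimal sketch of a caller (mirroring the new test added below; the llm object and MODEL_NAME come from the surrounding harness):

from tensorrt_llm.llmapi import SamplingParams

# Override only the sampling fields under test; evaluate() fills in
# max_tokens and truncate_prompt_tokens from MAX_OUTPUT_LEN / MAX_INPUT_LEN.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
task = MMLU(MODEL_NAME)
task.evaluate(llm,
              sampling_params=sampling_params,
              extra_acc_spec="temperature=0.8,top_p=0.95")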

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 3 additions & 0 deletions
@@ -6,6 +6,7 @@ meta-llama/Llama-3.1-8B-Instruct:
     kv_cache_quant_algo: FP8
     accuracy: 72.85
 meta-llama/Llama-3.3-70B-Instruct:
+  - accuracy: 84.07
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 75.61
@@ -61,3 +62,5 @@ Qwen3/Qwen3-30B-A3B:
     accuracy: 83.43
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 92.57
+nvidia/Nemotron-H-8B-Base-8K:
+  - accuracy: 46.20

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 5 additions & 1 deletion
@@ -22,10 +22,14 @@ meta-llama/Llama-3.1-8B-Instruct:
   - accuracy: 68.17
   - quant_algo: FP8
     accuracy: 67.93
+  - quant_algo: FP8
+    extra_acc_spec: temperature=0.8,top_p=0.95
+    accuracy: 64.62
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 67.87
 meta-llama/Llama-3.3-70B-Instruct:
+  - accuracy: 81.28
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 79.31
@@ -114,4 +118,4 @@ nvidia/Llama-3_3-Nemotron-Super-49B-v1:
 nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
   - accuracy: 57.97
 nvidia/Nemotron-H-8B-Base-8K:
-  - accuracy: 87.573
+  - accuracy: 69.590
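The new extra_acc_spec entry matches the spec string the test passes, so non-greedy sampling (temperature=0.8, top_p=0.95) is scored against its own 64.62 reference rather than the greedy 67.93 one. Purely as an illustration of the key=value spec format (parse_spec is a hypothetical helper, not part of the harness):

def parse_spec(spec: str) -> dict:
    # Hypothetical: split "temperature=0.8,top_p=0.95" into key/value pairs.
    return dict(item.split("=", 1) for item in spec.split(","))

assert parse_spec("temperature=0.8,top_p=0.95") == {
    "temperature": "0.8",
    "top_p": "0.95",
}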

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 34 additions & 3 deletions
@@ -16,7 +16,8 @@
 
 from tensorrt_llm._torch import LLM
 from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
-from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig
+from tensorrt_llm.llmapi import (LLM, KvCacheConfig, MTPDecodingConfig,
+                                 SamplingParams)
 from tensorrt_llm.models.modeling_utils import QuantConfig
 from tensorrt_llm.quantization import QuantAlgo
 
@@ -178,9 +179,36 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend,
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)
 
+    @skip_pre_hopper
+    def test_fp8_llm_decoder(self):
+        model_path = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8"
+        pytorch_config = PyTorchConfig(enable_trtllm_decoder=True)
+        llm = LLM(model_path, pytorch_backend_config=pytorch_config)
+        assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+
+        sampling_params = SamplingParams(
+            temperature=0.8,
+            top_p=0.95,
+        )
+
+        with llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm,
+                          sampling_params=sampling_params,
+                          extra_acc_spec="temperature=0.8,top_p=0.95")
+
 
 class TestLlama3_3_70BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-3.3-70B-Instruct"
+    MODEL_PATH = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct"
+
+    @pytest.mark.skip_less_device(8)
+    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH, tensor_parallel_size=8) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
 
     @pytest.mark.skip_less_device(4)
     @pytest.mark.skip_device_not_contain(["H100", "B200"])
@@ -730,10 +758,13 @@ class TestNemotronH(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Nemotron-H-8B-Base-8K"
     MODEL_PATH = f"{llm_models_root()}/Nemotron-H-8B-Base-8K"
 
-    @pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5264431")
     def test_auto_dtype(self):
+        # TODO: remove max_batch_size after mamba cache manager is supported
+        # TODO: check 47b and 56b model
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
-        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
+        with LLM(self.MODEL_PATH,
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=128) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
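Outside the accuracy harness, the same decoder path can be exercised directly through the LLM API. A minimal sketch, assuming a local FP8 checkpoint (the model path and prompt are placeholders, not values from this commit):

from tensorrt_llm._torch import LLM
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.llmapi import SamplingParams

# Enable the TRT-LLM decoder in the PyTorch backend, as in
# test_fp8_llm_decoder above.
pytorch_config = PyTorchConfig(enable_trtllm_decoder=True)
with LLM("/models/Llama-3.1-8B-Instruct-FP8",
         pytorch_backend_config=pytorch_config) as llm:
    params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)
    for output in llm.generate(["The capital of France is"], params):
        print(output.outputs[0].text)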

tests/integration/defs/test_e2e.py

Lines changed: 3 additions & 0 deletions
@@ -1259,6 +1259,7 @@ def test_ptp_quickstart(llm_root, llm_venv):
     ("Llama3.2-11B-BF16", "llama-3.2-models/Llama-3.2-11B-Vision"),
     ("Nemotron4_4B-BF16", "nemotron/Minitron-4B-Base"),
     ("Nemotron-H-8B", "Nemotron-H-8B-Base-8K"),
+    ("Qwen3-30B-A3B", "Qwen3/Qwen3-30B-A3B"),
     pytest.param('Llama3.1-8B-NVFP4',
                  'nvfp4-quantized/Meta-Llama-3.1-8B',
                  marks=skip_pre_blackwell),
@@ -1299,10 +1300,12 @@ def test_ptp_quickstart_advanced(llm_root, llm_venv, model_name, model_path):
                           dir="./",
                           delete=True,
                           delete_on_close=True) as running_log:
+        kv_cache_fraction = 0.6 if "Qwen3" in model_name else None
         llm_venv.run_cmd([
             str(example_root / "quickstart_advanced.py"),
             "--enable_overlap_scheduler",
             "--enable_chunked_prefill",
+            f"--kv_cache_fraction={kv_cache_fraction}",
             "--model_dir",
             f"{llm_models_root()}/{model_path}",
         ],
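One caveat worth noting: the flag is appended unconditionally, so for non-Qwen3 models the command line carries the literal string --kv_cache_fraction=None. A sketch of a stricter variant that only adds the flag when a fraction is chosen (same names as the test above; illustrative, not the committed code):

cmd = [
    str(example_root / "quickstart_advanced.py"),
    "--enable_overlap_scheduler",
    "--enable_chunked_prefill",
]
# Only pass --kv_cache_fraction when a value is actually set.
kv_cache_fraction = 0.6 if "Qwen3" in model_name else None
if kv_cache_fraction is not None:
    cmd.append(f"--kv_cache_fraction={kv_cache_fraction}")
cmd += ["--model_dir", f"{llm_models_root()}/{model_path}"]
llm_venv.run_cmd(cmd)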

tests/integration/test_lists/qa/examples_test_list.txt

Lines changed: 3 additions & 0 deletions
@@ -423,6 +423,8 @@ accuracy/test_llm_api.py::TestMixtral8x7B::test_tp2
 accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2
 accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_decoder
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
 accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
@@ -472,6 +474,7 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.2-11B-BF16-llama-3.2-models/Llama-3.2-11B-Vision]
 test_e2e.py::test_ptp_quickstart_advanced[Nemotron4_4B-BF16-nemotron/Minitron-4B-Base]
 test_e2e.py::test_ptp_quickstart_advanced[Nemotron-H-8B-Nemotron-H-8B-Base-8K]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8]
