Commit c4a0d76

tests: add qa test mentioned in docs (NVIDIA#4357)
* add nemotron-h and llama_70b cases
* trial
* add llm decoder quick_start case
* update nemotron-h test case
* add qwen3 quickstart test
* add trtllm_decoder accuracy test
* remove quickstart test for llm_decoder
* fix import error
* nemotronh fp8 trial
* fix name
* remove nemotronh-fp8

Signed-off-by: Ivy Zhang <[email protected]>
1 parent 791c209 commit c4a0d76

File tree

6 files changed: +50 −13 lines


tests/integration/defs/accuracy/accuracy_core.py

Lines changed: 11 additions & 4 deletions
@@ -146,7 +146,8 @@ def get_num_samples_and_threshold(self, **acc_specs):
     def evaluate(self,
                  llm: Union[LLM, PyTorchLLM],
                  extra_acc_spec: Optional[str] = None,
-                 extra_evaluator_kwargs: Optional[dict] = None):
+                 extra_evaluator_kwargs: Optional[dict] = None,
+                 sampling_params: Optional[SamplingParams] = None):
         assert self.EVALUATOR_CLS is not None
 
         if llm.args.speculative_config is None:
@@ -175,9 +176,15 @@ def evaluate(self,
             spec_dec_algo=spec_dec_algo,
             extra_acc_spec=extra_acc_spec)
 
-        sampling_params = SamplingParams(
-            max_tokens=self.MAX_OUTPUT_LEN,
-            truncate_prompt_tokens=self.MAX_INPUT_LEN)
+        if sampling_params is None:
+            sampling_params = SamplingParams(
+                max_tokens=self.MAX_OUTPUT_LEN,
+                truncate_prompt_tokens=self.MAX_INPUT_LEN)
+        else:
+            if sampling_params.max_tokens is None:
+                sampling_params.max_tokens = self.MAX_OUTPUT_LEN
+            if sampling_params.truncate_prompt_tokens is None:
+                sampling_params.truncate_prompt_tokens = self.MAX_INPUT_LEN
 
         evaluator_kwargs = {}
         if self.EVALUATOR_KWARGS is not None:
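With this change, evaluate() accepts an optional SamplingParams; any field left unset falls back to the harness defaults (MAX_OUTPUT_LEN / MAX_INPUT_LEN). A minimal caller-side sketch, mirroring the new test added below and assuming llm and the MMLU task come from the surrounding accuracy harness:

from tensorrt_llm.llmapi import SamplingParams

# Only the sampling behaviour is overridden; max_tokens and
# truncate_prompt_tokens stay None, so evaluate() fills them in
# from MAX_OUTPUT_LEN and MAX_INPUT_LEN.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

task = MMLU("meta-llama/Llama-3.1-8B-Instruct")
task.evaluate(llm,
              sampling_params=sampling_params,
              extra_acc_spec="temperature=0.8,top_p=0.95")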

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 2 additions & 0 deletions
@@ -64,3 +64,5 @@ Qwen3/Qwen3-30B-A3B:
   accuracy: 83.43
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
 - accuracy: 92.57
+nvidia/Nemotron-H-8B-Base-8K:
+- accuracy: 46.20

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 4 additions & 1 deletion
@@ -22,6 +22,9 @@ meta-llama/Llama-3.1-8B-Instruct:
 - accuracy: 68.17
 - quant_algo: FP8
   accuracy: 67.93
+- quant_algo: FP8
+  extra_acc_spec: temperature=0.8,top_p=0.95
+  accuracy: 64.62
 - quant_algo: FP8
   kv_cache_quant_algo: FP8
   accuracy: 67.87
@@ -117,4 +120,4 @@ nvidia/Llama-3_3-Nemotron-Super-49B-v1:
 nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
 - accuracy: 57.97
 nvidia/Nemotron-H-8B-Base-8K:
-- accuracy: 87.573
+- accuracy: 69.590
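The new FP8 entry keyed by extra_acc_spec records a separate MMLU threshold for the temperature=0.8, top_p=0.95 run. An illustrative sketch of how such keyed entries can be resolved; the pick_reference helper and the file path are assumptions for illustration, not the harness's actual lookup code:

import yaml

# Hypothetical helper: return the accuracy for the entry whose
# quant_algo and extra_acc_spec both match the current run.
def pick_reference(entries, quant_algo=None, extra_acc_spec=None):
    for entry in entries:
        if (entry.get("quant_algo") == quant_algo
                and entry.get("extra_acc_spec") == extra_acc_spec):
            return entry["accuracy"]
    raise KeyError("no matching reference entry")

with open("tests/integration/defs/accuracy/references/mmlu.yaml") as f:
    refs = yaml.safe_load(f)

entries = refs["meta-llama/Llama-3.1-8B-Instruct"]
print(pick_reference(entries, "FP8", "temperature=0.8,top_p=0.95"))  # 64.62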

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 24 additions & 3 deletions
@@ -16,7 +16,7 @@
 
 from tensorrt_llm._torch import LLM
 from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
-from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig
+from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig, SamplingParams
 from tensorrt_llm.models.modeling_utils import QuantConfig
 from tensorrt_llm.quantization import QuantAlgo
 
@@ -183,6 +183,24 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend,
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)
 
+    @skip_pre_hopper
+    def test_fp8_llm_decoder(self):
+        model_path = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8"
+        pytorch_config = PyTorchConfig(enable_trtllm_decoder=True)
+        llm = LLM(model_path, pytorch_backend_config=pytorch_config)
+        assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+
+        sampling_params = SamplingParams(
+            temperature=0.8,
+            top_p=0.95,
+        )
+
+        with llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm,
+                          sampling_params=sampling_params,
+                          extra_acc_spec="temperature=0.8,top_p=0.95")
+
 
 class TestLlama3_3_70BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-3.3-70B-Instruct"
@@ -831,10 +849,13 @@ class TestNemotronH(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Nemotron-H-8B-Base-8K"
     MODEL_PATH = f"{llm_models_root()}/Nemotron-H-8B-Base-8K"
 
-    @pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5264431")
     def test_auto_dtype(self):
+        # TODO: remove max_batch_size after mamba cache manager is supported
+        # TODO: check 47b and 56b model
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
-        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
+        with LLM(self.MODEL_PATH,
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=128) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
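The new test_fp8_llm_decoder turns on the TRT-LLM decoder via PyTorchConfig(enable_trtllm_decoder=True) and evaluates MMLU with temperature=0.8, top_p=0.95. A standalone sketch of the same configuration outside pytest; the model path placeholder and the generate() call are illustrative, not part of the diff:

from tensorrt_llm._torch import LLM
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.llmapi import SamplingParams

# Enable the TRT-LLM decoder in the PyTorch backend, as the new test does.
pytorch_config = PyTorchConfig(enable_trtllm_decoder=True)

llm = LLM("/path/to/llama-3.1-model/Llama-3.1-8B-Instruct-FP8",  # placeholder path
          pytorch_backend_config=pytorch_config)

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

with llm:
    # Quick smoke check; the committed test runs the MMLU task instead.
    outputs = llm.generate(["The capital of France is"], sampling_params)
    print(outputs[0].outputs[0].text)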

tests/integration/defs/test_e2e.py

Lines changed: 7 additions & 5 deletions
@@ -1257,6 +1257,7 @@ def test_ptp_quickstart(llm_root, llm_venv):
     ("Llama3.2-11B-BF16", "llama-3.2-models/Llama-3.2-11B-Vision"),
     ("Nemotron4_4B-BF16", "nemotron/Minitron-4B-Base"),
     ("Nemotron-H-8B", "Nemotron-H-8B-Base-8K"),
+    ("Qwen3-30B-A3B", "Qwen3/Qwen3-30B-A3B"),
     pytest.param('Llama3.1-8B-NVFP4',
                  'nvfp4-quantized/Meta-Llama-3.1-8B',
                  marks=skip_pre_blackwell),
@@ -1300,13 +1301,14 @@ def test_ptp_quickstart_advanced(llm_root, llm_venv, model_name, model_path):
             dir="./",
             delete=True,
             delete_on_close=True) as running_log:
-        llm_venv.run_cmd([
+        cmds = [
             str(example_root / "quickstart_advanced.py"),
             "--enable_chunked_prefill",
-            "--model_dir",
-            f"{llm_models_root()}/{model_path}",
-        ],
-                         running_log=running_log)
+            f"--model_dir={llm_models_root()}/{model_path}",
+        ]
+        if "Qwen3" in model_name:
+            cmds.append(f"--kv_cache_fraction=0.6")
+        llm_venv.run_cmd(cmds, running_log=running_log)
     if model_name in mapping:
         _check_mem_usage(running_log, [mapping[model_name], 0, 0, 0])
 
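For the Qwen3-30B-A3B case the test now appends --kv_cache_fraction=0.6 to the quickstart command. A rough equivalent of what the test ends up running, sketched with subprocess; the script and model paths are placeholders for example_root and llm_models_root():

import subprocess

cmd = [
    "python", "examples/pytorch/quickstart_advanced.py",   # placeholder for example_root
    "--enable_chunked_prefill",
    "--model_dir=/path/to/models/Qwen3/Qwen3-30B-A3B",     # placeholder for llm_models_root()
    "--kv_cache_fraction=0.6",  # appended only for Qwen3 models
]
subprocess.run(cmd, check=True)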

tests/integration/test_lists/qa/examples_test_list.txt

Lines changed: 2 additions & 0 deletions
@@ -423,6 +423,7 @@ accuracy/test_llm_api.py::TestMixtral8x7B::test_tp2
 accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2
 accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_decoder
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
 accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
@@ -472,6 +473,7 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.2-11B-BF16-llama-3.2-models/Llama-3.2-11B-Vision]
 test_e2e.py::test_ptp_quickstart_advanced[Nemotron4_4B-BF16-nemotron/Minitron-4B-Base]
 test_e2e.py::test_ptp_quickstart_advanced[Nemotron-H-8B-Nemotron-H-8B-Base-8K]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8]
