
Commit b0ce137

[test] add qa test mentioned in docs (#4248)
* add nemotron-h and llama_70b cases
* trial
* add llm decoder quick_start case
* update nemotron-h test case
* add qwen3 quickstart test
* add trtllm_decoder accuracy test
* remove quickstart test for llm_decoder

Signed-off-by: Ivy Zhang <[email protected]>
1 parent 3ea42e7 commit b0ce137

6 files changed (+59, −8 lines)


tests/integration/defs/accuracy/accuracy_core.py

Lines changed: 11 additions & 4 deletions
@@ -146,7 +146,8 @@ def get_num_samples_and_threshold(self, **acc_specs):
     def evaluate(self,
                  llm: Union[LLM, PyTorchLLM],
                  extra_acc_spec: Optional[str] = None,
-                 extra_evaluator_kwargs: Optional[dict] = None):
+                 extra_evaluator_kwargs: Optional[dict] = None,
+                 sampling_params: Optional[SamplingParams] = None):
         assert self.EVALUATOR_CLS is not None
 
         if llm.args.speculative_config is None:
@@ -175,9 +176,15 @@ def evaluate(self,
             spec_dec_algo=spec_dec_algo,
             extra_acc_spec=extra_acc_spec)
 
-        sampling_params = SamplingParams(
-            max_tokens=self.MAX_OUTPUT_LEN,
-            truncate_prompt_tokens=self.MAX_INPUT_LEN)
+        if sampling_params is None:
+            sampling_params = SamplingParams(
+                max_tokens=self.MAX_OUTPUT_LEN,
+                truncate_prompt_tokens=self.MAX_INPUT_LEN)
+        else:
+            if sampling_params.max_tokens is None:
+                sampling_params.max_tokens = self.MAX_OUTPUT_LEN
+            if sampling_params.truncate_prompt_tokens is None:
+                sampling_params.truncate_prompt_tokens = self.MAX_INPUT_LEN
 
         evaluator_kwargs = {}
         if self.EVALUATOR_KWARGS is not None:
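With this change, a test can supply its own SamplingParams while still inheriting the harness defaults for any field it leaves unset. A minimal sketch of a caller (mirroring the new test added below; the llm object and MODEL_NAME come from the surrounding harness):

from tensorrt_llm.llmapi import SamplingParams

# Override only the sampling fields under test; evaluate() fills in
# max_tokens and truncate_prompt_tokens from MAX_OUTPUT_LEN / MAX_INPUT_LEN.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
task = MMLU(MODEL_NAME)
task.evaluate(llm,
              sampling_params=sampling_params,
              extra_acc_spec="temperature=0.8,top_p=0.95")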

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 3 additions & 0 deletions
@@ -6,6 +6,7 @@ meta-llama/Llama-3.1-8B-Instruct:
     kv_cache_quant_algo: FP8
     accuracy: 72.85
 meta-llama/Llama-3.3-70B-Instruct:
+  - accuracy: 84.07
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 75.61
@@ -61,3 +62,5 @@ Qwen3/Qwen3-30B-A3B:
     accuracy: 83.43
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 92.57
+nvidia/Nemotron-H-8B-Base-8K:
+  - accuracy: 46.20

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 5 additions & 1 deletion
@@ -22,10 +22,14 @@ meta-llama/Llama-3.1-8B-Instruct:
   - accuracy: 68.17
   - quant_algo: FP8
     accuracy: 67.93
+  - quant_algo: FP8
+    extra_acc_spec: temperature=0.8,top_p=0.95
+    accuracy: 64.62
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 67.87
 meta-llama/Llama-3.3-70B-Instruct:
+  - accuracy: 81.28
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 79.31
@@ -114,4 +118,4 @@ nvidia/Llama-3_3-Nemotron-Super-49B-v1:
 nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
   - accuracy: 57.97
 nvidia/Nemotron-H-8B-Base-8K:
-  - accuracy: 87.573
+  - accuracy: 69.590
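The new extra_acc_spec entry matches the spec string the test passes, so non-greedy sampling (temperature=0.8, top_p=0.95) is scored against its own 64.62 reference rather than the greedy 67.93 one. Purely as an illustration of the key=value spec format (parse_spec is a hypothetical helper, not part of the harness):

def parse_spec(spec: str) -> dict:
    # Hypothetical: split "temperature=0.8,top_p=0.95" into key/value pairs.
    return dict(item.split("=", 1) for item in spec.split(","))

assert parse_spec("temperature=0.8,top_p=0.95") == {
    "temperature": "0.8",
    "top_p": "0.95",
}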

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 34 additions & 3 deletions
@@ -16,7 +16,8 @@
 
 from tensorrt_llm._torch import LLM
 from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
-from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig
+from tensorrt_llm.llmapi import (LLM, KvCacheConfig, MTPDecodingConfig,
+                                 SamplingParams)
 from tensorrt_llm.models.modeling_utils import QuantConfig
 from tensorrt_llm.quantization import QuantAlgo
 
@@ -178,9 +179,36 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend,
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)
 
+    @skip_pre_hopper
+    def test_fp8_llm_decoder(self):
+        model_path = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8"
+        pytorch_config = PyTorchConfig(enable_trtllm_decoder=True)
+        llm = LLM(model_path, pytorch_backend_config=pytorch_config)
+        assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+
+        sampling_params = SamplingParams(
+            temperature=0.8,
+            top_p=0.95,
+        )
+
+        with llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm,
+                          sampling_params=sampling_params,
+                          extra_acc_spec="temperature=0.8,top_p=0.95")
+
 
 class TestLlama3_3_70BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-3.3-70B-Instruct"
+    MODEL_PATH = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct"
+
+    @pytest.mark.skip_less_device(8)
+    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH, tensor_parallel_size=8) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
 
     @pytest.mark.skip_less_device(4)
     @pytest.mark.skip_device_not_contain(["H100", "B200"])
@@ -730,10 +758,13 @@ class TestNemotronH(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Nemotron-H-8B-Base-8K"
     MODEL_PATH = f"{llm_models_root()}/Nemotron-H-8B-Base-8K"
 
-    @pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5264431")
     def test_auto_dtype(self):
+        # TODO: remove max_batch_size after mamba cache manager is supported
+        # TODO: check 47b and 56b model
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
-        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
+        with LLM(self.MODEL_PATH,
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=128) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
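Outside the accuracy harness, the same decoder path can be exercised directly through the LLM API. A minimal sketch, assuming a local FP8 checkpoint (the model path and prompt are placeholders, not values from this commit):

from tensorrt_llm._torch import LLM
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.llmapi import SamplingParams

# Enable the TRT-LLM decoder in the PyTorch backend, as in
# test_fp8_llm_decoder above.
pytorch_config = PyTorchConfig(enable_trtllm_decoder=True)
with LLM("/models/Llama-3.1-8B-Instruct-FP8",
         pytorch_backend_config=pytorch_config) as llm:
    params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)
    for output in llm.generate(["The capital of France is"], params):
        print(output.outputs[0].text)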

tests/integration/defs/test_e2e.py

Lines changed: 3 additions & 0 deletions
@@ -1259,6 +1259,7 @@ def test_ptp_quickstart(llm_root, llm_venv):
     ("Llama3.2-11B-BF16", "llama-3.2-models/Llama-3.2-11B-Vision"),
     ("Nemotron4_4B-BF16", "nemotron/Minitron-4B-Base"),
     ("Nemotron-H-8B", "Nemotron-H-8B-Base-8K"),
+    ("Qwen3-30B-A3B", "Qwen3/Qwen3-30B-A3B"),
     pytest.param('Llama3.1-8B-NVFP4',
                  'nvfp4-quantized/Meta-Llama-3.1-8B',
                  marks=skip_pre_blackwell),
@@ -1299,10 +1300,12 @@ def test_ptp_quickstart_advanced(llm_root, llm_venv, model_name, model_path):
                           dir="./",
                           delete=True,
                           delete_on_close=True) as running_log:
+        kv_cache_fraction = 0.6 if "Qwen3" in model_name else None
         llm_venv.run_cmd([
             str(example_root / "quickstart_advanced.py"),
             "--enable_overlap_scheduler",
             "--enable_chunked_prefill",
+            f"--kv_cache_fraction={kv_cache_fraction}",
             "--model_dir",
             f"{llm_models_root()}/{model_path}",
         ],
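One caveat worth noting: the flag is appended unconditionally, so for non-Qwen3 models the command line carries the literal string --kv_cache_fraction=None. A sketch of a stricter variant that only adds the flag when a fraction is chosen (same names as the test above; illustrative, not the committed code):

cmd = [
    str(example_root / "quickstart_advanced.py"),
    "--enable_overlap_scheduler",
    "--enable_chunked_prefill",
]
# Only pass --kv_cache_fraction when a value is actually set.
kv_cache_fraction = 0.6 if "Qwen3" in model_name else None
if kv_cache_fraction is not None:
    cmd.append(f"--kv_cache_fraction={kv_cache_fraction}")
cmd += ["--model_dir", f"{llm_models_root()}/{model_path}"]
llm_venv.run_cmd(cmd)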

tests/integration/test_lists/qa/examples_test_list.txt

Lines changed: 3 additions & 0 deletions
@@ -423,6 +423,8 @@ accuracy/test_llm_api.py::TestMixtral8x7B::test_tp2
 accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2
 accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_decoder
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
 accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
@@ -472,6 +474,7 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.2-11B-BF16-llama-3.2-models/Llama-3.2-11B-Vision]
 test_e2e.py::test_ptp_quickstart_advanced[Nemotron4_4B-BF16-nemotron/Minitron-4B-Base]
 test_e2e.py::test_ptp_quickstart_advanced[Nemotron-H-8B-Nemotron-H-8B-Base-8K]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8]
