Commit c4a0d76

tests: add qa test mentioned in docs (NVIDIA#4357)
* add nemotron-h and llama_70b cases
* trial
* add llm decoder quick_start case
* update nemotron-h test case
* add qwen3 quickstart test
* add trtllm_decoder accuracy test
* remove quickstart test for llm_decoder
* fix import error
* nemotronh fp8 trial
* fix name
* remove nemotronh-fp8

Signed-off-by: Ivy Zhang <[email protected]>
1 parent 791c209 commit c4a0d76

File tree

6 files changed: +50 −13 lines


tests/integration/defs/accuracy/accuracy_core.py

Lines changed: 11 additions & 4 deletions
@@ -146,7 +146,8 @@ def get_num_samples_and_threshold(self, **acc_specs):
     def evaluate(self,
                  llm: Union[LLM, PyTorchLLM],
                  extra_acc_spec: Optional[str] = None,
-                 extra_evaluator_kwargs: Optional[dict] = None):
+                 extra_evaluator_kwargs: Optional[dict] = None,
+                 sampling_params: Optional[SamplingParams] = None):
         assert self.EVALUATOR_CLS is not None
 
         if llm.args.speculative_config is None:
@@ -175,9 +176,15 @@ def evaluate(self,
             spec_dec_algo=spec_dec_algo,
             extra_acc_spec=extra_acc_spec)
 
-        sampling_params = SamplingParams(
-            max_tokens=self.MAX_OUTPUT_LEN,
-            truncate_prompt_tokens=self.MAX_INPUT_LEN)
+        if sampling_params is None:
+            sampling_params = SamplingParams(
+                max_tokens=self.MAX_OUTPUT_LEN,
+                truncate_prompt_tokens=self.MAX_INPUT_LEN)
+        else:
+            if sampling_params.max_tokens is None:
+                sampling_params.max_tokens = self.MAX_OUTPUT_LEN
+            if sampling_params.truncate_prompt_tokens is None:
+                sampling_params.truncate_prompt_tokens = self.MAX_INPUT_LEN
 
         evaluator_kwargs = {}
         if self.EVALUATOR_KWARGS is not None:
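With this change, evaluate() accepts an optional SamplingParams; any field left unset falls back to the harness defaults (MAX_OUTPUT_LEN / MAX_INPUT_LEN). A minimal caller-side sketch, mirroring the new test added below and assuming llm and the MMLU task come from the surrounding accuracy harness:

from tensorrt_llm.llmapi import SamplingParams

# Only the sampling behaviour is overridden; max_tokens and
# truncate_prompt_tokens stay None, so evaluate() fills them in
# from MAX_OUTPUT_LEN and MAX_INPUT_LEN.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

task = MMLU("meta-llama/Llama-3.1-8B-Instruct")
task.evaluate(llm,
              sampling_params=sampling_params,
              extra_acc_spec="temperature=0.8,top_p=0.95")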

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 2 additions & 0 deletions
@@ -64,3 +64,5 @@ Qwen3/Qwen3-30B-A3B:
   accuracy: 83.43
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
 - accuracy: 92.57
+nvidia/Nemotron-H-8B-Base-8K:
+- accuracy: 46.20

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 4 additions & 1 deletion
@@ -22,6 +22,9 @@ meta-llama/Llama-3.1-8B-Instruct:
 - accuracy: 68.17
 - quant_algo: FP8
   accuracy: 67.93
+- quant_algo: FP8
+  extra_acc_spec: temperature=0.8,top_p=0.95
+  accuracy: 64.62
 - quant_algo: FP8
   kv_cache_quant_algo: FP8
   accuracy: 67.87
@@ -117,4 +120,4 @@ nvidia/Llama-3_3-Nemotron-Super-49B-v1:
 nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
 - accuracy: 57.97
 nvidia/Nemotron-H-8B-Base-8K:
-- accuracy: 87.573
+- accuracy: 69.590
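The new FP8 entry keyed by extra_acc_spec records a separate MMLU threshold for the temperature=0.8, top_p=0.95 run. An illustrative sketch of how such keyed entries can be resolved; the pick_reference helper and the file path are assumptions for illustration, not the harness's actual lookup code:

import yaml

# Hypothetical helper: return the accuracy for the entry whose
# quant_algo and extra_acc_spec both match the current run.
def pick_reference(entries, quant_algo=None, extra_acc_spec=None):
    for entry in entries:
        if (entry.get("quant_algo") == quant_algo
                and entry.get("extra_acc_spec") == extra_acc_spec):
            return entry["accuracy"]
    raise KeyError("no matching reference entry")

with open("tests/integration/defs/accuracy/references/mmlu.yaml") as f:
    refs = yaml.safe_load(f)

entries = refs["meta-llama/Llama-3.1-8B-Instruct"]
print(pick_reference(entries, "FP8", "temperature=0.8,top_p=0.95"))  # 64.62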

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 24 additions & 3 deletions
@@ -16,7 +16,7 @@
 
 from tensorrt_llm._torch import LLM
 from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
-from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig
+from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig, SamplingParams
 from tensorrt_llm.models.modeling_utils import QuantConfig
 from tensorrt_llm.quantization import QuantAlgo
 
@@ -183,6 +183,24 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend,
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)
 
+    @skip_pre_hopper
+    def test_fp8_llm_decoder(self):
+        model_path = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8"
+        pytorch_config = PyTorchConfig(enable_trtllm_decoder=True)
+        llm = LLM(model_path, pytorch_backend_config=pytorch_config)
+        assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+
+        sampling_params = SamplingParams(
+            temperature=0.8,
+            top_p=0.95,
+        )
+
+        with llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm,
+                          sampling_params=sampling_params,
+                          extra_acc_spec="temperature=0.8,top_p=0.95")
+
 
 class TestLlama3_3_70BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-3.3-70B-Instruct"
@@ -831,10 +849,13 @@ class TestNemotronH(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Nemotron-H-8B-Base-8K"
     MODEL_PATH = f"{llm_models_root()}/Nemotron-H-8B-Base-8K"
 
-    @pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5264431")
     def test_auto_dtype(self):
+        # TODO: remove max_batch_size after mamba cache manager is supported
+        # TODO: check 47b and 56b model
         kv_cache_config = KvCacheConfig(enable_block_reuse=False)
-        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
+        with LLM(self.MODEL_PATH,
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=128) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
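The new test_fp8_llm_decoder turns on the TRT-LLM decoder via PyTorchConfig(enable_trtllm_decoder=True) and evaluates MMLU with temperature=0.8, top_p=0.95. A standalone sketch of the same configuration outside pytest; the model path placeholder and the generate() call are illustrative, not part of the diff:

from tensorrt_llm._torch import LLM
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.llmapi import SamplingParams

# Enable the TRT-LLM decoder in the PyTorch backend, as the new test does.
pytorch_config = PyTorchConfig(enable_trtllm_decoder=True)

llm = LLM("/path/to/llama-3.1-model/Llama-3.1-8B-Instruct-FP8",  # placeholder path
          pytorch_backend_config=pytorch_config)

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

with llm:
    # Quick smoke check; the committed test runs the MMLU task instead.
    outputs = llm.generate(["The capital of France is"], sampling_params)
    print(outputs[0].outputs[0].text)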

tests/integration/defs/test_e2e.py

Lines changed: 7 additions & 5 deletions
@@ -1257,6 +1257,7 @@ def test_ptp_quickstart(llm_root, llm_venv):
     ("Llama3.2-11B-BF16", "llama-3.2-models/Llama-3.2-11B-Vision"),
     ("Nemotron4_4B-BF16", "nemotron/Minitron-4B-Base"),
     ("Nemotron-H-8B", "Nemotron-H-8B-Base-8K"),
+    ("Qwen3-30B-A3B", "Qwen3/Qwen3-30B-A3B"),
     pytest.param('Llama3.1-8B-NVFP4',
                  'nvfp4-quantized/Meta-Llama-3.1-8B',
                  marks=skip_pre_blackwell),
@@ -1300,13 +1301,14 @@ def test_ptp_quickstart_advanced(llm_root, llm_venv, model_name, model_path):
             dir="./",
             delete=True,
             delete_on_close=True) as running_log:
-        llm_venv.run_cmd([
+        cmds = [
             str(example_root / "quickstart_advanced.py"),
             "--enable_chunked_prefill",
-            "--model_dir",
-            f"{llm_models_root()}/{model_path}",
-        ],
-                         running_log=running_log)
+            f"--model_dir={llm_models_root()}/{model_path}",
+        ]
+        if "Qwen3" in model_name:
+            cmds.append(f"--kv_cache_fraction=0.6")
+        llm_venv.run_cmd(cmds, running_log=running_log)
     if model_name in mapping:
         _check_mem_usage(running_log, [mapping[model_name], 0, 0, 0])
 
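For the Qwen3-30B-A3B case the test now appends --kv_cache_fraction=0.6 to the quickstart command. A rough equivalent of what the test ends up running, sketched with subprocess; the script and model paths are placeholders for example_root and llm_models_root():

import subprocess

cmd = [
    "python", "examples/pytorch/quickstart_advanced.py",   # placeholder for example_root
    "--enable_chunked_prefill",
    "--model_dir=/path/to/models/Qwen3/Qwen3-30B-A3B",     # placeholder for llm_models_root()
    "--kv_cache_fraction=0.6",  # appended only for Qwen3 models
]
subprocess.run(cmd, check=True)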

tests/integration/test_lists/qa/examples_test_list.txt

Lines changed: 2 additions & 0 deletions
@@ -423,6 +423,7 @@ accuracy/test_llm_api.py::TestMixtral8x7B::test_tp2
 accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2
 accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_decoder
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
 accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
@@ -472,6 +473,7 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.2-11B-BF16-llama-3.2-models/Llama-3.2-11B-Vision]
 test_e2e.py::test_ptp_quickstart_advanced[Nemotron4_4B-BF16-nemotron/Minitron-4B-Base]
 test_e2e.py::test_ptp_quickstart_advanced[Nemotron-H-8B-Nemotron-H-8B-Base-8K]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8]
