
Commit 4d0e462

ruodil and LarryXFly authored
tests: skip writing prepare_dataset output to logs, and add llama_v3.1_8b_fp8, llama_v3.3_70b_fp8, llama_v3.1_405b_fp4 models (NVIDIA#3864)
* tests: skip writing prepare_dataset output to logs

  Signed-off-by: Ruodi <[email protected]>

* test: add llama_v3.1_8b_fp8 model, llama_v3.1_405b model and llama_nemotron_49b model in perf test, and modify original llama models dtype from float16 to bfloat16 according to README.md

  Signed-off-by: Ruodi <[email protected]>

---------

Signed-off-by: Ruodi <[email protected]>
Signed-off-by: Larry <[email protected]>
Co-authored-by: Larry <[email protected]>
1 parent 0446270 commit 4d0e462

File tree

6 files changed: +110 -84 lines changed

tests/integration/defs/perf/test_perf.py

Lines changed: 10 additions & 4 deletions
@@ -44,16 +44,22 @@
     "llama_v2_70b": "llama-models-v2/llama-v2-70b-hf", # not safetensors repo
     "llama_v3.1_8b": "llama-3.1-model/Meta-Llama-3.1-8B",
     "llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct",
+    "llama_v3.1_8b_instruct_fp8": "llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
     "llama_v3.1_70b": "llama-3.1-model/Meta-Llama-3.1-70B",
+    "llama_v3.3_70b_instruct_fp8":
+    "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8",
+    "llama_v3.1_405b_instruct_fp4":
+    "llm-models/modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4",
     "llama_v3.1_70b_instruct": "llama-3.1-model/Meta-Llama-3.1-70B-Instruct",
     "llama_v3.2_11b": "llama-3.2-models/Llama-3.2-11B-Vision",
+    "llama_v3.3_nemotron_49b": "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1/",
     "llama_v3.1_nemotron_nano_8b": "Llama-3.1-Nemotron-Nano-8B-v1",
     # "llama_30b": "llama-models/llama-30b-hf",
     "mixtral_8x7b_v0.1": "Mixtral-8x7B-v0.1",
     "mixtral_8x7b_v0.1_instruct": "Mixtral-8x7B-Instruct-v0.1",
     "mixtral_8x22b_v0.1": "Mixtral-8x22B-v0.1",
     "mistral_7b_v0.1": "mistral-7b-v0.1",
-    "deepseek_r1": "DeepSeek-R1/DeepSeek-R1",
+    "deepseek_r1_fp8": "DeepSeek-R1/DeepSeek-R1",
     "deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4",
     "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8",
     "deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only",
@@ -596,7 +602,7 @@ def validate(self):
         assert self.mode in VALID_MODES, f"Invalid mode {self.mode}!"

         # Validate dtype.
-        VALID_DTYPES = ["float32", "float16", "bfloat16"]
+        VALID_DTYPES = ["float32", "float16", "bfloat16", "float8", "float4"]
         assert self.data_type in VALID_DTYPES, f"Invalid data_type {self.data_type}!"

         # Validate quantization mode.
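
The widened VALID_DTYPES list lets test names carry float8 and float4 tokens for pre-quantized checkpoints (e.g. llama_v3.3_70b_instruct_fp8) instead of pairing float16 with a -quant: flag. A trimmed, self-contained sketch of the check; the class name and constructor are stand-ins, only the two lines inside validate() come from the diff:

class PerfConfigSketch:
    def __init__(self, data_type="bfloat16"):
        self.data_type = data_type

    def validate(self):
        # The two lines below are verbatim from the diff; the surrounding
        # class is a stand-in for the real config object in test_perf.py.
        VALID_DTYPES = ["float32", "float16", "bfloat16", "float8", "float4"]
        assert self.data_type in VALID_DTYPES, f"Invalid data_type {self.data_type}!"

PerfConfigSketch("float8").validate()  # accepted after this change
PerfConfigSketch("float4").validate()  # accepted after this change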
@@ -978,8 +984,8 @@ def get_prepare_data_command(self, engine_dir, input_len,
         nloras = self._config.num_loras
         lora_data = os.path.join(engine_dir,
                                  f"token-norm-dist-lora-{nloras}.json")
-        with open(lora_data, 'w') as file:
-            pass
+        # with open(lora_data, 'w') as file:
+        #     pass
         data_cmd += [
             "python3", prepare_data_script, f"--output={lora_data}",
             f"--rand-task-id 0 {nloras-1}", f"--tokenizer={tokenizer_dir}",

tests/integration/defs/perf/utils.py

Lines changed: 3 additions & 1 deletion
@@ -425,7 +425,9 @@ def run_ex(self,
                 print(collect_and_clean_myelin_time(output))

             # Print the output log to stdout and cache it.
-            print(buf.getvalue())
+            # skip the output log for prepare dataset command
+            if 'prepare_dataset' not in commands.get_cmd_str(cmd_idx):
+                print(buf.getvalue())
             outputs[cmd_idx] = buf.getvalue()
         else:
             print_info(f"Reusing cached logs for command index {cmd_idx}.")
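
The new guard keeps the verbose dataset-generation output out of the printed log while still caching it in outputs[cmd_idx] for later steps. A self-contained sketch of the pattern, assuming a simplified stand-in for the real command container in utils.py:

import io

class CommandListSketch:
    """Stand-in for the command container that run_ex() consults."""
    def __init__(self, cmds):
        self._cmds = cmds

    def get_cmd_str(self, idx):
        return self._cmds[idx]

commands = CommandListSketch([
    "python3 prepare_dataset.py --output=token-norm-dist.json",
    "trtllm-bench throughput",
])
outputs = {}
for cmd_idx in range(2):
    buf = io.StringIO()
    buf.write(f"captured output of command {cmd_idx}\n")
    # Mirror of the new guard: print everything except prepare_dataset noise,
    # but always cache the log so later code can still inspect it.
    if 'prepare_dataset' not in commands.get_cmd_str(cmd_idx):
        print(buf.getvalue())
    outputs[cmd_idx] = buf.getvalue()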

tests/integration/test_lists/qa/trt_llm_release_perf_cluster_test.yml

Lines changed: 24 additions & 22 deletions
@@ -5,16 +5,23 @@ trt_llm_release_perf_cluster_test:
       system_gpu_count:
         gte: 1
   tests:
-  - perf/test_perf.py::test_perf[llama_v3_8b_instruct-bench-float16-input_output_len:128,128]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-float16-maxbs:256-input_output_len:128,128-quant:fp8]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-float16-maxbs:256-input_output_len:512,32-quant:fp8]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-float16-input_output_len:128,128-quant:nvfp4-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-float16-maxbs:256-input_output_len:128,128-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-float16-maxbs:256-input_output_len:512,32-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v2_13b-bench-float16-input_output_len:128,128-loras:8-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3_8b_instruct-bench-bfloat16-input_output_len:128,128]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:128,128-quant:fp8]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:512,32-quant:fp8]
   - perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20]
   - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:1000,1000-quant:fp8]
   - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:500,2000-quant:fp8]
+
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 2
+  tests:
+  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:nvfp4-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:256-input_output_len:128,128-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:256-input_output_len:512,32-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v2_13b-bench-float16-input_output_len:128,128-loras:8-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:2]
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-quant:fp8-gpus:2]

 # Tests for systems with 4+ GPUs
@@ -27,28 +34,23 @@ trt_llm_release_perf_cluster_test:
   - perf/test_perf.py::test_perf[qwen_14b_chat-bench-float16-input_output_len:128,128-gpus:4]
   - perf/test_perf.py::test_perf[qwen_14b_chat-bench-float16-input_output_len:512,32-gpus:4]
   - perf/test_perf.py::test_perf[starcoder_15b-bench-float16-input_output_len:512,200-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:4]

 # Tests for systems with 8+ GPUs
 - condition:
     ranges:
       system_gpu_count:
         gte: 8
   tests:
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-gpus:8]
   - perf/test_perf.py::test_perf[mixtral_8x22b_v0.1-bench-float16-input_output_len:512,512-quant:fp8-tp:8]
   - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,128-reqs:80-gpus:8]
   - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8]
-
-  # GB chip specific tests with high memory
-  - condition:
-      wildcards:
-        gpu:
-        - '*b100*'
-        linux_distribution_name: '*'
-    tests:
-      - perf/test_perf.py::test_perf[deepseek_r1-bench-pytorch-float16-maxbs:512-input_output_len:128,128-quant:fp8-ep:8-tp:8-gpus:8]
-      - perf/test_perf.py::test_perf[deepseek_r1-bench-pytorch-float16-maxbs:1-input_output_len:1000,2000-quant:fp8-reqs:10-ep:4-tp:8-gpus:8] #min latency test
-      - perf/test_perf.py::test_perf[deepseek_r1-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:fp8-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test
-      - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:512-input_output_len:128,128-quant:nvfp4-ep:8-tp:8-gpus:8]
-      - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float16-maxbs:512-input_output_len:1000,1000-quant:nvfp4-con:4096-ep:8-tp:8-gpus:8]
-      - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:1-input_output_len:1000,2000-quant:nvfp4-reqs:10-ep:4-tp:8-gpus:8] #min latency test
-      - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:nvfp4-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test
+  - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test
+  - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test
+  - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float4-maxbs:512-input_output_len:1000,1000-con:4096-ep:8-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test
+  - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test
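
Each bracketed test ID in these lists packs the whole perf configuration into dash-separated tokens: the model label, flag tokens such as bench/pytorch/streaming and the dtype, and key:value knobs such as maxbs, input_output_len, quant, and gpus. A rough illustration of how such an ID could be split apart; the parser below is an assumption for readability, not the parsing code test_perf.py actually uses:

def split_perf_test_id(test_id):
    # e.g. "llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:8"
    tokens = test_id.split("-")
    parsed = {"model": tokens[0], "flags": [], "options": {}}
    for tok in tokens[1:]:
        if ":" in tok:
            key, value = tok.split(":", 1)
            parsed["options"][key] = value  # e.g. maxbs, gpus, input_output_len
        else:
            parsed["flags"].append(tok)  # e.g. bench, pytorch, streaming, float4
    return parsed

print(split_perf_test_id(
    "llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:8"))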

tests/integration/test_lists/qa/trt_llm_release_perf_sanity_test.yml

Lines changed: 18 additions & 15 deletions
@@ -24,7 +24,9 @@ trt_llm_release_perf_sanity_test:
   - perf/test_perf.py::test_perf[flan_t5_base-bench-float16-input_output_len:128,20]
   - perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20]
   - perf/test_perf.py::test_perf[whisper_large_v3-bench-float16-input_output_len:128,20]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128+512,32]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:128,128]

 # Test list validation
 - test_list_validation.py::test_list_validation
@@ -41,7 +43,7 @@ trt_llm_release_perf_sanity_test:
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-cppmanager-exe-static_batching-plugin_ifb-float16-bs:8+64-input_output_len:128,128+512,32]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-cppmanager-exe-plugin_ifb-bfloat16-gwp:0.0-input_output_len:128,128+512,32]

-  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-float16-input_output_len:128,128]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32]
   - perf/test_perf.py::test_perf[qwen2_7b_instruct-bench-float16-input_output_len:128,128]
@@ -51,8 +53,8 @@ trt_llm_release_perf_sanity_test:
     terms:
       supports_fp8: true
   tests:
-  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-float16-input_output_len:128,128-quant:fp8]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-float16-input_output_len:512,32-quant:fp8]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:fp8]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32-quant:fp8]

 # Tests for systems with 2+ GPUs
 - condition:
@@ -62,10 +64,10 @@ trt_llm_release_perf_sanity_test:
   tests:
   - perf/test_perf.py::test_perf[t5-bench-float16-maxbs:1-input_output_len:128,20-gpus:2]
   - perf/test_perf.py::test_perf[flan_t5_large-bench-float16-maxbs:1-input_output_len:128,20-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-float16-input_output_len:128,128-quant:int8_sq-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-float16-input_output_len:128,128-quant:int8_wo-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-float16-maxbs:256-input_output_len:128,128-gpu:2]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-float16-input_output_len:128,128-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:int8_sq-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:int8_wo-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:128,128-gpu:2]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:128,128-gpus:2]
   - perf/test_perf.py::test_perf[llama_v3.2_11b-bench-float16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2]

 # FP8 tests for systems with 2+ GPUs
@@ -77,7 +79,7 @@ trt_llm_release_perf_sanity_test:
         gte: 2
   tests:
   - perf/test_perf.py::test_perf[llama_v3.1_8b-cppmanager-exe-plugin_ifb-float16-mp-input_output_len:128,128-quant:fp8-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-float16-input_output_len:128,128-quant:fp8-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:fp8-gpus:2]
   - perf/test_perf.py::test_perf[llama_v3.2_11b-bench-float16-input_output_len:128,128-quant:fp8-gpus:2]
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-quant:fp8-gpus:2]

@@ -89,7 +91,7 @@ trt_llm_release_perf_sanity_test:
       gpu_memory:
         gt: 80000
   tests:
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-float16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2]
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-gpus:2]

 # Tests for systems with 4+ GPUs
@@ -98,7 +100,7 @@ trt_llm_release_perf_sanity_test:
       system_gpu_count:
         gte: 4
   tests:
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-float16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]
   - perf/test_perf.py::test_perf[qwen_14b_chat-bench-float16-input_output_len:128,128-gpus:4]
   - perf/test_perf.py::test_perf[starcoder_15b-bench-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4]

@@ -108,8 +110,8 @@ trt_llm_release_perf_sanity_test:
       system_gpu_count:
         gte: 8
   tests:
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-float16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-float16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]

 # Tests for systems with 8+ GPUs and high memory
 - condition:
@@ -129,5 +131,6 @@ trt_llm_release_perf_sanity_test:
       system_gpu_count:
         gte: 8
   tests:
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-float16-maxbs:1-input_output_len:128,128-quant:fp8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-float16-maxbs:1-input_output_len:512,32-quant:fp8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:128,128-quant:fp8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:512,32-quant:fp8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:8]
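
The condition blocks in these YAML lists gate each test group on machine properties: system_gpu_count ranges, gpu_memory ranges, and terms such as supports_fp8. A hedged sketch of how a gte/gt range could be evaluated; the function name and the dict-based spec are illustrative, not the harness's real scheduler code:

def range_matches(system_value, spec):
    # spec mirrors a YAML range block such as {"gte": 8} or {"gt": 80000}.
    ops = {"gte": lambda v, b: v >= b, "gt": lambda v, b: v > b}
    return all(ops[op](system_value, bound) for op, bound in spec.items())

print(range_matches(8, {"gte": 8}))         # True: an 8-GPU node runs the gte: 8 group
print(range_matches(4, {"gte": 8}))         # False: a 4-GPU node skips it
print(range_matches(96000, {"gt": 80000}))  # True: satisfies the high-memory group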
