Commit c0a115a

Add dsr1 and gpt-oss test case
Signed-off-by: Chenfei Zhang <[email protected]>
1 parent 595f780 commit c0a115a

File tree

5 files changed: +348 −44 lines

(Only three of the five changed files are shown below; the remaining two are presumably the perf-sanity test-list files that add the new dsr1 and gpt-oss cases.)

jenkins/L0_Test.groovy

Lines changed: 1 addition & 0 deletions
@@ -2689,6 +2689,7 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
         // Perf sanity post merge test
         "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4],
+        "DGX_B200-8_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x8", "perf_sanity_l0_dgx_b200", 1, 1, 8],
         "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "perf_sanity_l0_dgx_b300", 1, 1, 4],
     ]
     fullSet += x86SlurmTestConfigs.keySet()
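
A note on the new stage entry: judging from the surrounding entries, the value tuple appears to encode [agent label, test-list name, split index, split count, GPU count], so this stage runs the perf_sanity_l0_dgx_b200 list on an eight-GPU b200-x8 node as a single (1-of-1) shard. This reading is inferred from the pattern of the neighboring entries, not from documentation.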

tests/integration/defs/perf/open_search_db_utils.py

Lines changed: 44 additions & 5 deletions
@@ -20,6 +20,7 @@
 import re
 import sys
 import time
+from datetime import datetime

 from defs.trt_test_alternative import print_info

@@ -40,11 +41,13 @@
     "l_ep",
     "l_pp",
     "l_max_num_tokens",
+    "l_cuda_graph_max_batch_size",
     "b_enable_chunked_prefill",
     "b_disable_overlap_scheduler",
     "s_attention_backend",
     "s_moe_backend",
     "l_moe_max_num_tokens",
+    "l_num_postprocess_workers",
     "l_stream_interval",
     "b_enable_attention_dp",
     "b_attention_dp_balance",
@@ -55,6 +58,11 @@
     "d_free_gpu_memory_fraction",
     "l_max_batch_size",
     "b_enable_padding",
+    "s_spec_decoding_type",
+    "l_num_nextn_predict_layers",
+    "l_eagle3_layers_to_capture",
+    "l_max_draft_len",
+    "s_speculative_model_dir",
 ]

 # Client config fields to compare
@@ -64,6 +72,8 @@
     "l_isl",
     "l_osl",
     "d_random_range_ratio",
+    "s_backend",
+    "b_use_chat_template",
 ]

 # Metrics where larger is better
@@ -189,7 +199,7 @@ def get_job_info():
     }


-def query_history_data():
+def query_history_data(gpu_type):
     """
     Query post-merge data with specific gpu type and model name
     """
@@ -209,6 +219,16 @@ def query_history_data():
                     "b_is_post_merge": True
                 }
             },
+            {
+                "term": {
+                    "b_is_regression": False
+                }
+            },
+            {
+                "term": {
+                    "s_gpu_type": gpu_type
+                }
+            },
             {
                 "range": {
                     "ts_created": {
@@ -339,27 +359,44 @@ def calculate_best_perf_result(history_data_list, new_data):
     return best_metrics


-def get_history_data(new_data_dict):
+def get_history_data(new_data_dict, gpu_type):
     """
     Query history post-merge data for each cmd_idx
     """
+
+    def get_latest_data(data_list):
+        if not data_list:
+            return None
+        time_format = "%b %d, %Y @ %H:%M:%S.%f"
+        # Find the item with the maximum ts_created value
+        latest_data = max(
+            data_list,
+            key=lambda x: datetime.strptime(x["ts_created"], time_format))
+        return latest_data
+
     history_baseline_dict = {}
     history_data_dict = {}
     cmd_idxs = new_data_dict.keys()
     for cmd_idx in cmd_idxs:
         history_data_dict[cmd_idx] = []
-        history_baseline_dict[cmd_idx] = None
-    history_data_list = query_history_data()
+        history_baseline_dict[cmd_idx] = []
+    history_data_list = []
+    if cmd_idxs:
+        history_data_list = query_history_data(gpu_type)
     if history_data_list:
         for history_data in history_data_list:
             for cmd_idx in cmd_idxs:
                 if match(history_data, new_data_dict[cmd_idx]):
                     if history_data.get("b_is_baseline") and history_data.get(
                             "b_is_baseline") == True:
-                        history_baseline_dict[cmd_idx] = history_data
+                        history_baseline_dict[cmd_idx].append(history_data)
                     else:
                         history_data_dict[cmd_idx].append(history_data)
                     break
+    # Sometimes the database holds several baselines; keep only the latest one.
+    for cmd_idx, baseline_list in history_baseline_dict.items():
+        latest_baseline = get_latest_data(baseline_list)
+        history_baseline_dict[cmd_idx] = latest_baseline
     return history_baseline_dict, history_data_dict

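The nested get_latest_data helper picks the newest record by parsing the human-readable ts_created stamp. A self-contained usage sketch, with made-up records:

from datetime import datetime

time_format = "%b %d, %Y @ %H:%M:%S.%f"  # matches stamps like "Mar 15, 2025 @ 09:30:00.000"
baselines = [
    {"ts_created": "Jan 02, 2025 @ 10:00:00.000"},
    {"ts_created": "Mar 15, 2025 @ 09:30:00.000"},
]
latest = max(baselines,
             key=lambda x: datetime.strptime(x["ts_created"], time_format))
print(latest["ts_created"])  # Mar 15, 2025 @ 09:30:00.000
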
@@ -477,6 +514,8 @@ def post_new_perf_data(new_baseline_data_dict, new_data_dict,
     # Only post regressive test cases when post-merge.
     if new_baseline_data_dict:
         data_list.extend(regressive_data_list)
+    if not data_list:
+        return
     try:
         print_info(
             f"Ready to post {len(data_list)} data to {TEST_INFO_PROJECT_NAME}")

tests/integration/defs/perf/test_perf.py

Lines changed: 103 additions & 9 deletions
@@ -103,6 +103,7 @@
     "deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4",
     "deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/",
     "deepseek_r1_0528_fp4": "DeepSeek-R1/DeepSeek-R1-0528-FP4/",
+    "deepseek_r1_0528_fp4_v2": "DeepSeek-R1/DeepSeek-R1-0528-FP4-v2/",
     "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8",
     "deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only",
     "qwen2_7b_instruct": "Qwen2-7B-Instruct",
@@ -513,11 +514,13 @@ def __init__(
         max_num_tokens: int,
         attention_backend: str,
         max_batch_size: int,
+        cuda_graph_max_batch_size: int = 0,
         pp: int = 1,
         enable_chunked_prefill: bool = False,
         disable_overlap_scheduler: bool = False,
         moe_backend: str = "",
         moe_max_num_tokens: int = 0,
+        num_postprocess_workers: int = 0,
         stream_interval: int = 10,
         enable_attention_dp: bool = False,
         attention_dp_balance: bool = False,
@@ -527,6 +530,11 @@ def __init__(
         enable_block_reuse: bool = False,
         free_gpu_memory_fraction: float = 0.8,
         enable_padding: bool = True,
+        spec_decoding_type: str = "",
+        num_nextn_predict_layers: int = 0,
+        eagle3_layers_to_capture: int = 0,
+        max_draft_len: int = 0,
+        speculative_model_dir: str = "",
     ):
         self.name = name
         self.model_name = model_name
@@ -540,6 +548,7 @@ def __init__(
         self.attention_backend = attention_backend
         self.moe_backend = moe_backend
         self.moe_max_num_tokens = moe_max_num_tokens
+        self.num_postprocess_workers = num_postprocess_workers
         self.stream_interval = stream_interval
         self.enable_attention_dp = enable_attention_dp
         self.attention_dp_balance = attention_dp_balance
@@ -549,7 +558,13 @@ def __init__(
         self.enable_block_reuse = enable_block_reuse
         self.free_gpu_memory_fraction = free_gpu_memory_fraction
         self.max_batch_size = max_batch_size
+        self.cuda_graph_max_batch_size = max_batch_size if cuda_graph_max_batch_size == 0 else cuda_graph_max_batch_size
         self.enable_padding = enable_padding
+        self.spec_decoding_type = spec_decoding_type
+        self.num_nextn_predict_layers = num_nextn_predict_layers
+        self.eagle3_layers_to_capture = eagle3_layers_to_capture
+        self.max_draft_len = max_draft_len
+        self.speculative_model_dir = speculative_model_dir

         self.model_path = ""

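The cuda_graph_max_batch_size assignment implements a zero-means-unset fallback. A minimal sketch of that logic in isolation (the function name is ours, for illustration only):

def resolve_cuda_graph_max_batch_size(max_batch_size: int,
                                      cuda_graph_max_batch_size: int = 0) -> int:
    # 0 means "not set": cap CUDA-graph capture at the server's max batch size.
    return max_batch_size if cuda_graph_max_batch_size == 0 else cuda_graph_max_batch_size

assert resolve_cuda_graph_max_batch_size(256) == 256      # default: follow max_batch_size
assert resolve_cuda_graph_max_batch_size(256, 64) == 64   # explicit override wins
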
@@ -567,7 +582,7 @@ def to_cmd(self, working_dir: str) -> List[str]:

     def to_db_data(self) -> dict:
         """Convert ServerConfig to Database data"""
-        return {
+        db_data = {
             "s_model_name": self.model_name.lower(),
             "l_gpus": self.gpus,
             "l_tp": self.tp,
@@ -588,9 +603,30 @@ def to_db_data(self) -> dict:
             "b_enable_block_reuse": self.enable_block_reuse,
             "d_free_gpu_memory_fraction": self.free_gpu_memory_fraction,
             "l_max_batch_size": self.max_batch_size,
+            "l_cuda_graph_max_batch_size": self.cuda_graph_max_batch_size,
             "b_enable_padding": self.enable_padding,
+            "s_spec_decoding_type": self.spec_decoding_type,
+            "l_num_nextn_predict_layers": self.num_nextn_predict_layers,
+            "l_eagle3_layers_to_capture": self.eagle3_layers_to_capture,
+            "l_max_draft_len": self.max_draft_len,
+            "s_speculative_model_dir": self.speculative_model_dir,
             "s_server_log_link": "",
         }
+        if self.num_postprocess_workers > 0:
+            db_data["l_num_postprocess_workers"] = self.num_postprocess_workers
+        if self.spec_decoding_type:
+            db_data["s_spec_decoding_type"] = self.spec_decoding_type
+        if self.num_nextn_predict_layers > 0:
+            db_data[
+                "l_num_nextn_predict_layers"] = self.num_nextn_predict_layers
+        if self.eagle3_layers_to_capture > 0:
+            db_data[
+                "l_eagle3_layers_to_capture"] = self.eagle3_layers_to_capture
+        if self.max_draft_len > 0:
+            db_data["l_max_draft_len"] = self.max_draft_len
+        if self.speculative_model_dir:
+            db_data["s_speculative_model_dir"] = self.speculative_model_dir
+        return db_data

     def generate_extra_llm_api_config(self) -> str:
         """Generate extra-llm-api-config.yml content"""
@@ -599,21 +635,28 @@ def generate_extra_llm_api_config(self) -> str:
             f"moe_expert_parallel_size: {self.ep}",
             f"pipeline_parallel_size: {self.pp}",
             f"max_num_tokens: {self.max_num_tokens}",
+            f"max_batch_size: {self.max_batch_size}",
             f"enable_attention_dp: {str(self.enable_attention_dp).lower()}",
             f"disable_overlap_scheduler: {str(self.disable_overlap_scheduler).lower()}",
-            f"stream_interval: {self.stream_interval}",
             f"attn_backend: {self.attention_backend}",
             f"enable_chunked_prefill: {str(self.enable_chunked_prefill).lower()}",
             "cuda_graph_config:",
             f"  enable_padding: {str(self.enable_padding).lower()}",
-            f"  max_batch_size: {self.max_batch_size}",
+            f"  max_batch_size: {self.cuda_graph_max_batch_size}",
             "kv_cache_config:",
             f"  dtype: {self.kv_cache_dtype}",
             f"  free_gpu_memory_fraction: {self.free_gpu_memory_fraction}",
             f"  enable_block_reuse: {str(self.enable_block_reuse).lower()}",
             "print_iter_log: false",
         ]

+        if self.stream_interval > 0:
+            config_lines.append(f"stream_interval: {self.stream_interval}")
+
+        if self.num_postprocess_workers > 0:
+            config_lines.append(
+                f"num_postprocess_workers: {self.num_postprocess_workers}")
+
         # Add moe_config if moe_backend is specified
         if self.moe_backend:
             config_lines.append("moe_config:")
@@ -629,6 +672,25 @@ def generate_extra_llm_api_config(self) -> str:
                     f"  batching_wait_iters: {self.batching_wait_iters}")
             config_lines.append(f"  timeout_iters: {self.timeout_iters}")

+        if self.spec_decoding_type:
+            config_lines.append("speculative_config:")
+            config_lines.append(f"  decoding_type: {self.spec_decoding_type}")
+            if self.num_nextn_predict_layers > 0:
+                config_lines.append(
+                    f"  num_nextn_predict_layers: {self.num_nextn_predict_layers}"
+                )
+            if self.eagle3_layers_to_capture > 0:
+                config_lines.append(
+                    f"  eagle3_layers_to_capture: {self.eagle3_layers_to_capture}"
+                )
+            if self.max_draft_len > 0:
+                config_lines.append(f"  max_draft_len: {self.max_draft_len}")
+            if self.speculative_model_dir:
+                spec_model_dir = os.path.join(llm_models_root(),
+                                              self.speculative_model_dir)
+                config_lines.append(
+                    f"  speculative_model_dir: {spec_model_dir}")
+
         return "\n".join(config_lines)

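With speculative decoding configured, the generated extra-llm-api-config.yml gains a block like the following (the decoding type and counts are illustrative values, traced through the format strings above):

speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 3
  max_draft_len: 3
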
@@ -644,22 +706,26 @@ def __init__(self,
                  iterations: int,
                  isl: int,
                  osl: int,
-                 random_range_ratio: float = 0.0):
+                 random_range_ratio: float = 0.0,
+                 backend: str = "",
+                 use_chat_template: bool = False):
         self.name = name
         self.model_name = model_name
         self.concurrency = concurrency
         self.iterations = iterations
         self.isl = isl
         self.osl = osl
         self.random_range_ratio = random_range_ratio
+        self.backend = backend
+        self.use_chat_template = use_chat_template

         self.model_path = ""

     def to_cmd(self, working_dir: str) -> List[str]:
         model_dir = get_model_dir(self.model_name)
         self.model_path = model_dir if os.path.exists(
             model_dir) else self.model_name
-        return [
+        benchmark_cmd = [
             "python", "-m", "tensorrt_llm.serve.scripts.benchmark_serving",
             "--model", self.model_path, "--dataset-name", "random",
             "--random-ids", "--num-prompts",
@@ -670,17 +736,30 @@ def to_cmd(self, working_dir: str) -> List[str]:
             "--percentile-metrics", "ttft,tpot,itl,e2el", "--max-concurrency",
             str(self.concurrency)
         ]
+        if self.backend:
+            benchmark_cmd.append("--backend")
+            benchmark_cmd.append(self.backend)
+        if self.use_chat_template:
+            benchmark_cmd.append("--use-chat-template")
+        return benchmark_cmd

     def to_db_data(self) -> dict:
         """Convert ClientConfig to Database data"""
-        return {
+        db_data = {
             "l_concurrency": self.concurrency,
             "l_iterations": self.iterations,
             "l_isl": self.isl,
             "l_osl": self.osl,
             "d_random_range_ratio": self.random_range_ratio,
+            "s_backend": self.backend,
+            "b_use_chat_template": self.use_chat_template,
             "s_client_log_link": "",
         }
+        if self.backend:
+            db_data["s_backend"] = self.backend
+        if self.use_chat_template:
+            db_data["b_use_chat_template"] = self.use_chat_template
+        return db_data


 def parse_select_pattern(select_pattern: str):
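
The effect of the two new ClientConfig fields on the benchmark invocation, as a minimal sketch (the backend value is hypothetical, and the elided middle of the argument list is unchanged):

benchmark_cmd = ["python", "-m", "tensorrt_llm.serve.scripts.benchmark_serving"]
backend, use_chat_template = "openai-chat", True  # hypothetical client settings
if backend:
    benchmark_cmd += ["--backend", backend]
if use_chat_template:
    benchmark_cmd += ["--use-chat-template"]
# benchmark_cmd now ends with: --backend openai-chat --use-chat-template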
@@ -785,7 +864,18 @@ def parse_config_file(config_file_path: str, select_pattern: str = None):
             free_gpu_memory_fraction=server_config_data.get(
                 'free_gpu_memory_fraction', 0.8),
             max_batch_size=server_config_data.get('max_batch_size', 256),
-            enable_padding=server_config_data.get('enable_padding', True))
+            cuda_graph_max_batch_size=server_config_data.get(
+                'cuda_graph_max_batch_size', 0),
+            enable_padding=server_config_data.get('enable_padding', True),
+            spec_decoding_type=server_config_data.get('spec_decoding_type', ""),
+            num_nextn_predict_layers=server_config_data.get(
+                'num_nextn_predict_layers', 0),
+            eagle3_layers_to_capture=server_config_data.get(
+                'eagle3_layers_to_capture', 0),
+            max_draft_len=server_config_data.get('max_draft_len', 0),
+            speculative_model_dir=server_config_data.get(
+                'speculative_model_dir', ""),
+        )

         server_id = len(server_configs)
         server_configs.append(server_config)
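
For reference, a server entry in the perf-sanity config file could now carry keys like these (the file layout is assumed from the server_config_data.get() calls above; all values are illustrative):

  max_batch_size: 256
  cuda_graph_max_batch_size: 64
  spec_decoding_type: MTP
  num_nextn_predict_layers: 3
  max_draft_len: 3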
@@ -812,7 +902,11 @@ def parse_config_file(config_file_path: str, select_pattern: str = None):
                 isl=client_config_data.get('isl', 1024),
                 osl=client_config_data.get('osl', 1024),
                 random_range_ratio=client_config_data.get(
-                    'random_range_ratio', 0.0))
+                    'random_range_ratio', 0.0),
+                backend=client_config_data.get('backend', ""),
+                use_chat_template=client_config_data.get(
+                    'use_chat_template', False),
+            )
             client_configs.append(client_config)

         server_client_configs[server_id] = client_configs
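
Correspondingly, a client entry could opt into the new benchmark options (again illustrative values, matching the keys read above):

  backend: openai-chat
  use_chat_template: true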
@@ -2114,7 +2208,7 @@ def upload_test_results_to_database(self):

         # Get history data for each cmd_idx
         history_baseline_dict, history_data_dict = get_history_data(
-            new_data_dict)
+            new_data_dict, self._config.gpu_type)
         # Prepare regressive test cases
         regressive_data_list = prepare_regressive_test_cases(
             history_baseline_dict, new_data_dict)
