
Commit b1dc8c0

Add dsr1 and gpt-oss test cases

Signed-off-by: Chenfei Zhang <[email protected]>
1 parent: 271a981

4 files changed: +373 −6 lines

tests/integration/defs/perf/open_search_db_utils.py

Lines changed: 9 additions & 0 deletions
@@ -40,11 +40,13 @@
     "l_ep",
     "l_pp",
     "l_max_num_tokens",
+    "l_cuda_graph_max_batch_size",
     "b_enable_chunked_prefill",
     "b_disable_overlap_scheduler",
     "s_attention_backend",
     "s_moe_backend",
     "l_moe_max_num_tokens",
+    "l_num_postprocess_workers",
     "l_stream_interval",
     "b_enable_attention_dp",
     "b_attention_dp_balance",
@@ -55,6 +57,11 @@
     "d_free_gpu_memory_fraction",
     "l_max_batch_size",
     "b_enable_padding",
+    "s_spec_decoding_type",
+    "l_num_nextn_predict_layers",
+    "l_eagle3_layers_to_capture",
+    "l_max_draft_len",
+    "s_speculative_model_dir",
 ]

 # Client config fields to compare
@@ -64,6 +71,8 @@
     "l_isl",
     "l_osl",
     "d_random_range_ratio",
+    "s_backend",
+    "b_use_chat_template",
 ]

 # Metrics where larger is better
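The new columns follow the file's existing naming scheme, in which a one-letter prefix appears to encode the field type stored in OpenSearch: l_ for integers, b_ for booleans, s_ for strings, and d_ for floats. A minimal sketch of a comparison record carrying some of the new fields (values purely illustrative):

    # Illustrative record; field names come from the lists above.
    record = {
        "l_cuda_graph_max_batch_size": 256,   # l_ -> integer
        "s_spec_decoding_type": "MTP",        # s_ -> string
        "b_use_chat_template": True,          # b_ -> boolean
        "d_random_range_ratio": 0.2,          # d_ -> float
    }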

tests/integration/defs/perf/test_perf.py

Lines changed: 83 additions & 6 deletions
@@ -103,6 +103,7 @@
     "deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4",
     "deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/",
     "deepseek_r1_0528_fp4": "DeepSeek-R1/DeepSeek-R1-0528-FP4/",
+    "deepseek_r1_0528_fp4_v2": "DeepSeek-R1/DeepSeek-R1-0528-FP4-v2/",
     "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8",
     "deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only",
     "qwen2_7b_instruct": "Qwen2-7B-Instruct",
@@ -513,11 +514,13 @@ def __init__(
         max_num_tokens: int,
         attention_backend: str,
         max_batch_size: int,
+        cuda_graph_max_batch_size: int = 0,
         pp: int = 1,
         enable_chunked_prefill: bool = False,
         disable_overlap_scheduler: bool = False,
         moe_backend: str = "",
         moe_max_num_tokens: int = 0,
+        num_postprocess_workers: int = 0,
         stream_interval: int = 10,
         enable_attention_dp: bool = False,
         attention_dp_balance: bool = False,
@@ -527,6 +530,11 @@ def __init__(
         enable_block_reuse: bool = False,
         free_gpu_memory_fraction: float = 0.8,
         enable_padding: bool = True,
+        spec_decoding_type: str = "",
+        num_nextn_predict_layers: int = 0,
+        eagle3_layers_to_capture: int = 0,
+        max_draft_len: int = 0,
+        speculative_model_dir: str = "",
     ):
         self.name = name
         self.model_name = model_name
@@ -540,6 +548,7 @@ def __init__(
         self.attention_backend = attention_backend
         self.moe_backend = moe_backend
         self.moe_max_num_tokens = moe_max_num_tokens
+        self.num_postprocess_workers = num_postprocess_workers
         self.stream_interval = stream_interval
         self.enable_attention_dp = enable_attention_dp
         self.attention_dp_balance = attention_dp_balance
@@ -549,7 +558,13 @@ def __init__(
         self.enable_block_reuse = enable_block_reuse
         self.free_gpu_memory_fraction = free_gpu_memory_fraction
         self.max_batch_size = max_batch_size
+        self.cuda_graph_max_batch_size = max_batch_size if cuda_graph_max_batch_size == 0 else cuda_graph_max_batch_size
         self.enable_padding = enable_padding
+        self.spec_decoding_type = spec_decoding_type
+        self.num_nextn_predict_layers = num_nextn_predict_layers
+        self.eagle3_layers_to_capture = eagle3_layers_to_capture
+        self.max_draft_len = max_draft_len
+        self.speculative_model_dir = speculative_model_dir

         self.model_path = ""
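The only non-obvious assignment above is the cuda_graph_max_batch_size fallback: leaving the new argument at its 0 default preserves the old behavior of sizing CUDA graphs to the server's max_batch_size. A quick standalone sketch of both cases (values illustrative):

    # Default 0 inherits max_batch_size (old behavior preserved).
    max_batch_size, cuda_graph_max_batch_size = 512, 0
    effective = max_batch_size if cuda_graph_max_batch_size == 0 else cuda_graph_max_batch_size
    assert effective == 512

    # An explicit value caps CUDA graph capture below the server limit.
    cuda_graph_max_batch_size = 128
    effective = max_batch_size if cuda_graph_max_batch_size == 0 else cuda_graph_max_batch_size
    assert effective == 128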
@@ -567,7 +582,7 @@ def to_cmd(self, working_dir: str) -> List[str]:

     def to_db_data(self) -> dict:
         """Convert ServerConfig to Database data"""
-        return {
+        db_data = {
             "s_model_name": self.model_name.lower(),
             "l_gpus": self.gpus,
             "l_tp": self.tp,
@@ -588,9 +603,30 @@ def to_db_data(self) -> dict:
             "b_enable_block_reuse": self.enable_block_reuse,
             "d_free_gpu_memory_fraction": self.free_gpu_memory_fraction,
             "l_max_batch_size": self.max_batch_size,
+            "l_cuda_graph_max_batch_size": self.cuda_graph_max_batch_size,
             "b_enable_padding": self.enable_padding,
+            "s_spec_decoding_type": self.spec_decoding_type,
+            "l_num_nextn_predict_layers": self.num_nextn_predict_layers,
+            "l_eagle3_layers_to_capture": self.eagle3_layers_to_capture,
+            "l_max_draft_len": self.max_draft_len,
+            "s_speculative_model_dir": self.speculative_model_dir,
             "s_server_log_link": "",
         }
+        if self.num_postprocess_workers > 0:
+            db_data["l_num_postprocess_workers"] = self.num_postprocess_workers
+        if self.spec_decoding_type:
+            db_data["s_spec_decoding_type"] = self.spec_decoding_type
+        if self.num_nextn_predict_layers > 0:
+            db_data[
+                "l_num_nextn_predict_layers"] = self.num_nextn_predict_layers
+        if self.eagle3_layers_to_capture > 0:
+            db_data[
+                "l_eagle3_layers_to_capture"] = self.eagle3_layers_to_capture
+        if self.max_draft_len > 0:
+            db_data["l_max_draft_len"] = self.max_draft_len
+        if self.speculative_model_dir:
+            db_data["s_speculative_model_dir"] = self.speculative_model_dir
+        return db_data

     def generate_extra_llm_api_config(self) -> str:
         """Generate extra-llm-api-config.yml content"""
@@ -599,21 +635,28 @@ def generate_extra_llm_api_config(self) -> str:
             f"moe_expert_parallel_size: {self.ep}",
             f"pipeline_parallel_size: {self.pp}",
             f"max_num_tokens: {self.max_num_tokens}",
+            f"max_batch_size: {self.max_batch_size}",
             f"enable_attention_dp: {str(self.enable_attention_dp).lower()}",
             f"disable_overlap_scheduler: {str(self.disable_overlap_scheduler).lower()}",
-            f"stream_interval: {self.stream_interval}",
             f"attn_backend: {self.attention_backend}",
             f"enable_chunked_prefill: {str(self.enable_chunked_prefill).lower()}",
             "cuda_graph_config:",
             f"  enable_padding: {str(self.enable_padding).lower()}",
-            f"  max_batch_size: {self.max_batch_size}",
+            f"  max_batch_size: {self.cuda_graph_max_batch_size}",
             "kv_cache_config:",
             f"  dtype: {self.kv_cache_dtype}",
             f"  free_gpu_memory_fraction: {self.free_gpu_memory_fraction}",
             f"  enable_block_reuse: {str(self.enable_block_reuse).lower()}",
             "print_iter_log: false",
         ]

+        if self.stream_interval > 0:
+            config_lines.append(f"stream_interval: {self.stream_interval}")
+
+        if self.num_postprocess_workers > 0:
+            config_lines.append(
+                f"num_postprocess_workers: {self.num_postprocess_workers}")
+
         # Add moe_config if moe_backend is specified
         if self.moe_backend:
             config_lines.append("moe_config:")
@@ -629,6 +672,23 @@ def generate_extra_llm_api_config(self) -> str:
                 f"  batching_wait_iters: {self.batching_wait_iters}")
             config_lines.append(f"  timeout_iters: {self.timeout_iters}")

+        if self.spec_decoding_type:
+            config_lines.append("speculative_config:")
+            config_lines.append(f"  decoding_type: {self.spec_decoding_type}")
+            if self.num_nextn_predict_layers > 0:
+                config_lines.append(
+                    f"  num_nextn_predict_layers: {self.num_nextn_predict_layers}"
+                )
+            if self.eagle3_layers_to_capture > 0:
+                config_lines.append(
+                    f"  eagle3_layers_to_capture: {self.eagle3_layers_to_capture}"
+                )
+            if self.max_draft_len > 0:
+                config_lines.append(f"  max_draft_len: {self.max_draft_len}")
+            if self.speculative_model_dir:
+                config_lines.append(
+                    f"  speculative_model_dir: {self.speculative_model_dir}")
+
         return "\n".join(config_lines)
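With speculative decoding enabled, the generated extra-llm-api-config.yml now gains a speculative_config block after the base options, and stream_interval is only emitted when positive. A hypothetical tail of the generated file for an MTP run (values illustrative, assuming spec_decoding_type="MTP" and num_nextn_predict_layers=3):

    cuda_graph_config:
      enable_padding: true
      max_batch_size: 512        # now cuda_graph_max_batch_size, not max_batch_size
    kv_cache_config:
      dtype: fp8
      free_gpu_memory_fraction: 0.8
      enable_block_reuse: false
    print_iter_log: false
    stream_interval: 10
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 3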
@@ -644,22 +704,26 @@ def __init__(self,
                  iterations: int,
                  isl: int,
                  osl: int,
-                 random_range_ratio: float = 0.0):
+                 random_range_ratio: float = 0.0,
+                 backend: str = "",
+                 use_chat_template: bool = False):
         self.name = name
         self.model_name = model_name
         self.concurrency = concurrency
         self.iterations = iterations
         self.isl = isl
         self.osl = osl
         self.random_range_ratio = random_range_ratio
+        self.backend = backend
+        self.use_chat_template = use_chat_template

         self.model_path = ""

     def to_cmd(self, working_dir: str) -> List[str]:
         model_dir = get_model_dir(self.model_name)
         self.model_path = model_dir if os.path.exists(
             model_dir) else self.model_name
-        return [
+        benchmark_cmd = [
             "python", "-m", "tensorrt_llm.serve.scripts.benchmark_serving",
             "--model", self.model_path, "--dataset-name", "random",
             "--random-ids", "--num-prompts",
@@ -670,17 +734,30 @@ def to_cmd(self, working_dir: str) -> List[str]:
             "--percentile-metrics", "ttft,tpot,itl,e2el", "--max-concurrency",
             str(self.concurrency)
         ]
+        if self.backend:
+            benchmark_cmd.append("--backend")
+            benchmark_cmd.append(self.backend)
+        if self.use_chat_template:
+            benchmark_cmd.append("--use-chat-template")
+        return benchmark_cmd

     def to_db_data(self) -> dict:
         """Convert ClientConfig to Database data"""
-        return {
+        db_data = {
             "l_concurrency": self.concurrency,
             "l_iterations": self.iterations,
             "l_isl": self.isl,
             "l_osl": self.osl,
             "d_random_range_ratio": self.random_range_ratio,
+            "s_backend": self.backend,
+            "b_use_chat_template": self.use_chat_template,
             "s_client_log_link": "",
         }
+        if self.backend:
+            db_data["s_backend"] = self.backend
+        if self.use_chat_template:
+            db_data["b_use_chat_template"] = self.use_chat_template
+        return db_data


 def parse_select_pattern(select_pattern: str):
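End to end, the two new client knobs simply extend the benchmark_serving invocation. A hypothetical usage sketch (the constructor values and the "openai-chat" backend name are illustrative, not taken from this diff):

    # Hypothetical ClientConfig exercising the new flags.
    client = ClientConfig(name="gpt_oss_example",
                          model_name="gpt_oss_120b_fp4",
                          concurrency=64, iterations=10,
                          isl=1024, osl=1024,
                          backend="openai-chat",
                          use_chat_template=True)
    cmd = client.to_cmd(working_dir="/tmp")
    # The new arguments land at the end of the command:
    assert cmd[-3:] == ["--backend", "openai-chat", "--use-chat-template"]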
