1 change: 1 addition & 0 deletions jenkins/L0_Test.groovy
@@ -2689,6 +2689,7 @@ def launchTestJobs(pipeline, testFilter)
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
// Perf sanity post merge test
"DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4],
"DGX_B200-8_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x8", "perf_sanity_l0_dgx_b200", 1, 1, 8],
Collaborator
@chzblych do we have capacity to run this in L0 post-merge? I'm concerned that we're asking for 8 GPUs here rather than 4, which basically doubles our requirements (going from 2x4 to an additional 1x8).

@chenfeiz0326 does this really need to run on 8 GPUs?

Collaborator Author
There is not much GPU memory left for DS R1 FP8 on 4 GPUs, so it's better to use 8 GPUs.

"DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "perf_sanity_l0_dgx_b300", 1, 1, 4],
]
fullSet += x86SlurmTestConfigs.keySet()
49 changes: 44 additions & 5 deletions tests/integration/defs/perf/open_search_db_utils.py
@@ -20,6 +20,7 @@
import re
import sys
import time
from datetime import datetime

from defs.trt_test_alternative import print_info

@@ -40,11 +41,13 @@
"l_ep",
"l_pp",
"l_max_num_tokens",
"l_cuda_graph_max_batch_size",
"b_enable_chunked_prefill",
"b_disable_overlap_scheduler",
"s_attention_backend",
"s_moe_backend",
"l_moe_max_num_tokens",
"l_num_postprocess_workers",
"l_stream_interval",
"b_enable_attention_dp",
"b_attention_dp_balance",
@@ -55,6 +58,11 @@
"d_free_gpu_memory_fraction",
"l_max_batch_size",
"b_enable_padding",
"s_spec_decoding_type",
"l_num_nextn_predict_layers",
"l_eagle3_layers_to_capture",
"l_max_draft_len",
"s_speculative_model_dir",
]

# Client config fields to compare
@@ -64,6 +72,8 @@
"l_isl",
"l_osl",
"d_random_range_ratio",
"s_backend",
"b_use_chat_template",
]

# Metrics where larger is better
@@ -189,7 +199,7 @@ def get_job_info():
}


def query_history_data():
def query_history_data(gpu_type):
"""
Query post-merge data with specific gpu type and model name
"""
@@ -209,6 +219,16 @@ def query_history_data():
"b_is_post_merge": True
}
},
{
"term": {
"b_is_regression": False
}
},
{
"term": {
"s_gpu_type": gpu_type
}
},
{
"range": {
"ts_created": {
@@ -339,27 +359,44 @@ def calculate_best_perf_result(history_data_list, new_data):
return best_metrics


def get_history_data(new_data_dict):
def get_history_data(new_data_dict, gpu_type):
"""
Query history post-merge data for each cmd_idx
"""

def get_latest_data(data_list):
if not data_list:
return None
time_format = "%b %d, %Y @ %H:%M:%S.%f"
# Find the item with the maximum ts_created value
latest_data = max(
data_list,
key=lambda x: datetime.strptime(x["ts_created"], time_format))
return latest_data

history_baseline_dict = {}
history_data_dict = {}
cmd_idxs = new_data_dict.keys()
for cmd_idx in cmd_idxs:
history_data_dict[cmd_idx] = []
history_baseline_dict[cmd_idx] = None
history_data_list = query_history_data()
history_baseline_dict[cmd_idx] = []
history_data_list = []
if cmd_idxs:
history_data_list = query_history_data(gpu_type)
if history_data_list:
for history_data in history_data_list:
for cmd_idx in cmd_idxs:
if match(history_data, new_data_dict[cmd_idx]):
if history_data.get("b_is_baseline") and history_data.get(
"b_is_baseline") == True:
history_baseline_dict[cmd_idx] = history_data
history_baseline_dict[cmd_idx].append(history_data)
else:
history_data_dict[cmd_idx].append(history_data)
break
# Sometimes the database has several baselines; only the latest one is used
for cmd_idx, baseline_list in history_baseline_dict.items():
latest_baseline = get_latest_data(baseline_list)
history_baseline_dict[cmd_idx] = latest_baseline
return history_baseline_dict, history_data_dict
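For reference, a minimal runnable sketch of the latest-baseline selection above, using made-up ts_created values in the "%b %d, %Y @ %H:%M:%S.%f" format that get_latest_data expects:

from datetime import datetime

time_format = "%b %d, %Y @ %H:%M:%S.%f"
baselines = [
    {"ts_created": "Jan 02, 2025 @ 10:00:00.000", "l_max_batch_size": 256},
    {"ts_created": "Mar 15, 2025 @ 08:30:00.000", "l_max_batch_size": 512},
]
# max() with a strptime key returns the most recently created baseline.
latest = max(baselines,
             key=lambda x: datetime.strptime(x["ts_created"], time_format))
print(latest["l_max_batch_size"])  # 512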


@@ -477,6 +514,8 @@ def post_new_perf_data(new_baseline_data_dict, new_data_dict,
# Only post regressive test cases when post-merge.
if new_baseline_data_dict:
data_list.extend(regressive_data_list)
if not data_list:
return
try:
print_info(
f"Ready to post {len(data_list)} data to {TEST_INFO_PROJECT_NAME}")
112 changes: 103 additions & 9 deletions tests/integration/defs/perf/test_perf.py
@@ -103,6 +103,7 @@
"deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4",
"deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/",
"deepseek_r1_0528_fp4": "DeepSeek-R1/DeepSeek-R1-0528-FP4/",
"deepseek_r1_0528_fp4_v2": "DeepSeek-R1/DeepSeek-R1-0528-FP4-v2/",
"deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8",
"deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only",
"qwen2_7b_instruct": "Qwen2-7B-Instruct",
@@ -513,11 +514,13 @@ def __init__(
max_num_tokens: int,
attention_backend: str,
max_batch_size: int,
cuda_graph_max_batch_size: int = 0,
pp: int = 1,
enable_chunked_prefill: bool = False,
disable_overlap_scheduler: bool = False,
moe_backend: str = "",
moe_max_num_tokens: int = 0,
num_postprocess_workers: int = 0,
stream_interval: int = 10,
enable_attention_dp: bool = False,
attention_dp_balance: bool = False,
@@ -527,6 +530,11 @@ def __init__(
enable_block_reuse: bool = False,
free_gpu_memory_fraction: float = 0.8,
enable_padding: bool = True,
spec_decoding_type: str = "",
num_nextn_predict_layers: int = 0,
eagle3_layers_to_capture: int = 0,
max_draft_len: int = 0,
speculative_model_dir: str = "",
):
self.name = name
self.model_name = model_name
@@ -540,6 +548,7 @@ def __init__(
self.attention_backend = attention_backend
self.moe_backend = moe_backend
self.moe_max_num_tokens = moe_max_num_tokens
self.num_postprocess_workers = num_postprocess_workers
self.stream_interval = stream_interval
self.enable_attention_dp = enable_attention_dp
self.attention_dp_balance = attention_dp_balance
@@ -549,7 +558,13 @@ def __init__(
self.enable_block_reuse = enable_block_reuse
self.free_gpu_memory_fraction = free_gpu_memory_fraction
self.max_batch_size = max_batch_size
self.cuda_graph_max_batch_size = max_batch_size if cuda_graph_max_batch_size == 0 else cuda_graph_max_batch_size
self.enable_padding = enable_padding
self.spec_decoding_type = spec_decoding_type
self.num_nextn_predict_layers = num_nextn_predict_layers
self.eagle3_layers_to_capture = eagle3_layers_to_capture
self.max_draft_len = max_draft_len
self.speculative_model_dir = speculative_model_dir

self.model_path = ""

@@ -567,7 +582,7 @@ def to_cmd(self, working_dir: str) -> List[str]:

def to_db_data(self) -> dict:
"""Convert ServerConfig to Database data"""
return {
db_data = {
"s_model_name": self.model_name.lower(),
"l_gpus": self.gpus,
"l_tp": self.tp,
@@ -588,9 +603,30 @@ def to_db_data(self) -> dict:
"b_enable_block_reuse": self.enable_block_reuse,
"d_free_gpu_memory_fraction": self.free_gpu_memory_fraction,
"l_max_batch_size": self.max_batch_size,
"l_cuda_graph_max_batch_size": self.cuda_graph_max_batch_size,
"b_enable_padding": self.enable_padding,
"s_spec_decoding_type": self.spec_decoding_type,
"l_num_nextn_predict_layers": self.num_nextn_predict_layers,
"l_eagle3_layers_to_capture": self.eagle3_layers_to_capture,
"l_max_draft_len": self.max_draft_len,
"s_speculative_model_dir": self.speculative_model_dir,
"s_server_log_link": "",
}
if self.num_postprocess_workers > 0:
db_data["l_num_postprocess_workers"] = self.num_postprocess_workers
if self.spec_decoding_type:
db_data["s_spec_decoding_type"] = self.spec_decoding_type
if self.num_nextn_predict_layers > 0:
db_data[
"l_num_nextn_predict_layers"] = self.num_nextn_predict_layers
if self.eagle3_layers_to_capture > 0:
db_data[
"l_eagle3_layers_to_capture"] = self.eagle3_layers_to_capture
if self.max_draft_len > 0:
db_data["l_max_draft_len"] = self.max_draft_len
if self.speculative_model_dir:
db_data["s_speculative_model_dir"] = self.speculative_model_dir
return db_data

def generate_extra_llm_api_config(self) -> str:
"""Generate extra-llm-api-config.yml content"""
@@ -599,21 +635,28 @@ def generate_extra_llm_api_config(self) -> str:
f"moe_expert_parallel_size: {self.ep}",
f"pipeline_parallel_size: {self.pp}",
f"max_num_tokens: {self.max_num_tokens}",
f"max_batch_size: {self.max_batch_size}",
f"enable_attention_dp: {str(self.enable_attention_dp).lower()}",
f"disable_overlap_scheduler: {str(self.disable_overlap_scheduler).lower()}",
f"stream_interval: {self.stream_interval}",
f"attn_backend: {self.attention_backend}",
f"enable_chunked_prefill: {str(self.enable_chunked_prefill).lower()}",
"cuda_graph_config:",
f" enable_padding: {str(self.enable_padding).lower()}",
f" max_batch_size: {self.max_batch_size}",
f" max_batch_size: {self.cuda_graph_max_batch_size}",
"kv_cache_config:",
f" dtype: {self.kv_cache_dtype}",
f" free_gpu_memory_fraction: {self.free_gpu_memory_fraction}",
f" enable_block_reuse: {str(self.enable_block_reuse).lower()}",
"print_iter_log: false",
]

if self.stream_interval > 0:
config_lines.append(f"stream_interval: {self.stream_interval}")

if self.num_postprocess_workers > 0:
config_lines.append(
f"num_postprocess_workers: {self.num_postprocess_workers}")

# Add moe_config if moe_backend is specified
if self.moe_backend:
config_lines.append("moe_config:")
@@ -629,6 +672,25 @@ def generate_extra_llm_api_config(self) -> str:
f" batching_wait_iters: {self.batching_wait_iters}")
config_lines.append(f" timeout_iters: {self.timeout_iters}")

if self.spec_decoding_type:
config_lines.append("speculative_config:")
config_lines.append(f" decoding_type: {self.spec_decoding_type}")
if self.num_nextn_predict_layers > 0:
config_lines.append(
f" num_nextn_predict_layers: {self.num_nextn_predict_layers}"
)
if self.eagle3_layers_to_capture > 0:
config_lines.append(
f" eagle3_layers_to_capture: {self.eagle3_layers_to_capture}"
)
if self.max_draft_len > 0:
config_lines.append(f" max_draft_len: {self.max_draft_len}")
if self.speculative_model_dir:
spec_model_dir = os.path.join(llm_models_root(),
self.speculative_model_dir)
config_lines.append(
f" speculative_model_dir: {spec_model_dir}")

return "\n".join(config_lines)


@@ -644,22 +706,26 @@ def __init__(self,
iterations: int,
isl: int,
osl: int,
random_range_ratio: float = 0.0):
random_range_ratio: float = 0.0,
backend: str = "",
use_chat_template: bool = False):
self.name = name
self.model_name = model_name
self.concurrency = concurrency
self.iterations = iterations
self.isl = isl
self.osl = osl
self.random_range_ratio = random_range_ratio
self.backend = backend
self.use_chat_template = use_chat_template

self.model_path = ""

def to_cmd(self, working_dir: str) -> List[str]:
model_dir = get_model_dir(self.model_name)
self.model_path = model_dir if os.path.exists(
model_dir) else self.model_name
return [
benchmark_cmd = [
"python", "-m", "tensorrt_llm.serve.scripts.benchmark_serving",
"--model", self.model_path, "--dataset-name", "random",
"--random-ids", "--num-prompts",
@@ -670,17 +736,30 @@ def to_cmd(self, working_dir: str) -> List[str]:
"--percentile-metrics", "ttft,tpot,itl,e2el", "--max-concurrency",
str(self.concurrency)
]
if self.backend:
benchmark_cmd.append("--backend")
benchmark_cmd.append(self.backend)
if self.use_chat_template:
benchmark_cmd.append("--use-chat-template")
return benchmark_cmd
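# Hedged illustration of the two new client flags; the values below are
# assumptions, and valid --backend choices depend on benchmark_serving itself.
example_tail = ["--max-concurrency", "8"]
backend = "openai-chat"   # assumed example value
use_chat_template = True  # assumed example value
if backend:
    example_tail += ["--backend", backend]
if use_chat_template:
    example_tail.append("--use-chat-template")
# example_tail == ['--max-concurrency', '8', '--backend', 'openai-chat', '--use-chat-template']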

def to_db_data(self) -> dict:
"""Convert ClientConfig to Database data"""
return {
db_data = {
"l_concurrency": self.concurrency,
"l_iterations": self.iterations,
"l_isl": self.isl,
"l_osl": self.osl,
"d_random_range_ratio": self.random_range_ratio,
"s_backend": self.backend,
"b_use_chat_template": self.use_chat_template,
"s_client_log_link": "",
}
if self.backend:
db_data["s_backend"] = self.backend
if self.use_chat_template:
db_data["b_use_chat_template"] = self.use_chat_template
return db_data


def parse_select_pattern(select_pattern: str):
@@ -785,7 +864,18 @@ def parse_config_file(config_file_path: str, select_pattern: str = None):
free_gpu_memory_fraction=server_config_data.get(
'free_gpu_memory_fraction', 0.8),
max_batch_size=server_config_data.get('max_batch_size', 256),
enable_padding=server_config_data.get('enable_padding', True))
cuda_graph_max_batch_size=server_config_data.get(
'cuda_graph_max_batch_size', 0),
enable_padding=server_config_data.get('enable_padding', True),
spec_decoding_type=server_config_data.get('spec_decoding_type', ""),
num_nextn_predict_layers=server_config_data.get(
'num_nextn_predict_layers', 0),
eagle3_layers_to_capture=server_config_data.get(
'eagle3_layers_to_capture', 0),
max_draft_len=server_config_data.get('max_draft_len', 0),
speculative_model_dir=server_config_data.get(
'speculative_model_dir', ""),
)

server_id = len(server_configs)
server_configs.append(server_config)
Expand All @@ -812,7 +902,11 @@ def parse_config_file(config_file_path: str, select_pattern: str = None):
isl=client_config_data.get('isl', 1024),
osl=client_config_data.get('osl', 1024),
random_range_ratio=client_config_data.get(
'random_range_ratio', 0.0))
'random_range_ratio', 0.0),
backend=client_config_data.get('backend', ""),
use_chat_template=client_config_data.get(
'use_chat_template', False),
)
client_configs.append(client_config)

server_client_configs[server_id] = client_configs
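To make the new knobs concrete, here is a hedged sketch of per-server and per-client entries as parse_config_file would see them after loading the config file; the key names mirror the .get() calls above, while the values are assumptions:

# Hypothetical config dicts; every key matches a .get() call above.
server_config_data = {
    "max_batch_size": 256,
    "cuda_graph_max_batch_size": 512,  # if absent, falls back to max_batch_size
    "spec_decoding_type": "MTP",       # assumed decoding type
    "num_nextn_predict_layers": 1,
}
client_config_data = {
    "backend": "openai-chat",          # assumed benchmark_serving backend name
    "use_chat_template": True,
}
print(server_config_data.get("cuda_graph_max_batch_size", 0))  # 512
print(client_config_data.get("use_chat_template", False))      # True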
@@ -2114,7 +2208,7 @@ def upload_test_results_to_database(self):

# Get history data for each cmd_idx
history_baseline_dict, history_data_dict = get_history_data(
new_data_dict)
new_data_dict, self._config.gpu_type)
# Prepare regressive test cases
regressive_data_list = prepare_regressive_test_cases(
history_baseline_dict, new_data_dict)