Skip to content

Commit 5e0e481

Browse files
authored
[None][fix] Minor updates on Perf Test System (#10375)
Signed-off-by: Chenfei Zhang <[email protected]>
1 parent 0982516 commit 5e0e481

File tree

5 files changed

+49
-64
lines changed

5 files changed

+49
-64
lines changed

jenkins/L0_Test.groovy

Lines changed: 0 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -939,8 +939,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
939939
def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
940940
def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
941941
def coverageConfigFile = "${jobWorkspace}/.coveragerc"
942-
def perfCheckScriptLocal = "${llmSrcLocal}/tests/integration/defs/perf/perf_regression_check.py"
943-
def perfCheckScriptNode = "${jobWorkspace}/${jobUID}-perf_regression_check.py"
944942

945943
stage("[${stageName}] Initializing Test") {
946944
// Create Job Workspace folder in Frontend Node
@@ -1023,16 +1021,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
10231021
coverageConfigFile
10241022
)
10251023

1026-
if (perfSanityMode) {
1027-
Utils.copyFileToRemoteHost(
1028-
pipeline,
1029-
remote,
1030-
perfCheckScriptLocal,
1031-
perfCheckScriptNode,
1032-
true
1033-
)
1034-
}
1035-
10361024
// Generate Pytest command
10371025
String pytestUtil = ""
10381026
if (nodeCount > 1) {
@@ -1314,22 +1302,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13141302
),
13151303
numRetries: 3
13161304
)
1317-
1318-
if (perfSanityMode) {
1319-
stage("[${stageName}] Check perf result") {
1320-
def perfCheckResult = Utils.exec(
1321-
pipeline,
1322-
script: Utils.sshUserCmd(
1323-
remote,
1324-
"python3 ${perfCheckScriptNode} ${jobWorkspace}"
1325-
),
1326-
returnStatus: true
1327-
)
1328-
if (perfCheckResult != 0) {
1329-
error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
1330-
}
1331-
}
1332-
}
13331305
}
13341306

13351307
echo "Finished test stage execution."
@@ -3297,15 +3269,10 @@ def launchTestJobs(pipeline, testFilter)
32973269
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 14, 4],
32983270
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 14, 4],
32993271
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4],
3300-
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
3301-
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
3302-
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
33033272
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-7": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 7, 14, 4],
33043273
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-8": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 8, 14, 4],
33053274
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-9": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 9, 14, 4],
33063275
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-10": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 10, 14, 4],
3307-
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
3308-
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
33093276
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-13": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 13, 14, 4],
33103277
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-14": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 14, 14, 4],
33113278
]

jenkins/scripts/slurm_run.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,14 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then
135135
echo "Rank${SLURM_PROCID} Perf check finished execution with exit code $perf_check_exit_code"
136136
fi
137137

138+
if [ $SLURM_PROCID -eq 0 ] && [[ "$stageName" == *PerfSanity* ]]; then
139+
echo "Check Perf-Sanity Result"
140+
python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
141+
$jobWorkspace
142+
perf_sanity_check_exit_code=$?
143+
echo "Rank${SLURM_PROCID} Perf-Sanity check finished execution with exit code $perf_sanity_check_exit_code"
144+
fi
145+
138146
if [ "$pytest_exit_code" -ne 0 ]; then
139147
final_exit_code=$pytest_exit_code
140148
elif [ "$perf_check_exit_code" -ne 0 ]; then

tests/integration/defs/perf/open_search_db_utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@
6262
"d_p99_e2el",
6363
]
6464

65+
# Default threshold values for performance regression detection
66+
POST_MERGE_THRESHOLD = 0.05
67+
PRE_MERGE_THRESHOLD = 0.1
68+
6569
# Fields for scenario-only matching for recipe tests.
6670
# Unlike regular tests that match on all config fields, recipes match only on the benchmark
6771
# scenario, allowing the underlying config to change while still comparing against baselines

tests/integration/defs/perf/test_perf_sanity.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,7 @@ class DisaggConfig:
463463

464464
def __init__(
465465
self,
466+
name: str,
466467
disagg_serving_type: str,
467468
hostname: str,
468469
numa_bind: bool,
@@ -472,6 +473,7 @@ def __init__(
472473
hardware: dict,
473474
server_env_var: str,
474475
):
476+
self.name = name
475477
self.disagg_serving_type = disagg_serving_type
476478
self.hostname = hostname
477479
self.numa_bind = numa_bind
@@ -971,7 +973,7 @@ def _parse_disagg_config_file(self, config_file_path: str, config_file: str):
971973
# Create ctx server config
972974
ctx_server_config_data = {
973975
"concurrency": max(concurrency_values),
974-
"name": f"ctx_{config_file_base_name}",
976+
"name": config_file_base_name,
975977
"model_name": model_name,
976978
"gpus_per_node": gpus_per_node,
977979
**worker_config.get("ctx", {}),
@@ -980,7 +982,7 @@ def _parse_disagg_config_file(self, config_file_path: str, config_file: str):
980982
# Create gen server config
981983
gen_server_config_data = {
982984
"concurrency": max(concurrency_values),
983-
"name": f"gen_{config_file_base_name}",
985+
"name": config_file_base_name,
984986
"model_name": model_name,
985987
"gpus_per_node": gpus_per_node,
986988
**worker_config.get("gen", {}),
@@ -991,6 +993,7 @@ def _parse_disagg_config_file(self, config_file_path: str, config_file: str):
991993

992994
# Create disagg config
993995
disagg_config = DisaggConfig(
996+
name=config_file_base_name,
994997
disagg_serving_type=disagg_serving_type,
995998
hostname=socket.gethostname(),
996999
numa_bind=numa_bind,
@@ -1249,6 +1252,8 @@ def add_dict_prefix(config_dict: dict, prefix_name: str) -> dict:
12491252
new_data.update(job_config)
12501253
new_data.update(server_config_dict)
12511254
new_data.update(client_config_dict)
1255+
# Add test_case_name for convenient filtering on OpenSearch
1256+
new_data["s_test_case_name"] = f"{server_config.name}-{client_config.name}"
12521257

12531258
for metric_name in PERF_METRIC_LOG_QUERIES:
12541259
if metric_name in self._test_results[cmd_idx]:
@@ -1308,12 +1313,13 @@ def add_dict_prefix(config_dict: dict, prefix_name: str) -> dict:
13081313
"l_num_gen_servers": num_gen_servers,
13091314
}
13101315
new_data.update(job_config)
1311-
13121316
if num_ctx_servers > 0:
13131317
new_data.update(ctx_server_config_dict)
13141318
if num_gen_servers > 0:
13151319
new_data.update(gen_server_config_dict)
13161320
new_data.update(client_config_dict)
1321+
# Add test_case_name for convenient filtering on OpenSearch
1322+
new_data["s_test_case_name"] = f"{disagg_config.name}-{client_config.name}"
13171323

13181324
for metric_name in PERF_METRIC_LOG_QUERIES:
13191325
if metric_name in self._test_results[cmd_idx]:

tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -14,20 +14,20 @@ l0_gb200_multi_gpus_perf_sanity:
1414
stage: pre_merge
1515
backend: pytorch
1616
tests:
17-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (120)
18-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (120)
19-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (120)
20-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (120)
21-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (120)
22-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (120)
23-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (180)
24-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (120)
25-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (120)
26-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (120)
27-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (120)
28-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (120)
29-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (120)
30-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (120)
17+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (90)
18+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (60)
19+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (60)
20+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (90)
21+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (60)
22+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (60)
23+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (120)
24+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (60)
25+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (60)
26+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (90)
27+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (90)
28+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (90)
29+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (90)
30+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (90)
3131
- condition:
3232
ranges:
3333
system_gpu_count:
@@ -42,17 +42,17 @@ l0_gb200_multi_gpus_perf_sanity:
4242
stage: post_merge
4343
backend: pytorch
4444
tests:
45-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (120)
46-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (120)
47-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (120)
48-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (120)
49-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (120)
50-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (120)
51-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (180)
52-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (120)
53-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (120)
54-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (120)
55-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (120)
56-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (120)
57-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (120)
58-
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (120)
45+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (90)
46+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (60)
47+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (60)
48+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (90)
49+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (60)
50+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (60)
51+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (120)
52+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (60)
53+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (60)
54+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (90)
55+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (90)
56+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (90)
57+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (90)
58+
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (90)

0 commit comments

Comments
 (0)