Skip to content

Commit 7f9b8b1

Browse files
committed
update
Signed-off-by: Chenfei Zhang <[email protected]>
1 parent f69f8d2 commit 7f9b8b1

15 files changed

+1724
-102
lines changed

jenkins/L0_Test.groovy

Lines changed: 49 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -887,7 +887,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
887887
// Create a unique suffix for the job name
888888
String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
889889
def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
890-
def disaggMode = stageName.contains("Perf-Sanity-Disagg")
890+
def disaggMode = stageName.contains("PerfSanity-Disagg")
891891
def setSegment = disaggMode
892892

893893
Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
@@ -1245,6 +1245,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
12451245
),
12461246
numRetries: 3
12471247
)
1248+
1249+
if (stageName.contains("PerfSanity")) {
1250+
stage("[${stageName}] Check perf result") {
1251+
def perfCheckResult = Utils.exec(
1252+
pipeline,
1253+
script: Utils.sshUserCmd(
1254+
remote,
1255+
"\"python3 ${llmSrc}/tests/integration/defs/perf/perf_regression_check.py ${WORKSPACE}/${stageName}\""
1256+
),
1257+
returnStatus: true
1258+
)
1259+
if (perfCheckResult != 0) {
1260+
error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
1261+
}
1262+
}
1263+
}
12481264
}
12491265

12501266
echo "Finished test stage execution."
@@ -2698,7 +2714,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
26982714
error "Some tests still failed after rerun attempts, please check the test report."
26992715
}
27002716

2701-
if (perfMode && !stageName.contains("Perf-Sanity")) {
2717+
if (perfMode) {
27022718
basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv"
27032719
basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}"
27042720
stage("Check perf result") {
@@ -2724,7 +2740,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
27242740
}
27252741
}
27262742

2727-
if (perfMode && stageName.contains("Perf-Sanity")) {
2743+
if (stageName.contains("PerfSanity")) {
27282744
stage ("Check perf result") {
27292745
def perfCheckResult = sh(
27302746
script: """
@@ -2733,10 +2749,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
27332749
""",
27342750
returnStatus: true
27352751
)
2736-
// TODO: Enable this when perf regression check is stable
2737-
// if (perfCheckResult != 0) {
2738-
// error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
2739-
// }
2752+
if (perfCheckResult != 0) {
2753+
error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
2754+
}
27402755
}
27412756
}
27422757
}
@@ -3100,7 +3115,7 @@ def launchTestJobs(pipeline, testFilter)
31003115
"RTXPro6000D-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 2, 2, 4],
31013116
]
31023117

3103-
parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
3118+
parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("-Perf-")), {
31043119
def config = VANILLA_CONFIG
31053120
if (key.contains("single-device")) {
31063121
config = SINGLE_DEVICE_CONFIG
@@ -3111,7 +3126,7 @@ def launchTestJobs(pipeline, testFilter)
31113126
if (key.contains("Pybind")) {
31123127
config = PYBIND_CONFIG
31133128
}
3114-
runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3])
3129+
runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3])
31153130
}]]}
31163131
fullSet = parallelJobs.keySet()
31173132

@@ -3132,9 +3147,9 @@ def launchTestJobs(pipeline, testFilter)
31323147
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
31333148
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
31343149
// Perf sanity post merge test
3135-
// "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 1, 4],
3136-
// "DGX_B200-8_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 1, 8],
3137-
// "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "l0_dgx_b300_perf_sanity", 1, 1, 4],
3150+
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 1, 4],
3151+
// "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 1, 8],
3152+
// "DGX_B300-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b300-x4", "l0_dgx_b300_perf_sanity", 1, 1, 4],
31383153
]
31393154
fullSet += x86SlurmTestConfigs.keySet()
31403155

@@ -3146,7 +3161,7 @@ def launchTestJobs(pipeline, testFilter)
31463161
if (key.contains("llvm")) {
31473162
config = LLVM_CONFIG
31483163
}
3149-
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
3164+
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
31503165
}]]}
31513166

31523167
parallelJobs += parallelSlurmJobs
@@ -3162,11 +3177,19 @@ def launchTestJobs(pipeline, testFilter)
31623177
"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4],
31633178
"GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
31643179
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
3165-
// Perf sanity post merge test
3166-
"GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 1, 4],
31673180
// GB300 stages are disabled because the nodes will be temporarily offline.
31683181
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
31693182
// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
3183+
// Perf sanity pre-merge tests
3184+
"GB200-4_GPUs-PyTorch-PerfSanity-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 6, 4],
3185+
"GB200-4_GPUs-PyTorch-PerfSanity-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 6, 4],
3186+
// Perf sanity post-merge tests
3187+
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 6, 4],
3188+
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 6, 4],
3189+
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 6, 4],
3190+
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 6, 4],
3191+
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 6, 4],
3192+
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 6, 4],
31703193
]
31713194
fullSet += SBSASlurmTestConfigs.keySet()
31723195

@@ -3178,13 +3201,15 @@ def launchTestJobs(pipeline, testFilter)
31783201
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
31793202
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
31803203
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
3181-
// Perf sanity post merge aggr tests
3182-
"GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001", 1, 1, 8, 2],
3183-
// Perf sanity post merge disagg tests
3184-
"GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001", 1, 1, 12, 3],
3185-
// "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001", 1, 1, 24, 6],
3186-
// "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002", 1, 1, 24, 6],
3187-
// "GB200-32_GPUs-8_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001", 1, 1, 32, 8],
3204+
// Perf sanity pre merge tests
3205+
"GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
3206+
// Perf sanity post merge tests
3207+
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 2, 8, 2],
3208+
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 2, 8, 2],
3209+
"GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
3210+
// "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 1, 2, 24, 6],
3211+
// "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 2, 2, 24, 6],
3212+
// "GB200-32_GPUs-8_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes", 1, 1, 32, 8],
31883213
]
31893214
fullSet += multiNodesSBSAConfigs.keySet()
31903215

@@ -3202,7 +3227,7 @@ def launchTestJobs(pipeline, testFilter)
32023227
if (key.contains("llvm")) {
32033228
config = LLVM_CONFIG
32043229
}
3205-
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
3230+
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
32063231
}]]}
32073232
parallelJobs += parallelSlurmJobs
32083233

@@ -3215,7 +3240,7 @@ def launchTestJobs(pipeline, testFilter)
32153240
if (key.contains("llvm")) {
32163241
config = LLVM_CONFIG
32173242
}
3218-
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2, values[6] ?: false)
3243+
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2, values[6] ?: false)
32193244
}]]}
32203245

32213246
parallelJobs += parallelMultiNodesSBSAJobs

jenkins/scripts/slurm_run.sh

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ echo "Full Command: $pytestCommand"
100100
eval $pytestCommand
101101
echo "Rank${SLURM_PROCID} Pytest finished execution"
102102

103-
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then
103+
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then
104104
if [[ "$stageName" == *PyTorch* ]]; then
105105
basePerfFilename="base_perf_pytorch.csv"
106106
else
@@ -117,9 +117,3 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Pe
117117
--files $stageName/perf_script_test_results.csv \
118118
$basePerfPath
119119
fi
120-
121-
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" == *Perf-Sanity* ]]; then
122-
echo "Check Perf-Sanity Result"
123-
python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
124-
$jobWorkspace
125-
fi

tests/integration/defs/perf/test_perf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2618,7 +2618,7 @@ def add_dict_prefix(config_dict: dict, prefix_name: str) -> dict:
26182618
if is_post_merge:
26192619
# Prepare new baseline data for post-merge
26202620
new_baseline_data_dict = prepare_baseline_data(
2621-
history_data_dict, new_data_dict)
2621+
history_baseline_dict, history_data_dict, new_data_dict)
26222622
else:
26232623
# Pre-merge does not need to upload baseline data
26242624
new_baseline_data_dict = None

0 commit comments

Comments
 (0)