
Commit a23c6f1

[TRTLLM-9834][feat] Transfer to TRTLLM-INFRA Database and Fail post-merge tests if regression (#10282)
Signed-off-by: Chenfei Zhang <[email protected]>
1 parent 464847c commit a23c6f1

24 files changed, +1995 −1568 lines changed

jenkins/L0_Test.groovy

Lines changed: 78 additions & 25 deletions
@@ -893,7 +893,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     // Create a unique suffix for the job name
     String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
     def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
-    def disaggMode = stageName.contains("Perf-Sanity-Disagg")
+    def perfSanityMode = stageName.contains("PerfSanity")
+    def disaggMode = stageName.contains("PerfSanity-Disagg")
     def setSegment = disaggMode

     Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
@@ -938,6 +939,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
     def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
     def coverageConfigFile = "${jobWorkspace}/.coveragerc"
+    def perfCheckScriptLocal = "${llmSrcLocal}/tests/integration/defs/perf/perf_regression_check.py"
+    def perfCheckScriptNode = "${jobWorkspace}/${jobUID}-perf_regression_check.py"

     stage("[${stageName}] Initializing Test") {
         // Create Job Workspace folder in Frontend Node
@@ -1020,6 +1023,16 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
             coverageConfigFile
         )

+        if (perfSanityMode) {
+            Utils.copyFileToRemoteHost(
+                pipeline,
+                remote,
+                perfCheckScriptLocal,
+                perfCheckScriptNode,
+                true
+            )
+        }
+
         // Generate Pytest command
         String pytestUtil = ""
         if (nodeCount > 1) {
@@ -1094,7 +1107,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         // Define environment variables to export
         def envVarNames = [
             'OPEN_SEARCH_DB_BASE_URL',
-            'OPEN_SEARCH_DB_CREDENTIALS',
+            'OPEN_SEARCH_DB_CREDENTIALS_USR',
+            'OPEN_SEARCH_DB_CREDENTIALS_PSW',
             'BUILD_ID',
             'BUILD_URL',
             'JOB_NAME',
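
The single OPEN_SEARCH_DB_CREDENTIALS entry is split into _USR and _PSW variables, matching how Jenkins exposes a username/password credential bound with credentials() as two suffixed environment variables. A minimal sketch of how a script on the test node might consume them; only the variable names come from this diff, the helper itself is hypothetical:

import os

def get_open_search_auth():
    """Build (base_url, (user, password)) from the Jenkins-exported variables."""
    base_url = os.environ["OPEN_SEARCH_DB_BASE_URL"]
    user = os.environ["OPEN_SEARCH_DB_CREDENTIALS_USR"]
    password = os.environ["OPEN_SEARCH_DB_CREDENTIALS_PSW"]
    return base_url, (user, password)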
@@ -1300,6 +1314,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
                 ),
                 numRetries: 3
             )
+
+            if (perfSanityMode) {
+                stage("[${stageName}] Check perf result") {
+                    def perfCheckResult = Utils.exec(
+                        pipeline,
+                        script: Utils.sshUserCmd(
+                            remote,
+                            "python3 ${perfCheckScriptNode} ${jobWorkspace}"
+                        ),
+                        returnStatus: true
+                    )
+                    if (perfCheckResult != 0) {
+                        error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
+                    }
+                }
+            }
         }

         echo "Finished test stage execution."
@@ -2785,7 +2815,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
         error "Some tests still failed after rerun attempts, please check the test report."
     }

-    if (perfMode && !stageName.contains("Perf-Sanity")) {
+    if (perfMode) {
         basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv"
         basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}"
         stage("Check perf result") {
@@ -2811,7 +2841,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
         }
     }

-    if (perfMode && stageName.contains("Perf-Sanity")) {
+    if (stageName.contains("PerfSanity")) {
         stage ("Check perf result") {
             def perfCheckResult = sh(
                 script: """
@@ -2820,10 +2850,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
                 """,
                 returnStatus: true
             )
-            // TODO: Enable this when perf regression check is stable
-            // if (perfCheckResult != 0) {
-            //     error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
-            // }
+            if (perfCheckResult != 0) {
+                error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
+            }
         }
     }
 }
@@ -3187,7 +3216,7 @@ def launchTestJobs(pipeline, testFilter)
         "RTXPro6000D-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 2, 2, 4],
     ]

-    parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
+    parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("-Perf-")), {
         def config = VANILLA_CONFIG
         if (key.contains("single-device")) {
             config = SINGLE_DEVICE_CONFIG
@@ -3198,7 +3227,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("Pybind")) {
             config = PYBIND_CONFIG
         }
-        runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3])
+        runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3])
     }]]}
     fullSet = parallelJobs.keySet()

@@ -3219,9 +3248,12 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
         "DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
         // Perf sanity post merge test
-        // "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 1, 4],
-        // "DGX_B200-8_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 1, 8],
-        // "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "l0_dgx_b300_perf_sanity", 1, 1, 4],
+        // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 3, 4],
+        // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x4", "l0_dgx_b200_perf_sanity", 2, 3, 4],
+        // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x4", "l0_dgx_b200_perf_sanity", 3, 3, 4],
+        // "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 3, 8],
+        // "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x8", "l0_dgx_b200_perf_sanity", 2, 3, 8],
+        // "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x8", "l0_dgx_b200_perf_sanity", 3, 3, 8],
     ]
     fullSet += x86SlurmTestConfigs.keySet()

@@ -3233,7 +3265,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
-        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
+        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
     }]]}

     parallelJobs += parallelSlurmJobs
@@ -3252,11 +3284,30 @@ def launchTestJobs(pipeline, testFilter)
         "GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
         "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
         "GB10-PyTorch-Post-Merge-1": ["gb10x-single", "l0_gb10", 1, 1],
-        // Perf sanity post merge test
-        "GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 1, 4],
         // Disable GB300 stages due to nodes will be offline temporarily.
         // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
         // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
+        // Perf sanity pre merge test
+        "GB200-4_GPUs-PyTorch-PerfSanity-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
+        // Perf sanity post merge test
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-7": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 7, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-8": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 8, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-9": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 9, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-10": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 10, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-13": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 13, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-14": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 14, 14, 4],
     ]
     fullSet += SBSASlurmTestConfigs.keySet()

@@ -3268,13 +3319,15 @@ def launchTestJobs(pipeline, testFilter)
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
-        // Perf sanity post merge aggr tests
-        "GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001", 1, 1, 8, 2],
-        // Perf sanity post merge disagg tests
-        "GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001", 1, 1, 12, 3],
-        // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001", 1, 1, 24, 6],
-        // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002", 1, 1, 24, 6],
-        // "GB200-32_GPUs-8_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001", 1, 1, 32, 8],
+        // Perf sanity pre merge tests
+        // "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
+        // Perf sanity post merge tests
+        "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 2, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 2, 8, 2],
+        "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
+        // "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 1, 2, 24, 6],
+        // "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 2, 2, 24, 6],
+        // "GB200-32_GPUs-8_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes", 1, 1, 32, 8],
     ]
     fullSet += multiNodesSBSAConfigs.keySet()

@@ -3292,7 +3345,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
-        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
+        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
     }]]}
     parallelJobs += parallelSlurmJobs

@@ -3305,7 +3358,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
-        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2, values[6] ?: false)
+        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2, values[6] ?: false)
     }]]}

     parallelJobs += parallelMultiNodesSBSAJobs

jenkins/scripts/open_search_db.py

Lines changed: 5 additions & 1 deletion
@@ -51,6 +51,7 @@
 JOB_MACHINE_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-job_machine_info"
 FAILED_STEP_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-failed_step_info"
 PR_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-pr_info"
+PERF_SANITY_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-perf_sanity_info"

 READ_ACCESS_PROJECT_NAME = [
     JOB_PROJECT_NAME,
@@ -59,9 +60,12 @@
     JOB_MACHINE_PROJECT_NAME,
     FAILED_STEP_PROJECT_NAME,
     PR_PROJECT_NAME,
+    PERF_SANITY_PROJECT_NAME,
 ]

-WRITE_ACCESS_PROJECT_NAME = []
+WRITE_ACCESS_PROJECT_NAME = [
+    PERF_SANITY_PROJECT_NAME,
+]

 DISABLE_OPEN_SEARCH_DB_FOR_LOCAL_TEST = False
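
With PERF_SANITY_PROJECT_NAME added to both the read and write access lists, CI jobs can upload perf-sanity records to the TRTLLM-INFRA OpenSearch database as well as query them for regression checks. A hedged sketch of what one upload could look like against the standard OpenSearch document API; the endpoint layout, payload fields, and the PROJECT_ROOT/MODE placeholders are illustrative only and not taken from this commit:

import os
import requests

def post_perf_sanity_record(record: dict) -> None:
    # Env var names come from the pipeline diff above.
    base_url = os.environ["OPEN_SEARCH_DB_BASE_URL"].rstrip("/")
    auth = (os.environ["OPEN_SEARCH_DB_CREDENTIALS_USR"],
            os.environ["OPEN_SEARCH_DB_CREDENTIALS_PSW"])
    project_root = "example-project"  # placeholder for PROJECT_ROOT
    mode = "dev"                      # placeholder for MODE
    index = f"{project_root}-ci-{mode}-perf_sanity_info"  # mirrors PERF_SANITY_PROJECT_NAME
    resp = requests.post(f"{base_url}/{index}/_doc", json=record, auth=auth, timeout=30)
    resp.raise_for_status()

post_perf_sanity_record({"job_name": "example-job", "metric": "tokens_per_sec", "value": 123.4})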

jenkins/scripts/slurm_run.sh

Lines changed: 1 addition & 9 deletions
@@ -108,7 +108,7 @@ eval $pytestCommand
 pytest_exit_code=$?
 echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code"

-if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then
+if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then
     if [[ "$stageName" == *PyTorch* ]]; then
         basePerfFilename="base_perf_pytorch.csv"
     else
@@ -135,14 +135,6 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Pe
     echo "Rank${SLURM_PROCID} Perf check finished execution with exit code $perf_check_exit_code"
 fi

-if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" == *Perf-Sanity* ]]; then
-    echo "Check Perf-Sanity Result"
-    python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
-        $jobWorkspace
-    perf_sanity_check_exit_code=$?
-    echo "Rank${SLURM_PROCID} Perf-Sanity check finished execution with exit code $perf_sanity_check_exit_code"
-fi
-
 if [ "$pytest_exit_code" -ne 0 ]; then
     final_exit_code=$pytest_exit_code
 elif [ "$perf_check_exit_code" -ne 0 ]; then