@@ -893,7 +893,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     // Create a unique suffix for the job name
     String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
     def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
-    def disaggMode = stageName.contains("Perf-Sanity-Disagg")
+    def perfSanityMode = stageName.contains("PerfSanity")
+    def disaggMode = stageName.contains("PerfSanity-Disagg")
     def setSegment = disaggMode

     Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
@@ -938,6 +939,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
     def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
     def coverageConfigFile = "${jobWorkspace}/.coveragerc"
+    def perfCheckScriptLocal = "${llmSrcLocal}/tests/integration/defs/perf/perf_regression_check.py"
+    def perfCheckScriptNode = "${jobWorkspace}/${jobUID}-perf_regression_check.py"

     stage("[${stageName}] Initializing Test") {
         // Create Job Workspace folder in Frontend Node
@@ -1020,6 +1023,16 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
             coverageConfigFile
         )

+        if (perfSanityMode) {
+            Utils.copyFileToRemoteHost(
+                pipeline,
+                remote,
+                perfCheckScriptLocal,
+                perfCheckScriptNode,
+                true
+            )
+        }
+
         // Generate Pytest command
         String pytestUtil = ""
         if (nodeCount > 1) {
@@ -1094,7 +1107,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         // Define environment variables to export
         def envVarNames = [
             'OPEN_SEARCH_DB_BASE_URL',
-            'OPEN_SEARCH_DB_CREDENTIALS',
+            'OPEN_SEARCH_DB_CREDENTIALS_USR',
+            'OPEN_SEARCH_DB_CREDENTIALS_PSW',
             'BUILD_ID',
             'BUILD_URL',
             'JOB_NAME',
@@ -1300,6 +1314,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
                 ),
                 numRetries: 3
             )
+
+            if (perfSanityMode) {
+                stage("[${stageName}] Check perf result") {
+                    def perfCheckResult = Utils.exec(
+                        pipeline,
+                        script: Utils.sshUserCmd(
+                            remote,
+                            "python3 ${perfCheckScriptNode} ${jobWorkspace}"
+                        ),
+                        returnStatus: true
+                    )
+                    if (perfCheckResult != 0) {
+                        error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
+                    }
+                }
+            }
         }

         echo "Finished test stage execution."
@@ -2785,7 +2815,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
         error "Some tests still failed after rerun attempts, please check the test report."
     }

-    if (perfMode && !stageName.contains("Perf-Sanity")) {
+    if (perfMode) {
         basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv"
         basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}"
         stage("Check perf result") {
@@ -2811,7 +2841,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
         }
     }

-    if (perfMode && stageName.contains("Perf-Sanity")) {
+    if (stageName.contains("PerfSanity")) {
         stage("Check perf result") {
             def perfCheckResult = sh(
                 script: """
@@ -2820,10 +2850,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
                 """,
                 returnStatus: true
             )
-            // TODO: Enable this when perf regression check is stable
-            // if (perfCheckResult != 0) {
-            //     error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
-            // }
+            if (perfCheckResult != 0) {
+                error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
+            }
         }
     }
 }
@@ -3187,7 +3216,7 @@ def launchTestJobs(pipeline, testFilter)
         "RTXPro6000D-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 2, 2, 4],
     ]

-    parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
+    parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("-Perf-")), {
         def config = VANILLA_CONFIG
         if (key.contains("single-device")) {
             config = SINGLE_DEVICE_CONFIG
@@ -3198,7 +3227,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("Pybind")) {
             config = PYBIND_CONFIG
         }
-        runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3])
+        runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3])
     }]]}
     fullSet = parallelJobs.keySet()

@@ -3219,9 +3248,12 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
         "DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
         // Perf sanity post merge test
-        // "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 1, 4],
-        // "DGX_B200-8_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 1, 8],
-        // "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "l0_dgx_b300_perf_sanity", 1, 1, 4],
+        // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 3, 4],
+        // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x4", "l0_dgx_b200_perf_sanity", 2, 3, 4],
+        // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x4", "l0_dgx_b200_perf_sanity", 3, 3, 4],
+        // "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 3, 8],
+        // "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x8", "l0_dgx_b200_perf_sanity", 2, 3, 8],
+        // "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x8", "l0_dgx_b200_perf_sanity", 3, 3, 8],
     ]
     fullSet += x86SlurmTestConfigs.keySet()

@@ -3233,7 +3265,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
-        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
+        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
     }]]}

     parallelJobs += parallelSlurmJobs
@@ -3252,11 +3284,30 @@ def launchTestJobs(pipeline, testFilter)
         "GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
         "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
         "GB10-PyTorch-Post-Merge-1": ["gb10x-single", "l0_gb10", 1, 1],
-        // Perf sanity post merge test
-        "GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 1, 4],
         // Disable GB300 stages due to nodes will be offline temporarily.
         // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
         // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
+        // Perf sanity pre merge test
+        "GB200-4_GPUs-PyTorch-PerfSanity-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
+        // Perf sanity post merge test
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-7": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 7, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-8": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 8, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-9": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 9, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-10": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 10, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-13": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 13, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-14": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 14, 14, 4],
     ]
     fullSet += SBSASlurmTestConfigs.keySet()

@@ -3268,13 +3319,15 @@ def launchTestJobs(pipeline, testFilter)
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
-        // Perf sanity post merge aggr tests
-        "GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001", 1, 1, 8, 2],
-        // Perf sanity post merge disagg tests
-        "GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001", 1, 1, 12, 3],
-        // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001", 1, 1, 24, 6],
-        // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002", 1, 1, 24, 6],
-        // "GB200-32_GPUs-8_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001", 1, 1, 32, 8],
+        // Perf sanity pre merge tests
+        // "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
+        // Perf sanity post merge tests
+        "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 2, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 2, 8, 2],
+        "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
+        // "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 1, 2, 24, 6],
+        // "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 2, 2, 24, 6],
+        // "GB200-32_GPUs-8_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes", 1, 1, 32, 8],
     ]
     fullSet += multiNodesSBSAConfigs.keySet()

@@ -3292,7 +3345,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
-        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
+        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
     }]]}
     parallelJobs += parallelSlurmJobs

@@ -3305,7 +3358,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
-        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2, values[6] ?: false)
+        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2, values[6] ?: false)
     }]]}

     parallelJobs += parallelMultiNodesSBSAJobs
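
For reference, both new "Check perf result" stages apply the same gate: run perf_regression_check.py against the job workspace and fail the build on a non-zero exit code. Below is a minimal sketch of that pattern in a scripted Jenkins pipeline context; the helper name checkPerfRegression is illustrative only and not part of this change, which invokes the script via Utils.exec over SSH for the Slurm path and via sh for the local path.

```groovy
// Sketch only: mirrors the exit-code gate wired in above.
def checkPerfRegression(String perfCheckScript, String jobWorkspace) {
    // perf_regression_check.py signals a detected regression through its exit code;
    // returnStatus: true captures that code instead of aborting the step.
    def perfCheckResult = sh(
        script: "python3 ${perfCheckScript} ${jobWorkspace}",
        returnStatus: true
    )
    if (perfCheckResult != 0) {
        error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
    }
}
```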