@@ -887,7 +887,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
887887 // Create a unique suffix for the job name
888888 String customSuffix = " ${ env.BUILD_TAG} -${ UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)} " . toLowerCase()
889889 def jobUID = " ${ cluster.host} -multi_node_test-${ customSuffix} "
890- def disaggMode = stageName. contains(" Perf-Sanity -Disagg" )
890+ def disaggMode = stageName. contains(" PerfSanity -Disagg" )
891891 def setSegment = disaggMode
892892
893893 Utils . exec(pipeline, script : " env | sort && pwd && ls -alh" )
@@ -1245,6 +1245,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
12451245 ),
12461246 numRetries : 3
12471247 )
1248+
1249+ if (stageName. contains(" PerfSanity" )) {
1250+ stage(" [${ stageName} ] Check perf result" ) {
1251+ def perfCheckResult = Utils . exec(
1252+ pipeline,
1253+ script : Utils . sshUserCmd(
1254+ remote,
1255+ " \" python3 ${ llmSrc} /tests/integration/defs/perf/perf_regression_check.py ${ WORKSPACE} /${ stageName} \" "
1256+ ),
1257+ returnStatus : true
1258+ )
1259+ if (perfCheckResult != 0 ) {
1260+ error " Performance regression detected and failing the build (exit code: ${ perfCheckResult} )"
1261+ }
1262+ }
1263+ }
12481264 }
12491265
12501266 echo " Finished test stage execution."
@@ -2698,7 +2714,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
26982714 error " Some tests still failed after rerun attempts, please check the test report."
26992715 }
27002716
2701- if (perfMode && ! stageName . contains( " Perf-Sanity " ) ) {
2717+ if (perfMode) {
27022718 basePerfFilename = stageName. contains(" PyTorch" ) ? " base_perf_pytorch.csv" : " base_perf.csv"
27032719 basePerfPath = " ${ llmSrc} /tests/integration/defs/perf/${ basePerfFilename} "
27042720 stage(" Check perf result" ) {
@@ -2724,7 +2740,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
27242740 }
27252741 }
27262742
2727- if (perfMode && stageName. contains(" Perf-Sanity " )) {
2743+ if (stageName. contains(" PerfSanity " )) {
27282744 stage (" Check perf result" ) {
27292745 def perfCheckResult = sh(
27302746 script : """
@@ -2733,10 +2749,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
27332749 """ ,
27342750 returnStatus : true
27352751 )
2736- // TODO: Enable this when perf regression check is stable
2737- // if (perfCheckResult != 0) {
2738- // error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
2739- // }
2752+ if (perfCheckResult != 0 ) {
2753+ error " Performance regression detected and failing the build (exit code: ${ perfCheckResult} )"
2754+ }
27402755 }
27412756 }
27422757 }
@@ -3100,7 +3115,7 @@ def launchTestJobs(pipeline, testFilter)
31003115 " RTXPro6000D-4_GPUs-PyTorch-Post-Merge-2" : [" rtx-pro-6000d-x4" , " l0_rtx_pro_6000" , 2 , 2 , 4 ],
31013116 ]
31023117
3103- parallelJobs = x86TestConfigs. collectEntries{key , values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE , values[0 ], " amd64" , values[4 ] ?: 1 , key. contains(" Perf" )), {
3118+ parallelJobs = x86TestConfigs. collectEntries{key , values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE , values[0 ], " amd64" , values[4 ] ?: 1 , key. contains(" - Perf- " )), {
31043119 def config = VANILLA_CONFIG
31053120 if (key. contains(" single-device" )) {
31063121 config = SINGLE_DEVICE_CONFIG
@@ -3111,7 +3126,7 @@ def launchTestJobs(pipeline, testFilter)
31113126 if (key. contains(" Pybind" )) {
31123127 config = PYBIND_CONFIG
31133128 }
3114- runLLMTestlistOnPlatform(pipeline, values[0 ], values[1 ], config, key. contains(" Perf" ), key, values[2 ], values[3 ])
3129+ runLLMTestlistOnPlatform(pipeline, values[0 ], values[1 ], config, key. contains(" - Perf- " ), key, values[2 ], values[3 ])
31153130 }]]}
31163131 fullSet = parallelJobs. keySet()
31173132
@@ -3132,9 +3147,9 @@ def launchTestJobs(pipeline, testFilter)
31323147 " DGX_B300-4_GPUs-PyTorch-Post-Merge-1" : [" b300-x4" , " l0_dgx_b300" , 1 , 2 , 4 ],
31333148 " DGX_B300-4_GPUs-PyTorch-Post-Merge-2" : [" b300-x4" , " l0_dgx_b300" , 2 , 2 , 4 ],
31343149 // Perf sanity post merge test
3135- // "DGX_B200-4_GPUs-PyTorch-Perf-Sanity -Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 1, 4],
3136- // "DGX_B200-8_GPUs-PyTorch-Perf-Sanity -Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 1, 8],
3137- // "DGX_B300-4_GPUs-PyTorch-Perf-Sanity -Post-Merge-1": ["b300-x4", "l0_dgx_b300_perf_sanity", 1, 1, 4],
3150+ // "DGX_B200-4_GPUs-PyTorch-PerfSanity -Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 1, 4],
3151+ // "DGX_B200-8_GPUs-PyTorch-PerfSanity -Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 1, 8],
3152+ // "DGX_B300-4_GPUs-PyTorch-PerfSanity -Post-Merge-1": ["b300-x4", "l0_dgx_b300_perf_sanity", 1, 1, 4],
31383153 ]
31393154 fullSet + = x86SlurmTestConfigs. keySet()
31403155
@@ -3146,7 +3161,7 @@ def launchTestJobs(pipeline, testFilter)
31463161 if (key. contains(" llvm" )) {
31473162 config = LLVM_CONFIG
31483163 }
3149- runLLMTestlistOnSlurm(pipeline, values[0 ], values[1 ], config, key. contains(" Perf" ), key, values[2 ], values[3 ], values[4 ] ?: 1 , values[5 ] ?: 1 , values[6 ] ?: false )
3164+ runLLMTestlistOnSlurm(pipeline, values[0 ], values[1 ], config, key. contains(" - Perf- " ), key, values[2 ], values[3 ], values[4 ] ?: 1 , values[5 ] ?: 1 , values[6 ] ?: false )
31503165 }]]}
31513166
31523167 parallelJobs + = parallelSlurmJobs
@@ -3162,11 +3177,19 @@ def launchTestJobs(pipeline, testFilter)
31623177 " GB200-4_GPUs-PyTorch-1" : [" gb200-x4-oci" , " l0_gb200_multi_gpus" , 1 , 2 , 4 ],
31633178 " GB200-4_GPUs-PyTorch-2" : [" gb200-x4-oci" , " l0_gb200_multi_gpus" , 2 , 2 , 4 ],
31643179 " GB200-4_GPUs-PyTorch-Post-Merge-1" : [" gb200-x4-oci" , " l0_gb200_multi_gpus" , 1 , 1 , 4 ],
3165- // Perf sanity post merge test
3166- " GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1" : [" gb200-x4-oci" , " l0_gb200_multi_gpus_perf_sanity" , 1 , 1 , 4 ],
31673180 // Disable GB300 stages because the nodes will be temporarily offline.
31683181 // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
31693182 // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
3183+ // Perf sanity pre merge test
3184+ " GB200-4_GPUs-PyTorch-PerfSanity-2" : [" gb200-x4-oci" , " l0_gb200_multi_gpus_perf_sanity" , 2 , 6 , 4 ],
3185+ " GB200-4_GPUs-PyTorch-PerfSanity-4" : [" gb200-x4-oci" , " l0_gb200_multi_gpus_perf_sanity" , 4 , 6 , 4 ],
3186+ // Perf sanity post merge test
3187+ " GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1" : [" gb200-x4-oci" , " l0_gb200_multi_gpus_perf_sanity" , 1 , 6 , 4 ],
3188+ " GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2" : [" gb200-x4-oci" , " l0_gb200_multi_gpus_perf_sanity" , 2 , 6 , 4 ],
3189+ " GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3" : [" gb200-x4-oci" , " l0_gb200_multi_gpus_perf_sanity" , 3 , 6 , 4 ],
3190+ " GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-4" : [" gb200-x4-oci" , " l0_gb200_multi_gpus_perf_sanity" , 4 , 6 , 4 ],
3191+ " GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-5" : [" gb200-x4-oci" , " l0_gb200_multi_gpus_perf_sanity" , 5 , 6 , 4 ],
3192+ " GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-6" : [" gb200-x4-oci" , " l0_gb200_multi_gpus_perf_sanity" , 6 , 6 , 4 ],
31703193 ]
31713194 fullSet + = SBSASlurmTestConfigs . keySet()
31723195
@@ -3178,13 +3201,15 @@ def launchTestJobs(pipeline, testFilter)
31783201 " GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1" : [" gb200-oci-trtllm" , " l0_gb200_multi_nodes" , 1 , 3 , 8 , 2 ],
31793202 " GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2" : [" gb200-oci-trtllm" , " l0_gb200_multi_nodes" , 2 , 3 , 8 , 2 ],
31803203 " GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3" : [" gb200-oci-trtllm" , " l0_gb200_multi_nodes" , 3 , 3 , 8 , 2 ],
3181- // Perf sanity post merge aggr tests
3182- " GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1" : [" gb200-oci-trtllm" , " l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001" , 1 , 1 , 8 , 2 ],
3183- // Perf sanity post merge disagg tests
3184- " GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1" : [" gb200-oci-trtllm" , " l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001" , 1 , 1 , 12 , 3 ],
3185- // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001", 1, 1, 24, 6],
3186- // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002", 1, 1, 24, 6],
3187- // "GB200-32_GPUs-8_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001", 1, 1, 32, 8],
3204+ // Perf sanity pre merge tests
3205+ " GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-1" : [" gb200-oci-trtllm" , " l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes" , 1 , 1 , 12 , 3 ],
3206+ // Perf sanity post merge tests
3207+ " GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1" : [" gb200-oci-trtllm" , " l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes" , 1 , 2 , 8 , 2 ],
3208+ " GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2" : [" gb200-oci-trtllm" , " l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes" , 2 , 2 , 8 , 2 ],
3209+ " GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1" : [" gb200-oci-trtllm" , " l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes" , 1 , 1 , 12 , 3 ],
3210+ // "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 1, 2, 24, 6],
3211+ // "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 2, 2, 24, 6],
3212+ // "GB200-32_GPUs-8_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes", 1, 1, 32, 8],
31883213 ]
31893214 fullSet + = multiNodesSBSAConfigs. keySet()
31903215
@@ -3202,7 +3227,7 @@ def launchTestJobs(pipeline, testFilter)
32023227 if (key. contains(" llvm" )) {
32033228 config = LLVM_CONFIG
32043229 }
3205- runLLMTestlistOnSlurm(pipeline, values[0 ], values[1 ], config, key. contains(" Perf" ), key, values[2 ], values[3 ], values[4 ] ?: 1 , values[5 ] ?: 1 , values[6 ] ?: false )
3230+ runLLMTestlistOnSlurm(pipeline, values[0 ], values[1 ], config, key. contains(" - Perf- " ), key, values[2 ], values[3 ], values[4 ] ?: 1 , values[5 ] ?: 1 , values[6 ] ?: false )
32063231 }]]}
32073232 parallelJobs + = parallelSlurmJobs
32083233
@@ -3215,7 +3240,7 @@ def launchTestJobs(pipeline, testFilter)
32153240 if (key. contains(" llvm" )) {
32163241 config = LLVM_CONFIG
32173242 }
3218- runLLMTestlistOnSlurm(pipeline, values[0 ], values[1 ], config, key. contains(" Perf" ), key, values[2 ], values[3 ], values[4 ] ?: 1 , values[5 ] ?: 2 , values[6 ] ?: false )
3243+ runLLMTestlistOnSlurm(pipeline, values[0 ], values[1 ], config, key. contains(" - Perf- " ), key, values[2 ], values[3 ], values[4 ] ?: 1 , values[5 ] ?: 2 , values[6 ] ?: false )
32193244 }]]}
32203245
32213246 parallelJobs + = parallelMultiNodesSBSAJobs
0 commit comments