@@ -697,9 +697,13 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
 
     slurmRunner = null
     if (cluster.containerRuntime.toString() == "DOCKER") {
-        slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
+        echo "${stageName} partitionTimeout: ${partition.time}"
+        def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
+        slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, partitionTimeout, true)
     } else if (cluster.containerRuntime.toString() == "ENROOT") {
-        slurmRunner = runInEnrootOnNode(nodeName)
+        echo "${stageName} partitionTimeout: ${partition.time}"
+        def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
+        slurmRunner = runInEnrootOnNode(nodeName, partitionTimeout)
     } else {
         throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}")
     }
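Both branches repeat the same null check, which Groovy's Elvis operator can abbreviate. A standalone sketch of the fallback, using the 240-minute default cited in the old `runInEnrootOnNode` comment (the map literal is a stand-in, not the pipeline's real `partition` object):

```groovy
// Hypothetical illustration of the timeout fallback used above.
def partition = [time: null]        // no per-partition limit configured
def DEFAULT_TIMEOUT_SHORT = 240     // minutes, per the old runInEnrootOnNode comment

// `?:` returns the left operand unless it is null/false, so a missing
// partition time falls back to the default.
def partitionTimeout = partition.time ?: DEFAULT_TIMEOUT_SHORT
assert partitionTimeout == 240
```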
@@ -889,7 +893,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     // Create a unique suffix for the job name
     String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
     def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
-    def disaggMode = stageName.contains("Perf-Sanity-Disagg")
+    def perfSanityMode = stageName.contains("PerfSanity")
+    def disaggMode = stageName.contains("PerfSanity-Disagg")
     def setSegment = disaggMode
 
     Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
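The stage name doubles as a feature flag here, and since disagg stages embed `PerfSanity-Disagg`, `disaggMode` implies `perfSanityMode`. A small illustration using a stage name added later in this diff:

```groovy
// Stage name taken from the multi-node configs in this diff.
def stageName = "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1"
def perfSanityMode = stageName.contains("PerfSanity")       // true
def disaggMode = stageName.contains("PerfSanity-Disagg")    // true: disagg stages are a subset
assert perfSanityMode && disaggMode
```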
@@ -1090,7 +1095,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     // Define environment variables to export
     def envVarNames = [
         'OPEN_SEARCH_DB_BASE_URL',
-        'OPEN_SEARCH_DB_CREDENTIALS',
+        'OPEN_SEARCH_DB_CREDENTIALS_USR',
+        'OPEN_SEARCH_DB_CREDENTIALS_PSW',
         'BUILD_ID',
         'BUILD_URL',
         'JOB_NAME',
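Splitting `OPEN_SEARCH_DB_CREDENTIALS` into `_USR`/`_PSW` matches the Jenkins convention for username/password credentials: binding one credential exposes the two parts under those suffixed names, so the sbatch side can export them individually. A sketch of one way such a binding can look (the `credentialsId` is an assumption, not taken from this diff):

```groovy
// Sketch, assuming the credential is bound via the Credentials Binding plugin;
// declarative pipelines get the same _USR/_PSW pair from credentials('...').
withCredentials([usernamePassword(
        credentialsId: 'OPEN_SEARCH_DB_CREDENTIALS',        // hypothetical ID
        usernameVariable: 'OPEN_SEARCH_DB_CREDENTIALS_USR',
        passwordVariable: 'OPEN_SEARCH_DB_CREDENTIALS_PSW')]) {
    sh 'echo "user is $OPEN_SEARCH_DB_CREDENTIALS_USR"'     // password stays masked in logs
}
```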
@@ -1133,6 +1139,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     #SBATCH --output=${outputPath}
     ${taskArgs.collect { "#SBATCH $it" }.join('\n')}
     #SBATCH ${partition.additionalArgs}
+    ${partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"}
     ${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}
 
     # SBATCH directives must appear before any executable commands.
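What the new line renders to, assuming `partition.time` and `SlurmConfig.DEFAULT_TIMEOUT_SHORT` are minute counts (sbatch interprets a bare integer `--time` value as minutes). The map literals below are hypothetical stand-ins:

```groovy
// Hypothetical rendering of the new #SBATCH --time directive.
def SlurmConfig = [DEFAULT_TIMEOUT_SHORT: 240]   // stand-in for the real config class
def partition = [time: 300]
def line = partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"
assert line == "#SBATCH --time=300"
```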
@@ -2780,7 +2787,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
         error "Some tests still failed after rerun attempts, please check the test report."
     }
 
-    if (perfMode && !stageName.contains("Perf-Sanity")) {
+    if (perfMode) {
         basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv"
         basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}"
         stage("Check perf result") {
@@ -2806,7 +2813,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
         }
     }
 
-    if (perfMode && stageName.contains("Perf-Sanity")) {
+    if (stageName.contains("PerfSanity")) {
         stage("Check perf result") {
             def perfCheckResult = sh(
                 script: """
@@ -2815,10 +2822,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
                 """,
                 returnStatus: true
             )
-            // TODO: Enable this when perf regression check is stable
-            // if (perfCheckResult != 0) {
-            //     error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
-            // }
+            if (perfCheckResult != 0) {
+                error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
+            }
         }
     }
 }
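The gate works because `sh(returnStatus: true)` hands back the script's exit code instead of failing the step, leaving the decision to the pipeline; `error` then fails the build explicitly. A minimal sketch with a hypothetical script name:

```groovy
// Sketch: capture the exit code without aborting, then fail deliberately.
def rc = sh(script: './check_perf_regression.sh', returnStatus: true)  // illustrative command
if (rc != 0) {
    // error marks the build failed and stops this branch of the pipeline.
    error "Performance regression detected (exit code: ${rc})"
}
```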
@@ -3013,7 +3019,7 @@ def ensureStageResultNotUploaded(stageName) {
 }
 
 // TODO: Update existing functions to use runInDockerOnNodeMultiStage and get rid of runInDockerOnNode
-def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
+def runInDockerOnNodeMultiStage(image, label, dockerArgs, partitionTimeout, needToDeleteDir=true)
 {
     return {
         runner -> node(label) {
@@ -3024,9 +3030,9 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
             stage('Pull Docker Image') {
                 docker.image(image).pull()
             }
-            // We submit the Slurm job with SlurmConfig.DEFAULT_TIMEOUT minutes (300) timeout
+            // We submit the Slurm job with the Slurm partition's time spec.
             // Minus 10 minutes to avoid the Slurm job being stopped earlier.
-            timeout(time: SlurmConfig.DEFAULT_TIMEOUT - 10, unit: 'MINUTES') {
+            timeout(time: partitionTimeout - 10, unit: 'MINUTES') {
                 docker.image(image).inside(dockerArgs) {
                     runner()
                 }
@@ -3042,13 +3048,13 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
         }
     }
 }
 
-def runInEnrootOnNode(label)
+def runInEnrootOnNode(label, partitionTimeout)
 {
     return {
         runner -> node(label) {
-            // We submit the Slurm job with SlurmConfig.DEFAULT_TIMEOUT_SHORT minutes (240) timeout
+            // We submit the Slurm job with the Slurm partition's time spec.
             // Minus 10 minutes to avoid the Slurm job being stopped earlier.
-            timeout(time: SlurmConfig.DEFAULT_TIMEOUT_SHORT - 10, unit: 'MINUTES') {
+            timeout(time: partitionTimeout - 10, unit: 'MINUTES') {
                 runner()
             }
         }
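Both helpers return a closure that receives the test body as `runner`, and the Jenkins `timeout` sits 10 minutes below the Slurm limit so Jenkins gives up cleanly before Slurm kills the job. A sketch of the intended call shape, with an illustrative label, timeout, and command:

```groovy
// Hypothetical call site: build the runner, then hand it the actual work.
def slurmRunner = runInEnrootOnNode('gb200-x4-oci', 240)   // 240-minute partition limit
slurmRunner {
    sh './run_testlist.sh'   // runs on the node, bounded to 240 - 10 = 230 minutes
}
```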
@@ -3182,7 +3188,7 @@ def launchTestJobs(pipeline, testFilter)
         "RTXPro6000D-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 2, 2, 4],
     ]
 
-    parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
+    parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("-Perf-")), {
         def config = VANILLA_CONFIG
         if (key.contains("single-device")) {
             config = SINGLE_DEVICE_CONFIG
@@ -3193,7 +3199,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("Pybind")) {
             config = PYBIND_CONFIG
         }
-        runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3])
+        runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3])
     }]]}
     fullSet = parallelJobs.keySet()
 
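Tightening `contains("Perf")` to `contains("-Perf-")` is what keeps the renamed `PerfSanity` stages out of the legacy perf path: with the inner hyphen gone from the name, the delimited token no longer matches. A quick illustration:

```groovy
// "A100X-Perf-1" is a hypothetical legacy perf stage name; the PerfSanity
// name is taken from the configs in this diff.
assert  "A100X-Perf-1".contains("-Perf-")
assert !"GB200-4_GPUs-PyTorch-PerfSanity-4".contains("-Perf-")
assert  "GB200-4_GPUs-PyTorch-PerfSanity-4".contains("Perf")   // the old check matched these
```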
@@ -3214,9 +3220,12 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
         "DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
         // Perf sanity post merge test
-        // "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 1, 4],
-        // "DGX_B200-8_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 1, 8],
-        // "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "l0_dgx_b300_perf_sanity", 1, 1, 4],
+        // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 3, 4],
+        // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x4", "l0_dgx_b200_perf_sanity", 2, 3, 4],
+        // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x4", "l0_dgx_b200_perf_sanity", 3, 3, 4],
+        // "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 3, 8],
+        // "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x8", "l0_dgx_b200_perf_sanity", 2, 3, 8],
+        // "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x8", "l0_dgx_b200_perf_sanity", 3, 3, 8],
     ]
     fullSet += x86SlurmTestConfigs.keySet()
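For readers decoding the tuples: judging from how `values` is indexed in the `collectEntries` closures, each entry appears to be `[node label, test list, shard index, shard count, GPU count]`, so `1, 3, 4` means shard 1 of 3 on 4 GPUs. A sketch of that (unverified) reading, with field names of my own choosing:

```groovy
// Hypothetical decoding of one entry; indices match values[0]..values[4] usage.
def entry = ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 3, 4]
def (nodeLabel, testList, shardId, shardCount, gpuCount) = entry
assert shardId <= shardCount   // shard 1 of 3, on 4 GPUs
```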
32223231
@@ -3228,7 +3237,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
-        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
+        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
     }]]}
 
     parallelJobs += parallelSlurmJobs
@@ -3247,11 +3256,25 @@ def launchTestJobs(pipeline, testFilter)
         "GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
         "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
         "GB10-PyTorch-Post-Merge-1": ["gb10x-single", "l0_gb10", 1, 1],
-        // Perf sanity post merge test
-        "GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 1, 4],
         // Disable GB300 stages due to nodes will be offline temporarily.
         // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
         // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
+        // Perf sanity pre merge test
+        "GB200-4_GPUs-PyTorch-PerfSanity-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
+        // Perf sanity post merge test
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-7": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 7, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-8": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 8, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-9": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 9, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-10": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 10, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-13": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 13, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-14": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 14, 14, 4],
     ]
     fullSet += SBSASlurmTestConfigs.keySet()
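The 14 shards of `l0_gb200_multi_gpus_perf_sanity` are split between triggers: pre-merge runs shards 4-6, 11, and 12, and post-merge runs the remaining nine, so together they cover the list exactly once. A quick check of that partition:

```groovy
// Shard indices copied from the config entries above.
def preMerge  = [4, 5, 6, 11, 12]
def postMerge = [1, 2, 3, 7, 8, 9, 10, 13, 14]
assert (preMerge + postMerge).sort() == (1..14).toList()   // full coverage
assert preMerge.intersect(postMerge).isEmpty()             // no overlap
```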
32573280
@@ -3263,13 +3286,15 @@ def launchTestJobs(pipeline, testFilter)
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
-        // Perf sanity post merge aggr tests
-        "GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001", 1, 1, 8, 2],
-        // Perf sanity post merge disagg tests
-        "GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001", 1, 1, 12, 3],
-        // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001", 1, 1, 24, 6],
-        // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002", 1, 1, 24, 6],
-        // "GB200-32_GPUs-8_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001", 1, 1, 32, 8],
+        // Perf sanity pre merge tests
+        // "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
+        // Perf sanity post merge tests
+        "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 2, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 2, 8, 2],
+        "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
+        // "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 1, 2, 24, 6],
+        // "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 2, 2, 24, 6],
+        // "GB200-32_GPUs-8_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes", 1, 1, 32, 8],
     ]
     fullSet += multiNodesSBSAConfigs.keySet()
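Multi-node entries carry a sixth field that the multi-node `collectEntries` below consumes as `values[5] ?: 2`, which reads as a node count. A sketch of that interpretation (field meanings assumed, not documented in the diff):

```groovy
// Field names assumed; indices match the values[...] usage below.
def entry = ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3]
def gpuCount  = entry[4] ?: 1    // 12 GPUs in total
def nodeCount = entry[5] ?: 2    // spread over 3 nodes
assert gpuCount.intdiv(nodeCount) == 4   // 4 GPUs per node, matching the gb200-x4 hardware
```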
32753300
@@ -3287,7 +3312,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
-        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
+        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
     }]]}
     parallelJobs += parallelSlurmJobs
 
@@ -3300,7 +3325,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
-        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2, values[6] ?: false)
+        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2, values[6] ?: false)
     }]]}
 
     parallelJobs += parallelMultiNodesSBSAJobs