@@ -697,9 +697,13 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
697697
698698 slurmRunner = null
699699 if (cluster. containerRuntime. toString() == " DOCKER" ) {
700- slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE , nodeName, dockerArgs, true )
700+ echo " ${ stageName} partitionTimeout: ${ partition.time} "
701+ def partitionTimeout = partition. time ? partition. time : SlurmConfig . DEFAULT_TIMEOUT_SHORT
702+ slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE , nodeName, dockerArgs, partitionTimeout, true )
701703 } else if (cluster. containerRuntime. toString() == " ENROOT" ) {
702- slurmRunner = runInEnrootOnNode(nodeName)
704+ echo " ${ stageName} partitionTimeout: ${ partition.time} "
705+ def partitionTimeout = partition. time ? partition. time : SlurmConfig . DEFAULT_TIMEOUT_SHORT
706+ slurmRunner = runInEnrootOnNode(nodeName, partitionTimeout)
703707 } else {
704708 throw new Exception (" Unsupported container runtime: ${ cluster.containerRuntime} " )
705709 }
@@ -1133,6 +1137,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11331137 #SBATCH --output=${ outputPath}
11341138 ${ taskArgs.collect { "#SBATCH $it" }.join('\n')}
11351139 #SBATCH ${ partition.additionalArgs}
1140+ ${ partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"}
11361141 ${ (partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}
11371142
11381143 # SBATCH directives must appear before any executable commands.
@@ -3013,7 +3018,7 @@ def ensureStageResultNotUploaded(stageName) {
30133018}
30143019
30153020// TODO: Update existing functions to use runInDockerOnNodeMultiStage and get rid of runInDockerOnNode
3016- def runInDockerOnNodeMultiStage (image , label , dockerArgs , needToDeleteDir = true )
3021+ def runInDockerOnNodeMultiStage (image , label , dockerArgs , partitionTimeout , needToDeleteDir = true )
30173022{
30183023 return {
30193024 runner -> node(label) {
@@ -3024,9 +3029,9 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
30243029 stage(' Pull Docker Image' ) {
30253030 docker. image(image). pull()
30263031 }
3027- // We submit the Slurm job with SlurmConfig.DEFAULT_TIMEOUT minutes (300) timeout
3032+ // We submit the Slurm job with the Slurm partition's time spec.
30283033 // Minus 10 minutes to avoid the Slurm job being stopped earlier.
3029- timeout(time : SlurmConfig . DEFAULT_TIMEOUT - 10 , unit : ' MINUTES' ) {
3034+ timeout(time : partitionTimeout - 10 , unit : ' MINUTES' ) {
30303035 docker. image(image). inside(dockerArgs) {
30313036 runner()
30323037 }
@@ -3042,13 +3047,13 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
30423047 }
30433048}
30443049
def runInEnrootOnNode(label, partitionTimeout)
{
    // Returns a closure that runs the given stage body directly on a Slurm
    // node using the Enroot container runtime (no Docker pull needed).
    //
    // The Slurm job is submitted with the partition's time limit, so the
    // Jenkins-side timeout is set 10 minutes shorter to make sure Jenkins
    // aborts the stage before Slurm kills the job from underneath it.
    //
    // NOTE(review): assumes partitionTimeout is an integer number of minutes
    // (like SlurmConfig.DEFAULT_TIMEOUT_SHORT) — confirm partition.time is
    // never an "HH:MM:SS"-style Slurm time spec, or the subtraction breaks.
    return { stageBody ->
        node(label) {
            timeout(time: partitionTimeout - 10, unit: 'MINUTES') {
                stageBody()
            }
        }
    }
}