Commit fdc0368

yiqingy0 and chzblych authored

[TRTLLM-10016][infra] Use SlurmPartition attribute time as timeout threshold (#10254)

Signed-off-by: Yiqing Yan <[email protected]>
Co-authored-by: Yanchao Lu <[email protected]>
1 parent fad0005 · commit fdc0368

File tree: 1 file changed (+13 -8 lines)

jenkins/L0_Test.groovy: 13 additions & 8 deletions
@@ -697,9 +697,13 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
 
     slurmRunner = null
     if (cluster.containerRuntime.toString() == "DOCKER") {
-        slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
+        echo "${stageName} partitionTimeout: ${partition.time}"
+        def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
+        slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, partitionTimeout, true)
     } else if (cluster.containerRuntime.toString() == "ENROOT") {
-        slurmRunner = runInEnrootOnNode(nodeName)
+        echo "${stageName} partitionTimeout: ${partition.time}"
+        def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
+        slurmRunner = runInEnrootOnNode(nodeName, partitionTimeout)
     } else {
        throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}")
     }
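
For readers skimming the hunk, a minimal, self-contained Groovy sketch of the fallback pattern it introduces. SlurmPartition and SlurmConfig below are stand-in stubs, not the repo's actual class definitions; the 240-minute default mirrors the DEFAULT_TIMEOUT_SHORT comment removed further down in this diff.

    // Stand-in stubs; the repo's real SlurmConfig/SlurmPartition differ.
    class SlurmConfig {
        static final int DEFAULT_TIMEOUT_SHORT = 240 // minutes, per the old comment
    }

    class SlurmPartition {
        String name
        Integer time // timeout in minutes; null when the partition sets no limit
    }

    // Fall back to the short default when the partition carries no time attribute.
    int resolveTimeout(SlurmPartition partition) {
        partition?.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
    }

    assert resolveTimeout(new SlurmPartition(name: 'gpu4h', time: 300)) == 300
    assert resolveTimeout(new SlurmPartition(name: 'default')) == 240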
@@ -1133,6 +1137,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 #SBATCH --output=${outputPath}
 ${taskArgs.collect { "#SBATCH $it" }.join('\n')}
 #SBATCH ${partition.additionalArgs}
+${partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"}
 ${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}
 
 # SBATCH directives must appear before any executable commands.
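
As a hedged illustration of how the new directive renders when the partition has no time set (a map literal stands in for the real partition object, and 240 mirrors SlurmConfig.DEFAULT_TIMEOUT_SHORT):

    // Hypothetical rendering of the new #SBATCH line; the map stands in for
    // the real partition object and 240 mirrors DEFAULT_TIMEOUT_SHORT.
    final int DEFAULT_TIMEOUT_SHORT = 240
    def partition = [name: 'gpu-long', time: null]
    def timeDirective = partition?.time ?
            "#SBATCH --time=${partition.time}" :
            "#SBATCH --time=${DEFAULT_TIMEOUT_SHORT}"
    assert timeDirective == '#SBATCH --time=240' // Slurm reads a bare integer as minutes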
@@ -3013,7 +3018,7 @@ def ensureStageResultNotUploaded(stageName) {
 }
 
 // TODO: Update existing functions to use runInDockerOnNodeMultiStage and get rid of runInDockerOnNode
-def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
+def runInDockerOnNodeMultiStage(image, label, dockerArgs, partitionTimeout, needToDeleteDir=true)
 {
     return {
         runner -> node(label) {
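
One thing to note about this signature change, sketched below with a hypothetical run helper: partitionTimeout is inserted before the defaulted needToDeleteDir, so every positional call site shifts, which is why the first hunk updates the callers in the same commit.

    // Hypothetical sketch of the parameter-order effect; 'run' is illustrative.
    def run(image, label, dockerArgs, partitionTimeout, needToDeleteDir = true) {
        [image: image, timeout: partitionTimeout, delete: needToDeleteDir]
    }

    assert run('img', 'node-a', '--gpus all', 240).delete == true   // default kept
    assert run('img', 'node-a', '--gpus all', 240, false).delete == false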
@@ -3024,9 +3029,9 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
             stage('Pull Docker Image') {
                 docker.image(image).pull()
             }
-            // We submit the Slurm job with SlurmConfig.DEFAULT_TIMEOUT minutes (300) timeout
+            // We submit the Slurm job with the Slurm partition's time spec.
             // Minus 10 minutes to avoid the Slurm job being stopped earlier.
-            timeout(time: SlurmConfig.DEFAULT_TIMEOUT - 10, unit: 'MINUTES') {
+            timeout(time: partitionTimeout - 10, unit: 'MINUTES') {
                 docker.image(image).inside(dockerArgs) {
                     runner()
                 }
@@ -3042,13 +3047,13 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
     }
 }
 
-def runInEnrootOnNode(label)
+def runInEnrootOnNode(label, partitionTimeout)
 {
     return {
         runner -> node(label) {
-            // We submit the Slurm job with SlurmConfig.DEFAULT_TIMEOUT_SHORT minutes (240) timeout
+            // We submit the Slurm job with the Slurm partition's time spec.
             // Minus 10 minutes to avoid the Slurm job being stopped earlier.
-            timeout(time: SlurmConfig.DEFAULT_TIMEOUT_SHORT - 10, unit: 'MINUTES') {
+            timeout(time: partitionTimeout - 10, unit: 'MINUTES') {
                 runner()
             }
         }
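
Both runners keep the same 10-minute headroom between the Jenkins-side timeout and the Slurm allocation, so the pipeline aborts (and can still clean up and report results) before Slurm kills the job. A sketch of the pattern as a Jenkins Pipeline fragment; timeout is the standard Pipeline step, while partitionTimeout and runner are illustrative names:

    // Jenkins Pipeline fragment, not standalone Groovy: stop 10 minutes
    // before the Slurm allocation expires so teardown still runs in-pipeline.
    def partitionTimeout = 240 // minutes, resolved from partition.time
    timeout(time: partitionTimeout - 10, unit: 'MINUTES') {
        runner() // the actual test workload
    }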
