Skip to content

Commit 8fefa2c

Browse files
authored
[None][infra] Fail fast if SLURM entrypoint fails (#9744)
Signed-off-by: Matt Lefebvre <[email protected]>
1 parent e343029 commit 8fefa2c

File tree

1 file changed

+13
-0
lines changed

1 file changed

+13
-0
lines changed

jenkins/L0_Test.groovy

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -625,6 +625,19 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
625 625
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"scontrol release ${slurmJobID} || true\""), numRetries: 3)
626 626
}
627 627
counter++
628+
// If entrypoint script fails to start, do not poll for agent connection
629+
try {
630+
SlurmConfig.checkJobStatus(pipeline, cluster, slurmJobID, remote)
631+
} catch (InterruptedException e) {
632+
throw e
633+
} catch (Exception e) {
634+
// If the exception is about job being inactive, enrich it with log path
635+
if (e.message.contains("is no longer active")) {
636+
throw new Exception("${e.message}. Check SLURM logs at /home/svc_tensorrt/slurm-logs/slurm-${slurmJobID}-${nodeName}.out on ${cluster.host}")
637+
}
638+
// Otherwise, log the error but continue (SSH might be temporarily unavailable)
639+
pipeline.echo("Warning: Could not check SLURM job status: ${e.message}")
640+
}
628 641
}
629 642
}
630 643

0 commit comments

Comments
 (0)