
Commit a6ca621

yuanjingx87 authored and videodanchik committed
[None][infra] add retry logic to get slurm sbatch job log when ssh dropped (NVIDIA#9167)
Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com>
Signed-off-by: Daniil Kulko <kulkodaniil@gmail.com>
1 parent f5f6df8 commit a6ca621


jenkins/L0_Test.groovy

Lines changed: 97 additions & 28 deletions
@@ -933,11 +933,16 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh"
 def testListPathNode = "${jobWorkspace}/${testList}.txt"
 def waivesListPathNode = "${jobWorkspace}/waives.txt"
-def outputPath = "${jobWorkspace}/job-output.log"
+def sbatchLogPath = "${jobWorkspace}/job-output.log"
 def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
 def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
-def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
-def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
+def scriptSubmitPathLocal = Utils.createTempLocation(pipeline, "./slurm_submit.sh")
+def scriptSubmitPathNode = "${jobWorkspace}/${jobUID}-slurm_submit.sh"
+def scriptTrackPathLocal = Utils.createTempLocation(pipeline, "./slurm_track.sh")
+def scriptTrackPathNode = "${jobWorkspace}/${jobUID}-slurm_track.sh"
+def scriptStatusPathLocal = Utils.createTempLocation(pipeline, "./slurm_status.sh")
+def scriptStatusPathNode = "${jobWorkspace}/${jobUID}-slurm_status.sh"
+def isAarch64 = config.contains("aarch64")
 def coverageConfigFile = "${jobWorkspace}/.coveragerc"
 
 stage("[${stageName}] Initializing Test") {
@@ -1133,10 +1138,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 "export ${varName}=\"${escapedValue}\""
 }.join('\n')
 
-// Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
 def scriptLaunchPrefix = """#!/bin/bash
 #SBATCH ${exemptionComment}
-#SBATCH --output=${outputPath}
+#SBATCH --output=${sbatchLogPath}
 ${taskArgs.collect { "#SBATCH $it" }.join('\n')}
 #SBATCH ${partition.additionalArgs}
 ${partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"}
@@ -1214,9 +1218,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 scriptLaunchPathNode,
 true
 )
-
-def scriptExec = """#!/bin/bash
-set -xEeuo pipefail
+def scriptSubmit = """#!/bin/bash
+set -Eeuo pipefail
 trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
 
 # Clean up previous job intermediate files so that retry can work
@@ -1231,21 +1234,60 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 rm -rf "${jobWorkspace}/results.xml"
 rm -rf "${jobWorkspace}/report.csv"
 rm -rf "${jobWorkspace}/unfinished_test.txt"
-rm -rf "${outputPath}"
+rm -rf "${sbatchLogPath}"
 
-touch "${outputPath}"
+touch ${sbatchLogPath}
 jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
 if [ -z "\$jobId" ]; then
 echo "Error: Slurm job submission failed, no job ID returned."
 exit 1
 fi
 echo "Submitted Slurm job \$jobId"
-echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt"
-tail -f ${outputPath} &
+# save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
+echo \$jobId > $jobWorkspace/slurm_job_id.txt
+""".replaceAll("(?m)^\\s*", "").trim()
+pipeline.writeFile(file: scriptSubmitPathLocal, text: scriptSubmit)
+Utils.copyFileToRemoteHost(
+pipeline,
+remote,
+scriptSubmitPathLocal,
+scriptSubmitPathNode,
+true
+)
+}
+stage("[${stageName}] Run Pytest") {
+// Submit the sbatch job
+Utils.exec(
+pipeline,
+timeout: false,
+script: Utils.sshUserCmd(
+remote,
+scriptSubmitPathNode
+),
+numRetries: 3
+)
+def sbatchJobId = Utils.exec(
+pipeline,
+returnStdout: true,
+script: Utils.sshUserCmd(
+remote,
+"cat $jobWorkspace/slurm_job_id.txt"
+)
+).trim()
+def scriptTrack = """#!/bin/bash
+jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
+tail -f ${sbatchLogPath} &
 tailPid=\$!
 # Wait until sbatch job is done.
-while squeue -j \$jobId -o %T >/dev/null 2>&1; do
-sleep 300
+while true; do
+state=\$(sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}')
+if [[ -z \$state || \$state == "RUNNING" || \$state == "PENDING" || \$state == "CONFIGURING" ]]; then
+echo "job is still running"
+sleep 300
+else
+echo "Job \$jobId finished with state: \$state"
+break
+fi
 done
 # Kill tail -f process
 kill \$tailPid
@@ -1282,28 +1324,55 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 exit 1
 fi
 """.replaceAll("(?m)^\\s*", "").trim()
-pipeline.writeFile(file: scriptExecPathLocal, text: scriptExec)
-Utils.exec(pipeline, script: "echo \"Script to trigger Slurm submission job: \" && cat ${scriptExecPathLocal}")
+pipeline.writeFile(file: scriptTrackPathLocal, text: scriptTrack)
 Utils.copyFileToRemoteHost(
 pipeline,
 remote,
-scriptExecPathLocal,
-scriptExecPathNode,
+scriptTrackPathLocal,
+scriptTrackPathNode,
 true
 )
-}
-stage("[${stageName}] Run Pytest") {
-Utils.exec(
+def scriptStatus = """#!/bin/bash
+jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
+sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}'
+"""
+pipeline.writeFile(file: scriptStatusPathLocal, text: scriptStatus)
+Utils.copyFileToRemoteHost(
 pipeline,
-timeout: false,
-script: Utils.sshUserCmd(
-remote,
-"\"${scriptExecPathNode}\""
-),
-numRetries: 3
+remote,
+scriptStatusPathLocal,
+scriptStatusPathNode,
+true
 )
-}
 
+sh "cat $scriptStatusPathLocal"
+while (true) {
+// Check if the job is done by running sacct via SSH
+def result = Utils.exec(
+pipeline,
+returnStdout: true,
+script: Utils.sshUserCmd(
+remote,
+scriptStatusPathNode
+)
+).trim()
+if (!result || result == "RUNNING" || result == "PENDING" || result == "CONFIGURING") {
+echo "Slurm job $sbatchJobId is still running, pulling the job log."
+// Pulling the sbatch output log
+Utils.exec(
+pipeline,
+timeout: false,
+script: Utils.sshUserCmd(
+remote,
+scriptTrackPathNode
+)
+)
+} else {
+echo "Slurm job $sbatchJobId is done."
+break
+}
+}
+}
 echo "Finished test stage execution."
 }
 } finally {
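
In short, the change splits the old slurm_exec.sh into three scripts (slurm_submit.sh, slurm_track.sh, slurm_status.sh) and moves the wait loop onto the Jenkins controller, so a dropped SSH session only interrupts one status or track call instead of failing the whole stage. Below is a minimal standalone sketch of that control flow in plain bash; it is not code from the commit: REMOTE, WORKSPACE, and the bare ssh calls are placeholders standing in for the pipeline's remote configuration and its Utils.exec / Utils.sshUserCmd helpers, and the script names omit the ${jobUID} prefix used in L0_Test.groovy.

#!/bin/bash
# Hedged sketch of the submit / poll-status / re-attach-log flow.
# REMOTE and WORKSPACE are placeholders, not values from the commit.
set -Eeuo pipefail

REMOTE="user@slurm-login-node"      # placeholder for the pipeline's `remote`
WORKSPACE="/path/to/job-workspace"  # placeholder for ${jobWorkspace}

# 1) Submit once; slurm_submit.sh runs sbatch and stores the job ID in
#    $WORKSPACE/slurm_job_id.txt for the later steps.
ssh "$REMOTE" "bash $WORKSPACE/slurm_submit.sh"

# 2) Poll the job state and stream the log until the job leaves the
#    RUNNING/PENDING/CONFIGURING states. If an ssh session drops, the next
#    iteration simply re-runs slurm_track.sh, which re-attaches `tail -f`
#    to the same sbatch log, so streaming resumes instead of the stage failing.
while true; do
    state=$(ssh "$REMOTE" "bash $WORKSPACE/slurm_status.sh" || true)
    if [[ -z $state || $state == "RUNNING" || $state == "PENDING" || $state == "CONFIGURING" ]]; then
        ssh "$REMOTE" "bash $WORKSPACE/slurm_track.sh" || true   # tolerate dropped connections
    else
        echo "Slurm job finished with state: $state"
        break
    fi
done

Keeping the status check (slurm_status.sh) separate from the log streaming (slurm_track.sh) is what lets the controller-side loop in the Groovy code treat a failed or empty status result as "still running, try again" rather than aborting the stage.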
