Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 97 additions & 28 deletions jenkins/L0_Test.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -933,11 +933,16 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh"
def testListPathNode = "${jobWorkspace}/${testList}.txt"
def waivesListPathNode = "${jobWorkspace}/waives.txt"
def outputPath = "${jobWorkspace}/job-output.log"
def sbatchLogPath = "${jobWorkspace}/job-output.log"
def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
def scriptSubmitPathLocal = Utils.createTempLocation(pipeline, "./slurm_submit.sh")
def scriptSubmitPathNode = "${jobWorkspace}/${jobUID}-slurm_submit.sh"
def scriptTrackPathLocal = Utils.createTempLocation(pipeline, "./slurm_track.sh")
def scriptTrackPathNode = "${jobWorkspace}/${jobUID}-slurm_track.sh"
def scriptStatusPathLocal = Utils.createTempLocation(pipeline, "./slurm_status.sh")
def scriptStatusPathNode = "${jobWorkspace}/${jobUID}-slurm_status.sh"
def isAarch64 = config.contains("aarch64")
def coverageConfigFile = "${jobWorkspace}/.coveragerc"

stage("[${stageName}] Initializing Test") {
Expand Down Expand Up @@ -1133,10 +1138,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
"export ${varName}=\"${escapedValue}\""
}.join('\n')

// Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
def scriptLaunchPrefix = """#!/bin/bash
#SBATCH ${exemptionComment}
#SBATCH --output=${outputPath}
#SBATCH --output=${sbatchLogPath}
${taskArgs.collect { "#SBATCH $it" }.join('\n')}
#SBATCH ${partition.additionalArgs}
${partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"}
Expand Down Expand Up @@ -1214,9 +1218,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
scriptLaunchPathNode,
true
)

def scriptExec = """#!/bin/bash
set -xEeuo pipefail
def scriptSubmit = """#!/bin/bash
set -Eeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR

# Clean up previous job intermediate files so that retry can work
Expand All @@ -1231,21 +1234,60 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
rm -rf "${jobWorkspace}/results.xml"
rm -rf "${jobWorkspace}/report.csv"
rm -rf "${jobWorkspace}/unfinished_test.txt"
rm -rf "${outputPath}"
rm -rf "${sbatchLogPath}"

touch "${outputPath}"
touch ${sbatchLogPath}
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
if [ -z "\$jobId" ]; then
echo "Error: Slurm job submission failed, no job ID returned."
exit 1
fi
echo "Submitted Slurm job \$jobId"
echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt"
tail -f ${outputPath} &
# save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
echo \$jobId > $jobWorkspace/slurm_job_id.txt
""".replaceAll("(?m)^\\s*", "").trim()
pipeline.writeFile(file: scriptSubmitPathLocal, text: scriptSubmit)
Utils.copyFileToRemoteHost(
pipeline,
remote,
scriptSubmitPathLocal,
scriptSubmitPathNode,
true
)
}
stage("[${stageName}] Run Pytest") {
// Submit the sbatch job
Utils.exec(
pipeline,
timeout: false,
script: Utils.sshUserCmd(
remote,
scriptSubmitPathNode
),
numRetries: 3
)
def sbatchJobId = Utils.exec(
pipeline,
returnStdout: true,
script: Utils.sshUserCmd(
remote,
"cat $jobWorkspace/slurm_job_id.txt"
)
).trim()
def scriptTrack = """#!/bin/bash
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
tail -f ${sbatchLogPath} &
tailPid=\$!
# Wait until sbatch job is done.
while squeue -j \$jobId -o %T >/dev/null 2>&1; do
sleep 300
while true; do
state=\$(sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}')
if [[ -z \$state || \$state == "RUNNING" || \$state == "PENDING" || \$state == "CONFIGURING" ]]; then
echo "job is still running"
sleep 300
else
echo "Job \$jobId finished with state: \$state"
break
fi
done
# Kill tail -f process
kill \$tailPid
Expand Down Expand Up @@ -1282,28 +1324,55 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
exit 1
fi
""".replaceAll("(?m)^\\s*", "").trim()
pipeline.writeFile(file: scriptExecPathLocal, text: scriptExec)
Utils.exec(pipeline, script: "echo \"Script to trigger Slurm submission job: \" && cat ${scriptExecPathLocal}")
pipeline.writeFile(file: scriptTrackPathLocal, text: scriptTrack)
Utils.copyFileToRemoteHost(
pipeline,
remote,
scriptExecPathLocal,
scriptExecPathNode,
scriptTrackPathLocal,
scriptTrackPathNode,
true
)
}
stage("[${stageName}] Run Pytest") {
Utils.exec(
def scriptStatus = """#!/bin/bash
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}'
"""
pipeline.writeFile(file: scriptStatusPathLocal, text: scriptStatus)
Utils.copyFileToRemoteHost(
pipeline,
timeout: false,
script: Utils.sshUserCmd(
remote,
"\"${scriptExecPathNode}\""
),
numRetries: 3
remote,
scriptStatusPathLocal,
scriptStatusPathNode,
true
)
}

sh "cat $scriptStatusPathLocal"
while (true) {
// Check if the job is done by running sacct via SSH
def result = Utils.exec(
pipeline,
returnStdout: true,
script: Utils.sshUserCmd(
remote,
scriptStatusPathNode
)
).trim()
if (!result || result == "RUNNING" || result == "PENDING" || result == "CONFIGURING") {
echo "Slurm job $sbatchJobId is still running, pulling the job log."
// Pulling the sbatch output log
Utils.exec(
pipeline,
timeout: false,
script: Utils.sshUserCmd(
remote,
scriptTrackPathNode
)
)
} else {
echo "Slurm job $sbatchJobId is done."
break
}
}
}
echo "Finished test stage execution."
}
} finally {
Expand Down