Skip to content

Commit bf93595

Browse files
committed
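minor fix: correct Slurm job-status polling, rename the sbatch log path variable, and prefix remote Slurm scripts with the job UID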
Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com>
1 parent af98b12 commit bf93595

File tree

1 file changed

+19
-22
lines changed

1 file changed

+19
-22
lines changed

jenkins/L0_Test.groovy

Lines changed: 19 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -933,15 +933,15 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
933933
def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh"
934934
def testListPathNode = "${jobWorkspace}/${testList}.txt"
935935
def waivesListPathNode = "${jobWorkspace}/waives.txt"
936-
def outputPath = "${jobWorkspace}/job-output.log"
936+
def sbatchLogPath = "${jobWorkspace}/job-output.log"
937937
def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
938-
def scriptLaunchPathNode = "${jobWorkspace}/slurm_launch.sh"
938+
def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
939939
def scriptSubmitPathLocal = Utils.createTempLocation(pipeline, "./slurm_submit.sh")
940-
def scriptSubmitPathNode = "${jobWorkspace}/slurm_submit.sh"
940+
def scriptSubmitPathNode = "${jobWorkspace}/${jobUID}-slurm_submit.sh"
941941
def scriptTrackPathLocal = Utils.createTempLocation(pipeline, "./slurm_track.sh")
942-
def scriptTrackPathNode = "${jobWorkspace}/slurm_track.sh"
942+
def scriptTrackPathNode = "${jobWorkspace}/${jobUID}-slurm_track.sh"
943943
def scriptStatusPathLocal = Utils.createTempLocation(pipeline, "./slurm_status.sh")
944-
def scriptStatusPathNode = "${jobWorkspace}/slurm_status.sh"
944+
def scriptStatusPathNode = "${jobWorkspace}/${jobUID}-slurm_status.sh"
945945
def isAarch64 = config.contains("aarch64")
946946
def coverageConfigFile = "${jobWorkspace}/.coveragerc"
947947
def perfCheckScriptLocal = "${llmSrcLocal}/tests/integration/defs/perf/perf_regression_check.py"
@@ -1150,8 +1150,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11501150
"export ${varName}=\"${escapedValue}\""
11511151
}.join('\n')
11521152

1153-
// Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
1154-
def scriptLaunchPrefix = """#!/bin/bash
1153+
def scriptContent = """#!/bin/bash
11551154
#SBATCH ${exemptionComment}
11561155
#SBATCH --output=${outputPath}
11571156
${taskArgs.collect { "#SBATCH $it" }.join('\n')}
@@ -1247,9 +1246,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
12471246
rm -rf "${jobWorkspace}/results.xml"
12481247
rm -rf "${jobWorkspace}/report.csv"
12491248
rm -rf "${jobWorkspace}/unfinished_test.txt"
1250-
rm -rf "${outputPath}"
1249+
rm -rf "${sbatchLogPath}"
12511250
1252-
touch "${outputPath}"
1251+
touch ${sbatchLogPath}
12531252
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
12541253
if [ -z "\$jobId" ]; then
12551254
echo "Error: Slurm job submission failed, no job ID returned."
@@ -1281,14 +1280,13 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
12811280
)
12821281
def scriptTrack = """#!/bin/bash
12831282
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
1284-
tail -f $outputPath &
1283+
tail -f ${sbatchLogPath} &
12851284
tailPid=\$!
12861285
# Wait until sbatch job is done.
12871286
while true; do
1288-
state=\$(sacct -j \$jobId --format=JobIDRaw,State --noheader | \
1289-
awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}')
1290-
if [[ -z "\$state" || "\$state" == "RUNNING" || \
1291-
"\$state" == "PENDING"]]; then
1287+
state=\$(sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}')
1288+
if [[ -z \$state || \$state == "RUNNING" || \$state == "PENDING" ]]; then
1289+
echo "job is still running"
12921290
sleep 300
12931291
else
12941292
echo "Job \$jobId finished with state: \$state"
@@ -1340,8 +1338,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13401338
)
13411339
def scriptStatus = """#!/bin/bash
13421340
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
1343-
sacct -j \$jobId --format=JobIDRaw,State --noheader |\
1344-
awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}'
1341+
sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}'
13451342
"""
13461343
pipeline.writeFile(file: scriptStatusPathLocal, text: scriptStatus)
13471344
Utils.copyFileToRemoteHost(
@@ -1378,13 +1375,10 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13781375
remote,
13791376
scriptStatusPathNode
13801377
)
1381-
)
1382-
println(result)
1383-
if (result == "") {
1384-
echo "Job is done."
1385-
break
1386-
} else {
1378+
).trim()
1379+
if (!result || result == "RUNNING" || result == "PENDING") {
13871380
echo "Job is still running, pulling the job log."
1381+
// Pulling the sbatch output log
13881382
Utils.exec(
13891383
pipeline,
13901384
timeout: false,
@@ -1393,6 +1387,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13931387
scriptTrackPathNode
13941388
)
13951389
)
1390+
} else {
1391+
echo "Job is done."
1392+
break
13961393
}
13971394
}
13981395
}

0 commit comments

Comments
 (0)