Skip to content

Commit 5472822

Browse files
committed
Address feedback
Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com>
1 parent e566439 commit 5472822

File tree

1 file changed

+14
-22
lines changed

1 file changed

+14
-22
lines changed

jenkins/L0_Test.groovy

Lines changed: 14 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -1138,9 +1138,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11381138
"export ${varName}=\"${escapedValue}\""
11391139
}.join('\n')
11401140

1141-
def scriptContent = """#!/bin/bash
1141+
def scriptLaunchPrefix = """#!/bin/bash
11421142
#SBATCH ${exemptionComment}
1143-
#SBATCH --output=${outputPath}
1143+
#SBATCH --output=${sbatchLogPath}
11441144
${taskArgs.collect { "#SBATCH $it" }.join('\n')}
11451145
#SBATCH ${partition.additionalArgs}
11461146
${partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"}
@@ -1266,14 +1266,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
12661266
),
12671267
numRetries: 3
12681268
)
1269+
def sbatchJobId = Utils.exec(
1270+
pipeline,
1271+
returnStdout: true,
1272+
script: Utils.sshUserCmd(
1273+
remote,
1274+
"cat $jobWorkspace/slurm_job_id.txt"
1275+
)
1276+
).trim()
12691277
def scriptTrack = """#!/bin/bash
12701278
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
12711279
tail -f ${sbatchLogPath} &
12721280
tailPid=\$!
12731281
# Wait until sbatch job is done.
12741282
while true; do
12751283
state=\$(sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}')
1276-
if [[ -z \$state || \$state == "RUNNING" || \$state == "PENDING" ]]; then
1284+
if [[ -z \$state || \$state == "RUNNING" || \$state == "PENDING" || \$state == "CONFIGURING" ]]; then
12771285
echo "job is still running"
12781286
sleep 300
12791287
else
@@ -1337,22 +1345,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13371345
true
13381346
)
13391347

1340-
if (perfSanityMode) {
1341-
stage("[${stageName}] Check perf result") {
1342-
def perfCheckResult = Utils.exec(
1343-
pipeline,
1344-
script: Utils.sshUserCmd(
1345-
remote,
1346-
"python3 ${perfCheckScriptNode} ${jobWorkspace}"
1347-
),
1348-
returnStatus: true
1349-
)
1350-
if (perfCheckResult != 0) {
1351-
error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
1352-
}
1353-
}
1354-
}
1355-
13561348
sh "cat $scriptStatusPathLocal"
13571349
while (true) {
13581350
// Check if the job is done by running sacct via SSH
@@ -1364,8 +1356,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13641356
scriptStatusPathNode
13651357
)
13661358
).trim()
1367-
if (!result || result == "RUNNING" || result == "PENDING") {
1368-
echo "Job is still running, pulling the job log."
1359+
if (!result || result == "RUNNING" || result == "PENDING" || result == "CONFIGURING") {
1360+
echo "Slurm job $sbatchJobId is still running, pulling the job log."
13691361
// Pulling the sbatch output log
13701362
Utils.exec(
13711363
pipeline,
@@ -1376,7 +1368,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13761368
)
13771369
)
13781370
} else {
1379-
echo "Job is done."
1371+
echo "Slurm job $sbatchJobId is done."
13801372
break
13811373
}
13821374
}

0 commit comments

Comments
 (0)