Skip to content

Commit 01d64d3

Browse files
committed
fix more nitpicks
1 parent ca09ece commit 01d64d3

File tree

3 files changed

+36
-12
lines changed

3 files changed

+36
-12
lines changed

.github/scripts/monitor_slurm_job.sh

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -119,15 +119,28 @@ exit_code=""
119119
# Try scontrol first (works for recent jobs)
120120
scontrol_output=$(scontrol show job "$job_id" 2>/dev/null || echo "")
121121
if [ -n "$scontrol_output" ]; then
122-
exit_code=$(echo "$scontrol_output" | grep -oE 'ExitCode=[0-9]+:[0-9]+' | cut -d= -f2 || echo "")
122+
# Prefer the scontrol record (blank-line-delimited block) whose JobId matches the parent job, then extract ExitCode from it
123+
parent_block=$(echo "$scontrol_output" | awk -v id="$job_id" '
124+
BEGIN{RS="\n\n"; ORS="\n\n"}
125+
$0 ~ "JobId="id"[^0-9]" || $0 ~ "JobId="id"$" {print; exit}
126+
')
127+
if [ -n "$parent_block" ]; then
128+
exit_code=$(echo "$parent_block" | grep -oE 'ExitCode=[0-9]+:[0-9]+' | head -n1 | cut -d= -f2 || echo "")
129+
else
130+
# Fallback: take first ExitCode found
131+
exit_code=$(echo "$scontrol_output" | grep -oE 'ExitCode=[0-9]+:[0-9]+' | head -n1 | cut -d= -f2 || echo "")
132+
fi
123133
fi
124134

125135
# If scontrol failed or returned invalid job, try sacct (for completed/aged-out jobs)
126136
if [ -z "$exit_code" ]; then
127137
echo "Warning: scontrol failed to get exit code, trying sacct..."
128-
sacct_output=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 || echo "")
129-
if [ -n "$sacct_output" ]; then
130-
exit_code="$sacct_output"
138+
# Take the first sacct row whose State matches one of the terminal states below; rows in any other state are skipped
139+
sacct_line=$(sacct -j "$job_id" --format=JobID,State,ExitCode --noheader --parsable2 2>/dev/null | awk -F'|' '
140+
$2 ~ /COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY/ {print; exit}
141+
')
142+
if [ -n "$sacct_line" ]; then
143+
exit_code=$(echo "$sacct_line" | awk -F'|' '{print $3}' | head -n1)
131144
fi
132145
fi
133146

.github/workflows/bench.yml

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -115,18 +115,28 @@ jobs:
115115
116116
# Submit job
117117
submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \
118-
.github/workflows/$cluster/bench.sh $device $interface 2>&1)
118+
.github/workflows/$cluster/bench.sh "$device" "$interface" 2>&1)
119119
120+
# Extract job ID using standard sbatch output pattern
120121
job_id=$(echo "$submit_output" | sed -n 's/.*Submitted batch job \([0-9][0-9]*\).*/\1/p')
121-
job_slug="bench-$device-$interface"
122-
output_file="${job_slug}.out"
123122
124-
if [ -z "$job_id" ]; then
125-
echo "ERROR: Failed to submit job"
126-
echo "$submit_output"
127-
return 1
123+
# Validate that we got a numeric job ID
124+
if ! [[ "$job_id" =~ ^[0-9]+$ ]]; then
125+
# Fallback: try extracting last number from output (handles non-standard sbatch messages)
126+
job_id=$(echo "$submit_output" | grep -Eo '[0-9]+' | tail -n1)
127+
128+
# Final validation
129+
if ! [[ "$job_id" =~ ^[0-9]+$ ]]; then
130+
echo "ERROR: Failed to extract valid job ID from submit output:"
131+
echo "$submit_output"
132+
return 1
133+
fi
134+
echo "Warning: Used fallback method to extract job ID: $job_id"
128135
fi
129136
137+
job_slug="bench-$device-$interface"
138+
output_file="${job_slug}.out"
139+
130140
# Use the monitoring script from PR checkout
131141
bash "$MONITOR_SCRIPT" "$job_id" "$output_file"
132142
}

toolchain/mfc/test/test.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -410,14 +410,15 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
410410

411411
except subprocess.TimeoutExpired as exc:
412412
# Subprocess timeout - the process was actually killed
413+
timeout_hours = TEST_TIMEOUT_SECONDS / 3600
413414
if os.path.exists(current_log_path):
414415
log_msg = f"Check the log at: {current_log_path}"
415416
else:
416417
log_msg = (
417418
f"Log file ({current_log_path}) may not exist if the timeout occurred early."
418419
)
419420
raise MFCException(
420-
f"Test {case} exceeded 1 hour timeout (process was killed). "
421+
f"Test {case} exceeded {timeout_hours:.1f} hour timeout (process was killed). "
421422
f"This may indicate a hung simulation or misconfigured case. "
422423
f"{log_msg}"
423424
) from exc

0 commit comments

Comments
 (0)