Skip to content

Commit f468fe5

Browse files
committed
fixup
1 parent 87fcd74 commit f468fe5

File tree

3 files changed

+130
-77
lines changed

3 files changed

+130
-77
lines changed
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
#!/bin/bash
2+
# Run PR and master benchmarks in parallel and verify outputs
3+
# Usage: run_parallel_benchmarks.sh <device> <interface> <cluster>
4+
5+
set -euo pipefail
6+
7+
if [ $# -ne 3 ]; then
8+
echo "Usage: $0 <device> <interface> <cluster>"
9+
exit 1
10+
fi
11+
12+
device="$1"
13+
interface="$2"
14+
cluster="$3"
15+
16+
echo "=========================================="
17+
echo "Starting parallel benchmark jobs..."
18+
echo "=========================================="
19+
20+
# Run both jobs with monitoring using dedicated script
21+
(bash .github/scripts/submit_and_monitor_bench.sh pr "$device" "$interface" "$cluster") &
22+
pr_pid=$!
23+
echo "PR job started in background (PID: $pr_pid)"
24+
25+
(bash .github/scripts/submit_and_monitor_bench.sh master "$device" "$interface" "$cluster") &
26+
master_pid=$!
27+
echo "Master job started in background (PID: $master_pid)"
28+
29+
echo "Waiting for both jobs to complete..."
30+
31+
# Wait and capture exit codes reliably
32+
pr_exit=0
33+
master_exit=0
34+
35+
wait "$pr_pid" || pr_exit=$?  # record the job's real status; inside `if ! wait ...; then`, $? is the negated status and is always 0
36+
if [ "$pr_exit" -ne 0 ]; then
37+
echo "PR job exited with code: $pr_exit"
38+
else
39+
echo "PR job completed successfully"
40+
fi
41+
42+
wait "$master_pid" || master_exit=$?  # record the job's real status; inside `if ! wait ...; then`, $? is the negated status and is always 0
43+
if [ "$master_exit" -ne 0 ]; then
44+
echo "Master job exited with code: $master_exit"
45+
else
46+
echo "Master job completed successfully"
47+
fi
48+
49+
# Check if either job failed
50+
if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
51+
echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}"
52+
exit 1
53+
fi
54+
55+
echo "=========================================="
56+
echo "Both benchmark jobs completed successfully!"
57+
echo "=========================================="
58+
59+
# Final verification that output files exist before proceeding
60+
pr_yaml="pr/bench-${device}-${interface}.yaml"
61+
master_yaml="master/bench-${device}-${interface}.yaml"
62+
63+
if [ ! -f "$pr_yaml" ]; then
64+
echo "ERROR: PR benchmark output not found: $pr_yaml"
65+
ls -la pr/ || true
66+
exit 1
67+
fi
68+
69+
if [ ! -f "$master_yaml" ]; then
70+
echo "ERROR: Master benchmark output not found: $master_yaml"
71+
ls -la master/ || true
72+
exit 1
73+
fi
74+
75+
echo "Verified both YAML files exist:"
76+
echo " - $pr_yaml"
77+
echo " - $master_yaml"
78+
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
#!/bin/bash
2+
# Submit and monitor a benchmark job on a SLURM cluster
3+
# Usage: submit_and_monitor_bench.sh <dir> <device> <interface> <cluster>
4+
5+
set -euo pipefail
6+
7+
if [ $# -ne 4 ]; then
8+
echo "Usage: $0 <dir> <device> <interface> <cluster>"
9+
exit 1
10+
fi
11+
12+
dir="$1"
13+
device="$2"
14+
interface="$3"
15+
cluster="$4"
16+
17+
echo "[$dir] Submitting benchmark for $device-$interface on $cluster..."
18+
cd "$dir"
19+
20+
# Submit job
21+
# Record the submit script's exit status instead of letting `set -e` abort on
# the assignment: its stdout/stderr are captured in submit_output, so an
# immediate exit would discard the only diagnostics.
submit_status=0
submit_output=$(bash ".github/workflows/$cluster/submit-bench.sh" \
22+
".github/workflows/$cluster/bench.sh" "$device" "$interface" 2>&1) || submit_status=$?
23+
24+
job_id=$(echo "$submit_output" | sed -n 's/.*Submitted batch job \([0-9][0-9]*\).*/\1/p')
25+
job_slug="bench-$device-$interface"
26+
output_file="${job_slug}.out"
27+
28+
# Treat a non-zero submit status or an unparseable job ID as a failed submission.
if [ "$submit_status" -ne 0 ] || [ -z "$job_id" ]; then
29+
echo "[$dir] ERROR: Failed to submit job"
30+
echo "$submit_output"
31+
exit 1
32+
fi
33+
34+
echo "[$dir] Job ID: $job_id, monitoring output file: $output_file"
35+
36+
# Use the monitoring script
37+
bash .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file"
38+
39+
echo "[$dir] Monitoring complete for job $job_id"
40+
41+
# Verify the YAML output file was created
42+
yaml_file="${job_slug}.yaml"
43+
if [ ! -f "$yaml_file" ]; then
44+
echo "[$dir] ERROR: Expected output file not found: $yaml_file"
45+
echo "[$dir] Directory contents:"
46+
ls -la *.yaml 2>/dev/null || echo " No YAML files found"
47+
exit 1
48+
fi
49+
50+
echo "[$dir] Verified output file exists: $yaml_file ($(stat -f%z "$yaml_file" 2>/dev/null || stat -c%s "$yaml_file" 2>/dev/null) bytes)"
51+

.github/workflows/bench.yml

Lines changed: 1 addition & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -98,83 +98,7 @@ jobs:
9898
wait %1 && wait %2
9999
100100
- name: Bench (Master v. PR)
101-
run: |
102-
set -e
103-
104-
# Function to submit and monitor using extracted script
105-
submit_and_monitor() {
106-
local dir=$1
107-
local device=$2
108-
local interface=$3
109-
local cluster=$4
110-
111-
echo "[$dir] Submitting benchmark for $device-$interface on $cluster..."
112-
cd "$dir"
113-
114-
# Submit job
115-
submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \
116-
.github/workflows/$cluster/bench.sh $device $interface 2>&1)
117-
118-
job_id=$(echo "$submit_output" | sed -n 's/.*Submitted batch job \([0-9][0-9]*\).*/\1/p')
119-
job_slug="bench-$device-$interface"
120-
output_file="${job_slug}.out"
121-
122-
if [ -z "$job_id" ]; then
123-
echo "[$dir] ERROR: Failed to submit job"
124-
echo "$submit_output"
125-
return 1
126-
fi
127-
128-
echo "[$dir] Job ID: $job_id, monitoring output file: $output_file"
129-
130-
# Use the monitoring script
131-
bash .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file"
132-
133-
echo "[$dir] Monitoring complete for job $job_id"
134-
}
135-
136-
# Run both jobs with monitoring
137-
echo "=========================================="
138-
echo "Starting parallel benchmark jobs..."
139-
echo "=========================================="
140-
141-
(submit_and_monitor pr ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) &
142-
pr_pid=$!
143-
echo "PR job started in background (PID: $pr_pid)"
144-
145-
(submit_and_monitor master ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}) &
146-
master_pid=$!
147-
echo "Master job started in background (PID: $master_pid)"
148-
149-
echo "Waiting for both jobs to complete..."
150-
151-
# Wait and capture exit codes reliably
152-
pr_exit=0
153-
master_exit=0
154-
155-
if ! wait "$pr_pid"; then
156-
pr_exit=$?
157-
echo "PR job exited with code: $pr_exit"
158-
else
159-
echo "PR job completed successfully"
160-
fi
161-
162-
if ! wait "$master_pid"; then
163-
master_exit=$?
164-
echo "Master job exited with code: $master_exit"
165-
else
166-
echo "Master job completed successfully"
167-
fi
168-
169-
# Explicitly check and quote to avoid test errors
170-
if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
171-
echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}"
172-
exit 1
173-
fi
174-
175-
echo "=========================================="
176-
echo "Both benchmark jobs completed successfully!"
177-
echo "=========================================="
101+
run: bash .github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
178102

179103
- name: Generate & Post Comment
180104
run: |

0 commit comments

Comments
 (0)