
Commit e56d6dc

Change unknown job device (#978)
1 parent 8173368 commit e56d6dc

4 files changed: +87 −177 lines changed

.github/workflows/phoenix/bench.sh

Lines changed: 2 additions & 3 deletions
@@ -2,8 +2,7 @@
 
 n_ranks=12
 
-echo "My benchmarking device is:" $device
-if [ "$device" = "gpu" ]; then
+if [ "$job_device" = "gpu" ]; then
     n_ranks=$(nvidia-smi -L | wc -l)        # number of GPUs on node
     gpu_ids=$(seq -s ' ' 0 $(($n_ranks-1))) # 0,1,2,...,gpu_count-1
     device_opts="--gpu -g $gpu_ids"
@@ -16,7 +15,7 @@ mkdir -p $currentdir
 
 export TMPDIR=$currentdir
 
-if [ "$device" = "gpu" ]; then
+if [ "$job_device" = "gpu" ]; then
     ./mfc.sh bench --mem 12 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
 else
     ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
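
Side note on the GPU branch above: the rank count and GPU id list are derived from nvidia-smi at run time. A minimal sketch of what those lines evaluate to on a node with two GPUs (the count of 2 is only an assumed example):

# Assume `nvidia-smi -L` lists exactly two GPUs on the node.
n_ranks=$(nvidia-smi -L | wc -l)          # -> 2
gpu_ids=$(seq -s ' ' 0 $(($n_ranks-1)))   # -> "0 1"
device_opts="--gpu -g $gpu_ids"           # -> "--gpu -g 0 1"
echo "$device_opts"
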
Lines changed: 39 additions & 82 deletions
@@ -1,107 +1,64 @@
-#!/usr/bin/env bash
-set -euo pipefail
+#!/bin/bash
+
+set -e
 
 usage() {
     echo "Usage: $0 [script.sh] [cpu|gpu]"
-    exit 1
 }
 
-[[ $# -eq 2 ]] || usage
-
-sbatch_script="$1"
-
-device="$2"
-job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
-
-# read the body of the user script
-sbatch_body=$(<"$sbatch_script")
-
-# common SBATCH directives
-sbatch_common_opts="\
-#SBATCH -J shb-${sbatch_script%%.sh}-$device   # job name
-#SBATCH --account=gts-sbryngelson3             # account
-#SBATCH -N1                                    # nodes
-#SBATCH -t 02:00:00                            # walltime
-#SBATCH -q embers                              # QOS
-#SBATCH -o $job_slug.out                       # stdout+stderr
-#SBATCH --mem-per-cpu=2G                       # default mem (overridden below)
-"
+if [ ! -z "$1" ]; then
+    sbatch_script_contents=`cat $1`
+else
+    usage
+    exit 1
+fi
 
-# CPU vs GPU overrides
-if [[ "$device" == "cpu" ]]; then
-    sbatch_device_opts="\
+sbatch_cpu_opts="\
 #SBATCH -p cpu-small                    # partition
 #SBATCH --ntasks-per-node=24            # Number of cores per node required
 #SBATCH --mem-per-cpu=2G                # Memory per core\
 "
-elif [[ "$device" == "gpu" ]]; then
-    sbatch_device_opts="\
+
+sbatch_gpu_opts="\
 #SBATCH -CL40S
 #SBATCH --ntasks-per-node=4             # Number of cores per node required
 #SBATCH -G2\
 "
+
+if [ "$2" = "cpu" ]; then
+    sbatch_device_opts="$sbatch_cpu_opts"
+elif [ "$2" = "gpu" ]; then
+    sbatch_device_opts="$sbatch_gpu_opts"
 else
-    usage
+    usage
+    exit 1
 fi
 
-# submit and capture the JobID
-JOBID=$(sbatch <<-EOT | awk '{print $4}'
-#!/usr/bin/env bash
-${sbatch_common_opts}
-${sbatch_device_opts}
-
-export job_slug="${job_slug}"
-export device="${device}"
-
-echo "Job slug is:" $job_slug
-echo "Device is:" $device
-
-set -e -x
-
-cd "\$SLURM_SUBMIT_DIR"
-echo "Running in \$(pwd):"
+job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
 
-# load your modules & env
-. ./mfc.sh load -c p -m $device
+sbatch <<EOT
+#!/bin/bash
+#SBATCH -Jshb-$job_slug                        # Job name
+#SBATCH --account=gts-sbryngelson3             # charge account
+#SBATCH -N1                                    # Number of nodes required
+$sbatch_device_opts
+#SBATCH -t 02:00:00                            # Duration of the job (Ex: 15 mins)
+#SBATCH -q embers                              # QOS Name
+#SBATCH -o$job_slug.out                        # Combined output and error messages file
+#SBATCH -W                                     # Do not exit until the submitted job terminates.
 
-# user script contents
-${sbatch_body}
-EOT
-)
+set -e
+set -x
 
-echo "Submitted: SLURM job $JOBID"
+cd "\$SLURM_SUBMIT_DIR"
+echo "Running in $(pwd):"
 
-# if this wrapper is killed/canceled, make sure SLURM job is cleaned up
-trap '[[ -n "${JOBID:-}" ]] && scancel "$JOBID" >/dev/null 2>&1 || :' EXIT
+job_slug="$job_slug"
+job_device="$2"
 
-# ────────── Poll until SLURM job finishes ──────────
-while :; do
-    # Try sacct first
-    STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1)
+. ./mfc.sh load -c p -m $2
 
-    # Fallback to squeue if sacct is empty
-    if [[ -z "$STATE" ]]; then
-        STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "")
-    fi
+$sbatch_script_contents
 
-    # If it’s one of SLURM’s terminal states, break immediately
-    case "$STATE" in
-        COMPLETED|FAILED|CANCELLED|TIMEOUT|PREEMPTED)
-            echo "Completed: SLURM job $JOBID reached terminal state: $STATE"
-            break
-            ;;
-        "")
-            echo "Completed: SLURM job $JOBID no longer in queue; assuming finished"
-            break
-            ;;
-        *)
-            echo "Waiting: SLURM job $JOBID state: $STATE"
-            sleep 10
-            ;;
-    esac
-done
+EOT
 
-# Now retrieve the exit code and exit with it
-EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
-echo "Completed: SLURM job $JOBID exit code: $EXIT_CODE"
-exit "$EXIT_CODE"
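
A note on the heredoc in the new wrapper: because the delimiter EOT is unquoted, $job_slug, $sbatch_device_opts, and $2 are expanded by the wrapper at submission time, while \$SLURM_SUBMIT_DIR is escaped so it expands inside the batch job itself. A self-contained sketch of that behavior (the variable values are illustrative, not part of the commit):

#!/bin/bash
# Toy demonstration of unquoted-heredoc expansion; no SLURM involved.
job_slug="demo-gpu"
cat <<EOT
expanded at submission time: $job_slug
expanded inside the job:     \$SLURM_SUBMIT_DIR
EOT
# Prints:
#   expanded at submission time: demo-gpu
#   expanded inside the job:     $SLURM_SUBMIT_DIR
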
Lines changed: 43 additions & 79 deletions
@@ -1,100 +1,64 @@
-#!/usr/bin/env bash
-set -euo pipefail
+#!/bin/bash
+
+set -e
 
 usage() {
     echo "Usage: $0 [script.sh] [cpu|gpu]"
-    exit 1
 }
 
-[[ $# -eq 2 ]] || usage
-
-sbatch_script="$1"
-device="$2"
-
-job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
-
-# read the body of the user script
-sbatch_body=$(<"$sbatch_script")
+if [ ! -z "$1" ]; then
+    sbatch_script_contents=`cat $1`
+else
+    usage
+    exit 1
+fi
 
-# common SBATCH directives
-sbatch_common_opts="\
-#SBATCH -J shb-${sbatch_script%%.sh}-$device   # job name
-#SBATCH --account=gts-sbryngelson3             # account
-#SBATCH -N1                                    # nodes
-#SBATCH -t 03:00:00                            # walltime
-#SBATCH -q embers                              # QOS
-#SBATCH -o $job_slug.out                       # stdout+stderr
-#SBATCH --mem-per-cpu=2G                       # default mem (overridden below)
+sbatch_cpu_opts="\
+#SBATCH -p cpu-small                    # partition
+#SBATCH --ntasks-per-node=24            # Number of cores per node required
+#SBATCH --mem-per-cpu=2G                # Memory per core\
 "
 
-# CPU vs GPU overrides
-if [[ "$device" == "cpu" ]]; then
-    sbatch_device_opts="\
-#SBATCH -p cpu-small
-#SBATCH --ntasks-per-node=24
-"
-elif [[ "$device" == "gpu" ]]; then
-    sbatch_device_opts="\
+sbatch_gpu_opts="\
 #SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
-#SBATCH --ntasks-per-node=4
-#SBATCH -G2
+#SBATCH --ntasks-per-node=4             # Number of cores per node required
+#SBATCH -G2\
 "
+
+if [ "$2" = "cpu" ]; then
+    sbatch_device_opts="$sbatch_cpu_opts"
+elif [ "$2" = "gpu" ]; then
+    sbatch_device_opts="$sbatch_gpu_opts"
 else
-    usage
+    usage
+    exit 1
 fi
 
-# submit and capture the JobID
-JOBID=$(sbatch <<-EOT | awk '{print $4}'
-#!/usr/bin/env bash
-${sbatch_common_opts}
-${sbatch_device_opts}
-
-set -e -x
-
-cd "\$SLURM_SUBMIT_DIR"
-echo "Running in \$(pwd):"
+job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
 
-# load your modules & env
-. ./mfc.sh load -c p -m $device
+sbatch <<EOT
+#!/bin/bash
+#SBATCH -Jshb-$job_slug                        # Job name
+#SBATCH --account=gts-sbryngelson3             # charge account
+#SBATCH -N1                                    # Number of nodes required
+$sbatch_device_opts
+#SBATCH -t 03:00:00                            # Duration of the job (Ex: 15 mins)
+#SBATCH -q embers                              # QOS Name
+#SBATCH -o$job_slug.out                        # Combined output and error messages file
+#SBATCH -W                                     # Do not exit until the submitted job terminates.
 
-# user script contents
-${sbatch_body}
-EOT
-)
+set -e
+set -x
 
-echo "Submitted: SLURM job $JOBID"
+cd "\$SLURM_SUBMIT_DIR"
+echo "Running in $(pwd):"
 
-# if this wrapper is killed/canceled, make sure SLURM job is cleaned up
-trap '[[ -n "${JOBID:-}" ]] && scancel "$JOBID" >/dev/null 2>&1 || :' EXIT
+job_slug="$job_slug"
+job_device="$2"
 
-# ────────── Poll until SLURM job finishes ──────────
-while :; do
-    # Try sacct first
-    STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1)
+. ./mfc.sh load -c p -m $2
 
-    # Fallback to squeue if sacct is empty
-    if [[ -z "$STATE" ]]; then
-        STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "")
-    fi
+$sbatch_script_contents
 
-    # If it’s one of SLURM’s terminal states, break immediately
-    case "$STATE" in
-        COMPLETED|FAILED|CANCELLED|TIMEOUT|PREEMPTED)
-            echo "Completed: SLURM job $JOBID reached terminal state: $STATE"
-            break
-            ;;
-        "")
-            echo "Completed: SLURM job $JOBID no longer in queue; assuming finished"
-            break
-            ;;
-        *)
-            echo "Waiting: SLURM job $JOBID state: $STATE"
-            sleep 10
-            ;;
-    esac
-done
+EOT
 
-# Now retrieve the exit code and exit with it
-EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
-echo "Completed: SLURM job $JOBID exit code: $EXIT_CODE"
-exit "$EXIT_CODE"
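
As in the benchmark wrapper, the old polling loop and sacct exit-code lookup are replaced by the #SBATCH -W (--wait) directive: sbatch blocks until the submitted job terminates and exits with the job's exit status, which the caller can act on directly. A minimal sketch of a caller relying on that behavior (the wrapper filename submit.sh is assumed for illustration):

# Hypothetical caller; "submit.sh" stands in for whichever wrapper the workflow invokes.
set -e                      # abort if sbatch, and therefore the job, exits nonzero
./submit.sh test.sh gpu     # blocks here because of #SBATCH -W
echo "SLURM job completed successfully"
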

.github/workflows/phoenix/test.sh

Lines changed: 3 additions & 13 deletions
@@ -1,19 +1,13 @@
 #!/bin/bash
 
-tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
-currentdir=$tmpbuild/run-$(( RANDOM % 900 ))
-mkdir -p $tmpbuild
-mkdir -p $currentdir
-export TMPDIR=$currentdir
-
-n_test_threads=8
-
 build_opts=""
 if [ "$job_device" = "gpu" ]; then
     build_opts="--gpu"
 fi
 
-./mfc.sh test --dry-run -j $n_test_threads $build_opts
+./mfc.sh test --dry-run -j 8 $build_opts
+
+n_test_threads=8
 
 if [ "$job_device" = "gpu" ]; then
     gpu_count=$(nvidia-smi -L | wc -l) # number of GPUs on node
@@ -24,7 +18,3 @@ fi
 
 ./mfc.sh test --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix
 
-sleep 10
-rm -rf "$currentdir" || true
-
-unset TMPDIR
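
With the scratch TMPDIR setup gone, the only device-dependent piece left before the dry-run build is build_opts. A minimal sketch of that selection in isolation (job_device is hard-coded here purely for illustration; in CI it comes from the job environment):

#!/bin/bash
job_device="gpu"        # illustrative; normally provided by the submitting job
build_opts=""
if [ "$job_device" = "gpu" ]; then
    build_opts="--gpu"
fi
echo ./mfc.sh test --dry-run -j 8 $build_opts   # -> ./mfc.sh test --dry-run -j 8 --gpu
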
