Skip to content

Commit ac513a3

Browse files
committed
Merge remote-tracking branch 'upstream/master' into MovingELBubblesSandia
2 parents 679926a + 88c0a11 commit ac513a3

File tree

203 files changed

+11714
-2814
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

203 files changed

+11714
-2814
lines changed

.fortitude.toml

Lines changed: 0 additions & 4 deletions
This file was deleted.

.github/workflows/bench.yml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,7 @@ jobs:
2424

2525
self:
2626
name: "${{ matrix.name }} (${{ matrix.device }})"
27-
if: ${{ github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true' && (
28-
(github.event_name == 'pull_request_review' && github.event.review.state == 'approved') ||
29-
(github.event_name == 'pull_request' && github.event.pull_request.user.login == 'sbryngelson')
30-
) }}
27+
if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba'))) }}
3128
needs: file-changes
3229
strategy:
3330
fail-fast: false

.github/workflows/cleanliness.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ jobs:
4141
- name: Setup Ubuntu
4242
run: |
4343
sudo apt update -y
44-
sudo apt install -y tar wget make cmake gcc g++ python3 python3-dev "openmpi-*" libopenmpi-dev
45-
44+
sudo apt install -y tar wget make cmake gcc g++ python3 python3-dev "openmpi-*" libopenmpi-dev libblas-dev liblapack-dev
45+
4646
- name: Build
4747
run: |
4848
(cd pr && /bin/bash mfc.sh build -j $(nproc) --debug 2> ../pr.txt)

.github/workflows/coverage.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@ jobs:
3030
- name: Setup Ubuntu
3131
run: |
3232
sudo apt update -y
33-
sudo apt install -y tar wget make cmake gcc g++ python3 python3-dev "openmpi-*" libopenmpi-dev
33+
sudo apt install -y tar wget make cmake gcc g++ python3 \
34+
python3-dev "openmpi-*" libopenmpi-dev hdf5-tools \
35+
libfftw3-dev libhdf5-dev libblas-dev liblapack-dev
3436
3537
- name: Build
3638
run: /bin/bash mfc.sh build -j $(nproc) --gcov

.github/workflows/frontier/build.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,6 @@ if [ "$2" == "bench" ]; then
1313
./mfc.sh run "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts
1414
done
1515
else
16-
./mfc.sh test --dry-run -j 8 $build_opts
16+
./mfc.sh test -a --dry-run --rdma-mpi --generate -j 8 $build_opts
1717
fi
18+

.github/workflows/frontier/submit.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,11 @@ job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
2929

3030
sbatch <<EOT
3131
#!/bin/bash
32-
#SBATCH -JMFC-$job_slug # Job name
32+
#SBATCH -J MFC-$job_slug # Job name
3333
#SBATCH -A CFD154 # charge account
3434
#SBATCH -N 1 # Number of nodes required
3535
$sbatch_device_opts
36-
#SBATCH -t 03:59:00 # Duration of the job (Ex: 15 mins)
36+
#SBATCH -t 04:59:00 # Duration of the job (Ex: 15 mins)
3737
#SBATCH -o$job_slug.out # Combined output and error messages file
3838
#SBATCH -p extended # Extended partition for shorter queues
3939
#SBATCH -W # Do not exit until the submitted job terminates.

.github/workflows/frontier/test.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ gpus=`rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n'
44
ngpus=`echo "$gpus" | tr -d '[:space:]' | wc -c`
55

66
if [ "$job_device" = "gpu" ]; then
7-
./mfc.sh test --max-attempts 3 -j $ngpus -- -c frontier
7+
./mfc.sh test -a --rdma-mpi --max-attempts 3 -j $ngpus -- -c frontier
88
else
9-
./mfc.sh test --max-attempts 3 -j 32 -- -c frontier
9+
./mfc.sh test -a --rdma-mpi --max-attempts 3 -j 32 -- -c frontier
1010
fi

.github/workflows/lint-source.yml

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,6 @@ jobs:
2828
- name: Initialize MFC
2929
run: ./mfc.sh init
3030

31-
- name: Lint the full source
32-
run: |
33-
source build/venv/bin/activate
34-
find ./src -type f -not -name '*nvtx*' -exec sh -c 'fortitude check "$1" | grep -v E001' _ {} \;
35-
find ./src -type f -not -name '*nvtx*' -exec sh -c 'fortitude check "$1" | grep -v E001' _ {} \; | wc -l | xargs -I{} sh -c '[ {} -gt 0 ] && exit 1 || exit 0'
36-
3731
- name: Looking for raw directives
3832
run: |
3933
! grep -iR '!\$acc\|!\$omp' --exclude="parallel_macros.fpp" --exclude="syscheck.fpp" ./src/*

.github/workflows/phoenix/bench.sh

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,7 @@
22

33
n_ranks=12
44

5-
echo "My benchmarking device is:" $device
6-
if [ "$device" = "gpu" ]; then
5+
if [ "$job_device" = "gpu" ]; then
76
n_ranks=$(nvidia-smi -L | wc -l) # number of GPUs on node
87
gpu_ids=$(seq -s ' ' 0 $(($n_ranks-1))) # 0,1,2,...,gpu_count-1
98
device_opts="--gpu -g $gpu_ids"
@@ -16,7 +15,7 @@ mkdir -p $currentdir
1615

1716
export TMPDIR=$currentdir
1817

19-
if [ "$device" = "gpu" ]; then
18+
if [ "$job_device" = "gpu" ]; then
2019
./mfc.sh bench --mem 12 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
2120
else
2221
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
Lines changed: 39 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -1,107 +1,64 @@
1-
#!/usr/bin/env bash
2-
set -euo pipefail
1+
#!/bin/bash
2+
3+
set -e
34

45
usage() {
56
echo "Usage: $0 [script.sh] [cpu|gpu]"
6-
exit 1
77
}
88

9-
[[ $# -eq 2 ]] || usage
10-
11-
sbatch_script="$1"
12-
13-
device="$2"
14-
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
15-
16-
# read the body of the user script
17-
sbatch_body=$(<"$sbatch_script")
18-
19-
# common SBATCH directives
20-
sbatch_common_opts="\
21-
#SBATCH -J shb-${sbatch_script%%.sh}-$device # job name
22-
#SBATCH --account=gts-sbryngelson3 # account
23-
#SBATCH -N1 # nodes
24-
#SBATCH -t 02:00:00 # walltime
25-
#SBATCH -q embers # QOS
26-
#SBATCH -o $job_slug.out # stdout+stderr
27-
#SBATCH --mem-per-cpu=2G # default mem (overridden below)
28-
"
9+
if [ ! -z "$1" ]; then
10+
sbatch_script_contents=`cat $1`
11+
else
12+
usage
13+
exit 1
14+
fi
2915

30-
# CPU vs GPU overrides
31-
if [[ "$device" == "cpu" ]]; then
32-
sbatch_device_opts="\
16+
sbatch_cpu_opts="\
3317
#SBATCH -p cpu-small # partition
3418
#SBATCH --ntasks-per-node=24 # Number of cores per node required
3519
#SBATCH --mem-per-cpu=2G # Memory per core\
3620
"
37-
elif [[ "$device" == "gpu" ]]; then
38-
sbatch_device_opts="\
21+
22+
sbatch_gpu_opts="\
3923
#SBATCH -CL40S
4024
#SBATCH --ntasks-per-node=4 # Number of cores per node required
4125
#SBATCH -G2\
4226
"
27+
28+
if [ "$2" = "cpu" ]; then
29+
sbatch_device_opts="$sbatch_cpu_opts"
30+
elif [ "$2" = "gpu" ]; then
31+
sbatch_device_opts="$sbatch_gpu_opts"
4332
else
44-
usage
33+
usage
34+
exit 1
4535
fi
4636

47-
# submit and capture the JobID
48-
JOBID=$(sbatch <<-EOT | awk '{print $4}'
49-
#!/usr/bin/env bash
50-
${sbatch_common_opts}
51-
${sbatch_device_opts}
52-
53-
export job_slug="${job_slug}"
54-
export device="${device}"
55-
56-
echo "Job slug is:" $job_slug
57-
echo "Device is:" $device
58-
59-
set -e -x
60-
61-
cd "\$SLURM_SUBMIT_DIR"
62-
echo "Running in \$(pwd):"
37+
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
6338

64-
# load your modules & env
65-
. ./mfc.sh load -c p -m $device
39+
sbatch <<EOT
40+
#!/bin/bash
41+
#SBATCH -Jshb-$job_slug # Job name
42+
#SBATCH --account=gts-sbryngelson3 # charge account
43+
#SBATCH -N1 # Number of nodes required
44+
$sbatch_device_opts
45+
#SBATCH -t 02:00:00 # Duration of the job (Ex: 15 mins)
46+
#SBATCH -q embers # QOS Name
47+
#SBATCH -o$job_slug.out # Combined output and error messages file
48+
#SBATCH -W # Do not exit until the submitted job terminates.
6649
67-
# user script contents
68-
${sbatch_body}
69-
EOT
70-
)
50+
set -e
51+
set -x
7152
72-
echo "Submitted: SLURM job $JOBID"
53+
cd "\$SLURM_SUBMIT_DIR"
54+
echo "Running in $(pwd):"
7355
74-
# if this wrapper is killed/canceled, make sure SLURM job is cleaned up
75-
trap '[[ -n "${JOBID:-}" ]] && scancel "$JOBID" >/dev/null 2>&1 || :' EXIT
56+
job_slug="$job_slug"
57+
job_device="$2"
7658
77-
# ────────── Poll until SLURM job finishes ──────────
78-
while :; do
79-
# Try sacct first
80-
STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1)
59+
. ./mfc.sh load -c p -m $2
8160
82-
# Fallback to squeue if sacct is empty
83-
if [[ -z "$STATE" ]]; then
84-
STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "")
85-
fi
61+
$sbatch_script_contents
8662
87-
# If it’s one of SLURM’s terminal states, break immediately
88-
case "$STATE" in
89-
COMPLETED|FAILED|CANCELLED|TIMEOUT|PREEMPTED)
90-
echo "Completed: SLURM job $JOBID reached terminal state: $STATE"
91-
break
92-
;;
93-
"")
94-
echo "Completed: SLURM job $JOBID no longer in queue; assuming finished"
95-
break
96-
;;
97-
*)
98-
echo "Waiting: SLURM job $JOBID state: $STATE"
99-
sleep 10
100-
;;
101-
esac
102-
done
63+
EOT
10364

104-
# Now retrieve the exit code and exit with it
105-
EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
106-
echo "Completed: SLURM job $JOBID exit code: $EXIT_CODE"
107-
exit "$EXIT_CODE"

0 commit comments

Comments
 (0)