Skip to content

Commit c7e26f2

Browse files
committed
Support ./mfc.sh test on multiple GPUs
1 parent 4e11468 commit c7e26f2

File tree

6 files changed

+51
-30
lines changed

6 files changed

+51
-30
lines changed

misc/run-phoenix-release-cpu.sh

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
11
#!/bin/bash
2-
#SBATCH -Jshb-test-jobs # Job name
3-
#SBATCH --account=gts-sbryngelson3 # charge account
4-
#SBATCH -N1 --ntasks-per-node=12 # Number of nodes and cores per node required
5-
#SBATCH --mem-per-cpu=2G # Memory per core
6-
#SBATCH -t 04:00:00 # Duration of the job (Ex: 15 mins)
7-
#SBATCH -q embers # QOS Name
8-
#SBATCH -otest.out # Combined output and error messages file
9-
#SBATCH -W # Do not exit until the submitted job terminates.
2+
#SBATCH -Jshb-test-jobs # Job name
3+
#SBATCH --account=gts-sbryngelson3 # charge account
4+
#SBATCH -N1 --ntasks-per-node=12 # Number of nodes and cores per node required
5+
#SBATCH --mem-per-cpu=2G # Memory per core
6+
#SBATCH -t 04:00:00 # Duration of the job (Ex: 15 mins)
7+
#SBATCH -q embers # QOS Name
8+
#SBATCH -otest.out # Combined output and error messages file
9+
#SBATCH -W # Do not exit until the submitted job terminates.
10+
11+
cd "$SLURM_SUBMIT_DIR"
12+
echo "Running in $(pwd):"
1013

11-
cd $SLURM_SUBMIT_DIR # Change to working directory
12-
echo $(pwd)
1314
. ./mfc.sh load -c p -m gpu
1415
./mfc.sh test -j 12 -b mpirun -a
16+

misc/run-phoenix-release-gpu.sh

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
11
#!/bin/bash
2-
#SBATCH -Jshb-test-jobs # Job name
3-
#SBATCH --account=gts-sbryngelson3 # charge account
4-
#SBATCH -N1 # Number of nodes and cores per node required
2+
#SBATCH -Jshb-test-jobs # Job name
3+
#SBATCH --account=gts-sbryngelson3 # charge account
4+
#SBATCH -N1 # Number of nodes and cores per node required
55
#SBATCH -CV100-16GB
66
#SBATCH -G2
7-
#SBATCH -t 02:00:00 # Duration of the job (Ex: 15 mins)
8-
#SBATCH -q embers # QOS Name
9-
#SBATCH -otest.out # Combined output and error messages file
10-
#SBATCH -W # Do not exit until the submitted job terminates.
7+
#SBATCH -t 02:00:00 # Duration of the job (Ex: 15 mins)
8+
#SBATCH -q embers # QOS Name
9+
#SBATCH -otest.out # Combined output and error messages file
10+
#SBATCH -W # Do not exit until the submitted job terminates.
1111

12-
cd $SLURM_SUBMIT_DIR # Change to working directory
13-
echo $(pwd)
12+
cd "$SLURM_SUBMIT_DIR"
13+
echo "Running in $(pwd):"
1414

15-
. ./mfc.sh load -c p -m GPU
15+
set -x
1616

17-
nvidia-smi
18-
echo $(nproc)
17+
. ./mfc.sh load -c p -m GPU
1918

19+
./mfc.sh test -a -b mpirun -j $(nproc) \
20+
--gpu -g $(seq -s ',' 0 $(($(nvidia-smi -L | wc -l)-1)))
2021

21-
./mfc.sh test -b mpirun -a --gpu

toolchain/mfc/args.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ def add_common_arguments(p, mask = None):
8181
test.add_argument("-b", "--binary", choices=binaries, type=str, default=None, help="(Serial) Override MPI execution binary")
8282
test.add_argument("-r", "--relentless", action="store_true", default=False, help="Run all tests, even if multiple fail.")
8383
test.add_argument("-a", "--test-all", action="store_true", default=False, help="Run the Post Process Tests too.")
84+
test.add_argument("-g", "--gpus", type=str, default="0", help="(GPU) Comma separated list of GPU #s to use.")
8485

8586
test.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
8687

@@ -146,4 +147,8 @@ def append_defaults_to_data(name: str, parser):
146147
if args[e] is not None:
147148
args[e] = os.path.abspath(args[e])
148149

150+
# Parse the comma separated GPU ID string into a list of integers
151+
if "gpus" in args:
152+
args["gpus"] = [int(g) for g in args["gpus"].split(",")]
153+
149154
return args

toolchain/mfc/case.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import json, copy, dataclasses
1+
import json, math, copy, dataclasses
22

33
@dataclasses.dataclass(init=False)
44
class Case:
@@ -10,6 +10,9 @@ def __init__(self, params: dict) -> None:
1010
def get_parameters(self) -> str:
1111
return self.params.keys()
1212

13+
def get_cell_count(self) -> int:
14+
return math.prod([max(1, int(self.params.get(dir, 0))) for dir in {"m", "n", "p"}])
15+
1316
def has_parameter(self, key: str)-> bool:
1417
return key in self.get_parameters()
1518

toolchain/mfc/test/case.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,8 @@ def __init__(self, trace: str, mods: dict, ppn: int = None) -> None:
100100
self.ppn = ppn if ppn is not None else 1
101101
super().__init__({**BASE_CFG.copy(), **mods})
102102

103-
def run(self, targets: typing.List[str]) -> subprocess.CompletedProcess:
103+
def run(self, targets: typing.List[str], gpu: int) -> subprocess.CompletedProcess:
104+
gpu_select = f"CUDA_VISIBLE_DEVICES={gpu}"
104105
filepath = f'"{self.get_dirpath()}/case.py"'
105106
tasks = f"-n {self.ppn}"
106107
jobs = f"-j {ARG('jobs')}" if ARG("case_optimization") else ""
@@ -110,7 +111,7 @@ def run(self, targets: typing.List[str]) -> subprocess.CompletedProcess:
110111
mfc_script = ".\mfc.bat" if os.name == 'nt' else "./mfc.sh"
111112

112113
command: str = f'''\
113-
{mfc_script} run {filepath} {tasks} {binary_option} \
114+
{gpu_select} {mfc_script} run {filepath} {tasks} {binary_option} \
114115
{case_optimization} {jobs} -t {' '.join(targets)} 2>&1\
115116
'''
116117

toolchain/mfc/test/test.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,9 @@ def test():
9999
cons.print(f" tests/[bold magenta]UUID[/bold magenta] Summary")
100100
cons.print()
101101

102+
# Initialize GPU_LOAD to 0 for each GPU
103+
handle_case.GPU_LOAD = { id: 0 for id in ARG("gpus") }
104+
102105
# Select the correct number of threads to use to launch test CASES
103106
# We can't use ARG("jobs") when the --case-optimization option is set
104107
# because running a test case may cause it to rebuild, and thus
@@ -124,7 +127,7 @@ def test():
124127

125128
def handle_case(test: TestCase):
126129
global nFAIL
127-
130+
128131
try:
129132
if test.params.get("qbmm", 'F') == 'T':
130133
tol = 1e-10
@@ -137,8 +140,12 @@ def handle_case(test: TestCase):
137140

138141
test.delete_output()
139142
test.create_directory()
140-
141-
cmd = test.run(["pre_process", "simulation"])
143+
144+
load = test.get_cell_count()
145+
gpu_id = min(handle_case.GPU_LOAD.items(), key=lambda x: x[1])[0]
146+
handle_case.GPU_LOAD[gpu_id] += load
147+
148+
cmd = test.run(["pre_process", "simulation"], gpu=gpu_id)
142149

143150
out_filepath = os.path.join(test.get_dirpath(), "out_pre_sim.txt")
144151

@@ -178,7 +185,7 @@ def handle_case(test: TestCase):
178185

179186
if ARG("test_all"):
180187
test.delete_output()
181-
cmd = test.run(["pre_process", "simulation", "post_process"])
188+
cmd = test.run(["pre_process", "simulation", "post_process"], gpu=gpu_id)
182189
out_filepath = os.path.join(test.get_dirpath(), "out_post.txt")
183190
common.file_write(out_filepath, cmd.stdout)
184191

@@ -210,6 +217,7 @@ def handle_case(test: TestCase):
210217

211218
cons.print(f" [bold magenta]{test.get_uuid()}[/bold magenta] {test.trace}")
212219

220+
handle_case.GPU_LOAD[gpu_id] -= load
213221
except Exception as exc:
214222
nFAIL = nFAIL + 1
215223

@@ -218,3 +226,5 @@ def handle_case(test: TestCase):
218226

219227
cons.print(f"[bold red]Failed test {test}.[/bold red]")
220228
cons.print(f"{exc}")
229+
230+
handle_case.GPU_LOAD = {}

0 commit comments

Comments (0)