Skip to content

Commit 90ba0e3

Browse files
authored
Bugfix ci (#28)
* Fix the configuration launch scripts. Add a check for when the number of ranks and the number of GPUs per rank exceed the total number of GPUs per node. * Updating tests to new API. * Fixed a bug in the scheduler classes, where the host file was being created after the job was run. This prevented the CI tests from self checking. Additionally, added checks in the CI to skip if MPI wasn't detected.
1 parent 7f476d9 commit 90ba0e3

File tree

9 files changed

+89
-33
lines changed

9 files changed

+89
-33
lines changed

hpc_launcher/schedulers/flux.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ def launcher_script(
158158
args: Optional[list[str]] = None,
159159
blocking: bool = True,
160160
save_hostlist: bool = False,
161+
launch_dir: str = "",
161162
) -> str:
162163

163164
script = ""
@@ -169,6 +170,9 @@ def launcher_script(
169170
script += "\n"
170171
if save_hostlist:
171172
script += "export HPC_LAUNCHER_HOSTLIST=$(flux hostlist local)\n"
173+
script += '\nif [ "${RANK}" = "0" ]; then'
174+
script += "\n echo ${HPC_LAUNCHER_HOSTLIST} > " + os.path.join(launch_dir, f"hpc_launcher_hostlist.txt\n")
175+
script += "fi\n"
172176

173177
if not blocking:
174178
script += "flux run "

hpc_launcher/schedulers/local.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def launcher_script(
4242
args: Optional[list[str]] = None,
4343
blocking: bool = True,
4444
save_hostlist: bool = False,
45+
launch_dir: str = "",
4546
) -> str:
4647
envvars = [f"export {k}={v}" for k, v in system.environment_variables()]
4748
envvars += [
@@ -53,6 +54,9 @@ def launcher_script(
5354
if save_hostlist:
5455
envvars += [
5556
"export HPC_LAUNCHER_HOSTLIST=$(hostname)",
57+
'\nif [ "${RANK}" = "0" ]; then',
58+
"\n echo ${HPC_LAUNCHER_HOSTLIST} > " + os.path.join(launch_dir, f"hpc_launcher_hostlist.txt\n"),
59+
"fi\n",
5660
]
5761
header = "\n".join(envvars)
5862

hpc_launcher/schedulers/lsf.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ def launcher_script(
137137
args: Optional[list[str]] = None,
138138
blocking: bool = True,
139139
save_hostlist: bool = False,
140+
launch_dir: str = "",
140141
) -> str:
141142

142143
script = ""
@@ -148,6 +149,9 @@ def launcher_script(
148149
script += "\n"
149150
if save_hostlist:
150151
script += "export HPC_LAUNCHER_HOSTLIST=$(echo $LSB_HOSTS | tr ' ' '\\n' | sort -u)\n\n"
152+
script += '\nif [ "${RANK}" = "0" ]; then'
153+
script += "\n echo ${HPC_LAUNCHER_HOSTLIST} > " + os.path.join(launch_dir, f"hpc_launcher_hostlist.txt\n")
154+
script += "fi\n"
151155

152156
if not blocking or (blocking and not os.getenv("LSB_HOSTS")):
153157
script += "jsrun "

hpc_launcher/schedulers/scheduler.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ def launcher_script(
120120
args: Optional[list[str]] = None,
121121
blocking: bool = True,
122122
save_hostlist: bool = False,
123+
launch_dir: str = "",
123124
) -> str:
124125
"""
125126
Returns the full launcher script, which can be saved as a batch
@@ -335,17 +336,8 @@ def launch(
335336
logger.info(f"Script filename: {filename}")
336337
with open(filename, "w") as fp:
337338
fp.write(
338-
self.launcher_script(system, command, args, blocking, save_hostlist)
339+
self.launcher_script(system, command, args, blocking, save_hostlist, os.path.dirname(filename))
339340
)
340-
if save_hostlist:
341-
fp.write('\nif [ "${RANK}" = "0" ]; then')
342-
fp.write(
343-
"\n echo ${HPC_LAUNCHER_HOSTLIST} > "
344-
+ os.path.join(
345-
os.path.dirname(filename), f"hpc_launcher_hostlist.txt\n"
346-
)
347-
)
348-
fp.write("fi\n")
349341

350342
fp.write(f"\n# Launch command: " + " ".join(full_cmdline) + "\n")
351343
if self.command_line:

hpc_launcher/schedulers/slurm.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@ def launcher_script(
167167
args: Optional[list[str]] = None,
168168
blocking: bool = True,
169169
save_hostlist: bool = False,
170+
launch_dir: str = "",
170171
) -> str:
171172

172173
script = ""
@@ -180,6 +181,9 @@ def launcher_script(
180181
script += "\n"
181182
if save_hostlist:
182183
script += "export HPC_LAUNCHER_HOSTLIST=${SLURM_JOB_NODELIST}\n"
184+
script += '\nif [ "${RANK}" = "0" ]; then'
185+
script += "\n echo ${HPC_LAUNCHER_HOSTLIST} > " + os.path.join(launch_dir, f"hpc_launcher_hostlist.txt\n")
186+
script += "fi\n"
183187

184188
if not blocking:
185189
script += "srun -u "

hpc_launcher/systems/configure.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,10 @@ def configure_launch(
2727
queue: str,
2828
nodes: int,
2929
procs_per_node: int,
30-
gpus_per_proc: int,
31-
gpus_at_least: int,
32-
gpumem_at_least: int,
33-
cli_system_params: Optional[tuple[int, int, str, float, int, str, Optional[float]]],
30+
gpus_per_proc: Optional[int],
31+
gpus_at_least: int = 0,
32+
gpumem_at_least: int = 0,
33+
cli_system_params: Optional[tuple[int, int, str, float, int, str, Optional[float]]] = None,
3434
) -> tuple[System, int, int, int]:
3535
"""
3636
See if the system can be autodetected and then process some special
@@ -40,10 +40,14 @@ def configure_launch(
4040
:param nodes: The number of nodes to use (or 0 if not specified)
4141
:param procs_per_node: The number of processes per node given by the user
4242
(or 0 if not specified)
43+
:param gpus_per_proc: The number of GPUs per process given by the user
44+
(or None if not specified)
4345
:param gpus_at_least: The minimum number of GPUs to use (or 0 if not
4446
specified)
4547
:param gpumem_at_least: The minimum amount of GPU memory (in gigabytes) to
4648
use (or 0 if not specified)
49+
:param cli_system_params: CLI provide description of the system configuration
50+
(or None if not specified)
4751
:return: A tuple of (autodetected System, number of nodes, number of
4852
processes per node)
4953
"""
@@ -66,19 +70,25 @@ def configure_launch(
6670

6771
if not gpus_per_proc:
6872
gpus_per_proc = 0
73+
74+
# If not provided, attempt to figure out the basics of procs_per_node and gpus_per_proc
6975
if system_params is not None:
76+
if not procs_per_node:
77+
procs_per_node = system_params.procs_per_node()
7078
if gpus_per_proc == 0 and system_params.gpus_per_node > 0:
7179
# If gpus_per_proc wasn't set and there are gpus on the node set it to a default of 1
7280
gpus_per_proc = 1
73-
if gpus_per_proc > system_params.gpus_per_node:
81+
if procs_per_node * gpus_per_proc > system_params.gpus_per_node:
7482
logger.info(
7583
f"Requested number of GPUs per process {gpus_per_proc} exceeds the number of GPUs per node {system_params.gpus_per_node}"
7684
)
77-
gpus_per_proc = system_params.gpus_per_node
85+
# If no, or an invalid, configuration is given, set the gpus_per_proc
86+
if gpus_per_proc == 0 or gpus_per_proc > system_params.gpus_per_node:
87+
gpus_per_proc = max(system_params.gpus_per_node // procs_per_node, 1)
7888

79-
if procs_per_node * gpus_per_proc > system_params.gpus_per_node:
89+
if procs_per_node and procs_per_node * gpus_per_proc > system_params.gpus_per_node:
8090
logger.info(
81-
f"The combination of {procs_per_node} processes per node and {gpus_per_proc} GPUs per process exceeds the number of GPUs per node {system_params.gpus_per_node}"
91+
f"The combination of {procs_per_node} processes per node and {gpus_per_proc} GPUs per process exceeds the number of GPUs per node {system_params.gpus_per_node} - Job will not launch, please fix requested parameters"
8292
)
8393

8494
# If the user requested a specific number of processes per node, honor that
@@ -88,8 +98,6 @@ def configure_launch(
8898
# Otherwise, if there is a valid set of system parameters, try to fill in
8999
# the blanks provided by the user
90100
if system_params is not None:
91-
if not procs_per_node:
92-
procs_per_node = system_params.procs_per_node()
93101
if gpus_at_least > 0:
94102
nodes = ceildiv(gpus_at_least, procs_per_node)
95103
elif gpumem_at_least > 0:

tests/launch_config_test.py

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,28 +71,51 @@ def test_launch_config(*args):
7171
Tests various launch configurations for GPU count and memory size.
7272
"""
7373
# User-specified procs_per_node
74-
system, nodes, procs_per_node = configure_launch(None, 2, 4, 0, 0)
74+
system, nodes, procs_per_node, gpus_per_proc = configure_launch(None, 2, 4, 1, 0, 0, None)
7575
assert isinstance(system, MockSystem)
7676
assert nodes == 2
7777
assert procs_per_node == 4
78+
assert gpus_per_proc == 1
7879

7980
# GPU count constraint test
80-
system, nodes, procs_per_node = configure_launch(None, 0, 0, 6, 0)
81+
system, nodes, procs_per_node, gpus_per_proc = configure_launch(None, 0, 0, 1, 6, 0, None)
8182
assert isinstance(system, MockSystem)
8283
assert nodes == 2
8384
assert procs_per_node == 3
85+
assert gpus_per_proc == 1
8486

8587
# Memory constraint test
86-
system, nodes, procs_per_node = configure_launch(None, 0, 0, 0, 22)
88+
system, nodes, procs_per_node, gpus_per_proc = configure_launch(None, 0, 0, 1, 0, 22, None)
8789
assert isinstance(system, MockSystem)
8890
assert nodes == 1
8991
assert procs_per_node == 2
92+
assert gpus_per_proc == 1
9093

9194
# Just above the memory limit of a single node, this triggers a switch to all gpus per node
92-
system, nodes, procs_per_node = configure_launch(None, 0, 0, 0, 34)
95+
system, nodes, procs_per_node, gpus_per_proc = configure_launch(None, 0, 0, 1, 0, 34, None)
9396
assert isinstance(system, MockSystem)
9497
assert nodes == 2
9598
assert procs_per_node == 3
99+
assert gpus_per_proc == 1
100+
101+
# Ask for too many GPUs per proc, this should snap down to the 3 GPUs available
102+
system, nodes, procs_per_node, gpus_per_proc = configure_launch(None, 2, 1, 4, 0, 0, None)
103+
assert isinstance(system, MockSystem)
104+
assert nodes == 2
105+
assert procs_per_node == 1
106+
assert gpus_per_proc == 3
107+
108+
system, nodes, procs_per_node, gpus_per_proc = configure_launch(None, 2, 2, 2, 0, 0, None)
109+
assert isinstance(system, MockSystem)
110+
assert nodes == 2
111+
assert procs_per_node == 2
112+
assert gpus_per_proc == 2
113+
114+
system, nodes, procs_per_node, gpus_per_proc = configure_launch(None, 2, 2, None, 0, 0, None)
115+
assert isinstance(system, MockSystem)
116+
assert nodes == 2
117+
assert procs_per_node == 2
118+
assert gpus_per_proc == 1
96119

97120

98121
@patch(
@@ -103,16 +126,18 @@ def test_nondefault_queue(*args):
103126
"""
104127
Tests the configuration of a non-default queue.
105128
"""
106-
system, nodes, procs_per_node = configure_launch("nondefault", 1, None, 0, 0)
129+
system, nodes, procs_per_node, gpus_per_proc = configure_launch("nondefault", 1, 2, 1, 0, 0, None)
107130
assert isinstance(system, MockSystem)
108131
assert nodes == 1
109132
assert procs_per_node == 2
133+
assert gpus_per_proc == 1
110134

111135
# Memory constraint test
112-
system, nodes, procs_per_node = configure_launch("nondefault", 0, 0, 0, 22)
136+
system, nodes, procs_per_node, gpus_per_proc = configure_launch("nondefault", 0, 0, 1, 0, 22, None)
113137
assert isinstance(system, MockSystem)
114138
assert nodes == 3
115139
assert procs_per_node == 2
140+
assert gpus_per_proc == 1
116141

117142

118143
@patch(
@@ -125,16 +150,18 @@ def test_preferred_procs_per_node(*args):
125150
"""
126151

127152
# User specifies only number of nodes (GPU system)
128-
system, nodes, procs_per_node = configure_launch(None, 3, 0, 0, 0)
153+
system, nodes, procs_per_node, gpus_per_proc = configure_launch(None, 3, 0, 1, 0, 0, None)
129154
assert isinstance(system, MockSystem)
130155
assert nodes == 3
131156
assert procs_per_node == 3
157+
assert gpus_per_proc == 1
132158

133159
# User specifies only number of nodes (CPU system)
134-
system, nodes, procs_per_node = configure_launch("cpuonly", 3, 0, 0, 0)
160+
system, nodes, procs_per_node, gpus_per_proc = configure_launch("cpuonly", 3, 0, 0, 0, 0, None)
135161
assert isinstance(system, MockSystem)
136162
assert nodes == 3
137163
assert procs_per_node == 4
164+
assert gpus_per_proc == 0
138165

139166

140167
@patch(

tests/output_capture_test.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@
2727
@pytest.mark.parametrize("no_launch_dir", [False, True])
2828
def test_output_capture_local(no_launch_dir: bool):
2929
# Configure scheduler
30-
system, nodes, procs_per_node = configure.configure_launch(None, 1, 1, None, None)
31-
scheduler = LocalScheduler(nodes, procs_per_node)
30+
system, nodes, procs_per_node, gpus_per_proc = configure.configure_launch(None, 1, 1, 1, None, None)
31+
scheduler = LocalScheduler(nodes, procs_per_node, gpus_per_proc)
3232

3333
command = sys.executable
3434
script = "output_capture.py"
@@ -77,10 +77,10 @@ def test_output_capture_scheduler(scheduler_class, processes):
7777
pytest.skip("LSF not available")
7878

7979
# Configure scheduler
80-
system, nodes, procs_per_node = configure.configure_launch(
81-
None, 1, processes, None, None
80+
system, nodes, procs_per_node, gpus_per_proc = configure.configure_launch(
81+
None, 1, processes, 1, None, None
8282
)
83-
scheduler = scheduler_class(nodes, procs_per_node)
83+
scheduler = scheduler_class(nodes, procs_per_node, gpus_per_proc)
8484

8585
command = sys.executable
8686
_, launch_dir = scheduler.create_launch_folder_name(command, "launch")

tests/test_torchrun_hpc.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,11 @@ def test_launcher_multinode(num_nodes, procs_per_node, rdv, scheduler_type):
144144
except (ImportError, ModuleNotFoundError):
145145
pytest.skip("torch not found")
146146

147+
try:
148+
import mpi4py
149+
except (ImportError, ModuleNotFoundError):
150+
pytest.skip("mpi not found")
151+
147152
# Get full path to torch_dist_driver.py
148153
driver_file = os.path.join(os.path.dirname(__file__), "torch_dist_driver.py")
149154

@@ -183,3 +188,11 @@ def test_launcher_multinode(num_nodes, procs_per_node, rdv, scheduler_type):
183188

184189
if exp_dir:
185190
shutil.rmtree(exp_dir, ignore_errors=True)
191+
192+
if __name__ == "__main__":
193+
test_launcher_multinode(2, 1, "tcp", "slurm")
194+
test_launcher_multinode(2, 1, "tcp", "flux")
195+
test_launcher_multinode(2, 1, "tcp", "lsf")
196+
test_launcher_multinode(2, 1, "mpi", "slurm")
197+
test_launcher_multinode(2, 1, "mpi", "flux")
198+
test_launcher_multinode(2, 1, "mpi", "lsf")

0 commit comments

Comments
 (0)