Skip to content

Commit a5707c7

Browse files
authored
Added markdown of the CLI help messages. (#51)
Added markdown of the CLI help messages. * Fix some bugs in the examples. * Fixed a bug where the number of GPUs required was returned as a float rather than an int. * Fixed how banks and accounts are specified. * Fixed a bug in the check for None vs. an empty string in the configuration of the launch directory argument. Updated some of the test configuration examples. * Improved how the system architecture (especially if overridden) is reported. * Added guards for setting out or error log files for ephemeral jobs. * Added a function to the scheduler class to get the environment variable for each rank's ID. Updated the launch script so that it gets the RANK environment variable so that it can write out the hostlist if necessary. Improved the guards for ephemeral job CLI flags. * Removed debugging code. * Cleaned up and improved integration with the torchrun-hpc CLI argument to set the max memory size and the CLI parameter list. Fixed a bug in how the system parameters are mutated from the CLI. * Updated env variable. * Added a default argument for the max GPU mem. * Fixed how Slurm runs check for the root node in a torch run. * Finished cleaning up the torchrun-hpc CLI examples. * Fixed tests to use a launch directory. * Minor cleanup
1 parent 2d93ff5 commit a5707c7

File tree

18 files changed

+890
-31
lines changed

18 files changed

+890
-31
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,11 @@ Using HPC-Launcher within existing PyTorch code with explicitly invoking it from
3333
import hpc_launcher.torch
3434
```
3535

36+
## CLI options for HPC-Launcher `launch` and `torchrun-hpc` commands
37+
38+
- [`launch`](./launch_cli.md) - General purpose HPC job launcher
39+
- [`torchrun-hpc`](./torchrun-hpc_cli_2.md) - PyTorch-specific distributed training launcher
40+
3641
# LBANN: Livermore Big Artificial Neural Network Toolkit
3742

3843
The Livermore Big Artificial Neural Network toolkit (LBANN) is an

hpc_launcher/cli/common_args.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -358,8 +358,18 @@ def validate_arguments(args: argparse.Namespace):
358358
if os.path.dirname(output_script):
359359
raise ValueError(f"User provided output script filename cannot be a absolute or relative path: {output_script}")
360360

361-
if args.output_script and not args.launch_dir and not args.bg:
362-
raise ValueError("A output script file name was provided for a ephemeral interative job.")
361+
if args.launch_dir == None and not args.bg: # ephemeral interactive job
362+
if args.output_script:
363+
raise ValueError("A output script file name was provided for a ephemeral interative job.")
364+
365+
if args.out_log_file:
366+
raise ValueError("A output log file name was provided for a ephemeral interative job.")
367+
368+
if args.err_log_file:
369+
raise ValueError("A error log file name was provided for a ephemeral interative job.")
370+
371+
if args.save_hostlist:
372+
raise ValueError("Saving the hostlist was requested for a ephemeral interative job.")
363373

364374
if args.output_script and args.batch_script:
365375
raise ValueError("Cannot specify both an output script name: {args.output_script} and a pre-generated batch script {args.batch_script}.")

hpc_launcher/cli/launch.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ def main():
8181
args.setup_only,
8282
args.color_stderr,
8383
args.dry_run,
84-
args.launch_dir != None and (args.save_hostlist or args.verbose),
84+
args.launch_dir != None and args.save_hostlist,
8585
args.batch_script != "", # If a batch script is provided don't allow it to be modified
8686
)
8787

hpc_launcher/cli/torchrun_hpc.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,11 @@ def main():
7575

7676
launch_helpers.setup_logging(logger, args.verbose)
7777

78+
if args.fraction_max_gpu_mem and args.fraction_max_gpu_mem != 1.0:
79+
if not args.system_params:
80+
args.system_params = {}
81+
args.system_params["fraction_max_gpu_mem"] = args.fraction_max_gpu_mem
82+
7883
# Process special arguments that can autoselect the number of ranks / GPUs
7984
system = common_args.process_arguments(args, logger)
8085
optimize_comm_protocol = ""
@@ -104,17 +109,6 @@ def main():
104109
else:
105110
raise Exception(f"Unknown rendezvous {args.rdv} requested.")
106111

107-
if args.fraction_max_gpu_mem and args.fraction_max_gpu_mem != 1.0:
108-
env_list.append(("TORCHRUN_HPC_MAX_GPU_MEM", args.fraction_max_gpu_mem))
109-
else:
110-
if system.active_system_params:
111-
env_list.append(
112-
(
113-
"TORCHRUN_HPC_MAX_GPU_MEM",
114-
system.active_system_params.fraction_max_gpu_mem,
115-
)
116-
)
117-
118112
if args.unswap_rocr_hip_vis_dev:
119113
env_list.append(("TORCHRUN_HPC_UNSWAP_ROCR_HIP_VIS_DEV", "TRUE"))
120114

@@ -176,7 +170,7 @@ def main():
176170
args.setup_only,
177171
args.color_stderr,
178172
args.dry_run,
179-
args.launch_dir != None and (args.save_hostlist or args.verbose),
173+
args.launch_dir != None and args.save_hostlist,
180174
)
181175

182176
if jobid:

hpc_launcher/schedulers/flux.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def build_scheduler_specific_arguments(
107107
self.submit_only_args["--queue"] = f"{self.queue}"
108108

109109
if self.account:
110-
self.submit_only_args["--account"] = f"{self.account}"
110+
self.submit_only_args["--bank"] = f"{self.account}"
111111

112112
if self.reservation:
113113
logger.warning(
@@ -158,6 +158,10 @@ def num_nodes_in_allocation(cls) -> Optional[int]:
158158

159159
return None
160160

161+
@classmethod
162+
def get_parallel_rank_env_variable(self) -> str:
163+
return "${FLUX_TASK_RANK}"
164+
161165
@classmethod
162166
def get_parallel_configuration(cls) -> tuple[int, int, int, int]:
163167
env_vars = [
@@ -185,7 +189,7 @@ def get_parallel_configuration(cls) -> tuple[int, int, int, int]:
185189

186190
def dynamically_configure_rendezvous_protocol(self, protocol: str) -> list[str]:
187191
env_list = []
188-
env_list.append(("RANK", "${FLUX_TASK_RANK}"))
192+
env_list.append(("RANK", self.get_parallel_rank_env_variable()))
189193
if protocol.lower() == "tcp":
190194
env_list.append(
191195
(

hpc_launcher/schedulers/lsf.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,10 @@ def num_nodes_in_allocation(cls) -> Optional[int]:
141141

142142
return None
143143

144+
@classmethod
145+
def get_parallel_rank_env_variable(self) -> str:
146+
return "${OMPI_COMM_WORLD_RANK}"
147+
144148
@classmethod
145149
def get_parallel_configuration(cls) -> tuple[int, int, int, int]:
146150
env_vars = [
@@ -167,7 +171,7 @@ def get_parallel_configuration(cls) -> tuple[int, int, int, int]:
167171

168172
def dynamically_configure_rendezvous_protocol(self, protocol: str) -> list[str]:
169173
env_list = []
170-
env_list.append(("RANK", "${OMPI_COMM_WORLD_RANK}"))
174+
env_list.append(("RANK", self.get_parallel_rank_env_variable()))
171175
if protocol.lower() == "tcp":
172176
if os.getenv("LSB_HOSTS"):
173177
# When runing under an allocation use the current node as the coordinator

hpc_launcher/schedulers/scheduler.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,11 +333,17 @@ def launcher_script(
333333
logger.info(f"Callee directory: {callee_directory} - and {launch_dir}")
334334
script += f"export PYTHONPATH={callee_directory}:" + "${PYTHONPATH}\n"
335335
if save_hostlist:
336+
script += f'export RANK={self.get_parallel_rank_env_variable()}\n'
336337
script += self.export_hostlist()
337338
script += 'if [ "${RANK}" = "0" ]; then\n'
338339
script += " echo ${HPC_LAUNCHER_HOSTLIST} > " + os.path.join(launch_dir, f"hpc_launcher_hostlist.txt\n")
339340
script += "fi\n\n"
340341

342+
if system.active_system_params:
343+
system_params = system.active_system_params
344+
if system_params.fraction_max_gpu_mem and system_params.fraction_max_gpu_mem != 1.0:
345+
script += f'export HPC_LAUNCHER_MAX_GPU_MEM={system_params.fraction_max_gpu_mem}\n'
346+
341347
if self.require_parallel_internal_run_command(blocking):
342348
script += self.internal_script_run_command()
343349
script += " ".join(cmd_args)
@@ -386,6 +392,15 @@ def num_nodes_in_allocation(cls) -> tuple[int]:
386392
"""
387393
raise NotImplementedError
388394

395+
@classmethod
396+
def get_parallel_rank_env_variable(cls) -> str:
397+
"""
398+
When running under an allocation, return the environment variable to get the current rank
399+
400+
:return: environment variable for rank in an allocation
401+
"""
402+
raise NotImplementedError
403+
389404
@classmethod
390405
def get_parallel_configuration(cls) -> tuple[int, int, int, int]:
391406
"""

hpc_launcher/schedulers/slurm.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,10 @@ def num_nodes_in_allocation(cls) -> Optional[int]:
168168

169169
return None
170170

171+
@classmethod
172+
def get_parallel_rank_env_variable(self) -> str:
173+
return "${SLURM_PROCID}"
174+
171175
@classmethod
172176
def get_parallel_configuration(cls) -> tuple[int, int, int, int]:
173177
# Interesting but unused variables SLURM_JOB_NUM_NODES, SLURM_NPROCS, SLURM_DISTRIBUTION
@@ -192,12 +196,12 @@ def get_parallel_configuration(cls) -> tuple[int, int, int, int]:
192196
@classmethod
193197
def dynamically_configure_rendezvous_protocol(self, protocol: str) -> str:
194198
env_list = []
195-
env_list.append(("RANK", "${SLURM_PROCID}"))
199+
env_list.append(("RANK", self.get_parallel_rank_env_variable()))
196200
if protocol.lower() == "tcp":
197201
env_list.append(
198202
(
199203
"TORCHRUN_HPC_MASTER_ADDR",
200-
"`printenv SLURM_JOB_NODELIST | /bin/hostlist -n 1`",
204+
"`scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1`",
201205
)
202206
)
203207
env_list.append(("TORCHRUN_HPC_MASTER_PORT", "23456"))

hpc_launcher/systems/autodetect.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ def find_AMD_gpus() -> (int, float, str):
6060
finally:
6161
try:
6262
smi.amdsmi_shut_down()
63+
return (0, 0, None)
6364
except smi.AmdSmiException as e:
6465
return (0, 0, None)
6566

@@ -89,6 +90,7 @@ def find_NVIDIA_gpus() -> (int, float, str):
8990
finally:
9091
try:
9192
pynvml.nvmlShutdown()
93+
return (0, 0, None)
9294
except pynvml.NVMLError as e:
9395
return (0, 0, None)
9496

hpc_launcher/systems/configure.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def configure_launch(
3030
gpus_per_proc: Optional[int],
3131
gpus_at_least: int = 0,
3232
gpumem_at_least: int = 0,
33-
cli_system_params: Optional[tuple[int, int, str, float, int, str, Optional[float]]] = None,
33+
cli_system_params: Optional[dict[str, str]] = None,
3434
job_comm_protocol: Optional[str] = None,
3535
) -> tuple[System, int, int, int]:
3636
"""
@@ -63,15 +63,24 @@ def configure_launch(
6363
system_params = system.system_parameters(queue)
6464

6565
# If any system parameters were provided on the command line, potentially overriding any known or discovered system parameters
66+
msg = ""
6667
if cli_system_params:
68+
msg = " (CLI Override) "
6769
if not system_params: # Use a default set of system parameters
68-
system_params = SystemParams()
69-
_cli_system_params_dict = asdict(system_params)
70+
# for the active system params
71+
system.active_system_params = SystemParams()
72+
system_params = system.active_system_params()
7073
for field in fields(system_params):
7174
if field.name in cli_system_params:
72-
_cli_system_params_dict[field.name] = convert_to_type_of_another(cli_system_params[field.name], _cli_system_params_dict[field.name])
73-
# Create a new system_params with the proper fields overwritten
74-
system_params = SystemParams(**_cli_system_params_dict)
75+
system_params.__dict__[field.name] = convert_to_type_of_another(cli_system_params[field.name], system_params.__dict__[field.name])
76+
del cli_system_params[field.name]
77+
78+
for unused_field in cli_system_params.keys():
79+
raise ValueError(f"System Parameters CLI attempt to overwrite unknown field: {unused_field}")
80+
81+
logger.info(
82+
f"Active System Parameters{msg}: {system.active_system_params.prettyprint()}"
83+
)
7584

7685
if not gpus_per_proc:
7786
gpus_per_proc = 0

0 commit comments

Comments
 (0)