Skip to content

Commit 1eb779c

Browse files
bvanessentbennun
andauthored
Cleanup readme and fix bash-ism. (#17)
* Cleanup readme and fix bash-ism. * Moved PyTorch process affinity mapping and GPU memory limits into the torch __init__ function so that they are set when hpc_launcher.torch is imported. This should make it easier to use the library without calling the launcher from the command line. * Make the env variable a string in the shell test operation. * Capture the original command line and record it in the launch script. * Add a guard to make sure that active system parameters are defined. * Apply suggestions from code review Co-authored-by: Tal Ben-Nun <tbennun@users.noreply.github.com> * Fixed status of NERSC systems * Add an example of how to use HPC-Launcher within existing PyTorch code. --------- Co-authored-by: Tal Ben-Nun <tbennun@users.noreply.github.com>
1 parent 9f86966 commit 1eb779c

File tree

7 files changed

+92
-52
lines changed

7 files changed

+92
-52
lines changed

README.md

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,31 @@
1+
# HPC-launcher Repository
2+
3+
The HPC launcher repository contains a set of helpful scripts and
4+
Python bindings for launching LBANN 2.0 (PyTorch-core) on multiple
5+
leadership-class HPC systems. There are optimized routines for FLUX,
6+
SLURM, and LSF launchers. Currently there are supported systems at:
7+
- LLNL Livermore Computing (LC)
8+
- LBL NERSC (Pending)
9+
- ORNL OLCF (Pending)
10+
- RIKEN (Pending)
11+
12+
## Example Usage
13+
14+
Using the launch command to execute a command in parallel:
15+
```
16+
launch -N1 -n1 hostname
17+
```
18+
19+
Using the torchrun-hpc command to execute a PyTorch Python file in parallel on two nodes with four processes per node (8 in total):
20+
```
21+
torchrun-hpc -N2 -n4 file.py [arguments to Python file]
22+
```
23+
24+
Using HPC-Launcher within existing PyTorch code without explicitly invoking it from the command line (CLI). Within the top-level Python file, import `hpc_launcher.torch` first to ensure that `torch` is configured per HPC-Launcher's specification.
25+
```
26+
import hpc_launcher.torch
27+
```
28+
129
# LBANN: Livermore Big Artificial Neural Network Toolkit
230

331
The Livermore Big Artificial Neural Network toolkit (LBANN) is an
@@ -12,22 +40,11 @@ networks with massive amounts of data. LBANN is able to take advantage of
1240
tightly-coupled accelerators, low-latency high-bandwidth networking,
1341
and high-bandwidth parallel file systems.
1442

15-
## HPC-launcher Repository
16-
17-
The HPC launcher repository contains a set of helpful scripts and
18-
Python bindings for launching LBANN 2.0 (PyTorch-core) on multiple
19-
leadership-class HPC systems. There are optimized routines for FLUX,
20-
SLURM, and LSF launchers. Currently there are supported systems at:
21-
- LLNL Livermore Computing (LC)
22-
- LBL NERSC
23-
- ORNL OLCF
24-
- RIKEN
25-
2643
## Publications
2744

2845
A list of publications, presentations, and posters is shown
2946
[here](https://lbann.readthedocs.io/en/latest/publications.html).
3047

3148
## Reporting issues
3249
Issues, questions, and bugs can be raised on the [GitHub issue
33-
tracker](https://github.com/LBANN/lbann/issues).
50+
tracker](https://github.com/LBANN/HPC-launcher/issues).

hpc_launcher/cli/launch.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020

2121
logger = logging.getLogger(__name__)
2222

23-
2423
def main():
2524
parser = argparse.ArgumentParser(
2625
description=

hpc_launcher/cli/launch_helpers.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from hpc_launcher.systems.system import System, GenericSystem
2323
from hpc_launcher.systems import autodetect
2424
import logging
25+
import sys
2526

2627
def setup_logging(logger: logging.Logger, verbose: bool):
2728
if verbose:
@@ -48,6 +49,7 @@ def select_scheduler(args: argparse.Namespace, logger: logging.Logger, system: S
4849

4950
scheduler_args = common_args.create_scheduler_arguments(**vars(args))
5051
scheduler = scheduler_class(**scheduler_args)
52+
scheduler.command_line = sys.argv
5153

5254
logger.info(
5355
f'system parameters: node={scheduler.nodes} ppn={scheduler.procs_per_node}'

hpc_launcher/cli/torchrun_hpc.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,8 @@ def main():
8585
if args.fraction_max_gpu_mem and args.fraction_max_gpu_mem != 1.0:
8686
env_list.append(('TORCHRUN_HPC_MAX_GPU_MEM', args.fraction_max_gpu_mem))
8787
else:
88-
env_list.append(('TORCHRUN_HPC_MAX_GPU_MEM', system.active_system_params.fraction_max_gpu_mem))
88+
if system.active_system_params:
89+
env_list.append(('TORCHRUN_HPC_MAX_GPU_MEM', system.active_system_params.fraction_max_gpu_mem))
8990

9091
system.extend_environment_variables(env_list)
9192

hpc_launcher/cli/torchrun_hpc_stub.py

Lines changed: 26 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -11,31 +11,15 @@
1111
# https://github.com/LBANN and https://github.com/LLNL/LBANN.
1212
#
1313
# SPDX-License-Identifier: (Apache-2.0)
14-
from psutil import Process
15-
16-
# Save affinity before importing torch
17-
affinity = Process().cpu_affinity()
14+
import hpc_launcher.torch
1815

1916
import torch
20-
try:
21-
import mpi4py
22-
# This will automatically register MPI for initialization.
23-
import mpi_rdv
24-
from mpi4py import MPI
25-
mpi = True
26-
except (ImportError, ModuleNotFoundError):
27-
mpi = None
28-
2917
import torch.distributed as dist
3018
import runpy
3119
import atexit
3220
import sys
3321
import os
3422

35-
# Restore affinity after importing torch
36-
Process().cpu_affinity(affinity)
37-
import sys
38-
3923
from hpc_launcher.schedulers import get_schedulers
4024

4125

@@ -48,32 +32,37 @@ def main():
4832
(world_size, rank, local_world_size, local_rank) = scheduler.get_parallel_configuration()
4933

5034
rdv_protocol = os.getenv('TORCHRUN_HPC_RDV_PROTOCOL')
51-
if not mpi and rdv_protocol == 'mpi://':
52-
raise Exception(f'MPI rendezvous protocol selected without installing mpi_rndv library.')
53-
54-
if dist.is_initialized():
55-
raise Exception('PyTorch Distributed is already initialized')
56-
57-
print(f'Initializing distributed PyTorch using protocol: {rdv_protocol}')
58-
# TODO(later): Fix how we handle CUDA visible devices and MPI bind
59-
dist.init_process_group("nccl", init_method=rdv_protocol,
60-
world_size=world_size, rank=rank)
61-
6235
if rdv_protocol == 'mpi://':
63-
print('MPI Version: {}'.format(MPI.Get_version()))
64-
print('MPI Implementation: {}'.format(MPI.Get_library_version()))
65-
36+
try:
37+
import mpi4py
38+
# This will automatically register MPI for initialization.
39+
import mpi_rdv
40+
from mpi4py import MPI
41+
mpi = True
42+
except (ImportError, ModuleNotFoundError):
43+
mpi = None
44+
raise Exception(f'MPI rendezvous protocol selected without installing mpi_rndv library.')
45+
46+
torch_dist_initialized = dist.is_initialized()
47+
if not torch_dist_initialized:
48+
# raise Exception('PyTorch Distributed is already initialized')
49+
50+
print(f'Initializing distributed PyTorch using protocol: {rdv_protocol}')
51+
# TODO(later): Fix how we handle CUDA visible devices and MPI bind
52+
dist.init_process_group("nccl", init_method=rdv_protocol,
53+
world_size=world_size, rank=rank)
54+
55+
if rdv_protocol == 'mpi://':
56+
print('MPI Version: {}'.format(MPI.Get_version()))
57+
print('MPI Implementation: {}'.format(MPI.Get_library_version()))
6658

67-
fraction_max_gpu_mem = float(os.getenv('TORCHRUN_HPC_MAX_GPU_MEM'))
68-
if fraction_max_gpu_mem != 1.0:
69-
print(f'Setting the max GPU memory fraction to {fraction_max_gpu_mem}')
70-
torch.cuda.set_per_process_memory_fraction(fraction_max_gpu_mem)
7159

7260
# Run underlying script
7361
runpy.run_path(args[0], run_name="__main__")
7462

75-
# Deal with destroying the process group here
76-
dist.destroy_process_group()
63+
if not torch_dist_initialized:
64+
# Deal with destroying the process group here
65+
dist.destroy_process_group()
7766

7867

7968
if __name__ == "__main__":

hpc_launcher/schedulers/scheduler.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ class Scheduler:
6262
launcher_flags: Optional[list[str]] = None
6363
# Hijack preload commands into a scheduler
6464
ld_preloads: Optional[list[str]] = None
65+
# Capture the original command so that it can be added to the launch script
66+
command_line: Optional[list[str]] = None
6567

6668
def select_interactive_or_batch(self,
6769
tmp: list[str],
@@ -298,11 +300,12 @@ def launch(self,
298300
logger.info(f'Script filename: {filename}')
299301
with open(filename, 'w') as fp:
300302
fp.write(self.launcher_script(system, command, args, blocking))
301-
fp.write('\nif [[ ${RANK} -eq 0 ]]; then')
303+
fp.write('\nif [ "${RANK}" = "0" ]; then')
302304
fp.write('\n echo ${HPC_LAUNCHER_HOSTLIST} > '
303305
+ os.path.join(os.path.dirname(filename), f'hpc_launcher_hostlist.txt\n'))
304306
fp.write('fi\n')
305307
fp.write(f'\n# Launch command: ' + ' '.join(full_cmdline) + '\n')
308+
fp.write(f'# User command invoked: ' + ' '.join(self.command_line) + '\n')
306309
os.chmod(filename, 0o700)
307310

308311
if setup_only:

hpc_launcher/torch/__init__.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC.
2+
# Produced at the Lawrence Livermore National Laboratory.
3+
# Written by the LBANN Research Team (B. Van Essen, et al.) listed in
4+
# the CONTRIBUTORS file. See the top-level LICENSE file for details.
5+
#
6+
# LLNL-CODE-697807.
7+
# All rights reserved.
8+
#
9+
# This file is part of LBANN: Livermore Big Artificial Neural Network
10+
# Toolkit. For details, see http://software.llnl.gov/LBANN or
11+
# https://github.com/LBANN and https://github.com/LLNL/LBANN.
12+
#
13+
# SPDX-License-Identifier: (Apache-2.0)
14+
from psutil import Process
15+
16+
# Save affinity before importing torch
17+
affinity = Process().cpu_affinity()
18+
19+
import torch
20+
21+
# Restore affinity after importing torch
22+
Process().cpu_affinity(affinity)
23+
import os
24+
25+
if torch.cuda.is_available():
26+
fraction_max_gpu_mem = float(os.getenv('TORCHRUN_HPC_MAX_GPU_MEM'))
27+
if fraction_max_gpu_mem != 1.0:
28+
print(f'Setting the max GPU memory fraction to {fraction_max_gpu_mem}')
29+
torch.cuda.set_per_process_memory_fraction(fraction_max_gpu_mem)

0 commit comments

Comments
 (0)