Cleanup readme and fix bash-ism. (#17)

bvanessen · tbennun · web-flow · commit 1eb779c7de21 · 2025-01-19T16:32:15.000-08:00
* Cleanup readme and fix bash-ism.

* Moved PyTorch process affinity mapping and GPU memory limits into the
torch __init__ function so that they are set when hpc_launcher.torch
is imported.  This should make it easier to use the library without
calling the launcher from the command line.

* Make the env variable a string in the shell test operation.

* Capture the original command line and record it in the launch script.

* Add a guard to make sure that the active system parameters are defined.

* Apply suggestions from code review

Co-authored-by: Tal Ben-Nun <tbennun@users.noreply.github.com>

* Fixed status of NERSC systems

* Add an example of how to use HPC-Launcher within an existing PyTorch code.

---------

Co-authored-by: Tal Ben-Nun <tbennun@users.noreply.github.com>
diff --git a/README.md b/README.md
@@ -1,3 +1,31 @@
+# HPC-launcher Repository
+
+The HPC launcher repository contains a set of helpful scripts and
+Python bindings for launching LBANN 2.0 (PyTorch-core) on multiple
+leadership-class HPC systems.  There are optimized routines for FLUX,
+SLURM, and LSF launchers.  Currently there are supported systems at:
+ - LLNL Livermore Computing (LC)
+ - LBL NERSC (Pending)
+ - ORNL OLCF (Pending)
+ - RIKEN (Pending)
+
+## Example Usage
+
+Using the launch command to execute a command in parallel:
+```
+launch -N1 -n1 hostname
+```
+
+Using the torchrun-hpc command to execute a PyTorch Python file in parallel on two nodes and four processes per node (8 in total):
+```
+torchrun-hpc -N2 -n4 file.py [arguments to Python file]
+```
+
+Using HPC-Launcher within existing PyTorch code without explicitly invoking it from the command line (CLI).  Within the top level Python file, import `hpc_launcher.torch` first to ensure that `torch` is configured per HPC-Launcher's specification.
+```
+import hpc_launcher.torch
+```
+
 # LBANN: Livermore Big Artificial Neural Network Toolkit
 
 The Livermore Big Artificial Neural Network toolkit (LBANN) is an
@@ -12,22 +40,11 @@ networks with massive amounts of data.  LBANN is able to advantage of
 tightly-coupled accelerators, low-latency high-bandwidth networking,
 and high-bandwidth parallel file systems.
 
-## HPC-launcher Repository
-
-The HPC launcher repository contains a set of helpful scripts and
-Python bindings for launching LBANN 2.0 (PyTorch-core) on multiple
-leadership-class HPC systems.  There are optimized routines for FLUX,
-SLURM, and LSF launchers.  Currently there are supported systems at:
- - LLNL Livermore Computing (LC)
- - LBL NERSC
- - ORNL OLCF
- - RIKEN
- 
 ## Publications
 
 A list of publications, presentations and posters are shown
 [here](https://lbann.readthedocs.io/en/latest/publications.html).
 
 ## Reporting issues
 Issues, questions, and bugs can be raised on the [Github issue
-tracker](https://github.com/LBANN/lbann/issues).
+tracker](https://github.com/LBANN/HPC-launcher/issues).
diff --git a/hpc_launcher/cli/launch.py b/hpc_launcher/cli/launch.py
@@ -20,7 +20,6 @@
 
 logger = logging.getLogger(__name__)
 
-
 def main():
     parser = argparse.ArgumentParser(
         description=
diff --git a/hpc_launcher/cli/launch_helpers.py b/hpc_launcher/cli/launch_helpers.py
@@ -22,6 +22,7 @@
 from hpc_launcher.systems.system import System, GenericSystem
 from hpc_launcher.systems import autodetect
 import logging
+import sys
 
 def setup_logging(logger: logging.Logger, verbose: bool):
     if verbose:
@@ -48,6 +49,7 @@ def select_scheduler(args: argparse.Namespace, logger: logging.Logger, system: S
 
     scheduler_args = common_args.create_scheduler_arguments(**vars(args))
     scheduler = scheduler_class(**scheduler_args)
+    scheduler.command_line = sys.argv
 
     logger.info(
         f'system parameters: node={scheduler.nodes} ppn={scheduler.procs_per_node}'
diff --git a/hpc_launcher/cli/torchrun_hpc.py b/hpc_launcher/cli/torchrun_hpc.py
@@ -85,7 +85,8 @@ def main():
     if args.fraction_max_gpu_mem and args.fraction_max_gpu_mem != 1.0:
         env_list.append(('TORCHRUN_HPC_MAX_GPU_MEM', args.fraction_max_gpu_mem))
     else:
-        env_list.append(('TORCHRUN_HPC_MAX_GPU_MEM', system.active_system_params.fraction_max_gpu_mem))
+        if system.active_system_params:
+            env_list.append(('TORCHRUN_HPC_MAX_GPU_MEM', system.active_system_params.fraction_max_gpu_mem))
 
     system.extend_environment_variables(env_list)
 
diff --git a/hpc_launcher/cli/torchrun_hpc_stub.py b/hpc_launcher/cli/torchrun_hpc_stub.py
@@ -11,31 +11,15 @@
 # https://github.com/LBANN and https://github.com/LLNL/LBANN.
 #
 # SPDX-License-Identifier: (Apache-2.0)
-from psutil import Process
-
-# Save affinity before importing torch
-affinity = Process().cpu_affinity()
+import hpc_launcher.torch
 
 import torch
-try:
-    import mpi4py
-    # This will automatically register MPI for initialization.
-    import mpi_rdv
-    from mpi4py import MPI
-    mpi = True
-except (ImportError, ModuleNotFoundError):
-    mpi = None
-
 import torch.distributed as dist
 import runpy
 import atexit
 import sys
 import os
 
-# Restore affinity after importing torch
-Process().cpu_affinity(affinity)
-import sys
-
 from hpc_launcher.schedulers import get_schedulers
 
 
@@ -48,32 +32,37 @@ def main():
     (world_size, rank, local_world_size, local_rank) = scheduler.get_parallel_configuration()
 
     rdv_protocol = os.getenv('TORCHRUN_HPC_RDV_PROTOCOL')
-    if not mpi and rdv_protocol == 'mpi://':
-        raise Exception(f'MPI rendezvous protocol selected without installing mpi_rndv library.')
-
-    if dist.is_initialized():
-        raise Exception('PyTorch Distributed is already initialized')
-
-    print(f'Initializing distributed PyTorch using protocol: {rdv_protocol}')
-    # TODO(later): Fix how we handle CUDA visible devices and MPI bind
-    dist.init_process_group("nccl", init_method=rdv_protocol,
-                            world_size=world_size, rank=rank)
-
     if rdv_protocol == 'mpi://':
-        print('MPI Version: {}'.format(MPI.Get_version()))
-        print('MPI Implementation: {}'.format(MPI.Get_library_version()))
-
+        try:
+            import mpi4py
+            # This will automatically register MPI for initialization.
+            import mpi_rdv
+            from mpi4py import MPI
+            mpi = True
+        except (ImportError, ModuleNotFoundError):
+            mpi = None
+            raise Exception(f'MPI rendezvous protocol selected without installing mpi_rndv library.')
+
+    torch_dist_initialized = dist.is_initialized()
+    if not torch_dist_initialized:
+        # raise Exception('PyTorch Distributed is already initialized')
+
+        print(f'Initializing distributed PyTorch using protocol: {rdv_protocol}')
+        # TODO(later): Fix how we handle CUDA visible devices and MPI bind
+        dist.init_process_group("nccl", init_method=rdv_protocol,
+                                world_size=world_size, rank=rank)
+
+        if rdv_protocol == 'mpi://':
+            print('MPI Version: {}'.format(MPI.Get_version()))
+            print('MPI Implementation: {}'.format(MPI.Get_library_version()))
 
-    fraction_max_gpu_mem = float(os.getenv('TORCHRUN_HPC_MAX_GPU_MEM'))
-    if fraction_max_gpu_mem != 1.0:
-        print(f'Setting the max GPU memory fraction to {fraction_max_gpu_mem}')
-        torch.cuda.set_per_process_memory_fraction(fraction_max_gpu_mem)
 
     # Run underlying script
     runpy.run_path(args[0], run_name="__main__")
 
-    # Deal with destroying the process group here
-    dist.destroy_process_group()
+    if not torch_dist_initialized:
+        # Deal with destroying the process group here
+        dist.destroy_process_group()
 
 
 if __name__ == "__main__":
diff --git a/hpc_launcher/schedulers/scheduler.py b/hpc_launcher/schedulers/scheduler.py
@@ -62,6 +62,8 @@ class Scheduler:
     launcher_flags: Optional[list[str]] = None
     # Hijack preload commands into a scheduler
     ld_preloads: Optional[list[str]] = None
+    # Capture the original command so that it can be added to the launch script
+    command_line: Optional[list[str]] = None
 
     def select_interactive_or_batch(self,
                                     tmp: list[str],
@@ -298,11 +300,12 @@ def launch(self,
         logger.info(f'Script filename: {filename}')
         with open(filename, 'w') as fp:
             fp.write(self.launcher_script(system, command, args, blocking))
-            fp.write('\nif [[ ${RANK} -eq 0 ]]; then')
+            fp.write('\nif [ "${RANK}" = "0" ]; then')
             fp.write('\n    echo ${HPC_LAUNCHER_HOSTLIST} > '
                      + os.path.join(os.path.dirname(filename), f'hpc_launcher_hostlist.txt\n'))
             fp.write('fi\n')
             fp.write(f'\n# Launch command: ' + ' '.join(full_cmdline) + '\n')
+            fp.write(f'# User command invoked: ' + ' '.join(self.command_line) + '\n')
         os.chmod(filename, 0o700)
 
         if setup_only:
diff --git a/hpc_launcher/torch/__init__.py b/hpc_launcher/torch/__init__.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+# Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+# the CONTRIBUTORS file. See the top-level LICENSE file for details.
+#
+# LLNL-CODE-697807.
+# All rights reserved.
+#
+# This file is part of LBANN: Livermore Big Artificial Neural Network
+# Toolkit. For details, see http://software.llnl.gov/LBANN or
+# https://github.com/LBANN and https://github.com/LLNL/LBANN.
+#
+# SPDX-License-Identifier: (Apache-2.0)
+from psutil import Process
+
+# Save affinity before importing torch
+affinity = Process().cpu_affinity()
+
+import torch
+
+# Restore affinity after importing torch
+Process().cpu_affinity(affinity)
+import os
+
+if torch.cuda.is_available():
+    fraction_max_gpu_mem = float(os.getenv('TORCHRUN_HPC_MAX_GPU_MEM') or 1.0)  # unset/empty -> 1.0 (no cap) instead of float(None) TypeError
+    if fraction_max_gpu_mem != 1.0:
+        print(f'Setting the max GPU memory fraction to {fraction_max_gpu_mem}')
+        torch.cuda.set_per_process_memory_fraction(fraction_max_gpu_mem)