LBANN
diff --git a/‎hpc_launcher/cli/common_args.py‎
Lines changed: 17 additions & 4 deletions b/‎hpc_launcher/cli/common_args.py‎
Lines changed: 17 additions & 4 deletions
diff --git a/‎hpc_launcher/cli/launch.py‎
Lines changed: 4 additions & 3 deletions b/‎hpc_launcher/cli/launch.py‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎hpc_launcher/cli/torchrun_hpc.py‎
Lines changed: 10 additions & 8 deletions b/‎hpc_launcher/cli/torchrun_hpc.py‎
Lines changed: 10 additions & 8 deletions
diff --git a/‎hpc_launcher/schedulers/flux.py‎
Lines changed: 24 additions & 4 deletions b/‎hpc_launcher/schedulers/flux.py‎
Lines changed: 24 additions & 4 deletions
diff --git a/‎hpc_launcher/schedulers/local.py‎
Lines changed: 6 additions & 2 deletions b/‎hpc_launcher/schedulers/local.py‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎hpc_launcher/schedulers/lsf.py‎
Lines changed: 11 additions & 2 deletions b/‎hpc_launcher/schedulers/lsf.py‎
Lines changed: 11 additions & 2 deletions
diff --git a/‎hpc_launcher/schedulers/scheduler.py‎
Lines changed: 39 additions & 15 deletions b/‎hpc_launcher/schedulers/scheduler.py‎
Lines changed: 39 additions & 15 deletions
@@ -44,7 +44,7 @@ def setup_arguments(parser: argparse.ArgumentParser):
                         '-v',
                         action='store_true',
                         default=False,
-                        help='Run in verbose mode')
+                        help='Run in verbose mode.  Also save the hostlist as if --save-hostlist is set')
 
     # Job size arguments
     group = parser.add_argument_group(
@@ -134,12 +134,20 @@ def setup_arguments(parser: argparse.ArgumentParser):
                                       'Batch scheduler script parameters')
 
     group.add_argument(
-        '--run-from-dir',
+        '--run-from-launch-dir',
         action='store_true',
         default=False,
         help='If set, the launcher will run the command from the timestamped '
         'launch directory')
 
+    group.add_argument(
+        '--no-launch-dir',
+        action='store_true',
+        default=False,
+        help='If set, the launcher will not create a timestamped launch directory. '
+        'Instead, it will create the launch file and logs in the current working '
+        'directory')
+
     group.add_argument(
         '-o',
         '--output-script',
@@ -170,6 +178,11 @@ def setup_arguments(parser: argparse.ArgumentParser):
         help='Add a reservation arguement to scheduler.  '
         'Typically used for Dedecated Application Time runs (DATs)')
 
+    group.add_argument(
+        '--save-hostlist',
+        action='store_true',
+        default=False,
+        help='Write the hostlist to a file: hpc_launcher_hostlist.txt.')
 
 def validate_arguments(args: argparse.Namespace):
     """
@@ -211,9 +224,9 @@ def validate_arguments(args: argparse.Namespace):
     if args.local and args.scheduler:
         raise ValueError('The --local and --scheduler flags are mutually '
                          'exclusive')
-    if args.work_dir and args.run_from_dir:
+    if args.work_dir and args.run_from_launch_dir:
         raise ValueError(
-            'The --work-dir and --run-from-dir flags are mutually '
+            'The --work-dir and --run-from-launch-dir flags are mutually '
             'exclusive')
 
 # See if the system can be autodetected and then process some special arguments
 
@@ -43,17 +43,18 @@ def main():
     # Pick batch scheduler
     scheduler = launch_helpers.select_scheduler(args, logger, system)
 
-    _, folder_name = scheduler.create_launch_folder_name(args.command, 'launch')
+    _, folder_name = scheduler.create_launch_folder_name(args.command, 'launch', args.no_launch_dir)
 
     script_file = scheduler.create_launch_folder(folder_name,
                                                  not args.bg,
                                                  args.output_script,
-                                                 args.run_from_dir)
+                                                 args.run_from_launch_dir)
 
     jobid = scheduler.launch(system, folder_name, script_file,
                              args.command, args.args, not args.bg,
                              args.setup_only,
-                             args.color_stderr, args.run_from_dir)
+                             args.color_stderr, args.run_from_launch_dir,
+                             (args.save_hostlist or args.verbose))
 
     if jobid:
         logger.info(f'Job ID: {jobid}')
 
@@ -98,23 +98,24 @@ def main():
         )
         exit(1)
 
-    command_as_folder_name, folder_name = scheduler.create_launch_folder_name(args.command,
-                                                                                'torchrun_hpc',)
+    _, folder_name = scheduler.create_launch_folder_name(args.command,
+                                                         'torchrun_hpc',
+                                                         args.no_launch_dir)
 
     script_file = scheduler.create_launch_folder(folder_name,
                                                  not args.bg,
                                                  args.output_script,
-                                                 args.run_from_dir)
+                                                 args.run_from_launch_dir)
 
-    stub_file = 'torchrun_hpc_' + command_as_folder_name
+    trampoline_file = 'torchrun_hpc_trampoline.py'
 
     if os.path.exists(folder_name):
-        copied_stub_file = folder_name + '/' +  stub_file
+        copied_trampoline_file = folder_name + '/' +  trampoline_file
         package_path = os.path.dirname(os.path.abspath(__file__))
-        shutil.copy(os.path.join(package_path, 'torchrun_hpc_stub.py'), copied_stub_file)
+        shutil.copy(os.path.join(package_path, '..', 'torch', trampoline_file), copied_trampoline_file)
 
     command = sys.executable
-    launch_args = ['-u', f'{os.path.abspath(folder_name)}/{stub_file}', os.path.abspath(args.command)]
+    launch_args = ['-u', f'{os.path.abspath(folder_name)}/{trampoline_file}', os.path.abspath(args.command)]
     launch_args += args.args
 
     logger.info(f'Running job in directory: {folder_name}')
@@ -123,7 +124,8 @@ def main():
                              command, launch_args, not args.bg,
                              # args.output_script,
                              args.setup_only,
-                             args.color_stderr, args.run_from_dir)
+                             args.color_stderr, args.run_from_launch_dir,
+                             (args.save_hostlist or args.verbose))
 
     if jobid:
         logger.info(f'Job ID: {jobid}')
 
@@ -15,6 +15,8 @@
 from typing import TYPE_CHECKING, Optional
 from io import StringIO
 import os
+import subprocess
+import re
 
 if TYPE_CHECKING:
     # If type-checking, import the other class
@@ -97,8 +99,13 @@ def build_command_string_and_batch_script(self,
             self.select_interactive_or_batch(tmp, header, cmd_args, blocking)
 
         if self.queue:
-            tmp = [f'--queue={self.queue}']
-            self.select_interactive_or_batch(tmp, header, cmd_args, blocking)
+            if os.getenv('FLUX_URI'):
+                logger.warning(
+                    f'WARNING: Dropping unsupported option requested when running inside of an allocation: --queue={self.queue}'
+                )
+            else:
+                tmp = [f'--queue={self.queue}']
+                self.select_interactive_or_batch(tmp, header, cmd_args, blocking)
 
         if self.account:
             tmp = [f'--account={self.account}']
@@ -141,7 +148,8 @@ def launcher_script(self,
                         system: 'System',
                         command: str,
                         args: Optional[list[str]] = None,
-                        blocking: bool = True) -> str:
+                        blocking: bool = True,
+                        save_hostlist: bool = False) -> str:
 
         script = ''
         # Launcher script only use the header_lines to construct the shell script to be launched
@@ -150,7 +158,8 @@ def launcher_script(self,
              system, blocking)
         script += header_lines
         script += '\n'
-        script += 'export HPC_LAUNCHER_HOSTLIST=$(flux hostlist local)\n'
+        if save_hostlist:
+            script += 'export HPC_LAUNCHER_HOSTLIST=$(flux hostlist local)\n'
 
         if not blocking:
             script += 'flux run '
@@ -170,6 +179,17 @@ def get_job_id(self, output: str) -> Optional[str]:
         # The job ID is the only printout when calling flux batch
         return output.strip()
 
+    @classmethod
+    def num_nodes_in_allocation(cls) -> Optional[int]:
+        """
+        When FLUX_URI is set, ask ``flux resource info`` how many nodes are
+        available and parse the node count from its output.
+
+        :return: Number of nodes reported by flux, or None if FLUX_URI is not
+                 set or the output does not contain the expected summary line.
+        """
+        if not os.getenv('FLUX_URI'):
+            return None
+
+        cmd = ['flux', 'resource', 'info']
+        proc = subprocess.run(cmd, universal_newlines=True, capture_output=True)
+        # Require at least one digit per field so int() never receives an empty
+        # string, and anchor per line so additional output lines do not hide
+        # the summary line.
+        m = re.search(r'^(\d+) Nodes, (\d+) Cores, (\d+) GPUs$', proc.stdout, re.MULTILINE)
+        if m:
+            return int(m.group(1))
+
+        return None
+
     @classmethod
     def get_parallel_configuration(cls) -> tuple[int, int, int, int]:
         env_vars = [
 
@@ -41,7 +41,8 @@ def launcher_script(self,
                         system: 'System',
                         command: str,
                         args: Optional[list[str]] = None,
-                        blocking: bool = True) -> str:
+                        blocking: bool = True,
+                        save_hostlist: bool = False) -> str:
         envvars = [
             f'export {k}={v}' for k, v in system.environment_variables()
         ]
@@ -51,8 +52,11 @@ def launcher_script(self,
         ]
         envvars += [
             'export RANK=0',
-            'export HPC_LAUNCHER_HOSTLIST=$(hostname)',
         ]
+        if save_hostlist:
+            envvars += [
+                'export HPC_LAUNCHER_HOSTLIST=$(hostname)',
+            ]
         header = '\n'.join(envvars)
 
         if self.work_dir:
 
@@ -127,14 +127,16 @@ def launcher_script(self,
                         system: 'System',
                         command: str,
                         args: Optional[list[str]] = None,
-                        blocking: bool = True) -> str:
+                        blocking: bool = True,
+                        save_hostlist: bool = False) -> str:
 
         script = ''
         # Launcher script only use the header_lines to construct the shell script to be launched
         (header_lines, cmd_string, parallel_run_args) = self.build_command_string_and_batch_script(system, blocking)
         script += header_lines
         script += '\n'
-        script += "export HPC_LAUNCHER_HOSTLIST=$(echo $LSB_HOSTS | tr ' ' '\\n' | sort -u)\n\n"
+        if save_hostlist:
+            script += "export HPC_LAUNCHER_HOSTLIST=$(echo $LSB_HOSTS | tr ' ' '\\n' | sort -u)\n\n"
 
         if not blocking or (blocking and not os.getenv('LSB_HOSTS')):
             script += 'jsrun '
@@ -153,6 +155,13 @@ def launcher_script(self,
     def get_job_id(self, output: str) -> Optional[str]:
         raise NotImplementedError
 
+    @classmethod
+    def num_nodes_in_allocation(cls) -> Optional[int]:
+        """
+        Report the node count given by the LLNL_NUM_COMPUTE_NODES environment
+        variable.
+
+        :return: The variable's value converted to an integer, or None if the
+                 variable is unset or empty.
+        """
+        num_nodes = os.getenv('LLNL_NUM_COMPUTE_NODES')
+        return int(num_nodes) if num_nodes else None
+
     @classmethod
     def get_parallel_configuration(cls) -> tuple[int, int, int, int]:
         env_vars = ['OMPI_COMM_WORLD_SIZE', 'OMPI_COMM_WORLD_RANK', 'OMPI_COMM_WORLD_LOCAL_RANK', 'OMPI_COMM_WORLD_LOCAL_SIZE']
 
@@ -113,13 +113,19 @@ def launch_command(self,
     def launcher_script(self,
                         system: 'System',
                         command: str,
-                        args: Optional[list[str]] = None) -> str:
+                        args: Optional[list[str]] = None,
+                        blocking: bool = True,
+                        save_hostlist: bool = False) -> str:
         """
         Returns the full launcher script, which can be saved as a batch
         script, for the given system and launcher configuration.
         This script usually performs node/resource allocation and manages I/O.
 
         :param system: The system to use.
+        :param command: The command to launch
+        :param args: Optional list of arguments for the command to launch
+        :param blocking: Launch the command interactively if true, else in a batch job
+        :param save_hostlist: Add local scripting to capture the list of hosts the command is launched on
         :return: A shell script as a string.
         """
         raise NotImplementedError
@@ -149,6 +155,15 @@ def get_job_id(self, output: str) -> Optional[str]:
         """
         return None
 
+    @classmethod
+    def num_nodes_in_allocation(cls) -> Optional[int]:
+        """
+        When running under an allocation, check how many nodes are available.
+
+        :return: Number of nodes in an allocation, or None if no allocation
+                 is detected.
+        :raises NotImplementedError: Always, in this base implementation;
+                 scheduler-specific subclasses are expected to override it.
+        """
+        raise NotImplementedError
+
     @classmethod
     def get_parallel_configuration(cls) -> tuple[int, int, int, int]:
         """
@@ -193,7 +208,8 @@ def setup_rendezvous_protocol(self, protocol: str) -> list[str]:
 
     def create_launch_folder_name(self,
                                   command: str,
-                                  folder_prefix: str = 'launch'
+                                  folder_prefix: str = 'launch',
+                                  no_launch_dir: bool = False,
                              ) -> (str, str):
         """
         Create a folder name for the launcher based on the command.
@@ -206,26 +222,29 @@ def create_launch_folder_name(self,
         command_as_folder_name = os.path.basename(command).replace(' ', '_').replace(';','-')
         # Create a folder for the output and error logs
         # Timestamp is of the format YYYY-MM-DD_HHhMMmSSs
-        folder_name = f'{folder_prefix}-{self.job_name or command_as_folder_name}_{time.strftime("%Y-%m-%d_%Hh%Mm%Ss")}'
+        if no_launch_dir:
+            folder_name = os.getcwd()
+        else:
+            folder_name = f'{folder_prefix}-{self.job_name or command_as_folder_name}_{time.strftime("%Y-%m-%d_%Hh%Mm%Ss")}'
         return (command_as_folder_name, folder_name)
 
     def create_launch_folder(self,
                              folder_name: str,
                              blocking: bool = True,
                              script_file: Optional[str] = None,
-                             run_from_dir: bool = False,
+                             run_from_launch_dir: bool = False,
                              ) -> (str, str):
         """
         Create a folder and associated launch script if approrpiate.
 
         :param folder_name: The name of the folder for containing all of the launch artifacts.
         :param blocking: If True, the job should run from the launch folder.
         :param script_file: If given, saves the output script to this file.
-        :param run_from_dir: If True, runs the command from the launch folder.
+        :param run_from_launch_dir: If True, runs the command from the launch folder.
         :return: The filename for the launch script as a string.
         """
 
-        should_make_folder = blocking or run_from_dir
+        should_make_folder = blocking or run_from_launch_dir
 
         # Create a temporary file or a script file, if given
         if script_file is not None:
@@ -265,7 +284,8 @@ def launch(self,
                blocking: bool = True,
                setup_only: bool = False,
                color_stderr: bool = False,
-               run_from_dir: bool = False) -> str:
+               run_from_launch_dir: bool = False,
+               save_hostlist: bool = False) -> str:
         """
         Launches the given command and arguments uaing this launcher.
 
@@ -278,13 +298,14 @@ def launch(self,
                          and redirects/duplicates outputs to the terminal.
         :param setup_only: If True, only sets up the job and does not launch it.
         :param color_stderr: If True, colors stderr terminal outputs in red.
-        :param run_from_dir: If True, runs the command from the launch directory.
+        :param run_from_launch_dir: If True, runs the command from the launch directory.
+        :param save_hostlist: Add local scripting to capture the list of hosts the command is launched on
         :return: The queued job ID as a string.
         """
 
         # If the command is run from a directory, and the command exists as a
         # file, use its absolute path
-        if run_from_dir:
+        if run_from_launch_dir:
             if os.path.isfile(command):
                 command = os.path.abspath(command)
             # Change the working directory to the launch folder
@@ -299,13 +320,16 @@ def launch(self,
 
         logger.info(f'Script filename: {filename}')
         with open(filename, 'w') as fp:
-            fp.write(self.launcher_script(system, command, args, blocking))
-            fp.write('\nif [ "${RANK}" = "0" ]; then')
-            fp.write('\n    echo ${HPC_LAUNCHER_HOSTLIST} > '
-                     + os.path.join(os.path.dirname(filename), f'hpc_launcher_hostlist.txt\n'))
-            fp.write('fi\n')
+            fp.write(self.launcher_script(system, command, args, blocking, save_hostlist))
+            if save_hostlist:
+                fp.write('\nif [ "${RANK}" = "0" ]; then')
+                fp.write('\n    echo ${HPC_LAUNCHER_HOSTLIST} > '
+                         + os.path.join(os.path.dirname(filename), f'hpc_launcher_hostlist.txt\n'))
+                fp.write('fi\n')
+
             fp.write(f'\n# Launch command: ' + ' '.join(full_cmdline) + '\n')
-            fp.write(f'# User command invoked: ' + ' '.join(self.command_line) + '\n')
+            if self.command_line:
+                fp.write(f'# User command invoked: ' + ' '.join(self.command_line) + '\n')
         os.chmod(filename, 0o700)
 
         if setup_only: