LBANN
diff --git a/‎hpc_launcher/cli/common_args.py‎
Lines changed: 26 additions & 2 deletions b/‎hpc_launcher/cli/common_args.py‎
Lines changed: 26 additions & 2 deletions
diff --git a/‎hpc_launcher/cli/launch.py‎
Lines changed: 2 additions & 1 deletion b/‎hpc_launcher/cli/launch.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎hpc_launcher/cli/torchrun_hpc.py‎
Lines changed: 2 additions & 1 deletion b/‎hpc_launcher/cli/torchrun_hpc.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎hpc_launcher/schedulers/flux.py‎
Lines changed: 36 additions & 127 deletions b/‎hpc_launcher/schedulers/flux.py‎
Lines changed: 36 additions & 127 deletions
@@ -25,7 +25,8 @@
 
 class ParseKVAction(argparse.Action):
     def __call__(self, parser, namespace, values, option_string=None):
-        setattr(namespace, self.dest, dict())
+        if not getattr(namespace, self.dest):
+            setattr(namespace, self.dest, dict())
         for each in values:
             try:
                 key, value = each.split("=")
@@ -125,6 +126,16 @@ def setup_arguments(parser: argparse.ArgumentParser):
         help="Indicate if the job will primarily use a specific communication protocol and set any relevant environment variables: MPI or *CCL (NCCL, RCCL)",
     )
 
+    group.add_argument(
+        "-x",
+        "--xargs",
+        dest="override_args",
+        nargs='+',
+        action=ParseKVAction,
+        help="Specifies scheduler and launch arguments (note it will override any known key): --xargs k1=v1 k2=v2 \n or --xargs k1=v1 --xargs k2=v2 \n Also note that a double dash -- is needed if this is the last argument. \n Arguments with a leading tilde ~ will be removed if found",
+        metavar="KEY1=VALUE1",
+    )
+
     # System
     group = parser.add_argument_group(
         "System",
@@ -136,7 +147,7 @@ def setup_arguments(parser: argparse.ArgumentParser):
         dest="system_params",
         nargs='+',
         action=ParseKVAction,
-        help="Specifies some or all of the parameters of a system as a dictionary (note it will override any known or autodetected parameters): -p cores_per_node=<int> gpus_per_node=<int> gpu_arch=<str> mem_per_gpu=<float> numa_domains=<int> scheduler=<str>",
+        help="Specifies some or all of the parameters of a system as a dictionary (note it will override any known or autodetected parameters): -p cores_per_node=<int> gpus_per_node=<int> gpu_arch=<str> mem_per_gpu=<float> numa_domains=<int> scheduler=<str>\n -p cores_per_node=<int> gpus_per_node=<int> \n Also note that a double dash -- is needed if this is the last argument",
         metavar="KEY1=VALUE1",
     )
 
@@ -188,6 +199,15 @@ def setup_arguments(parser: argparse.ArgumentParser):
 
     group = parser.add_argument_group("Script", "Batch scheduler script parameters")
 
+    # different behavior for interactive vs batch jobs
+    # Add an argument to pick the run directory: tmp, none, self labeled, auto labeled
+
+    group.add_argument(
+        "--launch-dir-name",
+        default=None,
+        help="Use a custom name for the launch directory",
+    )
+
     group.add_argument(
         "--run-from-launch-dir",
         action="store_true",
@@ -293,6 +313,10 @@ def validate_arguments(args: argparse.Namespace):
         raise ValueError(
             "The --work-dir and --run-from-launch-dir flags are mutually " "exclusive"
         )
+    if args.launch_dir_name and args.no_launch_dir:
+        raise ValueError(
+            "The --launch-dir-name and --no-launch-dir flags are mutually " "exclusive"
+        )
 
 
 # See if the system can be autodetected and then process some special arguments
 
@@ -46,7 +46,7 @@ def main():
     scheduler = launch_helpers.select_scheduler(args, logger, system)
 
     _, folder_name = scheduler.create_launch_folder_name(
-        args.command, "launch", args.no_launch_dir
+        args.command, "launch", args.no_launch_dir, args.launch_dir_name
     )
 
     script_file = scheduler.create_launch_folder(
@@ -59,6 +59,7 @@ def main():
         script_file,
         args.command,
         args.args,
+        args.override_args,
         not args.bg,
         args.setup_only,
         args.color_stderr,
 
@@ -115,7 +115,7 @@ def main():
         exit(1)
 
     _, folder_name = scheduler.create_launch_folder_name(
-        args.command, "torchrun_hpc", args.no_launch_dir
+        args.command, "torchrun_hpc", args.no_launch_dir, args.launch_dir_name
     )
 
     script_file = scheduler.create_launch_folder(
@@ -148,6 +148,7 @@ def main():
         script_file,
         command,
         launch_args,
+        args.override_args,
         not args.bg,
         # args.output_script,
         args.setup_only,
 
@@ -33,175 +33,84 @@
 @dataclass
 class FluxScheduler(Scheduler):
 
-    def select_interactive_or_batch(
-        self,
-        tmp: list[str],
-        header: StringIO,
-        cmd_args: list[str],
-        blocking: bool = True,
-    ) -> None:
-        if blocking:
-            cmd_args += tmp
-        else:
-            header.write(f'# FLUX: {" ".join(tmp)}\n')
-        return
-
-    def build_command_string_and_batch_script(
+    def build_scheduler_specific_arguments(
         self, system: "System", blocking: bool = True
-    ) -> (str, list[str]):
-
-        env_vars = system.environment_variables()
-        passthrough_env_vars = system.passthrough_environment_variables()
-        # Enable the system to apply some customization to the scheduler instance
-        system.customize_scheduler(self)
-
-        header = StringIO()
-        header.write("#!/bin/sh\n")
-        cmd_args = []
+    ):
         if self.out_log_file and not blocking:
-            header.write(f"# FLUX: --output={self.out_log_file}\n")
+            self.submit_only_args[f"--output"] = f"{self.out_log_file}"
         if self.err_log_file and not blocking:
-            header.write(f"# FLUX: --error={self.err_log_file}\n")
-
-        # Unbuffered output
-        tmp = "-u"
-        cmd_args += [tmp]
-        if not blocking:
-            header.write(f"# FLUX: {tmp}\n")
+            self.submit_only_args[f"--error"] = f"{self.err_log_file}"
 
         # Number of Nodes
-        tmp = f"-N{self.nodes}"
-        cmd_args += [tmp]
-        if not blocking:
-            header.write(f"# FLUX: {tmp}\n")
+        self.common_launch_args[f"-N{self.nodes}"] = None
 
         # Total number of Tasks / Processes
-        tmp = f"-n{self.nodes * self.procs_per_node}"
-        cmd_args += [tmp]
-        if not blocking:
-            header.write(f"# FLUX: {tmp}\n")
+        self.common_launch_args[f"-n{self.nodes * self.procs_per_node}"] = None
+
+        # Unbuffered output
+        self.common_launch_args["-u"] = None
 
         # Set the Number of GPUs per task
         # There is a difference in option names between tasks and allocations
         if self.gpus_per_proc > 0:
             tmp = f"{self.gpus_per_proc}"
             # command line flag for a task
-            self.run_launch_args["--gpus-per-task"] = tmp
+            self.run_only_args["--gpus-per-task"] = tmp
             # command and shell flags for an allocation
-            self.batch_submit_args["--gpus-per-slot"] = tmp
-            self.batch_script_header["# FLUX: --gpus-per-slot"] = tmp
+            if not blocking:
+                self.submit_only_args["--gpus-per-slot"] = tmp
 
         if self.work_dir:
-            tmp = [f"--setattr=system.cwd={os.path.abspath(self.work_dir)}"]
-            self.select_interactive_or_batch(tmp, header, cmd_args, blocking)
+            self.submit_only_args["--setattr=system.cwd"] = f"{os.path.abspath(self.work_dir)}"
 
-        tmp = ["-onosetpgrp"]
-        self.select_interactive_or_batch(tmp, header, cmd_args, blocking)
+        self.common_launch_args["-onosetpgrp"] = None
 
         if self.ld_preloads:
-            tmp = [f'--env=LD_PRELOAD={",".join(self.ld_preloads)}']
-            self.select_interactive_or_batch(tmp, header, cmd_args, blocking)
+            self.common_launch_args['--env=LD_PRELOAD'] = f'{",".join(self.ld_preloads)}'
 
         if self.time_limit is not None:
-            tmp = [f"--time={self.time_limit}m"]
-            self.select_interactive_or_batch(tmp, header, cmd_args, blocking)
+            self.common_launch_args["--time"] = f"{self.time_limit}m"
 
         if self.job_name:
-            tmp = [f"--job-name={self.job_name}"]
-            self.select_interactive_or_batch(tmp, header, cmd_args, blocking)
+            self.common_launch_args["--job-name"] = f"{self.job_name}"
 
         if self.queue:
             if os.getenv("FLUX_URI"):
                 logger.warning(
                     f"WARNING: Dropping unsupported option requested when running inside of an allocation: --queue={self.queue}"
                 )
             else:
-                tmp = [f"--queue={self.queue}"]
-                self.select_interactive_or_batch(tmp, header, cmd_args, blocking)
+                self.submit_only_args["--queue"] = f"{self.queue}"
 
         if self.account:
-            tmp = [f"--account={self.account}"]
-            self.select_interactive_or_batch(tmp, header, cmd_args, blocking)
+            self.submit_only_args["--account"] = f"{self.account}"
 
         if self.reservation:
             logger.warning(
                 f"WARNING: Unsupported option requested: --reservation={self.reservation}"
             )
 
-        if self.launcher_flags:
-            for flag in self.launcher_flags:
-                # These flag should only be on the launcher commands not the batch commands
-                cmd_args += [flag]
+        return
 
-        if not blocking: # Only add batch script header items on non-blocking calls
-            for k,v in self.batch_script_header.items():
-                header.write(f"{k}={v}\n")
+    def batch_script_prefix(self) -> str:
+        return "# FLUX:"
 
-        for e in env_vars:
-            header.write(parse_env_list(*e))
+    def blocking_launch_command(self) -> list[str]:
+        return ["flux", "run"]
 
+    def nonblocking_launch_command(self) -> list[str]:
+        return ["flux", "batch"]
+
+    def cli_passthrough_env_arg(self, passthrough_env_vars) -> None:
         for k, v in passthrough_env_vars:
-            if not blocking:
-                cmd_args += [f" --env={k}={v}"]
-            else:
-                header += f"export {k}={v}\n"
-
-        return (header.getvalue(), cmd_args)
-
-    def launch_command(self, system: "System", blocking: bool = True) -> list[str]:
-        # Launch command only use the cmd_args to construct the shell script to be launched
-        (header_lines, cmd_args) = self.build_command_string_and_batch_script(
-            system, blocking
-        )
-
-        if not blocking:
-            for k,v in self.batch_submit_args.items():
-                cmd_args += [f"{k}={v}"]
-            return ["flux", "batch"] + cmd_args
-
-        for k,v in self.run_launch_args.items():
-            cmd_args += [f"{k}={v}"]
-        return ["flux", "run"] + cmd_args
-
-    def launcher_script(
-        self,
-        system: "System",
-        command: str,
-        args: Optional[list[str]] = None,
-        blocking: bool = True,
-        save_hostlist: bool = False,
-        launch_dir: str = "",
-    ) -> str:
-
-        script = ""
-        # Launcher script only use the header_lines to construct the shell script to be launched
-        (header_lines, cmd_string) = self.build_command_string_and_batch_script(
-            system, blocking
-        )
-        for k,v in self.run_launch_args.items():
-            cmd_string += [f"{k}={v}"]
-
-        script += header_lines
-        script += "\n"
-        if save_hostlist:
-            script += "export HPC_LAUNCHER_HOSTLIST=$(flux hostlist local)\n"
-            script += 'if [ "${RANK}" = "0" ]; then\n'
-            script += "    echo ${HPC_LAUNCHER_HOSTLIST} > " + os.path.join(launch_dir, f"hpc_launcher_hostlist.txt\n")
-            script += "fi\n\n"
-
-        if not blocking:
-            script += "flux run "
-            script += " ".join(cmd_string)
-            script += " "
-
-        script += f"{command}"
-
-        for arg in args:
-            script += f" {arg}"
-
-        script += "\n"
-
-        return script
+            self.submit_only_args[f"--env={k}"] = f"{v}"
+        return
+
+    def export_hostlist(self) -> str:
+        return "export HPC_LAUNCHER_HOSTLIST=$(flux hostlist local)\n"
+
+    def internal_script_run_command(self) -> str:
+        return "flux run "
 
     def get_job_id(self, output: str) -> Optional[str]:
         # The job ID is the only printout when calling flux batch