Bugfix ci (#28)

bvanessen · web-flow · commit 90ba0e367841 · 2025-03-19T17:17:58.000-07:00
* Fix the configuration launch scripts.  Add a check for when the number of ranks per node multiplied by the number of GPUs per rank exceeds the total number of GPUs per node.

* Updating tests to new API.

* Fixed a bug in the scheduler classes where the host file was being
created after the job had run.  This prevented the CI tests from
self-checking.  Additionally, added checks to the CI tests so that they
are skipped when MPI is not detected.
diff --git a/hpc_launcher/schedulers/flux.py b/hpc_launcher/schedulers/flux.py
@@ -158,6 +158,7 @@ def launcher_script(
         args: Optional[list[str]] = None,
         blocking: bool = True,
         save_hostlist: bool = False,
+        launch_dir: str = "",
     ) -> str:
 
         script = ""
@@ -169,6 +170,9 @@ def launcher_script(
         script += "\n"
         if save_hostlist:
             script += "export HPC_LAUNCHER_HOSTLIST=$(flux hostlist local)\n"
+            script += '\nif [ "${RANK}" = "0" ]; then'
+            script += "\n    echo ${HPC_LAUNCHER_HOSTLIST} > " + os.path.join(launch_dir, f"hpc_launcher_hostlist.txt\n")
+            script += "fi\n"
 
         if not blocking:
             script += "flux run "
diff --git a/hpc_launcher/schedulers/local.py b/hpc_launcher/schedulers/local.py
@@ -42,6 +42,7 @@ def launcher_script(
         args: Optional[list[str]] = None,
         blocking: bool = True,
         save_hostlist: bool = False,
+        launch_dir: str = "",
     ) -> str:
         envvars = [f"export {k}={v}" for k, v in system.environment_variables()]
         envvars += [
@@ -53,6 +54,9 @@ def launcher_script(
         if save_hostlist:
             envvars += [
                 "export HPC_LAUNCHER_HOSTLIST=$(hostname)",
+                '\nif [ "${RANK}" = "0" ]; then',
+                "\n    echo ${HPC_LAUNCHER_HOSTLIST} > " + os.path.join(launch_dir, f"hpc_launcher_hostlist.txt\n"),
+                "fi\n",
             ]
         header = "\n".join(envvars)
 
diff --git a/hpc_launcher/schedulers/lsf.py b/hpc_launcher/schedulers/lsf.py
@@ -137,6 +137,7 @@ def launcher_script(
         args: Optional[list[str]] = None,
         blocking: bool = True,
         save_hostlist: bool = False,
+        launch_dir: str = "",
     ) -> str:
 
         script = ""
@@ -148,6 +149,9 @@ def launcher_script(
         script += "\n"
         if save_hostlist:
             script += "export HPC_LAUNCHER_HOSTLIST=$(echo $LSB_HOSTS | tr ' ' '\\n' | sort -u)\n\n"
+            script += '\nif [ "${RANK}" = "0" ]; then'
+            script += "\n    echo ${HPC_LAUNCHER_HOSTLIST} > " + os.path.join(launch_dir, f"hpc_launcher_hostlist.txt\n")
+            script += "fi\n"
 
         if not blocking or (blocking and not os.getenv("LSB_HOSTS")):
             script += "jsrun "
diff --git a/hpc_launcher/schedulers/scheduler.py b/hpc_launcher/schedulers/scheduler.py
@@ -120,6 +120,7 @@ def launcher_script(
         args: Optional[list[str]] = None,
         blocking: bool = True,
         save_hostlist: bool = False,
+        launch_dir: str = "",
     ) -> str:
         """
         Returns the full launcher script, which can be saved as a batch
@@ -335,17 +336,8 @@ def launch(
         logger.info(f"Script filename: {filename}")
         with open(filename, "w") as fp:
             fp.write(
-                self.launcher_script(system, command, args, blocking, save_hostlist)
+                self.launcher_script(system, command, args, blocking, save_hostlist, os.path.dirname(filename))
             )
-            if save_hostlist:
-                fp.write('\nif [ "${RANK}" = "0" ]; then')
-                fp.write(
-                    "\n    echo ${HPC_LAUNCHER_HOSTLIST} > "
-                    + os.path.join(
-                        os.path.dirname(filename), f"hpc_launcher_hostlist.txt\n"
-                    )
-                )
-                fp.write("fi\n")
 
             fp.write(f"\n# Launch command: " + " ".join(full_cmdline) + "\n")
             if self.command_line:
diff --git a/hpc_launcher/schedulers/slurm.py b/hpc_launcher/schedulers/slurm.py
@@ -167,6 +167,7 @@ def launcher_script(
         args: Optional[list[str]] = None,
         blocking: bool = True,
         save_hostlist: bool = False,
+        launch_dir: str = "",
     ) -> str:
 
         script = ""
@@ -180,6 +181,9 @@ def launcher_script(
         script += "\n"
         if save_hostlist:
             script += "export HPC_LAUNCHER_HOSTLIST=${SLURM_JOB_NODELIST}\n"
+            script += '\nif [ "${RANK}" = "0" ]; then'
+            script += "\n    echo ${HPC_LAUNCHER_HOSTLIST} > " + os.path.join(launch_dir, f"hpc_launcher_hostlist.txt\n")
+            script += "fi\n"
 
         if not blocking:
             script += "srun -u "
diff --git a/hpc_launcher/systems/configure.py b/hpc_launcher/systems/configure.py
@@ -27,10 +27,10 @@ def configure_launch(
     queue: str,
     nodes: int,
     procs_per_node: int,
-    gpus_per_proc: int,
-    gpus_at_least: int,
-    gpumem_at_least: int,
-    cli_system_params: Optional[tuple[int, int, str, float, int, str, Optional[float]]],
+    gpus_per_proc: Optional[int],
+    gpus_at_least: int = 0,
+    gpumem_at_least: int = 0,
+    cli_system_params: Optional[tuple[int, int, str, float, int, str, Optional[float]]] = None,
 ) -> tuple[System, int, int, int]:
     """
     See if the system can be autodetected and then process some special
@@ -40,10 +40,14 @@ def configure_launch(
     :param nodes: The number of nodes to use (or 0 if not specified)
     :param procs_per_node: The number of processes per node given by the user
                            (or 0 if not specified)
+    :param gpus_per_proc: The number of GPUs per process given by the user
+                           (or None if not specified)
     :param gpus_at_least: The minimum number of GPUs to use (or 0 if not
                           specified)
     :param gpumem_at_least: The minimum amount of GPU memory (in gigabytes) to
                             use (or 0 if not specified)
+    :param cli_system_params: CLI-provided description of the system configuration
+                            (or None if not specified)
     :return: A tuple of (autodetected System, number of nodes, number of
              processes per node)
     """
@@ -66,19 +70,25 @@ def configure_launch(
 
     if not gpus_per_proc:
         gpus_per_proc = 0
+
+    # If not provided, attempt to figure out the basics of procs_per_node and gpus_per_proc
     if system_params is not None:
+        if not procs_per_node:
+            procs_per_node = system_params.procs_per_node()
         if gpus_per_proc == 0 and system_params.gpus_per_node > 0:
             # If gpus_per_proc wasn't set and there are gpus on the node set it to a default of 1
             gpus_per_proc = 1
-        if gpus_per_proc > system_params.gpus_per_node:
+        if procs_per_node * gpus_per_proc > system_params.gpus_per_node:
             logger.info(
                 f"Requested number of GPUs per process {gpus_per_proc} exceeds the number of GPUs per node {system_params.gpus_per_node}"
             )
-            gpus_per_proc = system_params.gpus_per_node
+            # If no, or an invalid, configuration is given, set the gpus_per_proc
+            if gpus_per_proc == 0 or gpus_per_proc > system_params.gpus_per_node:
+                gpus_per_proc = max(system_params.gpus_per_node // procs_per_node, 1)
 
-        if procs_per_node * gpus_per_proc > system_params.gpus_per_node:
+        if procs_per_node and procs_per_node * gpus_per_proc > system_params.gpus_per_node:
             logger.info(
-                f"The combination of {procs_per_node} processes per node and {gpus_per_proc} GPUs per process exceeds the number of GPUs per node {system_params.gpus_per_node}"
+                f"The combination of {procs_per_node} processes per node and {gpus_per_proc} GPUs per process exceeds the number of GPUs per node {system_params.gpus_per_node} - Job will not launch, please fix requested parameters"
             )
 
     # If the user requested a specific number of processes per node, honor that
@@ -88,8 +98,6 @@ def configure_launch(
     # Otherwise, if there is a valid set of system parameters, try to fill in
     # the blanks provided by the user
     if system_params is not None:
-        if not procs_per_node:
-            procs_per_node = system_params.procs_per_node()
         if gpus_at_least > 0:
             nodes = ceildiv(gpus_at_least, procs_per_node)
         elif gpumem_at_least > 0:
diff --git a/tests/launch_config_test.py b/tests/launch_config_test.py
@@ -71,28 +71,51 @@ def test_launch_config(*args):
     Tests various launch configurations for GPU count and memory size.
     """
     # User-specified procs_per_node
-    system, nodes, procs_per_node = configure_launch(None, 2, 4, 0, 0)
+    system, nodes, procs_per_node, gpus_per_proc = configure_launch(None, 2, 4, 1, 0, 0, None)
     assert isinstance(system, MockSystem)
     assert nodes == 2
     assert procs_per_node == 4
+    assert gpus_per_proc == 1
 
     # GPU count constraint test
-    system, nodes, procs_per_node = configure_launch(None, 0, 0, 6, 0)
+    system, nodes, procs_per_node, gpus_per_proc = configure_launch(None, 0, 0, 1, 6, 0, None)
     assert isinstance(system, MockSystem)
     assert nodes == 2
     assert procs_per_node == 3
+    assert gpus_per_proc == 1
 
     # Memory constraint test
-    system, nodes, procs_per_node = configure_launch(None, 0, 0, 0, 22)
+    system, nodes, procs_per_node, gpus_per_proc = configure_launch(None, 0, 0, 1, 0, 22, None)
     assert isinstance(system, MockSystem)
     assert nodes == 1
     assert procs_per_node == 2
+    assert gpus_per_proc == 1
 
     # Just above the memory limit of a single node, this triggers a switch to all gpus per node
-    system, nodes, procs_per_node = configure_launch(None, 0, 0, 0, 34)
+    system, nodes, procs_per_node, gpus_per_proc = configure_launch(None, 0, 0, 1, 0, 34, None)
     assert isinstance(system, MockSystem)
     assert nodes == 2
     assert procs_per_node == 3
+    assert gpus_per_proc == 1
+
+    # Ask for too many GPUs per proc, this should snap down to the 3 GPUs available
+    system, nodes, procs_per_node, gpus_per_proc = configure_launch(None, 2, 1, 4, 0, 0, None)
+    assert isinstance(system, MockSystem)
+    assert nodes == 2
+    assert procs_per_node == 1
+    assert gpus_per_proc == 3
+
+    system, nodes, procs_per_node, gpus_per_proc = configure_launch(None, 2, 2, 2, 0, 0, None)
+    assert isinstance(system, MockSystem)
+    assert nodes == 2
+    assert procs_per_node == 2
+    assert gpus_per_proc == 2
+
+    system, nodes, procs_per_node, gpus_per_proc = configure_launch(None, 2, 2, None, 0, 0, None)
+    assert isinstance(system, MockSystem)
+    assert nodes == 2
+    assert procs_per_node == 2
+    assert gpus_per_proc == 1
 
 
 @patch(
@@ -103,16 +126,18 @@ def test_nondefault_queue(*args):
     """
     Tests the configuration of a non-default queue.
     """
-    system, nodes, procs_per_node = configure_launch("nondefault", 1, None, 0, 0)
+    system, nodes, procs_per_node, gpus_per_proc = configure_launch("nondefault", 1, 2, 1, 0, 0, None)
     assert isinstance(system, MockSystem)
     assert nodes == 1
     assert procs_per_node == 2
+    assert gpus_per_proc == 1
 
     # Memory constraint test
-    system, nodes, procs_per_node = configure_launch("nondefault", 0, 0, 0, 22)
+    system, nodes, procs_per_node, gpus_per_proc = configure_launch("nondefault", 0, 0, 1, 0, 22, None)
     assert isinstance(system, MockSystem)
     assert nodes == 3
     assert procs_per_node == 2
+    assert gpus_per_proc == 1
 
 
 @patch(
@@ -125,16 +150,18 @@ def test_preferred_procs_per_node(*args):
     """
 
     # User specifies only number of nodes (GPU system)
-    system, nodes, procs_per_node = configure_launch(None, 3, 0, 0, 0)
+    system, nodes, procs_per_node, gpus_per_proc = configure_launch(None, 3, 0, 1, 0, 0, None)
     assert isinstance(system, MockSystem)
     assert nodes == 3
     assert procs_per_node == 3
+    assert gpus_per_proc == 1
 
     # User specifies only number of nodes (CPU system)
-    system, nodes, procs_per_node = configure_launch("cpuonly", 3, 0, 0, 0)
+    system, nodes, procs_per_node, gpus_per_proc = configure_launch("cpuonly", 3, 0, 0, 0, 0, None)
     assert isinstance(system, MockSystem)
     assert nodes == 3
     assert procs_per_node == 4
+    assert gpus_per_proc == 0
 
 
 @patch(
diff --git a/tests/output_capture_test.py b/tests/output_capture_test.py
@@ -27,8 +27,8 @@
 @pytest.mark.parametrize("no_launch_dir", [False, True])
 def test_output_capture_local(no_launch_dir: bool):
     # Configure scheduler
-    system, nodes, procs_per_node = configure.configure_launch(None, 1, 1, None, None)
-    scheduler = LocalScheduler(nodes, procs_per_node)
+    system, nodes, procs_per_node, gpus_per_proc = configure.configure_launch(None, 1, 1, 1, None, None)
+    scheduler = LocalScheduler(nodes, procs_per_node, gpus_per_proc)
 
     command = sys.executable
     script = "output_capture.py"
@@ -77,10 +77,10 @@ def test_output_capture_scheduler(scheduler_class, processes):
         pytest.skip("LSF not available")
 
     # Configure scheduler
-    system, nodes, procs_per_node = configure.configure_launch(
-        None, 1, processes, None, None
+    system, nodes, procs_per_node, gpus_per_proc = configure.configure_launch(
+        None, 1, processes, 1, None, None
     )
-    scheduler = scheduler_class(nodes, procs_per_node)
+    scheduler = scheduler_class(nodes, procs_per_node, gpus_per_proc)
 
     command = sys.executable
     _, launch_dir = scheduler.create_launch_folder_name(command, "launch")
diff --git a/tests/test_torchrun_hpc.py b/tests/test_torchrun_hpc.py
@@ -144,6 +144,11 @@ def test_launcher_multinode(num_nodes, procs_per_node, rdv, scheduler_type):
     except (ImportError, ModuleNotFoundError):
         pytest.skip("torch not found")
 
+    try:
+        import mpi4py
+    except (ImportError, ModuleNotFoundError):
+        pytest.skip("mpi not found")
+
     # Get full path to torch_dist_driver.py
     driver_file = os.path.join(os.path.dirname(__file__), "torch_dist_driver.py")
 
@@ -183,3 +188,11 @@ def test_launcher_multinode(num_nodes, procs_per_node, rdv, scheduler_type):
 
     if exp_dir:
         shutil.rmtree(exp_dir, ignore_errors=True)
+
+if __name__ == "__main__":
+    test_launcher_multinode(2, 1, "tcp", "slurm")
+    test_launcher_multinode(2, 1, "tcp", "flux")
+    test_launcher_multinode(2, 1, "tcp", "lsf")
+    test_launcher_multinode(2, 1, "mpi", "slurm")
+    test_launcher_multinode(2, 1, "mpi", "flux")
+    test_launcher_multinode(2, 1, "mpi", "lsf")