Skip to content

Commit d7a1178

Browse files
authored
Update systems and bug fix autodetect (#57)
* Fixed a bug in the GPU autodetect logic where the finally block attached to the try block overwrote the function's return value. Added code to auto-detect the installed ROCm version and use it to constrain the version of amdsmi that gets installed. Added definitions for more LLNL systems. * Added installation instructions.
1 parent 4c14b52 commit d7a1178

File tree

5 files changed

+64
-12
lines changed

5 files changed

+64
-12
lines changed

README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,18 @@ There are two main entry points into HPC-Launcher from the cli:
1616
replacement for `torchrun`, while `launch` is a generic interface for
1717
launching parallel jobs.
1818

19+
## Installation
20+
21+
To install a released version of the package from PyPI, run:
22+
```bash
23+
pip install hpc-launcher
24+
```
25+
26+
Or install directly from GitHub:
27+
```bash
28+
pip install git+https://github.com/LBANN/HPC-launcher.git
29+
```
30+
1931
## Example Usage
2032

2133
Using the launch command to execute a command in parallel

hpc_launcher/systems/autodetect.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,14 @@ def find_AMD_gpus() -> (int, float, str):
3636
import amdsmi as smi
3737
except (ImportError, ModuleNotFoundError, KeyError):
3838
return (0, 0, None)
39+
40+
num_gpus = 0
41+
mem_per_gpu = 0
42+
gpu_arch = None
3943
try:
4044
smi.amdsmi_init()
4145
devices = smi.amdsmi_get_processor_handles()
4246
num_gpus = len(devices)
43-
mem_per_gpu = 0
44-
gpu_arch = None
4547
if len(devices) == 0:
4648
return (0, 0, None)
4749
else:
@@ -60,9 +62,9 @@ def find_AMD_gpus() -> (int, float, str):
6062
finally:
6163
try:
6264
smi.amdsmi_shut_down()
63-
return (0, 0, None)
65+
return (num_gpus, mem_per_gpu, gpu_arch)
6466
except smi.AmdSmiException as e:
65-
return (0, 0, None)
67+
return (num_gpus, mem_per_gpu, gpu_arch)
6668

6769

6870
def find_NVIDIA_gpus() -> (int, float, str):
@@ -76,23 +78,23 @@ def find_NVIDIA_gpus() -> (int, float, str):
7678
try:
7779
pynvml.nvmlInit()
7880

79-
deviceCount = pynvml.nvmlDeviceGetCount()
81+
num_gpus = pynvml.nvmlDeviceGetCount()
8082
# Assume that the GPUs are homogeneous on a system
81-
if deviceCount > 0:
83+
if num_gpus > 0:
8284
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
8385
major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
8486
info = pynvml.nvmlDeviceGetMemoryInfo(handle)
8587
gpu_arch = f"sm_{major}{minor}"
8688
mem_per_gpu = info.total / (1024**3)
87-
return (deviceCount, mem_per_gpu, gpu_arch)
89+
return (num_gpus, mem_per_gpu, gpu_arch)
8890
except:
8991
return (0, 0, None)
9092
finally:
9193
try:
9294
pynvml.nvmlShutdown()
93-
return (0, 0, None)
95+
return (num_gpus, mem_per_gpu, gpu_arch)
9496
except pynvml.NVMLError as e:
95-
return (0, 0, None)
97+
return (num_gpus, mem_per_gpu, gpu_arch)
9698

9799

98100
def find_gpus() -> (str, int, float, str):
@@ -202,10 +204,10 @@ def autodetect_current_system(quiet: bool = False) -> System:
202204
"""
203205

204206
sys = system()
205-
if sys in ("tioga", "tuolumne", "elcap", "rzadams", "tenaya"):
207+
if sys in ("tioga", "tuolumne", "elcap", "rzadams", "rzvernal", "tenaya"):
206208
return ElCapitan(sys)
207209

208-
if sys in ("ipa", "matrix", "vector"):
210+
if sys in ("ipa", "matrix", "rzvector"):
209211
return CTS2(sys)
210212

211213
if sys in ("lassen", "sierra", "rzanzel"):

hpc_launcher/systems/lc/cts2.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,13 @@
3737
"erl": _h100_node,
3838
},
3939
),
40+
"rzvector": (
41+
"pbatch",
42+
{
43+
"pbatch": _h100_node,
44+
"pdebug": _h100_node,
45+
},
46+
),
4047
}
4148

4249

hpc_launcher/systems/lc/el_capitan_family.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,13 @@
5353
"pdebug": _mi300a_node,
5454
},
5555
),
56+
"rzvernal": (
57+
"pdebug",
58+
{
59+
"pdebug": _mi250x_node,
60+
"pllm": _mi250x_node,
61+
},
62+
),
5663
"tenaya": (
5764
"pbatch",
5865
{

setup.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,23 @@
11
import os
22
from setuptools import find_packages, setup
33
import ctypes.util
4+
import re
5+
6+
def get_rocm_version():
7+
"""Detect installed ROCm version."""
8+
# Try reading from ROCm installation
9+
rocm_path = os.environ.get('ROCM_PATH', '/opt/rocm')
10+
version_file = os.path.join(rocm_path, '.info', 'version')
11+
12+
if os.path.exists(version_file):
13+
with open(version_file) as f:
14+
version = f.read().strip()
15+
# Extract major.minor.patch
16+
match = re.match(r'(\d+\.\d+.\d+)', version)
17+
if match:
18+
return match.group(1)
19+
20+
return None
421

522
with open("README.md", "r") as fp:
623
long_description = fp.read()
@@ -11,7 +28,14 @@
1128
extras = []
1229
path = ctypes.util.find_library("amdhip64")
1330
if path:
14-
extras.append("amdsmi")
31+
rocm_version = get_rocm_version()
32+
if rocm_version:
33+
# Constrain ROCm-dependent packages
34+
major, minor, patch = rocm_version.split('.')
35+
extras.append(f"amdsmi=={major}.{minor}.{patch}")
36+
else:
37+
# Fallback or raise error
38+
raise RuntimeError("ROCm installation not found!")
1539

1640
path = ctypes.util.find_library("cudart")
1741
if path:

0 commit comments

Comments
 (0)