Skip to content

Commit d7a1178

Browse files
authored
Update systems and bug fix autodetect (#57)
* Fixed a bug in the GPU autodetect logic where the finally block attached to the try block overwrote the function's return value. Added code to auto-detect the installed ROCm version and use it to constrain the version of amdsmi that gets installed. Added definitions for more LLNL systems. * Added installation instructions.
1 parent 4c14b52 commit d7a1178

File tree

5 files changed

+64
-12
lines changed

5 files changed

+64
-12
lines changed

README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,18 @@ There are two main entry points into HPC-Launcher from the cli:
1616
replacement for `torchrun`, while `launch` is a generic interface for
1717
launching parallel jobs.
1818

19+
## Installation
20+
21+
To install a released version of the package from PyPI, run:
22+
```bash
23+
pip install hpc-launcher
24+
```
25+
26+
Or install directly from GitHub:
27+
```bash
28+
pip install git+https://github.com/LBANN/HPC-launcher.git
29+
```
30+
1931
## Example Usage
2032

2133
Using the launch command to execute a command in parallel

hpc_launcher/systems/autodetect.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,14 @@ def find_AMD_gpus() -> (int, float, str):
3636
import amdsmi as smi
3737
except (ImportError, ModuleNotFoundError, KeyError):
3838
return (0, 0, None)
39+
40+
num_gpus = 0
41+
mem_per_gpu = 0
42+
gpu_arch = None
3943
try:
4044
smi.amdsmi_init()
4145
devices = smi.amdsmi_get_processor_handles()
4246
num_gpus = len(devices)
43-
mem_per_gpu = 0
44-
gpu_arch = None
4547
if len(devices) == 0:
4648
return (0, 0, None)
4749
else:
@@ -60,9 +62,9 @@ def find_AMD_gpus() -> (int, float, str):
6062
finally:
6163
try:
6264
smi.amdsmi_shut_down()
63-
return (0, 0, None)
65+
return (num_gpus, mem_per_gpu, gpu_arch)
6466
except smi.AmdSmiException as e:
65-
return (0, 0, None)
67+
return (num_gpus, mem_per_gpu, gpu_arch)
6668

6769

6870
def find_NVIDIA_gpus() -> (int, float, str):
@@ -76,23 +78,23 @@ def find_NVIDIA_gpus() -> (int, float, str):
7678
try:
7779
pynvml.nvmlInit()
7880

79-
deviceCount = pynvml.nvmlDeviceGetCount()
81+
num_gpus = pynvml.nvmlDeviceGetCount()
8082
# Assume that the GPUs are homogeneous on a system
81-
if deviceCount > 0:
83+
if num_gpus > 0:
8284
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
8385
major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
8486
info = pynvml.nvmlDeviceGetMemoryInfo(handle)
8587
gpu_arch = f"sm_{major}{minor}"
8688
mem_per_gpu = info.total / (1024**3)
87-
return (deviceCount, mem_per_gpu, gpu_arch)
89+
return (num_gpus, mem_per_gpu, gpu_arch)
8890
except:
8991
return (0, 0, None)
9092
finally:
9193
try:
9294
pynvml.nvmlShutdown()
93-
return (0, 0, None)
95+
return (num_gpus, mem_per_gpu, gpu_arch)
9496
except pynvml.NVMLError as e:
95-
return (0, 0, None)
97+
return (num_gpus, mem_per_gpu, gpu_arch)
9698

9799

98100
def find_gpus() -> (str, int, float, str):
@@ -202,10 +204,10 @@ def autodetect_current_system(quiet: bool = False) -> System:
202204
"""
203205

204206
sys = system()
205-
if sys in ("tioga", "tuolumne", "elcap", "rzadams", "tenaya"):
207+
if sys in ("tioga", "tuolumne", "elcap", "rzadams", "rzvernal", "tenaya"):
206208
return ElCapitan(sys)
207209

208-
if sys in ("ipa", "matrix", "vector"):
210+
if sys in ("ipa", "matrix", "rzvector"):
209211
return CTS2(sys)
210212

211213
if sys in ("lassen", "sierra", "rzanzel"):

hpc_launcher/systems/lc/cts2.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,13 @@
3737
"erl": _h100_node,
3838
},
3939
),
40+
"rzvector": (
41+
"pbatch",
42+
{
43+
"pbatch": _h100_node,
44+
"pdebug": _h100_node,
45+
},
46+
),
4047
}
4148

4249

hpc_launcher/systems/lc/el_capitan_family.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,13 @@
5353
"pdebug": _mi300a_node,
5454
},
5555
),
56+
"rzvernal": (
57+
"pdebug",
58+
{
59+
"pdebug": _mi250x_node,
60+
"pllm": _mi250x_node,
61+
},
62+
),
5663
"tenaya": (
5764
"pbatch",
5865
{

setup.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,23 @@
11
import os
22
from setuptools import find_packages, setup
33
import ctypes.util
4+
import re
5+
6+
def get_rocm_version():
7+
"""Detect installed ROCm version."""
8+
# Try reading from ROCm installation
9+
rocm_path = os.environ.get('ROCM_PATH', '/opt/rocm')
10+
version_file = os.path.join(rocm_path, '.info', 'version')
11+
12+
if os.path.exists(version_file):
13+
with open(version_file) as f:
14+
version = f.read().strip()
15+
# Extract major.minor.patch
16+
match = re.match(r'(\d+\.\d+.\d+)', version)
17+
if match:
18+
return match.group(1)
19+
20+
return None
421

522
with open("README.md", "r") as fp:
623
long_description = fp.read()
@@ -11,7 +28,14 @@
1128
extras = []
1229
path = ctypes.util.find_library("amdhip64")
1330
if path:
14-
extras.append("amdsmi")
31+
rocm_version = get_rocm_version()
32+
if rocm_version:
33+
# Constrain ROCm-dependent packages
34+
major, minor, patch = rocm_version.split('.')
35+
extras.append(f"amdsmi=={major}.{minor}.{patch}")
36+
else:
37+
# Fallback or raise error
38+
raise RuntimeError("ROCm installation not found!")
1539

1640
path = ctypes.util.find_library("cudart")
1741
if path:

0 commit comments

Comments
 (0)