 import warnings
 from contextlib import contextmanager
 from functools import lru_cache
-from typing import Dict, Generator, List, Optional, Set, Union
+from typing import cast, Dict, Generator, List, Optional, Union

 import torch
 from lightning_utilities.core.rank_zero import rank_zero_info

 from lightning_fabric.accelerators.accelerator import Accelerator
-from lightning_fabric.utilities.imports import (
-    _TORCH_GREATER_EQUAL_1_12,
-    _TORCH_GREATER_EQUAL_1_13,
-    _TORCH_GREATER_EQUAL_2_0,
-)
+from lightning_fabric.utilities.imports import _TORCH_GREATER_EQUAL_1_12, _TORCH_GREATER_EQUAL_2_0


 class CUDAAccelerator(Accelerator):
@@ -161,11 +157,11 @@ def num_cuda_devices() -> int:
     Unlike :func:`torch.cuda.device_count`, this function does its best not to create a CUDA context for fork support,
     if the platform allows it.
     """
-    if _TORCH_GREATER_EQUAL_1_13:
+    if _TORCH_GREATER_EQUAL_2_0:
         return torch.cuda.device_count()

     # Implementation copied from upstream: https://github.com/pytorch/pytorch/pull/84879
-    # TODO: Remove once minimum supported PyTorch version is 1.13
+    # TODO: Remove once minimum supported PyTorch version is 2.0
     nvml_count = _device_count_nvml()
     return torch.cuda.device_count() if nvml_count < 0 else nvml_count

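For context, a minimal sketch of why the NVML-based count matters (assumptions: a Linux host with `fork`, and that this module is importable as `lightning_fabric.accelerators.cuda`). Counting devices through NVML leaves the parent process free of a CUDA context, so forked workers can still initialize CUDA:

import multiprocessing as mp

from lightning_fabric.accelerators.cuda import num_cuda_devices


def _worker(device_idx: int) -> None:
    import torch

    # Safe only because the parent never created a CUDA context.
    torch.cuda.set_device(device_idx)


if __name__ == "__main__":
    n = num_cuda_devices()  # NVML-backed on torch < 2.0, so no CUDA context is created here
    processes = [mp.get_context("fork").Process(target=_worker, args=(i,)) for i in range(n)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()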
@@ -180,63 +176,167 @@ def is_cuda_available() -> bool:
     return torch.cuda.is_available() if _TORCH_GREATER_EQUAL_2_0 else num_cuda_devices() > 0


-# TODO: Remove once minimum supported PyTorch version is 1.13
-def _parse_visible_devices() -> Set[int]:
-    """Implementation copied from upstream: https://github.com/pytorch/pytorch/pull/84879."""
+# TODO: Remove once minimum supported PyTorch version is 2.0
+def _parse_visible_devices() -> Union[List[int], List[str]]:
+    """Parse the CUDA_VISIBLE_DEVICES environment variable."""
     var = os.getenv("CUDA_VISIBLE_DEVICES")
     if var is None:
-        return {x for x in range(64)}
+        return list(range(64))

     def _strtoul(s: str) -> int:
-        """Return -1 or integer sequence string starts with."""
-        if len(s) == 0:
+        """Return -1 or the positive integer the string starts with."""
+        if not s:
             return -1
         for idx, c in enumerate(s):
-            if not c.isdigit():
+            if not (c.isdigit() or (idx == 0 and c in "+-")):
                 break
         if idx + 1 == len(s):
             idx += 1
         return int(s[:idx]) if idx > 0 else -1

+    def parse_list_with_prefix(lst: str, prefix: str) -> List[str]:
+        rcs: List[str] = []
+        for elem in lst.split(","):
+            # A repeated id results in an empty list
+            if elem in rcs:
+                return cast(List[str], [])
+            # Anything that doesn't start with the prefix stops parsing
+            if not elem.startswith(prefix):
+                break
+            rcs.append(elem)
+        return rcs
+
+    if var.startswith("GPU-"):
+        return parse_list_with_prefix(var, "GPU-")
+    if var.startswith("MIG-"):
+        return parse_list_with_prefix(var, "MIG-")
     # CUDA_VISIBLE_DEVICES uses something like strtoul
     # which makes `1gpu2,2ampere` equivalent to `1,2`
-    rc: Set[int] = set()
+    rc: List[int] = []
     for elem in var.split(","):
-        rc.add(_strtoul(elem.strip()))
+        x = _strtoul(elem.strip())
+        # A repeated ordinal results in an empty list
+        if x in rc:
+            return cast(List[int], [])
+        # A negative value aborts the sequence
+        if x < 0:
+            break
+        rc.append(x)
     return rc


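The strtoul-like parsing above is easiest to see by example. A hedged sketch, assuming the module is importable as `lightning_fabric.accelerators.cuda`:

import os

from lightning_fabric.accelerators.cuda import _parse_visible_devices

os.environ["CUDA_VISIBLE_DEVICES"] = "1gpu2,2ampere"
assert _parse_visible_devices() == [1, 2]  # trailing junk after the digits is ignored

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,1"
assert _parse_visible_devices() == []  # a repeated ordinal empties the result

os.environ["CUDA_VISIBLE_DEVICES"] = "0,-1,2"
assert _parse_visible_devices() == [0]  # a negative value aborts the sequence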
-# TODO: Remove once minimum supported PyTorch version is 1.13
+# TODO: Remove once minimum supported PyTorch version is 2.0
 def _raw_device_count_nvml() -> int:
-    """Implementation copied from upstream: https://github.com/pytorch/pytorch/pull/84879."""
-    from ctypes import c_int, CDLL
+    """Return number of devices as reported by NVML, or a negative value if NVML discovery/initialization failed."""
+    from ctypes import byref, c_int, CDLL

     nvml_h = CDLL("libnvidia-ml.so.1")
     rc = nvml_h.nvmlInit()
     if rc != 0:
         warnings.warn("Can't initialize NVML")
         return -1
-    dev_arr = (c_int * 1)(-1)
-    rc = nvml_h.nvmlDeviceGetCount_v2(dev_arr)
+    dev_count = c_int(-1)
+    rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count))
     if rc != 0:
         warnings.warn("Can't get nvml device count")
         return -1
     del nvml_h
-    return dev_arr[0]
+    return dev_count.value
+

+# TODO: Remove once minimum supported PyTorch version is 2.0
+def _raw_device_uuid_nvml() -> Optional[List[str]]:
+    """Return a list of device UUIDs as reported by NVML, or None if NVML discovery/initialization failed."""
+    from ctypes import byref, c_int, c_void_p, CDLL, create_string_buffer

-# TODO: Remove once minimum supported PyTorch version is 1.13
+    nvml_h = CDLL("libnvidia-ml.so.1")
+    rc = nvml_h.nvmlInit()
+    if rc != 0:
+        warnings.warn("Can't initialize NVML")
+        return None
+    dev_count = c_int(-1)
+    rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count))
+    if rc != 0:
+        warnings.warn("Can't get nvml device count")
+        return None
+    uuids: List[str] = []
+    for idx in range(dev_count.value):
+        dev_id = c_void_p()
+        rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id))
+        if rc != 0:
+            warnings.warn("Can't get device handle")
+            return None
+        buf_len = 96
+        buf = create_string_buffer(buf_len)
+        rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len)
+        if rc != 0:
+            warnings.warn("Can't get device UUID")
+            return None
+        uuids.append(buf.raw.decode("ascii").strip("\0"))
+    del nvml_h
+    return uuids
+
+
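A short usage sketch for the helper above (same import-path assumption; actual UUIDs vary by machine): each entry is a full device UUID in the `GPU-` prefix format that CUDA_VISIBLE_DEVICES accepts, indexed by NVML ordinal.

from lightning_fabric.accelerators.cuda import _raw_device_uuid_nvml

uuids = _raw_device_uuid_nvml()
if uuids is None:
    print("NVML unavailable; callers fall back to regular CUDA device counting")
else:
    for ordinal, uuid in enumerate(uuids):
        print(ordinal, uuid)  # e.g. `0 GPU-...`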
+# TODO: Remove once minimum supported PyTorch version is 2.0
+def _transform_uuid_to_ordinals(candidates: List[str], uuids: List[str]) -> List[int]:
+    """Given a list of partial UUIDs and a list of known UUIDs, build a list of ordinals, excluding ambiguous
+    partial IDs."""
+
+    def uuid_to_ordinal(candidate: str, uuids: List[str]) -> int:
+        best_match = -1
+        for idx, uuid in enumerate(uuids):
+            if not uuid.startswith(candidate):
+                continue
+            # Ambiguous candidate
+            if best_match != -1:
+                return -1
+            best_match = idx
+        return best_match
+
+    rc: List[int] = []
+    for candidate in candidates:
+        idx = uuid_to_ordinal(candidate, uuids)
+        # First invalid ordinal stops parsing
+        if idx < 0:
+            break
+        # Duplicates result in an empty list
+        if idx in rc:
+            return cast(List[int], [])
+        rc.append(idx)
+    return rc
+
+
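Illustrative behavior of the matching rules above (the UUIDs are made up; same import-path assumption):

from lightning_fabric.accelerators.cuda import _transform_uuid_to_ordinals

uuids = ["GPU-11111111-aaaa", "GPU-22222222-bbbb", "GPU-22223333-cccc"]

# An unambiguous prefix maps to the device's index in the NVML-reported list.
assert _transform_uuid_to_ordinals(["GPU-1111"], uuids) == [0]

# "GPU-2222" matches two devices, so it is ambiguous and parsing stops there.
assert _transform_uuid_to_ordinals(["GPU-1111", "GPU-2222"], uuids) == [0]

# A duplicate ordinal empties the result entirely.
assert _transform_uuid_to_ordinals(["GPU-1111", "GPU-11111111"], uuids) == []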
+# TODO: Remove once minimum supported PyTorch version is 2.0
 def _device_count_nvml() -> int:
-    """Implementation copied from upstream: https://github.com/pytorch/pytorch/pull/84879."""
+    """Return the number of devices as reported by NVML, taking CUDA_VISIBLE_DEVICES into account.
+
+    A negative value is returned if NVML discovery or initialization failed.
+    """
+    visible_devices = _parse_visible_devices()
+    if not visible_devices:
+        return 0
     try:
-        raw_cnt = _raw_device_count_nvml()
-        if raw_cnt <= 0:
-            return raw_cnt
-        return len(set(range(raw_cnt)).intersection(_parse_visible_devices()))
+        if type(visible_devices[0]) is str:
+            # Skip MIG parsing
+            if visible_devices[0].startswith("MIG-"):
+                return -1
+            uuids = _raw_device_uuid_nvml()
+            if uuids is None:
+                return -1
+            visible_devices = _transform_uuid_to_ordinals(cast(List[str], visible_devices), uuids)
+        else:
+            raw_cnt = _raw_device_count_nvml()
+            if raw_cnt <= 0:
+                return raw_cnt
+            # Trim the list at the first ordinal that exceeds the available device count
+            for idx, val in enumerate(visible_devices):
+                if cast(int, val) >= raw_cnt:
+                    return idx
     except OSError:
         return -1
     except AttributeError:
         return -1
+    return len(visible_devices)


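Taken together, a hedged end-to-end sketch (same import-path assumption; the MIG value is illustrative): on a host without NVML these calls return -1, and `num_cuda_devices` above then falls back to `torch.cuda.device_count()`.

import os

from lightning_fabric.accelerators.cuda import _device_count_nvml

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
print(_device_count_nvml())  # visible count capped by the raw NVML count, or -1 without NVML

os.environ["CUDA_VISIBLE_DEVICES"] = "MIG-0ab1"
print(_device_count_nvml())  # -1: MIG parsing is deliberately skipped, so the caller falls back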
 def _check_cuda_matmul_precision(device: torch.device) -> None: