
Commit 16a1cc9

[misc][distributed] improve libcudart.so finding (#7127)
1 parent b1c9aa3 commit 16a1cc9

2 files changed: +25 -23 lines


vllm/distributed/device_communicators/cuda_wrapper.py

Lines changed: 22 additions & 22 deletions
@@ -4,9 +4,6 @@
 """
 
 import ctypes
-import glob
-import os
-import sys
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
 
@@ -36,24 +33,25 @@ class Function:
     argtypes: List[Any]
 
 
-def get_pytorch_default_cudart_library_path() -> str:
-    # code borrowed from https://github.com/pytorch/pytorch/blob/1cae60a87e5bdda8bcf55724a862eeed98a9747e/torch/__init__.py#L284 # noqa
-    lib_folder = "cuda_runtime"
-    lib_name = "libcudart.so.*[0-9]"
-    lib_path = None
-    for path in sys.path:
-        nvidia_path = os.path.join(path, "nvidia")
-        if not os.path.exists(nvidia_path):
-            continue
-        candidate_lib_paths = glob.glob(
-            os.path.join(nvidia_path, lib_folder, "lib", lib_name))
-        if candidate_lib_paths and not lib_path:
-            lib_path = candidate_lib_paths[0]
-        if lib_path:
-            break
-    if not lib_path:
-        raise ValueError(f"{lib_name} not found in the system path {sys.path}")
-    return lib_path
+def find_loaded_library(lib_name) -> Optional[str]:
+    """
+    According to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
+    the file `/proc/self/maps` contains the memory maps of the process, which
+    includes the shared libraries loaded by the process. We can use this file
+    to find the path of a loaded library.
+    """ # noqa
+    found = False
+    with open("/proc/self/maps") as f:
+        for line in f:
+            if lib_name in line:
+                found = True
+                break
+    if not found:
+        # the library is not loaded in the current process
+        return None
+    start = line.index("/")
+    path = line[start:].strip()
+    return path
 
 
 class CudaRTLibrary:
@@ -100,7 +98,9 @@ class CudaRTLibrary:
 
     def __init__(self, so_file: Optional[str] = None):
         if so_file is None:
-            so_file = get_pytorch_default_cudart_library_path()
+            so_file = find_loaded_library("libcudart.so")
+            assert so_file is not None, \
+                "libcudart.so is not loaded in the current process"
         if so_file not in CudaRTLibrary.path_to_library_cache:
             lib = ctypes.CDLL(so_file)
             CudaRTLibrary.path_to_library_cache[so_file] = lib
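
The replacement does not search the filesystem at all: it assumes the CUDA runtime has already been loaded into the current process (e.g. by PyTorch) and recovers its absolute path from /proc/self/maps. A standalone sketch of the same technique, not part of the commit (it probes libc.so.6 only because that library is mapped into every Linux Python process, so the script prints a real path when run):

def find_loaded_library(lib_name: str):
    """Return the path of a shared library already mapped into this process, or None."""
    with open("/proc/self/maps") as f:
        for line in f:
            if lib_name in line:
                # each file-backed maps entry ends with the file's absolute path
                return line[line.index("/"):].strip()
    return None  # the library is not loaded in the current process

if __name__ == "__main__":
    print(find_loaded_library("libc.so.6"))     # e.g. /usr/lib/x86_64-linux-gnu/libc.so.6
    print(find_loaded_library("libcudart.so"))  # None unless the CUDA runtime is already loaded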

vllm/distributed/device_communicators/custom_all_reduce_utils.py

Lines changed: 3 additions & 1 deletion
@@ -145,6 +145,7 @@ def can_actually_p2p(
     p_tgt.start()
     p_src.join()
     p_tgt.join()
+    assert p_src.exitcode == 0 and p_tgt.exitcode == 0
     result: List[bool] = []
     for src, tgt in zip(batch_src, batch_tgt):
         a = result_queue.get()
@@ -221,7 +222,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
         # wrap raised exception to provide more information
         raise RuntimeError(
             f"Error happened when batch testing "
-            f"peer-to-peer access from {batch_src} to {batch_tgt}") from e
+            f"peer-to-peer access from {batch_src} to {batch_tgt}:\n"
+            f"{returned.stderr.decode()}") from e
     result = pickle.loads(returned.stdout)
     for _i, _j, r in zip(batch_src, batch_tgt, result):
         cache[f"{_i}->{_j}"] = r
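
Taken together, the second file's changes make failures in the peer-to-peer probe observable: a crashed child process now trips the exitcode assertion, and the stderr of the batch-test subprocess is attached to the wrapped exception instead of being lost. A minimal sketch of that error-reporting pattern, assuming a hypothetical child_check.py helper and a plain subprocess.run call (the surrounding code of gpu_p2p_access_check is not shown in this diff):

import pickle
import subprocess
import sys

# run the (hypothetical) checker script and capture both of its output streams
returned = subprocess.run([sys.executable, "child_check.py"], capture_output=True)
try:
    returned.check_returncode()
except subprocess.CalledProcessError as e:
    # re-raise with the child's stderr so the real failure shows up in the traceback
    raise RuntimeError("Error happened when batch testing peer-to-peer access:\n"
                       f"{returned.stderr.decode()}") from e
result = pickle.loads(returned.stdout)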
