Skip to content

Commit 5a5cf48

Browse files
committed
python.tool.gpu_check to load_nvidia_uvm if not already loaded
1 parent dd6173c commit 5a5cf48

File tree

2 files changed

+328
-0
lines changed

2 files changed

+328
-0
lines changed

python/vsi/test/test_gpu_check.py

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
import os
2+
import unittest
3+
from unittest.mock import patch, MagicMock
4+
5+
import vsi.tools.gpu_check
6+
7+
8+
# dummy class to represent the loaded libcudart.so library
9+
class MockLibrary:
    '''Stand-in for the library object returned by ``LoadLibrary``'''

    def __init__(self, file):
        # fake CUDA entry point: calling it returns its own name, which
        # then shows up as the "exit code" in the log output
        get_device_count = MagicMock(return_value="cudaGetDeviceCount")
        self.cudaGetDeviceCount = get_device_count

        # remember what was "loaded", mirroring the ``_name`` attribute
        # that load_nvidia_uvm reports in its log message
        self._name = file
13+
14+
15+
# unit tests
16+
class GpuCheckTest(unittest.TestCase):
    '''Unit tests for :mod:`vsi.tools.gpu_check`

    All filesystem and library access is mocked, so neither a GPU nor a
    CUDA runtime is needed to run these tests.
    '''

    def _check_logs(self, logs, logs_expected):
        '''Check for expected logs

        Parameters
        ----------
        logs
            Object returned by ``assertLogs``; only ``logs.output`` is used
        logs_expected : :obj:`list`
            One expected substring per captured log line, in order
        '''

        # check length
        self.assertEqual(len(logs.output), len(logs_expected),
                         f'Logs are of different length\n{logs.output=}')

        # check that expected substring is in each log item
        for idx, (item, expected) in enumerate(zip(logs.output, logs_expected)):
            self.assertIn(expected, item,
                          f'Log mismatch line {idx}\n{logs.output=}')

    @staticmethod
    def _make_find_cudart(directory):
        '''Build a ``find_cudart`` replacement that only "finds" the library
        when ``directory`` is part of one of the search directories'''

        def _find_cudart(search_dirs):
            for search_dir in search_dirs:
                if directory in str(search_dir):
                    return MockLibrary(f'{directory}/libcudart.so')
            # same failure mode as the real find_cudart
            raise OSError("Failed to find & load libcudart.so")

        return _find_cudart

    @patch('vsi.tools.gpu_check.pathlib.Path.rglob')
    @patch('vsi.tools.gpu_check.ctypes.cdll.LoadLibrary', side_effect=MockLibrary)
    def test_find_cudart(self, mock_LoadLibrary, mock_rglob):
        '''Test find_cudart'''

        # rglob return for 3 search_dirs
        search_dirs = ['/foo', '/bar', '/path/to']
        mock_rglob.side_effect = [list(), list(), ['/path/to/liubcudart.so']]

        # call function & check return
        result = vsi.tools.gpu_check.find_cudart(search_dirs)
        self.assertEqual(result._name, '/path/to/liubcudart.so')

    @patch('vsi.tools.gpu_check.pathlib.Path.rglob')
    def test_find_cudart_error(self, mock_rglob):
        '''Test find_cudart without discovering a file'''

        # rglob return for 1 search_dirs
        search_dirs = ['/foo']
        mock_rglob.side_effect = [list()]

        # raise OSError
        with self.assertRaises(OSError):
            _ = vsi.tools.gpu_check.find_cudart(search_dirs)

    @patch('vsi.tools.gpu_check.ctypes.cdll.LoadLibrary', side_effect=MockLibrary)
    def test_load_cudart_file(self, mock_LoadLibrary):
        '''Test load_cudart from file'''

        # call function & check return
        result = vsi.tools.gpu_check.load_cudart('/path/to/file')
        self.assertEqual(result._name, '/path/to/file')

    @patch.dict(os.environ, {"LD_LIBRARY_PATH": ""})
    @patch('vsi.tools.gpu_check.find_cudart')
    @patch('vsi.tools.gpu_check.pathlib.Path.glob')
    def test_load_cudart_dirs(self, mock_glob, mock_find_cudart):
        '''Test load_cudart from ``/usr/local/cuda``'''

        # glob result for ['/usr/local', '/usr']
        mock_glob.side_effect = [['/usr/local/cuda'], ['/usr/cuda']]

        # find_cudart result
        mock_find_cudart.side_effect = self._make_find_cudart('/usr/local/cuda')

        # call function & check return
        result = vsi.tools.gpu_check.load_cudart()
        self.assertEqual(result._name, '/usr/local/cuda/libcudart.so')

    @patch.dict(os.environ, {"LD_LIBRARY_PATH": "/foo:/bar:/path/to"})
    @patch('vsi.tools.gpu_check.find_cudart')
    @patch('vsi.tools.gpu_check.pathlib.Path.glob')
    def test_load_cudart_ld(self, mock_glob, mock_find_cudart):
        '''Test load_cudart from a directory listed in ``LD_LIBRARY_PATH``'''

        # glob result for ['/usr/local', '/usr']
        mock_glob.side_effect = [['/usr/local/cuda'], ['/usr/cuda']]

        # find_cudart result
        mock_find_cudart.side_effect = self._make_find_cudart('/path/to')

        # call function & check return
        result = vsi.tools.gpu_check.load_cudart()
        self.assertEqual(result._name, '/path/to/libcudart.so')

    @patch('vsi.tools.gpu_check.load_cudart', return_value=MockLibrary('libcudart.so'))
    def test_load_nvidia_uvm(self, mock_load_cudart):
        '''Test load_nvidia_uvm'''

        # call function & check logs
        with self.assertLogs(level='DEBUG') as logs:
            vsi.tools.gpu_check.load_nvidia_uvm()

        logs_expected = [
            "found libcudart.so : libcudart.so",
            "cudaGetDeviceCount : exit_code='cudaGetDeviceCount', gpu_count=-1",
        ]
        self._check_logs(logs, logs_expected)

    @patch('vsi.tools.gpu_check.glob.glob', return_value=[])
    def test_gpu_check_no_gpu(self, mock_glob):
        '''Test gpu_check with nothing in /dev/nvidia[0-9]'''

        with self.assertLogs(level='DEBUG') as logs:
            vsi.tools.gpu_check.gpu_check()

        logs_expected = ["Skip gpu_check : /dev/nvidia[0-9] missing"]
        self._check_logs(logs, logs_expected)

    @patch('vsi.tools.gpu_check.glob.glob', return_value=['/dev/nvidia0'])
    @patch('vsi.tools.gpu_check.os.path.exists', return_value=True)
    def test_gpu_check_uvm_exists(self, mock_exists, mock_glob):
        '''Test gpu_check with existing /dev/nvidia-uvm'''

        with self.assertLogs(level='DEBUG') as logs:
            vsi.tools.gpu_check.gpu_check()

        logs_expected = ["Skip gpu_check : /dev/nvidia-uvm already loaded"]
        self._check_logs(logs, logs_expected)

    @patch('vsi.tools.gpu_check.glob.glob', return_value=['/dev/nvidia0'])
    @patch('vsi.tools.gpu_check.os.path.exists')
    @patch('vsi.tools.gpu_check.load_nvidia_uvm', return_value=None)
    def test_gpu_check_uvm_success(self, mock_load_nvidia_uvm, mock_exists, mock_glob):
        '''Test gpu_check with load_nvidia_uvm success'''

        # /dev/nvidia-uvm check fails then succeeds
        mock_exists.side_effect = [False, True]

        # run test
        with self.assertLogs(level='DEBUG') as logs:
            vsi.tools.gpu_check.gpu_check()

        logs_expected = ["/dev/nvidia-uvm has been successfully loaded"]
        self._check_logs(logs, logs_expected)

        # the loader must actually have been invoked (no file given)
        mock_load_nvidia_uvm.assert_called_once_with(None)

    @patch('vsi.tools.gpu_check.glob.glob', return_value=['/dev/nvidia0'])
    @patch('vsi.tools.gpu_check.os.path.exists', return_value=False)
    @patch('vsi.tools.gpu_check.load_nvidia_uvm', side_effect=OSError('no cuda runtime'))
    def test_gpu_check_uvm_failure(self, mock_load_nvidia_uvm, mock_exists, mock_glob):
        '''Test gpu_check when load_nvidia_uvm raises and the device stays missing'''

        with self.assertLogs(level='DEBUG') as logs:
            vsi.tools.gpu_check.gpu_check()

        # the OSError is reported as a warning, followed by the final
        # report that the device is still missing
        logs_expected = [
            "load_nvidia_uvm failure : no cuda runtime",
            "load_nvidia_uvm ran but /dev/nvidia-uvm is still not loaded",
        ]
        self._check_logs(logs, logs_expected)
165+

python/vsi/tools/gpu_check.py

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
import argparse
2+
import ctypes
3+
import glob
4+
import logging
5+
import os
6+
import pathlib
7+
8+
logger = logging.getLogger(__name__)
9+
10+
11+
def find_cudart(search_dirs):
    '''
    Find, load, and return the ``libcudart.so`` library.

    Every directory is searched recursively for files matching
    ``libcudart.so*``. The first candidate that loads successfully is
    returned; candidates that fail to load are skipped.

    Parameters
    ----------
    search_dirs : :obj:`list`
        List of directories to search

    Returns
    -------
    object
        The library object returned by ``ctypes.cdll.LoadLibrary``

    Raises
    ------
    OSError
        If no candidate could be found and loaded
    '''
    for search_dir in search_dirs:
        for file in pathlib.Path(search_dir).rglob('libcudart.so*'):
            try:
                # ctypes only documents path-like names from python 3.12,
                # so hand it a plain string
                return ctypes.cdll.LoadLibrary(os.fspath(file))
            except OSError:
                # this candidate could not be loaded, try the next one
                continue

    raise OSError("Failed to find & load libcudart.so")
28+
29+
30+
def load_cudart(file=None):
    '''
    Load ``libcudart.so`` library. If not provided as an input, search
    standard locations for the library.

    Parameters
    ----------
    file : :obj:`str`, optional
        Location of ``libcudart.so``. When empty or ``None`` the standard
        locations are searched instead.

    Returns
    -------
    object
        The library object returned by ``ctypes.cdll.LoadLibrary``

    Raises
    ------
    OSError
        If the library cannot be loaded
    '''

    # load directly
    if file:
        # ctypes only documents path-like names from python 3.12,
        # so hand it a plain string
        return ctypes.cdll.LoadLibrary(os.fspath(file))

    # list of directories to search (via recursive glob)
    # - /usr/local/cuda*
    # - /usr/cuda*
    # - ${LD_LIBRARY_PATH}
    # - /usr
    search_dirs = list()

    search_dirs.extend(pathlib.Path('/usr/local').glob('cuda*'))
    search_dirs.extend(pathlib.Path('/usr').glob('cuda*'))

    ld_library_path = os.getenv('LD_LIBRARY_PATH')
    if ld_library_path:
        # drop empty entries ("/foo::/bar", trailing separator): pathlib
        # turns '' into the current directory, which would then be
        # searched recursively
        search_dirs.extend(
            entry for entry in ld_library_path.split(os.pathsep) if entry)

    search_dirs.append('/usr')

    # ensure all search directories are pathlib
    search_dirs = [pathlib.Path(d) for d in search_dirs]

    # search for cudart
    return find_cudart(search_dirs)
66+
67+
68+
def load_nvidia_uvm(file=None):
    '''
    A function to load the nvidia uvm device

    Some (older) Linux Operating systems do not load ``/dev/nvidia-uvm`` on boot
    to runlevel 3 (headless). This results in the ``nvidia-uvm`` module not being
    loaded.

    Unfortunately, a simple modprobe does not fix the issue, but a CUDA call on
    the host (not in a container) will.

    This function attempts to locate a ``libcudart.so`` library and calls the
    ``cudaGetDeviceCount`` function, which loads the ``/dev/nvidia-uvm`` driver.
    If it cannot locate the cuda runtime, you can give it the location as an
    argument.

    The CUDA Runtime is required on the host.

    Parameters
    ----------
    file : :obj:`str`, optional
        Location of ``libcudart.so``. When not given, the library is
        searched for by :func:`load_cudart`.

    Raises
    ------
    OSError
        If ``file`` is given but is not an existing file, or if the cuda
        runtime cannot be loaded
    '''

    # check file exists
    if file:
        file = pathlib.Path(file)
        if not file.is_file():
            raise OSError(f"File does not exist {file}")

    # load libcudart.so
    try:
        cudart = load_cudart(file)
    except OSError as err:
        # keep the original error attached as the cause
        if file:
            raise OSError(f"Failed to load cuda runtime from {file}") from err
        raise OSError("Failed to find & load cuda runtime. Try passing the "
                      "full path of cuda runtime as an argument.") from err

    # report
    logger.debug(f"found libcudart.so : {cudart._name}")

    # run cudaGetDeviceCount; gpu_count starts at -1 so an untouched value
    # is distinguishable from a reported count of 0
    cudart.cudaGetDeviceCount.argtypes = (ctypes.POINTER(ctypes.c_int), )
    gpu_count = ctypes.c_int(-1)
    exit_code = cudart.cudaGetDeviceCount(ctypes.pointer(gpu_count))

    # report
    logger.debug(f"cudaGetDeviceCount : {exit_code=}, gpu_count={gpu_count.value}")
113+
114+
115+
def gpu_check(file=None):
    '''
    Try to load nvidia-uvm if not already loaded

    Nothing is done when no ``/dev/nvidia[0-9]`` device is present or when
    ``/dev/nvidia-uvm`` already exists. Otherwise :func:`load_nvidia_uvm`
    is called with ``file`` and the outcome is logged. An ``OSError`` from
    the loader is logged as a warning rather than raised.
    '''
    uvm_device = '/dev/nvidia-uvm'

    # without any nvidia card there is nothing to load
    nvidia_cards = glob.glob('/dev/nvidia[0-9]')
    if not nvidia_cards:
        logger.debug('Skip gpu_check : /dev/nvidia[0-9] missing')
        return

    # device already there, nothing to do
    if os.path.exists(uvm_device):
        logger.debug('Skip gpu_check : /dev/nvidia-uvm already loaded')
        return

    # attempt the load; a failure is reported but not fatal
    try:
        load_nvidia_uvm(file)
    except OSError as err:
        logger.warning(f'load_nvidia_uvm failure : {err}')

    # report whether the device showed up
    if not os.path.exists(uvm_device):
        logger.critical("load_nvidia_uvm ran but /dev/nvidia-uvm is still not loaded")
        return

    logger.debug("/dev/nvidia-uvm has been successfully loaded")
139+
140+
141+
def main():
    '''
    Command line entry point: parse the optional ``--file`` argument, set
    up basic debug logging, and run :func:`gpu_check`
    '''

    # command line options
    cli = argparse.ArgumentParser()
    cli.add_argument('--file', type=pathlib.Path,
                     default=None, required=False,
                     help="Path to libcudart.so")
    options = cli.parse_args()

    # basic logging when called from command line
    logging.basicConfig(
        level='DEBUG',
        format="[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s : %(message)s",
    )

    # hand over to gpu_check
    gpu_check(options.file)


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)