
Commit 9d54fd7

Separates GPU querying into two classes, one for Nvidia and another for AMD, that inherit from a common interface
1 parent d703c06 · commit 9d54fd7

File tree

  src/gpu_tracker/tracker.py
  tests/test_tracker.py

2 files changed: +87 -38 lines changed

src/gpu_tracker/tracker.py

Lines changed: 85 additions & 36 deletions
@@ -1,5 +1,6 @@
 """The ``tracker`` module contains the ``Tracker`` class which can alternatively be imported directly from the ``gpu_tracker`` package."""
 from __future__ import annotations
+import abc
 import json
 import dataclasses as dclass
 import platform
@@ -17,6 +18,62 @@
 import pandas as pd


+class _GPUQuerier(abc.ABC):
+    def __init__(self, command: str):
+        self.command = command
+
+    def _query_gpu(self, *args) -> pd.DataFrame:
+        output = subp.check_output((self.command,) + args, stderr=subp.STDOUT).decode()
+        gpu_info = pd.read_csv(io.StringIO(output))
+        return gpu_info.map(lambda value: value.strip() if type(value) is str else value)
+
+    def is_available(self) -> bool:
+        try:
+            subp.check_output(self.command)
+            return True
+        except FileNotFoundError:
+            return False
+
+    @abc.abstractmethod
+    def static_info(self) -> pd.DataFrame:
+        pass
+
+    @abc.abstractmethod
+    def process_ram(self) -> pd.DataFrame:
+        pass
+
+    @abc.abstractmethod
+    def ram_and_utilization(self) -> pd.DataFrame:
+        pass
+
+class _NvidiaQuerier(_GPUQuerier):
+    def __init__(self):
+        super().__init__('nvidia-smi')
+
+    def _query_gpu(self, *args: list[str], ram_column: str | None = None):
+        gpu_info = super()._query_gpu(*args, '--format=csv')
+        gpu_info.columns = [col.replace('[MiB]', '').replace('[%]', '').strip() for col in gpu_info.columns]
+        if ram_column:
+            gpu_info[ram_column] = gpu_info[ram_column].apply(lambda ram: int(ram.replace('MiB', '').strip()))
+            gpu_info = gpu_info.rename(columns={ram_column: 'ram'})
+        return gpu_info.map(lambda value: value.strip() if type(value) is str else value)
+
+    def static_info(self) -> pd.DataFrame:
+        return self._query_gpu('--query-gpu=uuid,memory.total', ram_column='memory.total')
+
+    def process_ram(self) -> pd.DataFrame:
+        return self._query_gpu('--query-compute-apps=pid,used_gpu_memory', ram_column='used_gpu_memory')
+
+    def ram_and_utilization(self) -> pd.DataFrame:
+        gpu_info = self._query_gpu('--query-gpu=uuid,memory.used,utilization.gpu', ram_column='memory.used')
+        gpu_info = gpu_info.rename(columns={'utilization.gpu': 'utilization_percent'})
+        gpu_info.utilization_percent = [float(percentage.replace('%', '').strip()) for percentage in gpu_info.utilization_percent]
+        return gpu_info
+
+class _AMDQuerier(_GPUQuerier):
+    def __init__(self):
+        super().__init__('amd-smi')
+
 class _TrackingProcess(mproc.Process):
     _CPU_PERCENT_INTERVAL = 0.1
     _ram_unit2coefficient = {
@@ -43,7 +100,7 @@ class _TrackingProcess(mproc.Process):
     def __init__(
             self, stop_event: mproc.Event, sleep_time: float, ram_unit: str, gpu_ram_unit: str, time_unit: str,
             n_expected_cores: int | None, gpu_uuids: set[str] | None, disable_logs: bool, main_process_id: int,
-            resource_usage_file: str, extraneous_process_ids: set[int]):
+            resource_usage_file: str, extraneous_process_ids: set[int], gpu_querier: _GPUQuerier):
         super().__init__()
         self._stop_event = stop_event
         if sleep_time < _TrackingProcess._CPU_PERCENT_INTERVAL:
@@ -63,24 +120,22 @@ def __init__(
         self._hardware_percent_sums = {key: 0. for key in percent_keys}
         self._tracking_iteration = 1
         self._is_linux = platform.system().lower() == 'linux'
-        self._nvidia_available = True
-        try:
-            subp.check_output('nvidia-smi')
-        except FileNotFoundError:
-            self._nvidia_available = False
+        self._gpu_querier = gpu_querier
+        self._gpu_available = self._gpu_querier.is_available()
+        if not self._gpu_available:
             self._log_warning(
-                'The nvidia-smi command is not available. Please install the Nvidia drivers to track GPU usage. '
-                'Otherwise the Max GPU RAM values will remain 0.0')
+                f'The {self._gpu_querier.command} command is not available. Please install it to track GPU usage. '
+                'Otherwise the GPU RAM and GPU utilization values will remain 0.0.')
         max_ram = MaxRAM(unit=ram_unit, system_capacity=psutil.virtual_memory().total * self._ram_coefficient)
         system_core_count = psutil.cpu_count()
         cpu_utilization = CPUUtilization(
             system_core_count=system_core_count,
             n_expected_cores=n_expected_cores if n_expected_cores is not None else system_core_count)
-        if self._nvidia_available:
-            gpu_info = _TrackingProcess._query_gpu(nvidia_command='--query-gpu=uuid,memory.total')
-            gpu_ram_system_capacity = self._get_gpu_ram(gpu_info=gpu_info, column='memory.total')
+        if self._gpu_available:
+            gpu_info = self._gpu_querier.static_info()
+            gpu_ram_system_capacity = self._get_gpu_ram(gpu_info=gpu_info)
             max_gpu_ram = MaxGPURAM(unit=gpu_ram_unit, system_capacity=gpu_ram_system_capacity)
-            all_uuids = set(gpu_info['uuid'])
+            all_uuids = set(gpu_info.uuid)
             if gpu_uuids is None:
                 self._gpu_uuids = all_uuids
             else:
@@ -143,25 +198,23 @@ def run(self):
                 self._resource_usage.max_ram.system = max(
                     self._resource_usage.max_ram.system, psutil.virtual_memory().used * self._ram_coefficient)
                 # Get the maximum GPU RAM usage if available.
-                if self._nvidia_available:  # pragma: nocover
-                    gpu_info = _TrackingProcess._query_gpu(nvidia_command='--query-compute-apps=pid,used_gpu_memory')
+                if self._gpu_available:  # pragma: nocover
+                    gpu_info = self._gpu_querier.process_ram()
                     if len(gpu_info):
                         process_ids = {self._main_process_id}
                         self._update_gpu_ram(attr='main', process_ids=process_ids, gpu_info=gpu_info)
                         process_ids = set(self._map_processes(processes=descendant_processes, map_func=lambda process: process.pid))
                         self._update_gpu_ram(attr='descendants', process_ids=process_ids, gpu_info=gpu_info)
                         process_ids.add(self._main_process_id)
                         self._update_gpu_ram(attr='combined', process_ids=process_ids, gpu_info=gpu_info)
-                    gpu_info = _TrackingProcess._query_gpu(nvidia_command='--query-gpu=uuid,memory.used,utilization.gpu')
-                    system_gpu_ram = self._get_gpu_ram(gpu_info, column='memory.used')
+                    gpu_info = self._gpu_querier.ram_and_utilization()
+                    system_gpu_ram = self._get_gpu_ram(gpu_info)
                     self._resource_usage.max_gpu_ram.system = max(self._resource_usage.max_gpu_ram.system, system_gpu_ram)
-                    gpu_info = gpu_info.loc[gpu_info['uuid'].apply(lambda gpu_uuid: gpu_uuid in self._gpu_uuids)]
-                    gpu_percentages = [float(percentage.replace('%', '').strip()) for percentage in gpu_info['utilization.gpu']]
+                    gpu_info = gpu_info.loc[[uuid in self._gpu_uuids for uuid in gpu_info.uuid]]
                     self._update_processing_unit_utilization(
-                        current_percentages=gpu_percentages,
+                        current_percentages=list(gpu_info.utilization_percent),
                         processing_unit_percentages=self._resource_usage.gpu_utilization.gpu_percentages, percent_key='gpu',
                         n_hardware_units=self._resource_usage.gpu_utilization.n_expected_gpus)
-
                 # Get the mean and maximum CPU usages.
                 main_n_threads = self._map_processes([main_process], map_func=get_n_threads)
                 descendant_n_threads = self._map_processes(descendant_processes, map_func=get_n_threads)
@@ -230,23 +283,13 @@ def _update_ram(self, rss_values: RSSValues, memory_maps_list: list[list] | None
         rss_values.total_rss = max(rss_values.total_rss, total_rss)

     def _update_gpu_ram(self, attr: str, process_ids: set[int], gpu_info: pd.DataFrame):
-        gpu_info = gpu_info.loc[[pid in process_ids for pid in gpu_info['pid']]]
-        gpu_ram = self._get_gpu_ram(gpu_info, column='used_gpu_memory')
+        gpu_info = gpu_info.loc[[pid in process_ids for pid in gpu_info.pid]]
+        gpu_ram = self._get_gpu_ram(gpu_info)
         max_gpu_ram = getattr(self._resource_usage.max_gpu_ram, attr)
         setattr(self._resource_usage.max_gpu_ram, attr, max(max_gpu_ram, gpu_ram))

-    @staticmethod
-    def _query_gpu(nvidia_command: str) -> pd.DataFrame:
-        command = f'nvidia-smi {nvidia_command} --format=csv'
-        output = subp.check_output(command.split(), stderr=subp.STDOUT).decode()
-        gpu_info = pd.read_csv(io.StringIO(output))
-        gpu_info.columns = [col.replace('[MiB]', '').replace('[%]', '').strip() for col in gpu_info.columns]
-        return gpu_info.map(lambda value: value.strip() if type(value) is str else value)
-
-    def _get_gpu_ram(self, gpu_info: pd.DataFrame, column: str) -> float:
-        gpu_rams = gpu_info[column]
-        gpu_rams = gpu_rams.apply(lambda ram: int(ram.replace('MiB', '').strip()))
-        return sum(gpu_rams) * self._gpu_ram_coefficient
+    def _get_gpu_ram(self, gpu_info: pd.DataFrame) -> float:
+        return sum(gpu_info.ram) * self._gpu_ram_coefficient

     def _update_processing_unit_utilization(
             self, current_percentages: list[float], processing_unit_percentages: ProcessingUnitPercentages,
@@ -297,7 +340,7 @@ class State(enum.Enum):
     def __init__(
             self, sleep_time: float = 1.0, ram_unit: str = 'gigabytes', gpu_ram_unit: str = 'gigabytes', time_unit: str = 'hours',
             n_expected_cores: int = None, gpu_uuids: set[str] = None, disable_logs: bool = False, process_id: int = None,
-            resource_usage_file: str | None = None, n_join_attempts: int = 5, join_timeout: float = 10.0):
+            resource_usage_file: str | None = None, n_join_attempts: int = 5, join_timeout: float = 10.0, gpu_brand: str = 'nvidia'):
         """
         :param sleep_time: The number of seconds to sleep in between usage-collection iterations.
         :param ram_unit: One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'.
@@ -321,9 +364,15 @@ def __init__(
         self._stop_event = mproc.Event()
         extraneous_ids = {process.pid for process in current_process.children()} - legit_child_ids
         self._resource_usage_file = f'.gpu-tracker_{uuid.uuid1()}.pkl' if resource_usage_file is None else resource_usage_file
+        if gpu_brand == 'nvidia':
+            gpu_querier = _NvidiaQuerier()
+        elif gpu_brand == 'amd':
+            gpu_querier = _AMDQuerier()
+        else:
+            raise ValueError(f'"{gpu_brand}" is not a valid GPU brand. Supported values are "nvidia" and "amd".')
         self._tracking_process = _TrackingProcess(
             self._stop_event, sleep_time, ram_unit, gpu_ram_unit, time_unit, n_expected_cores, gpu_uuids, disable_logs,
-            process_id if process_id is not None else current_process_id, self._resource_usage_file, extraneous_ids)
+            process_id if process_id is not None else current_process_id, self._resource_usage_file, extraneous_ids, gpu_querier)
         self.resource_usage = None
         self.n_join_attempts = n_join_attempts
         self.join_timeout = join_timeout
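
For reference, a minimal usage sketch of the new gpu_brand parameter, assuming the rest of the public Tracker API is unchanged by this commit. The argument selects which querier the tracking process uses; 'nvidia' remains the default, and an unsupported value raises the ValueError added above.

    from gpu_tracker import Tracker

    # gpu_brand selects the querier used by the tracking process.
    Tracker()                   # default gpu_brand='nvidia' -> _NvidiaQuerier wrapping nvidia-smi
    Tracker(gpu_brand='amd')    # -> _AMDQuerier wrapping amd-smi
    Tracker(gpu_brand='intel')  # -> ValueError: '"intel" is not a valid GPU brand. Supported values are "nvidia" and "amd".'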

tests/test_tracker.py

Lines changed: 2 additions & 2 deletions
@@ -5,8 +5,8 @@
 import pytest as pt
 import utils

-nvidia_smi_unavailable_message = 'The nvidia-smi command is not available. Please install the Nvidia drivers to track GPU usage. ' \
-                                 'Otherwise the Max GPU RAM values will remain 0.0'
+nvidia_smi_unavailable_message = 'The nvidia-smi command is not available. Please install it to track GPU usage. ' \
+                                 'Otherwise the GPU RAM and GPU utilization values will remain 0.0.'


 @pt.fixture(name='operating_system', params=['Linux', 'not-linux'])
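
A hypothetical companion test, not part of this commit, could exercise the new gpu_brand validation; the error message text is taken from the Tracker.__init__ change above.

    import pytest as pt
    from gpu_tracker import Tracker

    def test_invalid_gpu_brand():
        # An unsupported brand should fail fast with the ValueError raised in Tracker.__init__.
        with pt.raises(ValueError, match='is not a valid GPU brand'):
            Tracker(gpu_brand='intel')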
