Skip to content

Commit 51302b6

Browse files
authored
Merge pull request #60 from MoseleyBioinformaticsLab/granular
Adds an option for logging the per timepoint resource usage
2 parents 2c006ec + 34c8f3c commit 51302b6

22 files changed

+381
-146
lines changed

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
psutil>=6.0.0
22
docopt>=0.6.2
33
pandas>=2.2.3
4+
SQLAlchemy>=2.0.39

src/gpu_tracker/__main__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
Usage:
55
gpu-tracker -h | --help
66
gpu-tracker -v | --version
7-
gpu-tracker --execute=<command> [--output=<output>] [--format=<format>] [--st=<sleep-time>] [--ru=<ram-unit>] [--gru=<gpu-ram-unit>] [--tu=<time-unit>] [--nec=<num-cores>] [--guuids=<gpu-uuids>] [--disable-logs] [--gb=<gpu-brand>]
7+
gpu-tracker --execute=<command> [--output=<output>] [--format=<format>] [--st=<sleep-time>] [--ru=<ram-unit>] [--gru=<gpu-ram-unit>] [--tu=<time-unit>] [--nec=<num-cores>] [--guuids=<gpu-uuids>] [--disable-logs] [--gb=<gpu-brand>] [--tf=<tracking-file>]
88
99
Options:
1010
-h --help Show this help message and exit.
@@ -20,6 +20,7 @@
2020
--guuids=<gpu-uuids> Comma separated list of the UUIDs of the GPUs for which to track utilization e.g. gpu-uuid1,gpu-uuid2,etc. Defaults to all the GPUs in the system.
2121
--disable-logs If set, warnings are suppressed during tracking. Otherwise, the Tracker logs warnings as usual.
2222
--gb=<gpu-brand> The brand of GPU to profile. Valid values are nvidia and amd. Defaults to the brand of GPU detected in the system, checking NVIDIA first.
23+
--tf=<tracking-file> If specified, stores the individual resource usage measurements at each iteration. Valid file formats are CSV (.csv) and SQLite (.sqlite) where the SQLite file format stores the data in a table called "tracking" and allows for more efficient querying.
2324
"""
2425
import docopt as doc
2526
import subprocess as subp
@@ -43,7 +44,8 @@ def main():
4344
'--nec': 'n_expected_cores',
4445
'--guuids': 'gpu_uuids',
4546
'--disable-logs': 'disable_logs',
46-
'--gb': 'gpu_brand'
47+
'--gb': 'gpu_brand',
48+
'--tf': 'tracking_file'
4749
}
4850
kwargs = {
4951
option_map[option]: value for option, value in args.items() if value is not None and option not in {

src/gpu_tracker/_helper_classes.py

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
from __future__ import annotations
2+
import abc
3+
import subprocess as subp
4+
import pandas as pd
5+
import io
6+
import os
7+
import csv
8+
import dataclasses as dclass
9+
import sqlalchemy as sqlalc
10+
import sqlalchemy.orm as sqlorm
11+
12+
13+
class _GPUQuerier(abc.ABC):
14+
command = None
15+
16+
@classmethod
17+
def _query_gpu(cls, *args) -> pd.DataFrame:
18+
output = subp.check_output((cls.command,) + args, stderr=subp.STDOUT).decode()
19+
gpu_info = pd.read_csv(io.StringIO(output))
20+
return gpu_info.map(lambda value: value.strip() if type(value) is str else value)
21+
22+
@classmethod
23+
def is_available(cls) -> bool | None:
24+
try:
25+
subp.check_output(cls.command)
26+
return True
27+
except subp.CalledProcessError:
28+
return False
29+
except FileNotFoundError:
30+
return None
31+
32+
@classmethod
33+
@abc.abstractmethod
34+
def static_info(cls) -> pd.DataFrame:
35+
pass # pragma: nocover
36+
37+
@classmethod
38+
@abc.abstractmethod
39+
def process_ram(cls) -> pd.DataFrame:
40+
pass # pragma: nocover
41+
42+
@classmethod
43+
@abc.abstractmethod
44+
def ram_and_utilization(cls) -> pd.DataFrame:
45+
pass # pragma: nocover
46+
47+
48+
class _NvidiaQuerier(_GPUQuerier):
    """Querier that obtains GPU information by invoking ``nvidia-smi``."""

    command = 'nvidia-smi'

    @classmethod
    def _query_gpu(cls, *args: str, ram_column: str):
        """Query ``nvidia-smi`` in CSV format and normalize the RAM column.

        ``'--format=csv'`` is appended to ``args``. The ``[MiB]`` and ``[%]``
        markers are removed from the column names, the values of
        ``ram_column`` have their ``MiB`` suffix removed and are converted to
        ``int``, and that column is renamed to ``'ram'``.

        :param args: Arguments passed to ``nvidia-smi`` before ``--format=csv``.
        :param ram_column: Name (after marker removal) of the column holding RAM values.
        :return: The normalized data frame.
        """
        table = super()._query_gpu(*args, '--format=csv')

        cleaned_names = []
        for name in table.columns:
            cleaned_names.append(name.replace('[MiB]', '').replace('[%]', '').strip())
        table.columns = cleaned_names

        def _mib_text_to_int(cell):
            return int(cell.replace('MiB', '').strip())

        table[ram_column] = table[ram_column].apply(_mib_text_to_int)
        return table.rename(columns={ram_column: 'ram'})

    @classmethod
    def static_info(cls) -> pd.DataFrame:
        """Return the ``uuid`` and total memory (as ``ram``) reported per GPU."""
        query = '--query-gpu=uuid,memory.total'
        return cls._query_gpu(query, ram_column='memory.total')

    @classmethod
    def process_ram(cls) -> pd.DataFrame:
        """Return the ``pid`` and used GPU memory (as ``ram``) of the compute apps."""
        query = '--query-compute-apps=pid,used_gpu_memory'
        return cls._query_gpu(query, ram_column='used_gpu_memory')

    @classmethod
    def ram_and_utilization(cls) -> pd.DataFrame:
        """Return ``uuid``, used memory (as ``ram``) and ``utilization_percent`` per GPU.

        The utilization values have their ``%`` sign removed and are converted to ``float``.
        """
        query = '--query-gpu=uuid,memory.used,utilization.gpu'
        table = cls._query_gpu(query, ram_column='memory.used').rename(
            columns={'utilization.gpu': 'utilization_percent'})
        table['utilization_percent'] = [
            float(text.replace('%', '').strip()) for text in table['utilization_percent']
        ]
        return table
72+
73+
74+
class _ClassProperty:
    """Minimal read-only descriptor that computes an attribute from the owning class.

    Stacking ``@classmethod`` on ``@property`` was deprecated in Python 3.11
    and stopped working in Python 3.13. This descriptor provides the same
    ``cls.attribute`` access on every supported Python version.
    """

    def __init__(self, getter):
        self._getter = getter

    def __get__(self, instance, owner=None):
        # Accessed through an instance without an explicit owner: fall back to its type.
        if owner is None:
            owner = type(instance)
        return self._getter(owner)


class _AMDQuerier(_GPUQuerier):
    """Querier that obtains GPU information by invoking ``amd-smi``."""

    command = 'amd-smi'
    # Lazily populated cache of the GPU id -> UUID mapping (see ``_id_to_uuid``).
    __id_to_uuid = None

    @_ClassProperty
    def _id_to_uuid(cls) -> dict[int, str]:
        """Mapping from each value of the ``gpu`` column to its ``gpu_uuid``.

        Built from the output of ``amd-smi list --csv`` on first access and
        cached on the class afterwards, so the command runs at most once.
        """
        if cls.__id_to_uuid is None:
            # Call the base implementation directly: the override below requires ``ram_column``.
            gpu_info = super()._query_gpu('list', '--csv')
            cls.__id_to_uuid = dict(zip(gpu_info.gpu, gpu_info.gpu_uuid))
        return cls.__id_to_uuid

    @classmethod
    def _query_gpu(cls, *args: str, ram_column: str) -> pd.DataFrame:
        """Query ``amd-smi`` in CSV format and normalize GPU ids and the RAM column.

        ``'--csv'`` is appended to ``args``. When the output has a ``gpu``
        column, its ids are replaced by the corresponding UUIDs and the column
        is renamed to ``'uuid'``. ``ram_column`` is renamed to ``'ram'``.

        :param args: Arguments passed to ``amd-smi`` before ``--csv``.
        :param ram_column: Name of the column holding RAM values.
        :return: The normalized data frame.
        :raises KeyError: If a GPU id is missing from the id-to-UUID mapping.
        """
        gpu_info = super()._query_gpu(*args, '--csv')
        if 'gpu' in gpu_info.columns:
            id_to_uuid = cls._id_to_uuid
            gpu_info['gpu'] = [id_to_uuid[gpu_id] for gpu_id in gpu_info['gpu']]
            gpu_info = gpu_info.rename(columns={'gpu': 'uuid'})
        return gpu_info.rename(columns={ram_column: 'ram'})

    @classmethod
    def static_info(cls) -> pd.DataFrame:
        """Return the ``uuid`` and VRAM size (as ``ram``) reported per GPU."""
        gpu_info = cls._query_gpu('static', '--vram', ram_column='size')
        return gpu_info[['uuid', 'ram']]

    @classmethod
    def process_ram(cls) -> pd.DataFrame:
        """Return the ``pid`` and VRAM usage (as ``ram``) of each reported process."""
        gpu_info = cls._query_gpu('process', ram_column='vram_mem')
        gpu_info['ram'] = [ram / 1e6 for ram in gpu_info['ram']]  # RAM is in bytes for the process subcommand.
        return gpu_info[['pid', 'ram']]

    @classmethod
    def ram_and_utilization(cls) -> pd.DataFrame:
        """Return ``uuid``, used VRAM (as ``ram``) and ``utilization_percent`` per GPU.

        The ``gfx`` column is converted to ``float`` and renamed to
        ``'utilization_percent'``.
        """
        gpu_info = cls._query_gpu('monitor', '--vram-usage', '--gfx', ram_column='vram_used')
        # Convert on the selected copy rather than assigning into a column
        # subset, which avoids pandas' chained-assignment warning.
        gpu_info = gpu_info[['uuid', 'gfx', 'ram']].astype({'gfx': float})
        return gpu_info.rename(columns={'gfx': 'utilization_percent'})
111+
112+
113+
@dclass.dataclass
class TimepointUsage:
    """Resource usage measurements recorded at a single timepoint.

    Every field defaults to zero, so an instance can be created empty and
    filled in as measurements become available. Field names follow the
    pattern ``<scope>_<resource>``.

    NOTE(review): the units of the RAM fields and of ``timestamp`` are not
    established in this class -- confirm against the code that populates it.
    """

    # RAM usage.
    main_ram: float = 0.0
    descendants_ram: float = 0.0
    combined_ram: float = 0.0
    system_ram: float = 0.0
    # GPU RAM usage.
    main_gpu_ram: float = 0.0
    descendants_gpu_ram: float = 0.0
    combined_gpu_ram: float = 0.0
    system_gpu_ram: float = 0.0
    # GPU utilization.
    gpu_sum_utilization_percent: float = 0.0
    gpu_hardware_utilization_percent: float = 0.0
    # Thread counts.
    main_n_threads: int = 0
    descendants_n_threads: int = 0
    combined_n_threads: int = 0
    # CPU utilization. A trailing comma after the first default previously
    # turned it into the tuple ``(0.0,)``; it must be a plain float like the rest.
    cpu_system_sum_utilization_percent: float = 0.0
    cpu_system_hardware_utilization_percent: float = 0.0
    cpu_main_sum_utilization_percent: float = 0.0
    cpu_main_hardware_utilization_percent: float = 0.0
    cpu_descendants_sum_utilization_percent: float = 0.0
    cpu_descendants_hardware_utilization_percent: float = 0.0
    cpu_combined_sum_utilization_percent: float = 0.0
    cpu_combined_hardware_utilization_percent: float = 0.0
    # Time of the measurement.
    timestamp: float = 0.0
137+
138+
139+
class _TrackingFile(abc.ABC):
    """Base class for files that store one row of resource usage per timepoint."""

    @staticmethod
    def create(file: str | None) -> _TrackingFile | None:
        """Build the tracking file implementation matching the extension of ``file``.

        :param file: Path of the tracking file, or ``None`` for no tracking file.
        :return: A CSV tracking file for ``.csv``, an SQLite tracking file for
            ``.sqlite``, or ``None`` when ``file`` is ``None``.
        :raises ValueError: If ``file`` has any other extension.
        """
        if file is None:
            return None
        if file.endswith('.csv'):
            return _CSVTrackingFile(file)
        if file.endswith('.sqlite'):
            return _SQLiteTrackingFile(file)
        raise ValueError(
            f'Invalid file name: "{file}". Valid file extensions are ".csv" and ".sqlite".')

    def __init__(self, file: str):
        """Remember the path of the tracking file; nothing is written yet."""
        self._file = file

    def write_row(self, values: TimepointUsage):
        """Store one timepoint, creating the file first if it does not exist.

        :param values: The measurements to store; converted to a ``dict`` of
            field name to value before being handed to the subclass hooks.
        """
        row = dclass.asdict(values)
        already_exists = os.path.isfile(self._file)
        if not already_exists:
            self._create_file(row)
        self._write_row(row)

    @abc.abstractmethod
    def _write_row(self, values: dict):
        """Append ``values`` to the existing file (implemented by subclasses)."""
        pass  # pragma: nocover

    @abc.abstractmethod
    def _create_file(self, values: dict):
        """Create the file so that rows shaped like ``values`` can be written (implemented by subclasses)."""
        pass  # pragma: nocover
169+
170+
171+
class _CSVTrackingFile(_TrackingFile):
    """Tracking file that stores the timepoints as rows of a CSV file."""

    def _write_row(self, values: dict):
        """Append ``values`` as one CSV row, with columns in the order of its keys."""
        with open(self._file, mode='a', newline='') as csv_file:
            csv.DictWriter(csv_file, fieldnames=values.keys()).writerow(values)

    def _create_file(self, values: dict):
        """Create (or truncate) the file and write a header row made of the keys of ``values``."""
        with open(self._file, mode='w', newline='') as csv_file:
            csv.DictWriter(csv_file, fieldnames=values.keys()).writeheader()
181+
182+
183+
class _SQLiteTrackingFile(_TrackingFile):
    """Tracking file that stores the timepoints as rows of a table in an SQLite database."""

    # Name of the table that holds the rows.
    _SQLITE_TABLE_NAME = 'tracking'

    def _write_row(self, values: dict):
        """Insert ``values`` as one row of the tracking table and commit it.

        The table definition is reflected from the database file on every call.
        """
        engine = sqlalc.create_engine(f'sqlite:///{self._file}', poolclass=sqlalc.pool.NullPool)
        table = sqlalc.Table(
            _SQLiteTrackingFile._SQLITE_TABLE_NAME, sqlalc.MetaData(), autoload_with=engine)
        statement = sqlalc.insert(table).values(**values)
        session_factory = sqlorm.sessionmaker(bind=engine)
        with session_factory() as session:
            session.execute(statement)
            session.commit()

    def _create_file(self, values: dict):
        """Create the tracking table with one column per key of ``values``.

        Each column's SQL type is chosen from the Python type of the
        corresponding value (``str``, ``int`` or ``float``).

        :raises KeyError: If a value has any other type.
        """
        sql_type_for = {
            str: sqlalc.String,
            int: sqlalc.Integer,
            float: sqlalc.Float,
        }
        columns = [
            sqlalc.Column(name, sql_type_for[type(value)])
            for name, value in values.items()
        ]
        metadata = sqlalc.MetaData()
        # Constructing the Table registers it with ``metadata``; the object itself is not needed.
        sqlalc.Table(_SQLiteTrackingFile._SQLITE_TABLE_NAME, metadata, *columns)
        engine = sqlalc.create_engine(f'sqlite:///{self._file}', poolclass=sqlalc.pool.NullPool)
        metadata.create_all(engine)

0 commit comments

Comments
 (0)