11"""The ``tracker`` module contains the ``Tracker`` class which can alternatively be imported directly from the ``gpu_tracker`` package."""
22from __future__ import annotations
3+ import abc
34import json
45import dataclasses as dclass
56import platform
1718import pandas as pd
1819
1920
21+ class _GPUQuerier (abc .ABC ):
22+ def __init__ (self , command : str ):
23+ self .command = command
24+
25+ def _query_gpu (self , * args ) -> pd .DataFrame :
26+ output = subp .check_output ((self .command ,) + args , stderr = subp .STDOUT ).decode ()
27+ gpu_info = pd .read_csv (io .StringIO (output ))
28+ return gpu_info .map (lambda value : value .strip () if type (value ) is str else value )
29+
30+ def is_available (self ) -> bool :
31+ try :
32+ subp .check_output (self .command )
33+ return True
34+ except FileNotFoundError :
35+ return False
36+
37+ @abc .abstractmethod
38+ def static_info (self ) -> pd .DataFrame :
39+ pass
40+
41+ @abc .abstractmethod
42+ def process_ram (self ) -> pd .DataFrame :
43+ pass
44+
45+ @abc .abstractmethod
46+ def ram_and_utilization (self ) -> pd .DataFrame :
47+ pass
48+
class _NvidiaQuerier(_GPUQuerier):
    """Queries NVIDIA GPU statistics via the ``nvidia-smi`` command."""

    def __init__(self):
        super().__init__('nvidia-smi')

    def _query_gpu(self, *args: str, ram_column: str | None = None) -> pd.DataFrame:
        """Run ``nvidia-smi`` in CSV mode, clean unit suffixes out of the header, and
        normalize ``ram_column`` (values like ``"123 MiB"``) to an integer ``ram`` column.

        Note: the varargs are individual option strings (``*args: str``); the original
        ``list[str]`` annotation was incorrect.
        """
        gpu_info = super()._query_gpu(*args, '--format=csv')
        # nvidia-smi appends unit suffixes to header names, e.g. "memory.total [MiB]".
        gpu_info.columns = [col.replace('[MiB]', '').replace('[%]', '').strip() for col in gpu_info.columns]
        if ram_column:
            # Keep just the integer MiB count from values like "123 MiB".
            gpu_info[ram_column] = gpu_info[ram_column].apply(lambda ram: int(ram.replace('MiB', '').strip()))
            gpu_info = gpu_info.rename(columns={ram_column: 'ram'})
        # String cells were already stripped by the base-class query; no second pass needed.
        return gpu_info

    def static_info(self) -> pd.DataFrame:
        """UUID and total memory (``ram``, MiB) of every installed GPU."""
        return self._query_gpu('--query-gpu=uuid,memory.total', ram_column='memory.total')

    def process_ram(self) -> pd.DataFrame:
        """GPU RAM (``ram``, MiB) used by each compute process (``pid``)."""
        return self._query_gpu('--query-compute-apps=pid,used_gpu_memory', ram_column='used_gpu_memory')

    def ram_and_utilization(self) -> pd.DataFrame:
        """Per-GPU used memory (``ram``, MiB) and ``utilization_percent`` keyed by ``uuid``."""
        gpu_info = self._query_gpu('--query-gpu=uuid,memory.used,utilization.gpu', ram_column='memory.used')
        gpu_info = gpu_info.rename(columns={'utilization.gpu': 'utilization_percent'})
        gpu_info.utilization_percent = [
            float(percentage.replace('%', '').strip()) for percentage in gpu_info.utilization_percent]
        return gpu_info
72+
class _AMDQuerier(_GPUQuerier):
    """Queries AMD GPU statistics via the ``amd-smi`` command.

    The original class implemented none of ``_GPUQuerier``'s abstract methods, so
    ``_AMDQuerier()`` raised ``TypeError: Can't instantiate abstract class`` and the
    ``gpu_brand='amd'`` option could never work. The implementations below normalize
    amd-smi CSV output to the column contract used elsewhere in this module:
    ``uuid``/``ram`` per GPU, ``pid``/``ram`` per process, and ``utilization_percent``
    for utilization (RAM in MiB, matching ``_NvidiaQuerier``).

    NOTE(review): the amd-smi subcommands and CSV column names are taken from the
    AMD SMI CLI documentation and have NOT been verified on real hardware — confirm
    each TODO below before release.
    """

    def __init__(self):
        super().__init__('amd-smi')

    def _query_gpu(self, *args: str) -> pd.DataFrame:
        # amd-smi only emits machine-readable CSV when the --csv flag is passed.
        return super()._query_gpu(*args, '--csv')

    def static_info(self) -> pd.DataFrame:
        """UUID and total VRAM (``ram``, MiB) of every installed GPU."""
        # "amd-smi list" reports one row per GPU including its UUID; "amd-smi static
        # --vram" reports total VRAM. Both share a "gpu" index column to merge on.
        uuid_info = self._query_gpu('list')
        vram_info = self._query_gpu('static', '--vram')
        gpu_info = uuid_info.merge(vram_info, on='gpu')
        # TODO(review): confirm the VRAM size column name ("vram_size") and its unit.
        return gpu_info.rename(columns={'vram_size': 'ram'})[['uuid', 'ram']]

    def process_ram(self) -> pd.DataFrame:
        """GPU RAM (``ram``, MiB) used by each process (``pid``)."""
        gpu_info = self._query_gpu('process')
        # TODO(review): confirm the per-process memory column name ("mem_usage") and its unit.
        return gpu_info.rename(columns={'mem_usage': 'ram'})[['pid', 'ram']]

    def ram_and_utilization(self) -> pd.DataFrame:
        """Per-GPU used VRAM (``ram``, MiB) and ``utilization_percent`` keyed by ``uuid``."""
        uuid_info = self._query_gpu('list')
        monitor_info = self._query_gpu('monitor', '--vram-usage', '--gfx')
        gpu_info = uuid_info.merge(monitor_info, on='gpu')
        # TODO(review): confirm the used-VRAM ("vram_used") and GFX-utilization ("gfx") column names.
        gpu_info = gpu_info.rename(columns={'vram_used': 'ram', 'gfx': 'utilization_percent'})
        gpu_info.utilization_percent = [float(percentage) for percentage in gpu_info.utilization_percent]
        return gpu_info[['uuid', 'ram', 'utilization_percent']]
76+
2077class _TrackingProcess (mproc .Process ):
2178 _CPU_PERCENT_INTERVAL = 0.1
2279 _ram_unit2coefficient = {
@@ -43,7 +100,7 @@ class _TrackingProcess(mproc.Process):
43100 def __init__ (
44101 self , stop_event : mproc .Event , sleep_time : float , ram_unit : str , gpu_ram_unit : str , time_unit : str ,
45102 n_expected_cores : int | None , gpu_uuids : set [str ] | None , disable_logs : bool , main_process_id : int ,
46- resource_usage_file : str , extraneous_process_ids : set [int ]):
103+ resource_usage_file : str , extraneous_process_ids : set [int ], gpu_querier : _GPUQuerier ):
47104 super ().__init__ ()
48105 self ._stop_event = stop_event
49106 if sleep_time < _TrackingProcess ._CPU_PERCENT_INTERVAL :
@@ -63,24 +120,22 @@ def __init__(
63120 self ._hardware_percent_sums = {key : 0. for key in percent_keys }
64121 self ._tracking_iteration = 1
65122 self ._is_linux = platform .system ().lower () == 'linux'
66- self ._nvidia_available = True
67- try :
68- subp .check_output ('nvidia-smi' )
69- except FileNotFoundError :
70- self ._nvidia_available = False
123+ self ._gpu_querier = gpu_querier
124+ self ._gpu_available = self ._gpu_querier .is_available ()
125+ if not self ._gpu_available :
71126 self ._log_warning (
72- 'The nvidia-smi command is not available. Please install the Nvidia drivers to track GPU usage. '
73- 'Otherwise the Max GPU RAM values will remain 0.0' )
127+ f 'The { self . _gpu_querier . command } command is not available. Please install it to track GPU usage. '
128+ 'Otherwise the GPU RAM and GPU utilization values will remain 0.0. ' )
74129 max_ram = MaxRAM (unit = ram_unit , system_capacity = psutil .virtual_memory ().total * self ._ram_coefficient )
75130 system_core_count = psutil .cpu_count ()
76131 cpu_utilization = CPUUtilization (
77132 system_core_count = system_core_count ,
78133 n_expected_cores = n_expected_cores if n_expected_cores is not None else system_core_count )
79- if self ._nvidia_available :
80- gpu_info = _TrackingProcess . _query_gpu ( nvidia_command = '--query-gpu=uuid,memory.total' )
81- gpu_ram_system_capacity = self ._get_gpu_ram (gpu_info = gpu_info , column = 'memory.total' )
134+ if self ._gpu_available :
135+ gpu_info = self . _gpu_querier . static_info ( )
136+ gpu_ram_system_capacity = self ._get_gpu_ram (gpu_info = gpu_info )
82137 max_gpu_ram = MaxGPURAM (unit = gpu_ram_unit , system_capacity = gpu_ram_system_capacity )
83- all_uuids = set (gpu_info [ ' uuid' ] )
138+ all_uuids = set (gpu_info . uuid )
84139 if gpu_uuids is None :
85140 self ._gpu_uuids = all_uuids
86141 else :
@@ -143,25 +198,23 @@ def run(self):
143198 self ._resource_usage .max_ram .system = max (
144199 self ._resource_usage .max_ram .system , psutil .virtual_memory ().used * self ._ram_coefficient )
145200 # Get the maximum GPU RAM usage if available.
146- if self ._nvidia_available : # pragma: nocover
147- gpu_info = _TrackingProcess . _query_gpu ( nvidia_command = '--query-compute-apps=pid,used_gpu_memory' )
201+ if self ._gpu_available : # pragma: nocover
202+ gpu_info = self . _gpu_querier . process_ram ( )
148203 if len (gpu_info ):
149204 process_ids = {self ._main_process_id }
150205 self ._update_gpu_ram (attr = 'main' , process_ids = process_ids , gpu_info = gpu_info )
151206 process_ids = set (self ._map_processes (processes = descendant_processes , map_func = lambda process : process .pid ))
152207 self ._update_gpu_ram (attr = 'descendants' , process_ids = process_ids , gpu_info = gpu_info )
153208 process_ids .add (self ._main_process_id )
154209 self ._update_gpu_ram (attr = 'combined' , process_ids = process_ids , gpu_info = gpu_info )
155- gpu_info = _TrackingProcess . _query_gpu ( nvidia_command = '--query-gpu=uuid,memory.used,utilization.gpu' )
156- system_gpu_ram = self ._get_gpu_ram (gpu_info , column = 'memory.used' )
210+ gpu_info = self . _gpu_querier . ram_and_utilization ( )
211+ system_gpu_ram = self ._get_gpu_ram (gpu_info )
157212 self ._resource_usage .max_gpu_ram .system = max (self ._resource_usage .max_gpu_ram .system , system_gpu_ram )
158- gpu_info = gpu_info .loc [gpu_info ['uuid' ].apply (lambda gpu_uuid : gpu_uuid in self ._gpu_uuids )]
159- gpu_percentages = [float (percentage .replace ('%' , '' ).strip ()) for percentage in gpu_info ['utilization.gpu' ]]
213+ gpu_info = gpu_info .loc [[uuid in self ._gpu_uuids for uuid in gpu_info .uuid ]]
160214 self ._update_processing_unit_utilization (
161- current_percentages = gpu_percentages ,
215+ current_percentages = list ( gpu_info . utilization_percent ) ,
162216 processing_unit_percentages = self ._resource_usage .gpu_utilization .gpu_percentages , percent_key = 'gpu' ,
163217 n_hardware_units = self ._resource_usage .gpu_utilization .n_expected_gpus )
164-
165218 # Get the mean and maximum CPU usages.
166219 main_n_threads = self ._map_processes ([main_process ], map_func = get_n_threads )
167220 descendant_n_threads = self ._map_processes (descendant_processes , map_func = get_n_threads )
@@ -230,23 +283,13 @@ def _update_ram(self, rss_values: RSSValues, memory_maps_list: list[list] | None
230283 rss_values .total_rss = max (rss_values .total_rss , total_rss )
231284
232285 def _update_gpu_ram (self , attr : str , process_ids : set [int ], gpu_info : pd .DataFrame ):
233- gpu_info = gpu_info .loc [[pid in process_ids for pid in gpu_info [ ' pid' ] ]]
234- gpu_ram = self ._get_gpu_ram (gpu_info , column = 'used_gpu_memory' )
286+ gpu_info = gpu_info .loc [[pid in process_ids for pid in gpu_info . pid ]]
287+ gpu_ram = self ._get_gpu_ram (gpu_info )
235288 max_gpu_ram = getattr (self ._resource_usage .max_gpu_ram , attr )
236289 setattr (self ._resource_usage .max_gpu_ram , attr , max (max_gpu_ram , gpu_ram ))
237290
238- @staticmethod
239- def _query_gpu (nvidia_command : str ) -> pd .DataFrame :
240- command = f'nvidia-smi { nvidia_command } --format=csv'
241- output = subp .check_output (command .split (), stderr = subp .STDOUT ).decode ()
242- gpu_info = pd .read_csv (io .StringIO (output ))
243- gpu_info .columns = [col .replace ('[MiB]' , '' ).replace ('[%]' , '' ).strip () for col in gpu_info .columns ]
244- return gpu_info .map (lambda value : value .strip () if type (value ) is str else value )
245-
246- def _get_gpu_ram (self , gpu_info : pd .DataFrame , column : str ) -> float :
247- gpu_rams = gpu_info [column ]
248- gpu_rams = gpu_rams .apply (lambda ram : int (ram .replace ('MiB' , '' ).strip ()))
249- return sum (gpu_rams ) * self ._gpu_ram_coefficient
291+ def _get_gpu_ram (self , gpu_info : pd .DataFrame ) -> float :
292+ return sum (gpu_info .ram ) * self ._gpu_ram_coefficient
250293
251294 def _update_processing_unit_utilization (
252295 self , current_percentages : list [float ], processing_unit_percentages : ProcessingUnitPercentages ,
@@ -297,7 +340,7 @@ class State(enum.Enum):
297340 def __init__ (
298341 self , sleep_time : float = 1.0 , ram_unit : str = 'gigabytes' , gpu_ram_unit : str = 'gigabytes' , time_unit : str = 'hours' ,
299342 n_expected_cores : int = None , gpu_uuids : set [str ] = None , disable_logs : bool = False , process_id : int = None ,
300- resource_usage_file : str | None = None , n_join_attempts : int = 5 , join_timeout : float = 10.0 ):
343+ resource_usage_file : str | None = None , n_join_attempts : int = 5 , join_timeout : float = 10.0 , gpu_brand : str = 'nvidia' ):
301344 """
302345 :param sleep_time: The number of seconds to sleep in between usage-collection iterations.
303346 :param ram_unit: One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'.
@@ -321,9 +364,15 @@ def __init__(
321364 self ._stop_event = mproc .Event ()
322365 extraneous_ids = {process .pid for process in current_process .children ()} - legit_child_ids
323366 self ._resource_usage_file = f'.gpu-tracker_{ uuid .uuid1 ()} .pkl' if resource_usage_file is None else resource_usage_file
367+ if gpu_brand == 'nvidia' :
368+ gpu_querier = _NvidiaQuerier ()
369+ elif gpu_brand == 'amd' :
370+ gpu_querier = _AMDQuerier ()
371+ else :
372+ raise ValueError (f'"{ gpu_brand } " is not a valid GPU brand. Supported values are "nvidia" and "amd".' )
324373 self ._tracking_process = _TrackingProcess (
325374 self ._stop_event , sleep_time , ram_unit , gpu_ram_unit , time_unit , n_expected_cores , gpu_uuids , disable_logs ,
326- process_id if process_id is not None else current_process_id , self ._resource_usage_file , extraneous_ids )
375+ process_id if process_id is not None else current_process_id , self ._resource_usage_file , extraneous_ids , gpu_querier )
327376 self .resource_usage = None
328377 self .n_join_attempts = n_join_attempts
329378 self .join_timeout = join_timeout
0 commit comments