44import pynvml as nvml
55import time
66import inspect
7+ import json
78from pprint import pprint
8-
99from collections .abc import Callable
1010
1111sys .path .insert (0 , "./experiment-runner/Plugins/Profilers/" )
1212from DataSource import CLISource , ParameterDict , DeviceSource
1313
14- nvml_clock_strings = {
15- nvml .NVML_CLOCK_GRAPHICS : "GraphicsClock" ,
16- nvml .NVML_CLOCK_SM : "StramingMultiproessorClock" ,
17- nvml .NVML_CLOCK_MEM : "MemoryClock" ,
18- nvml .NVML_CLOCK_VIDEO : "VideoClock" ,
19- }
14+ # Define a custom enum wrapper to help generating enums from the nvml enums
15+ class NVML_EnumMeta (enum .EnumType ):
16+ def __call__ (cls , * args , prefix = None , suffix = None , ** kwargs ):
17+ # We are not creating a new enum here, execute normally
18+ if prefix is None and suffix is None :
19+ return super ().__call__ (* args , ** kwargs )
20+
21+ # Interpose on enum creation to have nice strings
22+ assert (prefix != None or suffix != None )
2023
21- nvml_powersource_strings = {
22- nvml .NVML_POWER_SOURCE_AC : "AC" ,
23- nvml .NVML_POWER_SOURCE_BATTERY : "Battery" ,
24- nvml .NVML_POWER_SOURCE_UNDERSIZED : "Undersized" ,
25- }
24+ cls .name_prefix = prefix
25+ cls .name_suffix = suffix
26+
27+ members = {name : val
28+ for name , val in inspect .getmembers (nvml )
29+ if (True if not prefix else name .startswith (prefix ))
30+ and (True if not suffix else name .endswith (suffix ))}
2631
27- nvml_arch_strings = {
28- nvml .NVML_DEVICE_ARCH_KEPLER : "Kepler" ,
29- nvml .NVML_DEVICE_ARCH_MAXWELL : "Maxwell" ,
30- nvml .NVML_DEVICE_ARCH_PASCAL : "Pascal" ,
31- nvml .NVML_DEVICE_ARCH_VOLTA : "Volta" ,
32- nvml .NVML_DEVICE_ARCH_TURING : "Turing" ,
33- nvml .NVML_DEVICE_ARCH_AMPERE : "Ampere" ,
34- nvml .NVML_DEVICE_ARCH_ADA : "Ada" ,
35- nvml .NVML_DEVICE_ARCH_HOPPER : "Hopper" ,
36- nvml .NVML_DEVICE_ARCH_BLACKWELL :"Blackwell" ,
37- nvml .NVML_DEVICE_ARCH_T23X : "Orin" ,
38- nvml .NVML_DEVICE_ARCH_UNKNOWN : "Unknown"
39- }
32+ return super ().__call__ (* args , names = members , ** kwargs )
4033
41- # Need these for legacy temperature support
42- nvml_tempthr_strings = {
43- nvml . NVML_TEMPERATURE_THRESHOLD_SHUTDOWN : "Shutdown Temp" ,
44- nvml . NVML_TEMPERATURE_THRESHOLD_SLOWDOWN : "Slowdown Temp" ,
45- nvml . NVML_TEMPERATURE_THRESHOLD_GPU_MAX : "GPU Max Temp" ,
46- nvml . NVML_TEMPERATURE_THRESHOLD_MEM_MAX : "Memory Max Temp" ,
47- nvml . NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MIN : "Acoustic Min Temp" ,
48- nvml . NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_CURR : "Acoustic Current Temp" ,
49- nvml . NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MAX : "Acoustic Max Temp" ,
50- }
34+ class NVML_Enum ( enum . Enum , metaclass = NVML_EnumMeta ):
35+ @ property
36+ def name ( self ):
37+ name = self . _name_
38+ if self . name_prefix :
39+ name = name . lstrip ( self . name_prefix )
40+ if self . name_suffix :
41+ name = name . rstrip ( self . name_suffix )
42+
43+ return name . lower ()
5144
52- class NVML_ID_Types (enum .Enum ):
# There are a lot of these, extract them automatically.
# Each call below collects the nvml module attributes whose names start with
# `prefix` (and end with `suffix`, when given) and turns them into one enum.
NVML_Field = NVML_Enum("NVML_Field", prefix="NVML_FI_DEV_")
NVML_Sample = NVML_Enum("NVML_Sample", prefix="NVML_", suffix="_SAMPLES")
NVML_Clock = NVML_Enum("NVML_Clock", prefix="NVML_CLOCK_")
NVML_PowerSource = NVML_Enum("NVML_PowerSource", prefix="NVML_POWER_SOURCE_")
NVML_Arch = NVML_Enum("NVML_Arch", prefix="NVML_DEVICE_ARCH_")
NVML_TempThreshold = NVML_Enum("NVML_TempThreshold", prefix="NVML_TEMPERATURE_THRESHOLD_")
NVML_API_Restriction = NVML_Enum("NVML_API_Restriction", prefix="NVML_RESTRICTED_API_SET_")
NVML_Enable_State = NVML_Enum("NVML_Enable_State", prefix="NVML_FEATURE_")
NVML_Compute_Mode = NVML_Enum("NVML_Compute_Mode", prefix="NVML_COMPUTEMODE_")
NVML_GPU_Operation_Mode = NVML_Enum("NVML_GPU_Operation_Mode", prefix="NVML_GOM_")
56+
class NVML_IDs(enum.Enum):
    """Kinds of identifier accepted by ``NvidiaML.open_device``."""
    # Selects which nvmlDeviceGetHandleBy* lookup open_device performs.
    NVML_ID_INDEX = 0    # handle looked up with nvmlDeviceGetHandleByIndex
    NVML_ID_SERIAL = 1   # handle looked up with nvmlDeviceGetHandleBySerial
    NVML_ID_UUID = 2     # handle looked up with nvmlDeviceGetHandleByUUID
5661
57- class NVML_Query_Types (enum .Enum ):
62+ class NVML_Query (enum .Enum ):
5863 NVML_POWER_USAGE = "PowerUsage"
5964 NVML_TOTAL_ENERGY = "TotalEnergyConsumption"
6065 NVML_TEMPERATURE = "Temperature"
@@ -63,42 +68,6 @@ class NVML_Query_Types(enum.Enum):
6368 NVML_PSTATE = "PerformanceState"
6469 NVML_CLOCK = "GetClockInfo"
6570
66- class NVML_Sample_Types (enum .Enum ):
67- NVML_TOTAL_POWER = nvml .NVML_TOTAL_POWER_SAMPLES
68- NVML_GPU_UTILIZATION = nvml .NVML_GPU_UTILIZATION_SAMPLES
69- NVML_MEM_UTILIZATION = nvml .NVML_MEMORY_UTILIZATION_SAMPLES
70- NVML_ENC_UTILIZATION = nvml .NVML_ENC_UTILIZATION_SAMPLES
71- NVML_DEC_UTILIZATION = nvml .NVML_DEC_UTILIZATION_SAMPLES
72- NVML_PROCESSOR_CLK = nvml .NVML_PROCESSOR_CLK_SAMPLES
73- NVML_MEMORY_CLK = nvml .NVML_MEMORY_CLK_SAMPLES
74- NVML_MODULE_POWER = nvml .NVML_MODULE_POWER_SAMPLES
75-
76- # There are a lot of these, extract then automatically
77- NVML_Field_Types = enum .Enum ("NVML_Field_Types" ,
78- {
79- name : val
80- for name , val in inspect .getmembers (nvml )
81- if name .startswith ("NVML_FI_" )
82- })
83-
84- class NVML_Dev_Config_Types (enum .Enum ):
85- pass
86-
87- # These are the setting functions
88- dev_commands = ["APIRestriction" ,
89- "ApplicationsClocks" ,
90- "ComputeMode" ,
91- "ConfComputeUnprotectedMemSize" ,
92- "EccMode" ,
93- "FanSpeed" ,
94- "GpcClkVfOffset" ,
95- "GpuLockedClocks" ,
96- "GpuOperationMode" ,
97- "MemClkVfOffset" ,
98- "MemoryLockedClocks" ,
99- "PersistenceMode" ,
100- "PowerManagementLimit" ]
101-
10271# These are the static query functions
10372config_stats = ["Name" ,
10473 "UUID" ,
@@ -130,16 +99,33 @@ class NVML_Dev_Config_Types(enum.Enum):
13099 "MaxClockInfo" ,
131100 "MinMaxClockOfPState" ]
132101
# Maps a device configuration name to the type(s) of the value(s) it takes:
# a single type for one argument, a tuple of types for several arguments.
# NOTE(review): the keys look like the suffixes of the nvmlDeviceSet*
# functions (argument order presumably follows those functions) -- confirm
# against the code that consumes this table.
NVML_CONFIG_PARAMETERS = {
    "APIRestriction": (NVML_API_Restriction, NVML_Enable_State),
    "ApplicationsClocks": (int, int),
    "ComputeMode": NVML_Compute_Mode,
    "ConfComputeUnprotectedMemSize": int,
    "EccMode": NVML_Enable_State,
    "FanSpeed_v2": (int, int),
    "GpcClkVfOffset": int,
    "GpuLockedClocks": (int, int),
    "GpuOperationMode": NVML_GPU_Operation_Mode,
    "MemClkVfOffset": int,
    "MemoryLockedClocks": (int, int),
    "PersistenceMode": NVML_Enable_State,
    "PowerManagementLimit": int
}
117+
133118class NvidiaML (DeviceSource ):
134119 source_name = "Nvidia Management Library"
135120 supported_platforms = ["Linux" , "Windows" ]
136121
137122 def __init__ (self ,
138123 sample_frequency : int = 5000 ,
139124 out_file : Path = "nvml_out.csv" ,
140- queries : list [NVML_Query_Types ] = [],
141- fields : list [NVML_Field_Types ] = [],
142- samples : list [NVML_Sample_Types ] = []):
125+ queries : list [NVML_Query ] = [NVML_Query .NVML_UTILIZATION ,
126+ NVML_Query .NVML_POWER_USAGE ],
127+ fields : list [NVML_Field ] = [],
128+ samples : list [NVML_Sample ] = []):
143129 super ().__init__ ()
144130
145131 # Initialize an instance of the library
@@ -149,21 +135,15 @@ def __init__(self,
149135 self .sample_frequency = sample_frequency
150136 self .logfile = out_file
151137
152- # Configure a few default stats to collect
138+ # Configure which measurements will be made by nvml
153139 self .measurements = {
154- "queries" : [NVML_Query_Types .NVML_UTILIZATION ,
155- NVML_Query_Types .NVML_PSTATE ,
156- NVML_Query_Types .NVML_TEMPERATURE ,
157- NVML_Query_Types .NVML_POWER_USAGE ],
158- "fields" : [NVML_Field_Types . NVML_FI_DEV_ENERGY ,
159- NVML_Field_Types .NVML_FI_DEV_MEMORY_TEMP ],
160- "samples" : [NVML_Sample_Types .NVML_PROCESSOR_CLK ,
161- NVML_Sample_Types .NVML_MEMORY_CLK ,
162- NVML_Sample_Types .NVML_MODULE_POWER ]
140+ "queries" : queries ,
141+ "fields" : fields ,
142+ "samples" : samples
163143 }
164144
165145 # This records the latest timestamp per sample type
166- self .latest_timestamp = {sample .name : 0 for sample in NVML_Sample_Types }
146+ self .latest_timestamp = {sample .name : 0 for sample in NVML_Sample }
167147
168148 def _print_stat (self , stat , value , unit = None ):
169149 if unit is not None :
@@ -214,9 +194,9 @@ def _query_fields(self, handle, field_ids=[]):
214194 ret = {}
215195 for f_value in values :
216196 if f_value .nvmlReturn != nvml .NVML_SUCCESS :
217- ret [NVML_Field_Types (f_value .fieldId ).name ] = nvml .NVMLError (f_value .nvmlReturn )
197+ ret [NVML_Field (f_value .fieldId ).name ] = nvml .NVMLError (f_value .nvmlReturn )
218198 else :
219- ret [NVML_Field_Types (f_value .fieldId ).name ] = self ._parse_field_value (f_value )
199+ ret [NVML_Field (f_value .fieldId ).name ] = self ._parse_field_value (f_value )
220200
221201 return ret
222202
@@ -243,8 +223,8 @@ def _query_device(self, handle, query_type):
243223 ret [fan ] = func (handle , fan )
244224 case "GetClockInfo" :
245225 ret = {}
246- for val , string in nvml_clock_strings . items () :
247- ret [string ] = func (handle , val )
226+ for clk_type in NVML_Clock :
227+ ret [clk_type . name ] = func (handle , clk_type . value )
248228 case "TemperatureThreshold" :
249229 ret = self ._query_fields (handle , [nvml .NVML_FI_DEV_TEMPERATURE_MEM_MAX_TLIMIT ,
250230 nvml .NVML_FI_DEV_TEMPERATURE_GPU_MAX_TLIMIT ,
@@ -253,29 +233,29 @@ def _query_device(self, handle, query_type):
253233
254234 # The new method has failed, revert to depricated features
255235 if ret == {}:
256- for val , string in nvml_tempthr_strings . items () :
236+ for temp_type in NVML_TempThreshold :
257237 try :
258- ret [string ] = func (handle , val )
238+ ret [temp_type . name ] = func (handle , temp_type . value )
259239 except :
260240 pass
261241 case "Architecture" :
262- ret = nvml_arch_strings [ func (handle )]
242+ ret = NVML_Arch ( func (handle )). name
263243 case "MinMaxClockOfPState" :
264244 ret = {}
265245 for p_state in nvml .nvmlDeviceGetSupportedPerformanceStates (handle ):
266246 ret [p_state ] = {}
267- for val , string in nvml_clock_strings . items () :
268- ret [p_state ][string ] = func (handle , pstate = p_state , clockType = val )
247+ for clk_type in NVML_Clock :
248+ ret [p_state ][clk_type . name ] = func (handle , pstate = p_state , clockType = clk_type . value )
269249 case "TargetFanSpeed" :
270250 ret = {}
271251 for i in range (0 , nvml .nvmlDeviceGetNumFans (handle )):
272252 ret [i ] = func (handle , i )
273253 case "PowerSource" :
274- ret = nvml_powersource_strings [ func (handle )]
254+ ret = NVML_PowerSource ( func (handle )). name
275255 case "MaxClockInfo" | "MaxCustomerBoostClock" :
276256 ret = {}
277- for val , string in nvml_clock_strings . items () :
278- ret [string ] = func (handle , val )
257+ for clk_type in NVML_Clock :
258+ ret [clk_type . name ] = func (handle , clk_type . value )
279259 case _:
280260 ret = func (handle )
281261
@@ -285,9 +265,9 @@ def _query_device(self, handle, query_type):
285265 return ret
286266
287267 # Very important function, this sets what stats are measured when log is called
288- def set_measurements (self , samples : list [NVML_Sample_Types ] = [],
289- fields : list [NVML_Field_Types ] = [],
290- queries : list [NVML_Query_Types ] = []):
268+ def set_measurements (self , samples : list [NVML_Sample ] = [],
269+ fields : list [NVML_Field ] = [],
270+ queries : list [NVML_Query ] = []):
291271
292272 # Set new measurements if present
293273 if len (samples ) > 0 :
@@ -365,19 +345,19 @@ def list_devices(self, print_dev=False):
365345
366346 return devices
367347
368- def open_device (self , dev_id , id_type : NVML_ID_Types ):
348+ def open_device (self , dev_id , id_type : NVML_IDs ):
369349 # A bit more descriptive than the nvidia errors
370- if id_type == NVML_ID_Types .NVML_ID_INDEX and \
350+ if id_type == NVML_IDs .NVML_ID_INDEX and \
371351 int (dev_id ) >= nvml .nvmlDeviceGetCount ():
372352 raise RuntimeError (f"GPU device index ({ int (dev_id )} ) larger than the number of devices { nvml .nvmlDeviceGetCount ()} " )
373353
374354 try :
375355 match id_type :
376- case NVML_ID_Types .NVML_ID_SERIAL :
356+ case NVML_IDs .NVML_ID_SERIAL :
377357 self .device_handle = nvml .nvmlDeviceGetHandleBySerial (str (dev_id ))
378- case NVML_ID_Types .NVML_ID_UUID :
358+ case NVML_IDs .NVML_ID_UUID :
379359 self .device_handle = nvml .nvmlDeviceGetHandleByUUID (str (dev_id ))
380- case NVML_ID_Types .NVML_ID_INDEX :
360+ case NVML_IDs .NVML_ID_INDEX :
381361 self .device_handle = nvml .nvmlDeviceGetHandleByIndex (int (dev_id ))
382362 except nvml .NVMLError as e :
383363 raise RuntimeError (f"Could not get device with { str (id_type )} { dev_id } : { e } " )
@@ -443,15 +423,23 @@ def log(self, timeout: int = 60, logfile: Path = None, finished_fn: Callable[[],
443423 for key , value in log_data .items ()}
444424
445425 pprint (log_data )
446- # TODO: Log the data into a file
426+
427+ if self .logfile :
428+ with open (self .logfile , "w" ) as f :
429+ json .dump (log_data , f )
430+
431+ return log_data
447432
448- def parse_log (self ):
449- # TODO: Csv read in log
450- pass
433+ @staticmethod
434+ def parse_log (logfile ):
435+ with open (logfile , "r" ) as f :
436+ log_data = json .load (f )
437+
438+ return log_data
451439
def main():
    """Open GPU 0 by index, print the device list and run one logging pass."""
    gpu_source = NvidiaML()

    # Select the first device by its NVML index
    gpu_source.open_device(0, NVML_IDs.NVML_ID_INDEX)

    gpu_source.list_devices(print_dev=True)
    gpu_source.log(timeout=10)
457445
0 commit comments