Skip to content

Commit f7bbc12

Browse files
committed
Log read/write + automatic nvml enum generation
1 parent 5643e56 commit f7bbc12

File tree

1 file changed

+100
-112
lines changed

1 file changed

+100
-112
lines changed

scratch.py

Lines changed: 100 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -4,57 +4,62 @@
44
import pynvml as nvml
55
import time
66
import inspect
7+
import json
78
from pprint import pprint
8-
99
from collections.abc import Callable
1010

1111
sys.path.insert(0, "./experiment-runner/Plugins/Profilers/")
1212
from DataSource import CLISource, ParameterDict, DeviceSource
1313

14-
nvml_clock_strings = {
15-
nvml.NVML_CLOCK_GRAPHICS: "GraphicsClock",
16-
nvml.NVML_CLOCK_SM: "StramingMultiproessorClock",
17-
nvml.NVML_CLOCK_MEM: "MemoryClock",
18-
nvml.NVML_CLOCK_VIDEO: "VideoClock",
19-
}
14+
# Define a custom enum wrapper to help generating enums from the nvml enums
15+
class NVML_EnumMeta(enum.EnumType):
16+
def __call__(cls, *args, prefix=None, suffix=None, **kwargs):
17+
# We are not creating a new enum here, execute normally
18+
if prefix is None and suffix is None:
19+
return super().__call__(*args, **kwargs)
20+
21+
# Interpose on enum creation to have nice strings
22+
assert(prefix != None or suffix != None)
2023

21-
nvml_powersource_strings = {
22-
nvml.NVML_POWER_SOURCE_AC: "AC",
23-
nvml.NVML_POWER_SOURCE_BATTERY: "Battery",
24-
nvml.NVML_POWER_SOURCE_UNDERSIZED: "Undersized",
25-
}
24+
cls.name_prefix = prefix
25+
cls.name_suffix = suffix
26+
27+
members = {name: val
28+
for name, val in inspect.getmembers(nvml)
29+
if (True if not prefix else name.startswith(prefix))
30+
and (True if not suffix else name.endswith(suffix))}
2631

27-
nvml_arch_strings = {
28-
nvml.NVML_DEVICE_ARCH_KEPLER: "Kepler",
29-
nvml.NVML_DEVICE_ARCH_MAXWELL: "Maxwell",
30-
nvml.NVML_DEVICE_ARCH_PASCAL: "Pascal",
31-
nvml.NVML_DEVICE_ARCH_VOLTA: "Volta",
32-
nvml.NVML_DEVICE_ARCH_TURING: "Turing",
33-
nvml.NVML_DEVICE_ARCH_AMPERE: "Ampere",
34-
nvml.NVML_DEVICE_ARCH_ADA: "Ada",
35-
nvml.NVML_DEVICE_ARCH_HOPPER: "Hopper",
36-
nvml.NVML_DEVICE_ARCH_BLACKWELL:"Blackwell",
37-
nvml.NVML_DEVICE_ARCH_T23X: "Orin",
38-
nvml.NVML_DEVICE_ARCH_UNKNOWN: "Unknown"
39-
}
32+
return super().__call__(*args, names=members, **kwargs)
4033

41-
# Need these for legacy temperature support
42-
nvml_tempthr_strings = {
43-
nvml.NVML_TEMPERATURE_THRESHOLD_SHUTDOWN: "Shutdown Temp",
44-
nvml.NVML_TEMPERATURE_THRESHOLD_SLOWDOWN: "Slowdown Temp",
45-
nvml.NVML_TEMPERATURE_THRESHOLD_GPU_MAX: "GPU Max Temp",
46-
nvml.NVML_TEMPERATURE_THRESHOLD_MEM_MAX: "Memory Max Temp",
47-
nvml.NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MIN: "Acoustic Min Temp",
48-
nvml.NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_CURR: "Acoustic Current Temp",
49-
nvml.NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MAX: "Acoustic Max Temp",
50-
}
34+
class NVML_Enum(enum.Enum, metaclass=NVML_EnumMeta):
    """Base class for enums generated from nvml constants.

    Overrides ``name`` so members report a short, lower-case name with the
    enum's ``name_prefix`` / ``name_suffix`` removed. Those two attributes are
    expected to be set by ``NVML_EnumMeta`` when the enum is created; if they
    are missing the full member name is used.
    """

    @property
    def name(self):
        """Return the member name without prefix/suffix, lower-cased."""
        name = self._name_
        prefix = getattr(self, "name_prefix", None)
        suffix = getattr(self, "name_suffix", None)

        # removeprefix/removesuffix drop the exact affix once. lstrip/rstrip
        # treat their argument as a *set of characters* and would also eat
        # leading/trailing letters of the remaining name (e.g. the "M" of
        # "MEM" after "NVML_CLOCK_").
        if prefix:
            name = name.removeprefix(prefix)
        if suffix:
            name = name.removesuffix(suffix)

        return name.lower()
5144

52-
class NVML_ID_Types(enum.Enum):
45+
# There are a lot of these, extract them automatically.
# Each enum collects the nvml constants whose names match the given
# prefix (and suffix, where one is passed).
NVML_Field = NVML_Enum("NVML_Field", prefix="NVML_FI_DEV_")
NVML_Sample = NVML_Enum("NVML_Sample", prefix="NVML_", suffix="_SAMPLES")
NVML_Clock = NVML_Enum("NVML_Clock", prefix="NVML_CLOCK_")
NVML_PowerSource = NVML_Enum("NVML_PowerSource", prefix="NVML_POWER_SOURCE_")
NVML_Arch = NVML_Enum("NVML_Arch", prefix="NVML_DEVICE_ARCH_")
NVML_TempThreshold = NVML_Enum("NVML_TempThreshold", prefix="NVML_TEMPERATURE_THRESHOLD_")
NVML_API_Restriction = NVML_Enum("NVML_API_Restriction", prefix="NVML_RESTRICTED_API_SET_")
NVML_Enable_State = NVML_Enum("NVML_Enable_State", prefix="NVML_FEATURE_")
NVML_Compute_Mode = NVML_Enum("NVML_Compute_Mode", prefix="NVML_COMPUTEMODE_")
NVML_GPU_Operation_Mode = NVML_Enum("NVML_GPU_Operation_Mode", prefix="NVML_GOM_")
56+
57+
class NVML_IDs(enum.Enum):
5358
NVML_ID_INDEX = 0
5459
NVML_ID_SERIAL = 1
5560
NVML_ID_UUID = 2
5661

57-
class NVML_Query_Types(enum.Enum):
62+
class NVML_Query(enum.Enum):
5863
NVML_POWER_USAGE = "PowerUsage"
5964
NVML_TOTAL_ENERGY = "TotalEnergyConsumption"
6065
NVML_TEMPERATURE = "Temperature"
@@ -63,42 +68,6 @@ class NVML_Query_Types(enum.Enum):
6368
NVML_PSTATE = "PerformanceState"
6469
NVML_CLOCK = "GetClockInfo"
6570

66-
class NVML_Sample_Types(enum.Enum):
67-
NVML_TOTAL_POWER = nvml.NVML_TOTAL_POWER_SAMPLES
68-
NVML_GPU_UTILIZATION = nvml.NVML_GPU_UTILIZATION_SAMPLES
69-
NVML_MEM_UTILIZATION = nvml.NVML_MEMORY_UTILIZATION_SAMPLES
70-
NVML_ENC_UTILIZATION = nvml.NVML_ENC_UTILIZATION_SAMPLES
71-
NVML_DEC_UTILIZATION = nvml.NVML_DEC_UTILIZATION_SAMPLES
72-
NVML_PROCESSOR_CLK = nvml.NVML_PROCESSOR_CLK_SAMPLES
73-
NVML_MEMORY_CLK = nvml.NVML_MEMORY_CLK_SAMPLES
74-
NVML_MODULE_POWER = nvml.NVML_MODULE_POWER_SAMPLES
75-
76-
# There are a lot of these, extract then automatically
77-
NVML_Field_Types = enum.Enum("NVML_Field_Types",
78-
{
79-
name: val
80-
for name, val in inspect.getmembers(nvml)
81-
if name.startswith("NVML_FI_")
82-
})
83-
84-
class NVML_Dev_Config_Types(enum.Enum):
85-
pass
86-
87-
# These are the setting functions
88-
dev_commands = ["APIRestriction",
89-
"ApplicationsClocks",
90-
"ComputeMode",
91-
"ConfComputeUnprotectedMemSize",
92-
"EccMode",
93-
"FanSpeed",
94-
"GpcClkVfOffset",
95-
"GpuLockedClocks",
96-
"GpuOperationMode",
97-
"MemClkVfOffset",
98-
"MemoryLockedClocks",
99-
"PersistenceMode",
100-
"PowerManagementLimit"]
101-
10271
# These are the static query functions
10372
config_stats = ["Name",
10473
"UUID",
@@ -130,16 +99,33 @@ class NVML_Dev_Config_Types(enum.Enum):
13099
"MaxClockInfo",
131100
"MinMaxClockOfPState"]
132101

102+
# Device setting names mapped to the type(s) associated with each setting.
# NOTE(review): presumably these are the argument types of the matching
# nvmlDeviceSet* call (a tuple meaning several arguments) - confirm against
# the code that consumes this table.
NVML_CONFIG_PARAMETERS = {
    "APIRestriction": (NVML_API_Restriction, NVML_Enable_State),
    "ApplicationsClocks": (int, int),
    "ComputeMode": NVML_Compute_Mode,
    "ConfComputeUnprotectedMemSize": int,
    "EccMode": NVML_Enable_State,
    "FanSpeed_v2": (int, int),
    "GpcClkVfOffset": int,
    "GpuLockedClocks": (int, int),
    "GpuOperationMode": NVML_GPU_Operation_Mode,
    "MemClkVfOffset": int,
    "MemoryLockedClocks": (int, int),
    "PersistenceMode": NVML_Enable_State,
    "PowerManagementLimit": int,
}
117+
133118
class NvidiaML(DeviceSource):
134119
source_name = "Nvidia Management Library"
135120
supported_platforms = ["Linux", "Windows"]
136121

137122
def __init__(self,
138123
sample_frequency: int = 5000,
139124
out_file: Path = "nvml_out.csv",
140-
queries: list[NVML_Query_Types] = [],
141-
fields: list[NVML_Field_Types] = [],
142-
samples: list[NVML_Sample_Types] = []):
125+
queries: list[NVML_Query] = [NVML_Query.NVML_UTILIZATION,
126+
NVML_Query.NVML_POWER_USAGE],
127+
fields: list[NVML_Field] = [],
128+
samples: list[NVML_Sample] = []):
143129
super().__init__()
144130

145131
# Initialize an instance of the library
@@ -149,21 +135,15 @@ def __init__(self,
149135
self.sample_frequency = sample_frequency
150136
self.logfile = out_file
151137

152-
# Configure a few default stats to collect
138+
# Configure which measurements will be made by nvml
153139
self.measurements = {
154-
"queries": [NVML_Query_Types.NVML_UTILIZATION,
155-
NVML_Query_Types.NVML_PSTATE,
156-
NVML_Query_Types.NVML_TEMPERATURE,
157-
NVML_Query_Types.NVML_POWER_USAGE],
158-
"fields": [NVML_Field_Types. NVML_FI_DEV_ENERGY,
159-
NVML_Field_Types.NVML_FI_DEV_MEMORY_TEMP],
160-
"samples": [NVML_Sample_Types.NVML_PROCESSOR_CLK,
161-
NVML_Sample_Types.NVML_MEMORY_CLK,
162-
NVML_Sample_Types.NVML_MODULE_POWER]
140+
"queries": queries,
141+
"fields": fields,
142+
"samples": samples
163143
}
164144

165145
# This records the latest timestamp per sample type
166-
self.latest_timestamp = {sample.name: 0 for sample in NVML_Sample_Types}
146+
self.latest_timestamp = {sample.name: 0 for sample in NVML_Sample}
167147

168148
def _print_stat(self, stat, value, unit=None):
169149
if unit is not None:
@@ -214,9 +194,9 @@ def _query_fields(self, handle, field_ids=[]):
214194
ret = {}
215195
for f_value in values:
216196
if f_value.nvmlReturn != nvml.NVML_SUCCESS:
217-
ret[NVML_Field_Types(f_value.fieldId).name] = nvml.NVMLError(f_value.nvmlReturn)
197+
ret[NVML_Field(f_value.fieldId).name] = nvml.NVMLError(f_value.nvmlReturn)
218198
else:
219-
ret[NVML_Field_Types(f_value.fieldId).name] = self._parse_field_value(f_value)
199+
ret[NVML_Field(f_value.fieldId).name] = self._parse_field_value(f_value)
220200

221201
return ret
222202

@@ -243,8 +223,8 @@ def _query_device(self, handle, query_type):
243223
ret[fan] = func(handle, fan)
244224
case "GetClockInfo":
245225
ret = {}
246-
for val, string in nvml_clock_strings.items():
247-
ret[string] = func(handle, val)
226+
for clk_type in NVML_Clock:
227+
ret[clk_type.name] = func(handle, clk_type.value)
248228
case "TemperatureThreshold":
249229
ret = self._query_fields(handle, [nvml.NVML_FI_DEV_TEMPERATURE_MEM_MAX_TLIMIT,
250230
nvml.NVML_FI_DEV_TEMPERATURE_GPU_MAX_TLIMIT,
@@ -253,29 +233,29 @@ def _query_device(self, handle, query_type):
253233

254234
# The new method has failed, revert to deprecated features
255235
if ret == {}:
256-
for val, string in nvml_tempthr_strings.items():
236+
for temp_type in NVML_TempThreshold:
257237
try:
258-
ret[string] = func(handle, val)
238+
ret[temp_type.name] = func(handle, temp_type.value)
259239
except:
260240
pass
261241
case "Architecture":
262-
ret = nvml_arch_strings[func(handle)]
242+
ret = NVML_Arch(func(handle)).name
263243
case "MinMaxClockOfPState":
264244
ret = {}
265245
for p_state in nvml.nvmlDeviceGetSupportedPerformanceStates(handle):
266246
ret[p_state] = {}
267-
for val, string in nvml_clock_strings.items():
268-
ret[p_state][string] = func(handle, pstate=p_state, clockType=val)
247+
for clk_type in NVML_Clock:
248+
ret[p_state][clk_type.name] = func(handle, pstate=p_state, clockType=clk_type.value)
269249
case "TargetFanSpeed":
270250
ret = {}
271251
for i in range(0, nvml.nvmlDeviceGetNumFans(handle)):
272252
ret[i] = func(handle, i)
273253
case "PowerSource":
274-
ret = nvml_powersource_strings[func(handle)]
254+
ret = NVML_PowerSource(func(handle)).name
275255
case "MaxClockInfo" | "MaxCustomerBoostClock":
276256
ret = {}
277-
for val, string in nvml_clock_strings.items():
278-
ret[string] = func(handle, val)
257+
for clk_type in NVML_Clock:
258+
ret[clk_type.name] = func(handle, clk_type.value)
279259
case _:
280260
ret = func(handle)
281261

@@ -285,9 +265,9 @@ def _query_device(self, handle, query_type):
285265
return ret
286266

287267
# Very important function, this sets what stats are measured when log is called
288-
def set_measurements(self, samples: list[NVML_Sample_Types] = [],
289-
fields: list[NVML_Field_Types] = [],
290-
queries: list[NVML_Query_Types] = []):
268+
def set_measurements(self, samples: list[NVML_Sample] = [],
269+
fields: list[NVML_Field] = [],
270+
queries: list[NVML_Query] = []):
291271

292272
# Set new measurements if present
293273
if len(samples) > 0:
@@ -365,19 +345,19 @@ def list_devices(self, print_dev=False):
365345

366346
return devices
367347

368-
def open_device(self, dev_id, id_type: NVML_ID_Types):
348+
def open_device(self, dev_id, id_type: NVML_IDs):
369349
# A bit more descriptive than the nvidia errors
370-
if id_type == NVML_ID_Types.NVML_ID_INDEX and \
350+
if id_type == NVML_IDs.NVML_ID_INDEX and \
371351
int(dev_id) >= nvml.nvmlDeviceGetCount():
372352
raise RuntimeError(f"GPU device index ({int(dev_id)}) larger than the number of devices {nvml.nvmlDeviceGetCount()}")
373353

374354
try:
375355
match id_type:
376-
case NVML_ID_Types.NVML_ID_SERIAL:
356+
case NVML_IDs.NVML_ID_SERIAL:
377357
self.device_handle = nvml.nvmlDeviceGetHandleBySerial(str(dev_id))
378-
case NVML_ID_Types.NVML_ID_UUID:
358+
case NVML_IDs.NVML_ID_UUID:
379359
self.device_handle = nvml.nvmlDeviceGetHandleByUUID(str(dev_id))
380-
case NVML_ID_Types.NVML_ID_INDEX:
360+
case NVML_IDs.NVML_ID_INDEX:
381361
self.device_handle = nvml.nvmlDeviceGetHandleByIndex(int(dev_id))
382362
except nvml.NVMLError as e:
383363
raise RuntimeError(f"Could not get device with {str(id_type)} {dev_id}: {e}")
@@ -443,15 +423,23 @@ def log(self, timeout: int = 60, logfile: Path = None, finished_fn: Callable[[],
443423
for key, value in log_data.items()}
444424

445425
pprint(log_data)
446-
# TODO: Log the data into a file
426+
427+
if self.logfile:
428+
with open(self.logfile, "w") as f:
429+
json.dump(log_data, f)
430+
431+
return log_data
447432

448-
def parse_log(self):
449-
# TODO: Csv read in log
450-
pass
433+
@staticmethod
434+
def parse_log(logfile):
435+
with open(logfile, "r") as f:
436+
log_data = json.load(f)
437+
438+
return log_data
451439

452440
def main():
453441
source = NvidiaML()
454-
source.open_device(dev_id=0, id_type=NVML_ID_Types.NVML_ID_INDEX)
442+
source.open_device(dev_id=0, id_type=NVML_IDs.NVML_ID_INDEX)
455443
source.list_devices(print_dev=True)
456444
source.log(timeout=10)
457445

0 commit comments

Comments
 (0)