This module contains a set of helper functions specifically for auto-tuning codes
for energy efficiency.
"""
5+ from collections import OrderedDict
56import numpy as np
67import math
78from scipy import optimize
89
910from kernel_tuner import tune_kernel
10- from kernel_tuner .nvml import nvml , NVMLObserver
11+ from kernel_tuner .nvml import NVMLObserver , get_nvml_gr_clocks , get_idle_power
12+
13+ try :
14+ import pycuda .driver as drv
15+ except ImportError :
16+ pass
1117
1218fp32_kernel_string = """
1319__device__ void fp32_n_8(
3642}
3743"""
3844
39- def get_frequency_power_relation_fp32 (device , n_samples = 10 , use_locked_clocks = False , nvidia_smi_fallback = None ):
45+ def get_frequency_power_relation_fp32 (device , n_samples = 10 , nvidia_smi_fallback = None , use_locked_clocks = False ):
4046 """ Use NVML and PyCUDA with a synthetic kernel to obtain samples of frequency-power pairs """
4147
4248 if drv is None :
@@ -46,17 +52,17 @@ def get_frequency_power_relation_fp32(device, n_samples=10, use_locked_clocks=Fa
4652 drv .init ()
4753 dev = drv .Device (device )
4854 device_name = dev .name ().replace (' ' , '_' )
49- multiprocessor_count = dev .get_attribute (
50- drv .device_attribute .MULTIPROCESSOR_COUNT )
51- max_block_dim_x = dev .get_attribute (drv .device_attribute .MAX_BLOCK_DIM_X )
55+ multiprocessor_count = int ( dev .get_attribute (
56+ drv .device_attribute .MULTIPROCESSOR_COUNT ))
57+ max_block_dim_x = int ( dev .get_attribute (drv .device_attribute .MAX_BLOCK_DIM_X ) )
5258
5359 # kernel arguments
5460 data_size = (multiprocessor_count , max_block_dim_x )
5561 data = np .random .random (np .prod (data_size )).astype (float )
5662 arguments = [data ]
5763
5864 # setup clocks
59- nvml_gr_clocks = get_nvml_gr_clocks (device , n = n_samples )
65+ nvml_gr_clocks = get_nvml_gr_clocks (device , n = n_samples , quiet = True )
6066
6167 # idle power
6268 power_idle = get_idle_power (device )
@@ -68,17 +74,19 @@ def get_frequency_power_relation_fp32(device, n_samples=10, use_locked_clocks=Fa
6874 tune_params ["nr_inner" ] = [1024 ]
6975 tune_params .update (nvml_gr_clocks )
7076
77+ tune_params ["nvml_gr_clock" ] = [int (c ) for c in tune_params ["nvml_gr_clock" ]]
78+
7179 # metrics
7280 metrics = OrderedDict ()
7381 metrics ["f" ] = lambda p : p ["core_freq" ]
7482
7583 nvmlobserver = NVMLObserver (
76- ["core_freq" , "nvml_power" ], device = device , nvidia_smi_fallback = nvidia_smi_fallback )
84+ ["core_freq" , "nvml_power" ], device = device , nvidia_smi_fallback = nvidia_smi_fallback , use_locked_clocks = use_locked_clocks )
7785
7886 results , _ = tune_kernel ("fp32_kernel" , fp32_kernel_string , problem_size = (multiprocessor_count , 64 ),
7987 arguments = arguments , tune_params = tune_params , observers = [nvmlobserver ],
8088 verbose = False , quiet = True , metrics = metrics , iterations = 10 ,
81- grid_div_x = [], grid_div_y = [])
89+ grid_div_x = [], grid_div_y = [], cache = f"synthetic_fp32_cache_ { device_name } .json" )
8290
8391 freqs = np .array ([res ["core_freq" ] for res in results ])
8492 nvml_power = np .array ([res ["nvml_power" ] for res in results ])
@@ -172,27 +180,27 @@ def create_performance_frequency_model(device=0, n_samples=10, verbose=False, nv
172180 :rtype: float
173181
174182 """
175- freqs , nvml_power = get_frequency_power_relation (device , n_samples , nvidia_smi_fallback , use_locked_clocks )
183+ freqs , nvml_power = get_frequency_power_relation_fp32 (device , n_samples , nvidia_smi_fallback , use_locked_clocks )
176184
177185 if verbose :
178186 print ("Clock frequencies:" , freqs .tolist ())
179187 print ("Power consumption:" , nvml_power .tolist ())
180188
181- ridge_frequency , fitted_params , scaling = fit_model (freqs , nvml_power )
189+ ridge_frequency , fitted_params , scaling = fit_performance_frequency_model (freqs , nvml_power )
182190
183191 if verbose :
184192 print (f"Modelled most energy efficient frequency: { ridge_frequency } MHz" )
185193
186- all_frequencies = np .array (get_nvml_gr_clocks (device )['nvml_gr_clock' ])
194+ all_frequencies = np .array (get_nvml_gr_clocks (device , quiet = True )['nvml_gr_clock' ])
187195 ridge_frequency_final = all_frequencies [np .argmin (abs (all_frequencies - ridge_frequency ))]
188196
189197 if verbose :
190- print (f"Closest configurable most energy efficient frequency: { ridge_frequency2 } MHz" )
198+ print (f"Closest configurable most energy efficient frequency: { ridge_frequency_final } MHz" )
191199
192- return ridge_frequency_final , fitted_params , scaling
200+ return ridge_frequency_final , freqs , nvml_power , fitted_params , scaling
193201
194202
195- def get_frequency_range_around_ridge (ridge_frequency , all_frequencies , freq_range , number_of_freqs , verbose = False )
203+ def get_frequency_range_around_ridge (ridge_frequency , all_frequencies , freq_range , number_of_freqs , verbose = False ):
196204 """ Return number_of_freqs frequencies in a freq_range percentage around the ridge_frequency from among all_frequencies """
197205
198206 min_freq = 1e-2 * (100 - int (freq_range )) * ridge_frequency
0 commit comments