33for energy efficiency.
44"""
55from collections import OrderedDict
6+
67import numpy as np
7- import math
8+ from kernel_tuner import tune_kernel , util
9+ from kernel_tuner .nvml import NVMLObserver , get_nvml_gr_clocks
810from scipy import optimize
911
10- from kernel_tuner import tune_kernel
11- from kernel_tuner .nvml import NVMLObserver , get_nvml_gr_clocks , get_idle_power
12-
1312try :
1413 import pycuda .driver as drv
1514except ImportError :
4241}
4342"""
4443
45- def get_frequency_power_relation_fp32 (device , n_samples = 10 , nvidia_smi_fallback = None , use_locked_clocks = False ):
44+ def get_frequency_power_relation_fp32 (device , n_samples = 10 , nvidia_smi_fallback = None , use_locked_clocks = False , cache = None ):
4645 """ Use NVML and PyCUDA with a synthetic kernel to obtain samples of frequency-power pairs """
4746
48- if drv is None :
49- raise ImportError ("get_ridge_point_gr_frequency requires PyCUDA" )
50-
5147 # get some numbers about the device
52- drv .init ()
53- dev = drv .Device (device )
54- device_name = dev .name ().replace (' ' , '_' )
55- multiprocessor_count = dev .get_attribute (
56- drv .device_attribute .MULTIPROCESSOR_COUNT )
57- max_block_dim_x = dev .get_attribute (drv .device_attribute .MAX_BLOCK_DIM_X )
48+ if not cache :
49+ if drv is None :
50+ raise ImportError ("get_ridge_point_gr_frequency requires PyCUDA" )
51+
52+ drv .init ()
53+ dev = drv .Device (device )
54+ device_name = dev .name ().replace (' ' , '_' )
55+ multiprocessor_count = dev .get_attribute (drv .device_attribute .MULTIPROCESSOR_COUNT )
56+ max_block_dim_x = dev .get_attribute (drv .device_attribute .MAX_BLOCK_DIM_X )
57+
58+ # setup clocks
59+ nvml_gr_clocks = get_nvml_gr_clocks (device , n = n_samples , quiet = True )
60+
61+ else :
62+ cached_data = util .read_cache (cache , open_cache = False )
63+ multiprocessor_count = cached_data ["problem_size" ][0 ]
64+ max_block_dim_x = cached_data ["tune_params" ]["block_size_x" ][0 ]
65+ nvml_gr_clocks = cached_data ["tune_params" ]
5866
5967 # kernel arguments
6068 data_size = (multiprocessor_count , max_block_dim_x )
6169 data = np .random .random (np .prod (data_size )).astype (np .float32 )
6270 arguments = [data ]
6371
64- # setup clocks
65- nvml_gr_clocks = get_nvml_gr_clocks (device , n = n_samples , quiet = True )
66-
6772 # setup tunable parameters
6873 tune_params = OrderedDict ()
6974 tune_params ["block_size_x" ] = [max_block_dim_x ]
@@ -81,7 +86,7 @@ def get_frequency_power_relation_fp32(device, n_samples=10, nvidia_smi_fallback=
8186 results , _ = tune_kernel ("fp32_kernel" , fp32_kernel_string , problem_size = (multiprocessor_count , 64 ),
8287 arguments = arguments , tune_params = tune_params , observers = [nvmlobserver ],
8388 verbose = False , quiet = True , metrics = metrics , iterations = 10 ,
84- grid_div_x = [], grid_div_y = [], cache = f"synthetic_fp32_cache_{ device_name } .json" )
89+ grid_div_x = [], grid_div_y = [], cache = cache or f"synthetic_fp32_cache_{ device_name } .json" )
8590
8691 freqs = np .array ([res ["core_freq" ] for res in results ])
8792 nvml_power = np .array ([res ["nvml_power" ] for res in results ])
@@ -142,7 +147,7 @@ def fit_power_frequency_model(freqs, nvml_power):
142147 return clock_threshold + clock_min , fit_parameters , scale_parameters
143148
144149
145- def create_power_frequency_model (device = 0 , n_samples = 10 , verbose = False , nvidia_smi_fallback = None , use_locked_clocks = False ):
150+ def create_power_frequency_model (device = 0 , n_samples = 10 , verbose = False , nvidia_smi_fallback = None , use_locked_clocks = False , cache = None ):
146151 """ Calculate the most energy-efficient clock frequency of device
147152
148153 This function uses a performance model to fit the power-frequency curve
@@ -169,11 +174,14 @@ def create_power_frequency_model(device=0, n_samples=10, verbose=False, nvidia_s
169174 :param use_locked_clocks: Whether to prefer locked clocks over application clocks
170175 :type use_locked_clocks: bool
171176
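177+ :param cache: Name of the cache file used to store measurements; when given, device properties and clock settings are read from this file instead of being queried from the GPU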
178+ :type cache: string
179+
172180 :returns: The clock frequency closest to the ridge point, fitted parameters, scaling
173181 :rtype: tuple
174182
175183 """
176- freqs , nvml_power = get_frequency_power_relation_fp32 (device , n_samples , nvidia_smi_fallback , use_locked_clocks )
184+ freqs , nvml_power = get_frequency_power_relation_fp32 (device , n_samples , nvidia_smi_fallback , use_locked_clocks , cache = cache )
177185
178186 if verbose :
179187 print ("Clock frequencies:" , freqs .tolist ())
0 commit comments