Skip to content

Commit ebed827

Browse files
add support for using cache files in power frequency model
1 parent 4f245c6 commit ebed827

File tree

2 files changed

+40
-20
lines changed

2 files changed

+40
-20
lines changed

kernel_tuner/energy/energy.py

Lines changed: 28 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,12 @@
33
for energy efficiency.
44
"""
55
from collections import OrderedDict
6+
67
import numpy as np
7-
import math
8+
from kernel_tuner import tune_kernel, util
9+
from kernel_tuner.nvml import NVMLObserver, get_nvml_gr_clocks
810
from scipy import optimize
911

10-
from kernel_tuner import tune_kernel
11-
from kernel_tuner.nvml import NVMLObserver, get_nvml_gr_clocks, get_idle_power
12-
1312
try:
1413
import pycuda.driver as drv
1514
except ImportError:
@@ -42,28 +41,34 @@
4241
}
4342
"""
4443

45-
def get_frequency_power_relation_fp32(device, n_samples=10, nvidia_smi_fallback=None, use_locked_clocks=False):
44+
def get_frequency_power_relation_fp32(device, n_samples=10, nvidia_smi_fallback=None, use_locked_clocks=False, cache=None):
4645
""" Use NVML and PyCUDA with a synthetic kernel to obtain samples of frequency-power pairs """
4746

48-
if drv is None:
49-
raise ImportError("get_ridge_point_gr_frequency requires PyCUDA")
50-
5147
# get some numbers about the device
52-
drv.init()
53-
dev = drv.Device(device)
54-
device_name = dev.name().replace(' ', '_')
55-
multiprocessor_count = dev.get_attribute(
56-
drv.device_attribute.MULTIPROCESSOR_COUNT)
57-
max_block_dim_x = dev.get_attribute(drv.device_attribute.MAX_BLOCK_DIM_X)
48+
if not cache:
49+
if drv is None:
50+
raise ImportError("get_ridge_point_gr_frequency requires PyCUDA")
51+
52+
drv.init()
53+
dev = drv.Device(device)
54+
device_name = dev.name().replace(' ', '_')
55+
multiprocessor_count = dev.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT)
56+
max_block_dim_x = dev.get_attribute(drv.device_attribute.MAX_BLOCK_DIM_X)
57+
58+
# setup clocks
59+
nvml_gr_clocks = get_nvml_gr_clocks(device, n=n_samples, quiet=True)
60+
61+
else:
62+
cached_data = util.read_cache(cache, open_cache=False)
63+
multiprocessor_count = cached_data["problem_size"][0]
64+
max_block_dim_x = cached_data["tune_params"]["block_size_x"][0]
65+
nvml_gr_clocks = cached_data["tune_params"]
5866

5967
# kernel arguments
6068
data_size = (multiprocessor_count, max_block_dim_x)
6169
data = np.random.random(np.prod(data_size)).astype(np.float32)
6270
arguments = [data]
6371

64-
# setup clocks
65-
nvml_gr_clocks = get_nvml_gr_clocks(device, n=n_samples, quiet=True)
66-
6772
# setup tunable parameters
6873
tune_params = OrderedDict()
6974
tune_params["block_size_x"] = [max_block_dim_x]
@@ -81,7 +86,7 @@ def get_frequency_power_relation_fp32(device, n_samples=10, nvidia_smi_fallback=
8186
results, _ = tune_kernel("fp32_kernel", fp32_kernel_string, problem_size=(multiprocessor_count, 64),
8287
arguments=arguments, tune_params=tune_params, observers=[nvmlobserver],
8388
verbose=False, quiet=True, metrics=metrics, iterations=10,
84-
grid_div_x=[], grid_div_y=[], cache=f"synthetic_fp32_cache_{device_name}.json")
89+
grid_div_x=[], grid_div_y=[], cache=cache or f"synthetic_fp32_cache_{device_name}.json")
8590

8691
freqs = np.array([res["core_freq"] for res in results])
8792
nvml_power = np.array([res["nvml_power"] for res in results])
@@ -142,7 +147,7 @@ def fit_power_frequency_model(freqs, nvml_power):
142147
return clock_threshold + clock_min, fit_parameters, scale_parameters
143148

144149

145-
def create_power_frequency_model(device=0, n_samples=10, verbose=False, nvidia_smi_fallback=None, use_locked_clocks=False):
150+
def create_power_frequency_model(device=0, n_samples=10, verbose=False, nvidia_smi_fallback=None, use_locked_clocks=False, cache=None):
146151
""" Calculate the most energy-efficient clock frequency of device
147152
148153
This function uses a performance model to fit the power-frequency curve
@@ -169,11 +174,14 @@ def create_power_frequency_model(device=0, n_samples=10, verbose=False, nvidia_s
169174
:param use_locked_clocks: Whether to prefer locked clocks over application clocks
170175
:type use_locked_clocks: bool
171176
177+
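:param cache: Name of the cache file used to store measurements. When given, device properties and clocks are read from this file instead of being queried through PyCUDA and NVML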
178+
:type cache: string
179+
172180
:returns: The clock frequency closest to the ridge point, fitted parameters, scaling
173181
:rtype: tuple
174182
175183
"""
176-
freqs, nvml_power = get_frequency_power_relation_fp32(device, n_samples, nvidia_smi_fallback, use_locked_clocks)
184+
freqs, nvml_power = get_frequency_power_relation_fp32(device, n_samples, nvidia_smi_fallback, use_locked_clocks, cache=cache)
177185

178186
if verbose:
179187
print("Clock frequencies:", freqs.tolist())

test/test_energy.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import os
2+
3+
from kernel_tuner.energy import energy
4+
5+
6+
cache_filename = os.path.dirname(os.path.realpath(__file__)) + "/synthetic_fp32_cache_NVIDIA_RTX_A4000.json"
7+
8+
def test_create_power_frequency_model():
9+
10+
ridge_frequency, freqs, nvml_power, fitted_params, scaling = energy.create_power_frequency_model(cache=cache_filename)
11+
assert ridge_frequency == 1350
12+

0 commit comments

Comments
 (0)