Skip to content

Commit 12efec0

Browse files
Merge pull request #179 from KernelTuner/energy_helpers
Energy helpers
2 parents d15dca5 + c3b5c91 commit 12efec0

File tree

12 files changed

+565
-12
lines changed

12 files changed

+565
-12
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ This project adheres to [Semantic Versioning](http://semver.org/).
66

77
### Added
88
- Support for using time_limit in simulation mode
9+
- Helper functions for energy tuning
10+
- Example to show ridge frequency and power-frequency model
911

1012
### Changed
1113
- Changed what timings are stored in cache files

README.rst

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,16 @@ If you use Kernel Tuner in research or research software, please cite the most r
172172
author={Schoonhoven, Richard and van Werkhoven, Ben and Batenburg, K Joost},
173173
journal={IEEE Transactions on Evolutionary Computation},
174174
year={2022},
175-
publisher={IEEE}
175+
publisher={IEEE},
176+
url = {https://arxiv.org/abs/2210.01465}
177+
}
178+
179+
@article{schoonhoven2022going,
180+
author = {Schoonhoven, Richard and Veenboer, Bram, and van Werkhoven, Ben and Batenburg, K Joost},
181+
title = {Going green: optimizing GPUs for energy efficiency through model-steered auto-tuning},
182+
journal = {International Workshop on Performance Modeling, Benchmarking and Simulation
183+
of High Performance Computer Systems (PMBS) at Supercomputing (SC22)},
184+
year = {2022}
176185
}
177186

178187

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
#!/usr/bin/env python
2+
"""
3+
This example demonstrates how to use the power-frequency model presented in
4+
5+
* Going green: optimizing GPUs for energy efficiency through model-steered auto-tuning
6+
R. Schoonhoven, B. Veenboer, B. van Werkhoven, K. J. Batenburg
7+
International Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS) at Supercomputing (SC22) 2022
8+
9+
to reduce the number of frequencies for GPU energy tuning.
10+
11+
In particular, this example creates a plot with the modeled power consumption vs
12+
frequency curve, highlighting the ridge frequency and the frequency range
13+
selected by the user.
14+
15+
This example requires CUDA and NVML as well as PyCuda and a CUDA-capable
16+
GPU with the ability (and permissions) to set applications clocks. GPUs
17+
that do support locked clocks but not application clocks may use the
18+
locked_clocks=True option.
19+
20+
"""
21+
import argparse
22+
from collections import OrderedDict
23+
import numpy as np
24+
import math
25+
import matplotlib.pyplot as plt
26+
from scipy import optimize
27+
import time
28+
29+
try:
30+
from pycuda import driver as drv
31+
except ImportError as e:
32+
drv = None
33+
raise e
34+
35+
from kernel_tuner.energy import energy
36+
from kernel_tuner.nvml import get_nvml_gr_clocks
37+
38+
def get_default_parser():
39+
parser = argparse.ArgumentParser(
40+
description='Find energy efficient frequencies')
41+
parser.add_argument("-d", dest="device", nargs="?",
42+
default=0, help="GPU ID to use")
43+
parser.add_argument("-s", dest="samples", nargs="?",
44+
default=10, help="Number of frequency samples")
45+
parser.add_argument("-r", dest="range", nargs="?",
46+
default=10, help="Frequency spread (10%% of 'optimum')")
47+
parser.add_argument("-n", dest="number", nargs="?", default=10,
48+
help="Maximum number of suggested frequencies")
49+
parser.add_argument("-l", dest="locked_clocks", nargs="?", default=False,
50+
help="Whether to use locked clocks over application clocks")
51+
parser.add_argument("-nsf", dest="nvidia_smi_fallback", nargs="?", default=None,
52+
help="Path to nvidia-smi as fallback when missing NVML permissions")
53+
54+
55+
return parser
56+
57+
58+
if __name__ == "__main__":
59+
parser = get_default_parser()
60+
args = parser.parse_args()
61+
62+
ridge_frequency, freqs, nvml_power, fitted_params, scaling = energy.create_power_frequency_model(device=args.device,
63+
n_samples=args.samples,
64+
verbose=True,
65+
nvidia_smi_fallback=args.nvidia_smi_fallback,
66+
use_locked_clocks=args.locked_clocks)
67+
68+
all_frequencies = np.array(get_nvml_gr_clocks(args.device, quiet=True)['nvml_gr_clock'])
69+
70+
frequency_selection = energy.get_frequency_range_around_ridge(ridge_frequency, all_frequencies, args.range, args.number, verbose=True)
71+
print(f"Search space reduction: {np.round(100 - len(frequency_selection) / len(all_frequencies) * 100, 1)} %")
72+
73+
xs = np.linspace(all_frequencies[0], all_frequencies[-1], 100)
74+
# scale to start at 0
75+
xs -= scaling[0]
76+
modelled_power = energy.estimated_power(xs, *fitted_params)
77+
# undo scaling
78+
xs += scaling[0]
79+
modelled_power *= scaling[1]
80+
81+
# Add point for ridge frequency
82+
P_ridge = energy.estimated_power([ridge_frequency - scaling[0]], *fitted_params) * scaling[1]
83+
84+
# Add the frequency range
85+
min_freq = 1e-2 * (100 - int(args.range)) * ridge_frequency
86+
max_freq = 1e-2 * (100 + int(args.range)) * ridge_frequency
87+
88+
# plot measurements with model
89+
try:
90+
import seaborn as sns
91+
sns.set_theme(style="darkgrid")
92+
sns.set_context("paper", rc={"font.size":10,
93+
"axes.titlesize":9, "axes.labelsize":12})
94+
fig, ax = plt.subplots()
95+
except ImportError:
96+
fig, ax = plt.subplots()
97+
plt.grid()
98+
99+
plt.scatter(x=freqs, y=nvml_power, label='NVML measurements')
100+
plt.scatter(x=ridge_frequency, y=P_ridge, color='g',
101+
label='Ridge frequency (MHz)')
102+
plt.plot(xs, modelled_power, label='Modelled power consumption')
103+
ax.axvspan(min_freq, max_freq, alpha=0.15, color='green',
104+
label='Recommended frequency range')
105+
plt.title('GPU modelled power consumption', size=18)
106+
plt.xlabel('Core frequency (MHz)')
107+
plt.ylabel('Power consumption (W)')
108+
plt.legend()
109+
plt.show()
110+
111+
plt.savefig("GPU_power_consumption_model.pdf")

kernel_tuner/energy/__init__.py

Whitespace-only changes.

kernel_tuner/energy/energy.py

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
"""
2+
This module contains a set of helper functions specifically for auto-tuning codes
3+
for energy efficiency.
4+
"""
5+
from collections import OrderedDict
6+
7+
import numpy as np
8+
from kernel_tuner import tune_kernel, util
9+
from kernel_tuner.nvml import NVMLObserver, get_nvml_gr_clocks
10+
from scipy import optimize
11+
12+
try:
13+
import pycuda.driver as drv
14+
except ImportError:
15+
pass
16+
17+
fp32_kernel_string = """
18+
__device__ void fp32_n_8(
19+
float2& a, float2& b, float2& c)
20+
{
21+
// Perform nr_inner * 4 fma
22+
for (int i = 0; i < nr_inner; i++) {
23+
a.x += b.x * c.x;
24+
a.x -= b.y * c.y;
25+
a.y += b.x * c.y;
26+
a.y += b.y * c.x;
27+
}
28+
}
29+
30+
__global__ void fp32_kernel(float *ptr)
31+
{
32+
float2 a = make_float2(threadIdx.x, threadIdx.x + 1);
33+
float2 b = make_float2(1, 2);
34+
float2 c = make_float2(3, 4);
35+
36+
for (int i = 0; i < nr_outer; i++) {
37+
fp32_n_8(a, b, c);
38+
}
39+
40+
ptr[blockIdx.x * blockDim.x + threadIdx.x] = a.x + a.y;
41+
}
42+
"""
43+
44+
def get_frequency_power_relation_fp32(device, n_samples=10, nvidia_smi_fallback=None, use_locked_clocks=False, cache=None):
45+
""" Use NVML and PyCUDA with a synthetic kernel to obtain samples of frequency-power pairs """
46+
47+
# get some numbers about the device
48+
if not cache:
49+
if drv is None:
50+
raise ImportError("get_ridge_point_gr_frequency requires PyCUDA")
51+
52+
drv.init()
53+
dev = drv.Device(device)
54+
device_name = dev.name().replace(' ', '_')
55+
multiprocessor_count = dev.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT)
56+
max_block_dim_x = dev.get_attribute(drv.device_attribute.MAX_BLOCK_DIM_X)
57+
58+
# setup clocks
59+
nvml_gr_clocks = get_nvml_gr_clocks(device, n=n_samples, quiet=True)
60+
61+
else:
62+
cached_data = util.read_cache(cache, open_cache=False)
63+
multiprocessor_count = cached_data["problem_size"][0]
64+
max_block_dim_x = cached_data["tune_params"]["block_size_x"][0]
65+
nvml_gr_clocks = cached_data["tune_params"]
66+
67+
# kernel arguments
68+
data_size = (multiprocessor_count, max_block_dim_x)
69+
data = np.random.random(np.prod(data_size)).astype(np.float32)
70+
arguments = [data]
71+
72+
# setup tunable parameters
73+
tune_params = OrderedDict()
74+
tune_params["block_size_x"] = [max_block_dim_x]
75+
tune_params["nr_outer"] = [64]
76+
tune_params["nr_inner"] = [1024]
77+
tune_params.update(nvml_gr_clocks)
78+
79+
# metrics
80+
metrics = OrderedDict()
81+
metrics["f"] = lambda p: p["core_freq"]
82+
83+
nvmlobserver = NVMLObserver(
84+
["core_freq", "nvml_power"], device=device, nvidia_smi_fallback=nvidia_smi_fallback, use_locked_clocks=use_locked_clocks)
85+
86+
results, _ = tune_kernel("fp32_kernel", fp32_kernel_string, problem_size=(multiprocessor_count, 64),
87+
arguments=arguments, tune_params=tune_params, observers=[nvmlobserver],
88+
verbose=False, quiet=True, metrics=metrics, iterations=10,
89+
grid_div_x=[], grid_div_y=[], cache=cache or f"synthetic_fp32_cache_{device_name}.json")
90+
91+
freqs = np.array([res["core_freq"] for res in results])
92+
nvml_power = np.array([res["nvml_power"] for res in results])
93+
94+
return freqs, nvml_power
95+
96+
97+
def estimated_voltage(clocks, clock_threshold, voltage_scale):
98+
""" estimate voltage based on clock_threshold and voltage_scale """
99+
return [1 + ((clock > clock_threshold) * (1e-3 * voltage_scale * (clock-clock_threshold))) for clock in clocks]
100+
101+
102+
def estimated_power(clocks, clock_threshold, voltage_scale, clock_scale, power_max):
103+
""" estimate power consumption based on clock threshold, clock_scale and max power """
104+
n = len(clocks)
105+
powers = np.zeros(n)
106+
107+
voltages = estimated_voltage(clocks, clock_threshold, voltage_scale)
108+
109+
for i in range(n):
110+
clock = clocks[i]
111+
voltage = voltages[i]
112+
power = 1 + clock_scale * clock * voltage**2 * 1e-3
113+
powers[i] = min(power_max, power)
114+
115+
return powers
116+
117+
118+
def fit_power_frequency_model(freqs, nvml_power):
119+
""" Fit the power-frequency model based on frequency and power measurements """
120+
nvml_gr_clocks = np.array(freqs)
121+
nvml_power = np.array(nvml_power)
122+
123+
clock_min = min(freqs)
124+
clock_max = max(freqs)
125+
126+
nvml_gr_clock_normalized = nvml_gr_clocks - clock_min
127+
nvml_power_normalized = nvml_power / min(nvml_power)
128+
129+
clock_threshold = np.median(nvml_gr_clock_normalized)
130+
voltage_scale = 1
131+
clock_scale = 1
132+
power_max = max(nvml_power_normalized)
133+
134+
x = nvml_gr_clock_normalized
135+
y = nvml_power_normalized
136+
137+
# fit the model
138+
p0 = (clock_threshold, voltage_scale, clock_scale, power_max)
139+
bounds = ([clock_min, 0, 0, 0.9*power_max],
140+
[clock_max, 1, 1, 1.1*power_max])
141+
res = optimize.curve_fit(estimated_power, x, y, p0=p0, bounds=bounds)
142+
clock_threshold, voltage_scale, clock_scale, power_max = np.round(
143+
res[0], 2)
144+
145+
fit_parameters = (clock_threshold, voltage_scale, clock_scale, power_max)
146+
scale_parameters = (clock_min, min(nvml_power))
147+
return clock_threshold + clock_min, fit_parameters, scale_parameters
148+
149+
150+
def create_power_frequency_model(device=0, n_samples=10, verbose=False, nvidia_smi_fallback=None, use_locked_clocks=False, cache=None):
151+
""" Calculate the most energy-efficient clock frequency of device
152+
153+
This function uses a performance model to fit the power-frequency curve
154+
using a synthethic benchmarking kernel. The method has been described in:
155+
156+
* Going green: optimizing GPUs for energy efficiency through model-steered auto-tuning
157+
R. Schoonhoven, B. Veenboer, B. van Werkhoven, K. J. Batenburg
158+
International Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS) at Supercomputing (SC22) 2022
159+
160+
Requires NVML and PyCUDA.
161+
162+
:param device: The device ordinal for NVML
163+
:type device: int
164+
165+
:param n_samples: Number of frequencies to sample
166+
:type n_samples: int
167+
168+
:param verbose: Enable verbose printing of sampled frequencies and power consumption
169+
:type verbose: bool
170+
171+
:param nvidia_smi_fallback: Path to nvidia-smi when insufficient permissions to use NVML directly
172+
:type nvidia_smi_fallback: string
173+
174+
:param use_locked_clocks: Whether to prefer locked clocks over application clocks
175+
:type use_locked_clocks: bool
176+
177+
:param cache: Name for the cache file to use to store measurements
178+
:type cache: string
179+
180+
:returns: The clock frequency closest to the ridge point, fitted parameters, scaling
181+
:rtype: float
182+
183+
"""
184+
freqs, nvml_power = get_frequency_power_relation_fp32(device, n_samples, nvidia_smi_fallback, use_locked_clocks, cache=cache)
185+
186+
if verbose:
187+
print("Clock frequencies:", freqs.tolist())
188+
print("Power consumption:", nvml_power.tolist())
189+
190+
ridge_frequency, fitted_params, scaling = fit_power_frequency_model(freqs, nvml_power)
191+
192+
if verbose:
193+
print(f"Modelled most energy efficient frequency: {ridge_frequency} MHz")
194+
195+
all_frequencies = np.array(get_nvml_gr_clocks(device, quiet=True)['nvml_gr_clock'])
196+
ridge_frequency_final = all_frequencies[np.argmin(abs(all_frequencies - ridge_frequency))]
197+
198+
if verbose:
199+
print(f"Closest configurable most energy efficient frequency: {ridge_frequency_final} MHz")
200+
201+
return ridge_frequency_final, freqs, nvml_power, fitted_params, scaling
202+
203+
204+
def get_frequency_range_around_ridge(ridge_frequency, all_frequencies, freq_range, number_of_freqs, verbose=False):
205+
""" Return number_of_freqs frequencies in a freq_range percentage around the ridge_frequency from among all_frequencies """
206+
207+
min_freq = 1e-2 * (100 - int(freq_range)) * ridge_frequency
208+
max_freq = 1e-2 * (100 + int(freq_range)) * ridge_frequency
209+
frequency_selection = np.unique([all_frequencies[np.argmin(abs(
210+
all_frequencies - f))] for f in np.linspace(min_freq, max_freq, int(number_of_freqs))]).tolist()
211+
212+
if verbose:
213+
print(f"Suggested range of frequencies to auto-tune: {frequency_selection} MHz")
214+
215+
return frequency_selection

0 commit comments

Comments
 (0)