Skip to content

Commit 3c7b8aa

Browse files
got the code working again
1 parent 0ff2305 commit 3c7b8aa

File tree

3 files changed

+41
-23
lines changed

3 files changed

+41
-23
lines changed

examples/cuda/going_green_performance_model.py

File mode changed: 100644 → 100755
Lines changed: 14 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
#!/usr/bin/env python
12
"""
23
This example demonstrates how to use the performance model presented in
34
@@ -58,27 +59,31 @@ def get_default_parser():
5859
parser = get_default_parser()
5960
args = parser.parse_args()
6061

61-
ridge_frequency, fitted_params, scaling = energy.create_performance_frequency_model(device=args.device,
62-
n_samples=args.samples,
63-
verbose=True,
64-
nvidia_smi_fallback=args.nvidia_smi_fallback,
65-
use_locked_clocks=args.locked_clocks)
62+
ridge_frequency, freqs, nvml_power, fitted_params, scaling = energy.create_performance_frequency_model(device=args.device,
63+
n_samples=args.samples,
64+
verbose=True,
65+
nvidia_smi_fallback=args.nvidia_smi_fallback,
66+
use_locked_clocks=args.locked_clocks)
6667

6768
all_frequencies = np.array(get_nvml_gr_clocks(args.device)['nvml_gr_clock'])
6869

69-
frequency_selection = energy.get_frequency_range_around_ridge(all_frequencies, args.range, args.number)
70+
frequency_selection = energy.get_frequency_range_around_ridge(ridge_frequency, all_frequencies, args.range, args.number)
7071
print(f"Search space reduction: {np.round(100 - len(frequency_selection) / len(all_frequencies) * 100, 1)} %%")
7172

7273
xs = np.linspace(all_frequencies[0], all_frequencies[-1], 100)
7374
# scale to start at 0
7475
xs -= scaling[0]
75-
modelled_power = estimated_power(xs, *fitted_params)
76+
modelled_power = energy.estimated_power(xs, *fitted_params)
7677
# undo scaling
7778
xs += scaling[0]
7879
modelled_power *= scaling[1]
7980

8081
# Add point for ridge frequency
81-
P_ridge = estimated_power([ridge_frequency - scaling[0]], *fitted_params) * scaling[1]
82+
P_ridge = energy.estimated_power([ridge_frequency - scaling[0]], *fitted_params) * scaling[1]
83+
84+
# Add the frequency range
85+
min_freq = 1e-2 * (100 - int(args.range)) * ridge_frequency
86+
max_freq = 1e-2 * (100 + int(args.range)) * ridge_frequency
8287

8388
# plot measurements with model
8489
try:
@@ -101,5 +106,6 @@ def get_default_parser():
101106
plt.xlabel('Core frequency (MHz)')
102107
plt.ylabel('Power consumption (W)')
103108
plt.legend()
109+
plt.show()
104110

105111
plt.savefig("GPU_power_consumption_model.pdf")

kernel_tuner/energy/energy.py

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,18 @@
22
This module contains a set of helper functions specifically for auto-tuning codes
33
for energy efficiency.
44
"""
5+
from collections import OrderedDict
56
import numpy as np
67
import math
78
from scipy import optimize
89

910
from kernel_tuner import tune_kernel
10-
from kernel_tuner.nvml import nvml, NVMLObserver
11+
from kernel_tuner.nvml import NVMLObserver, get_nvml_gr_clocks, get_idle_power
12+
13+
try:
14+
import pycuda.driver as drv
15+
except ImportError:
16+
pass
1117

1218
fp32_kernel_string = """
1319
__device__ void fp32_n_8(
@@ -36,7 +42,7 @@
3642
}
3743
"""
3844

39-
def get_frequency_power_relation_fp32(device, n_samples=10, use_locked_clocks=False, nvidia_smi_fallback=None):
45+
def get_frequency_power_relation_fp32(device, n_samples=10, nvidia_smi_fallback=None, use_locked_clocks=False):
4046
""" Use NVML and PyCUDA with a synthetic kernel to obtain samples of frequency-power pairs """
4147

4248
if drv is None:
@@ -46,17 +52,17 @@ def get_frequency_power_relation_fp32(device, n_samples=10, use_locked_clocks=Fa
4652
drv.init()
4753
dev = drv.Device(device)
4854
device_name = dev.name().replace(' ', '_')
49-
multiprocessor_count = dev.get_attribute(
50-
drv.device_attribute.MULTIPROCESSOR_COUNT)
51-
max_block_dim_x = dev.get_attribute(drv.device_attribute.MAX_BLOCK_DIM_X)
55+
multiprocessor_count = int(dev.get_attribute(
56+
drv.device_attribute.MULTIPROCESSOR_COUNT))
57+
max_block_dim_x = int(dev.get_attribute(drv.device_attribute.MAX_BLOCK_DIM_X))
5258

5359
# kernel arguments
5460
data_size = (multiprocessor_count, max_block_dim_x)
5561
data = np.random.random(np.prod(data_size)).astype(float)
5662
arguments = [data]
5763

5864
# setup clocks
59-
nvml_gr_clocks = get_nvml_gr_clocks(device, n=n_samples)
65+
nvml_gr_clocks = get_nvml_gr_clocks(device, n=n_samples, quiet=True)
6066

6167
# idle power
6268
power_idle = get_idle_power(device)
@@ -68,17 +74,19 @@ def get_frequency_power_relation_fp32(device, n_samples=10, use_locked_clocks=Fa
6874
tune_params["nr_inner"] = [1024]
6975
tune_params.update(nvml_gr_clocks)
7076

77+
tune_params["nvml_gr_clock"] = [int(c) for c in tune_params["nvml_gr_clock"]]
78+
7179
# metrics
7280
metrics = OrderedDict()
7381
metrics["f"] = lambda p: p["core_freq"]
7482

7583
nvmlobserver = NVMLObserver(
76-
["core_freq", "nvml_power"], device=device, nvidia_smi_fallback=nvidia_smi_fallback)
84+
["core_freq", "nvml_power"], device=device, nvidia_smi_fallback=nvidia_smi_fallback, use_locked_clocks=use_locked_clocks)
7785

7886
results, _ = tune_kernel("fp32_kernel", fp32_kernel_string, problem_size=(multiprocessor_count, 64),
7987
arguments=arguments, tune_params=tune_params, observers=[nvmlobserver],
8088
verbose=False, quiet=True, metrics=metrics, iterations=10,
81-
grid_div_x=[], grid_div_y=[])
89+
grid_div_x=[], grid_div_y=[], cache=f"synthetic_fp32_cache_{device_name}.json")
8290

8391
freqs = np.array([res["core_freq"] for res in results])
8492
nvml_power = np.array([res["nvml_power"] for res in results])
@@ -172,27 +180,27 @@ def create_performance_frequency_model(device=0, n_samples=10, verbose=False, nv
172180
:rtype: float
173181
174182
"""
175-
freqs, nvml_power = get_frequency_power_relation(device, n_samples, nvidia_smi_fallback, use_locked_clocks)
183+
freqs, nvml_power = get_frequency_power_relation_fp32(device, n_samples, nvidia_smi_fallback, use_locked_clocks)
176184

177185
if verbose:
178186
print("Clock frequencies:", freqs.tolist())
179187
print("Power consumption:", nvml_power.tolist())
180188

181-
ridge_frequency, fitted_params, scaling = fit_model(freqs, nvml_power)
189+
ridge_frequency, fitted_params, scaling = fit_performance_frequency_model(freqs, nvml_power)
182190

183191
if verbose:
184192
print(f"Modelled most energy efficient frequency: {ridge_frequency} MHz")
185193

186-
all_frequencies = np.array(get_nvml_gr_clocks(device)['nvml_gr_clock'])
194+
all_frequencies = np.array(get_nvml_gr_clocks(device, quiet=True)['nvml_gr_clock'])
187195
ridge_frequency_final = all_frequencies[np.argmin(abs(all_frequencies - ridge_frequency))]
188196

189197
if verbose:
190-
print(f"Closest configurable most energy efficient frequency: {ridge_frequency2} MHz")
198+
print(f"Closest configurable most energy efficient frequency: {ridge_frequency_final} MHz")
191199

192-
return ridge_frequency_final, fitted_params, scaling
200+
return ridge_frequency_final, freqs, nvml_power, fitted_params, scaling
193201

194202

195-
def get_frequency_range_around_ridge(ridge_frequency, all_frequencies, freq_range, number_of_freqs, verbose=False)
203+
def get_frequency_range_around_ridge(ridge_frequency, all_frequencies, freq_range, number_of_freqs, verbose=False):
196204
""" Return number_of_freqs frequencies in a freq_range percentage around the ridge_frequency from among all_frequencies """
197205

198206
min_freq = 1e-2 * (100 - int(freq_range)) * ridge_frequency

kernel_tuner/nvml.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import time
33
import re
44
import numpy as np
5+
from collections import OrderedDict
56

67
from kernel_tuner.observers import BenchmarkObserver, ContinuousObserver
78

@@ -39,6 +40,7 @@ def __init__(self, device_id=0, nvidia_smi_fallback='nvidia-smi', use_locked_clo
3940
self._auto_boost = None
4041

4142
#try to initialize application clocks
43+
self.modified_clocks = False
4244
try:
4345
if not use_locked_clocks:
4446
self.gr_clock_default = pynvml.nvmlDeviceGetDefaultApplicationsClock(self.dev, pynvml.NVML_CLOCK_GRAPHICS)
@@ -75,7 +77,8 @@ def __del__(self):
7577
#try to restore to defaults
7678
if self.pwr_limit_default is not None:
7779
self.pwr_limit = self.pwr_limit_default
78-
self.reset_clocks()
80+
if self.modified_clocks:
81+
self.reset_clocks()
7982

8083
@property
8184
def pwr_state(self):
@@ -115,6 +118,7 @@ def persistence_mode(self, new_mode):
115118

116119
def set_clocks(self, mem_clock, gr_clock):
117120
"""Set the memory and graphics clock for this device (may require permission)"""
121+
self.modified_clocks = True
118122
if not mem_clock in self.supported_mem_clocks:
119123
raise ValueError("Illegal value for memory clock")
120124
if not gr_clock in self.supported_gr_clocks[mem_clock]:

0 commit comments

Comments
 (0)