"""
This module contains a set of helper functions specifically for auto-tuning codes
for energy efficiency.
"""
from collections import OrderedDict

import numpy as np
from kernel_tuner import tune_kernel, util
from kernel_tuner.nvml import NVMLObserver, get_nvml_gr_clocks
from scipy import optimize

try:
    import pycuda.driver as drv
except ImportError:
    # set drv to None so the availability check below does not raise NameError
    drv = None

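# Synthetic CUDA kernel used to load the FP32 units: each thread repeatedly
# performs complex multiply-accumulate operations on data kept in registers, so
# the measured power is dominated by compute rather than memory traffic. The
# constants nr_inner, nr_outer and block_size_x are deliberately left undefined
# here; Kernel Tuner inserts the tunable parameters as compile-time constants
# when it compiles the kernel.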
fp32_kernel_string = """
__device__ void fp32_n_8(
    float2& a, float2& b, float2& c)
{
    // Perform nr_inner * 4 fma
    for (int i = 0; i < nr_inner; i++) {
        a.x += b.x * c.x;
        a.x -= b.y * c.y;
        a.y += b.x * c.y;
        a.y += b.y * c.x;
    }
}

__global__ void fp32_kernel(float *ptr)
{
    float2 a = make_float2(threadIdx.x, threadIdx.x + 1);
    float2 b = make_float2(1, 2);
    float2 c = make_float2(3, 4);

    for (int i = 0; i < nr_outer; i++) {
        fp32_n_8(a, b, c);
    }

    ptr[blockIdx.x * blockDim.x + threadIdx.x] = a.x + a.y;
}
"""

def get_frequency_power_relation_fp32(device, n_samples=10, nvidia_smi_fallback=None, use_locked_clocks=False, cache=None):
    """ Use NVML and PyCUDA with a synthetic kernel to obtain samples of frequency-power pairs """

    # get some numbers about the device
    if not cache:
        if drv is None:
            raise ImportError("get_frequency_power_relation_fp32 requires PyCUDA")

        drv.init()
        dev = drv.Device(device)
        device_name = dev.name().replace(' ', '_')
        multiprocessor_count = dev.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT)
        max_block_dim_x = dev.get_attribute(drv.device_attribute.MAX_BLOCK_DIM_X)

        # setup clocks
        nvml_gr_clocks = get_nvml_gr_clocks(device, n=n_samples, quiet=True)

    else:
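        # a cache file from a previous run was given: recover the problem size
        # and the clock frequencies from it instead of querying the device
        # through PyCUDA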
        cached_data = util.read_cache(cache, open_cache=False)
        multiprocessor_count = cached_data["problem_size"][0]
        max_block_dim_x = cached_data["tune_params"]["block_size_x"][0]
        nvml_gr_clocks = cached_data["tune_params"]

    # kernel arguments
    data_size = (multiprocessor_count, max_block_dim_x)
    data = np.random.random(np.prod(data_size)).astype(np.float32)
    arguments = [data]

    # setup tunable parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [max_block_dim_x]
    tune_params["nr_outer"] = [64]
    tune_params["nr_inner"] = [1024]
    tune_params.update(nvml_gr_clocks)
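    # adding the supported graphics clocks as a tunable parameter makes
    # tune_kernel benchmark the same kernel configuration once per clock
    # frequency, while the NVMLObserver below records the achieved core
    # frequency and power draw of each run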

    # metrics
    metrics = OrderedDict()
    metrics["f"] = lambda p: p["core_freq"]

    nvmlobserver = NVMLObserver(
        ["core_freq", "nvml_power"], device=device, nvidia_smi_fallback=nvidia_smi_fallback, use_locked_clocks=use_locked_clocks)

    results, _ = tune_kernel("fp32_kernel", fp32_kernel_string, problem_size=(multiprocessor_count, 64),
                             arguments=arguments, tune_params=tune_params, observers=[nvmlobserver],
                             verbose=False, quiet=True, metrics=metrics, iterations=10,
                             grid_div_x=[], grid_div_y=[], cache=cache or f"synthetic_fp32_cache_{device_name}.json")

    freqs = np.array([res["core_freq"] for res in results])
    nvml_power = np.array([res["nvml_power"] for res in results])

    return freqs, nvml_power


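# The two helpers below implement a simple analytical power model: the GPU
# voltage is assumed to stay at its baseline up to some clock threshold and to
# increase linearly with frequency above it, while power scales with f * V^2
# on top of a constant baseline, capped at a maximum power.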
def estimated_voltage(clocks, clock_threshold, voltage_scale):
    """ estimate voltage based on clock_threshold and voltage_scale """
    return [1 + ((clock > clock_threshold) * (1e-3 * voltage_scale * (clock-clock_threshold))) for clock in clocks]


def estimated_power(clocks, clock_threshold, voltage_scale, clock_scale, power_max):
    """ estimate power consumption based on clock_threshold, voltage_scale, clock_scale and power_max """
    n = len(clocks)
    powers = np.zeros(n)

    voltages = estimated_voltage(clocks, clock_threshold, voltage_scale)

    for i in range(n):
        clock = clocks[i]
        voltage = voltages[i]
        power = 1 + clock_scale * clock * voltage**2 * 1e-3
        powers[i] = min(power_max, power)

    return powers

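# Illustrative example (assumed parameter values, not fitted to any device), in
# the normalized units used by fit_power_frequency_model below:
#
#   estimated_power([0, 300, 600, 1200], clock_threshold=300, voltage_scale=0.5,
#                   clock_scale=0.4, power_max=3.0)
#   # -> approximately [1.0, 1.12, 1.32, 2.01]
#
# Below the threshold power grows linearly with frequency (the voltage stays at
# its baseline); above it the voltage term grows as well, and the curve is
# capped at power_max.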

def fit_power_frequency_model(freqs, nvml_power):
    """ Fit the power-frequency model based on frequency and power measurements """
    nvml_gr_clocks = np.array(freqs)
    nvml_power = np.array(nvml_power)

    clock_min = min(freqs)
    clock_max = max(freqs)

    nvml_gr_clock_normalized = nvml_gr_clocks - clock_min
    nvml_power_normalized = nvml_power / min(nvml_power)

    clock_threshold = np.median(nvml_gr_clock_normalized)
    voltage_scale = 1
    clock_scale = 1
    power_max = max(nvml_power_normalized)

    x = nvml_gr_clock_normalized
    y = nvml_power_normalized

    # fit the model
    p0 = (clock_threshold, voltage_scale, clock_scale, power_max)
    bounds = ([clock_min, 0, 0, 0.9*power_max],
              [clock_max, 1, 1, 1.1*power_max])
    res = optimize.curve_fit(estimated_power, x, y, p0=p0, bounds=bounds)
    clock_threshold, voltage_scale, clock_scale, power_max = np.round(
        res[0], 2)

    fit_parameters = (clock_threshold, voltage_scale, clock_scale, power_max)
    scale_parameters = (clock_min, min(nvml_power))
    return clock_threshold + clock_min, fit_parameters, scale_parameters

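# To evaluate or plot the fitted model in the original (unnormalized) units,
# the normalization applied above has to be undone. A minimal sketch, assuming
# freqs and nvml_power come from get_frequency_power_relation_fp32:
#
#   ridge_freq, fit_params, (clock_min, power_min) = \
#       fit_power_frequency_model(freqs, nvml_power)
#   modelled_power = power_min * estimated_power(
#       np.array(freqs) - clock_min, *fit_params)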

def create_power_frequency_model(device=0, n_samples=10, verbose=False, nvidia_smi_fallback=None, use_locked_clocks=False, cache=None):
    """ Calculate the most energy-efficient clock frequency of device

    This function uses a performance model to fit the power-frequency curve
    using a synthetic benchmarking kernel. The method has been described in:

    * Going green: optimizing GPUs for energy efficiency through model-steered auto-tuning
      R. Schoonhoven, B. Veenboer, B. van Werkhoven, K. J. Batenburg
      International Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS) at Supercomputing (SC22) 2022

    Requires NVML and PyCUDA.

    :param device: The device ordinal for NVML
    :type device: int

    :param n_samples: Number of frequencies to sample
    :type n_samples: int

    :param verbose: Enable verbose printing of sampled frequencies and power consumption
    :type verbose: bool
    :param nvidia_smi_fallback: Path to nvidia-smi, used as a fallback when there are insufficient permissions to use NVML directly
    :type nvidia_smi_fallback: string

    :param use_locked_clocks: Whether to prefer locked clocks over application clocks
    :type use_locked_clocks: bool

    :param cache: Name for the cache file to use to store measurements
    :type cache: string

    :returns: The clock frequency closest to the ridge point, the sampled frequencies and power consumption, the fitted model parameters, and the scaling parameters
    :rtype: tuple

    """
    freqs, nvml_power = get_frequency_power_relation_fp32(device, n_samples, nvidia_smi_fallback, use_locked_clocks, cache=cache)

    if verbose:
        print("Clock frequencies:", freqs.tolist())
        print("Power consumption:", nvml_power.tolist())

    ridge_frequency, fitted_params, scaling = fit_power_frequency_model(freqs, nvml_power)

    if verbose:
        print(f"Modelled most energy efficient frequency: {ridge_frequency} MHz")

    all_frequencies = np.array(get_nvml_gr_clocks(device, quiet=True)['nvml_gr_clock'])
    ridge_frequency_final = all_frequencies[np.argmin(abs(all_frequencies - ridge_frequency))]

    if verbose:
        print(f"Closest configurable most energy efficient frequency: {ridge_frequency_final} MHz")

    return ridge_frequency_final, freqs, nvml_power, fitted_params, scaling


def get_frequency_range_around_ridge(ridge_frequency, all_frequencies, freq_range, number_of_freqs, verbose=False):
    """ Return up to number_of_freqs frequencies from all_frequencies within freq_range percent around the ridge_frequency """

    min_freq = 1e-2 * (100 - int(freq_range)) * ridge_frequency
    max_freq = 1e-2 * (100 + int(freq_range)) * ridge_frequency
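    # snap each linearly spaced target frequency to the nearest supported
    # frequency in all_frequencies and remove duplicates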
    frequency_selection = np.unique([all_frequencies[np.argmin(abs(
        all_frequencies - f))] for f in np.linspace(min_freq, max_freq, int(number_of_freqs))]).tolist()

    if verbose:
        print(f"Suggested range of frequencies to auto-tune: {frequency_selection} MHz")

    return frequency_selection
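

# Example workflow (a sketch, not part of the module itself): determine the
# most energy-efficient clock frequency of device 0 and build a small range of
# frequencies around it to use as a tunable parameter in a subsequent tuning
# run. Requires an NVIDIA GPU with NVML access and PyCUDA; the parameter values
# are illustrative, and tune_params stands for the user's own tunable-parameters
# dictionary.
#
#   ridge_frequency, freqs, nvml_power, fit_params, scaling = \
#       create_power_frequency_model(device=0, n_samples=10, verbose=True)
#   all_frequencies = np.array(
#       get_nvml_gr_clocks(0, quiet=True)["nvml_gr_clock"])
#   tune_params["nvml_gr_clock"] = get_frequency_range_around_ridge(
#       ridge_frequency, all_frequencies, freq_range=10, number_of_freqs=5,
#       verbose=True)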