Skip to content

Commit 72365f9

Browse files
Merge pull request #178 from KernelTuner/refactor_costfunc
Refactor costfunc
2 parents 38bd2dc + 6500822 commit 72365f9

File tree

12 files changed

+278
-312
lines changed

12 files changed

+278
-312
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@ This project adheres to [Semantic Versioning](http://semver.org/).
44

55
## Unreleased
66

7+
### Added
8+
- Support for using time_limit in simulation mode
9+
10+
### Changed
11+
- Changed what timings are stored in cache files
712

813
## [0.4.3] - 2022-10-19
914
### Added

kernel_tuner/core.py

Lines changed: 42 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -259,8 +259,6 @@ def __init__(self, kernel_source, device=0, platform=0, quiet=False, compiler=No
259259
self.units = dev.units
260260
self.name = dev.name
261261
self.max_threads = dev.max_threads
262-
self.last_compilation_time = None
263-
self.last_verification_time = None
264262
if not quiet:
265263
print("Using: " + self.dev.name)
266264

@@ -317,7 +315,7 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
317315

318316

319317

320-
def benchmark(self, func, gpu_args, instance, verbose):
318+
def benchmark(self, func, gpu_args, instance, verbose, objective):
321319
"""benchmark the kernel instance"""
322320
logging.debug('benchmark ' + instance.name)
323321
logging.debug('thread block dimensions x,y,z=%d,%d,%d', *instance.threads)
@@ -333,9 +331,8 @@ def benchmark(self, func, gpu_args, instance, verbose):
333331
if "nvml_mem_clock" in instance.params:
334332
self.nvml.mem_clock = instance.params["nvml_mem_clock"]
335333

336-
result = None
334+
result = {}
337335
try:
338-
result = dict()
339336
self.benchmark_default(func, gpu_args, instance.threads, instance.grid, result)
340337

341338
if self.continuous_observers:
@@ -348,16 +345,16 @@ def benchmark(self, func, gpu_args, instance, verbose):
348345

349346

350347
except Exception as e:
351-
#some launches may fail because too many registers are required
352-
#to run the kernel given the current thread block size
353-
#the desired behavior is to simply skip over this configuration
354-
#and proceed to try the next one
348+
# some launches may fail because too many registers are required
349+
# to run the kernel given the current thread block size
350+
# the desired behavior is to simply skip over this configuration
351+
# and proceed to try the next one
355352
skippable_exceptions = ["too many resources requested for launch", "OUT_OF_RESOURCES", "INVALID_WORK_GROUP_SIZE"]
356353
if any([skip_str in str(e) for skip_str in skippable_exceptions]):
357354
logging.debug('benchmark fails due to runtime failure too many resources required')
358355
if verbose:
359356
print(f"skipping config {util.get_instance_string(instance.params)} reason: too many resources requested for launch")
360-
return util.RuntimeFailedConfig()
357+
result[objective] = util.RuntimeFailedConfig()
361358
else:
362359
logging.debug('benchmark encountered runtime failure: ' + str(e))
363360
print("Error while benchmarking:", instance.name)
@@ -408,61 +405,69 @@ def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, ve
408405
if not correct:
409406
raise RuntimeError("Kernel result verification failed for: " + util.get_config_string(instance.params))
410407

411-
def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, tuning_options):
408+
def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, to):
412409
""" Compile and benchmark a kernel instance based on kernel strings and parameters """
413-
start_compilation = time.perf_counter()
414410
instance_string = util.get_instance_string(params)
415411

416412
# reset previous timers
417-
self.last_compilation_time = None
418-
self.last_verification_time = None
413+
last_compilation_time = None
414+
last_verification_time = None
415+
last_benchmark_time = None
419416

420417
logging.debug('compile_and_benchmark ' + instance_string)
421418

422-
verbose = tuning_options.verbose
419+
verbose = to.verbose
420+
result = {}
423421

424422
instance = self.create_kernel_instance(kernel_source, kernel_options, params, verbose)
425423
if isinstance(instance, util.ErrorConfig):
426424
return instance
427425

428426
try:
429-
#compile the kernel
427+
# compile the kernel
428+
start_compilation = time.perf_counter()
430429
func = self.compile_kernel(instance, verbose)
431-
if func is None:
432-
return util.CompilationFailedConfig()
433-
434-
#add shared memory arguments to compiled module
435-
if kernel_options.smem_args is not None:
436-
self.dev.copy_shared_memory_args(util.get_smem_args(kernel_options.smem_args, params))
437-
#add constant memory arguments to compiled module
438-
if kernel_options.cmem_args is not None:
439-
self.dev.copy_constant_memory_args(kernel_options.cmem_args)
440-
#add texture memory arguments to compiled module
441-
if kernel_options.texmem_args is not None:
442-
self.dev.copy_texture_memory_args(kernel_options.texmem_args)
430+
if not func:
431+
result[to.objective] = util.CompilationFailedConfig()
432+
else:
433+
# add shared memory arguments to compiled module
434+
if kernel_options.smem_args is not None:
435+
self.dev.copy_shared_memory_args(util.get_smem_args(kernel_options.smem_args, params))
436+
# add constant memory arguments to compiled module
437+
if kernel_options.cmem_args is not None:
438+
self.dev.copy_constant_memory_args(kernel_options.cmem_args)
439+
# add texture memory arguments to compiled module
440+
if kernel_options.texmem_args is not None:
441+
self.dev.copy_texture_memory_args(kernel_options.texmem_args)
443442

444443
# stop compilation stopwatch and convert to milliseconds
445-
self.last_compilation_time = 1000 * (time.perf_counter() - start_compilation)
444+
last_compilation_time = 1000 * (time.perf_counter() - start_compilation)
446445

447-
#test kernel for correctness and benchmark
448-
start_verification = time.perf_counter()
449-
if tuning_options.answer is not None or tuning_options.verify is not None:
450-
self.check_kernel_output(func, gpu_args, instance, tuning_options.answer, tuning_options.atol, tuning_options.verify, verbose)
451-
# stop verification stopwatch and convert to milliseconds
452-
self.last_verification_time = 1000 * (time.perf_counter() - start_verification)
446+
# test kernel for correctness
447+
if func and (to.answer or to.verify):
448+
start_verification = time.perf_counter()
449+
self.check_kernel_output(func, gpu_args, instance, to.answer, to.atol, to.verify, verbose)
450+
last_verification_time = 1000 * (time.perf_counter() - start_verification)
453451

454452
# benchmark
455-
result = self.benchmark(func, gpu_args, instance, verbose)
453+
if func:
454+
start_benchmark = time.perf_counter()
455+
result.update(self.benchmark(func, gpu_args, instance, verbose, to.objective))
456+
last_benchmark_time = 1000 * (time.perf_counter() - start_benchmark)
456457

457458
except Exception as e:
458-
#dump kernel_string to temp file
459+
# dump kernel sources to temp file
459460
temp_filenames = instance.prepare_temp_files_for_error_msg()
460461
print("Error while compiling or benchmarking, see source files: " + " ".join(temp_filenames))
461462
raise e
462463

463464
#clean up any temporary files, if no error occurred
464465
instance.delete_temp_files()
465466

467+
result['compile_time'] = last_compilation_time or 0
468+
result['verification_time'] = last_verification_time or 0
469+
result['benchmark_time'] = last_benchmark_time or 0
470+
466471
return result
467472

468473
def compile_kernel(self, instance, verbose):

kernel_tuner/interface.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,7 @@ def tune_kernel(kernel_name, kernel_source, problem_size, arguments, tune_params
352352
objective, objective_higher_is_better = get_objective_defaults(objective, objective_higher_is_better)
353353

354354
# check for forbidden names in tune parameters
355-
util.check_tune_params_list(tune_params)
355+
util.check_tune_params_list(tune_params, observers)
356356

357357
# check whether block_size_names are used as expected
358358
util.check_block_size_params_names_list(block_size_names, tune_params)
@@ -415,7 +415,8 @@ def tune_kernel(kernel_name, kernel_source, problem_size, arguments, tune_params
415415
strategy = brute_force
416416

417417
# select the runner for this job based on input
418-
selected_runner = SimulationRunner if simulation_mode is True else SequentialRunner
418+
selected_runner = SimulationRunner if simulation_mode else SequentialRunner
419+
tuning_options.simulated_time = 0
419420
runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers)
420421

421422
# the user-specified function may or may not have an optional atol argument;

kernel_tuner/runners/sequential.py

Lines changed: 37 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
""" The default runner for sequentially tuning the parameter space """
2-
from collections import OrderedDict
32
import logging
3+
from collections import OrderedDict
44
from time import perf_counter
55

6-
from kernel_tuner.util import get_config_string, store_cache, process_metrics, print_config_output, ErrorConfig
76
from kernel_tuner.core import DeviceInterface
7+
from kernel_tuner.util import (ErrorConfig, print_config_output,
8+
process_metrics, store_cache)
89

910

10-
class SequentialRunner(object):
11+
class SequentialRunner:
1112
""" SequentialRunner is used for tuning with a single process/thread """
1213

1314
def __init__(self, kernel_source, kernel_options, device_options, iterations, observers):
@@ -36,7 +37,9 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob
3637
self.kernel_source = kernel_source
3738
self.warmed_up = False
3839
self.simulation_mode = False
39-
self.last_strategy_start_time = perf_counter()
40+
self.start_time = perf_counter()
41+
self.last_strategy_start_time = self.start_time
42+
self.last_strategy_time = 0
4043

4144
#move data to the GPU
4245
self.gpu_args = self.dev.ready_argument_list(kernel_options.arguments)
@@ -64,47 +67,51 @@ def run(self, parameter_space, kernel_options, tuning_options):
6467

6568
results = []
6669

67-
#iterate over parameter space
70+
# iterate over parameter space
6871
for element in parameter_space:
6972
params = OrderedDict(zip(tuning_options.tune_params.keys(), element))
7073

71-
#attempt to warmup the GPU by running the first config in the parameter space and ignoring the result
74+
# attempt to warmup the GPU by running the first config in the parameter space and ignoring the result
75+
warmup_time = 0
7276
if not self.warmed_up:
77+
warmup_time = perf_counter()
7378
self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, kernel_options, tuning_options)
7479
self.warmed_up = True
80+
warmup_time = 1e3 * (perf_counter() - warmup_time)
81+
82+
result = None
7583

76-
#check if element is in the cache
84+
# check if configuration is in the cache
7785
x_int = ",".join([str(i) for i in element])
7886
if tuning_options.cache and x_int in tuning_options.cache:
79-
results.append(tuning_options.cache[x_int])
80-
continue
81-
82-
result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, kernel_options, tuning_options)
87+
params.update(tuning_options.cache[x_int])
88+
params['compile_time'] = 0
89+
params['verification_time'] = 0
90+
params['benchmark_time'] = 0
91+
else:
92+
result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, kernel_options, tuning_options)
8393

84-
if self.dev.last_compilation_time is not None:
85-
params['compile_time'] = self.dev.last_compilation_time
86-
if self.dev.last_verification_time is not None:
87-
params['verification_time'] = self.dev.last_verification_time
94+
params.update(result)
8895

89-
if isinstance(result, ErrorConfig):
90-
logging.debug('kernel configuration was skipped silently due to compile or runtime failure')
91-
params.update({ tuning_options.objective: result })
92-
store_cache(x_int, params, tuning_options)
93-
results.append(params)
94-
continue
96+
# only compute metrics on configs that have not errored
97+
if isinstance(result[tuning_options.objective], ErrorConfig):
98+
logging.debug('kernel configuration was skipped silently due to compile or runtime failure')
99+
elif tuning_options.metrics:
100+
params = process_metrics(params, tuning_options.metrics)
95101

96-
# print and append to results
97-
if not isinstance(result, dict):
98-
params[tuning_options.objective] = result
99-
else:
100-
params.update(result)
102+
# print configuration to the console
103+
print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units)
101104

102-
if tuning_options.metrics:
103-
params = process_metrics(params, tuning_options.metrics)
105+
# get the framework time by estimating based on other times
106+
total_time = 1000 * (perf_counter() - self.start_time) - warmup_time
107+
params['strategy_time'] = self.last_strategy_time
108+
params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0)
109+
self.start_time = perf_counter()
104110

105-
print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units)
111+
if result:
112+
store_cache(x_int, params, tuning_options)
106113

107-
store_cache(x_int, params, tuning_options)
114+
# all visited configurations are added to results to provide a trace for optimization strategies
108115
results.append(params)
109116

110117
return results, self.dev.get_environment()

0 commit comments

Comments
 (0)