From 37c5338938e2514911815bbbbd8aa6c9272f3f3c Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 23 May 2025 11:12:20 +0200 Subject: [PATCH 1/9] Typo. --- kernel_tuner/runners/sequential.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index aeebd5116..194eb0545 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -55,7 +55,7 @@ def run(self, parameter_space, tuning_options): :param tuning_options: A dictionary with all options regarding the tuning process. - :type tuning_options: kernel_tuner.iterface.Options + :type tuning_options: kernel_tuner.interface.Options :returns: A list of dictionaries for executed kernel configurations and their execution times. From 6c5b360cad115d840852f618f6ffda9cf89addbd Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 23 May 2025 11:12:43 +0200 Subject: [PATCH 2/9] Add missing parameter to the interface. --- kernel_tuner/runners/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/runners/runner.py b/kernel_tuner/runners/runner.py index 80ab32146..8c4de22d7 100644 --- a/kernel_tuner/runners/runner.py +++ b/kernel_tuner/runners/runner.py @@ -14,7 +14,7 @@ def __init__( pass @abstractmethod - def get_environment(self): + def get_environment(self, tuning_options): pass @abstractmethod From a21caf8398306cb9f7b136043ae97f4a084c2759 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 23 May 2025 11:53:57 +0200 Subject: [PATCH 3/9] Formatting. --- kernel_tuner/runners/sequential.py | 53 +++++++++++++++++------------- kernel_tuner/runners/simulation.py | 40 +++++++++++----------- 2 files changed, 50 insertions(+), 43 deletions(-) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index 194eb0545..eeeedbd29 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -20,15 +20,13 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob :param kernel_options: A dictionary with all options for the kernel. :type kernel_options: kernel_tuner.interface.Options - :param device_options: A dictionary with all options for the device - on which the kernel should be tuned. + :param device_options: A dictionary with all options for the device on which the kernel should be tuned. :type device_options: kernel_tuner.interface.Options - :param iterations: The number of iterations used for benchmarking - each kernel instance. + :param iterations: The number of iterations used for benchmarking each kernel instance. :type iterations: int """ - #detect language and create high-level device interface + # detect language and create high-level device interface self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) self.units = self.dev.units @@ -41,7 +39,7 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.last_strategy_time = 0 self.kernel_options = kernel_options - #move data to the GPU + # move data to the GPU self.gpu_args = self.dev.ready_argument_list(kernel_options.arguments) def get_environment(self, tuning_options): @@ -53,16 +51,14 @@ def run(self, parameter_space, tuning_options): :param parameter_space: The parameter space as an iterable. :type parameter_space: iterable - :param tuning_options: A dictionary with all options regarding the tuning - process. 
+ :param tuning_options: A dictionary with all options regarding the tuning process. :type tuning_options: kernel_tuner.interface.Options - :returns: A list of dictionaries for executed kernel configurations and their - execution times. - :rtype: dict()) + :returns: A list of dictionaries for executed kernel configurations and their execution times. + :rtype: dict() """ - logging.debug('sequential runner started for ' + self.kernel_options.kernel_name) + logging.debug("sequential runner started for " + self.kernel_options.kernel_name) results = [] @@ -77,33 +73,46 @@ def run(self, parameter_space, tuning_options): x_int = ",".join([str(i) for i in element]) if tuning_options.cache and x_int in tuning_options.cache: params.update(tuning_options.cache[x_int]) - params['compile_time'] = 0 - params['verification_time'] = 0 - params['benchmark_time'] = 0 + params["compile_time"] = 0 + params["verification_time"] = 0 + params["benchmark_time"] = 0 else: # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result if not self.warmed_up: warmup_time = perf_counter() - self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + self.dev.compile_and_benchmark( + self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options + ) self.warmed_up = True warmup_time = 1e3 * (perf_counter() - warmup_time) - result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + result = self.dev.compile_and_benchmark( + self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options + ) params.update(result) if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): - logging.debug('kernel configuration was skipped silently due to compile or runtime failure') + logging.debug("kernel configuration was skipped silently due to compile or runtime failure") # only compute metrics on configs that have not errored if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): params = process_metrics(params, tuning_options.metrics) # get the framework time by estimating based on other times - total_time = 1000 * ((perf_counter() - self.start_time) - warmup_time) - params['strategy_time'] = self.last_strategy_time - params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0) - params['timestamp'] = str(datetime.now(timezone.utc)) + total_time = 1000 * ((perf_counter() - self.start_time) - warmup_time) + params["strategy_time"] = self.last_strategy_time + params["framework_time"] = max( + total_time + - ( + params["compile_time"] + + params["verification_time"] + + params["benchmark_time"] + + params["strategy_time"] + ), + 0, + ) + params["timestamp"] = str(datetime.now(timezone.utc)) self.start_time = perf_counter() if result: diff --git a/kernel_tuner/runners/simulation.py b/kernel_tuner/runners/simulation.py index 22c7c667c..7a167bfcf 100644 --- a/kernel_tuner/runners/simulation.py +++ b/kernel_tuner/runners/simulation.py @@ -14,11 +14,11 @@ class SimulationDevice(_SimulationDevice): @property def name(self): - return self.env['device_name'] + return self.env["device_name"] @name.setter def name(self, value): - self.env['device_name'] = value + self.env["device_name"] = value if not self.quiet: print("Simulating: " + value) @@ -38,12 +38,10 @@ def __init__(self, 
kernel_source, kernel_options, device_options, iterations, ob :param kernel_options: A dictionary with all options for the kernel. :type kernel_options: kernel_tuner.interface.Options - :param device_options: A dictionary with all options for the device - on which the kernel should be tuned. + :param device_options: A dictionary with all options for the device on which the kernel should be tuned. :type device_options: kernel_tuner.interface.Options - :param iterations: The number of iterations used for benchmarking - each kernel instance. + :param iterations: The number of iterations used for benchmarking each kernel instance. :type iterations: int """ self.quiet = device_options.quiet @@ -70,21 +68,18 @@ def run(self, parameter_space, tuning_options): :param parameter_space: The parameter space as an iterable. :type parameter_space: iterable - :param tuning_options: A dictionary with all options regarding the tuning - process. + :param tuning_options: A dictionary with all options regarding the tuning process. :type tuning_options: kernel_tuner.iterface.Options - :returns: A list of dictionaries for executed kernel configurations and their - execution times. + :returns: A list of dictionaries for executed kernel configurations and their execution times. :rtype: dict() """ - logging.debug('simulation runner started for ' + self.kernel_options.kernel_name) + logging.debug("simulation runner started for " + self.kernel_options.kernel_name) results = [] # iterate over parameter space for element in parameter_space: - # check if element is in the cache x_int = ",".join([str(i) for i in element]) if tuning_options.cache and x_int in tuning_options.cache: @@ -98,21 +93,22 @@ def run(self, parameter_space, tuning_options): # configuration is already counted towards the unique_results. # It is the responsibility of cost_func to add configs to unique_results. 
if x_int in tuning_options.unique_results: - - result['compile_time'] = 0 - result['verification_time'] = 0 - result['benchmark_time'] = 0 + result["compile_time"] = 0 + result["verification_time"] = 0 + result["benchmark_time"] = 0 else: # configuration is evaluated for the first time, print to the console - util.print_config_output(tuning_options.tune_params, result, self.quiet, tuning_options.metrics, self.units) + util.print_config_output( + tuning_options.tune_params, result, self.quiet, tuning_options.metrics, self.units + ) # Everything but the strategy time and framework time are simulated, # self.last_strategy_time is set by cost_func - result['strategy_time'] = self.last_strategy_time + result["strategy_time"] = self.last_strategy_time try: - simulated_time = result['compile_time'] + result['verification_time'] + result['benchmark_time'] + simulated_time = result["compile_time"] + result["verification_time"] + result["benchmark_time"] tuning_options.simulated_time += simulated_time except KeyError: if "time_limit" in tuning_options: @@ -122,13 +118,15 @@ def run(self, parameter_space, tuning_options): total_time = 1000 * (perf_counter() - self.start_time) self.start_time = perf_counter() - result['framework_time'] = total_time - self.last_strategy_time + result["framework_time"] = total_time - self.last_strategy_time results.append(result) continue # if the element is not in the cache, raise an error - check = util.check_restrictions(tuning_options.restrictions, dict(zip(tuning_options['tune_params'].keys(), element)), True) + check = util.check_restrictions( + tuning_options.restrictions, dict(zip(tuning_options["tune_params"].keys(), element)), True + ) err_string = f"kernel configuration {element} not in cache, does {'' if check else 'not '}pass extra restriction check ({check})" logging.debug(err_string) raise ValueError(f"{err_string} - in simulation mode, all configurations must be present in the cache") From a2328c4c5eb5abdfc99219dcda53615f2c7ed42f Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 5 Jun 2025 11:12:02 +0200 Subject: [PATCH 4/9] First early draft of the parallel runner. 
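The runner splits the parameter space into contiguous slices, one per Ray task; the last task also absorbs the remainder when the space does not divide evenly. A standalone sketch of that partitioning (the names task_id, num_tasks and parameter_space mirror this draft and are illustrative only, not part of the public interface):

    def task_slice(task_id, num_tasks, parameter_space):
        # contiguous block per task; the final task absorbs the remainder
        elements_per_task = len(parameter_space) // num_tasks
        first = task_id * elements_per_task
        last = (task_id + 1) * elements_per_task if task_id + 1 < num_tasks else len(parameter_space)
        return parameter_space[first:last]

    # example: 10 configurations over 3 tasks -> slices of 3, 3 and 4 elements
    space = list(range(10))
    print([len(task_slice(t, 3, space)) for t in range(3)])  # [3, 3, 4]
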
--- examples/cuda/vector_add_parallel.py | 35 ++++++ kernel_tuner/interface.py | 14 ++- kernel_tuner/runners/parallel.py | 166 +++++++++++++++++++++++++++ 3 files changed, 214 insertions(+), 1 deletion(-) create mode 100644 examples/cuda/vector_add_parallel.py create mode 100644 kernel_tuner/runners/parallel.py diff --git a/examples/cuda/vector_add_parallel.py b/examples/cuda/vector_add_parallel.py new file mode 100644 index 000000000..d1c112aa5 --- /dev/null +++ b/examples/cuda/vector_add_parallel.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python + +import numpy +from kernel_tuner import tune_kernel + + +def tune(): + kernel_string = """ + __global__ void vector_add(float *c, float *a, float *b, int n) { + int i = (blockIdx.x * block_size_x) + threadIdx.x; + if ( i < n ) { + c[i] = a[i] + b[i]; + } + } + """ + + size = 10000000 + + a = numpy.random.randn(size).astype(numpy.float32) + b = numpy.random.randn(size).astype(numpy.float32) + c = numpy.zeros_like(b) + n = numpy.int32(size) + + args = [c, a, b, n] + + tune_params = dict() + tune_params["block_size_x"] = [32 * i for i in range(1, 33)] + + results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, parallel_runner=4) + + return results + + +if __name__ == "__main__": + tune() diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index bd421aeab..ed7b56487 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -32,6 +32,7 @@ import kernel_tuner.core as core import kernel_tuner.util as util from kernel_tuner.integration import get_objective_defaults +from kernel_tuner.runners.parallel import ParallelRunner from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.simulation import SimulationRunner from kernel_tuner.searchspace import Searchspace @@ -463,6 +464,7 @@ def __deepcopy__(self, _): ), ("metrics", ("specifies user-defined metrics, please see :ref:`metrics`.", "dict")), ("simulation_mode", ("Simulate an auto-tuning search from an existing cachefile", "bool")), + ("parallel_runner", ("If the value is larger than 1 use that number as the number of parallel runners doing the tuning", "int")), ("observers", ("""A list of Observers to use during tuning, please see :ref:`observers`.""", "list")), ] ) @@ -574,6 +576,7 @@ def tune_kernel( cache=None, metrics=None, simulation_mode=False, + parallel_runner=1, observers=None, objective=None, objective_higher_is_better=None, @@ -600,6 +603,8 @@ def tune_kernel( if iterations < 1: raise ValueError("Iterations should be at least one!") + if parallel_runner < 1: + logging.warning("The number of parallel runners should be at least one!") # sort all the options into separate dicts opts = locals() @@ -650,7 +655,14 @@ def tune_kernel( strategy = brute_force # select the runner for this job based on input - selected_runner = SimulationRunner if simulation_mode else SequentialRunner + # TODO: we could use the "match case" syntax when removing support for 3.9 + if simulation_mode: + selected_runner = SimulationRunner + elif parallel_runner > 1: + selected_runner = ParallelRunner + tuning_options.parallel_runner = parallel_runner + else: + selected_runner = SequentialRunner tuning_options.simulated_time = 0 runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py new file mode 100644 index 000000000..a4362b0eb --- /dev/null +++ b/kernel_tuner/runners/parallel.py @@ -0,0 +1,166 @@ +"""A specialized runner 
that tunes in parallel the parameter space.""" +import logging +from time import perf_counter +from datetime import datetime, timezone + +from ray import remote, get, put + +from kernel_tuner.runners.runner import Runner +from kernel_tuner.core import DeviceInterface +from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache + + +class ParallelRunnerState: + """This class represents the state of a parallel tuning run.""" + + def __init__(self, observers, iterations): + self.device_options = None + self.quiet = False + self.kernel_source = None + self.warmed_up = False + self.simulation_mode = False + self.start_time = None + self.last_strategy_start_time = None + self.last_strategy_time = 0 + self.kernel_options = None + self.observers = observers + self.iterations = iterations + + +@remote +def parallel_run(task_id: int, state: ParallelRunnerState, parameter_space, tuning_options): + dev = DeviceInterface( + state.kernel_source, iterations=state.iterations, observers=state.observers, **state.device_options + ) + # move data to the GPU + gpu_args = dev.ready_argument_list(state.kernel_options.arguments) + # iterate over parameter space + results = [] + elements_per_task = len(parameter_space) / tuning_options.parallel_runner + first_element = task_id * elements_per_task + last_element = ( + (task_id + 1) * elements_per_task if task_id + 1 < tuning_options.parallel_runner else len(parameter_space) + ) + for element in parameter_space[first_element:last_element]: + params = dict(zip(tuning_options.tune_params.keys(), element)) + + result = None + warmup_time = 0 + + # check if configuration is in the cache + x_int = ",".join([str(i) for i in element]) + if tuning_options.cache and x_int in tuning_options.cache: + params.update(tuning_options.cache[x_int]) + params["compile_time"] = 0 + params["verification_time"] = 0 + params["benchmark_time"] = 0 + else: + # attempt to warm up the GPU by running the first config in the parameter space and ignoring the result + if not state.warmed_up: + warmup_time = perf_counter() + dev.compile_and_benchmark(state.kernel_source, gpu_args, params, state.kernel_options, tuning_options) + state.warmed_up = True + warmup_time = 1e3 * (perf_counter() - warmup_time) + + result = dev.compile_and_benchmark( + state.kernel_source, gpu_args, params, state.kernel_options, tuning_options + ) + + params.update(result) + + if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): + logging.debug("kernel configuration was skipped silently due to compile or runtime failure") + + # only compute metrics on configs that have not errored + if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): + params = process_metrics(params, tuning_options.metrics) + + # get the framework time by estimating based on other times + total_time = 1000 * ((perf_counter() - state.start_time) - warmup_time) + params["strategy_time"] = state.last_strategy_time + params["framework_time"] = max( + total_time + - ( + params["compile_time"] + + params["verification_time"] + + params["benchmark_time"] + + params["strategy_time"] + ), + 0, + ) + params["timestamp"] = str(datetime.now(timezone.utc)) + state.start_time = perf_counter() + + if result: + # print configuration to the console + print_config_output(tuning_options.tune_params, params, state.quiet, tuning_options.metrics, dev.units) + + # add configuration to cache + store_cache(x_int, params, tuning_options) + + # all visited 
configurations are added to results to provide a trace for optimization strategies + results.append(params) + + return results + + +class ParallelRunner(Runner): + """ParallelRunner is used to distribute configurations across multiple nodes.""" + + def __init__(self, kernel_source, kernel_options, device_options, iterations, observers): + """Instantiate the ParallelRunner. + + :param kernel_source: The kernel source + :type kernel_source: kernel_tuner.core.KernelSource + + :param kernel_options: A dictionary with all options for the kernel. + :type kernel_options: kernel_tuner.interface.Options + + :param device_options: A dictionary with all options for the device + on which the kernel should be tuned. + :type device_options: kernel_tuner.interface.Options + + :param iterations: The number of iterations used for benchmarking + each kernel instance. + :type iterations: int + """ + self.state = ParallelRunnerState(observers, iterations) + self.state.quiet = device_options.quiet + self.state.kernel_source = kernel_source + self.state.warmed_up = False + self.state.simulation_mode = False + self.state.start_time = perf_counter() + self.state.last_strategy_start_time = self.state.start_time + self.state.last_strategy_time = 0 + self.state.kernel_options = kernel_options + + def get_environment(self, tuning_options): + # TODO: we are going to fix this one later + return None + + def run(self, parameter_space, tuning_options): + """Iterate through the entire parameter space using a single Python process. + + :param parameter_space: The parameter space as an iterable. + :type parameter_space: iterable + + :param tuning_options: A dictionary with all options regarding the tuning process. + :type tuning_options: kernel_tuner.interface.Options + + :returns: A list of dictionaries for executed kernel configurations and their execution times. + :rtype: dict() + """ + # given the parameter_space, distribute it over Ray tasks + logging.debug("parallel runner started for " + self.state.kernel_options.kernel_name) + + results = [] + tasks = [] + parameter_space_ref = put(parameter_space) + state_ref = put(self.state) + tuning_options_ref = put(tuning_options) + for task_id in range(0, tuning_options.parallel_runner): + tasks.append(parallel_run.remote(task_id, state_ref, parameter_space_ref, tuning_options_ref)) + for task in tasks: + results.append(get(task)) + + return results From 68a569ba8849673cde3735824e3fd0a6a5c4e3ef Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 5 Jun 2025 11:36:35 +0200 Subject: [PATCH 5/9] Need a dummy DeviceInterface even on the master. --- kernel_tuner/runners/parallel.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index a4362b0eb..e8d759ecd 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -133,10 +133,12 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.state.last_strategy_start_time = self.state.start_time self.state.last_strategy_time = 0 self.state.kernel_options = kernel_options + # define a dummy device interface + self.dev = DeviceInterface(kernel_source) def get_environment(self, tuning_options): - # TODO: we are going to fix this one later - return None + # dummy environment + return self.dev.get_environment() def run(self, parameter_space, tuning_options): """Iterate through the entire parameter space using a single Python process. 
From 9d0dee4a4870ef7d7694ae363855c0cd5ca237ef Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 5 Jun 2025 11:41:59 +0200 Subject: [PATCH 6/9] Missing device_options in state. --- kernel_tuner/runners/parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index e8d759ecd..5b501d9c5 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -125,6 +125,7 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob :type iterations: int """ self.state = ParallelRunnerState(observers, iterations) + self.state.device_options = device_options self.state.quiet = device_options.quiet self.state.kernel_source = kernel_source self.state.warmed_up = False From aff21f035dd68b19e6b0de8c64c75efee3e01a4d Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 5 Jun 2025 15:12:08 +0200 Subject: [PATCH 7/9] Flatten the results. --- kernel_tuner/runners/parallel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 5b501d9c5..2d20bd4bc 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -2,6 +2,7 @@ import logging from time import perf_counter from datetime import datetime, timezone +from itertools import chain from ray import remote, get, put @@ -166,4 +167,4 @@ def run(self, parameter_space, tuning_options): for task in tasks: results.append(get(task)) - return results + return [chain.from_iterable(results)] From d7e8cae2778b7aad7408b76bbfe313aa69f05841 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 5 Jun 2025 15:18:59 +0200 Subject: [PATCH 8/9] Various bug fixes. --- kernel_tuner/runners/parallel.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 2d20bd4bc..f454d0686 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -28,7 +28,7 @@ def __init__(self, observers, iterations): self.iterations = iterations -@remote +@remote(num_cpus=1, num_gpus=1) def parallel_run(task_id: int, state: ParallelRunnerState, parameter_space, tuning_options): dev = DeviceInterface( state.kernel_source, iterations=state.iterations, observers=state.observers, **state.device_options @@ -37,9 +37,9 @@ def parallel_run(task_id: int, state: ParallelRunnerState, parameter_space, tuni gpu_args = dev.ready_argument_list(state.kernel_options.arguments) # iterate over parameter space results = [] - elements_per_task = len(parameter_space) / tuning_options.parallel_runner - first_element = task_id * elements_per_task - last_element = ( + elements_per_task = int(len(parameter_space) / tuning_options.parallel_runner) + first_element = int(task_id * elements_per_task) + last_element = int( (task_id + 1) * elements_per_task if task_id + 1 < tuning_options.parallel_runner else len(parameter_space) ) for element in parameter_space[first_element:last_element]: @@ -167,4 +167,4 @@ def run(self, parameter_space, tuning_options): for task in tasks: results.append(get(task)) - return [chain.from_iterable(results)] + return list(chain.from_iterable(results)) From b4ff7fa49574c8e277a71f70c289102e73243388 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 6 Jun 2025 11:21:07 +0200 Subject: [PATCH 9/9] Add another example for the parallel runner. 
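The separable convolution search space is far larger than the vector add example, which is what makes it an interesting test case for the parallel runner. A rough way to gauge how many configurations the runners have to divide, before restrictions prune anything (the parameter lists mirror the row pass of the example below and are illustrative only):

    from math import prod

    tune_params = {
        "filter_height": [1],                            # row pass: height fixed to 1
        "filter_width": list(range(3, 19, 2)),           # 8 values
        "block_size_x": [16 * i for i in range(1, 65)],  # 64 values
        "block_size_y": [2**i for i in range(6)],        # 6 values
        "tile_size_x": list(range(1, 11)),               # 10 values
        "tile_size_y": [1],                              # row pass: tiling in y fixed to 1
        "use_padding": [0, 1],
        "read_only": [0, 1],
    }

    # upper bound on the number of configurations, before restrictions are applied
    print(prod(len(values) for values in tune_params.values()))  # 122880
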
--- examples/cuda/sepconv_parallel.py | 88 +++++++++++++++++++++++++++++++ kernel_tuner/runners/parallel.py | 5 +- 2 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 examples/cuda/sepconv_parallel.py diff --git a/examples/cuda/sepconv_parallel.py b/examples/cuda/sepconv_parallel.py new file mode 100644 index 000000000..074200e1b --- /dev/null +++ b/examples/cuda/sepconv_parallel.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +import numpy +from kernel_tuner import tune_kernel +from collections import OrderedDict + + +def tune(): + with open("convolution.cu", "r") as f: + kernel_string = f.read() + + # setup tunable parameters + tune_params = OrderedDict() + tune_params["filter_height"] = [i for i in range(3, 19, 2)] + tune_params["filter_width"] = [i for i in range(3, 19, 2)] + tune_params["block_size_x"] = [16 * i for i in range(1, 65)] + tune_params["block_size_y"] = [2**i for i in range(6)] + tune_params["tile_size_x"] = [i for i in range(1, 11)] + tune_params["tile_size_y"] = [i for i in range(1, 11)] + + tune_params["use_padding"] = [0, 1] # toggle the insertion of padding in shared memory + tune_params["read_only"] = [0, 1] # toggle using the read-only cache + + # limit the search to only use padding when its effective, and at least 32 threads in a block + restrict = ["use_padding==0 or (block_size_x % 32 != 0)", "block_size_x*block_size_y >= 32"] + + # setup input and output dimensions + problem_size = (4096, 4096) + size = numpy.prod(problem_size) + largest_fh = max(tune_params["filter_height"]) + largest_fw = max(tune_params["filter_width"]) + input_size = (problem_size[0] + largest_fw - 1) * (problem_size[1] + largest_fh - 1) + + # create input data + output_image = numpy.zeros(size).astype(numpy.float32) + input_image = numpy.random.randn(input_size).astype(numpy.float32) + filter_weights = numpy.random.randn(largest_fh * largest_fw).astype(numpy.float32) + + # setup kernel arguments + cmem_args = {"d_filter": filter_weights} + args = [output_image, input_image, filter_weights] + + # tell the Kernel Tuner how to compute grid dimensions + grid_div_x = ["block_size_x", "tile_size_x"] + grid_div_y = ["block_size_y", "tile_size_y"] + + # start tuning separable convolution (row) + tune_params["filter_height"] = [1] + tune_params["tile_size_y"] = [1] + results_row = tune_kernel( + "convolution_kernel", + kernel_string, + problem_size, + args, + tune_params, + grid_div_y=grid_div_y, + grid_div_x=grid_div_x, + cmem_args=cmem_args, + verbose=False, + restrictions=restrict, + parallel_runner=1024, + cache="convolution_kernel_row", + ) + + # start tuning separable convolution (col) + tune_params["filter_height"] = tune_params["filter_width"][:] + tune_params["file_size_y"] = tune_params["tile_size_x"][:] + tune_params["filter_width"] = [1] + tune_params["tile_size_x"] = [1] + results_col = tune_kernel( + "convolution_kernel", + kernel_string, + problem_size, + args, + tune_params, + grid_div_y=grid_div_y, + grid_div_x=grid_div_x, + cmem_args=cmem_args, + verbose=False, + restrictions=restrict, + parallel_runner=1024, + cache="convolution_kernel_col", + ) + + return results_row, results_col + + +if __name__ == "__main__": + results_row, results_col = tune() diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index f454d0686..e689096f9 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -135,7 +135,10 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob 
self.state.last_strategy_start_time = self.state.start_time self.state.last_strategy_time = 0 self.state.kernel_options = kernel_options - # define a dummy device interface + # fields used directly by strategies + self.last_strategy_time = perf_counter() + self.state.last_strategy_start_time = self.last_strategy_time + # define a dummy device interface on the master node self.dev = DeviceInterface(kernel_source) def get_environment(self, tuning_options):
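
Taken together, the series has the master node build a dummy DeviceInterface for get_environment, ship the shared state, parameter space and tuning options to the Ray object store once, launch one task per requested runner (each task claims one CPU and one GPU), and flatten the per-task result lists into a single list of records. A condensed sketch of that dispatch loop, assuming Ray has been initialised and parallel_run is the remote function from this series (error handling omitted):

    from itertools import chain
    from ray import get, put

    def dispatch(parallel_run, state, parameter_space, tuning_options):
        # ship shared data to the Ray object store once instead of per task
        state_ref = put(state)
        parameter_space_ref = put(parameter_space)
        tuning_options_ref = put(tuning_options)

        # one task per requested runner; each benchmarks its own slice of the space
        tasks = [
            parallel_run.remote(task_id, state_ref, parameter_space_ref, tuning_options_ref)
            for task_id in range(tuning_options.parallel_runner)
        ]

        # gather the per-task result lists and flatten them into one list
        return list(chain.from_iterable(get(task) for task in tasks))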