From 37c5338938e2514911815bbbbd8aa6c9272f3f3c Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 23 May 2025 11:12:20 +0200 Subject: [PATCH 1/9] Typo. --- kernel_tuner/runners/sequential.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index aeebd5116..194eb0545 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -55,7 +55,7 @@ def run(self, parameter_space, tuning_options): :param tuning_options: A dictionary with all options regarding the tuning process. - :type tuning_options: kernel_tuner.iterface.Options + :type tuning_options: kernel_tuner.interface.Options :returns: A list of dictionaries for executed kernel configurations and their execution times. From 6c5b360cad115d840852f618f6ffda9cf89addbd Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 23 May 2025 11:12:43 +0200 Subject: [PATCH 2/9] Add missing parameter to the interface. --- kernel_tuner/runners/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/runners/runner.py b/kernel_tuner/runners/runner.py index 80ab32146..8c4de22d7 100644 --- a/kernel_tuner/runners/runner.py +++ b/kernel_tuner/runners/runner.py @@ -14,7 +14,7 @@ def __init__( pass @abstractmethod - def get_environment(self): + def get_environment(self, tuning_options): pass @abstractmethod From a21caf8398306cb9f7b136043ae97f4a084c2759 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 23 May 2025 11:53:57 +0200 Subject: [PATCH 3/9] Formatting. --- kernel_tuner/runners/sequential.py | 53 +++++++++++++++++------------- kernel_tuner/runners/simulation.py | 40 +++++++++++----------- 2 files changed, 50 insertions(+), 43 deletions(-) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index 194eb0545..eeeedbd29 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -20,15 +20,13 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob :param kernel_options: A dictionary with all options for the kernel. :type kernel_options: kernel_tuner.interface.Options - :param device_options: A dictionary with all options for the device - on which the kernel should be tuned. + :param device_options: A dictionary with all options for the device on which the kernel should be tuned. :type device_options: kernel_tuner.interface.Options - :param iterations: The number of iterations used for benchmarking - each kernel instance. + :param iterations: The number of iterations used for benchmarking each kernel instance. :type iterations: int """ - #detect language and create high-level device interface + # detect language and create high-level device interface self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) self.units = self.dev.units @@ -41,7 +39,7 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.last_strategy_time = 0 self.kernel_options = kernel_options - #move data to the GPU + # move data to the GPU self.gpu_args = self.dev.ready_argument_list(kernel_options.arguments) def get_environment(self, tuning_options): @@ -53,16 +51,14 @@ def run(self, parameter_space, tuning_options): :param parameter_space: The parameter space as an iterable. :type parameter_space: iterable - :param tuning_options: A dictionary with all options regarding the tuning - process. 
+ :param tuning_options: A dictionary with all options regarding the tuning process. :type tuning_options: kernel_tuner.interface.Options - :returns: A list of dictionaries for executed kernel configurations and their - execution times. - :rtype: dict()) + :returns: A list of dictionaries for executed kernel configurations and their execution times. + :rtype: dict() """ - logging.debug('sequential runner started for ' + self.kernel_options.kernel_name) + logging.debug("sequential runner started for " + self.kernel_options.kernel_name) results = [] @@ -77,33 +73,46 @@ def run(self, parameter_space, tuning_options): x_int = ",".join([str(i) for i in element]) if tuning_options.cache and x_int in tuning_options.cache: params.update(tuning_options.cache[x_int]) - params['compile_time'] = 0 - params['verification_time'] = 0 - params['benchmark_time'] = 0 + params["compile_time"] = 0 + params["verification_time"] = 0 + params["benchmark_time"] = 0 else: # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result if not self.warmed_up: warmup_time = perf_counter() - self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + self.dev.compile_and_benchmark( + self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options + ) self.warmed_up = True warmup_time = 1e3 * (perf_counter() - warmup_time) - result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + result = self.dev.compile_and_benchmark( + self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options + ) params.update(result) if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): - logging.debug('kernel configuration was skipped silently due to compile or runtime failure') + logging.debug("kernel configuration was skipped silently due to compile or runtime failure") # only compute metrics on configs that have not errored if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): params = process_metrics(params, tuning_options.metrics) # get the framework time by estimating based on other times - total_time = 1000 * ((perf_counter() - self.start_time) - warmup_time) - params['strategy_time'] = self.last_strategy_time - params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0) - params['timestamp'] = str(datetime.now(timezone.utc)) + total_time = 1000 * ((perf_counter() - self.start_time) - warmup_time) + params["strategy_time"] = self.last_strategy_time + params["framework_time"] = max( + total_time + - ( + params["compile_time"] + + params["verification_time"] + + params["benchmark_time"] + + params["strategy_time"] + ), + 0, + ) + params["timestamp"] = str(datetime.now(timezone.utc)) self.start_time = perf_counter() if result: diff --git a/kernel_tuner/runners/simulation.py b/kernel_tuner/runners/simulation.py index 22c7c667c..7a167bfcf 100644 --- a/kernel_tuner/runners/simulation.py +++ b/kernel_tuner/runners/simulation.py @@ -14,11 +14,11 @@ class SimulationDevice(_SimulationDevice): @property def name(self): - return self.env['device_name'] + return self.env["device_name"] @name.setter def name(self, value): - self.env['device_name'] = value + self.env["device_name"] = value if not self.quiet: print("Simulating: " + value) @@ -38,12 +38,10 @@ def __init__(self, 
kernel_source, kernel_options, device_options, iterations, ob :param kernel_options: A dictionary with all options for the kernel. :type kernel_options: kernel_tuner.interface.Options - :param device_options: A dictionary with all options for the device - on which the kernel should be tuned. + :param device_options: A dictionary with all options for the device on which the kernel should be tuned. :type device_options: kernel_tuner.interface.Options - :param iterations: The number of iterations used for benchmarking - each kernel instance. + :param iterations: The number of iterations used for benchmarking each kernel instance. :type iterations: int """ self.quiet = device_options.quiet @@ -70,21 +68,18 @@ def run(self, parameter_space, tuning_options): :param parameter_space: The parameter space as an iterable. :type parameter_space: iterable - :param tuning_options: A dictionary with all options regarding the tuning - process. + :param tuning_options: A dictionary with all options regarding the tuning process. :type tuning_options: kernel_tuner.iterface.Options - :returns: A list of dictionaries for executed kernel configurations and their - execution times. + :returns: A list of dictionaries for executed kernel configurations and their execution times. :rtype: dict() """ - logging.debug('simulation runner started for ' + self.kernel_options.kernel_name) + logging.debug("simulation runner started for " + self.kernel_options.kernel_name) results = [] # iterate over parameter space for element in parameter_space: - # check if element is in the cache x_int = ",".join([str(i) for i in element]) if tuning_options.cache and x_int in tuning_options.cache: @@ -98,21 +93,22 @@ def run(self, parameter_space, tuning_options): # configuration is already counted towards the unique_results. # It is the responsibility of cost_func to add configs to unique_results. 
if x_int in tuning_options.unique_results: - - result['compile_time'] = 0 - result['verification_time'] = 0 - result['benchmark_time'] = 0 + result["compile_time"] = 0 + result["verification_time"] = 0 + result["benchmark_time"] = 0 else: # configuration is evaluated for the first time, print to the console - util.print_config_output(tuning_options.tune_params, result, self.quiet, tuning_options.metrics, self.units) + util.print_config_output( + tuning_options.tune_params, result, self.quiet, tuning_options.metrics, self.units + ) # Everything but the strategy time and framework time are simulated, # self.last_strategy_time is set by cost_func - result['strategy_time'] = self.last_strategy_time + result["strategy_time"] = self.last_strategy_time try: - simulated_time = result['compile_time'] + result['verification_time'] + result['benchmark_time'] + simulated_time = result["compile_time"] + result["verification_time"] + result["benchmark_time"] tuning_options.simulated_time += simulated_time except KeyError: if "time_limit" in tuning_options: @@ -122,13 +118,15 @@ def run(self, parameter_space, tuning_options): total_time = 1000 * (perf_counter() - self.start_time) self.start_time = perf_counter() - result['framework_time'] = total_time - self.last_strategy_time + result["framework_time"] = total_time - self.last_strategy_time results.append(result) continue # if the element is not in the cache, raise an error - check = util.check_restrictions(tuning_options.restrictions, dict(zip(tuning_options['tune_params'].keys(), element)), True) + check = util.check_restrictions( + tuning_options.restrictions, dict(zip(tuning_options["tune_params"].keys(), element)), True + ) err_string = f"kernel configuration {element} not in cache, does {'' if check else 'not '}pass extra restriction check ({check})" logging.debug(err_string) raise ValueError(f"{err_string} - in simulation mode, all configurations must be present in the cache") From a2328c4c5eb5abdfc99219dcda53615f2c7ed42f Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 5 Jun 2025 11:12:02 +0200 Subject: [PATCH 4/9] First early draft of the parallel runner. 
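The runner splits the parameter space into contiguous slices, one per Ray task; the last task also absorbs the remainder when the space does not divide evenly. A standalone sketch of that partitioning (the names task_id, num_tasks and parameter_space mirror this draft and are illustrative only, not part of the public interface):

    def task_slice(task_id, num_tasks, parameter_space):
        # contiguous block per task; the final task absorbs the remainder
        elements_per_task = len(parameter_space) // num_tasks
        first = task_id * elements_per_task
        last = (task_id + 1) * elements_per_task if task_id + 1 < num_tasks else len(parameter_space)
        return parameter_space[first:last]

    # example: 10 configurations over 3 tasks -> slices of 3, 3 and 4 elements
    space = list(range(10))
    print([len(task_slice(t, 3, space)) for t in range(3)])  # [3, 3, 4]
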
--- examples/cuda/vector_add_parallel.py | 35 ++++++ kernel_tuner/interface.py | 14 ++- kernel_tuner/runners/parallel.py | 166 +++++++++++++++++++++++++++ 3 files changed, 214 insertions(+), 1 deletion(-) create mode 100644 examples/cuda/vector_add_parallel.py create mode 100644 kernel_tuner/runners/parallel.py diff --git a/examples/cuda/vector_add_parallel.py b/examples/cuda/vector_add_parallel.py new file mode 100644 index 000000000..d1c112aa5 --- /dev/null +++ b/examples/cuda/vector_add_parallel.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python + +import numpy +from kernel_tuner import tune_kernel + + +def tune(): + kernel_string = """ + __global__ void vector_add(float *c, float *a, float *b, int n) { + int i = (blockIdx.x * block_size_x) + threadIdx.x; + if ( i < n ) { + c[i] = a[i] + b[i]; + } + } + """ + + size = 10000000 + + a = numpy.random.randn(size).astype(numpy.float32) + b = numpy.random.randn(size).astype(numpy.float32) + c = numpy.zeros_like(b) + n = numpy.int32(size) + + args = [c, a, b, n] + + tune_params = dict() + tune_params["block_size_x"] = [32 * i for i in range(1, 33)] + + results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, parallel_runner=4) + + return results + + +if __name__ == "__main__": + tune() diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index bd421aeab..ed7b56487 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -32,6 +32,7 @@ import kernel_tuner.core as core import kernel_tuner.util as util from kernel_tuner.integration import get_objective_defaults +from kernel_tuner.runners.parallel import ParallelRunner from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.simulation import SimulationRunner from kernel_tuner.searchspace import Searchspace @@ -463,6 +464,7 @@ def __deepcopy__(self, _): ), ("metrics", ("specifies user-defined metrics, please see :ref:`metrics`.", "dict")), ("simulation_mode", ("Simulate an auto-tuning search from an existing cachefile", "bool")), + ("parallel_runner", ("If the value is larger than 1 use that number as the number of parallel runners doing the tuning", "int")), ("observers", ("""A list of Observers to use during tuning, please see :ref:`observers`.""", "list")), ] ) @@ -574,6 +576,7 @@ def tune_kernel( cache=None, metrics=None, simulation_mode=False, + parallel_runner=1, observers=None, objective=None, objective_higher_is_better=None, @@ -600,6 +603,8 @@ def tune_kernel( if iterations < 1: raise ValueError("Iterations should be at least one!") + if parallel_runner < 1: + logging.warning("The number of parallel runners should be at least one!") # sort all the options into separate dicts opts = locals() @@ -650,7 +655,14 @@ def tune_kernel( strategy = brute_force # select the runner for this job based on input - selected_runner = SimulationRunner if simulation_mode else SequentialRunner + # TODO: we could use the "match case" syntax when removing support for 3.9 + if simulation_mode: + selected_runner = SimulationRunner + elif parallel_runner > 1: + selected_runner = ParallelRunner + tuning_options.parallel_runner = parallel_runner + else: + selected_runner = SequentialRunner tuning_options.simulated_time = 0 runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py new file mode 100644 index 000000000..a4362b0eb --- /dev/null +++ b/kernel_tuner/runners/parallel.py @@ -0,0 +1,166 @@ +"""A specialized runner 
that tunes in parallel the parameter space.""" +import logging +from time import perf_counter +from datetime import datetime, timezone + +from ray import remote, get, put + +from kernel_tuner.runners.runner import Runner +from kernel_tuner.core import DeviceInterface +from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache + + +class ParallelRunnerState: + """This class represents the state of a parallel tuning run.""" + + def __init__(self, observers, iterations): + self.device_options = None + self.quiet = False + self.kernel_source = None + self.warmed_up = False + self.simulation_mode = False + self.start_time = None + self.last_strategy_start_time = None + self.last_strategy_time = 0 + self.kernel_options = None + self.observers = observers + self.iterations = iterations + + +@remote +def parallel_run(task_id: int, state: ParallelRunnerState, parameter_space, tuning_options): + dev = DeviceInterface( + state.kernel_source, iterations=state.iterations, observers=state.observers, **state.device_options + ) + # move data to the GPU + gpu_args = dev.ready_argument_list(state.kernel_options.arguments) + # iterate over parameter space + results = [] + elements_per_task = len(parameter_space) / tuning_options.parallel_runner + first_element = task_id * elements_per_task + last_element = ( + (task_id + 1) * elements_per_task if task_id + 1 < tuning_options.parallel_runner else len(parameter_space) + ) + for element in parameter_space[first_element:last_element]: + params = dict(zip(tuning_options.tune_params.keys(), element)) + + result = None + warmup_time = 0 + + # check if configuration is in the cache + x_int = ",".join([str(i) for i in element]) + if tuning_options.cache and x_int in tuning_options.cache: + params.update(tuning_options.cache[x_int]) + params["compile_time"] = 0 + params["verification_time"] = 0 + params["benchmark_time"] = 0 + else: + # attempt to warm up the GPU by running the first config in the parameter space and ignoring the result + if not state.warmed_up: + warmup_time = perf_counter() + dev.compile_and_benchmark(state.kernel_source, gpu_args, params, state.kernel_options, tuning_options) + state.warmed_up = True + warmup_time = 1e3 * (perf_counter() - warmup_time) + + result = dev.compile_and_benchmark( + state.kernel_source, gpu_args, params, state.kernel_options, tuning_options + ) + + params.update(result) + + if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): + logging.debug("kernel configuration was skipped silently due to compile or runtime failure") + + # only compute metrics on configs that have not errored + if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): + params = process_metrics(params, tuning_options.metrics) + + # get the framework time by estimating based on other times + total_time = 1000 * ((perf_counter() - state.start_time) - warmup_time) + params["strategy_time"] = state.last_strategy_time + params["framework_time"] = max( + total_time + - ( + params["compile_time"] + + params["verification_time"] + + params["benchmark_time"] + + params["strategy_time"] + ), + 0, + ) + params["timestamp"] = str(datetime.now(timezone.utc)) + state.start_time = perf_counter() + + if result: + # print configuration to the console + print_config_output(tuning_options.tune_params, params, state.quiet, tuning_options.metrics, dev.units) + + # add configuration to cache + store_cache(x_int, params, tuning_options) + + # all visited 
configurations are added to results to provide a trace for optimization strategies + results.append(params) + + return results + + +class ParallelRunner(Runner): + """ParallelRunner is used to distribute configurations across multiple nodes.""" + + def __init__(self, kernel_source, kernel_options, device_options, iterations, observers): + """Instantiate the ParallelRunner. + + :param kernel_source: The kernel source + :type kernel_source: kernel_tuner.core.KernelSource + + :param kernel_options: A dictionary with all options for the kernel. + :type kernel_options: kernel_tuner.interface.Options + + :param device_options: A dictionary with all options for the device + on which the kernel should be tuned. + :type device_options: kernel_tuner.interface.Options + + :param iterations: The number of iterations used for benchmarking + each kernel instance. + :type iterations: int + """ + self.state = ParallelRunnerState(observers, iterations) + self.state.quiet = device_options.quiet + self.state.kernel_source = kernel_source + self.state.warmed_up = False + self.state.simulation_mode = False + self.state.start_time = perf_counter() + self.state.last_strategy_start_time = self.state.start_time + self.state.last_strategy_time = 0 + self.state.kernel_options = kernel_options + + def get_environment(self, tuning_options): + # TODO: we are going to fix this one later + return None + + def run(self, parameter_space, tuning_options): + """Iterate through the entire parameter space using a single Python process. + + :param parameter_space: The parameter space as an iterable. + :type parameter_space: iterable + + :param tuning_options: A dictionary with all options regarding the tuning process. + :type tuning_options: kernel_tuner.interface.Options + + :returns: A list of dictionaries for executed kernel configurations and their execution times. + :rtype: dict() + """ + # given the parameter_space, distribute it over Ray tasks + logging.debug("parallel runner started for " + self.state.kernel_options.kernel_name) + + results = [] + tasks = [] + parameter_space_ref = put(parameter_space) + state_ref = put(self.state) + tuning_options_ref = put(tuning_options) + for task_id in range(0, tuning_options.parallel_runner): + tasks.append(parallel_run.remote(task_id, state_ref, parameter_space_ref, tuning_options_ref)) + for task in tasks: + results.append(get(task)) + + return results From 68a569ba8849673cde3735824e3fd0a6a5c4e3ef Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 5 Jun 2025 11:36:35 +0200 Subject: [PATCH 5/9] Need a dummy DeviceInterface even on the master. --- kernel_tuner/runners/parallel.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index a4362b0eb..e8d759ecd 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -133,10 +133,12 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.state.last_strategy_start_time = self.state.start_time self.state.last_strategy_time = 0 self.state.kernel_options = kernel_options + # define a dummy device interface + self.dev = DeviceInterface(kernel_source) def get_environment(self, tuning_options): - # TODO: we are going to fix this one later - return None + # dummy environment + return self.dev.get_environment() def run(self, parameter_space, tuning_options): """Iterate through the entire parameter space using a single Python process. 
From 9d0dee4a4870ef7d7694ae363855c0cd5ca237ef Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 5 Jun 2025 11:41:59 +0200 Subject: [PATCH 6/9] Missing device_options in state. --- kernel_tuner/runners/parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index e8d759ecd..5b501d9c5 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -125,6 +125,7 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob :type iterations: int """ self.state = ParallelRunnerState(observers, iterations) + self.state.device_options = device_options self.state.quiet = device_options.quiet self.state.kernel_source = kernel_source self.state.warmed_up = False From aff21f035dd68b19e6b0de8c64c75efee3e01a4d Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 5 Jun 2025 15:12:08 +0200 Subject: [PATCH 7/9] Flatten the results. --- kernel_tuner/runners/parallel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 5b501d9c5..2d20bd4bc 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -2,6 +2,7 @@ import logging from time import perf_counter from datetime import datetime, timezone +from itertools import chain from ray import remote, get, put @@ -166,4 +167,4 @@ def run(self, parameter_space, tuning_options): for task in tasks: results.append(get(task)) - return results + return [chain.from_iterable(results)] From d7e8cae2778b7aad7408b76bbfe313aa69f05841 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 5 Jun 2025 15:18:59 +0200 Subject: [PATCH 8/9] Various bug fixes. --- kernel_tuner/runners/parallel.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 2d20bd4bc..f454d0686 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -28,7 +28,7 @@ def __init__(self, observers, iterations): self.iterations = iterations -@remote +@remote(num_cpus=1, num_gpus=1) def parallel_run(task_id: int, state: ParallelRunnerState, parameter_space, tuning_options): dev = DeviceInterface( state.kernel_source, iterations=state.iterations, observers=state.observers, **state.device_options @@ -37,9 +37,9 @@ def parallel_run(task_id: int, state: ParallelRunnerState, parameter_space, tuni gpu_args = dev.ready_argument_list(state.kernel_options.arguments) # iterate over parameter space results = [] - elements_per_task = len(parameter_space) / tuning_options.parallel_runner - first_element = task_id * elements_per_task - last_element = ( + elements_per_task = int(len(parameter_space) / tuning_options.parallel_runner) + first_element = int(task_id * elements_per_task) + last_element = int( (task_id + 1) * elements_per_task if task_id + 1 < tuning_options.parallel_runner else len(parameter_space) ) for element in parameter_space[first_element:last_element]: @@ -167,4 +167,4 @@ def run(self, parameter_space, tuning_options): for task in tasks: results.append(get(task)) - return [chain.from_iterable(results)] + return list(chain.from_iterable(results)) From b4ff7fa49574c8e277a71f70c289102e73243388 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 6 Jun 2025 11:21:07 +0200 Subject: [PATCH 9/9] Add another example for the parallel runner. 
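The separable convolution search space is far larger than the vector add example, which is what makes it an interesting test case for the parallel runner. A rough way to gauge how many configurations the runners have to divide, before restrictions prune anything (the parameter lists mirror the row pass of the example below and are illustrative only):

    from math import prod

    tune_params = {
        "filter_height": [1],                            # row pass: height fixed to 1
        "filter_width": list(range(3, 19, 2)),           # 8 values
        "block_size_x": [16 * i for i in range(1, 65)],  # 64 values
        "block_size_y": [2**i for i in range(6)],        # 6 values
        "tile_size_x": list(range(1, 11)),               # 10 values
        "tile_size_y": [1],                              # row pass: tiling in y fixed to 1
        "use_padding": [0, 1],
        "read_only": [0, 1],
    }

    # upper bound on the number of configurations, before restrictions are applied
    print(prod(len(values) for values in tune_params.values()))  # 122880
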
--- examples/cuda/sepconv_parallel.py | 88 +++++++++++++++++++++++++++++++ kernel_tuner/runners/parallel.py | 5 +- 2 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 examples/cuda/sepconv_parallel.py diff --git a/examples/cuda/sepconv_parallel.py b/examples/cuda/sepconv_parallel.py new file mode 100644 index 000000000..074200e1b --- /dev/null +++ b/examples/cuda/sepconv_parallel.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +import numpy +from kernel_tuner import tune_kernel +from collections import OrderedDict + + +def tune(): + with open("convolution.cu", "r") as f: + kernel_string = f.read() + + # setup tunable parameters + tune_params = OrderedDict() + tune_params["filter_height"] = [i for i in range(3, 19, 2)] + tune_params["filter_width"] = [i for i in range(3, 19, 2)] + tune_params["block_size_x"] = [16 * i for i in range(1, 65)] + tune_params["block_size_y"] = [2**i for i in range(6)] + tune_params["tile_size_x"] = [i for i in range(1, 11)] + tune_params["tile_size_y"] = [i for i in range(1, 11)] + + tune_params["use_padding"] = [0, 1] # toggle the insertion of padding in shared memory + tune_params["read_only"] = [0, 1] # toggle using the read-only cache + + # limit the search to only use padding when its effective, and at least 32 threads in a block + restrict = ["use_padding==0 or (block_size_x % 32 != 0)", "block_size_x*block_size_y >= 32"] + + # setup input and output dimensions + problem_size = (4096, 4096) + size = numpy.prod(problem_size) + largest_fh = max(tune_params["filter_height"]) + largest_fw = max(tune_params["filter_width"]) + input_size = (problem_size[0] + largest_fw - 1) * (problem_size[1] + largest_fh - 1) + + # create input data + output_image = numpy.zeros(size).astype(numpy.float32) + input_image = numpy.random.randn(input_size).astype(numpy.float32) + filter_weights = numpy.random.randn(largest_fh * largest_fw).astype(numpy.float32) + + # setup kernel arguments + cmem_args = {"d_filter": filter_weights} + args = [output_image, input_image, filter_weights] + + # tell the Kernel Tuner how to compute grid dimensions + grid_div_x = ["block_size_x", "tile_size_x"] + grid_div_y = ["block_size_y", "tile_size_y"] + + # start tuning separable convolution (row) + tune_params["filter_height"] = [1] + tune_params["tile_size_y"] = [1] + results_row = tune_kernel( + "convolution_kernel", + kernel_string, + problem_size, + args, + tune_params, + grid_div_y=grid_div_y, + grid_div_x=grid_div_x, + cmem_args=cmem_args, + verbose=False, + restrictions=restrict, + parallel_runner=1024, + cache="convolution_kernel_row", + ) + + # start tuning separable convolution (col) + tune_params["filter_height"] = tune_params["filter_width"][:] + tune_params["file_size_y"] = tune_params["tile_size_x"][:] + tune_params["filter_width"] = [1] + tune_params["tile_size_x"] = [1] + results_col = tune_kernel( + "convolution_kernel", + kernel_string, + problem_size, + args, + tune_params, + grid_div_y=grid_div_y, + grid_div_x=grid_div_x, + cmem_args=cmem_args, + verbose=False, + restrictions=restrict, + parallel_runner=1024, + cache="convolution_kernel_col", + ) + + return results_row, results_col + + +if __name__ == "__main__": + results_row, results_col = tune() diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index f454d0686..e689096f9 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -135,7 +135,10 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob 
self.state.last_strategy_start_time = self.state.start_time self.state.last_strategy_time = 0 self.state.kernel_options = kernel_options - # define a dummy device interface + # fields used directly by strategies + self.last_strategy_time = perf_counter() + self.state.last_strategy_start_time = self.last_strategy_time + # define a dummy device interface on the master node self.dev = DeviceInterface(kernel_source) def get_environment(self, tuning_options):
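
Taken together, the series has the master node build a dummy DeviceInterface for get_environment, ship the shared state, parameter space and tuning options to the Ray object store once, launch one task per requested runner (each task claims one CPU and one GPU), and flatten the per-task result lists into a single list of records. A condensed sketch of that dispatch loop, assuming Ray has been initialised and parallel_run is the remote function from this series (error handling omitted):

    from itertools import chain
    from ray import get, put

    def dispatch(parallel_run, state, parameter_space, tuning_options):
        # ship shared data to the Ray object store once instead of per task
        state_ref = put(state)
        parameter_space_ref = put(parameter_space)
        tuning_options_ref = put(tuning_options)

        # one task per requested runner; each benchmarks its own slice of the space
        tasks = [
            parallel_run.remote(task_id, state_ref, parameter_space_ref, tuning_options_ref)
            for task_id in range(tuning_options.parallel_runner)
        ]

        # gather the per-task result lists and flatten them into one list
        return list(chain.from_iterable(get(task) for task in tasks))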