diff --git a/doc/source/optimization.rst b/doc/source/optimization.rst
index 59219ad51..2b8dd8987 100644
--- a/doc/source/optimization.rst
+++ b/doc/source/optimization.rst
@@ -25,6 +25,7 @@ the ``strategy=`` optional argument of ``tune_kernel()``. Kernel Tuner currently
  * "pso" particle swarm optimization
  * "random_sample" takes a random sample of the search space
  * "simulated_annealing" simulated annealing strategy
+ * "ensemble" ensemble strategy
 
 Most strategies have some mechanism built in to detect when to stop tuning, which may be controlled through specific 
 parameters that can be passed to the strategies using the ``strategy_options=`` optional argument of ``tune_kernel()``. You 
diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py
index 491541909..84c346ea1 100644
--- a/kernel_tuner/accuracy.py
+++ b/kernel_tuner/accuracy.py
@@ -46,9 +46,7 @@ def select_for_configuration(self, params):
 
         if option not in self.data:
             list = ", ".join(map(str, self.data.keys()))
-            raise KeyError(
-                f"'{option}' is not a valid parameter value, should be one of: {list}"
-            )
+            raise KeyError(f"'{option}' is not a valid parameter value, should be one of: {list}")
 
         return self.data[option]
 
@@ -60,12 +58,14 @@ def _find_bfloat16_if_available():
     # Try to get bfloat16 if available.
     try:
         from bfloat16 import bfloat16
+
         return bfloat16
     except ImportError:
         pass
 
     try:
         from tensorflow import bfloat16
+
         return bfloat16.as_numpy_dtype
     except ImportError:
         pass
@@ -102,9 +102,7 @@ def _to_float_dtype(x: str) -> np.dtype:
 
 
 class TunablePrecision(Tunable):
-    def __init__(
-        self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None
-    ):
+    def __init__(self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None):
         """The ``Tunable`` object can be used as an input argument when tuning
         kernels. It is a container that internally holds several arrays
         containing the same data, but stored in using different levels of
@@ -135,7 +133,6 @@ def __init__(
             if bfloat16 is not None:
                 dtypes["bfloat16"] = bfloat16
 
-
         # If dtype is a list, convert it to a dictionary
         if isinstance(dtypes, (list, tuple)):
             dtypes = dict((name, _to_float_dtype(name)) for name in dtypes)
@@ -257,9 +254,7 @@ def metric(a, b):
         raise ValueError(f"invalid error metric provided: {user_key}")
 
     # cast both arguments to f64 before passing them to the metric
-    return lambda a, b: metric(
-        a.astype(np.float64, copy=False), b.astype(np.float64, copy=False)
-    )
+    return lambda a, b: metric(a.astype(np.float64, copy=False), b.astype(np.float64, copy=False))
 
 
 class AccuracyObserver(OutputObserver):
diff --git a/kernel_tuner/backends/compiler.py b/kernel_tuner/backends/compiler.py
index 730710489..b5aaf749a 100644
--- a/kernel_tuner/backends/compiler.py
+++ b/kernel_tuner/backends/compiler.py
@@ -34,7 +34,7 @@
 try:
     from hip._util.types import DeviceArray
 except ImportError:
-    Pointer = Exception # using Exception here as a type that will never be among kernel arguments
+    Pointer = Exception  # using Exception here as a type that will never be among kernel arguments
     DeviceArray = Exception
 
 
@@ -157,7 +157,9 @@ def ready_argument_list(self, arguments):
 
         for i, arg in enumerate(arguments):
             if not (isinstance(arg, (np.ndarray, np.number, DeviceArray)) or is_cupy_array(arg)):
-                raise TypeError(f"Argument is not numpy or cupy ndarray or numpy scalar or HIP Python DeviceArray but a {type(arg)}")
+                raise TypeError(
+                    f"Argument is not numpy or cupy ndarray or numpy scalar or HIP Python DeviceArray but a {type(arg)}"
+                )
             dtype_str = arg.typestr if isinstance(arg, DeviceArray) else str(arg.dtype)
             if isinstance(arg, np.ndarray):
                 if dtype_str in dtype_map.keys():
@@ -288,7 +290,7 @@ def compile(self, kernel_instance):
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
                 text=True,
-                check=True
+                check=True,
             )
 
             subprocess.run(
@@ -299,7 +301,7 @@ def compile(self, kernel_instance):
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
                 text=True,
-                check=True
+                check=True,
             )
 
             self.lib = np.ctypeslib.load_library(filename, ".")
@@ -439,7 +441,7 @@ def cleanup_lib(self):
         """unload the previously loaded shared library"""
         if self.lib is None:
             return
-        
+
         if not self.using_openmp and not self.using_openacc:
             # this if statement is necessary because shared libraries that use
             # OpenMP will core dump when unloaded, this is a well-known issue with OpenMP
diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py
index 914f211a7..e6fbdefcd 100644
--- a/kernel_tuner/backends/cupy.py
+++ b/kernel_tuner/backends/cupy.py
@@ -70,9 +70,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         # collect environment information
         env = dict()
         cupy_info = str(cp._cupyx.get_runtime_info()).split("\n")[:-1]
-        info_dict = {
-            s.split(":")[0].strip(): s.split(":")[1].strip() for s in cupy_info
-        }
+        info_dict = {s.split(":")[0].strip(): s.split(":")[1].strip() for s in cupy_info}
         env["device_name"] = info_dict[f"Device {device} Name"]
 
         env["cuda_version"] = cp.cuda.runtime.driverGetVersion()
@@ -129,9 +127,7 @@ def compile(self, kernel_instance):
 
         options = tuple(compiler_options)
 
-        self.current_module = cp.RawModule(
-            code=kernel_string, options=options, name_expressions=[kernel_name]
-        )
+        self.current_module = cp.RawModule(code=kernel_string, options=options, name_expressions=[kernel_name])
 
         self.func = self.current_module.get_function(kernel_name)
         self.num_regs = self.func.num_regs
diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py
index 15259cb23..1eaad1d45 100644
--- a/kernel_tuner/backends/nvcuda.py
+++ b/kernel_tuner/backends/nvcuda.py
@@ -56,13 +56,9 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
             CudaFunctions.last_selected_device = device
 
         # compute capabilities and device properties
-        err, major = cudart.cudaDeviceGetAttribute(
-            cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device
-        )
+        err, major = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device)
         cuda_error_check(err)
-        err, minor = cudart.cudaDeviceGetAttribute(
-            cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device
-        )
+        err, minor = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device)
         cuda_error_check(err)
         err, self.max_threads = cudart.cudaDeviceGetAttribute(
             cudart.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device
@@ -164,20 +160,14 @@ def compile(self, kernel_instance):
         if not any(["--std=" in opt for opt in self.compiler_options]):
             self.compiler_options.append("--std=c++11")
         if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]):
-            compiler_options.append(
-                f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8")
-            )
+            compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8"))
         if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]):
             self.compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}")
 
-        err, program = nvrtc.nvrtcCreateProgram(
-            str.encode(kernel_string), b"CUDAProgram", 0, [], []
-        )
+        err, program = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"CUDAProgram", 0, [], [])
         try:
             cuda_error_check(err)
-            err = nvrtc.nvrtcCompileProgram(
-                program, len(compiler_options), compiler_options
-            )
+            err = nvrtc.nvrtcCompileProgram(program, len(compiler_options), compiler_options)
             cuda_error_check(err)
             err, size = nvrtc.nvrtcGetPTXSize(program)
             cuda_error_check(err)
@@ -189,9 +179,7 @@ def compile(self, kernel_instance):
                 raise SkippableFailure("uses too much shared data")
             else:
                 cuda_error_check(err)
-            err, self.func = cuda.cuModuleGetFunction(
-                self.current_module, str.encode(kernel_name)
-            )
+            err, self.func = cuda.cuModuleGetFunction(self.current_module, str.encode(kernel_name))
             cuda_error_check(err)
 
             # get the number of registers per thread used in this kernel
diff --git a/kernel_tuner/backends/opencl.py b/kernel_tuner/backends/opencl.py
index af3be1c00..feb7cf938 100644
--- a/kernel_tuner/backends/opencl.py
+++ b/kernel_tuner/backends/opencl.py
@@ -16,9 +16,7 @@
 class OpenCLFunctions(GPUBackend):
     """Class that groups the OpenCL functions on maintains some state about the device."""
 
-    def __init__(
-        self, device=0, platform=0, iterations=7, compiler_options=None, observers=None
-    ):
+    def __init__(self, device=0, platform=0, iterations=7, compiler_options=None, observers=None):
         """Creates OpenCL device context and reads device properties.
 
         :param device: The ID of the OpenCL device to use for benchmarking
@@ -37,14 +35,10 @@ def __init__(
         platforms = cl.get_platforms()
         self.ctx = cl.Context(devices=[platforms[platform].get_devices()[device]])
 
-        self.queue = cl.CommandQueue(
-            self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE
-        )
+        self.queue = cl.CommandQueue(self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
         self.mf = cl.mem_flags
         # inspect device properties
-        self.max_threads = self.ctx.devices[0].get_info(
-            cl.device_info.MAX_WORK_GROUP_SIZE
-        )
+        self.max_threads = self.ctx.devices[0].get_info(cl.device_info.MAX_WORK_GROUP_SIZE)
         self.compiler_options = compiler_options or []
 
         # observer stuff
@@ -108,9 +102,7 @@ def compile(self, kernel_instance):
         :returns: An OpenCL kernel that can be called directly.
         :rtype: pyopencl.Kernel
         """
-        prg = cl.Program(self.ctx, kernel_instance.kernel_string).build(
-            options=self.compiler_options
-        )
+        prg = cl.Program(self.ctx, kernel_instance.kernel_string).build(options=self.compiler_options)
         func = getattr(prg, kernel_instance.name)
         return func
 
diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py
index 7fddc9393..57e7c07da 100644
--- a/kernel_tuner/backends/pycuda.py
+++ b/kernel_tuner/backends/pycuda.py
@@ -97,13 +97,9 @@ def _finish_up():
             PyCudaFunctions.last_selected_context = self.context
 
         # inspect device properties
-        devprops = {
-            str(k): v for (k, v) in self.context.get_device().get_attributes().items()
-        }
+        devprops = {str(k): v for (k, v) in self.context.get_device().get_attributes().items()}
         self.max_threads = devprops["MAX_THREADS_PER_BLOCK"]
-        cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str(
-            devprops.get("COMPUTE_CAPABILITY_MINOR", "0")
-        )
+        cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str(devprops.get("COMPUTE_CAPABILITY_MINOR", "0"))
         if cc == "00":
             cc = self.context.get_device().compute_capability()
         self.cc = str(cc[0]) + str(cc[1])
@@ -347,14 +343,7 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
         """
         if stream is None:
             stream = self.stream
-        func(
-            *gpu_args,
-            block=threads,
-            grid=grid,
-            stream=stream,
-            shared=self.smem_size,
-            texrefs=self.texrefs
-        )
+        func(*gpu_args, block=threads, grid=grid, stream=stream, shared=self.smem_size, texrefs=self.texrefs)
 
     def memset(self, allocation, value, size):
         """Set the memory in allocation to the value in value.
diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
index 655779337..e4579b60e 100644
--- a/kernel_tuner/core.py
+++ b/kernel_tuner/core.py
@@ -32,7 +32,7 @@
 try:
     from hip._util.types import DeviceArray
 except ImportError:
-    DeviceArray = Exception # using Exception here as a type that will never be among kernel arguments
+    DeviceArray = Exception  # using Exception here as a type that will never be among kernel arguments
 
 _KernelInstance = namedtuple(
     "_KernelInstance",
@@ -84,9 +84,7 @@ def __init__(self, kernel_name, kernel_sources, lang, defines=None):
         self.defines = defines
         if lang is None:
             if callable(self.kernel_sources[0]):
-                raise TypeError(
-                    "Please specify language when using a code generator function"
-                )
+                raise TypeError("Please specify language when using a code generator function")
             kernel_string = self.get_kernel_string(0)
             lang = util.detect_language(kernel_string)
 
@@ -113,9 +111,7 @@ def get_kernel_string(self, index=0, params=None):
         kernel_source = self.kernel_sources[index]
         return util.get_kernel_string(kernel_source, params)
 
-    def prepare_list_of_files(
-        self, kernel_name, params, grid, threads, block_size_names
-    ):
+    def prepare_list_of_files(self, kernel_name, params, grid, threads, block_size_names):
         """prepare the kernel string along with any additional files
 
         The first file in the list is allowed to include or read in the others
@@ -151,9 +147,7 @@ def prepare_list_of_files(
 
         for i, f in enumerate(self.kernel_sources):
             if i > 0 and not util.looks_like_a_filename(f):
-                raise ValueError(
-                    "When passing multiple kernel sources, the secondary entries must be filenames"
-                )
+                raise ValueError("When passing multiple kernel sources, the secondary entries must be filenames")
 
             ks = self.get_kernel_string(i, params)
             # add preprocessor statements
@@ -187,9 +181,7 @@ def prepare_list_of_files(
 
     def get_user_suffix(self, index=0):
         """Get the suffix of the kernel filename, if the user specified one. Return None otherwise."""
-        if util.looks_like_a_filename(self.kernel_sources[index]) and (
-            "." in self.kernel_sources[index]
-        ):
+        if util.looks_like_a_filename(self.kernel_sources[index]) and ("." in self.kernel_sources[index]):
             return "." + self.kernel_sources[index].split(".")[-1]
         return None
 
@@ -218,13 +210,9 @@ def check_argument_lists(self, kernel_name, arguments):
         """
         for i, f in enumerate(self.kernel_sources):
             if not callable(f):
-                util.check_argument_list(
-                    kernel_name, self.get_kernel_string(i), arguments
-                )
+                util.check_argument_list(kernel_name, self.get_kernel_string(i), arguments)
             else:
-                logging.debug(
-                    "Checking of arguments list not supported yet for code generators."
-                )
+                logging.debug("Checking of arguments list not supported yet for code generators.")
 
 
 class DeviceInterface(object):
@@ -317,7 +305,9 @@ def __init__(
                 observers=observers,
             )
         else:
-            raise ValueError("Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet")
+            raise ValueError(
+                "Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet"
+            )
         self.dev = dev
 
         # look for NVMLObserver and TegraObserver in observers, if present, enable special tunable parameters through nvml/tegra
@@ -390,7 +380,6 @@ def benchmark_default(self, func, gpu_args, threads, grid, result):
         for obs in self.benchmark_observers:
             result.update(obs.get_results())
 
-
     def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
         """Benchmark continuously for at least 'duration' seconds"""
         iterations = int(np.ceil(duration / (result["time"] / 1000)))
@@ -414,7 +403,6 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
         for obs in self.continuous_observers:
             result.update(obs.get_results())
 
-
     def set_nvml_parameters(self, instance):
         """Set the NVML parameters. Avoids setting time leaking into benchmark time."""
         if self.use_nvml:
@@ -433,7 +421,6 @@ def set_nvml_parameters(self, instance):
             if "tegra_gr_clock" in instance.params:
                 self.tegra.gr_clock = instance.params["tegra_gr_clock"]
 
-
     def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_setting=False):
         """Benchmark the kernel instance."""
         logging.debug("benchmark " + instance.name)
@@ -458,9 +445,7 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett
                     obs.results = result
                     duration = max(duration, obs.continuous_duration)
 
-                self.benchmark_continuous(
-                    func, gpu_args, instance.threads, instance.grid, result, duration
-                )
+                self.benchmark_continuous(func, gpu_args, instance.threads, instance.grid, result, duration)
 
         except Exception as e:
             # some launches may fail because too many registers are required
@@ -473,9 +458,7 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett
                 "INVALID_WORK_GROUP_SIZE",
             ]
             if any([skip_str in str(e) for skip_str in skippable_exceptions]):
-                logging.debug(
-                    "benchmark fails due to runtime failure too many resources required"
-                )
+                logging.debug("benchmark fails due to runtime failure too many resources required")
                 if verbose:
                     print(
                         f"skipping config {util.get_instance_string(instance.params)} reason: too many resources requested for launch"
@@ -487,20 +470,20 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett
                 raise e
         return result
 
-    def check_kernel_output(
-        self, func, gpu_args, instance, answer, atol, verify, verbose
-    ):
+    def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, verbose):
         """runs the kernel once and checks the result against answer"""
         logging.debug("check_kernel_output")
 
-        #if not using custom verify function, check if the length is the same
+        # if not using custom verify function, check if the length is the same
         if answer:
             if len(instance.arguments) != len(answer):
                 raise TypeError("The length of argument list and provided results do not match.")
 
             should_sync = [answer[i] is not None for i, arg in enumerate(instance.arguments)]
         else:
-            should_sync = [isinstance(arg, (np.ndarray, cp.ndarray, torch.Tensor, DeviceArray)) for arg in instance.arguments]
+            should_sync = [
+                isinstance(arg, (np.ndarray, cp.ndarray, torch.Tensor, DeviceArray)) for arg in instance.arguments
+            ]
 
         # re-copy original contents of output arguments to GPU memory, to overwrite any changes
         # by earlier kernel runs
@@ -522,7 +505,7 @@ def check_kernel_output(
                     self.dev.memcpy_dtoh(result_host[-1], gpu_args[i])
                 elif isinstance(arg, torch.Tensor) and isinstance(answer[i], torch.Tensor):
                     if not answer[i].is_cuda:
-                        #if the answer is on the host, copy gpu output to host as well
+                        # if the answer is on the host, copy gpu output to host as well
                         result_host.append(torch.zeros_like(answer[i]))
                         self.dev.memcpy_dtoh(result_host[-1], gpu_args[i].tensor)
                     else:
@@ -550,10 +533,7 @@ def check_kernel_output(
             correct = True
 
         if not correct:
-            raise RuntimeError(
-                "Kernel result verification failed for: "
-                + util.get_config_string(instance.params)
-            )
+            raise RuntimeError("Kernel result verification failed for: " + util.get_config_string(instance.params))
 
     def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, to):
         # reset previous timers
@@ -567,7 +547,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options,
         # Compile and benchmark a kernel instance based on kernel strings and parameters
         instance_string = util.get_instance_string(params)
 
-        logging.debug('compile_and_benchmark ' + instance_string)
+        logging.debug("compile_and_benchmark " + instance_string)
 
         instance = self.create_kernel_instance(kernel_source, kernel_options, params, verbose)
         if isinstance(instance, util.ErrorConfig):
@@ -585,9 +565,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options,
                 else:
                     # add shared memory arguments to compiled module
                     if kernel_options.smem_args is not None:
-                        self.dev.copy_shared_memory_args(
-                            util.get_smem_args(kernel_options.smem_args, params)
-                        )
+                        self.dev.copy_shared_memory_args(util.get_smem_args(kernel_options.smem_args, params))
                     # add constant memory arguments to compiled module
                     if kernel_options.cmem_args is not None:
                         self.dev.copy_constant_memory_args(kernel_options.cmem_args)
@@ -601,12 +579,8 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options,
                 # test kernel for correctness
                 if func and (to.answer or to.verify or self.output_observers):
                     start_verification = time.perf_counter()
-                    self.check_kernel_output(
-                        func, gpu_args, instance, to.answer, to.atol, to.verify, verbose
-                    )
-                    last_verification_time = 1000 * (
-                        time.perf_counter() - start_verification
-                    )
+                    self.check_kernel_output(func, gpu_args, instance, to.answer, to.atol, to.verify, verbose)
+                    last_verification_time = 1000 * (time.perf_counter() - start_verification)
 
                 # benchmark
                 if func:
@@ -622,10 +596,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options,
             except Exception as e:
                 # dump kernel sources to temp file
                 temp_filenames = instance.prepare_temp_files_for_error_msg()
-                print(
-                    "Error while compiling or benchmarking, see source files: "
-                    + " ".join(temp_filenames)
-                )
+                print("Error while compiling or benchmarking, see source files: " + " ".join(temp_filenames))
                 raise e
 
             # clean up any temporary files, if no error occured
@@ -656,9 +627,7 @@ def compile_kernel(self, instance, verbose):
             ]
             error_message = str(e.stderr) if hasattr(e, "stderr") else str(e)
             if any(re.search(msg, error_message) for msg in shared_mem_error_messages):
-                logging.debug(
-                    "compile_kernel failed due to kernel using too much shared memory"
-                )
+                logging.debug("compile_kernel failed due to kernel using too much shared memory")
                 if verbose:
                     print(
                         f"skipping config {util.get_instance_string(instance.params)} reason: too much shared memory used"
@@ -671,7 +640,7 @@ def compile_kernel(self, instance, verbose):
 
     @staticmethod
     def preprocess_gpu_arguments(old_arguments, params):
-        """ Get a flat list of arguments based on the configuration given by `params` """
+        """Get a flat list of arguments based on the configuration given by `params`"""
         return _preprocess_gpu_arguments(old_arguments, params)
 
     def copy_shared_memory_args(self, smem_args):
@@ -707,9 +676,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose)
         )
         if np.prod(threads) > self.dev.max_threads:
             if verbose:
-                print(
-                    f"skipping config {util.get_instance_string(params)} reason: too many threads per block"
-                )
+                print(f"skipping config {util.get_instance_string(params)} reason: too many threads per block")
             return util.InvalidConfig()
 
         # obtain the kernel_string and prepare additional files, if any
@@ -728,7 +695,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose)
         # Preprocess GPU arguments. Require for handling `Tunable` arguments
         arguments = _preprocess_gpu_arguments(kernel_options.arguments, params)
 
-        #collect everything we know about this instance and return it
+        # collect everything we know about this instance and return it
         return KernelInstance(name, kernel_source, kernel_string, temp_files, threads, grid, params, arguments)
 
     def get_environment(self):
@@ -775,12 +742,8 @@ def run_kernel(self, func, gpu_args, instance):
         try:
             self.dev.run_kernel(func, gpu_args, instance.threads, instance.grid)
         except Exception as e:
-            if "too many resources requested for launch" in str(
-                e
-            ) or "OUT_OF_RESOURCES" in str(e):
-                logging.debug(
-                    "ignoring runtime failure due to too many resources required"
-                )
+            if "too many resources requested for launch" in str(e) or "OUT_OF_RESOURCES" in str(e):
+                logging.debug("ignoring runtime failure due to too many resources required")
                 return False
             else:
                 logging.debug("encountered unexpected runtime failure: " + str(e))
@@ -789,7 +752,7 @@ def run_kernel(self, func, gpu_args, instance):
 
 
 def _preprocess_gpu_arguments(old_arguments, params):
-    """ Get a flat list of arguments based on the configuration given by `params` """
+    """Get a flat list of arguments based on the configuration given by `params`"""
     new_arguments = []
 
     for argument in old_arguments:
@@ -806,15 +769,11 @@ def _default_verify_function(instance, answer, result_host, atol, verbose):
 
     # first check if the length is the same
     if len(instance.arguments) != len(answer):
-        raise TypeError(
-            "The length of argument list and provided results do not match."
-        )
+        raise TypeError("The length of argument list and provided results do not match.")
     # for each element in the argument list, check if the types match
     for i, arg in enumerate(instance.arguments):
         if answer[i] is not None:  # skip None elements in the answer list
-            if isinstance(answer[i], (np.ndarray, cp.ndarray)) and isinstance(
-                arg, (np.ndarray, cp.ndarray)
-            ):
+            if isinstance(answer[i], (np.ndarray, cp.ndarray)) and isinstance(arg, (np.ndarray, cp.ndarray)):
                 if answer[i].dtype != arg.dtype:
                     raise TypeError(
                         f"Element {i} of the expected results list is not of the same dtype as the kernel output: "
@@ -862,16 +821,14 @@ def _default_verify_function(instance, answer, result_host, atol, verbose):
                     )
             else:
                 # either answer[i] and argument have different types or answer[i] is not a numpy type
-                if not isinstance(
-                    answer[i], (np.ndarray, cp.ndarray, torch.Tensor)
-                ) or not isinstance(answer[i], np.number):
+                if not isinstance(answer[i], (np.ndarray, cp.ndarray, torch.Tensor)) or not isinstance(
+                    answer[i], np.number
+                ):
                     raise TypeError(
                         f"Element {i} of expected results list is not a numpy/cupy ndarray, torch Tensor or numpy scalar."
                     )
                 else:
-                    raise TypeError(
-                        f"Element {i} of expected results list and kernel arguments have different types."
-                    )
+                    raise TypeError(f"Element {i} of expected results list and kernel arguments have different types.")
 
     def _ravel(a):
         if hasattr(a, "ravel") and len(a.shape) > 1:
@@ -891,26 +848,15 @@ def _flatten(a):
             expected = _flatten(expected)
             if any([isinstance(array, cp.ndarray) for array in [expected, result]]):
                 output_test = cp.allclose(expected, result, atol=atol)
-            elif isinstance(expected, torch.Tensor) and isinstance(
-                result, torch.Tensor
-            ):
+            elif isinstance(expected, torch.Tensor) and isinstance(result, torch.Tensor):
                 output_test = torch.allclose(expected, result, atol=atol)
             else:
                 output_test = np.allclose(expected, result, atol=atol)
 
             if not output_test and verbose:
-                print(
-                    "Error: "
-                    + util.get_config_string(instance.params)
-                    + " detected during correctness check"
-                )
-                print(
-                    "this error occured when checking value of the %oth kernel argument"
-                    % (i,)
-                )
-                print(
-                    "Printing kernel output and expected result, set verbose=False to suppress this debug print"
-                )
+                print("Error: " + util.get_config_string(instance.params) + " detected during correctness check")
+                print("this error occured when checking value of the %oth kernel argument" % (i,))
+                print("Printing kernel output and expected result, set verbose=False to suppress this debug print")
                 np.set_printoptions(edgeitems=50)
                 print("Kernel output:")
                 print(result)
@@ -945,11 +891,7 @@ def apply_template_typenames(type_list, templated_typenames):
     def replace_typename_token(matchobj):
         """function for a whitespace preserving token regex replace"""
         # replace only the match, leaving the whitespace around it as is
-        return (
-            matchobj.group(1)
-            + templated_typenames[matchobj.group(2)]
-            + matchobj.group(3)
-        )
+        return matchobj.group(1) + templated_typenames[matchobj.group(2)] + matchobj.group(3)
 
     for i, arg_type in enumerate(type_list):
         for k, v in templated_typenames.items():
@@ -980,9 +922,7 @@ def wrap_templated_kernel(kernel_string, kernel_name):
     # relatively strict regex that does not allow nested template parameters like vector<TF>
     # within the template parameter list
     regex = (
-        r"template\s*<([^>]*?)>\s*__global__\s+void\s+(__launch_bounds__\([^\)]+?\)\s+)?"
-        + name
-        + r"\s*\((.*?)\)\s*\{"
+        r"template\s*<([^>]*?)>\s*__global__\s+void\s+(__launch_bounds__\([^\)]+?\)\s+)?" + name + r"\s*\((.*?)\)\s*\{"
     )
     match = re.search(regex, kernel_string, re.S)
     if not match:
@@ -990,15 +930,12 @@ def wrap_templated_kernel(kernel_string, kernel_name):
 
     template_parameters = match.group(1).split(",")
     argument_list = match.group(3).split(",")
-    argument_list = [
-        s.strip() for s in argument_list
-    ]  # remove extra whitespace around 'type name' strings
+    # remove extra whitespace around 'type name' strings
+    argument_list = [s.strip() for s in argument_list]
 
     type_list, name_list = split_argument_list(argument_list)
 
-    templated_typenames = get_templated_typenames(
-        template_parameters, template_arguments
-    )
+    templated_typenames = get_templated_typenames(template_parameters, template_arguments)
     apply_template_typenames(type_list, templated_typenames)
 
     # replace __global__ with __device__ in the templated kernel definition
@@ -1012,9 +949,7 @@ def wrap_templated_kernel(kernel_string, kernel_name):
         launch_bounds = match.group(2)
 
     # generate code for the compile-time template instantiation
-    template_instantiation = (
-        f"template __device__ void {kernel_name}(" + ", ".join(type_list) + ");\n"
-    )
+    template_instantiation = f"template __device__ void {kernel_name}(" + ", ".join(type_list) + ");\n"
 
     # generate code for the wrapper kernel
     new_arg_list = ", ".join([" ".join((a, b)) for a, b in zip(type_list, name_list)])
diff --git a/kernel_tuner/energy/energy.py b/kernel_tuner/energy/energy.py
index ab0582c52..40bcbe080 100644
--- a/kernel_tuner/energy/energy.py
+++ b/kernel_tuner/energy/energy.py
@@ -37,7 +37,10 @@
 }
 """
 
-def get_frequency_power_relation_fp32(device, n_samples=10, nvidia_smi_fallback=None, use_locked_clocks=False, cache=None, simulation_mode=None):
+
+def get_frequency_power_relation_fp32(
+    device, n_samples=10, nvidia_smi_fallback=None, use_locked_clocks=False, cache=None, simulation_mode=None
+):
     """Use NVML and PyCUDA with a synthetic kernel to obtain samples of frequency-power pairs."""
     # get some numbers about the device
     if not cache:
@@ -46,7 +49,7 @@ def get_frequency_power_relation_fp32(device, n_samples=10, nvidia_smi_fallback=
 
         drv.init()
         dev = drv.Device(device)
-        device_name = dev.name().replace(' ', '_')
+        device_name = dev.name().replace(" ", "_")
         multiprocessor_count = dev.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT)
         max_block_dim_x = dev.get_attribute(drv.device_attribute.MAX_BLOCK_DIM_X)
 
@@ -76,12 +79,28 @@ def get_frequency_power_relation_fp32(device, n_samples=10, nvidia_smi_fallback=
     metrics["f"] = lambda p: p["core_freq"]
 
     nvmlobserver = NVMLObserver(
-        ["core_freq", "nvml_power"], device=device, nvidia_smi_fallback=nvidia_smi_fallback, use_locked_clocks=use_locked_clocks)
-
-    results, _ = tune_kernel("fp32_kernel", fp32_kernel_string, problem_size=(multiprocessor_count, 64),
-                             arguments=arguments, tune_params=tune_params, observers=[nvmlobserver],
-                             verbose=False, quiet=True, metrics=metrics, iterations=10, simulation_mode=simulation_mode,
-                             grid_div_x=[], grid_div_y=[], cache=cache or f"synthetic_fp32_cache_{device_name}.json")
+        ["core_freq", "nvml_power"],
+        device=device,
+        nvidia_smi_fallback=nvidia_smi_fallback,
+        use_locked_clocks=use_locked_clocks,
+    )
+
+    results, _ = tune_kernel(
+        "fp32_kernel",
+        fp32_kernel_string,
+        problem_size=(multiprocessor_count, 64),
+        arguments=arguments,
+        tune_params=tune_params,
+        observers=[nvmlobserver],
+        verbose=False,
+        quiet=True,
+        metrics=metrics,
+        iterations=10,
+        simulation_mode=simulation_mode,
+        grid_div_x=[],
+        grid_div_y=[],
+        cache=cache or f"synthetic_fp32_cache_{device_name}.json",
+    )
 
     freqs = np.array([res["core_freq"] for res in results])
     nvml_power = np.array([res["nvml_power"] for res in results])
@@ -91,7 +110,7 @@ def get_frequency_power_relation_fp32(device, n_samples=10, nvidia_smi_fallback=
 
 def estimated_voltage(clocks, clock_threshold, voltage_scale):
     """Estimate voltage based on clock_threshold and voltage_scale."""
-    return [1 + ((clock > clock_threshold) * (1e-3 * voltage_scale * (clock-clock_threshold))) for clock in clocks]
+    return [1 + ((clock > clock_threshold) * (1e-3 * voltage_scale * (clock - clock_threshold))) for clock in clocks]
 
 
 def estimated_power(clocks, clock_threshold, voltage_scale, clock_scale, power_max):
@@ -131,18 +150,24 @@ def fit_power_frequency_model(freqs, nvml_power):
 
     # fit the model
     p0 = (clock_threshold, voltage_scale, clock_scale, power_max)
-    bounds = ([clock_min, 0, 0, 0.9*power_max],
-              [clock_max, 1, 1, 1.1*power_max])
+    bounds = ([clock_min, 0, 0, 0.9 * power_max], [clock_max, 1, 1, 1.1 * power_max])
     res = optimize.curve_fit(estimated_power, x, y, p0=p0, bounds=bounds)
-    clock_threshold, voltage_scale, clock_scale, power_max = np.round(
-        res[0], 2)
+    clock_threshold, voltage_scale, clock_scale, power_max = np.round(res[0], 2)
 
     fit_parameters = (clock_threshold, voltage_scale, clock_scale, power_max)
     scale_parameters = (clock_min, min(nvml_power))
     return clock_threshold + clock_min, fit_parameters, scale_parameters
 
 
-def create_power_frequency_model(device=0, n_samples=10, verbose=False, nvidia_smi_fallback=None, use_locked_clocks=False, cache=None, simulation_mode=None):
+def create_power_frequency_model(
+    device=0,
+    n_samples=10,
+    verbose=False,
+    nvidia_smi_fallback=None,
+    use_locked_clocks=False,
+    cache=None,
+    simulation_mode=None,
+):
     """Calculate the most energy-efficient clock frequency of device.
 
     This function uses a performance model to fit the power-frequency curve
@@ -176,7 +201,9 @@ def create_power_frequency_model(device=0, n_samples=10, verbose=False, nvidia_s
     :rtype: float
 
     """
-    freqs, nvml_power = get_frequency_power_relation_fp32(device, n_samples, nvidia_smi_fallback, use_locked_clocks, cache=cache, simulation_mode=simulation_mode)
+    freqs, nvml_power = get_frequency_power_relation_fp32(
+        device, n_samples, nvidia_smi_fallback, use_locked_clocks, cache=cache, simulation_mode=simulation_mode
+    )
 
     if verbose:
         print("Clock frequencies:", freqs.tolist())
@@ -187,7 +214,7 @@ def create_power_frequency_model(device=0, n_samples=10, verbose=False, nvidia_s
     if verbose:
         print(f"Modelled most energy efficient frequency: {ridge_frequency} MHz")
 
-    all_frequencies = np.array(get_nvml_gr_clocks(device, quiet=True)['nvml_gr_clock'])
+    all_frequencies = np.array(get_nvml_gr_clocks(device, quiet=True)["nvml_gr_clock"])
     ridge_frequency_final = all_frequencies[np.argmin(abs(all_frequencies - ridge_frequency))]
 
     if verbose:
@@ -200,8 +227,12 @@ def get_frequency_range_around_ridge(ridge_frequency, all_frequencies, freq_rang
     """Return number_of_freqs frequencies in a freq_range percentage around the ridge_frequency from among all_frequencies."""
     min_freq = 1e-2 * (100 - int(freq_range)) * ridge_frequency
     max_freq = 1e-2 * (100 + int(freq_range)) * ridge_frequency
-    frequency_selection = np.unique([all_frequencies[np.argmin(abs(
-        all_frequencies - f))] for f in np.linspace(min_freq, max_freq, int(number_of_freqs))]).tolist()
+    frequency_selection = np.unique(
+        [
+            all_frequencies[np.argmin(abs(all_frequencies - f))]
+            for f in np.linspace(min_freq, max_freq, int(number_of_freqs))
+        ]
+    ).tolist()
 
     if verbose:
         print(f"Suggested range of frequencies to auto-tune: {frequency_selection} MHz")
diff --git a/kernel_tuner/hyper.py b/kernel_tuner/hyper.py
index f002882f3..b661609fd 100644
--- a/kernel_tuner/hyper.py
+++ b/kernel_tuner/hyper.py
@@ -9,7 +9,7 @@
 
 
 def tune_hyper_params(target_strategy, hyper_params, *args, **kwargs):
-    """ Tune hyperparameters for a given strategy and kernel
+    """Tune hyperparameters for a given strategy and kernel
 
     This function is to be called just like tune_kernel, except that you specify a strategy
     and a dictionary with hyperparameters in front of the arguments you pass to tune_kernel.
@@ -41,17 +41,17 @@ def put_if_not_present(target_dict, key, value):
     put_if_not_present(kwargs, "verbose", False)
     put_if_not_present(kwargs, "quiet", True)
     put_if_not_present(kwargs, "simulation_mode", True)
-    kwargs['strategy'] = 'brute_force'
+    kwargs["strategy"] = "brute_force"
 
-    #last position argument is tune_params
+    # last position argument is tune_params
     tune_params = args[-1]
 
-    #find optimum
+    # find optimum
     kwargs["strategy"] = "brute_force"
     results, _ = kernel_tuner.tune_kernel(*args, **kwargs)
     optimum = min(results, key=lambda p: p["time"])["time"]
 
-    #could throw a warning for the kwargs that will be overwritten, strategy(_options)
+    # could throw a warning for the kwargs that will be overwritten, strategy(_options)
     kwargs["strategy"] = target_strategy
 
     parameter_space = itertools.product(*hyper_params.values())
@@ -65,14 +65,13 @@ def put_if_not_present(target_dict, key, value):
         fevals = []
         p_of_opt = []
         for _ in range(100):
-            #measure
+            # measure
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")
                 results, _ = kernel_tuner.tune_kernel(*args, **kwargs)
 
-            #get unique function evaluations
-            unique_fevals = {",".join([str(v) for k, v in record.items() if k in tune_params])
-                             for record in results}
+            # get unique function evaluations
+            unique_fevals = {",".join([str(v) for k, v in record.items() if k in tune_params]) for record in results}
 
             fevals.append(len(unique_fevals))
             p_of_opt.append(min(results, key=lambda p: p["time"])["time"] / optimum * 100)
diff --git a/kernel_tuner/integration.py b/kernel_tuner/integration.py
index d3219ba87..4f92d9582 100644
--- a/kernel_tuner/integration.py
+++ b/kernel_tuner/integration.py
@@ -6,8 +6,8 @@
 
 from kernel_tuner import util
 
-#specifies for a number of pre-defined objectives whether
-#the objective should be minimized or maximized (boolean value denotes higher is better)
+# specifies for a number of pre-defined objectives whether
+# the objective should be minimized or maximized (boolean value denotes higher is better)
 objective_default_map = {
     "time": False,
     "energy": False,
@@ -18,11 +18,12 @@
     "GFLOPS/W": True,
     "TFLOPS/W": True,
     "GFLOP/J": True,
-    "TFLOP/J": True
+    "TFLOP/J": True,
 }
 
+
 def get_objective_defaults(objective, objective_higher_is_better):
-    """ Uses time as default objective and attempts to lookup objective_higher_is_better for known objectives """
+    """Uses time as default objective and attempts to look up objective_higher_is_better for known objectives"""
     objective = objective or "time"
     if objective_higher_is_better is None:
         if objective in objective_default_map:
@@ -31,6 +32,7 @@ def get_objective_defaults(objective, objective_higher_is_better):
             raise ValueError(f"Please specify objective_higher_is_better for objective {objective}")
     return objective, objective_higher_is_better
 
+
 schema_v1_0 = {
     "$schema": "https://json-schema.org/draft-07/schema#",
     "type": "object",
@@ -45,25 +47,20 @@ def get_objective_defaults(objective, objective_higher_is_better):
             "type": "array",
             "items": {
                 "type": "object",
-                "properties": {
-                    "device_name": {"type": "string"},
-                    "problem_size": {"type": "string"}
-                },
-                "required": ["device_name", "problem_size", "tunable_parameters"]
+                "properties": {"device_name": {"type": "string"}, "problem_size": {"type": "string"}},
+                "required": ["device_name", "problem_size", "tunable_parameters"],
             },
         },
     },
-    "required": ["version_number", "tunable_parameters", "kernel_name", "objective", "data"]
+    "required": ["version_number", "tunable_parameters", "kernel_name", "objective", "data"],
 }
 
 
-
-
-class TuneResults():
-    """ Object to represent the tuning results stored to file """
+class TuneResults(object):
+    """Object to represent the tuning results stored to file"""
 
     def __init__(self, results_filename):
-        #open results file
+        # open results file
         if not os.path.isfile(results_filename):
             raise ValueError("Error: results_filename does not exist")
         meta, data = _read_results_file(results_filename)
@@ -75,28 +72,28 @@ def __init__(self, results_filename):
         self.objective_higher_is_better = meta.get("objective_higher_is_better", False)
 
     def get_best_config(self, gpu_name="default", problem_size=None):
-        """ get the best config based on these tuning results
+        """get the best config based on these tuning results
 
-            This function returns the overall best performing kernel configuration
-            based on the tuning results for a given gpu_name and problem_size.
+        This function returns the overall best performing kernel configuration
+        based on the tuning results for a given gpu_name and problem_size.
 
-            If problem_size is not given this function will select a default configuration
-            based on the tuning results for all problem_sizes and the given gpu_name.
+        If problem_size is not given this function will select a default configuration
+        based on the tuning results for all problem_sizes and the given gpu_name.
 
-            If gpu_name is not given this function will select a default configuration
-            based on all tuning results.
+        If gpu_name is not given this function will select a default configuration
+        based on all tuning results.
 
-            :param gpu_name: Name of the GPU for which the best configuration
-                needs to be retrieved.
-            :type gpu_name: string
+        :param gpu_name: Name of the GPU for which the best configuration
+            needs to be retrieved.
+        :type gpu_name: string
 
-            :param problem_size: The problem size for which the best configuration
-                on the given gpu_name needs to be retrieved.
-            :type problem_size: tuple, int, or string
+        :param problem_size: The problem size for which the best configuration
+            on the given gpu_name needs to be retrieved.
+        :type problem_size: tuple, int, or string
 
-            :returns: A dictionary with tunable parameters of the selected kernel
-                kernel configuration.
-            :rtype: dict
+        :returns: A dictionary with tunable parameters of the selected kernel
+            kernel configuration.
+        :rtype: dict
         """
         gpu_name = gpu_name.replace("-", "_").replace(" ", "_")
 
@@ -111,88 +108,102 @@ def get_best_config(self, gpu_name="default", problem_size=None):
         gpu_match = [result for result in self.data if result["device_name"] == gpu_name]
 
         if gpu_match:
-            gpu_ps_match = [result for result in gpu_match if problem_size and result["problem_size"] == problem_size_str]
+            gpu_ps_match = [
+                result for result in gpu_match if problem_size and result["problem_size"] == problem_size_str
+            ]
             if gpu_ps_match:
                 return _get_best_config_from_list(gpu_ps_match, self.objective, self.objective_higher_is_better)
-            #problem size is not given or not among the results, so return a good default
+            # problem size is not given or not among the results, so return a good default
             return _select_best_common_config(gpu_match, self.objective, self.objective_higher_is_better)
 
-        #gpu is not among the results, so return a good default
+        # gpu is not among the results, so return a good default
         return _select_best_common_config(self.data, self.objective, self.objective_higher_is_better)
 
 
-def store_results(results_filename, kernel_name, kernel_string, tune_params, problem_size, results, env, top=3, objective=None, objective_higher_is_better=None):
-    """ stores tuning results to a JSON file
-
-        Stores the top (3% by default) best kernel configurations in a JSON file.
-        The results are stored for a specific device (retrieved using env['device_name'])
-        and for a specific problem_size. If the file already exists, new results for
-        this device and problem_size will be appended. Any previous results already stored
-        in the file for this specific device and problem_size will be overwritten.
-
-        :param results_filename: Filename of the JSON file in which the results will be stored.
-            Results will be appended if the file already exists. Existing results within the
-            file for the same device and problem_size will be overwritten.
-        :type results_filename: string
-
-        :param tune_params: The tunable parameters of this kernel.
-        :type tune_params: dict
-
-        :param problem_size: The problem_size this kernel was tuned for
-        :type problem_size: tuple
-
-        :param results: A list of dictionaries of all executed kernel configurations and their
-            execution times, and possibly other user-defined metrics, as returned by
-            tune_kernel().
-        :type results: list(dict)
-
-        :param env: A dictionary with information about the environment
-            in which the tuning took place. This records device name, properties,
-            version info, and so on. Typicaly this dictionary is returned by tune_kernel().
-        :type env: dict
-
-        :param top: Denotes the top percentage of results to store in the results file
-        :type top: float
-
-        :param objective: Optimization objective to sort results on, consisting of a string
-            that also occurs in results as a metric.
-        :type objective: string
-
-        :param objective_higher_is_better: A boolean that specifies whether the objective should
-            be maximized or minimized.
-        :type objective_higher_is_better: bool
+def store_results(
+    results_filename,
+    kernel_name,
+    kernel_string,
+    tune_params,
+    problem_size,
+    results,
+    env,
+    top=3,
+    objective=None,
+    objective_higher_is_better=None,
+):
+    """stores tuning results to a JSON file
+
+    Stores the top (3% by default) best kernel configurations in a JSON file.
+    The results are stored for a specific device (retrieved using env['device_name'])
+    and for a specific problem_size. If the file already exists, new results for
+    this device and problem_size will be appended. Any previous results already stored
+    in the file for this specific device and problem_size will be overwritten.
+
+    :param results_filename: Filename of the JSON file in which the results will be stored.
+        Results will be appended if the file already exists. Existing results within the
+        file for the same device and problem_size will be overwritten.
+    :type results_filename: string
+
+    :param tune_params: The tunable parameters of this kernel.
+    :type tune_params: dict
+
+    :param problem_size: The problem_size this kernel was tuned for
+    :type problem_size: tuple
+
+    :param results: A list of dictionaries of all executed kernel configurations and their
+        execution times, and possibly other user-defined metrics, as returned by
+        tune_kernel().
+    :type results: list(dict)
+
+    :param env: A dictionary with information about the environment
+        in which the tuning took place. This records device name, properties,
+        version info, and so on. Typically this dictionary is returned by tune_kernel().
+    :type env: dict
+
+    :param top: Denotes the top percentage of results to store in the results file
+    :type top: float
+
+    :param objective: Optimization objective to sort results on, consisting of a string
+        that also occurs in results as a metric.
+    :type objective: string
+
+    :param objective_higher_is_better: A boolean that specifies whether the objective should
+        be maximized or minimized.
+    :type objective_higher_is_better: bool
 
     """
 
     objective, objective_higher_is_better = get_objective_defaults(objective, objective_higher_is_better)
 
-    #filter results to only those that contain the objective
+    # filter results to only those that contain the objective
     results_filtered = [item for item in results if objective in item]
 
-    #get top results
+    # get top results
     if objective_higher_is_better:
         best_config = max(results_filtered, key=lambda x: x[objective])
     else:
         best_config = min(results_filtered, key=lambda x: x[objective])
     best = best_config[objective]
-    top_range = top/100.0
+    top_range = top / 100.0
 
     def top_result(item):
         current = item[objective]
         if objective_higher_is_better:
-            return current > best * (1-top_range)
-        return current < best * (1+top_range)
+            return current > best * (1 - top_range)
+        return current < best * (1 + top_range)
+
     top_results = [item for item in results_filtered if top_result(item)]
 
-    #filter result items to just the tunable parameters and the objective
+    # filter result items to just the tunable parameters and the objective
     filter_keys = list(tune_params.keys()) + [objective]
-    top_results = [{k:item[k] for k in filter_keys} for item in top_results]
+    top_results = [{k: item[k] for k in filter_keys} for item in top_results]
 
-    #read existing results file
+    # read existing results file
     if os.path.isfile(results_filename):
         meta, data = _read_results_file(results_filename)
 
-        #validate consistency between arguments and results file
+        # validate consistency between arguments and results file
         if not kernel_name == meta["kernel_name"]:
             raise ValueError("Mismatch between given kernel_name and results file")
         if not all([param in meta["tunable_parameters"] for param in tune_params]):
@@ -200,7 +211,7 @@ def top_result(item):
         if not objective == meta["objective"]:
             raise ValueError("Mismatch between given objective and results file")
     else:
-        #new file
+        # new file
         meta = {}
         meta["version_number"] = "1.0"
         meta["kernel_name"] = kernel_name
@@ -214,18 +225,18 @@ def top_result(item):
         meta["tunable_parameters"] = list(tune_params.keys())
         data = []
 
-    #insert new results into the list
+    # insert new results into the list
     if not isinstance(problem_size, (list, tuple)):
         problem_size = (problem_size,)
     problem_size_str = "x".join(str(i) for i in problem_size)
 
-    #replace all non alphanumeric characters with underscore
-    dev_name = re.sub('[^0-9a-zA-Z]+', '_', env["device_name"].strip())
+    # replace all non alphanumeric characters with underscore
+    dev_name = re.sub("[^0-9a-zA-Z]+", "_", env["device_name"].strip())
 
-    #remove existing entries for this GPU and problem_size combination from the results if any
+    # remove existing entries for this GPU and problem_size combination from the results if any
     data = [d for d in data if not (d["device_name"] == dev_name and d["problem_size"] == problem_size_str)]
 
-    #extend the results with the top_results
+    # extend the results with the top_results
     results = []
     for result in top_results:
         record = {"device_name": dev_name, "problem_size": problem_size_str, "tunable_parameters": {}}
@@ -236,65 +247,65 @@ def top_result(item):
         results.append(record)
     data.extend(results)
 
-    #write output file
+    # write output file
     meta["data"] = data
-    with open(results_filename, 'w') as fh:
+    with open(results_filename, "w") as fh:
         fh.write(json.dumps(meta, indent=""))
 
 
 def create_device_targets(header_filename, results_filename, objective=None, objective_higher_is_better=None):
-    """ create a header with device targets
+    """create a header with device targets
 
-        This function generates a header file with device targets for compiling
-        a kernel with different parameters on different devices. The tuning
-        results are stored in a JSON file created by store_results. Existing
-        header_filename will be overwritten.
+    This function generates a header file with device targets for compiling
+    a kernel with different parameters on different devices. The tuning
+    results are stored in a JSON file created by store_results. Existing
+    header_filename will be overwritten.
 
-        This function only creates device targets and does not create problem_size
-        specific targets. Instead it searches for configurations that perform well
-        for different problem sizes and selects a single configuration to use
-        for the kernel.
+    This function only creates device targets and does not create problem_size
+    specific targets. Instead it searches for configurations that perform well
+    for different problem sizes and selects a single configuration to use
+    for the kernel.
 
-        The header file can be included in a kernel source file using:
-        ``#include "header_filename.h"``
+    The header file can be included in a kernel source file using:
+    ``#include "header_filename.h"``
 
-        The kernel can then be compiled for a specific device using:
-        ``-DTARGET_GPU="name_of_gpu"``
+    The kernel can then be compiled for a specific device using:
+    ``-DTARGET_GPU="name_of_gpu"``
 
-        The header will also include a default value, which is chosen to perform well
-        on different devices.
+    The header will also include a default value, which is chosen to perform well
+    on different devices.
 
-        :param header_filename: Filename of the to be created header file.
-        :type header_filename: string
+    :param header_filename: Filename of the header file to be created.
+    :type header_filename: string
 
-        :param results_filename: Filename of the JSON file that stores the tuning results.
-        :type results_filename: string
+    :param results_filename: Filename of the JSON file that stores the tuning results.
+    :type results_filename: string
 
-        :param objective: Optimization objective to sort results on, consisting of a string
-            that also occurs in results as a metric.
-        :type objective: string
+    :param objective: Optimization objective to sort results on, consisting of a string
+        that also occurs in results as a metric.
+    :type objective: string
 
-        :param objective_higher_is_better: A boolean that specifies whether the objective should
-            be maximized or minimized.
-        :type objective_higher_is_better: bool
+    :param objective_higher_is_better: A boolean that specifies whether the objective should
+        be maximized or minimized.
+    :type objective_higher_is_better: bool
 
     """
     objective, objective_higher_is_better = get_objective_defaults(objective, objective_higher_is_better)
 
-    #open results file
+    # open results file
     results = TuneResults(results_filename)
     data = results.data
 
-    #collect data for the if-block
+    # collect data for the if-block
     gpu_targets = list({r["device_name"] for r in data})
     targets = {}
     for gpu_name in gpu_targets:
         targets[gpu_name] = results.get_best_config(gpu_name)
 
-    #select a good default from all good configs
+    # select a good default from all good configs
     default_params = results.get_best_config()
 
-    #write the header output file
+    # write the header output file
     if_block = ""
     first = True
     for gpu_name, params in targets.items():
@@ -303,10 +314,10 @@ def create_device_targets(header_filename, results_filename, objective=None, obj
             first = False
         else:
             if_block += f"\n#elif TARGET_{gpu_name}\n"
-        if_block += "\n".join([f"#define {k} {v}" for k,v in params.items()])
+        if_block += "\n".join([f"#define {k} {v}" for k, v in params.items()])
         if_block += "\n"
 
-    default_config = "\n".join([f"#define {k} {v}" for k,v in default_params.items()])
+    default_config = "\n".join([f"#define {k} {v}" for k, v in default_params.items()])
 
     template_header_file = f"""/* header file generated by Kernel Tuner, do not modify by hand */
 #pragma once
@@ -320,79 +331,75 @@ def create_device_targets(header_filename, results_filename, objective=None, obj
 #endif /* kernel_tuner */
 """
 
-    with open(header_filename, 'w') as fh:
+    with open(header_filename, "w") as fh:
         fh.write(template_header_file)
 
 
-
-
 def _select_best_common_config(results, objective, objective_higher_is_better):
-    """ return the most common config among results obtained on different problem sizes """
+    """return the most common config among results obtained on different problem sizes"""
     results_table = {}
     total_performance = {}
 
     inverse_table = {}
 
-    #for each configuration in the list
+    # for each configuration in the list
     for config in results:
         params = config["tunable_parameters"]
 
         config_str = util.get_instance_string(params)
-        #count occurances
-        results_table[config_str] = results_table.get(config_str,0) + 1
-        #add to performance
-        total_performance[config_str] = total_performance.get(config_str,0) + config[objective]
-        #store mapping from config_str to the parameters
+        # count occurrences
+        results_table[config_str] = results_table.get(config_str, 0) + 1
+        # add to performance
+        total_performance[config_str] = total_performance.get(config_str, 0) + config[objective]
+        # store mapping from config_str to the parameters
         inverse_table[config_str] = params
 
-    #look for best config
+    # look for best config
     top_freq = max(results_table.values())
     best_configs = [k for k in results_table if results_table[k] == top_freq]
 
-    #intersect total_performance with the best_configs
-    total_performance = {k:total_performance[k] for k in total_performance if k in best_configs}
+    # intersect total_performance with the best_configs
+    total_performance = {k: total_performance[k] for k in total_performance if k in best_configs}
 
-    #get the best config from this intersection
+    # get the best config from this intersection
     if objective_higher_is_better:
         best_config_str = max(total_performance.keys(), key=lambda x: total_performance[x])
     else:
         best_config_str = min(total_performance.keys(), key=lambda x: total_performance[x])
 
-    #lookup the tunable parameters of this configuration in the inverse table and return result
+    # lookup the tunable parameters of this configuration in the inverse table and return result
     return inverse_table[best_config_str]
 
 
 def _get_best_config_from_list(configs, objective, objective_higher_is_better):
-    """ return the tunable parameters of the best config from a list of configs """
+    """return the tunable parameters of the best config from a list of configs"""
     if objective_higher_is_better:
         best_config = max(configs, key=lambda x: x[objective])
     else:
         best_config = min(configs, key=lambda x: x[objective])
-    best_config_params = {k:best_config[k] for k in best_config if k != objective}
+    best_config_params = {k: best_config[k] for k in best_config if k != objective}
     return best_config_params
 
 
-
-
 def _read_results_file(results_filename):
-    """ Reader for results file
-
-        File format 1.0 specifies the following metadata
-        "version_number": string e.g. "1.0"
-        "tunable_parameters": list of strings
-        "kernel_name": string
-        "kernel_string": string with kernel code, optional
-        "objective": string
-        "objective_higher_is_better": True or False, default False
-        "data": list of dicts
-            each dict consists of the following keys:
-            - "device_name": device name as reported by the device, with all non-alphanumeric characters replaced with "_"
-            - "problem_size": a concatenated string of problem dimensions using "x" as separator
-            - "tunable_parameters": a dict with all tunable parameters
-            - "objective" as specified in the "objective" metadata
+    """Reader for results file
+
+    File format 1.0 specifies the following metadata
+    "version_number": string e.g. "1.0"
+    "tunable_parameters": list of strings
+    "kernel_name": string
+    "kernel_string": string with kernel code, optional
+    "objective": string
+    "objective_higher_is_better": True or False, default False
+    "data": list of dicts
+        each dict consists of the following keys:
+        - "device_name": device name as reported by the device, with all non-alphanumeric characters replaced with "_"
+        - "problem_size": a concatenated string of problem dimensions using "x" as separator
+        - "tunable_parameters": a dict with all tunable parameters
+        - "objective" as specified in the "objective" metadata
 
     """
-    with open(results_filename, 'r') as fh:
+    with open(results_filename, "r") as fh:
         data = json.loads(fh.read())
 
     if "version_number" in data:
@@ -402,7 +409,6 @@ def _read_results_file(results_filename):
     raise ValueError("Results fileformat not recognized")
 
 
-
 def _parse_results_file_version_1_0(data):
     validate(instance=data, schema=schema_v1_0)
 
@@ -412,7 +418,7 @@ def _parse_results_file_version_1_0(data):
     meta["kernel_string"] = data.get("kernel_string", "")
     entries = data["data"]
 
-    #do some final checks against the metadata that cannot be handled by the JSON schema
+    # do some final checks against the metadata that cannot be handled by the JSON schema
     entry_keys = ["tunable_parameters"] + [meta["objective"]] + ["device_name", "problem_size"]
     for entry in entries:
         if not all([k in entry for k in entry_keys]):
diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py
index 97ae22848..dc7d452a8 100644
--- a/kernel_tuner/interface.py
+++ b/kernel_tuner/interface.py
@@ -34,6 +34,7 @@
 from kernel_tuner.integration import get_objective_defaults
 from kernel_tuner.runners.sequential import SequentialRunner
 from kernel_tuner.runners.simulation import SimulationRunner
+from kernel_tuner.runners.parallel import ParallelRunner
 from kernel_tuner.searchspace import Searchspace
 
 try:
@@ -57,6 +58,7 @@
     pso,
     random_sample,
     simulated_annealing,
+    ensemble,
 )
 
 strategy_map = {
@@ -75,6 +77,7 @@
     "simulated_annealing": simulated_annealing,
     "firefly_algorithm": firefly_algorithm,
     "bayes_opt": bayes_opt,
+    "ensemble": ensemble,
 }
 
 
@@ -384,6 +387,7 @@ def __deepcopy__(self, _):
             * "pso" particle swarm optimization
             * "random_sample" takes a random sample of the search space
             * "simulated_annealing" simulated annealing strategy
+            * "ensemble" ensemble strategy
 
         Strategy-specific parameters and options are explained under strategy_options.
 
@@ -463,6 +467,7 @@ def __deepcopy__(self, _):
         ),
         ("metrics", ("specifies user-defined metrics, please see :ref:`metrics`.", "dict")),
         ("simulation_mode", ("Simulate an auto-tuning search from an existing cachefile", "bool")),
+        ("parallel_mode", ("Run the auto-tuning on multiple devices (brute-force execution)", "bool")),
         ("observers", ("""A list of Observers to use during tuning, please see :ref:`observers`.""", "list")),
     ]
 )
@@ -574,6 +579,7 @@ def tune_kernel(
     cache=None,
     metrics=None,
     simulation_mode=False,
+    parallel_mode=False,
     observers=None,
     objective=None,
     objective_higher_is_better=None,
@@ -611,6 +617,8 @@ def tune_kernel(
         tuning_options["max_fevals"] = strategy_options["max_fevals"]
     if strategy_options and "time_limit" in strategy_options:
         tuning_options["time_limit"] = strategy_options["time_limit"]
+    if strategy_options and "num_gpus" in strategy_options:
+        tuning_options["num_gpus"] = strategy_options["num_gpus"]
 
     logging.debug("tune_kernel called")
     logging.debug("kernel_options: %s", util.get_config_string(kernel_options))
@@ -650,9 +658,17 @@ def tune_kernel(
         strategy = brute_force
 
     # select the runner for this job based on input
-    selected_runner = SimulationRunner if simulation_mode else SequentialRunner
+    selected_runner = SequentialRunner
+    if simulation_mode:
+        selected_runner = SimulationRunner
+    elif parallel_mode:
+        selected_runner = ParallelRunner
     tuning_options.simulated_time = 0
-    runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers)
+    if selected_runner is ParallelRunner:
+        num_gpus = tuning_options["num_gpus"] if "num_gpus" in tuning_options else None
+        runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers, num_gpus=num_gpus)
+    else:
+        runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers)
 
     # the user-specified function may or may not have an optional atol argument;
     # we normalize it so that it always accepts atol.
@@ -684,7 +700,7 @@ def tune_kernel(
     if results:  # checks if results is not empty
         best_config = util.get_best_config(results, objective, objective_higher_is_better)
         # add the best configuration to env
-        env['best_config'] = best_config
+        env["best_config"] = best_config
         if not device_options.quiet:
             units = getattr(runner, "units", None)
             print("best performing configuration:")
diff --git a/kernel_tuner/kernelbuilder.py b/kernel_tuner/kernelbuilder.py
index 0f3f6154f..1e46ac811 100644
--- a/kernel_tuner/kernelbuilder.py
+++ b/kernel_tuner/kernelbuilder.py
@@ -5,53 +5,69 @@
 
 from kernel_tuner.integration import TuneResults
 
-class PythonKernel(object):
-
-    def __init__(self, kernel_name, kernel_string, problem_size, arguments, params=None, inputs=None, outputs=None, device=0, platform=0,
-                 block_size_names=None, grid_div_x=None, grid_div_y=None, grid_div_z=None, verbose=True, lang=None,
-                 results_file=None):
-        """ Construct Python helper object to compile and call the kernel from Python
-
-            This object compiles a GPU kernel parameterized using the parameters in params.
-            GPU memory is allocated for each argument using its size and type as listed in arguments.
-            The object can be called directly as a function with the kernel arguments as function arguments.
-            Kernel arguments marked as inputs will be copied to the GPU on every kernel launch.
-            Only the kernel arguments marked as outputs will be returned, note that the result is always
-            returned in a list, even when there is only one output.
 
-            Most of the arguments to this function are the same as with tune_kernel or run_kernel in Kernel Tuner,
-            and are therefore not duplicated here. The two new arguments are:
-
-            :param inputs: a boolean list of length arguments to signal whether an argument is input to the kernel
-            :type inputs: list(bool)
-
-            :param outputs: a boolean list of length arguments to signal whether an argument is output of the kernel
-            :type outputs: list(bool)
+class PythonKernel(object):
+    def __init__(
+        self,
+        kernel_name,
+        kernel_string,
+        problem_size,
+        arguments,
+        params=None,
+        inputs=None,
+        outputs=None,
+        device=0,
+        platform=0,
+        block_size_names=None,
+        grid_div_x=None,
+        grid_div_y=None,
+        grid_div_z=None,
+        verbose=True,
+        lang=None,
+        results_file=None,
+    ):
+        """Construct Python helper object to compile and call the kernel from Python
+
+        This object compiles a GPU kernel parameterized using the parameters in params.
+        GPU memory is allocated for each argument using its size and type as listed in arguments.
+        The object can be called directly as a function with the kernel arguments as function arguments.
+        Kernel arguments marked as inputs will be copied to the GPU on every kernel launch.
+        Only the kernel arguments marked as outputs will be returned, note that the result is always
+        returned in a list, even when there is only one output.
+
+        Most of the arguments to this function are the same as with tune_kernel or run_kernel in Kernel Tuner,
+        and are therefore not duplicated here. The two new arguments are:
+
+        :param inputs: a boolean list of length arguments to signal whether an argument is input to the kernel
+        :type inputs: list(bool)
+
+        :param outputs: a boolean list of length arguments to signal whether an argument is output of the kernel
+        :type outputs: list(bool)
 
         """
-        #construct device interface
+        # construct device interface
         kernel_source = core.KernelSource(kernel_name, kernel_string, lang)
         self.dev = core.DeviceInterface(kernel_source, device=device, quiet=True)
         if not params:
             params = {}
 
-        #if results_file is passed use the results file to lookup tunable parameters
+        # if results_file is passed use the results file to lookup tunable parameters
         if results_file:
             results = TuneResults(results_file)
             params.update(results.get_best_config(self.dev.name, problem_size))
         self.params = params
 
-        #construct kernel_options to hold information about the kernel
+        # construct kernel_options to hold information about the kernel
         opts = locals()
         kernel_options = Options([(k, opts[k]) for k in _kernel_options.keys() if k in opts.keys()])
 
-        #instantiate the kernel given the parameters in params
+        # instantiate the kernel given the parameters in params
         self.kernel_instance = self.dev.create_kernel_instance(kernel_source, kernel_options, params, verbose)
 
-        #compile the kernel
+        # compile the kernel
         self.func = self.dev.compile_kernel(self.kernel_instance, verbose)
 
-        #setup GPU memory
+        # setup GPU memory
         self.gpu_args = self.dev.ready_argument_list(arguments)
         if inputs:
             self.inputs = inputs
diff --git a/kernel_tuner/observers/hip.py b/kernel_tuner/observers/hip.py
index c536cf965..a21bb18bd 100644
--- a/kernel_tuner/observers/hip.py
+++ b/kernel_tuner/observers/hip.py
@@ -14,7 +14,9 @@ class HipRuntimeObserver(BenchmarkObserver):
 
     def __init__(self, dev):
         if not hip or not hiprtc:
-            raise ImportError("Unable to import HIP Python, or check https://kerneltuner.github.io/kernel_tuner/stable/install.html#hip-and-hip-python.")
+            raise ImportError(
+                "Unable to import HIP Python, or check https://kerneltuner.github.io/kernel_tuner/stable/install.html#hip-and-hip-python."
+            )
 
         self.dev = dev
         self.stream = dev.stream
diff --git a/kernel_tuner/observers/ncu.py b/kernel_tuner/observers/ncu.py
index c727e1e30..0956dd2a7 100644
--- a/kernel_tuner/observers/ncu.py
+++ b/kernel_tuner/observers/ncu.py
@@ -2,24 +2,25 @@
 
 try:
     import nvmetrics
-except (ImportError):
+except ImportError:
     nvmetrics = None
 
+
 class NCUObserver(PrologueObserver):
     """``NCUObserver`` measures performance counters.
 
-        The exact performance counters supported differ per GPU, some examples:
+    The exact performance counters supported differ per GPU, some examples:
 
-         * "dram__bytes.sum",                                     # Counter         byte            # of bytes accessed in DRAM
-         * "dram__bytes_read.sum",                                # Counter         byte            # of bytes read from DRAM
-         * "dram__bytes_write.sum",                               # Counter         byte            # of bytes written to DRAM
-         * "smsp__sass_thread_inst_executed_op_fadd_pred_on.sum", # Counter         inst            # of FADD thread instructions executed where all predicates were true
-         * "smsp__sass_thread_inst_executed_op_ffma_pred_on.sum", # Counter         inst            # of FFMA thread instructions executed where all predicates were true
-         * "smsp__sass_thread_inst_executed_op_fmul_pred_on.sum", # Counter         inst            # of FMUL thread instructions executed where all predicates were true
+     * "dram__bytes.sum",                                     # Counter         byte            # of bytes accessed in DRAM
+     * "dram__bytes_read.sum",                                # Counter         byte            # of bytes read from DRAM
+     * "dram__bytes_write.sum",                               # Counter         byte            # of bytes written to DRAM
+     * "smsp__sass_thread_inst_executed_op_fadd_pred_on.sum", # Counter         inst            # of FADD thread instructions executed where all predicates were true
+     * "smsp__sass_thread_inst_executed_op_ffma_pred_on.sum", # Counter         inst            # of FFMA thread instructions executed where all predicates were true
+     * "smsp__sass_thread_inst_executed_op_fmul_pred_on.sum", # Counter         inst            # of FMUL thread instructions executed where all predicates were true
 
-        :param metrics: The metrics to observe. This should be a list of strings.
-                        You can use ``ncu --query-metrics`` to get a list of valid metrics.
-        :type metrics: list[str]
+    :param metrics: The metrics to observe. This should be a list of strings.
+                    You can use ``ncu --query-metrics`` to get a list of valid metrics.
+    :type metrics: list[str]
 
     """
 
diff --git a/kernel_tuner/observers/nvml.py b/kernel_tuner/observers/nvml.py
index 24e3275af..8b8529aa2 100644
--- a/kernel_tuner/observers/nvml.py
+++ b/kernel_tuner/observers/nvml.py
@@ -326,6 +326,15 @@ def __init__(
         continuous_duration=1,
     ):
         """Create an NVMLObserver."""
+        # needed for re-initializing observer on ray actor
+        self.init_arguments = {
+            "observables": observables,
+            "device": device,
+            "save_all": save_all,
+            "nvidia_smi_fallback": nvidia_smi_fallback,
+            "use_locked_clocks": use_locked_clocks,
+            "continuous_duration": continuous_duration,
+        }
         if nvidia_smi_fallback:
             self.nvml = nvml(
                 device,
@@ -355,7 +364,9 @@ def __init__(
         if any([obs in self.needs_power for obs in observables]):
             self.measure_power = True
             power_observables = [obs for obs in observables if obs in self.needs_power]
-            self.continuous_observer = ContinuousObserver("nvml", power_observables, self, continuous_duration=continuous_duration)
+            self.continuous_observer = ContinuousObserver(
+                "nvml", power_observables, self, continuous_duration=continuous_duration
+            )
 
         # remove power observables
         self.observables = [obs for obs in observables if obs not in self.needs_power]
@@ -374,7 +385,7 @@ def __init__(
         self.iteration = {obs: [] for obs in self.during_obs}
 
     def read_power(self):
-        """ Return power in Watt """
+        """Return power in Watt"""
         return self.nvml.pwr_usage() / 1e3
 
     def before_start(self):
diff --git a/kernel_tuner/observers/observer.py b/kernel_tuner/observers/observer.py
index bcf661c8a..545e3130f 100644
--- a/kernel_tuner/observers/observer.py
+++ b/kernel_tuner/observers/observer.py
@@ -2,6 +2,7 @@
 import time
 import numpy as np
 
+
 class BenchmarkObserver(ABC):
     """Base class for Benchmark Observers"""
 
@@ -47,9 +48,10 @@ class IterationObserver(BenchmarkObserver):
 class ContinuousObserver(BenchmarkObserver):
     """Generic observer that measures power while and continuous benchmarking.
 
-        To support continuous benchmarking an Observer should support:
-        a .read_power() method, which the ContinuousObserver can call to read power in Watt
+    To support continuous benchmarking an Observer should support:
+    a .read_power() method, which the ContinuousObserver can call to read power in Watt
     """
+
     def __init__(self, name, observables, parent, continuous_duration=1):
         self.parent = parent
         self.name = name
@@ -89,8 +91,7 @@ def during(self):
         timestamp = time.perf_counter() - self.t0
         # only store the result if we get a new measurement from the GPU
         if len(self.power_readings) == 0 or (
-            self.power_readings[-1][1] != power_usage
-            or timestamp - self.power_readings[-1][0] > 0.01
+            self.power_readings[-1][1] != power_usage or timestamp - self.power_readings[-1][0] > 0.01
         ):
             self.power_readings.append([timestamp, power_usage])
 
@@ -118,6 +119,7 @@ def get_results(self):
             results["power_readings"] = self.power_readings
         return results
 
+
 class OutputObserver(BenchmarkObserver):
     """Observer that can verify or measure something about the output produced by a kernel."""
 
@@ -128,6 +130,7 @@ def process_output(self, answer, output):
         """
         pass
 
+
 class PrologueObserver(BenchmarkObserver):
     """Observer that measures something in a seperate kernel invocation prior to the normal benchmark."""
 
diff --git a/kernel_tuner/observers/pmt.py b/kernel_tuner/observers/pmt.py
index 254bec5c9..268d1177d 100644
--- a/kernel_tuner/observers/pmt.py
+++ b/kernel_tuner/observers/pmt.py
@@ -50,6 +50,9 @@ def __init__(self, observable=None, use_continuous_observer=False, continuous_du
         if not pmt:
             raise ImportError("could not import pmt")
 
+        # needed for re-initializing observer on ray actor
+        self.init_arguments = {"observable": observable}
+
         # User specifices a dictonary of platforms and corresponding device
         if type(observable) is dict:
             pass
@@ -106,18 +109,19 @@ def get_results(self):
 class PMTContinuousObserver(ContinuousObserver):
     """Generic observer that measures power while and continuous benchmarking.
 
-        To support continuous benchmarking an Observer should support:
-        a .read_power() method, which the ContinuousObserver can call to read power in Watt
+    To support continuous benchmarking an Observer should support:
+    a .read_power() method, which the ContinuousObserver can call to read power in Watt
     """
+
     def before_start(self):
-        """ Override default method in ContinuousObserver """
+        """Override default method in ContinuousObserver"""
         pass
 
     def after_start(self):
         self.parent.after_start()
 
     def during(self):
-        """ Override default method in ContinuousObserver """
+        """Override default method in ContinuousObserver"""
         pass
 
     def after_finish(self):
diff --git a/kernel_tuner/observers/powersensor.py b/kernel_tuner/observers/powersensor.py
index 6d07e8977..889071a9f 100644
--- a/kernel_tuner/observers/powersensor.py
+++ b/kernel_tuner/observers/powersensor.py
@@ -28,6 +28,9 @@ def __init__(self, observables=None, device=None):
         if not powersensor:
             raise ImportError("could not import powersensor")
 
+        # needed for re-initializing observer on ray actor
+        self.init_arguments = {"observables": observables, "device": device}
+
         supported = ["ps_energy", "ps_power"]
         for obs in observables:
             if not obs in supported:
@@ -46,14 +49,10 @@ def after_start(self):
     def after_finish(self):
         end_state = self.ps.read()
         if "ps_energy" in self.observables:
-            ps_measured_e = powersensor.Joules(
-                self.begin_state, end_state, -1
-            )  # Joules
+            ps_measured_e = powersensor.Joules(self.begin_state, end_state, -1)  # Joules
             self.results["ps_energy"].append(ps_measured_e)
         if "ps_power" in self.observables:
-            ps_measured_t = (
-                    end_state.time_at_read - self.begin_state.time_at_read
-            )  # seconds
+            ps_measured_t = end_state.time_at_read - self.begin_state.time_at_read  # seconds
             self.results["ps_power"].append(ps_measured_e / ps_measured_t)  # Watt
 
     def get_results(self):
diff --git a/kernel_tuner/observers/tegra.py b/kernel_tuner/observers/tegra.py
index efc83048c..84495b1de 100644
--- a/kernel_tuner/observers/tegra.py
+++ b/kernel_tuner/observers/tegra.py
@@ -83,11 +83,11 @@ def get_gpu_channel(self):
         # Iterate over all channels in the of_node dir of the power path to
         # find the channel which holds GPU power information
         for channel_dir in Path(self.gpu_power_path + "/of_node/").iterdir():
-            if("channel@" in channel_dir.name):
+            if "channel@" in channel_dir.name:
                 with open(channel_dir / Path("label")) as fp:
                     channel_label = fp.read().strip()
                 if "GPU" in channel_label:
-                    return str(int(channel_dir.name[-1])+1)
+                    return str(int(channel_dir.name[-1]) + 1)
 
         # If this statement is reached, no channel for the GPU was found
         raise FileNotFoundError("No channel found with GPU power readings")
@@ -103,12 +103,7 @@ def _write_railgate_file(self, value):
         if value not in (0, 1):
             raise ValueError(f"Illegal governor value {value}, must be 0 or 1")
         full_path = self.dev_path / Path("device/railgate_enable")
-        args = [
-            "sudo",
-            "sh",
-            "-c",
-            f"echo {value} > {str(full_path)}"
-        ]
+        args = ["sudo", "sh", "-c", f"echo {value} > {str(full_path)}"]
         subprocess.run(args, check=True)
 
     def _read_clock_file(self, fname):
@@ -132,12 +127,7 @@ def _write_clock_file(self, fname, value):
             raise ValueError(f"Illegal frequency value {value}, must be one of {self.supported_gr_clocks}")
 
         full_path = self.dev_path / Path(fname)
-        args = [
-            "sudo",
-            "sh",
-            "-c",
-            f"echo {value} > {str(full_path)}"
-        ]
+        args = ["sudo", "sh", "-c", f"echo {value} > {str(full_path)}"]
         subprocess.run(args, check=True)
 
     @property
@@ -157,8 +147,8 @@ def gr_clock(self, new_clock):
             self._write_clock_file("min_freq", new_clock)
             self._write_clock_file("max_freq", new_clock)
         # wait for the new clock to be applied
-        while (self._read_clock_file("cur_freq") != new_clock):
-            time.sleep(.001)
+        while self._read_clock_file("cur_freq") != new_clock:
+            time.sleep(0.001)
 
     def reset_clock(self):
         """Reset the core clock frequency to the original values"""
@@ -180,9 +170,13 @@ def read_gpu_temp(self):
     def read_gpu_power(self):
         """Read the current and voltage to calculate and return the power int watt"""
 
-        result_cur = subprocess.run(["sudo", "cat", f"{self.gpu_power_path}/curr{self.gpu_channel}_input"], capture_output=True, text=True)
+        result_cur = subprocess.run(
+            ["sudo", "cat", f"{self.gpu_power_path}/curr{self.gpu_channel}_input"], capture_output=True, text=True
+        )
         current = int(result_cur.stdout.strip()) / 1000
-        result_vol = subprocess.run(["sudo", "cat", f"{self.gpu_power_path}/in{self.gpu_channel}_input"], capture_output=True, text=True)
+        result_vol = subprocess.run(
+            ["sudo", "cat", f"{self.gpu_power_path}/in{self.gpu_channel}_input"], capture_output=True, text=True
+        )
         voltage = int(result_vol.stdout.strip()) / 1000
 
         return current * voltage
@@ -200,13 +194,7 @@ class TegraObserver(BenchmarkObserver):
 
     """
 
-    def __init__(
-        self,
-        observables,
-        save_all=False,
-        power_path="",
-        temp_path=""
-    ):
+    def __init__(self, observables, save_all=False, power_path="", temp_path=""):
         """Create a TegraObserver"""
         self.tegra = tegra(power_path=power_path, temp_path=temp_path)
         self.save_all = save_all
@@ -233,19 +221,13 @@ def __init__(
         for obs in self.observables:
             self.results[obs + "s"] = []
 
-        self.during_obs = [
-            obs
-            for obs in observables
-            if obs in ["core_freq", "tegra_temp"]
-        ]
+        self.during_obs = [obs for obs in observables if obs in ["core_freq", "tegra_temp"]]
 
         self.iteration = {obs: [] for obs in self.during_obs}
 
-
     def read_power(self):
         return self.tegra.read_gpu_power()
 
-
     def before_start(self):
         # clear results of the observables for next measurement
         self.iteration = {obs: [] for obs in self.during_obs}
diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py
new file mode 100644
index 000000000..f5266dde1
--- /dev/null
+++ b/kernel_tuner/runners/parallel.py
@@ -0,0 +1,343 @@
+"""Parallel runner"""
+import ray
+import sys
+from ray.util.actor_pool import ActorPool
+from time import perf_counter
+from collections import deque
+import copy
+
+from kernel_tuner.core import DeviceInterface
+from kernel_tuner.runners.runner import Runner
+from kernel_tuner.util import get_num_devices, GPUTypeMismatchError
+from kernel_tuner.runners.ray.cache_manager import CacheManager
+from kernel_tuner.strategies.common import create_actor_on_device, initialize_ray
+
+
+class ParallelRunner(Runner):
+    """ParallelRunner is used for tuning with multiple processes/threads using Ray for distributed computing."""
+
+    def __init__(
+        self,
+        kernel_source,
+        kernel_options,
+        device_options,
+        iterations,
+        observers,
+        num_gpus=None,
+        cache_manager=None,
+        actors=None,
+        simulation_mode=False,
+    ):
+        """Instantiate the ParallelRunner.
+
+        :param kernel_source: The kernel source
+        :type kernel_source: kernel_tuner.core.KernelSource
+
+        :param kernel_options: A dictionary with all options for the kernel.
+        :type kernel_options: kernel_tuner.interface.Options
+
+        :param device_options: A dictionary with all options for the device
+            on which the kernel should be tuned.
+        :type device_options: kernel_tuner.interface.Options
+
+        :param iterations: The number of iterations used for benchmarking
+            each kernel instance.
+        :type iterations: int
+
+        :param observers: List of observers.
+        :type observers: list
+
+        :param num_gpus: Number of GPUs to use. Defaults to None.
+        :type num_gpus: int, optional
+
+        :param cache_manager: Cache manager instance. Defaults to None.
+        :type cache_manager: kernel_tuner.runners.ray.cache_manager.CacheManager, optional
+
+        :param actors: List of pre-initialized actors. Defaults to None.
+        :type actors: list, optional
+
+        :param simulation_mode: Flag to indicate simulation mode. Defaults to False.
+        :type simulation_mode: bool, optional
+        """
+        self.dev = (
+            DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options)
+            if not simulation_mode
+            else None
+        )
+        self.kernel_source = kernel_source
+        self.simulation_mode = simulation_mode
+        self.kernel_options = kernel_options
+        self.start_time = perf_counter()
+        self.last_strategy_start_time = self.start_time
+        self.observers = observers
+        self.iterations = iterations
+        self.device_options = device_options
+        self.cache_manager = cache_manager
+        self.num_gpus = num_gpus
+        self.actors = actors
+
+        initialize_ray()
+
+        if num_gpus is None:
+            self.num_gpus = get_num_devices(simulation_mode)
+
+        # So we know the number of GPUs in the cache file
+        if not simulation_mode:
+            self.dev.name = [self.dev.name] * self.num_gpus
+
+    def get_environment(self, tuning_options):
+        return self.dev.get_environment()
+
+    def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchspace=None, cache_manager=None):
+        """Run the tuning process with parallel execution.
+
+        :param parameter_space: The parameter space to explore.
+        :type parameter_space: iterable
+
+        :param tuning_options: Tuning options. Required: passing None raises a ValueError.
+        :type tuning_options: dict
+
+        :param ensemble: List of strategies for ensemble. Defaults to None.
+        :type ensemble: list, optional
+
+        :param searchspace: The search space to explore. Defaults to None.
+        :type searchspace: kernel_tuner.searchspace.Searchspace, optional
+
+        :param cache_manager: Cache manager instance. Defaults to None.
+        :type cache_manager: kernel_tuner.runners.ray.cache_manager.CacheManager, optional
+
+        :returns: Results of the tuning process.
+        :rtype: list of dict
+        """
+        if tuning_options is None:
+            # HACK as tuning_options can't be the first argument and parameter_space needs to be a default argument
+            raise ValueError("tuning_options cannot be None")
+
+        # Create RemoteActor instances
+        if self.actors is None:
+            runner_attributes = [
+                self.kernel_source,
+                self.kernel_options,
+                self.device_options,
+                self.iterations,
+                self.observers,
+            ]
+            self.actors = [
+                create_actor_on_device(
+                    *runner_attributes,
+                    identifier=_id,
+                    cache_manager=self.cache_manager,
+                    simulation_mode=self.simulation_mode,
+                )
+                for _id in range(self.num_gpus)
+            ]
+
+        # Check if all GPUs are of the same type
+        if not self.simulation_mode and not self._check_gpus_equals():
+            raise GPUTypeMismatchError("Different GPU types found")
+
+        if self.cache_manager is None:
+            if cache_manager is None:
+                cache_manager = CacheManager.remote(tuning_options.cache, tuning_options.cachefile)
+            self.cache_manager = cache_manager
+
+        # Hand the cache manager to every actor here rather than in the constructor, because tuning_options is not always available at construction time
+        for actor in self.actors:
+            actor.set_cache_manager.remote(self.cache_manager)
+
+        # Some observers can't be pickled
+        run_tuning_options = copy.deepcopy(tuning_options)
+        run_tuning_options["observers"] = None
+        # Determine what type of parallelism and run appropriately
+        if parameter_space and not ensemble and not searchspace:
+            results, tuning_options_list = self.parallel_function_evaluation(run_tuning_options, parameter_space)
+        elif ensemble and searchspace and not parameter_space:
+            results, tuning_options_list = self.multi_strategy_parallel_execution(
+                ensemble, run_tuning_options, searchspace
+            )
+        else:
+            raise ValueError("Invalid arguments to parallel runner run method")
+
+        # Update tuning options
+        # NOTE: tuning options won't have the state of the observers created in the actors as they can't be pickled
+        cache, cachefile = ray.get(self.cache_manager.get_cache.remote())
+        tuning_options.cache = cache
+        tuning_options.cachefile = cachefile
+        if self.simulation_mode:
+            tuning_options.simulated_time += self._calculate_simulated_time(tuning_options_list)
+
+        return results
+
+    def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspace):
+        """Runs strategies from the ensemble in parallel using distributed actors,
+        manages dynamic task allocation, and collects results.
+
+        :param ensemble: List of strategies to execute.
+        :type ensemble: list
+
+        :param tuning_options: Tuning options; ``strategy_options["max_fevals"]`` is split over the strategies.
+        :type tuning_options: dict
+
+        :param searchspace: Search space to explore.
+        :type searchspace: kernel_tuner.searchspace.Searchspace
+
+        :returns: Processed results and tuning options list.
+        :rtype: tuple
+        """
+        ensemble_queue = deque(ensemble)
+        pending_tasks = {}
+        all_results = []
+        options = tuning_options.strategy_options
+        max_feval = options["max_fevals"]
+        num_strategies = len(ensemble)
+
+        # split max_fevals evenly over the strategies; the first `remainder` strategies get one extra evaluation
+        base_eval_per_strategy = max_feval // num_strategies
+        remainder = max_feval % num_strategies
+        evaluations_per_strategy = [base_eval_per_strategy] * num_strategies
+        for i in range(remainder):
+            evaluations_per_strategy[i] += 1
+
+        # every strategy explores the same search space: one queue entry per strategy
+        searchspaces = [searchspace] * num_strategies
+        searchspaces = deque(searchspaces)
+
+        # Start one initial task per actor, but never more than there are strategies: popleft() on an empty deque raises IndexError
+        for actor in self.actors[:num_strategies]:
+            strategy = ensemble_queue.popleft()
+            searchspace = searchspaces.popleft()
+            remote_tuning_options = self._setup_tuning_options(tuning_options, evaluations_per_strategy)
+            task = actor.execute.remote(
+                strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options
+            )
+            pending_tasks[task] = actor
+
+        # Manage task completion and redistribution
+        while pending_tasks:
+            done_ids, _ = ray.wait(list(pending_tasks.keys()), num_returns=1)
+            for done_id in done_ids:
+                result = ray.get(done_id)
+                all_results.append(result)
+                actor = pending_tasks.pop(done_id)
+
+                # Reassign the freed actor to the next strategy, if any remain in the queue
+                if ensemble_queue:
+                    strategy = ensemble_queue.popleft()
+                    searchspace = searchspaces.popleft()
+                    remote_tuning_options = self._setup_tuning_options(tuning_options, evaluations_per_strategy)
+                    task = actor.execute.remote(
+                        strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options
+                    )
+                    pending_tasks[task] = actor
+
+        # Process results
+        results, tuning_options_list = self._process_results_ensemble(all_results)
+
+        return results, tuning_options_list
+
+    def _setup_tuning_options(self, tuning_options, evaluations_per_strategy):
+        """Set up tuning options for each strategy in the ensemble.
+
+        :param tuning_options: Original tuning options.
+        :type tuning_options: dict
+
+        :param evaluations_per_strategy: Number of evaluations per strategy.
+        :type evaluations_per_strategy: list
+
+        :returns: Modified tuning options.
+        :rtype: dict
+        """
+        new_tuning_options = copy.deepcopy(tuning_options)
+        new_tuning_options.strategy_options["max_fevals"] = evaluations_per_strategy.pop(0)
+        # the stop criterion uses the max feval in tuning options for some reason
+        new_tuning_options["max_fevals"] = new_tuning_options.strategy_options["max_fevals"]
+        return new_tuning_options
+
+    def _process_results_ensemble(self, all_results):
+        """Process the results from the ensemble execution.
+
+        :param all_results: List of results from all strategies.
+        :type all_results: list
+
+        :returns: Processed results and tuning options list.
+        :rtype: tuple
+        """
+        results = []
+        tuning_options_list = []
+
+        for strategy_results, tuning_options in all_results:
+            results.extend(strategy_results)
+            tuning_options_list.append(tuning_options)
+
+        return results, tuning_options_list
+
+    def parallel_function_evaluation(self, tuning_options, parameter_space):
+        """Perform parallel function evaluation.
+
+        :param tuning_options: Tuning options.
+        :type tuning_options: dict
+
+        :param parameter_space: Parameter space to explore.
+        :type parameter_space: list
+
+        :returns: Results and tuning options list.
+        :rtype: tuple
+        """
+        # Create a pool of RemoteActor actors
+        self.actor_pool = ActorPool(self.actors)
+        # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously.
+        all_results = list(
+            self.actor_pool.map_unordered(lambda a, v: a.execute.remote(tuning_options, element=v), parameter_space)
+        )
+        results = [x[0] for x in all_results]
+        tuning_options_list = [x[1] for x in all_results]
+        return results, tuning_options_list
+
+    def _process_results(self, all_results, searchspace):
+        """
+        Process the results and remove duplicates based on the searchspace.
+        """
+        unique_configs = set()
+        final_results = []
+
+        for strategy_results, tuning_options in all_results:
+            for new_result in strategy_results:
+                config_signature = tuple(new_result[key] for key in searchspace.tune_params)
+                if config_signature not in unique_configs:
+                    final_results.append(new_result)
+                    unique_configs.add(config_signature)
+        return final_results
+
+    def _calculate_simulated_time(self, tuning_options_list):
+        """
+        Calculate the maximum simulated time from the list of tuning options.
+        """
+        simulated_times = []
+        for tuning_options in tuning_options_list:
+            simulated_times.append(tuning_options.simulated_time)
+        return max(simulated_times)
+
+    def _check_gpus_equals(self):
+        """
+        Check if all GPUs are of the same type.
+        """
+        gpu_types = []
+        env_refs = [actor.get_environment.remote() for actor in self.actors]
+        environments = ray.get(env_refs)
+        for env in environments:
+            gpu_types.append(env["device_name"])
+        if len(set(gpu_types)) == 1:
+            print(f"Running on {len(gpu_types)} {gpu_types[0]}", file=sys.stderr)
+            return True
+        else:
+            return False
+
+    def clean_up_ray(self):
+        """
+        Clean up Ray actors and cache manager.
+        """
+        if self.actors is not None:
+            for actor in self.actors:
+                ray.kill(actor)
+        if self.cache_manager is not None:
+            ray.kill(self.cache_manager)
diff --git a/kernel_tuner/runners/ray/cache_manager.py b/kernel_tuner/runners/ray/cache_manager.py
new file mode 100644
index 000000000..9e19fabec
--- /dev/null
+++ b/kernel_tuner/runners/ray/cache_manager.py
@@ -0,0 +1,29 @@
+"""Ray caching module."""
+import ray
+
+from kernel_tuner.util import store_cache
+
+
+@ray.remote(num_cpus=1)
+class CacheManager(object):
+    """Manage the cache used by the parallel actors."""
+
+    def __init__(self, cache, cachefile):
+        # importing here due to circular import
+        from kernel_tuner.interface import Options
+
+        self.tuning_options = Options({"cache": cache, "cachefile": cachefile})
+
+    def store(self, key, params):
+        store_cache(key, params, self.tuning_options)
+
+    def check_and_retrieve(self, key):
+        """Checks if a result exists for the given key and returns it if found."""
+        if self.tuning_options["cache"]:
+            return self.tuning_options["cache"].get(key, None)
+        else:
+            return None
+
+    def get_cache(self):
+        """Returns the cache and the cachefile held by this manager as a (cache, cachefile) tuple."""
+        return self.tuning_options["cache"], self.tuning_options["cachefile"]
diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py
new file mode 100644
index 000000000..0d6fe3c39
--- /dev/null
+++ b/kernel_tuner/runners/ray/remote_actor.py
@@ -0,0 +1,100 @@
+"""Ray actors module."""
+import ray
+
+from kernel_tuner.runners.sequential import SequentialRunner
+from kernel_tuner.runners.simulation import SimulationRunner
+from kernel_tuner.core import DeviceInterface
+from kernel_tuner.observers.register import RegisterObserver
+from kernel_tuner.util import get_gpu_id, get_gpu_type
+
+
+@ray.remote
+class RemoteActor(object):
+    """Remote actor for the parallel tuner."""
+
+    def __init__(
+        self,
+        kernel_source,
+        kernel_options,
+        device_options,
+        iterations,
+        observers_type_and_arguments,
+        identifier,
+        cache_manager=None,
+        simulation_mode=False,
+    ):
+        self.kernel_source = kernel_source
+        self.kernel_options = kernel_options
+        self.device_options = device_options
+        self.iterations = iterations
+        self.cache_manager = cache_manager
+        self.simulation_mode = simulation_mode
+        self.runner = None
+        self.identifier = None
+        self._reinitialize_observers(observers_type_and_arguments)
+        self.dev = (
+            DeviceInterface(kernel_source, iterations=iterations, observers=self.observers, **device_options)
+            if not simulation_mode
+            else None
+        )
+
+    def get_environment(self):
+        return self.dev.get_environment()
+
+    def execute(self, tuning_options, strategy=None, searchspace=None, element=None):
+        tuning_options["observers"] = self.observers
+        if self.runner is None:
+            self.init_runner()
+        if strategy and searchspace:
+            results = strategy.tune(searchspace, self.runner, tuning_options)
+            # observers can't be pickled
+            tuning_options["observers"] = None
+            return results, tuning_options
+        elif element:
+            results = self.runner.run([element], tuning_options)[0]
+            # observers can't be pickled
+            tuning_options["observers"] = None
+            return results, tuning_options
+        else:
+            raise ValueError("Invalid arguments for ray actor's execute method.")
+
+    def set_cache_manager(self, cache_manager):
+        if self.cache_manager is None:
+            self.cache_manager = cache_manager
+
+    def get_cache_manager(self):
+        return self.cache_manager
+
+    def init_runner(self):
+        if self.cache_manager is None:
+            raise ValueError("Cache manager is not set.")
+        if self.simulation_mode:
+            self.runner = SimulationRunner(
+                self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers
+            )
+        else:
+            self.runner = SequentialRunner(
+                self.kernel_source,
+                self.kernel_options,
+                self.device_options,
+                self.iterations,
+                self.observers,
+                cache_manager=self.cache_manager,
+                dev=self.dev,
+            )
+
+    def _reinitialize_observers(self, observers_type_and_arguments):
+        # observers can't be pickled to the actor, so each one is rebuilt here from its (type, keyword arguments) pair
+        self.observers = []
+        for observer, arguments in observers_type_and_arguments:
+            if "device" in arguments:
+                self.identifier = get_gpu_id(self.kernel_source.lang) if self.identifier is None else self.identifier
+                arguments["device"] = self.identifier
+            if isinstance(observer, type) and issubclass(observer, RegisterObserver):  # `observer` is a class, not an instance
+                self.observers.append(observer())
+            else:
+                self.observers.append(observer(**arguments))
+
+    @staticmethod
+    def get_gpu_type(lang):
+        return get_gpu_type(lang)
diff --git a/kernel_tuner/runners/runner.py b/kernel_tuner/runners/runner.py
index 80ab32146..0929c9bbf 100644
--- a/kernel_tuner/runners/runner.py
+++ b/kernel_tuner/runners/runner.py
@@ -8,13 +8,11 @@ class Runner(ABC):
     """Base class for kernel_tuner runners"""
 
     @abstractmethod
-    def __init__(
-        self, kernel_source, kernel_options, device_options, iterations, observers
-    ):
+    def __init__(self, kernel_source, kernel_options, device_options, iterations, observers):
         pass
 
     @abstractmethod
-    def get_environment(self):
+    def get_environment(self, tuning_options):
         pass
 
     @abstractmethod
diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py
index aeebd5116..9660623dd 100644
--- a/kernel_tuner/runners/sequential.py
+++ b/kernel_tuner/runners/sequential.py
@@ -2,6 +2,7 @@
 import logging
 from datetime import datetime, timezone
 from time import perf_counter
+import ray
 
 from kernel_tuner.core import DeviceInterface
 from kernel_tuner.runners.runner import Runner
@@ -11,7 +12,9 @@
 class SequentialRunner(Runner):
     """SequentialRunner is used for tuning with a single process/thread."""
 
-    def __init__(self, kernel_source, kernel_options, device_options, iterations, observers):
+    def __init__(
+        self, kernel_source, kernel_options, device_options, iterations, observers, cache_manager=None, dev=None
+    ):
         """Instantiate the SequentialRunner.
 
         :param kernel_source: The kernel source
@@ -27,9 +30,19 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob
         :param iterations: The number of iterations used for benchmarking
             each kernel instance.
         :type iterations: int
+
+        :param observers: List of observers.
+        :type observers: list
+
+        :param cache_manager: Cache manager instance. Defaults to None.
+        :type cache_manager: kernel_tuner.runners.ray.cache_manager.CacheManager, optional
         """
-        #detect language and create high-level device interface
-        self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options)
+        # detect language and create high-level device interface
+        self.dev = (
+            DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options)
+            if dev is None
+            else dev
+        )
 
         self.units = self.dev.units
         self.quiet = device_options.quiet
@@ -40,8 +53,15 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob
         self.last_strategy_start_time = self.start_time
         self.last_strategy_time = 0
         self.kernel_options = kernel_options
-
-        #move data to the GPU
+        # needed for the ensemble strategy down the line
+        self.device_options = device_options
+        # needed for the ensemble strategy down the line
+        self.iterations = iterations
+        # needed for the ensemble strategy down the line
+        self.observers = observers
+        self.cache_manager = cache_manager
+
+        # move data to the GPU
         self.gpu_args = self.dev.ready_argument_list(kernel_options.arguments)
 
     def get_environment(self, tuning_options):
@@ -62,7 +82,7 @@ def run(self, parameter_space, tuning_options):
         :rtype: dict())
 
         """
-        logging.debug('sequential runner started for ' + self.kernel_options.kernel_name)
+        logging.debug("sequential runner started for " + self.kernel_options.kernel_name)
 
         results = []
 
@@ -75,35 +95,49 @@ def run(self, parameter_space, tuning_options):
 
             # check if configuration is in the cache
             x_int = ",".join([str(i) for i in element])
-            if tuning_options.cache and x_int in tuning_options.cache:
-                params.update(tuning_options.cache[x_int])
-                params['compile_time'] = 0
-                params['verification_time'] = 0
-                params['benchmark_time'] = 0
+            cache_result = self.config_in_cache(x_int, tuning_options)
+            if cache_result:
+                params.update(cache_result)
+                params["compile_time"] = 0
+                params["verification_time"] = 0
+                params["benchmark_time"] = 0
             else:
                 # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result
                 if not self.warmed_up:
                     warmup_time = perf_counter()
-                    self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options)
+                    self.dev.compile_and_benchmark(
+                        self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options
+                    )
                     self.warmed_up = True
                     warmup_time = 1e3 * (perf_counter() - warmup_time)
 
-                result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options)
+                result = self.dev.compile_and_benchmark(
+                    self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options
+                )
 
                 params.update(result)
 
                 if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig):
-                    logging.debug('kernel configuration was skipped silently due to compile or runtime failure')
+                    logging.debug("kernel configuration was skipped silently due to compile or runtime failure")
 
             # only compute metrics on configs that have not errored
             if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig):
                 params = process_metrics(params, tuning_options.metrics)
 
             # get the framework time by estimating based on other times
-            total_time = 1000 * ((perf_counter() - self.start_time) - warmup_time) 
-            params['strategy_time'] = self.last_strategy_time
-            params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0)
-            params['timestamp'] = str(datetime.now(timezone.utc))
+            total_time = 1000 * ((perf_counter() - self.start_time) - warmup_time)
+            params["strategy_time"] = self.last_strategy_time
+            params["framework_time"] = max(
+                total_time
+                - (
+                    params["compile_time"]
+                    + params["verification_time"]
+                    + params["benchmark_time"]
+                    + params["strategy_time"]
+                ),
+                0,
+            )
+            params["timestamp"] = str(datetime.now(timezone.utc))
             self.start_time = perf_counter()
 
             if result:
@@ -111,9 +145,23 @@ def run(self, parameter_space, tuning_options):
                 print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units)
 
                 # add configuration to cache
-                store_cache(x_int, params, tuning_options)
+                self.store_in_cache(x_int, params, tuning_options)
 
             # all visited configurations are added to results to provide a trace for optimization strategies
             results.append(params)
 
         return results
+
+    def config_in_cache(self, x_int, tuning_options):
+        if self.cache_manager and tuning_options.strategy_options["check_and_retrieve"]:
+            return ray.get(self.cache_manager.check_and_retrieve.remote(x_int))
+        elif tuning_options.cache and x_int in tuning_options.cache:
+            return tuning_options.cache[x_int]
+        else:
+            return None
+
+    def store_in_cache(self, x_int, params, tuning_options):
+        if self.cache_manager:
+            self.cache_manager.store.remote(x_int, params)
+        else:
+            store_cache(x_int, params, tuning_options)
diff --git a/kernel_tuner/runners/simulation.py b/kernel_tuner/runners/simulation.py
index 22c7c667c..edf9a40e0 100644
--- a/kernel_tuner/runners/simulation.py
+++ b/kernel_tuner/runners/simulation.py
@@ -14,11 +14,11 @@ class SimulationDevice(_SimulationDevice):
 
     @property
     def name(self):
-        return self.env['device_name']
+        return self.env["device_name"]
 
     @name.setter
     def name(self, value):
-        self.env['device_name'] = value
+        self.env["device_name"] = value
         if not self.quiet:
             print("Simulating: " + value)
 
@@ -58,6 +58,13 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob
         self.last_strategy_time = 0
         self.units = {}
 
+        # needed for the ensemble strategy down the line
+        self.device_options = device_options
+        # needed for the ensemble strategy down the line
+        self.iterations = iterations
+        # needed for the ensemble strategy down the line
+        self.observers = observers
+
     def get_environment(self, tuning_options):
         env = self.dev.get_environment()
         env["simulation"] = True
@@ -78,13 +85,12 @@ def run(self, parameter_space, tuning_options):
             execution times.
         :rtype: dict()
         """
-        logging.debug('simulation runner started for ' + self.kernel_options.kernel_name)
+        logging.debug("simulation runner started for " + self.kernel_options.kernel_name)
 
         results = []
 
         # iterate over parameter space
         for element in parameter_space:
-
             # check if element is in the cache
             x_int = ",".join([str(i) for i in element])
             if tuning_options.cache and x_int in tuning_options.cache:
@@ -98,21 +104,22 @@ def run(self, parameter_space, tuning_options):
                 # configuration is already counted towards the unique_results.
                 # It is the responsibility of cost_func to add configs to unique_results.
                 if x_int in tuning_options.unique_results:
-
-                    result['compile_time'] = 0
-                    result['verification_time'] = 0
-                    result['benchmark_time'] = 0
+                    result["compile_time"] = 0
+                    result["verification_time"] = 0
+                    result["benchmark_time"] = 0
 
                 else:
                     # configuration is evaluated for the first time, print to the console
-                    util.print_config_output(tuning_options.tune_params, result, self.quiet, tuning_options.metrics, self.units)
+                    util.print_config_output(
+                        tuning_options.tune_params, result, self.quiet, tuning_options.metrics, self.units
+                    )
 
                 # Everything but the strategy time and framework time are simulated,
                 # self.last_strategy_time is set by cost_func
-                result['strategy_time'] = self.last_strategy_time
+                result["strategy_time"] = self.last_strategy_time
 
                 try:
-                    simulated_time = result['compile_time'] + result['verification_time'] + result['benchmark_time']
+                    simulated_time = result["compile_time"] + result["verification_time"] + result["benchmark_time"]
                     tuning_options.simulated_time += simulated_time
                 except KeyError:
                     if "time_limit" in tuning_options:
@@ -122,13 +129,15 @@ def run(self, parameter_space, tuning_options):
 
                 total_time = 1000 * (perf_counter() - self.start_time)
                 self.start_time = perf_counter()
-                result['framework_time'] = total_time - self.last_strategy_time
+                result["framework_time"] = total_time - self.last_strategy_time
 
                 results.append(result)
                 continue
 
             # if the element is not in the cache, raise an error
-            check = util.check_restrictions(tuning_options.restrictions, dict(zip(tuning_options['tune_params'].keys(), element)), True)
+            check = util.check_restrictions(
+                tuning_options.restrictions, dict(zip(tuning_options["tune_params"].keys(), element)), True
+            )
             err_string = f"kernel configuration {element} not in cache, does {'' if check else 'not '}pass extra restriction check ({check})"
             logging.debug(err_string)
             raise ValueError(f"{err_string} - in simulation mode, all configurations must be present in the cache")
diff --git a/kernel_tuner/searchspace.py b/kernel_tuner/searchspace.py
index 36e772639..cb6d80d04 100644
--- a/kernel_tuner/searchspace.py
+++ b/kernel_tuner/searchspace.py
@@ -51,6 +51,11 @@ def __init__(
         restrictions = restrictions if restrictions is not None else []
         self.tune_params = tune_params
         self.restrictions = restrictions
+        self.max_threads = max_threads
+        self.block_size_names = block_size_names
+        self.framework = framework
+        self.solver_method = solver_method
+        self.path_to_ATF_cache = path_to_ATF_cache
         # the searchspace can add commonly used constraints (e.g. maxprod(blocks) <= maxthreads)
         self._modified_restrictions = restrictions
         self.param_names = list(self.tune_params.keys())
@@ -145,7 +150,7 @@ def __init__(
     #         num_solutions: int = csp.n_solutions()  # number of solutions
     #         solutions = [csp.values(sol=i) for i in range(num_solutions)]  # list of solutions
 
-    def __build_searchspace_bruteforce(self, block_size_names: list, max_threads: int, solver = None):
+    def __build_searchspace_bruteforce(self, block_size_names: list, max_threads: int, solver=None):
         # bruteforce solving of the searchspace
 
         from itertools import product
@@ -167,9 +172,15 @@ def __build_searchspace_bruteforce(self, block_size_names: list, max_threads: in
                 restrictions = [restrictions]
             block_size_restriction_spaced = f"{' * '.join(used_block_size_names)} <= {max_threads}"
             block_size_restriction_unspaced = f"{'*'.join(used_block_size_names)} <= {max_threads}"
-            if block_size_restriction_spaced not in restrictions and block_size_restriction_unspaced not in restrictions:
+            if (
+                block_size_restriction_spaced not in restrictions
+                and block_size_restriction_unspaced not in restrictions
+            ):
                 restrictions.append(block_size_restriction_spaced)
-                if isinstance(self._modified_restrictions, list) and block_size_restriction_spaced not in self._modified_restrictions:
+                if (
+                    isinstance(self._modified_restrictions, list)
+                    and block_size_restriction_spaced not in self._modified_restrictions
+                ):
                     self._modified_restrictions.append(block_size_restriction_spaced)
                     if isinstance(self.restrictions, list):
                         self.restrictions.append(block_size_restriction_spaced)
@@ -264,12 +275,7 @@ def get_params():
                 TP(key, Set(values))
             return params
 
-        tuning_result = (
-            Tuner()
-            .tuning_parameters(*get_params())
-            .search_technique(Exhaustive())
-            .tune(costfunc)
-        )
+        tuning_result = Tuner().tuning_parameters(*get_params()).search_technique(Exhaustive()).tune(costfunc)
         return tuning_result
 
     def __build_searchspace_ATF_cache(self, block_size_names: list, max_threads: int, solver: Solver):
@@ -323,7 +329,10 @@ def __build_searchspace(self, block_size_names: list, max_threads: int, solver:
         if len(valid_block_size_names) > 0:
             parameter_space.addConstraint(MaxProdConstraint(max_threads), valid_block_size_names)
             max_block_size_product = f"{' * '.join(valid_block_size_names)} <= {max_threads}"
-            if isinstance(self._modified_restrictions, list) and max_block_size_product not in self._modified_restrictions:
+            if (
+                isinstance(self._modified_restrictions, list)
+                and max_block_size_product not in self._modified_restrictions
+            ):
                 self._modified_restrictions.append(max_block_size_product)
                 if isinstance(self.restrictions, list):
                     self.restrictions.append((MaxProdConstraint(max_threads), valid_block_size_names))
@@ -348,10 +357,7 @@ def __add_restrictions(self, parameter_space: Problem) -> Problem:
                     parameter_space.addConstraint(restriction, required_params)
                 elif isinstance(restriction, Constraint):
                     all_params_required = all(param_name in required_params for param_name in self.param_names)
-                    parameter_space.addConstraint(
-                        restriction,
-                        None if all_params_required else required_params
-                    )
+                    parameter_space.addConstraint(restriction, None if all_params_required else required_params)
                 else:
                     raise ValueError(f"Unrecognized restriction {restriction}")
 
diff --git a/kernel_tuner/strategies/basinhopping.py b/kernel_tuner/strategies/basinhopping.py
index 20e800f6e..291987a81 100644
--- a/kernel_tuner/strategies/basinhopping.py
+++ b/kernel_tuner/strategies/basinhopping.py
@@ -8,8 +8,11 @@
 
 supported_methods = ["Nelder-Mead", "Powell", "CG", "BFGS", "L-BFGS-B", "TNC", "COBYLA", "SLSQP"]
 
-_options = dict(method=(f"Local optimization algorithm to use, choose any from {supported_methods}", "L-BFGS-B"),
-                       T=("Temperature parameter for the accept or reject criterion", 1.0))
+_options = dict(
+    method=(f"Local optimization algorithm to use, choose any from {supported_methods}", "L-BFGS-B"),
+    T=("Temperature parameter for the accept or reject criterion", 1.0),
+)
+
 
 def tune(searchspace: Searchspace, runner, tuning_options):
     method, T = common.get_options(tuning_options.strategy_options, _options)
@@ -21,16 +24,16 @@ def tune(searchspace: Searchspace, runner, tuning_options):
 
     kwargs = setup_method_arguments(method, bounds)
     options = setup_method_options(method, tuning_options)
-    kwargs['options'] = options
-
+    kwargs["options"] = options
 
     minimizer_kwargs = dict(**kwargs)
     minimizer_kwargs["method"] = method
 
     opt_result = None
     try:
-        opt_result = scipy.optimize.basinhopping(cost_func, x0, T=T, stepsize=eps,
-                                             minimizer_kwargs=minimizer_kwargs, disp=tuning_options.verbose)
+        opt_result = scipy.optimize.basinhopping(
+            cost_func, x0, T=T, stepsize=eps, minimizer_kwargs=minimizer_kwargs, disp=tuning_options.verbose
+        )
     except util.StopCriterionReached as e:
         if tuning_options.verbose:
             print(e)
diff --git a/kernel_tuner/strategies/bayes_opt.py b/kernel_tuner/strategies/bayes_opt.py
index bd20e29a9..d4dac0524 100644
--- a/kernel_tuner/strategies/bayes_opt.py
+++ b/kernel_tuner/strategies/bayes_opt.py
@@ -235,10 +235,10 @@ def get_hyperparam(name: str, default, supported_values=list()):
         self.invalid_value = 1e20
         self.opt_direction = opt_direction
         if opt_direction == "min":
-            self.worst_value = np.PINF
+            self.worst_value = np.inf
             self.argopt = np.argmin
         elif opt_direction == "max":
-            self.worst_value = np.NINF
+            self.worst_value = -np.inf
             self.argopt = np.argmax
         else:
             raise ValueError("Invalid optimization direction '{}'".format(opt_direction))
@@ -262,7 +262,7 @@ def get_hyperparam(name: str, default, supported_values=list()):
         self.__visited_num = 0
         self.__visited_valid_num = 0
         self.__visited_searchspace_indices = [False] * self.searchspace_size
-        self.__observations = [np.NaN] * self.searchspace_size
+        self.__observations = [np.nan] * self.searchspace_size
         self.__valid_observation_indices = [False] * self.searchspace_size
         self.__valid_params = list()
         self.__valid_observations = list()
@@ -311,7 +311,7 @@ def is_not_visited(self, index: int) -> bool:
 
     def is_valid(self, observation: float) -> bool:
         """Returns whether an observation is valid."""
-        return not (observation is None or observation == self.invalid_value or observation == np.NaN)
+        return not (observation is None or observation == self.invalid_value or observation != observation)  # NaN check
 
     def get_af_by_name(self, name: str):
         """Get the basic acquisition functions by their name."""
diff --git a/kernel_tuner/strategies/brute_force.py b/kernel_tuner/strategies/brute_force.py
index a0e3f8ebe..9b2284969 100644
--- a/kernel_tuner/strategies/brute_force.py
+++ b/kernel_tuner/strategies/brute_force.py
@@ -1,13 +1,23 @@
 """ The default strategy that iterates through the whole parameter space """
 from kernel_tuner.searchspace import Searchspace
 from kernel_tuner.strategies import common
+from kernel_tuner.runners.parallel import ParallelRunner
+from kernel_tuner.runners.ray.cache_manager import CacheManager
 
-_options = {}
+_options = dict(num_gpus=("Number of gpus to run parallel execution", None))
 
-def tune(searchspace: Searchspace, runner, tuning_options):
 
-    # call the runner
-    return runner.run(searchspace.sorted_list(), tuning_options)
+def tune(searchspace: Searchspace, runner, tuning_options):
+    # any runner other than the parallel one simply evaluates the sorted search space
+    if not isinstance(runner, ParallelRunner):
+        return runner.run(searchspace.sorted_list(), tuning_options)
+    # parallel run: disable check_and_retrieve and hand the runner a remote cache manager
+    if tuning_options.strategy_options is None:
+        tuning_options.strategy_options = {}
+    tuning_options.strategy_options["check_and_retrieve"] = False
+    shared_cache = CacheManager.remote(tuning_options.cache, tuning_options.cachefile)
+    configs = searchspace.sorted_list()
+    return runner.run(parameter_space=configs, tuning_options=tuning_options, cache_manager=shared_cache)
 
 
 tune.__doc__ = common.get_strategy_docstring("Brute Force", _options)
diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py
index d01eae937..94f73c958 100644
--- a/kernel_tuner/strategies/common.py
+++ b/kernel_tuner/strategies/common.py
@@ -1,11 +1,19 @@
 import logging
 import sys
 from time import perf_counter
+import warnings
+import ray
 
 import numpy as np
 
 from kernel_tuner import util
 from kernel_tuner.searchspace import Searchspace
+from kernel_tuner.util import get_num_devices
+from kernel_tuner.runners.ray.remote_actor import RemoteActor
+from kernel_tuner.observers.nvml import NVMLObserver
+from kernel_tuner.observers.pmt import PMTObserver
+from kernel_tuner.observers.powersensor import PowerSensorObserver
+from kernel_tuner.observers.register import RegisterObserver
 
 _docstring_template = """ Find the best performing kernel configuration in the parameter space
 
@@ -30,7 +38,9 @@
 
 def get_strategy_docstring(name, strategy_options):
     """Generate docstring for a 'tune' method of a strategy."""
-    return _docstring_template.replace("$NAME$", name).replace("$STRAT_OPT$", make_strategy_options_doc(strategy_options))
+    return _docstring_template.replace("$NAME$", name).replace(
+        "$STRAT_OPT$", make_strategy_options_doc(strategy_options)
+    )
 
 
 def make_strategy_options_doc(strategy_options):
@@ -44,7 +54,7 @@ def make_strategy_options_doc(strategy_options):
 
 def get_options(strategy_options, options):
     """Get the strategy-specific options or their defaults from user-supplied strategy_options."""
-    accepted = list(options.keys()) + ["max_fevals", "time_limit"]
+    accepted = list(options.keys()) + ["max_fevals", "time_limit", "ensemble", "check_and_retrieve"]
     for key in strategy_options:
         if key not in accepted:
             raise ValueError(f"Unrecognized option {key} in strategy_options")
@@ -66,13 +76,65 @@ def __call__(self, x, check_restrictions=True):
         self.runner.last_strategy_time = 1000 * (perf_counter() - self.runner.last_strategy_start_time)
 
         # error value to return for numeric optimizers that need a numerical value
-        logging.debug('_cost_func called')
-        logging.debug('x: ' + str(x))
+        logging.debug("_cost_func called")
+        logging.debug("x: " + str(x))
 
         # check if max_fevals is reached or time limit is exceeded
         util.check_stop_criterion(self.tuning_options)
 
-        # snap values in x to nearest actual value for each parameter, unscale x if needed
+        x_list = [x] if self._is_single_configuration(x) else x
+        configs = [self._prepare_config(cfg) for cfg in x_list]
+
+        legal_configs = configs
+        illegal_results = []
+        if check_restrictions and self.searchspace.restrictions:
+            legal_configs, illegal_results = self._get_legal_configs(configs)
+
+        final_results = self._evaluate_configs(legal_configs) if len(legal_configs) > 0 else []
+        # get numerical return values, taking optimization direction into account
+        all_results = final_results + illegal_results
+        return_values = []
+        for result in all_results:
+            return_value = result[self.tuning_options.objective] or sys.float_info.max
+            return_values.append(return_value if not self.tuning_options.objective_higher_is_better else -return_value)
+
+        if len(return_values) == 1:
+            return return_values[0]
+        return return_values
+
+    @staticmethod
+    def _is_single_configuration(x):
+        """
+        Determine whether `x` is one configuration rather than a batch of configurations.
+
+        Parameters:
+            x: The input to check, which can be a scalar, a numpy array, a list, or a tuple.
+
+        Returns:
+            bool: True for a scalar (int, float, or numpy scalar), for a numpy array that is numeric
+                or at most one-dimensional, and for a list or tuple that holds no nested sequences.
+                Otherwise, returns False.
+        """
+        if isinstance(x, (int, float, np.generic)):
+            return True
+        if isinstance(x, np.ndarray):
+            # integer ('i') or float ('f') arrays as before; also 1-D arrays of any other dtype
+            return x.dtype.kind in "if" or x.ndim <= 1
+        if isinstance(x, (list, tuple)):
+            return not any(isinstance(item, (list, tuple, np.ndarray)) for item in x)
+        return False
+
+    def _prepare_config(self, x):
+        """
+        Prepare a single configuration by snapping to nearest values and/or scaling.
+
+        Args:
+            x (list): The input configuration to be prepared.
+
+        Returns:
+            list: The prepared configuration.
+
+        """
         if self.snap:
             if self.scaling:
                 params = unscale_and_snap_to_nearest(x, self.searchspace.tune_params, self.tuning_options.eps)
@@ -80,39 +142,66 @@ def __call__(self, x, check_restrictions=True):
                 params = snap_to_nearest_config(x, self.searchspace.tune_params)
         else:
             params = x
-        logging.debug('params ' + str(params))
-
-        legal = True
-        result = {}
-        x_int = ",".join([str(i) for i in params])
-
-        # else check if this is a legal (non-restricted) configuration
-        if check_restrictions and self.searchspace.restrictions:
-            params_dict = dict(zip(self.searchspace.tune_params.keys(), params))
+        return params
+
+    def _get_legal_configs(self, configs):
+        """
+        Filters and categorizes configurations into legal and illegal based on defined restrictions.
+        Configurations are checked against restrictions; illegal ones are modified to indicate an invalid state and
+        included in the results. Legal configurations are collected and returned for potential use.
+
+        Parameters:
+            configs (list of tuple): Configurations to be checked, each represented as a tuple of parameter values.
+
+        Returns:
+            tuple: A pair containing a list of legal configurations and a list of results with illegal configurations marked.
+        """
+        results = []
+        legal_configs = []
+        for config in configs:
+            params_dict = dict(zip(self.searchspace.tune_params.keys(), config))
             legal = util.check_restrictions(self.searchspace.restrictions, params_dict, self.tuning_options.verbose)
             if not legal:
-                result = params_dict
-                result[self.tuning_options.objective] = util.InvalidConfig()
-
-        if legal:
-            # compile and benchmark this instance
-            res = self.runner.run([params], self.tuning_options)
-            result = res[0]
-
+                params_dict[self.tuning_options.objective] = util.InvalidConfig()
+                results.append(params_dict)
+            else:
+                legal_configs.append(config)
+        return legal_configs, results
+
+    def _evaluate_configs(self, configs):
+        """
+        Evaluate configurations with the runner. Results are sorted by timestamp, when present, to maintain
+        order during parallel processing. Each new unique configuration is recorded in unique_results, after which
+        the stop criteria are checked again. The strategy start time is reset upon completion.
+
+        Parameters:
+            configs (list): Configurations to be evaluated.
+
+        Returns:
+            list of dict: Processed results of the evaluations.
+        """
+        results = self.runner.run(configs, self.tuning_options)
+        # sort based on timestamp, needed because of parallel tuning of populations and retrospective stop criterion check
+        if "timestamp" in results[0]:
+            results.sort(key=lambda x: x["timestamp"])
+
+        final_results = []
+        for result in results:
+            config = tuple(result[key] for key in self.tuning_options.tune_params if key in result)
+            x_int = ",".join([str(i) for i in config])
             # append to tuning results
             if x_int not in self.tuning_options.unique_results:
                 self.tuning_options.unique_results[x_int] = result
-
+                # check retrospectively if max_fevals is reached or time limit is exceeded within the results
+                util.check_stop_criterion(self.tuning_options)
+            final_results.append(result)
+            # in case the stop criterion is reached, save the results so far
             self.results.append(result)
 
-            # upon returning from this function control will be given back to the strategy, so reset the start time
-            self.runner.last_strategy_start_time = perf_counter()
-
-        # get numerical return value, taking optimization direction into account
-        return_value = result[self.tuning_options.objective] or sys.float_info.max
-        return_value = return_value if not self.tuning_options.objective_higher_is_better else -return_value
+        # upon returning from this function control will be given back to the strategy, so reset the start time
+        self.runner.last_strategy_start_time = perf_counter()
 
-        return return_value
+        return final_results
 
     def get_bounds_x0_eps(self):
         """Compute bounds, x0 (the initial guess), and eps."""
@@ -146,10 +235,10 @@ def get_bounds_x0_eps(self):
                     eps = min(eps, np.amin(np.gradient(vals)))
 
         self.tuning_options["eps"] = eps
-        logging.debug('get_bounds_x0_eps called')
-        logging.debug('bounds ' + str(bounds))
-        logging.debug('x0 ' + str(x0))
-        logging.debug('eps ' + str(eps))
+        logging.debug("get_bounds_x0_eps called")
+        logging.debug("bounds " + str(bounds))
+        logging.debug("x0 " + str(x0))
+        logging.debug("eps " + str(eps))
 
         return bounds, x0, eps
 
@@ -167,7 +256,7 @@ def setup_method_arguments(method, bounds):
     kwargs = {}
     # pass bounds to methods that support it
     if method in ["L-BFGS-B", "TNC", "SLSQP"]:
-        kwargs['bounds'] = bounds
+        kwargs["bounds"] = bounds
     return kwargs
 
 
@@ -180,21 +269,21 @@ def setup_method_options(method, tuning_options):
         maxiter = tuning_options.strategy_options.maxiter
     else:
         maxiter = 100
-    kwargs['maxiter'] = maxiter
+    kwargs["maxiter"] = maxiter
     if method in ["Nelder-Mead", "Powell"]:
-        kwargs['maxfev'] = maxiter
+        kwargs["maxfev"] = maxiter
     elif method == "L-BFGS-B":
-        kwargs['maxfun'] = maxiter
+        kwargs["maxfun"] = maxiter
 
     # pass eps to methods that support it
     if method in ["CG", "BFGS", "L-BFGS-B", "TNC", "SLSQP"]:
-        kwargs['eps'] = tuning_options.eps
+        kwargs["eps"] = tuning_options.eps
     elif method == "COBYLA":
-        kwargs['rhobeg'] = tuning_options.eps
+        kwargs["rhobeg"] = tuning_options.eps
 
     # not all methods support 'disp' option
-    if method not in ['TNC']:
-        kwargs['disp'] = tuning_options.verbose
+    if method not in ["TNC"]:
+        kwargs["disp"] = tuning_options.verbose
 
     return kwargs
 
@@ -241,5 +330,57 @@ def scale_from_params(params, tune_params, eps):
     """Helper func to do the inverse of the 'unscale' function."""
     x = np.zeros(len(params))
     for i, v in enumerate(tune_params.values()):
-        x[i] = 0.5 * eps + v.index(params[i])*eps
+        x[i] = 0.5 * eps + v.index(params[i]) * eps
     return x
+
+
+def check_num_devices(ensemble_size: int, simulation_mode: bool):
+    """Warn when fewer devices are available than there are strategies in the ensemble."""
+    available = get_num_devices(simulation_mode=simulation_mode)
+    if available < ensemble_size:
+        note = "Number of devices is less than the number of strategies in the ensemble. "
+        note += "Some strategies will wait until devices are available."
+        warnings.warn(note, UserWarning)
+
+
+def create_actor_on_device(
+    kernel_source, kernel_options, device_options, iterations, observers, cache_manager, simulation_mode, identifier
+):
+    """Create a RemoteActor on one CPU (simulation) or one GPU; raises RuntimeError if Ray is not initialized."""
+    if not ray.is_initialized():
+        raise RuntimeError(
+            "Ray is not initialized. Initialize Ray before creating an actor (remember to include resources)."
+        )
+
+    if simulation_mode:
+        resource_options = {"num_cpus": 1}
+    else:
+        resource_options = {"num_gpus": 1}
+
+    observers_type_and_arguments = []
+    if observers is not None:
+        # observers can't be pickled so we will re-initialize them in the actors
+        # observers related to backends will be initialized once we call the device interface inside the actor, that is why we skip them here
+        for observer in observers:
+            if isinstance(observer, (NVMLObserver, PMTObserver, PowerSensorObserver)):
+                observers_type_and_arguments.append((observer.__class__, observer.init_arguments))
+            if isinstance(observer, RegisterObserver):
+                observers_type_and_arguments.append((observer.__class__, []))
+
+    # Create the actor with the specified options and resources
+    return RemoteActor.options(**resource_options).remote(
+        kernel_source,
+        kernel_options,
+        device_options,
+        iterations,
+        observers_type_and_arguments=observers_type_and_arguments,
+        cache_manager=cache_manager,
+        simulation_mode=simulation_mode,
+        identifier=identifier,
+    )
+
+
+def initialize_ray():
+    """Start Ray with its dashboard enabled, unless Ray is already initialized."""
+    if not ray.is_initialized():
+        ray.init(include_dashboard=True, ignore_reinit_error=True)
diff --git a/kernel_tuner/strategies/diff_evo.py b/kernel_tuner/strategies/diff_evo.py
index 5ad2b9474..7aa717b26 100644
--- a/kernel_tuner/strategies/diff_evo.py
+++ b/kernel_tuner/strategies/diff_evo.py
@@ -6,16 +6,27 @@
 from kernel_tuner.strategies import common
 from kernel_tuner.strategies.common import CostFunc
 
-supported_methods = ["best1bin", "best1exp", "rand1exp", "randtobest1exp", "best2exp", "rand2exp", "randtobest1bin", "best2bin", "rand2bin", "rand1bin"]
-
-_options = dict(method=(f"Creation method for new population, any of {supported_methods}", "best1bin"),
-                       popsize=("Population size", 20),
-                       maxiter=("Number of generations", 100))
+supported_methods = [
+    "best1bin",
+    "best1exp",
+    "rand1exp",
+    "randtobest1exp",
+    "best2exp",
+    "rand2exp",
+    "randtobest1bin",
+    "best2bin",
+    "rand2bin",
+    "rand1bin",
+]
+
+_options = dict(
+    method=(f"Creation method for new population, any of {supported_methods}", "best1bin"),
+    popsize=("Population size", 20),
+    maxiter=("Number of generations", 100),
+)
 
 
 def tune(searchspace: Searchspace, runner, tuning_options):
-
-
     method, popsize, maxiter = common.get_options(tuning_options.strategy_options, _options)
 
     # build a bounds array as needed for the optimizer
@@ -28,8 +39,16 @@ def tune(searchspace: Searchspace, runner, tuning_options):
     # call the differential evolution optimizer
     opt_result = None
     try:
-        opt_result = differential_evolution(cost_func, bounds, maxiter=maxiter, popsize=popsize, init=population,
-                                        polish=False, strategy=method, disp=tuning_options.verbose)
+        opt_result = differential_evolution(
+            cost_func,
+            bounds,
+            maxiter=maxiter,
+            popsize=popsize,
+            init=population,
+            polish=False,
+            strategy=method,
+            disp=tuning_options.verbose,
+        )
     except util.StopCriterionReached as e:
         if tuning_options.verbose:
             print(e)
diff --git a/kernel_tuner/strategies/dual_annealing.py b/kernel_tuner/strategies/dual_annealing.py
index 0f44bd849..bbb8ffa48 100644
--- a/kernel_tuner/strategies/dual_annealing.py
+++ b/kernel_tuner/strategies/dual_annealing.py
@@ -6,23 +6,22 @@
 from kernel_tuner.strategies import common
 from kernel_tuner.strategies.common import CostFunc, setup_method_arguments, setup_method_options
 
-supported_methods = ['COBYLA', 'L-BFGS-B', 'SLSQP', 'CG', 'Powell', 'Nelder-Mead', 'BFGS', 'trust-constr']
+supported_methods = ["COBYLA", "L-BFGS-B", "SLSQP", "CG", "Powell", "Nelder-Mead", "BFGS", "trust-constr"]
 
 _options = dict(method=(f"Local optimization method to use, choose any from {supported_methods}", "Powell"))
 
-def tune(searchspace: Searchspace, runner, tuning_options):
 
+def tune(searchspace: Searchspace, runner, tuning_options):
     method = common.get_options(tuning_options.strategy_options, _options)[0]
 
-    #scale variables in x to make 'eps' relevant for multiple variables
+    # scale variables in x to make 'eps' relevant for multiple variables
     cost_func = CostFunc(searchspace, tuning_options, runner, scaling=True)
 
     bounds, x0, _ = cost_func.get_bounds_x0_eps()
 
     kwargs = setup_method_arguments(method, bounds)
     options = setup_method_options(method, tuning_options)
-    kwargs['options'] = options
-
+    kwargs["options"] = options
 
     minimizer_kwargs = {}
     minimizer_kwargs["method"] = method
diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py
new file mode 100644
index 000000000..d3160abd5
--- /dev/null
+++ b/kernel_tuner/strategies/ensemble.py
@@ -0,0 +1,101 @@
+"""
+The ensemble strategy that optimizes the search through the parameter space using a combination of multiple strategies.
+"""
+
+import warnings
+
+from kernel_tuner.searchspace import Searchspace
+from kernel_tuner.strategies import common
+from kernel_tuner.strategies.common import initialize_ray
+from kernel_tuner.runners.simulation import SimulationRunner
+from kernel_tuner.util import get_num_devices
+from kernel_tuner.runners.parallel import ParallelRunner
+
+from kernel_tuner.strategies import (
+    basinhopping,
+    bayes_opt,
+    diff_evo,
+    dual_annealing,
+    firefly_algorithm,
+    genetic_algorithm,
+    greedy_ils,
+    greedy_mls,
+    minimize,
+    mls,
+    ordered_greedy_mls,
+    pso,
+    random_sample,
+    simulated_annealing,
+)
+
+strategy_map = {
+    "random_sample": random_sample,
+    "minimize": minimize,
+    "basinhopping": basinhopping,
+    "diff_evo": diff_evo,
+    "genetic_algorithm": genetic_algorithm,
+    "greedy_mls": greedy_mls,
+    "ordered_greedy_mls": ordered_greedy_mls,
+    "greedy_ils": greedy_ils,
+    "dual_annealing": dual_annealing,
+    "mls": mls,
+    "pso": pso,
+    "simulated_annealing": simulated_annealing,
+    "firefly_algorithm": firefly_algorithm,
+    "bayes_opt": bayes_opt,
+}
+
+_options = dict(
+    ensemble=("List of strategies to be used in the ensemble", ["random_sample", "random_sample"]),
+    max_fevals=("Maximum number of function evaluations", None),
+    num_gpus=("Number of gpus to run the parallel ensemble on", None),
+)
+
+
+def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, actors=None):
+    clean_up = actors is None and cache_manager is None
+    simulation_mode = isinstance(runner, SimulationRunner)
+    initialize_ray()
+
+    ensemble, max_fevals, num_gpus = common.get_options(tuning_options.strategy_options, _options)
+    num_devices = num_gpus if num_gpus is not None else get_num_devices(simulation_mode=simulation_mode)
+    ensemble_size = len(ensemble)
+
+    # setup strategy options
+    if "bayes_opt" in ensemble:
+        # All strategies start from a random sample except for BO
+        tuning_options.strategy_options["samplingmethod"] = "random"
+    tuning_options.strategy_options["max_fevals"] = 100 * ensemble_size if max_fevals is None else max_fevals
+    tuning_options.strategy_options["check_and_retrieve"] = True
+
+    # define number of ray actors needed: never more than the devices, never more than the strategies
+    if num_devices < ensemble_size:
+        warnings.warn(
+            "Number of devices is less than the number of strategies in the ensemble. Some strategies will wait until devices are available.",
+            UserWarning,
+        )
+    num_actors = min(num_devices, ensemble_size)
+    # map strategy names to their modules; a name missing from strategy_map raises KeyError
+    ensemble = [strategy_map[strategy] for strategy in ensemble]
+
+    parallel_runner = ParallelRunner(
+        runner.kernel_source,
+        runner.kernel_options,
+        runner.device_options,
+        runner.iterations,
+        runner.observers,
+        num_gpus=num_actors,
+        cache_manager=cache_manager,
+        simulation_mode=simulation_mode,
+        actors=actors,
+    )
+
+    final_results = parallel_runner.run(tuning_options=tuning_options, ensemble=ensemble, searchspace=searchspace)
+    # clean up only when neither actors nor a cache manager were passed in by the caller
+    if clean_up:
+        parallel_runner.clean_up_ray()
+
+    return final_results
+
+
+tune.__doc__ = common.get_strategy_docstring("Ensemble", _options)
diff --git a/kernel_tuner/strategies/firefly_algorithm.py b/kernel_tuner/strategies/firefly_algorithm.py
index dc43aae6f..429a338fa 100644
--- a/kernel_tuner/strategies/firefly_algorithm.py
+++ b/kernel_tuner/strategies/firefly_algorithm.py
@@ -9,14 +9,16 @@
 from kernel_tuner.strategies.common import CostFunc, scale_from_params
 from kernel_tuner.strategies.pso import Particle
 
-_options = dict(popsize=("Population size", 20),
-                       maxiter=("Maximum number of iterations", 100),
-                       B0=("Maximum attractiveness", 1.0),
-                       gamma=("Light absorption coefficient", 1.0),
-                       alpha=("Randomization parameter", 0.2))
+_options = dict(
+    popsize=("Population size", 20),
+    maxiter=("Maximum number of iterations", 100),
+    B0=("Maximum attractiveness", 1.0),
+    gamma=("Light absorption coefficient", 1.0),
+    alpha=("Randomization parameter", 0.2),
+)
 
-def tune(searchspace: Searchspace, runner, tuning_options):
 
+def tune(searchspace: Searchspace, runner, tuning_options):
     # scale variables in x because PSO works with velocities to visit different configurations
     cost_func = CostFunc(searchspace, tuning_options, runner, scaling=True)
 
@@ -57,7 +59,6 @@ def tune(searchspace: Searchspace, runner, tuning_options):
         # compare all to all and compute attractiveness
         for i in range(num_particles):
             for j in range(num_particles):
-
                 if swarm[i].intensity < swarm[j].intensity:
                     dist = swarm[i].distance_to(swarm[j])
                     beta = B0 * np.exp(-gamma * dist * dist)
@@ -78,7 +79,7 @@ def tune(searchspace: Searchspace, runner, tuning_options):
         swarm.sort(key=lambda x: x.score)
 
     if tuning_options.verbose:
-        print('Final result:')
+        print("Final result:")
         print(best_position_global)
         print(best_score_global)
 
@@ -87,6 +88,7 @@ def tune(searchspace: Searchspace, runner, tuning_options):
 
 tune.__doc__ = common.get_strategy_docstring("firefly algorithm", _options)
 
+
 class Firefly(Particle):
     """Firefly object for use in the Firefly Algorithm."""
 
@@ -98,7 +100,7 @@ def __init__(self, bounds):
 
     def distance_to(self, other):
         """Return Euclidian distance between self and other Firefly."""
-        return np.linalg.norm(self.position-other.position)
+        return np.linalg.norm(self.position - other.position)
 
     def compute_intensity(self, fun):
         """Evaluate cost function and compute intensity at this position."""
diff --git a/kernel_tuner/strategies/genetic_algorithm.py b/kernel_tuner/strategies/genetic_algorithm.py
index c29c150b5..913e449e9 100644
--- a/kernel_tuner/strategies/genetic_algorithm.py
+++ b/kernel_tuner/strategies/genetic_algorithm.py
@@ -17,7 +17,6 @@
 
 
 def tune(searchspace: Searchspace, runner, tuning_options):
-
     options = tuning_options.strategy_options
     pop_size, generations, method, mutation_chance = common.get_options(options, _options)
     crossover = supported_methods[method]
@@ -28,7 +27,6 @@ def tune(searchspace: Searchspace, runner, tuning_options):
     population = list(list(p) for p in searchspace.get_random_sample(pop_size))
 
     for generation in range(generations):
-
         # determine fitness of population members
         weighted_population = []
         for dna in population:
@@ -46,7 +44,9 @@ def tune(searchspace: Searchspace, runner, tuning_options):
 
         # 'best_score' is used only for printing
         if tuning_options.verbose and cost_func.results:
-            best_score = util.get_best_config(cost_func.results, tuning_options.objective, tuning_options.objective_higher_is_better)[tuning_options.objective]
+            best_score = util.get_best_config(
+                cost_func.results, tuning_options.objective, tuning_options.objective_higher_is_better
+            )[tuning_options.objective]
 
         if tuning_options.verbose:
             print("Generation %d, best_score %f" % (generation, best_score))
diff --git a/kernel_tuner/strategies/greedy_ils.py b/kernel_tuner/strategies/greedy_ils.py
index a4c521746..b134dff47 100644
--- a/kernel_tuner/strategies/greedy_ils.py
+++ b/kernel_tuner/strategies/greedy_ils.py
@@ -6,13 +6,18 @@
 from kernel_tuner.strategies.genetic_algorithm import mutate
 from kernel_tuner.strategies.hillclimbers import base_hillclimb
 
-_options = dict(neighbor=("Method for selecting neighboring nodes, choose from Hamming or adjacent", "Hamming"),
-                       restart=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", True),
-                       no_improvement=("number of evaluations to exceed without improvement before restarting", 50),
-                       random_walk=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", 0.3))
+_options = dict(
+    neighbor=("Method for selecting neighboring nodes, choose from Hamming or adjacent", "Hamming"),
+    restart=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", True),
+    no_improvement=("number of evaluations to exceed without improvement before restarting", 50),
+    random_walk=(
+        "controls greedyness, i.e. whether to restart from a position as soon as an improvement is found",
+        0.3,
+    ),
+)
 
-def tune(searchspace: Searchspace, runner, tuning_options):
 
+def tune(searchspace: Searchspace, runner, tuning_options):
     dna_size = len(searchspace.tune_params.keys())
 
     options = tuning_options.strategy_options
@@ -30,15 +35,16 @@ def tune(searchspace: Searchspace, runner, tuning_options):
     fevals = 0
     cost_func = CostFunc(searchspace, tuning_options, runner)
 
-    #while searching
+    # while searching
     candidate = searchspace.get_random_sample(1)[0]
     best_score = cost_func(candidate, check_restrictions=False)
 
     last_improvement = 0
     while fevals < max_fevals:
-
         try:
-            candidate = base_hillclimb(candidate, neighbor, max_fevals, searchspace, tuning_options, cost_func, restart=restart, randomize=True)
+            candidate = base_hillclimb(
+                candidate, neighbor, max_fevals, searchspace, tuning_options, cost_func, restart=restart, randomize=True
+            )
             new_score = cost_func(candidate, check_restrictions=False)
         except util.StopCriterionReached as e:
             if tuning_options.verbose:
@@ -58,6 +64,7 @@ def tune(searchspace: Searchspace, runner, tuning_options):
 
 tune.__doc__ = common.get_strategy_docstring("Greedy Iterative Local Search (ILS)", _options)
 
+
 def random_walk(indiv, permutation_size, no_improve, last_improve, searchspace: Searchspace):
     if last_improve >= no_improve:
         return searchspace.get_random_sample(1)[0]
diff --git a/kernel_tuner/strategies/greedy_mls.py b/kernel_tuner/strategies/greedy_mls.py
index 1b34da501..cf90a7df9 100644
--- a/kernel_tuner/strategies/greedy_mls.py
+++ b/kernel_tuner/strategies/greedy_mls.py
@@ -4,13 +4,15 @@
 from kernel_tuner.strategies import common
 from kernel_tuner.strategies.hillclimbers import base_hillclimb
 
-_options = dict(neighbor=("Method for selecting neighboring nodes, choose from Hamming or adjacent", "Hamming"),
-                       restart=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", True),
-                       order=("set a user-specified order to search among dimensions while hillclimbing", None),
-                       randomize=("use a random order to search among dimensions while hillclimbing", True))
+_options = dict(
+    neighbor=("Method for selecting neighboring nodes, choose from Hamming or adjacent", "Hamming"),
+    restart=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", True),
+    order=("set a user-specified order to search among dimensions while hillclimbing", None),
+    randomize=("use a random order to search among dimensions while hillclimbing", True),
+)
 
-def tune(searchspace: Searchspace, runner, tuning_options):
 
+def tune(searchspace: Searchspace, runner, tuning_options):
     # retrieve options with defaults
     options = tuning_options.strategy_options
     neighbor, restart, order, randomize = common.get_options(options, _options)
@@ -24,12 +26,22 @@ def tune(searchspace: Searchspace, runner, tuning_options):
 
     fevals = 0
 
-    #while searching
+    # while searching
     while fevals < max_fevals:
         candidate = searchspace.get_random_sample(1)[0]
 
         try:
-            base_hillclimb(candidate, neighbor, max_fevals, searchspace, tuning_options, cost_func, restart=restart, randomize=randomize, order=order)
+            base_hillclimb(
+                candidate,
+                neighbor,
+                max_fevals,
+                searchspace,
+                tuning_options,
+                cost_func,
+                restart=restart,
+                randomize=randomize,
+                order=order,
+            )
         except util.StopCriterionReached as e:
             if tuning_options.verbose:
                 print(e)
diff --git a/kernel_tuner/strategies/hillclimbers.py b/kernel_tuner/strategies/hillclimbers.py
index b64e7d733..602d27d0e 100644
--- a/kernel_tuner/strategies/hillclimbers.py
+++ b/kernel_tuner/strategies/hillclimbers.py
@@ -5,9 +5,18 @@
 from kernel_tuner.strategies.common import CostFunc
 
 
-def base_hillclimb(base_sol: tuple, neighbor_method: str, max_fevals: int, searchspace: Searchspace, tuning_options,
-                   cost_func: CostFunc, restart=True, randomize=True, order=None):
-    """ Hillclimbing search until max_fevals is reached or no improvement is found
+def base_hillclimb(
+    base_sol: tuple,
+    neighbor_method: str,
+    max_fevals: int,
+    searchspace: Searchspace,
+    tuning_options,
+    cost_func: CostFunc,
+    restart=True,
+    randomize=True,
+    order=None,
+):
+    """Hillclimbing search until max_fevals is reached or no improvement is found
 
     Base hillclimber that evaluates neighbouring solutions in a random or fixed order
     and possibly immediately moves to the neighbour if it is an improvement.
diff --git a/kernel_tuner/strategies/minimize.py b/kernel_tuner/strategies/minimize.py
index 80c1c6f82..58a93e0b1 100644
--- a/kernel_tuner/strategies/minimize.py
+++ b/kernel_tuner/strategies/minimize.py
@@ -16,8 +16,8 @@
 
 _options = dict(method=(f"Local optimization algorithm to use, choose any from {supported_methods}", "L-BFGS-B"))
 
-def tune(searchspace: Searchspace, runner, tuning_options):
 
+def tune(searchspace: Searchspace, runner, tuning_options):
     method = get_options(tuning_options.strategy_options, _options)[0]
 
     # scale variables in x to make 'eps' relevant for multiple variables
diff --git a/kernel_tuner/strategies/mls.py b/kernel_tuner/strategies/mls.py
index b8ecf030c..7f0601378 100644
--- a/kernel_tuner/strategies/mls.py
+++ b/kernel_tuner/strategies/mls.py
@@ -3,13 +3,15 @@
 from kernel_tuner.strategies import common
 from kernel_tuner.strategies.greedy_mls import tune as mls_tune
 
-_options = dict(neighbor=("Method for selecting neighboring nodes, choose from Hamming or adjacent", "Hamming"),
-                       restart=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", False),
-                       order=("set a user-specified order to search among dimensions while hillclimbing", None),
-                       randomize=("use a random order to search among dimensions while hillclimbing", True))
+_options = dict(
+    neighbor=("Method for selecting neighboring nodes, choose from Hamming or adjacent", "Hamming"),
+    restart=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", False),
+    order=("set a user-specified order to search among dimensions while hillclimbing", None),
+    randomize=("use a random order to search among dimensions while hillclimbing", True),
+)
 
-def tune(searchspace: Searchspace, runner, tuning_options):
 
+def tune(searchspace: Searchspace, runner, tuning_options):
     # Default MLS uses 'best improvement' hillclimbing, so greedy hillclimbing is disabled with restart defaulting to False
     _, restart, _, _ = common.get_options(tuning_options.strategy_options, _options)
 
diff --git a/kernel_tuner/strategies/ordered_greedy_mls.py b/kernel_tuner/strategies/ordered_greedy_mls.py
index cd40ba778..f72257020 100644
--- a/kernel_tuner/strategies/ordered_greedy_mls.py
+++ b/kernel_tuner/strategies/ordered_greedy_mls.py
@@ -3,13 +3,15 @@
 from kernel_tuner.strategies import common
 from kernel_tuner.strategies.greedy_mls import tune as mls_tune
 
-_options = dict(neighbor=("Method for selecting neighboring nodes, choose from Hamming or adjacent", "Hamming"),
-                       restart=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", True),
-                       order=("set a user-specified order to search among dimensions while hillclimbing", None),
-                       randomize=("use a random order to search among dimensions while hillclimbing", False))
+_options = dict(
+    neighbor=("Method for selecting neighboring nodes, choose from Hamming or adjacent", "Hamming"),
+    restart=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", True),
+    order=("set a user-specified order to search among dimensions while hillclimbing", None),
+    randomize=("use a random order to search among dimensions while hillclimbing", False),
+)
 
-def tune(searchspace: Searchspace, runner, tuning_options):
 
+def tune(searchspace: Searchspace, runner, tuning_options):
     _, restart, _, randomize = common.get_options(tuning_options.strategy_options, _options)
 
     # Delegate to Greedy MLS, but make sure our defaults are used if not overwritten by the user
diff --git a/kernel_tuner/strategies/pso.py b/kernel_tuner/strategies/pso.py
index 5b0df1429..ec92c1094 100644
--- a/kernel_tuner/strategies/pso.py
+++ b/kernel_tuner/strategies/pso.py
@@ -9,21 +9,22 @@
 from kernel_tuner.strategies import common
 from kernel_tuner.strategies.common import CostFunc, scale_from_params
 
-_options = dict(popsize=("Population size", 20),
-                       maxiter=("Maximum number of iterations", 100),
-                       w=("Inertia weight constant", 0.5),
-                       c1=("Cognitive constant", 2.0),
-                       c2=("Social constant", 1.0))
+_options = dict(
+    popsize=("Population size", 20),
+    maxiter=("Maximum number of iterations", 100),
+    w=("Inertia weight constant", 0.5),
+    c1=("Cognitive constant", 2.0),
+    c2=("Social constant", 1.0),
+)
 
-def tune(searchspace: Searchspace, runner, tuning_options):
 
-    #scale variables in x because PSO works with velocities to visit different configurations
+def tune(searchspace: Searchspace, runner, tuning_options):
+    # scale variables in x because PSO works with velocities to visit different configurations
     cost_func = CostFunc(searchspace, tuning_options, runner, scaling=True)
 
-    #using this instead of get_bounds because scaling is used
+    # using this instead of get_bounds because scaling is used
     bounds, _, eps = cost_func.get_bounds_x0_eps()
 
-
     num_particles, maxiter, w, c1, c2 = common.get_options(tuning_options.strategy_options, _options)
 
     best_score_global = sys.float_info.max
@@ -64,7 +65,7 @@ def tune(searchspace: Searchspace, runner, tuning_options):
             swarm[j].update_position(bounds)
 
     if tuning_options.verbose:
-        print('Final result:')
+        print("Final result:")
         print(best_position_global)
         print(best_score_global)
 
@@ -73,6 +74,7 @@ def tune(searchspace: Searchspace, runner, tuning_options):
 
 tune.__doc__ = common.get_strategy_docstring("Particle Swarm Optimization (PSO)", _options)
 
+
 class Particle:
     def __init__(self, bounds):
         self.ndim = len(bounds)
diff --git a/kernel_tuner/strategies/simulated_annealing.py b/kernel_tuner/strategies/simulated_annealing.py
index dce929b7b..8a4e43348 100644
--- a/kernel_tuner/strategies/simulated_annealing.py
+++ b/kernel_tuner/strategies/simulated_annealing.py
@@ -9,10 +9,13 @@
 from kernel_tuner.strategies import common
 from kernel_tuner.strategies.common import CostFunc
 
-_options = dict(T=("Starting temperature", 1.0),
-                       T_min=("End temperature", 0.001),
-                       alpha=("Alpha parameter", 0.995),
-                       maxiter=("Number of iterations within each annealing step", 1))
+_options = dict(
+    T=("Starting temperature", 1.0),
+    T_min=("End temperature", 0.001),
+    alpha=("Alpha parameter", 0.995),
+    maxiter=("Number of iterations within each annealing step", 1),
+)
+
 
 def tune(searchspace: Searchspace, runner, tuning_options):
     # SA works with real parameter values and does not need scaling
@@ -23,7 +26,7 @@ def tune(searchspace: Searchspace, runner, tuning_options):
     T_start = T
 
     # compute how many iterations would be needed to complete the annealing schedule
-    max_iter = int(np.ceil(np.log(T_min)/np.log(alpha)))
+    max_iter = int(np.ceil(np.log(T_min) / np.log(alpha)))
 
     # if user supplied max_fevals that is lower then max_iter we will
     # scale the annealing schedule to fit max_fevals
@@ -45,7 +48,6 @@ def tune(searchspace: Searchspace, runner, tuning_options):
             iteration += 1
 
         for _ in range(niter):
-
             new_pos = neighbor(pos, searchspace)
             try:
                 new_cost = cost_func(new_pos, check_restrictions=False)
@@ -59,12 +61,12 @@ def tune(searchspace: Searchspace, runner, tuning_options):
 
             if ap > r:
                 if tuning_options.verbose:
-                    print("new position accepted", new_pos, new_cost, 'old:', pos, old_cost, 'ap', ap, 'r', r, 'T', T)
+                    print("new position accepted", new_pos, new_cost, "old:", pos, old_cost, "ap", ap, "r", r, "T", T)
                 pos = new_pos
                 old_cost = new_cost
 
         c = len(tuning_options.unique_results)
-        T = T_start * alpha**(max_iter/max_feval*c)
+        T = T_start * alpha ** (max_iter / max_feval * c)
 
         # check if solver gets stuck and if so restart from random position
         if c == c_old:
@@ -77,7 +79,7 @@ def tune(searchspace: Searchspace, runner, tuning_options):
             stuck = 0
 
         # safeguard
-        if iteration > 10*max_iter:
+        if iteration > 10 * max_iter:
             break
 
     return cost_func.results
@@ -85,6 +87,7 @@ def tune(searchspace: Searchspace, runner, tuning_options):
 
 tune.__doc__ = common.get_strategy_docstring("Simulated Annealing", _options)
 
+
 def acceptance_prob(old_cost, new_cost, T, tuning_options):
     """Annealing equation, with modifications to work towards a lower value."""
     error_val = sys.float_info.max if not tuning_options.objective_higher_is_better else -sys.float_info.max
@@ -99,14 +102,18 @@ def acceptance_prob(old_cost, new_cost, T, tuning_options):
         return 1.0
     # maybe move if old cost is better than new cost depending on T and random value
     if tuning_options.objective_higher_is_better:
-        return np.exp(((new_cost-old_cost)/new_cost)/T)
-    return np.exp(((old_cost-new_cost)/old_cost)/T)
+        return np.exp(((new_cost - old_cost) / new_cost) / T)
+    return np.exp(((old_cost - new_cost) / old_cost) / T)
 
 
 def neighbor(pos, searchspace: Searchspace):
     """Return a random neighbor of pos."""
     # Note: this is not the same as the previous implementation, because it is possible that non-edge parameters remain the same, but suggested configurations will all be within restrictions
-    neighbors = searchspace.get_neighbors(tuple(pos), neighbor_method='Hamming') if random.random() < 0.2 else searchspace.get_neighbors(tuple(pos), neighbor_method='strictly-adjacent')
+    neighbors = (
+        searchspace.get_neighbors(tuple(pos), neighbor_method="Hamming")
+        if random.random() < 0.2
+        else searchspace.get_neighbors(tuple(pos), neighbor_method="strictly-adjacent")
+    )
     if len(neighbors) > 0:
         return list(random.choice(neighbors))
     # if there are no neighbors, return a random configuration
diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py
index 710b59e0d..f2faf0469 100644
--- a/kernel_tuner/util.py
+++ b/kernel_tuner/util.py
@@ -12,6 +12,8 @@
 from inspect import signature
 from types import FunctionType
 from typing import Optional, Union
+import ray
+import subprocess
 
 import numpy as np
 from constraint import (
@@ -92,6 +94,10 @@ class StopCriterionReached(Exception):
     """Exception thrown when a stop criterion has been reached."""
 
 
+class GPUTypeMismatchError(Exception):
+    """Exception raised when the GPUs used in parallel execution are not all of the same type."""
+
+
 try:
     import torch
 except ImportError:
@@ -1288,3 +1294,33 @@ def cuda_error_check(error):
         if error != nvrtc.nvrtcResult.NVRTC_SUCCESS:
             _, desc = nvrtc.nvrtcGetErrorString(error)
             raise RuntimeError(f"NVRTC error: {desc.decode()}")
+
+
+def get_num_devices(simulation_mode=False):
+    resources = ray.cluster_resources()
+    if simulation_mode:
+        num_devices = round(resources.get("CPU") * 0.8)
+    else:
+        num_devices = resources.get("GPU")
+    return int(num_devices)
+
+
+def get_gpu_id(lang):
+    if lang == "CUDA" or lang == "CUPY" or lang == "NVCUDA":
+        gpu_id = os.environ.get("CUDA_VISIBLE_DEVICES") or os.environ.get("NVIDIA_VISIBLE_DEVICES") or "No GPU assigned"
+    else:
+        raise NotImplementedError("TODO: implement other languages")
+    return int(gpu_id)
+
+
+def get_gpu_type(lang):
+    gpu_id = get_gpu_id(lang)
+    if lang == "CUDA" or lang == "CUPY" or lang == "NVCUDA":
+        result = subprocess.run(
+            ["nvidia-smi", "--query-gpu=gpu_name", "--format=csv,noheader", "-i", str(gpu_id)],
+            capture_output=True,
+            text=True,
+        )
+        return result.stdout.strip()
+    else:
+        raise NotImplementedError("TODO: implement other languages")
diff --git a/pyproject.toml b/pyproject.toml
index 1634e973a..a11c41481 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -68,6 +68,7 @@ python-constraint2 = "^2.1.0"
 xmltodict = "*"
 pandas = ">=2.0.0"
 scikit-learn = ">=1.0.2"
+ray = { version = ">=2.9.1", extras = ["default"] }
 # Torch can be used with Kernel Tuner, but is not a dependency, should be up to the user to use it
 
 # List of optional dependencies for user installation, e.g. `pip install kernel_tuner[cuda]`, used in the below `extras`.
diff --git a/test/context.py b/test/context.py
index d1cbcf3c3..f932ad670 100644
--- a/test/context.py
+++ b/test/context.py
@@ -39,9 +39,7 @@
 try:
     import cupy
 
-    cupy.cuda.Device(
-        0
-    ).attributes  # triggers exception if there are no CUDA-capable devices
+    cupy.cuda.Device(0).attributes  # triggers exception if there are no CUDA-capable devices
     cupy_present = True
 except Exception:
     cupy_present = False
@@ -55,27 +53,18 @@
 
 try:
     from hip import hip
+
     hip_present = True
 except ImportError:
     hip_present = False
 
-skip_if_no_pycuda = pytest.mark.skipif(
-    not pycuda_present, reason="PyCuda not installed or no CUDA device detected"
-)
+skip_if_no_pycuda = pytest.mark.skipif(not pycuda_present, reason="PyCuda not installed or no CUDA device detected")
 skip_if_no_pynvml = pytest.mark.skipif(not pynvml_present, reason="NVML not installed")
-skip_if_no_cupy = pytest.mark.skipif(
-    not cupy_present, reason="CuPy not installed or no CUDA device detected"
-)
-skip_if_no_cuda = pytest.mark.skipif(
-    not cuda_present, reason="NVIDIA CUDA not installed"
-)
-skip_if_no_opencl = pytest.mark.skipif(
-    not opencl_present, reason="PyOpenCL not installed or no OpenCL device detected"
-)
+skip_if_no_cupy = pytest.mark.skipif(not cupy_present, reason="CuPy not installed or no CUDA device detected")
+skip_if_no_cuda = pytest.mark.skipif(not cuda_present, reason="NVIDIA CUDA not installed")
+skip_if_no_opencl = pytest.mark.skipif(not opencl_present, reason="PyOpenCL not installed or no OpenCL device detected")
 skip_if_no_gcc = pytest.mark.skipif(not gcc_present, reason="No gcc on PATH")
-skip_if_no_gfortran = pytest.mark.skipif(
-    not gfortran_present, reason="No gfortran on PATH"
-)
+skip_if_no_gfortran = pytest.mark.skipif(not gfortran_present, reason="No gfortran on PATH")
 skip_if_no_openmp = pytest.mark.skipif(not openmp_present, reason="No OpenMP found")
 skip_if_no_openacc = pytest.mark.skipif(not openacc_present, reason="No nvc++ on PATH")
 skip_if_no_hip = pytest.mark.skipif(not hip_present, reason="No HIP Python found")
diff --git a/test/strategies/test_bayesian_optimization.py b/test/strategies/test_bayesian_optimization.py
index dd206a37b..1aaa853d1 100644
--- a/test/strategies/test_bayesian_optimization.py
+++ b/test/strategies/test_bayesian_optimization.py
@@ -23,10 +23,10 @@
 max_threads = 1024
 searchspace = Searchspace(tune_params, [], max_threads)
 
-dev_dict = {'max_threads': max_threads}
-dev = namedtuple('Struct', dev_dict.keys())(*dev_dict.values())
-runner_dict = {'dev': dev}
-runner = namedtuple('Struct', runner_dict.keys())(*runner_dict.values())
+dev_dict = {"max_threads": max_threads}
+dev = namedtuple("Struct", dev_dict.keys())(*dev_dict.values())
+runner_dict = {"dev": dev}
+runner = namedtuple("Struct", runner_dict.keys())(*runner_dict.values())
 cost_func = CostFunc(searchspace, tuning_options, runner)
 
 # initialize required data
@@ -34,10 +34,19 @@
 _, _, eps = cost_func.get_bounds_x0_eps()
 original_to_normalized, normalized_to_original = bayes_opt.generate_normalized_param_dicts(tune_params, eps)
 normalized_parameter_space = bayes_opt.normalize_parameter_space(parameter_space, tune_params, original_to_normalized)
-pruned_parameter_space, removed_tune_params = bayes_opt.prune_parameter_space(normalized_parameter_space, tuning_options, tune_params, original_to_normalized)
+pruned_parameter_space, removed_tune_params = bayes_opt.prune_parameter_space(
+    normalized_parameter_space, tuning_options, tune_params, original_to_normalized
+)
 
 # initialize BO
-BO = BayesianOptimization(pruned_parameter_space, removed_tune_params, tuning_options, original_to_normalized, normalized_to_original, cost_func)
+BO = BayesianOptimization(
+    pruned_parameter_space,
+    removed_tune_params,
+    tuning_options,
+    original_to_normalized,
+    normalized_to_original,
+    cost_func,
+)
 predictions, _, std = BO.predict_list(BO.unvisited_cache)
 
 
@@ -61,7 +70,7 @@ def test_normalize_parameter_space():
 
 
 def test_prune_parameter_space():
-    assert removed_tune_params == [None, None, list(normalized_to_original['z'].keys())[0]]
+    assert removed_tune_params == [None, None, list(normalized_to_original["z"].keys())[0]]
     for index in range(len(pruned_parameter_space)):
         assert len(pruned_parameter_space[index]) <= len(parameter_space[index])
         assert len(parameter_space[index]) - len(pruned_parameter_space[index]) == 1
@@ -74,7 +83,8 @@ def test_bo_initialization():
     assert BO.searchspace == pruned_parameter_space
     assert BO.unvisited_cache == pruned_parameter_space
     assert len(BO.observations) == len(pruned_parameter_space)
-    assert BO.current_optimum == np.PINF
+    assert BO.current_optimum == np.inf
+
 
 def test_bo_initial_sample_lhs():
     sample = BO.draw_latin_hypercube_samples(num_samples=1)
@@ -85,18 +95,19 @@ def test_bo_initial_sample_lhs():
     assert len(sample[0]) == 2
     assert isinstance(sample[0][0], tuple)
     assert isinstance(sample[0][1], int)
-    assert len(sample[0][0]) == 2   # tune_params["z"] is dropped because it only has a single value
+    assert len(sample[0][0]) == 2  # tune_params["z"] is dropped because it only has a single value
     assert isinstance(sample[0][0][0], float)
     samples = BO.draw_latin_hypercube_samples(num_samples=3)
     assert len(samples) == 3
     with raises(ValueError):
         samples = BO.draw_latin_hypercube_samples(num_samples=30)
 
+
 def test_bo_is_better_than():
-    BO.opt_direction = 'max'
+    BO.opt_direction = "max"
     assert BO.is_better_than(2, 1)
     assert BO.is_better_than(-0.1, -0.2)
-    BO.opt_direction = 'min'
+    BO.opt_direction = "min"
     assert BO.is_better_than(1, 2)
     assert BO.is_better_than(-0.2, -0.1)
 
@@ -107,12 +118,12 @@ def test_bo_is_not_visited():
 
 
 def test_bo_get_af_by_name():
-    for basic_af in ['ei', 'poi', 'lcb']:
+    for basic_af in ["ei", "poi", "lcb"]:
         assert callable(BO.get_af_by_name(basic_af))
 
 
 def test_bo_set_acquisition_function():
-    BO.set_acquisition_function('multi-fast')
+    BO.set_acquisition_function("multi-fast")
     assert callable(BO.optimize)
 
 
diff --git a/test/strategies/test_common.py b/test/strategies/test_common.py
index 29ead8615..b769a107d 100644
--- a/test/strategies/test_common.py
+++ b/test/strategies/test_common.py
@@ -13,9 +13,7 @@
 
 
 def fake_runner():
-    fake_result = {
-        'time': 5
-    }
+    fake_result = {"time": 5}
     runner = Mock()
     runner.last_strategy_start_time = perf_counter()
     runner.run.return_value = [fake_result]
@@ -27,9 +25,18 @@ def fake_runner():
 
 def test_cost_func():
     x = [1, 4]
-    tuning_options = Options(scaling=False, snap=False, tune_params=tune_params,
-                             restrictions=None, strategy_options={}, cache={}, unique_results={},
-                             objective="time", objective_higher_is_better=False, metrics=None)
+    tuning_options = Options(
+        scaling=False,
+        snap=False,
+        tune_params=tune_params,
+        restrictions=None,
+        strategy_options={},
+        cache={},
+        unique_results={},
+        objective="time",
+        objective_higher_is_better=False,
+        metrics=None,
+    )
     runner = fake_runner()
 
     time = CostFunc(Searchspace(tune_params, None, 1024), tuning_options, runner)(x)
@@ -38,10 +45,20 @@ def test_cost_func():
     # check if restrictions are properly handled
     def restrictions(_):
         return False
-    tuning_options = Options(scaling=False, snap=False, tune_params=tune_params,
-                             restrictions=restrictions, strategy_options={},
-                             verbose=True, cache={}, unique_results={},
-                             objective="time", objective_higher_is_better=False, metrics=None)
+
+    tuning_options = Options(
+        scaling=False,
+        snap=False,
+        tune_params=tune_params,
+        restrictions=restrictions,
+        strategy_options={},
+        verbose=True,
+        cache={},
+        unique_results={},
+        objective="time",
+        objective_higher_is_better=False,
+        metrics=None,
+    )
     time = CostFunc(Searchspace(tune_params, restrictions, 1024), tuning_options, runner)(x)
     assert time == sys.float_info.max
 
diff --git a/test/strategies/test_strategies.py b/test/strategies/test_strategies.py
index 096be38b0..e3d8009d3 100644
--- a/test/strategies/test_strategies.py
+++ b/test/strategies/test_strategies.py
@@ -9,6 +9,7 @@
 
 cache_filename = os.path.dirname(os.path.realpath(__file__)) + "/../test_cache_file.json"
 
+
 @pytest.fixture
 def vector_add():
     kernel_string = """
@@ -33,21 +34,28 @@ def vector_add():
     return ["vector_add", kernel_string, size, args, tune_params]
 
 
-@pytest.mark.parametrize('strategy', strategy_map)
+@pytest.mark.parametrize("strategy", strategy_map)
 def test_strategies(vector_add, strategy):
-
-    options = dict(popsize=5, neighbor='adjacent')
+    options = dict(popsize=5)
 
     print(f"testing {strategy}")
 
     if hasattr(kernel_tuner.interface.strategy_map[strategy], "_options"):
-        filter_options = {opt:val for opt, val in options.items() if opt in kernel_tuner.interface.strategy_map[strategy]._options}
+        filter_options = {
+            opt: val for opt, val in options.items() if opt in kernel_tuner.interface.strategy_map[strategy]._options
+        }
     else:
         filter_options = options
     filter_options["max_fevals"] = 10
 
-    results, _ = kernel_tuner.tune_kernel(*vector_add, strategy=strategy, strategy_options=filter_options,
-                                         verbose=False, cache=cache_filename, simulation_mode=True)
+    results, _ = kernel_tuner.tune_kernel(
+        *vector_add,
+        strategy=strategy,
+        strategy_options=filter_options,
+        verbose=False,
+        cache=cache_filename,
+        simulation_mode=True,
+    )
 
     assert len(results) > 0
 
@@ -63,15 +71,15 @@ def test_strategies(vector_add, strategy):
 
     # check whether the returned dictionaries contain exactly the expected keys and the appropriate type
     expected_items = {
-        'block_size_x': int,
-        'time': (float, int),
-        'times': list,
-        'compile_time': (float, int),
-        'verification_time': (float, int),
-        'benchmark_time': (float, int),
-        'strategy_time': (float, int),
-        'framework_time': (float, int),
-        'timestamp': str
+        "block_size_x": int,
+        "time": (float, int),
+        "times": list,
+        "compile_time": (float, int),
+        "verification_time": (float, int),
+        "benchmark_time": (float, int),
+        "strategy_time": (float, int),
+        "framework_time": (float, int),
+        "timestamp": str,
     }
     for res in results:
         assert len(res) == len(expected_items)
diff --git a/test/test_accuracy.py b/test/test_accuracy.py
index 1e5070637..17d6f5156 100644
--- a/test/test_accuracy.py
+++ b/test/test_accuracy.py
@@ -47,9 +47,7 @@ def test_tunable_precision():
     from kernel_tuner.accuracy import TunablePrecision
 
     inputs = np.array([1, 2, 3], dtype=np.float64)
-    x = TunablePrecision(
-        "foo", inputs, dict(float16=np.half, float32=np.float32, float64=np.double)
-    )
+    x = TunablePrecision("foo", inputs, dict(float16=np.half, float32=np.float32, float64=np.double))
 
     assert np.all(x(dict(foo="float16")) == inputs)
     assert x(dict(foo="float16")).dtype == np.half
diff --git a/test/test_common.py b/test/test_common.py
index 7c1bd6838..e4500fc73 100644
--- a/test/test_common.py
+++ b/test/test_common.py
@@ -9,7 +9,7 @@
 
 def test_get_bounds_x0_eps():
     tune_params = dict()
-    tune_params['x'] = [0, 1, 2, 3, 4]
+    tune_params["x"] = [0, 1, 2, 3, 4]
     searchspace = Searchspace(tune_params, [], 1024)
 
     tuning_options = Options()
@@ -28,11 +28,10 @@ def test_get_bounds_x0_eps():
 
 
 def test_get_bounds():
-
     tune_params = dict()
-    tune_params['x'] = [0, 1, 2, 3, 4]
-    tune_params['y'] = [i for i in range(0, 10000, 100)]
-    tune_params['z'] = [-11.2, 55.67, 123.27]
+    tune_params["x"] = [0, 1, 2, 3, 4]
+    tune_params["y"] = [i for i in range(0, 10000, 100)]
+    tune_params["z"] = [-11.2, 55.67, 123.27]
 
     for k in tune_params.keys():
         random.shuffle(tune_params[k])
@@ -45,30 +44,28 @@ def test_get_bounds():
 
 
 def test_snap_to_nearest_config():
-
     tune_params = dict()
-    tune_params['x'] = [0, 1, 2, 3, 4, 5]
-    tune_params['y'] = [0, 1, 2, 3, 4, 5]
-    tune_params['z'] = [0, 1, 2, 3, 4, 5]
-    tune_params['w'] = ['a', 'b', 'c']
+    tune_params["x"] = [0, 1, 2, 3, 4, 5]
+    tune_params["y"] = [0, 1, 2, 3, 4, 5]
+    tune_params["z"] = [0, 1, 2, 3, 4, 5]
+    tune_params["w"] = ["a", "b", "c"]
 
-    x = [-5.7, 3.14, 1e6, 'b']
-    expected = [0, 3, 5, 'b']
+    x = [-5.7, 3.14, 1e6, "b"]
+    expected = [0, 3, 5, "b"]
 
     answer = common.snap_to_nearest_config(x, tune_params)
     assert answer == expected
 
 
 def test_unscale():
-
     params = dict()
-    params['x'] = [2**i for i in range(4, 9)]
-    eps = 1.0 / len(params['x'])
+    params["x"] = [2**i for i in range(4, 9)]
+    eps = 1.0 / len(params["x"])
 
-    assert common.unscale_and_snap_to_nearest([0], params, eps)[0] == params['x'][0]
-    assert common.unscale_and_snap_to_nearest([1], params, eps)[0] == params['x'][-1]
+    assert common.unscale_and_snap_to_nearest([0], params, eps)[0] == params["x"][0]
+    assert common.unscale_and_snap_to_nearest([1], params, eps)[0] == params["x"][-1]
 
-    intervals = np.linspace(0, 1, len(params['x']) * 10)
+    intervals = np.linspace(0, 1, len(params["x"]) * 10)
 
     freq = dict()
     for i in intervals:
@@ -82,6 +79,6 @@ def test_unscale():
     print(freq)
 
     for v in freq.values():
-        assert v == freq[params['x'][0]]
+        assert v == freq[params["x"][0]]
 
-    assert len(freq.keys()) == len(params['x'])
+    assert len(freq.keys()) == len(params["x"])
diff --git a/test/test_compiler_functions.py b/test/test_compiler_functions.py
index 913fee85d..99a0071fc 100644
--- a/test/test_compiler_functions.py
+++ b/test/test_compiler_functions.py
@@ -159,9 +159,7 @@ def test_compile(npct, subprocess):
     kernel_string = "this is a fake C program"
     kernel_name = "blabla"
     kernel_sources = KernelSource(kernel_name, kernel_string, "C")
-    kernel_instance = KernelInstance(
-        kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []
-    )
+    kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [])
 
     cfunc = CompilerFunctions()
     f = cfunc.compile(kernel_instance)
@@ -191,9 +189,7 @@ def test_compile_detects_device_code(npct, subprocess):
     kernel_string = "this code clearly contains device code __global__ kernel(float* arg){ return; }"
     kernel_name = "blabla"
     kernel_sources = KernelSource(kernel_name, kernel_string, "C")
-    kernel_instance = KernelInstance(
-        kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []
-    )
+    kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [])
 
     cfunc = CompilerFunctions()
     cfunc.compile(kernel_instance)
@@ -347,9 +343,7 @@ def test_complies_fortran_function_no_module():
     """
     kernel_name = "my_test_function"
     kernel_sources = KernelSource(kernel_name, kernel_string, "C")
-    kernel_instance = KernelInstance(
-        kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []
-    )
+    kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [])
 
     cfunc = CompilerFunctions(compiler="gfortran")
     func = cfunc.compile(kernel_instance)
@@ -378,9 +372,7 @@ def test_complies_fortran_function_with_module():
     """
     kernel_name = "my_test_function"
     kernel_sources = KernelSource(kernel_name, kernel_string, "C")
-    kernel_instance = KernelInstance(
-        kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []
-    )
+    kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [])
 
     try:
         cfunc = CompilerFunctions(compiler="gfortran")
diff --git a/test/test_core.py b/test/test_core.py
index a8624470e..6156afbf2 100644
--- a/test/test_core.py
+++ b/test/test_core.py
@@ -14,9 +14,12 @@
 from .context import skip_if_no_pycuda
 
 
-mock_config = {"return_value.compile.return_value": "compile",
-               "return_value.ready_argument_list.return_value": "ready_argument_list",
-               "return_value.max_threads": 1024}
+mock_config = {
+    "return_value.compile.return_value": "compile",
+    "return_value.ready_argument_list.return_value": "ready_argument_list",
+    "return_value.max_threads": 1024,
+}
+
 
 def get_vector_add_args():
     size = int(1e6)
@@ -42,9 +45,19 @@ def env():
     lang = "CUDA"
     kernel_source = core.KernelSource(kernel_name, kernel_string, lang)
     verbose = True
-    kernel_options = Options(kernel_name=kernel_name, kernel_string=kernel_string, problem_size=args[-1],
-                             arguments=args, lang=lang, grid_div_x=None, grid_div_y=None, grid_div_z=None,
-                             cmem_args=None, texmem_args=None, block_size_names=None)
+    kernel_options = Options(
+        kernel_name=kernel_name,
+        kernel_string=kernel_string,
+        problem_size=args[-1],
+        arguments=args,
+        lang=lang,
+        grid_div_x=None,
+        grid_div_y=None,
+        grid_div_z=None,
+        cmem_args=None,
+        texmem_args=None,
+        block_size_names=None,
+    )
     device_options = Options(device=0, platform=0, quiet=False, compiler=None, compiler_options=None)
     dev = core.DeviceInterface(kernel_source, iterations=7, **device_options)
     instance = dev.create_kernel_instance(kernel_source, kernel_options, params, verbose)
@@ -54,7 +67,6 @@ def env():
 
 @skip_if_no_pycuda
 def test_default_verify_function(env):
-
     # gpu_args = dev.ready_argument_list(args)
     # func = dev.compile_kernel(instance, verbose)
 
@@ -93,7 +105,7 @@ def test_default_verify_function(env):
     assert True
 
 
-@patch('kernel_tuner.core.PyCudaFunctions')
+@patch("kernel_tuner.core.PyCudaFunctions")
 def test_check_kernel_output(dev_func_interface):
     dev_func_interface.configure_mock(**mock_config)
 
@@ -105,17 +117,17 @@ def test_check_kernel_output(dev_func_interface):
     wrong = [np.array([1, 2, 3, 4]).astype(np.float32)]
     atol = 1e-6
 
-    dev.check_kernel_output('func', answer, instance, answer, atol, None, True)
+    dev.check_kernel_output("func", answer, instance, answer, atol, None, True)
 
     dfi.memcpy_htod.assert_called_once_with(answer[0], answer[0])
-    dfi.run_kernel.assert_called_once_with('func', answer, (256, 1, 1), (1, 1, 1))
+    dfi.run_kernel.assert_called_once_with("func", answer, (256, 1, 1), (1, 1, 1))
 
     print(dfi.mock_calls)
 
     assert dfi.memcpy_dtoh.called == 1
 
     for name, args, _ in dfi.mock_calls:
-        if name == 'memcpy_dtoh':
+        if name == "memcpy_dtoh":
             assert all(args[0] == answer[0])
             assert all(args[1] == answer[0])
 
@@ -124,7 +136,7 @@ def test_check_kernel_output(dev_func_interface):
     # obviously does not result in the result_host array containing anything
     # non-zero
     try:
-        dev.check_kernel_output('func', wrong, instance, wrong, atol, None, True)
+        dev.check_kernel_output("func", wrong, instance, wrong, atol, None, True)
         print("check_kernel_output failed to throw an exception")
         assert False
     except Exception:
@@ -132,7 +144,6 @@ def test_check_kernel_output(dev_func_interface):
 
 
 def test_default_verify_function_arrays():
-
     answer = [np.zeros(4).astype(np.float32), None, np.ones(5).astype(np.int32)]
 
     answer_type_error1 = [np.zeros(4).astype(np.float32)]
@@ -157,7 +168,6 @@ def test_default_verify_function_arrays():
 
 
 def test_default_verify_function_scalar():
-
     answer = [np.zeros(4).astype(np.float32), None, np.int64(42)]
 
     instance = core.KernelInstance("name", None, "kernel_string", [], (256, 1, 1), (1, 1, 1), {}, answer)
@@ -198,16 +208,18 @@ def test_preprocess_gpu_arguments():
 
 def test_split_argument_list():
     test_string = "T *c, const T *__restrict__ a, T\n *\n b\n , int n"
-    ans1, ans2 = core.split_argument_list([s.strip() for s in test_string.split(',')])
+    ans1, ans2 = core.split_argument_list([s.strip() for s in test_string.split(",")])
     assert ans1 == ["T *", "const T *__restrict__", "T *", "int"]
     assert ans2 == ["c", "a", "b", "n"]
 
+
 def test_apply_template_typenames():
     type_list = ["T *", "CONST __restrict__", "double"]
     templated_typenames = {"T": "test"}
     core.apply_template_typenames(type_list, templated_typenames)
     assert type_list == ["test *", "CONST __restrict__", "double"]
 
+
 def test_get_templated_typenames():
     template_arguments = ["double", "32"]
     template_parameters = ["typename TF", "test1", "test2"]
@@ -217,6 +229,7 @@ def test_get_templated_typenames():
     assert len(ans) == 1
     assert ans["TF"] == "double"
 
+
 def test_wrap_templated_kernel():
     kernel_string = """
 template<typename TF> __global__ void vector_add(TF *c, const TF *__restrict__ a, TF * b , int n) {
@@ -228,15 +241,16 @@ def test_wrap_templated_kernel():
 """
     kernel_name = "vector_add<float>"
     ans, _ = core.wrap_templated_kernel(kernel_string, kernel_name)
-    #check __global__ in templated definition is replaced with __device__
+    # check __global__ in templated definition is replaced with __device__
     assert "template<typename TF> __device__ void vector_add" in ans
-    #check if template instantiation is inserted
+    # check if template instantiation is inserted
     assert "template __device__ void vector_add<float>(float *, const float *__restrict__, float *, int);" in ans
-    #check if wrapper functions with C linkage is inserted
-    assert "extern \"C\" __global__ void vector_add" in ans
-    #check if original kernel is called
+    # check if the wrapper function with C linkage is inserted
+    assert 'extern "C" __global__ void vector_add' in ans
+    # check if original kernel is called
     assert "vector_add<float>(c, a, b, n);" in ans
 
+
 def test_wrap_templated_kernel2():
     kernel_string = """
 template<typename TF> __global__ void __launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_SM) vector_add(TF *c, const TF *__restrict__ a, TF * b , int n) {
@@ -251,6 +265,7 @@ def test_wrap_templated_kernel2():
     ans, _ = core.wrap_templated_kernel(kernel_string, kernel_name)
     assert True
 
+
 def test_wrap_templated_kernel3():
     kernel_string = """
 template<typename TF> __global__ void __launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_SM) vector_add1(TF *c, const TF *__restrict__ a, TF * b , int n) {
@@ -271,7 +286,10 @@ def test_wrap_templated_kernel3():
     ans, _ = core.wrap_templated_kernel(kernel_string, kernel_name)
 
     # test that the template wrapper matches the right kernel (the first and not the second)
-    assert 'extern "C" __global__ void __launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_SM) vector_add1_wrapper(float * c, const float *__restrict__ a, float * b, int n)' in ans
+    assert (
+        'extern "C" __global__ void __launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_SM) vector_add1_wrapper(float * c, const float *__restrict__ a, float * b, int n)'
+        in ans
+    )
 
 
 def test_wrap_templated_kernel4():
@@ -295,4 +313,7 @@ def test_wrap_templated_kernel4():
     ans, _ = core.wrap_templated_kernel(kernel_string, kernel_name)
 
     # test that the template wrapper matches the right kernel (the second not the first)
-    assert 'extern "C" __global__ void __launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_SM) vector_add1_wrapper(float * c, const float *__restrict__ a, float * b, int n)' in ans
+    assert (
+        'extern "C" __global__ void __launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_SM) vector_add1_wrapper(float * c, const float *__restrict__ a, float * b, int n)'
+        in ans
+    )
diff --git a/test/test_cuda_functions.py b/test/test_cuda_functions.py
index 1dc68652d..47200cf30 100644
--- a/test/test_cuda_functions.py
+++ b/test/test_cuda_functions.py
@@ -16,7 +16,6 @@
 
 @skip_if_no_cuda
 def test_ready_argument_list():
-
     size = 1000
     a = np.int32(75)
     b = np.random.randn(size).astype(np.float32)
@@ -34,7 +33,6 @@ def test_ready_argument_list():
 
 @skip_if_no_cuda
 def test_compile():
-
     kernel_string = """
     extern "C" __global__ void vector_add(float *c, float *a, float *b, int n) {
         int i = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/test/test_cupy_functions.py b/test/test_cupy_functions.py
index 4bb4d16f4..be781f368 100644
--- a/test/test_cupy_functions.py
+++ b/test/test_cupy_functions.py
@@ -1,4 +1,3 @@
-
 import kernel_tuner
 
 from .context import skip_if_no_cupy
@@ -9,4 +8,3 @@
 def test_tune_kernel(env):
     result, _ = kernel_tuner.tune_kernel(*env, lang="cupy", verbose=True)
     assert len(result) > 0
-
diff --git a/test/test_energy.py b/test/test_energy.py
index 187ac1cdc..3ca9f1e27 100644
--- a/test/test_energy.py
+++ b/test/test_energy.py
@@ -6,11 +6,13 @@
 
 cache_filename = os.path.dirname(os.path.realpath(__file__)) + "/synthetic_fp32_cache_NVIDIA_RTX_A4000.json"
 
+
 @skip_if_no_pycuda
 @skip_if_no_pynvml
 def test_create_power_frequency_model():
-
-    ridge_frequency, freqs, nvml_power, fitted_params, scaling = energy.create_power_frequency_model(cache=cache_filename, simulation_mode=True)
+    ridge_frequency, freqs, nvml_power, fitted_params, scaling = energy.create_power_frequency_model(
+        cache=cache_filename, simulation_mode=True
+    )
     target_value = 1350
     tolerance = 0.05
-    assert target_value * (1-tolerance) <= ridge_frequency <= target_value * (1+tolerance)
+    assert target_value * (1 - tolerance) <= ridge_frequency <= target_value * (1 + tolerance)
diff --git a/test/test_ensemble_tuning.py b/test/test_ensemble_tuning.py
new file mode 100644
index 000000000..9f8cc75df
--- /dev/null
+++ b/test/test_ensemble_tuning.py
@@ -0,0 +1,52 @@
+import numpy as np
+import pytest
+import logging
+import sys
+
+from kernel_tuner import tune_kernel
+from kernel_tuner.backends import nvcuda
+from kernel_tuner.core import KernelInstance, KernelSource
+from .context import skip_if_no_pycuda
+
+try:
+    import pycuda.driver
+except Exception:
+    pass
+
+
+@pytest.fixture
+def env():
+    kernel_string = """
+    extern "C" __global__ void vector_add(float *c, float *a, float *b, int n) {
+        int i = blockIdx.x * blockDim.x + threadIdx.x;
+        int j = blockIdx.y * blockDim.y + threadIdx.y;
+        int index = i + j * gridDim.x * blockDim.x;
+        if (index < n) {
+            c[index] = a[index] + b[index];
+        }
+    }
+    """
+
+    size = 100
+    a = np.random.randn(size).astype(np.float32)
+    b = np.random.randn(size).astype(np.float32)
+    c = np.zeros_like(b)
+    n = np.int32(size)
+
+    args = [c, a, b, n]
+    tune_params = dict()
+
+    # Extend the range of block sizes for a bigger search space
+    tune_params["block_size_x"] = [128 + 64 * i for i in range(30)]
+    tune_params["block_size_y"] = [1 + i for i in range(1, 16)]
+
+    return ["vector_add", kernel_string, size, args, tune_params]
+
+
+@skip_if_no_pycuda
+def test_parallel_tune_kernel(env):
+    strategy_options = {"ensemble": ["greedy_ils", "greedy_ils"]}
+    result, _ = tune_kernel(
+        *env, lang="CUDA", verbose=True, strategy="ensemble", parallel_mode=True, strategy_options=strategy_options
+    )
+    assert len(result) > 0
diff --git a/test/test_file_utils.py b/test/test_file_utils.py
index 622e06b44..4e181729c 100644
--- a/test/test_file_utils.py
+++ b/test/test_file_utils.py
@@ -5,6 +5,7 @@
 from jsonschema import validate
 import numpy as np
 import warnings
+
 try:
     from hip import hip
 except:
@@ -64,6 +65,7 @@ def test_store_metadata_file():
         # clean up
         delete_temp_file(filename)
 
+
 def hip_check(call_result):
     err = call_result[0]
     result = call_result[1:]
@@ -73,6 +75,7 @@ def hip_check(call_result):
         raise RuntimeError(str(err))
     return result
 
+
 @skip_if_no_hip
 def test_check_argument_list_device_array():
     """Test check_argument_list with DeviceArray"""
@@ -84,12 +87,8 @@ def test_check_argument_list_device_array():
     host_array = np.ones((100,), dtype=np.float32)
     num_bytes = host_array.size * host_array.itemsize
     device_array = hip_check(hip.hipMalloc(num_bytes))
-    device_array.configure(
-        typestr="float32",
-        shape=host_array.shape,
-        itemsize=host_array.itemsize
-    )
-    
+    device_array.configure(typestr="float32", shape=host_array.shape, itemsize=host_array.itemsize)
+
     with warnings.catch_warnings():
         warnings.simplefilter("error")
         check_argument_list("simple_kernel", float_kernel, [device_array])
diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py
index e192223ed..eb587c618 100644
--- a/test/test_hip_functions.py
+++ b/test/test_hip_functions.py
@@ -10,10 +10,12 @@
 
 try:
     from hip import hip, hiprtc
+
     hip_present = True
 except ImportError:
     pass
 
+
 def hip_check(call_result):
     err = call_result[0]
     result = call_result[1:]
@@ -25,6 +27,7 @@ def hip_check(call_result):
         raise RuntimeError(str(err))
     return result
 
+
 @pytest.fixture
 def env():
     kernel_string = """
@@ -48,6 +51,7 @@ def env():
 
     return ["vector_add", kernel_string, size, args, tune_params]
 
+
 @skip_if_no_hip
 def test_ready_argument_list():
     size = 1000
@@ -67,6 +71,7 @@ def test_ready_argument_list():
     assert gpu_args[1].value == a
     assert gpu_args[3].value == c
 
+
 @skip_if_no_hip
 def test_compile():
     kernel_string = """
@@ -87,6 +92,7 @@ def test_compile():
     except Exception as e:
         pytest.fail("Did not expect any exception:" + str(e))
 
+
 @skip_if_no_hip
 def test_memset_and_memcpy_dtoh():
     a = [1, 2, 3, 4]
@@ -101,6 +107,7 @@ def test_memset_and_memcpy_dtoh():
 
     assert all(output == np.full(4, 4))
 
+
 @skip_if_no_hip
 def test_memcpy_htod():
     a = [1, 2, 3, 4]
@@ -114,6 +121,7 @@ def test_memcpy_htod():
 
     assert all(output == x)
 
+
 @skip_if_no_hip
 def test_copy_constant_memory_args():
     kernel_string = """
@@ -133,7 +141,7 @@ def test_copy_constant_memory_args():
     kernel = dev.compile(kernel_instance)
 
     my_constant_data = np.full(100, 23).astype(np.float32)
-    cmem_args = {'my_constant_data': my_constant_data}
+    cmem_args = {"my_constant_data": my_constant_data}
     dev.copy_constant_memory_args(cmem_args)
 
     output = np.full(100, 0).astype(np.float32)
@@ -147,16 +155,12 @@ def test_copy_constant_memory_args():
 
     assert (my_constant_data == output).all()
 
+
 @skip_if_no_hip
 def test_smem_args(env):
-    result, _ = tune_kernel(*env,
-                          smem_args=dict(size="block_size_x*4"),
-                          verbose=True, lang="HIP")
+    result, _ = tune_kernel(*env, smem_args=dict(size="block_size_x*4"), verbose=True, lang="HIP")
     tune_params = env[-1]
     assert len(result) == len(tune_params["block_size_x"])
-    result, _ = tune_kernel(
-        *env,
-        smem_args=dict(size=lambda p: p['block_size_x'] * 4),
-        verbose=True, lang="HIP")
+    result, _ = tune_kernel(*env, smem_args=dict(size=lambda p: p["block_size_x"] * 4), verbose=True, lang="HIP")
     tune_params = env[-1]
-    assert len(result) == len(tune_params["block_size_x"])
\ No newline at end of file
+    assert len(result) == len(tune_params["block_size_x"])
diff --git a/test/test_hyper.py b/test/test_hyper.py
index 9d1dc55df..9b54b66bd 100644
--- a/test/test_hyper.py
+++ b/test/test_hyper.py
@@ -4,7 +4,6 @@
 
 
 def test_hyper(env):
-
     hyper_params = dict()
     hyper_params["popsize"] = [5]
     hyper_params["maxiter"] = [5, 10]
@@ -15,4 +14,3 @@ def test_hyper(env):
 
     result = tune_hyper_params(target_strategy, hyper_params, *env, verbose=True, cache=cache_filename)
     assert len(result) > 0
-
diff --git a/test/test_integration.py b/test/test_integration.py
index aafb437f1..30c3e26ac 100644
--- a/test/test_integration.py
+++ b/test/test_integration.py
@@ -11,11 +11,8 @@
 
 @pytest.fixture()
 def fake_results():
-    #create fake results for testing
-    tune_params = {
-        "a": [1, 2, 4],
-        "b": [4, 5, 6]
-    }
+    # create fake results for testing
+    tune_params = {"a": [1, 2, 4], "b": [4, 5, 6]}
     problem_size = 100
     parameter_space = itertools.product(*tune_params.values())
     results = [dict(zip(tune_params.keys(), element)) for element in parameter_space]
@@ -28,42 +25,50 @@ def fake_results():
         r["strategy_time"] = 20.0 + (i / 5)
         r["verification_time"] = 20.0 + (i / 5)
         r["timestamp"] = str(datetime.now(timezone.utc))
-    env = {
-        "device_name": "My GPU"
-    }
+    env = {"device_name": "My GPU"}
 
     return "fake_kernel", "fake_string", tune_params, problem_size, parameter_space, results, env
 
 
 def test_store_results(fake_results):
-
     filename = "temp_test_results_file.json"
     kernel_name, kernel_string, tune_params, problem_size, parameter_space, results, env = fake_results
 
     try:
-        #test basic operation
+        # test basic operation
         integration.store_results(filename, kernel_name, kernel_string, tune_params, problem_size, results, env, top=3)
         meta, stored_data = integration._read_results_file(filename)
 
         assert len([d for d in stored_data if d["device_name"] == "My_GPU" and d["problem_size"] == "100"]) == 3
 
-        #test if results for a different problem_size values are added
+        # test if results for a different problem_size value are added
         integration.store_results(filename, kernel_name, kernel_string, tune_params, 1000, results, env, top=3)
         meta, stored_data = integration._read_results_file(filename)
 
         assert len([d for d in stored_data if d["device_name"] == "My_GPU" and d["problem_size"] == "100"]) == 3
         assert len([d for d in stored_data if d["device_name"] == "My_GPU" and d["problem_size"] == "1000"]) == 3
 
-        #test if results for a different GPU can be added
-        integration.store_results(filename, kernel_name, kernel_string, tune_params, problem_size, results, { "device_name": "Another GPU"}, top=3)
+        # test if results for a different GPU can be added
+        integration.store_results(
+            filename,
+            kernel_name,
+            kernel_string,
+            tune_params,
+            problem_size,
+            results,
+            {"device_name": "Another GPU"},
+            top=3,
+        )
         meta, stored_data = integration._read_results_file(filename)
 
         assert len(set([d["device_name"] for d in stored_data])) == 2
 
-        #test if overwriting results works
+        # test if overwriting results works
         for i, r in enumerate(results):
             r["time"] = 50.0 + i
-        integration.store_results(filename, kernel_name, kernel_string, tune_params, problem_size, results, env, top=0.1)
+        integration.store_results(
+            filename, kernel_name, kernel_string, tune_params, problem_size, results, env, top=0.1
+        )
         meta, stored_data = integration._read_results_file(filename)
 
         my_gpu_100_data = [d for d in stored_data if d["device_name"] == "My_GPU" and d["problem_size"] == "100"]
@@ -75,58 +80,59 @@ def test_store_results(fake_results):
 
 
 def test_setup_device_targets(fake_results):
-
     results_filename = "temp_test_results_file.json"
     header_filename = "temp_test_header_file.h"
     kernel_name, kernel_string, tune_params, problem_size, parameter_space, results, env = fake_results
 
     try:
-        integration.store_results(results_filename, kernel_name, kernel_string, tune_params, problem_size, results, env, top=3)
-        #results file
-        #{'My_GPU': {'100': [{'a': 1, 'b': 4, 'time': 100.0}, {'a': 1, 'b': 5, 'time': 101.0}, {'a': 1, 'b': 6, 'time': 102.0}]}}
+        integration.store_results(
+            results_filename, kernel_name, kernel_string, tune_params, problem_size, results, env, top=3
+        )
+        # results file
+        # {'My_GPU': {'100': [{'a': 1, 'b': 4, 'time': 100.0}, {'a': 1, 'b': 5, 'time': 101.0}, {'a': 1, 'b': 6, 'time': 102.0}]}}
 
         integration.create_device_targets(header_filename, results_filename)
 
-        with open(header_filename, 'r') as fh:
+        with open(header_filename, "r") as fh:
             output_str = fh.read()
 
         assert "#ifdef TARGET_My_GPU" in output_str
         assert "#define a 1" in output_str
         assert "#define b 4" in output_str
 
-        #test output when more then one problem size is used, and best configuration is different
+        # test output when more than one problem size is used, and best configuration is different
         for i, e in enumerate(results):
-            if e['a'] == 1 and e['b'] == 4:
-                e['time'] += 100
+            if e["a"] == 1 and e["b"] == 4:
+                e["time"] += 100
         integration.store_results(results_filename, kernel_name, kernel_string, tune_params, 1000, results, env, top=3)
         integration.create_device_targets(header_filename, results_filename, objective="time")
 
-        with open(header_filename, 'r') as fh:
+        with open(header_filename, "r") as fh:
             output_str = fh.read()
         expected = "\n".join(["TARGET_My_GPU", "#define a 1", "#define b 5"])
         assert expected in output_str
 
-        #test output when more then one problem size is used, and best configuration depends on total time
+        # test output when more than one problem size is used, and best configuration depends on total time
         for i, e in enumerate(results):
-            if e['a'] == 1 and e['b'] == 6:
-                e['time'] -= 3
+            if e["a"] == 1 and e["b"] == 6:
+                e["time"] -= 3
         integration.store_results(results_filename, kernel_name, kernel_string, tune_params, 1000, results, env, top=3)
         integration.create_device_targets(header_filename, results_filename, objective="time")
 
-        with open(header_filename, 'r') as fh:
+        with open(header_filename, "r") as fh:
             output_str = fh.read()
         expected = "\n".join(["TARGET_My_GPU", "#define a 1", "#define b 6"])
         assert expected in output_str
 
-        #test output when more then one GPU is used
+        # test output when more than one GPU is used
         for i, e in enumerate(results):
-            if e['a'] == 1 and e['b'] == 6:
-                e['time'] += 3.1
-        env['device_name'] = "My_GPU2"
+            if e["a"] == 1 and e["b"] == 6:
+                e["time"] += 3.1
+        env["device_name"] = "My_GPU2"
         integration.store_results(results_filename, kernel_name, kernel_string, tune_params, 1000, results, env, top=3)
         integration.create_device_targets(header_filename, results_filename, objective="time")
 
-        with open(header_filename, 'r') as fh:
+        with open(header_filename, "r") as fh:
             output_str = fh.read()
         expected = "\n".join(["TARGET_My_GPU", "#define a 1", "#define b 6"])
         assert expected in output_str
@@ -141,34 +147,45 @@ def test_setup_device_targets(fake_results):
 
 
 def test_setup_device_targets_max(fake_results):
-
     results_filename = "temp_test_results_file.json"
     header_filename = "temp_test_header_file.h"
     kernel_name, kernel_string, tune_params, problem_size, parameter_space, results, env = fake_results
 
-    #add GFLOP/s as metric
+    # add GFLOP/s as metric
     for i, e in enumerate(results):
-        e['GFLOP/s'] = 1e5 / e['time']
+        e["GFLOP/s"] = 1e5 / e["time"]
 
     try:
-        integration.store_results(results_filename, kernel_name, kernel_string, tune_params, problem_size, results, env, top=3, objective="GFLOP/s")
+        integration.store_results(
+            results_filename,
+            kernel_name,
+            kernel_string,
+            tune_params,
+            problem_size,
+            results,
+            env,
+            top=3,
+            objective="GFLOP/s",
+        )
         integration.create_device_targets(header_filename, results_filename, objective="GFLOP/s")
 
-        with open(header_filename, 'r') as fh:
+        with open(header_filename, "r") as fh:
             output_str = fh.read()
         assert "TARGET_My_GPU" in output_str
         assert "#define a 1" in output_str
         assert "#define b 4" in output_str
 
-        #test output when more then one problem size is used, and best configuration is different
+        # test output when more than one problem size is used, and best configuration is different
         for i, e in enumerate(results):
-            if e['a'] == 1 and e['b'] == 4:
-                e['time'] += 100
-                e['GFLOP/s'] = 1e5 / e['time']
-        integration.store_results(results_filename, kernel_name, kernel_string, tune_params, 1000, results, env, top=3, objective="GFLOP/s")
+            if e["a"] == 1 and e["b"] == 4:
+                e["time"] += 100
+                e["GFLOP/s"] = 1e5 / e["time"]
+        integration.store_results(
+            results_filename, kernel_name, kernel_string, tune_params, 1000, results, env, top=3, objective="GFLOP/s"
+        )
         integration.create_device_targets(header_filename, results_filename, objective="GFLOP/s")
 
-        with open(header_filename, 'r') as fh:
+        with open(header_filename, "r") as fh:
             output_str = fh.read()
         expected = "\n".join(["TARGET_My_GPU", "#define a 1", "#define b 5"])
         assert expected in output_str
diff --git a/test/test_kernelbuilder.py b/test/test_kernelbuilder.py
index c706e3953..6ef7c9d4a 100644
--- a/test/test_kernelbuilder.py
+++ b/test/test_kernelbuilder.py
@@ -32,7 +32,7 @@ def test_PythonKernel(test_kernel, backend):
     kernel_name, kernel_string, n, args, params = test_kernel
     kernel_function = kernelbuilder.PythonKernel(*test_kernel, lang=backend)
     reference = kernel_function(*args)
-    assert np.allclose(reference[0], args[1]+args[2])
+    assert np.allclose(reference[0], args[1] + args[2])
 
 
 @pytest.mark.parametrize("backend", backends)
@@ -42,21 +42,23 @@ def test_PythonKernel_tuned(test_kernel, backend):
     c, a, b, n = args
     test_results_file = "test_results_file.json"
     results = params.copy()
-    results['time'] = 1.0
+    results["time"] = 1.0
     env = {"device_name": "bogus GPU"}
     try:
-        #create a fake results file
+        # create a fake results file
         integration.store_results(test_results_file, kernel_name, kernel_string, params, n, [results], env)
 
-        #create a kernel using the results
-        kernel_function = kernelbuilder.PythonKernel(kernel_name, kernel_string, n, args, results_file=test_results_file, lang=backend)
+        # create a kernel using the results
+        kernel_function = kernelbuilder.PythonKernel(
+            kernel_name, kernel_string, n, args, results_file=test_results_file, lang=backend
+        )
 
-        #test if params were retrieved correctly
+        # test if params were retrieved correctly
         assert kernel_function.params["block_size_x"] == 384
 
-        #see if it functions properly
+        # see if it functions properly
         reference = kernel_function(c, a, b, n)
-        assert np.allclose(reference[0], a+b)
+        assert np.allclose(reference[0], a + b)
 
     finally:
         util.delete_temp_file(test_results_file)
diff --git a/test/test_nvml_mocked.py b/test/test_nvml_mocked.py
index b986f6686..43b0ec294 100644
--- a/test/test_nvml_mocked.py
+++ b/test/test_nvml_mocked.py
@@ -8,49 +8,49 @@
 from kernel_tuner.observers.nvml import get_nvml_pwr_limits, get_nvml_gr_clocks, get_nvml_mem_clocks, get_idle_power
 
 
-
 def setup_mock(nvml):
-    nvml.return_value.configure_mock(pwr_constraints=(90000, 150000),
-                                     supported_mem_clocks=[2100],
-                                     supported_gr_clocks={2100: [1000, 2000, 3000]},
-                                     pwr_usage=lambda : 5000)
+    nvml.return_value.configure_mock(
+        pwr_constraints=(90000, 150000),
+        supported_mem_clocks=[2100],
+        supported_gr_clocks={2100: [1000, 2000, 3000]},
+        pwr_usage=lambda: 5000,
+    )
 
     return nvml
 
 
-@patch('kernel_tuner.observers.nvml.nvml')
+@patch("kernel_tuner.observers.nvml.nvml")
 def test_get_nvml_pwr_limits(nvml):
     nvml = setup_mock(nvml)
     result = get_nvml_pwr_limits(0, quiet=True)
-    assert result['nvml_pwr_limit'] == [90, 95, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150]
+    assert result["nvml_pwr_limit"] == [90, 95, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150]
 
     result = get_nvml_pwr_limits(0, n=5, quiet=True)
-    assert len(result['nvml_pwr_limit']) == 5
-    assert result['nvml_pwr_limit'][0] == 90
-    assert result['nvml_pwr_limit'][-1] == 150
+    assert len(result["nvml_pwr_limit"]) == 5
+    assert result["nvml_pwr_limit"][0] == 90
+    assert result["nvml_pwr_limit"][-1] == 150
 
 
-@patch('kernel_tuner.observers.nvml.nvml')
+@patch("kernel_tuner.observers.nvml.nvml")
 def test_get_nvml_gr_clocks(nvml):
     nvml = setup_mock(nvml)
     result = get_nvml_gr_clocks(0, quiet=True)
-    assert result['nvml_gr_clock'] == [1000, 2000, 3000]
+    assert result["nvml_gr_clock"] == [1000, 2000, 3000]
 
     result = get_nvml_gr_clocks(0, n=2, quiet=True)
-    assert result['nvml_gr_clock'] == [1000, 3000]
+    assert result["nvml_gr_clock"] == [1000, 3000]
 
 
-@patch('kernel_tuner.observers.nvml.nvml')
+@patch("kernel_tuner.observers.nvml.nvml")
 def test_get_nvml_mem_clocks(nvml):
     nvml = setup_mock(nvml)
     result = get_nvml_mem_clocks(0, quiet=False)
     print(result)
-    assert result['nvml_mem_clock'] == [2100]
+    assert result["nvml_mem_clock"] == [2100]
 
 
-@patch('kernel_tuner.observers.nvml.nvml')
+@patch("kernel_tuner.observers.nvml.nvml")
 def test_get_idle_power(nvml):
     nvml = setup_mock(nvml)
     result = get_idle_power(0)
     assert np.isclose(result, 5)
-
diff --git a/test/test_observers.py b/test/test_observers.py
index 97928b477..a20cec89c 100644
--- a/test/test_observers.py
+++ b/test/test_observers.py
@@ -30,6 +30,7 @@ def test_nvml_observer(env):
     assert "temperature" in result[0]
     assert result[0]["temperature"] > 0
 
+
 @skip_if_no_pycuda
 def test_custom_observer(env):
     env[-1]["block_size_x"] = [128]
@@ -43,34 +44,39 @@ def get_results(self):
     assert "name" in result[0]
     assert len(result[0]["name"]) > 0
 
+
 @skip_if_no_pycuda
 def test_register_observer_pycuda(env):
-    result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang='CUDA')
+    result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang="CUDA")
     assert "num_regs" in result[0]
     assert result[0]["num_regs"] > 0
 
+
 @skip_if_no_cupy
 def test_register_observer_cupy(env):
-    result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang='CuPy')
+    result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang="CuPy")
     assert "num_regs" in result[0]
     assert result[0]["num_regs"] > 0
 
+
 @skip_if_no_cuda
 def test_register_observer_nvcuda(env):
-    result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang='NVCUDA')
+    result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang="NVCUDA")
     assert "num_regs" in result[0]
     assert result[0]["num_regs"] > 0
 
+
 @skip_if_no_opencl
 def test_register_observer_opencl(env_opencl):
     with raises(NotImplementedError) as err:
-        kernel_tuner.tune_kernel(*env_opencl, observers=[RegisterObserver()], lang='OpenCL')
+        kernel_tuner.tune_kernel(*env_opencl, observers=[RegisterObserver()], lang="OpenCL")
     assert err.errisinstance(NotImplementedError)
     assert "OpenCL" in str(err.value)
 
+
 @skip_if_no_hip
 def test_register_observer_hip(env_hip):
     with raises(NotImplementedError) as err:
-        kernel_tuner.tune_kernel(*env_hip, observers=[RegisterObserver()], lang='HIP')
+        kernel_tuner.tune_kernel(*env_hip, observers=[RegisterObserver()], lang="HIP")
     assert err.errisinstance(NotImplementedError)
     assert "Hip" in str(err.value)
diff --git a/test/test_opencl_functions.py b/test/test_opencl_functions.py
index 644c5dc08..5fafb0b03 100644
--- a/test/test_opencl_functions.py
+++ b/test/test_opencl_functions.py
@@ -15,7 +15,6 @@
 
 @skip_if_no_opencl
 def test_ready_argument_list():
-
     size = 1000
     a = np.int32(75)
     b = np.random.randn(size).astype(np.float32)
@@ -36,7 +35,6 @@ def test_ready_argument_list():
 
 @skip_if_no_opencl
 def test_compile():
-
     original_kernel = """
     __kernel void sum(__global const float *a_g, __global const float *b_g, __global float *res_g) {
         int gid = get_global_id(0);
@@ -58,13 +56,13 @@ def test_compile():
 
 @skip_if_no_opencl
 def test_run_kernel():
-
     threads = (1, 2, 3)
     grid = (4, 5, 1)
 
     def test_func(queue, global_size, local_size, arg):
         assert all(global_size == np.array([4, 10, 3]))
-        return type('Event', (object,), {'wait': lambda self: 0})()
+        return type("Event", (object,), {"wait": lambda self: 0})()
+
     dev = opencl.OpenCLFunctions(0)
     dev.run_kernel(test_func, [0], threads, grid)
 
diff --git a/test/test_parallel_tuning.py b/test/test_parallel_tuning.py
new file mode 100644
index 000000000..a169c3ddc
--- /dev/null
+++ b/test/test_parallel_tuning.py
@@ -0,0 +1,44 @@
+import numpy as np
+import pytest
+import logging
+import sys
+
+from kernel_tuner import tune_kernel
+from kernel_tuner.backends import nvcuda
+from kernel_tuner.core import KernelInstance, KernelSource
+from .context import skip_if_no_pycuda
+
+try:
+    import pycuda.driver
+except Exception:
+    pass
+
+
+@pytest.fixture
+def env():
+    kernel_string = """
+    extern "C" __global__ void vector_add(float *c, float *a, float *b, int n) {
+        int i = blockIdx.x * block_size_x + threadIdx.x;
+        if (i<n) {
+            c[i] = a[i] + b[i];
+        }
+    }
+    """
+
+    size = 100
+    a = np.random.randn(size).astype(np.float32)
+    b = np.random.randn(size).astype(np.float32)
+    c = np.zeros_like(b)
+    n = np.int32(size)
+
+    args = [c, a, b, n]
+    tune_params = dict()
+    tune_params["block_size_x"] = [128 + 64 * i for i in range(15)]
+
+    return ["vector_add", kernel_string, size, args, tune_params]
+
+
+@skip_if_no_pycuda
+def test_parallel_tune_kernel(env):
+    result, _ = tune_kernel(*env, lang="CUDA", verbose=True, parallel_mode=True)
+    assert len(result) > 0
diff --git a/test/test_pycuda_functions.py b/test/test_pycuda_functions.py
index 3581a43dd..02da8dcec 100644
--- a/test/test_pycuda_functions.py
+++ b/test/test_pycuda_functions.py
@@ -13,7 +13,6 @@
 
 @skip_if_no_pycuda
 def test_ready_argument_list():
-
     size = 1000
     a = np.int32(75)
     b = np.random.randn(size).astype(np.float32)
@@ -33,7 +32,6 @@ def test_ready_argument_list():
 
 @skip_if_no_pycuda
 def test_compile():
-
     kernel_string = """
     __global__ void vector_add(float *c, float *a, float *b, int n) {
         int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -55,5 +53,3 @@ def test_compile():
 
 def dummy_func(a, b, block=0, grid=0, stream=None, shared=0, texrefs=None):
     pass
-
-
diff --git a/test/test_pycuda_mocked.py b/test/test_pycuda_mocked.py
index 6bdfeef07..2d94ca4aa 100644
--- a/test/test_pycuda_mocked.py
+++ b/test/test_pycuda_mocked.py
@@ -11,19 +11,21 @@
 
 def setup_mock(drv):
     context = Mock()
-    devprops = {'MAX_THREADS_PER_BLOCK': 1024,
-                'COMPUTE_CAPABILITY_MAJOR': 5,
-                'COMPUTE_CAPABILITY_MINOR': 5,}
+    devprops = {
+        "MAX_THREADS_PER_BLOCK": 1024,
+        "COMPUTE_CAPABILITY_MAJOR": 5,
+        "COMPUTE_CAPABILITY_MINOR": 5,
+    }
     context.return_value.get_device.return_value.get_attributes.return_value = devprops
     context.return_value.get_device.return_value.compute_capability.return_value = "55"
     drv.Device.return_value.retain_primary_context.return_value = context()
-    drv.mem_alloc.return_value = 'mem_alloc'
+    drv.mem_alloc.return_value = "mem_alloc"
     return drv
 
 
-@patch('kernel_tuner.backends.pycuda.nvml')
-@patch('kernel_tuner.backends.pycuda.DynamicSourceModule')
-@patch('kernel_tuner.backends.pycuda.drv')
+@patch("kernel_tuner.backends.pycuda.nvml")
+@patch("kernel_tuner.backends.pycuda.DynamicSourceModule")
+@patch("kernel_tuner.backends.pycuda.drv")
 def test_ready_argument_list(drv, *args):
     drv = setup_mock(drv)
 
@@ -39,21 +41,20 @@ def test_ready_argument_list(drv, *args):
     print(gpu_args)
 
     drv.mem_alloc.assert_called_once_with(20)
-    drv.memcpy_htod.assert_called_once_with('mem_alloc', b)
+    drv.memcpy_htod.assert_called_once_with("mem_alloc", b)
 
     assert isinstance(gpu_args[0], np.int32)
 
 
-@patch('kernel_tuner.backends.pycuda.nvml')
-@patch('kernel_tuner.backends.pycuda.DynamicSourceModule')
-@patch('kernel_tuner.backends.pycuda.drv')
+@patch("kernel_tuner.backends.pycuda.nvml")
+@patch("kernel_tuner.backends.pycuda.DynamicSourceModule")
+@patch("kernel_tuner.backends.pycuda.drv")
 def test_compile(drv, *args):
-
     # setup mocked stuff
     drv = setup_mock(drv)
     dev = pycuda.PyCudaFunctions(0)
     dev.source_mod = Mock()
-    dev.source_mod.return_value.get_function.return_value = 'func'
+    dev.source_mod.return_value.get_function.return_value = "func"
 
     # call compile
     kernel_string = "__global__ void vector_add()"
@@ -65,45 +66,45 @@ def test_compile(drv, *args):
     # verify behavior
     assert dev.source_mod.call_count == 1
     assert dev.current_module is dev.source_mod.return_value
-    assert func == 'func'
+    assert func == "func"
 
     assert kernel_string == list(dev.source_mod.mock_calls[0])[1][0]
     optional_args = list(dev.source_mod.mock_calls[0])[2]
-    assert optional_args['code'] == 'sm_55'
-    assert optional_args['arch'] == 'compute_55'
+    assert optional_args["code"] == "sm_55"
+    assert optional_args["arch"] == "compute_55"
 
 
 def dummy_func(a, b, block=0, grid=0, shared=0, stream=None, texrefs=None):
     pass
 
 
-@patch('kernel_tuner.backends.pycuda.nvml')
-@patch('kernel_tuner.backends.pycuda.DynamicSourceModule')
-@patch('kernel_tuner.backends.pycuda.drv')
+@patch("kernel_tuner.backends.pycuda.nvml")
+@patch("kernel_tuner.backends.pycuda.DynamicSourceModule")
+@patch("kernel_tuner.backends.pycuda.drv")
 def test_copy_constant_memory_args(drv, *args):
     drv = setup_mock(drv)
 
     fake_array = np.zeros(10).astype(np.float32)
-    cmem_args = {'fake_array': fake_array}
+    cmem_args = {"fake_array": fake_array}
 
     dev = pycuda.PyCudaFunctions(0)
     dev.current_module = Mock()
-    dev.current_module.get_global.return_value = ['get_global']
+    dev.current_module.get_global.return_value = ["get_global"]
 
     dev.copy_constant_memory_args(cmem_args)
 
-    drv.memcpy_htod.assert_called_once_with('get_global', fake_array)
-    dev.current_module.get_global.assert_called_once_with('fake_array')
+    drv.memcpy_htod.assert_called_once_with("get_global", fake_array)
+    dev.current_module.get_global.assert_called_once_with("fake_array")
 
 
-@patch('kernel_tuner.backends.pycuda.nvml')
-@patch('kernel_tuner.backends.pycuda.DynamicSourceModule')
-@patch('kernel_tuner.backends.pycuda.drv')
+@patch("kernel_tuner.backends.pycuda.nvml")
+@patch("kernel_tuner.backends.pycuda.DynamicSourceModule")
+@patch("kernel_tuner.backends.pycuda.drv")
 def test_copy_texture_memory_args(drv, *args):
     drv = setup_mock(drv)
 
     fake_array = np.zeros(10).astype(np.float32)
-    texmem_args = {'fake_tex': fake_array}
+    texmem_args = {"fake_tex": fake_array}
 
     texref = Mock()
 
@@ -114,13 +115,13 @@ def test_copy_texture_memory_args(drv, *args):
     dev.copy_texture_memory_args(texmem_args)
 
     drv.matrix_to_texref.assert_called_once_with(fake_array, texref, order="C")
-    dev.current_module.get_texref.assert_called_once_with('fake_tex')
+    dev.current_module.get_texref.assert_called_once_with("fake_tex")
 
-    texmem_args = {'fake_tex2': {'array': fake_array, 'filter_mode': 'linear', 'address_mode': ['border', 'clamp']}}
+    texmem_args = {"fake_tex2": {"array": fake_array, "filter_mode": "linear", "address_mode": ["border", "clamp"]}}
 
     dev.copy_texture_memory_args(texmem_args)
     drv.matrix_to_texref.assert_called_with(fake_array, texref, order="C")
-    dev.current_module.get_texref.assert_called_with('fake_tex2')
+    dev.current_module.get_texref.assert_called_with("fake_tex2")
     texref.set_filter_mode.assert_called_once_with(drv.filter_mode.LINEAR)
     texref.set_address_mode.assert_any_call(0, drv.address_mode.BORDER)
-    texref.set_address_mode.assert_any_call(1, drv.address_mode.CLAMP)
\ No newline at end of file
+    texref.set_address_mode.assert_any_call(1, drv.address_mode.CLAMP)
diff --git a/test/test_runners.py b/test/test_runners.py
index 527c1d252..01eebd0e2 100644
--- a/test/test_runners.py
+++ b/test/test_runners.py
@@ -10,8 +10,7 @@
 
 from .context import skip_if_no_pycuda
 
-cache_filename = os.path.dirname(
-    os.path.realpath(__file__)) + "/test_cache_file.json"
+cache_filename = os.path.dirname(os.path.realpath(__file__)) + "/test_cache_file.json"
 
 
 @pytest.fixture
@@ -40,7 +39,6 @@ def env():
 
 @skip_if_no_pycuda
 def test_sequential_runner_alt_block_size_names(env):
-
     kernel_string = """__global__ void vector_add(float *c, float *a, float *b, int n) {
         int i = blockIdx.x * block_dim_x + threadIdx.x;
         if (i<n) {
@@ -49,11 +47,7 @@ def test_sequential_runner_alt_block_size_names(env):
     }
     """
 
-    tune_params = {
-        "block_dim_x": [128 + 64 * i for i in range(5)],
-        "block_size_y": [1],
-        "block_size_z": [1]
-    }
+    tune_params = {"block_dim_x": [128 + 64 * i for i in range(5)], "block_size_y": [1], "block_size_z": [1]}
 
     env[1] = kernel_string
     env[-1] = tune_params
@@ -63,25 +57,24 @@ def test_sequential_runner_alt_block_size_names(env):
 
     block_size_names = ["block_dim_x"]
 
-    result, _ = tune_kernel(*env,
-                            grid_div_x=["block_dim_x"],
-                            answer=answer,
-                            block_size_names=block_size_names, objective='time', objective_higher_is_better=False)
+    result, _ = tune_kernel(
+        *env,
+        grid_div_x=["block_dim_x"],
+        answer=answer,
+        block_size_names=block_size_names,
+        objective="time",
+        objective_higher_is_better=False
+    )
 
     assert len(result) == len(tune_params["block_dim_x"])
 
 
 @skip_if_no_pycuda
 def test_smem_args(env):
-    result, _ = tune_kernel(*env,
-                            smem_args=dict(size="block_size_x*4"),
-                            verbose=True)
+    result, _ = tune_kernel(*env, smem_args=dict(size="block_size_x*4"), verbose=True)
     tune_params = env[-1]
     assert len(result) == len(tune_params["block_size_x"])
-    result, _ = tune_kernel(
-        *env,
-        smem_args=dict(size=lambda p: p['block_size_x'] * 4),
-        verbose=True)
+    result, _ = tune_kernel(*env, smem_args=dict(size=lambda p: p["block_size_x"] * 4), verbose=True)
     tune_params = env[-1]
     assert len(result) == len(tune_params["block_size_x"])
 
@@ -89,10 +82,7 @@ def test_smem_args(env):
 @skip_if_no_pycuda
 def test_build_cache(env):
     if not os.path.isfile(cache_filename):
-        result, _ = tune_kernel(*env,
-                                cache=cache_filename,
-                                verbose=False,
-                                quiet=True)
+        result, _ = tune_kernel(*env, cache=cache_filename, verbose=False, quiet=True)
         tune_params = env[-1]
         assert len(result) == len(tune_params["block_size_x"])
 
@@ -100,17 +90,18 @@ def test_build_cache(env):
 def test_simulation_runner(env):
     kernel_name, kernel_string, size, args, tune_params = env
     start = time.perf_counter()
-    result, res_env = tune_kernel(*env,
-                                  cache=cache_filename,
-                                  strategy="random_sample",
-                                  simulation_mode=True,
-                                  strategy_options=dict(fraction=1))
+    result, res_env = tune_kernel(
+        *env, cache=cache_filename, strategy="random_sample", simulation_mode=True, strategy_options=dict(fraction=1)
+    )
     actual_time = (time.perf_counter() - start) * 1e3  # ms
     assert len(result) == len(tune_params["block_size_x"])
 
     timings = [
-        'total_framework_time', 'total_strategy_time', 'total_compile_time',
-        'total_benchmark_time', 'overhead_time'
+        "total_framework_time",
+        "total_strategy_time",
+        "total_compile_time",
+        "total_benchmark_time",
+        "overhead_time",
     ]
 
     # ensure all keys are there and non zero
@@ -123,20 +114,22 @@ def test_simulation_runner(env):
 
     # ensure recorded time is sensible number
     recorded_time_including_simulation = sum(res_env[key] for key in timings)
-    assert recorded_time_including_simulation - res_env['simulated_time'] > 0
+    assert recorded_time_including_simulation - res_env["simulated_time"] > 0
 
     # ensure difference between recorded time and actual time + simulated less then 10ms
-    max_time = actual_time + res_env['simulated_time']
+    max_time = actual_time + res_env["simulated_time"]
     assert max_time - recorded_time_including_simulation < 10
 
 
 def test_diff_evo(env):
-    result, _ = tune_kernel(*env,
-                            strategy="diff_evo",
-                            strategy_options=dict(popsize=5),
-                            verbose=True,
-                            cache=cache_filename,
-                            simulation_mode=True)
+    result, _ = tune_kernel(
+        *env,
+        strategy="diff_evo",
+        strategy_options=dict(popsize=5),
+        verbose=True,
+        cache=cache_filename,
+        simulation_mode=True
+    )
     assert len(result) > 0
 
 
@@ -145,24 +138,20 @@ def test_time_keeping(env):
     kernel_name, kernel_string, size, args, tune_params = env
     answer = [args[1] + args[2], None, None, None]
 
-    options = dict(method="uniform",
-                   popsize=10,
-                   maxiter=1,
-                   mutation_chance=1,
-                   max_fevals=10)
+    options = dict(method="uniform", popsize=10, maxiter=1, mutation_chance=1, max_fevals=10)
     start = time.perf_counter()
-    result, env = tune_kernel(*env,
-                              strategy="genetic_algorithm",
-                              strategy_options=options,
-                              verbose=True,
-                              answer=answer)
+    result, env = tune_kernel(*env, strategy="genetic_algorithm", strategy_options=options, verbose=True, answer=answer)
     max_time = (time.perf_counter() - start) * 1e3  # ms
 
     assert len(result) >= 10
 
     timings = [
-        'total_framework_time', 'total_strategy_time', 'total_compile_time',
-        'total_verification_time', 'total_benchmark_time', 'overhead_time'
+        "total_framework_time",
+        "total_strategy_time",
+        "total_compile_time",
+        "total_verification_time",
+        "total_benchmark_time",
+        "overhead_time",
     ]
 
     # ensure all keys are there and non zero
@@ -178,32 +167,29 @@ def test_time_keeping(env):
 
 
 def test_bayesian_optimization(env):
-    for method in [
-            "poi", "ei", "lcb", "lcb-srinivas", "multi", "multi-advanced",
-            "multi-fast"
-    ]:
+    for method in ["poi", "ei", "lcb", "lcb-srinivas", "multi", "multi-advanced", "multi-fast"]:
         print(method, flush=True)
         options = dict(popsize=5, max_fevals=10, method=method)
-        result, _ = tune_kernel(*env,
-                                strategy="bayes_opt",
-                                strategy_options=options,
-                                verbose=True,
-                                cache=cache_filename,
-                                simulation_mode=True)
+        result, _ = tune_kernel(
+            *env,
+            strategy="bayes_opt",
+            strategy_options=options,
+            verbose=True,
+            cache=cache_filename,
+            simulation_mode=True
+        )
         assert len(result) > 0
 
 
 def test_random_sample(env):
-    result, _ = tune_kernel(*env,
-                            strategy="random_sample",
-                            strategy_options={"fraction": 0.1},
-                            cache=cache_filename,
-                            simulation_mode=True)
+    result, _ = tune_kernel(
+        *env, strategy="random_sample", strategy_options={"fraction": 0.1}, cache=cache_filename, simulation_mode=True
+    )
     # check that number of benchmarked kernels is 10% (rounded up)
     assert len(result) == 2
     # check all returned results make sense
     for v in result:
-        assert v['time'] > 0.0 and v['time'] < 1.0
+        assert v["time"] > 0.0 and v["time"] < 1.0
 
 
 @skip_if_no_pycuda
@@ -230,29 +216,18 @@ def test_interface_handles_compile_failures(env):
     }
     """
 
-    results, env = tune_kernel(kernel_name,
-                               kernel_string,
-                               size,
-                               args,
-                               tune_params,
-                               verbose=True)
+    results, env = tune_kernel(kernel_name, kernel_string, size, args, tune_params, verbose=True)
 
-    failed_config = [
-        record for record in results if record["block_size_x"] == 256
-    ][0]
+    failed_config = [record for record in results if record["block_size_x"] == 256][0]
     assert isinstance(failed_config["time"], util.CompilationFailedConfig)
 
 
 @skip_if_no_pycuda
 def test_runner(env):
-
     kernel_name, kernel_source, problem_size, arguments, tune_params = env
 
     # create KernelSource
-    kernelsource = core.KernelSource(kernel_name,
-                                     kernel_source,
-                                     lang=None,
-                                     defines=None)
+    kernelsource = core.KernelSource(kernel_name, kernel_source, lang=None, defines=None)
 
     # create option bags
     device = 0
@@ -263,20 +238,13 @@ def test_runner(env):
     objective = "GFLOP/s"
     metrics = dict({objective: lambda p: 1})
     opts = locals()
-    kernel_options = Options([(k, opts.get(k, None))
-                              for k in _kernel_options.keys()])
-    tuning_options = Options([(k, opts.get(k, None))
-                              for k in _tuning_options.keys()])
-    device_options = Options([(k, opts.get(k, None))
-                              for k in _device_options.keys()])
+    kernel_options = Options([(k, opts.get(k, None)) for k in _kernel_options.keys()])
+    tuning_options = Options([(k, opts.get(k, None)) for k in _tuning_options.keys()])
+    device_options = Options([(k, opts.get(k, None)) for k in _device_options.keys()])
     tuning_options.cachefile = None
 
     # create runner
-    runner = SequentialRunner(kernelsource,
-                              kernel_options,
-                              device_options,
-                              iterations,
-                              observers=None)
+    runner = SequentialRunner(kernelsource, kernel_options, device_options, iterations, observers=None)
     runner.warmed_up = True  # disable warm up for this test
 
     # select a config to run
@@ -285,12 +253,11 @@ def test_runner(env):
     # insert configurations to run with this runner in this list
     # each configuration is described as a list of values, one for each tunable parameter
     # the order should correspond to the order of parameters specified in tune_params
-    searchspace.append(
-        [32])  # vector_add only has one tunable parameter (block_size_x)
+    searchspace.append([32])  # vector_add only has one tunable parameter (block_size_x)
 
     # call the runner
     results = runner.run(searchspace, tuning_options)
 
     assert len(results) == 1
-    assert results[0]['block_size_x'] == 32
-    assert len(results[0]['times']) == iterations
+    assert results[0]["block_size_x"] == 32
+    assert len(results[0]["times"]) == iterations
diff --git a/test/test_searchspace.py b/test/test_searchspace.py
index 8672c1d03..f31b052ba 100644
--- a/test/test_searchspace.py
+++ b/test/test_searchspace.py
@@ -37,6 +37,7 @@
 
 # each GPU must have at least one layer and the sum of all layers must not exceed the total number of layers
 
+
 def _min_func(gpu1, gpu2, gpu3, gpu4):
     return min([gpu1, gpu2, gpu3, gpu4]) >= 1
 
@@ -79,12 +80,13 @@ def test_internal_representation():
     for index, dict_config in enumerate(searchspace.get_list_dict().keys()):
         assert dict_config == searchspace.list[index]
 
+
 def test_check_restrictions():
     """Test whether the outcome of restrictions is as expected when using check_restrictions."""
     from kernel_tuner.util import check_restrictions
 
-    param_config_false = {'x': 1, 'y': 4, 'z': "string_1" }
-    param_config_true = {'x': 3, 'y': 4, 'z': "string_1" }
+    param_config_false = {"x": 1, "y": 4, "z": "string_1"}
+    param_config_true = {"x": 3, "y": 4, "z": "string_1"}
 
     assert check_restrictions(simple_searchspace.restrictions, param_config_false, verbose=False) is False
     assert check_restrictions(simple_searchspace.restrictions, param_config_true, verbose=False) is True
@@ -95,12 +97,11 @@ def test_against_bruteforce():
     compare_two_searchspace_objects(simple_searchspace, simple_searchspace_bruteforce)
     compare_two_searchspace_objects(searchspace, searchspace_bruteforce)
 
+
 def test_sort():
     """Test that the sort searchspace option works as expected."""
     simple_searchspace_sort = Searchspace(
-        simple_tuning_options.tune_params,
-        simple_tuning_options.restrictions,
-        max_threads
+        simple_tuning_options.tune_params, simple_tuning_options.restrictions, max_threads
     )
 
     expected = [
@@ -130,9 +131,7 @@ def test_sort():
 def test_sort_reversed():
     """Test that the sort searchspace option with the sort_last_param_first option enabled works as expected."""
     simple_searchspace_sort_reversed = Searchspace(
-        simple_tuning_options.tune_params,
-        simple_tuning_options.restrictions,
-        max_threads
+        simple_tuning_options.tune_params, simple_tuning_options.restrictions, max_threads
     )
 
     expected = [
@@ -200,7 +199,9 @@ def test_random_sample():
         print(value_error_expectation_message)
         assert False
     except ValueError as e:
-        assert "number of samples requested" in str(e) and "is greater than the searchspace size" in str(e), f"Expected string not in error {e}"
+        assert "number of samples requested" in str(e) and "is greater than the searchspace size" in str(
+            e
+        ), f"Expected string not in error {e}"
     except Exception:
         print(value_error_expectation_message)
         assert False
@@ -238,8 +239,8 @@ def test_neighbors_hamming():
     """Test whether the neighbors with Hamming distance are as expected."""
     test_config = tuple([1, 4, "string_1"])
     expected_neighbors = [
-        (1.5, 4, 'string_1'),
-        (3, 4, 'string_1'),
+        (1.5, 4, "string_1"),
+        (3, 4, "string_1"),
     ]
 
     __test_neighbors(test_config, expected_neighbors, "Hamming")
@@ -249,10 +250,10 @@ def test_neighbors_strictlyadjacent():
     """Test whether the strictly adjacent neighbors are as expected."""
     test_config = tuple([1, 4, "string_1"])
     expected_neighbors = [
-        (1.5, 4, 'string_1'),
-        (1.5, 4, 'string_2'),
-        (1.5, 5.5, 'string_1'),
-        (1.5, 5.5, 'string_2'),
+        (1.5, 4, "string_1"),
+        (1.5, 4, "string_2"),
+        (1.5, 5.5, "string_1"),
+        (1.5, 5.5, "string_2"),
     ]
 
     __test_neighbors(test_config, expected_neighbors, "strictly-adjacent")
@@ -262,10 +263,10 @@ def test_neighbors_adjacent():
     """Test whether the adjacent neighbors are as expected."""
     test_config = tuple([1, 4, "string_1"])
     expected_neighbors = [
-        (1.5, 4, 'string_1'),
-        (1.5, 4, 'string_2'),
-        (1.5, 5.5, 'string_1'),
-        (1.5, 5.5, 'string_2'),
+        (1.5, 4, "string_1"),
+        (1.5, 4, "string_2"),
+        (1.5, 5.5, "string_1"),
+        (1.5, 5.5, "string_2"),
     ]
 
     __test_neighbors(test_config, expected_neighbors, "adjacent")
@@ -275,22 +276,18 @@ def test_neighbors_fictious():
     """Test whether the neighbors are as expected for a fictious parameter configuration (i.e. not existing in the search space due to restrictions)."""
     test_config = tuple([1.5, 4, "string_1"])
     expected_neighbors_hamming = [
-        (1.5, 4, 'string_2'),
-        (1.5, 5.5, 'string_1'),
-        (3, 4, 'string_1'),
-    ]
-    expected_neighbors_strictlyadjacent = [
-        (1.5, 5.5, 'string_2'),
-        (1.5, 5.5, 'string_1'),
-        (1.5, 4, 'string_2')
+        (1.5, 4, "string_2"),
+        (1.5, 5.5, "string_1"),
+        (3, 4, "string_1"),
     ]
+    expected_neighbors_strictlyadjacent = [(1.5, 5.5, "string_2"), (1.5, 5.5, "string_1"), (1.5, 4, "string_2")]
 
     expected_neighbors_adjacent = [
-        (1.5, 5.5, 'string_2'),
-        (1.5, 5.5, 'string_1'),
-        (1.5, 4, 'string_2'),
-        (3, 4, 'string_1'),
-        (3, 4, 'string_2'),
+        (1.5, 5.5, "string_2"),
+        (1.5, 5.5, "string_1"),
+        (1.5, 4, "string_2"),
+        (3, 4, "string_1"),
+        (3, 4, "string_2"),
     ]
 
     __test_neighbors_direct(test_config, expected_neighbors_hamming, "Hamming")
@@ -301,10 +298,7 @@ def test_neighbors_fictious():
 def test_neighbors_cached():
     """Test whether retrieving a set of neighbors twice returns the cached version."""
     simple_searchspace_duplicate = Searchspace(
-        simple_tuning_options.tune_params,
-        simple_tuning_options.restrictions,
-        max_threads,
-        neighbor_method="Hamming"
+        simple_tuning_options.tune_params, simple_tuning_options.restrictions, max_threads, neighbor_method="Hamming"
     )
 
     test_configs = simple_searchspace_duplicate.get_random_sample(5)
@@ -333,12 +327,7 @@ def test_order_param_configs():
     """Test whether the ordering of parameter configurations according to parameter index happens as expected."""
     test_order = [1, 2, 0]
     test_config = tuple([1, 4, "string_1"])
-    expected_order = [
-        (1.5, 5.5, 'string_2'),
-        (1.5, 4, 'string_2'),
-        (1.5, 4, 'string_1'),
-        (1.5, 5.5, 'string_1')
-    ]
+    expected_order = [(1.5, 5.5, "string_2"), (1.5, 4, "string_2"), (1.5, 4, "string_1"), (1.5, 5.5, "string_1")]
     neighbors = simple_searchspace.get_neighbors_no_cache(test_config, "adjacent")
 
     # test failsafe too few indices
@@ -391,9 +380,9 @@ def test_small_searchspace():
     """Test a small real-world searchspace and the usage of the `max_threads` parameter."""
     max_threads = 1024
     tune_params = dict()
-    tune_params["block_size_x"] = [1, 2, 4, 8, 16] + [32*i for i in range(1,33)]
+    tune_params["block_size_x"] = [1, 2, 4, 8, 16] + [32 * i for i in range(1, 33)]
     tune_params["block_size_y"] = [2**i for i in range(6)]
-    tune_params["tile_size_x"] = [i for i in range(1,11)]
+    tune_params["tile_size_x"] = [i for i in range(1, 11)]
     restrictions = [
         "block_size_x*block_size_y >= 32",
         f"block_size_x*block_size_y <= {max_threads}",
@@ -402,42 +391,45 @@ def test_small_searchspace():
     searchspace_bruteforce = Searchspace(tune_params, restrictions, max_threads, framework="bruteforce")
     compare_two_searchspace_objects(searchspace, searchspace_bruteforce)
 
+
 def test_full_searchspace(compare_against_bruteforce=False):
     """Tests a full real-world searchspace (expdist). If `compare_against_bruteforce`, the searcspace will be bruteforced to compare against, this can take a long time!."""
     # device characteristics
     dev = {
-        'device_name': 'NVIDIA A40',
-        'max_threads': 1024,
-        'max_shared_memory_per_block': 49152,
-        'max_shared_memory': 102400
+        "device_name": "NVIDIA A40",
+        "max_threads": 1024,
+        "max_shared_memory_per_block": 49152,
+        "max_shared_memory": 102400,
     }
 
     # tunable parameters and restrictions
     tune_params = dict()
-    tune_params["block_size_x"] = [1, 2, 4, 8, 16] + [32*i for i in range(1,33)]
+    tune_params["block_size_x"] = [1, 2, 4, 8, 16] + [32 * i for i in range(1, 33)]
     tune_params["block_size_y"] = [2**i for i in range(6)]
-    tune_params["tile_size_x"] = [i for i in range(1,11)]
-    tune_params["tile_size_y"] = [i for i in range(1,11)]
-    tune_params["temporal_tiling_factor"] = [i for i in range(1,11)]
+    tune_params["tile_size_x"] = [i for i in range(1, 11)]
+    tune_params["tile_size_y"] = [i for i in range(1, 11)]
+    tune_params["temporal_tiling_factor"] = [i for i in range(1, 11)]
     max_tfactor = max(tune_params["temporal_tiling_factor"])
     tune_params["max_tfactor"] = [max_tfactor]
-    tune_params["loop_unroll_factor_t"] = [i for i in range(1,max_tfactor+1)]
-    tune_params["sh_power"] = [0,1]
-    tune_params["blocks_per_sm"] = [0,1,2,3,4]
+    tune_params["loop_unroll_factor_t"] = [i for i in range(1, max_tfactor + 1)]
+    tune_params["sh_power"] = [0, 1]
+    tune_params["blocks_per_sm"] = [0, 1, 2, 3, 4]
 
     restrictions = [
-            "block_size_x*block_size_y >= 32",
-            "temporal_tiling_factor % loop_unroll_factor_t == 0",
-            f"block_size_x*block_size_y <= {dev['max_threads']}",
-            f"(block_size_x*tile_size_x + temporal_tiling_factor * 2) * (block_size_y*tile_size_y + temporal_tiling_factor * 2) * (2+sh_power) * 4 <= {dev['max_shared_memory_per_block']}",
-            f"blocks_per_sm == 0 or (((block_size_x*tile_size_x + temporal_tiling_factor * 2) * (block_size_y*tile_size_y + temporal_tiling_factor * 2) * (2+sh_power) * 4) * blocks_per_sm <= {dev['max_shared_memory']})"
-        ]
+        "block_size_x*block_size_y >= 32",
+        "temporal_tiling_factor % loop_unroll_factor_t == 0",
+        f"block_size_x*block_size_y <= {dev['max_threads']}",
+        f"(block_size_x*tile_size_x + temporal_tiling_factor * 2) * (block_size_y*tile_size_y + temporal_tiling_factor * 2) * (2+sh_power) * 4 <= {dev['max_shared_memory_per_block']}",
+        f"blocks_per_sm == 0 or (((block_size_x*tile_size_x + temporal_tiling_factor * 2) * (block_size_y*tile_size_y + temporal_tiling_factor * 2) * (2+sh_power) * 4) * blocks_per_sm <= {dev['max_shared_memory']})",
+    ]
 
     # build the searchspace
-    searchspace = Searchspace(tune_params, restrictions, max_threads=dev['max_threads'])
+    searchspace = Searchspace(tune_params, restrictions, max_threads=dev["max_threads"])
 
     if compare_against_bruteforce:
-        searchspace_bruteforce = Searchspace(tune_params, restrictions, max_threads=dev['max_threads'], framework='bruteforce')
+        searchspace_bruteforce = Searchspace(
+            tune_params, restrictions, max_threads=dev["max_threads"], framework="bruteforce"
+        )
         compare_two_searchspace_objects(searchspace, searchspace_bruteforce)
     else:
         assert searchspace.size == len(searchspace.list) == 349853
diff --git a/test/test_util_functions.py b/test/test_util_functions.py
index f3431991b..e90da8965 100644
--- a/test/test_util_functions.py
+++ b/test/test_util_functions.py
@@ -35,17 +35,13 @@ def test_get_grid_dimensions1():
     assert grid[1] == 28
     assert grid[2] == 1
 
-    grid = get_grid_dimensions(
-        problem_size, params, (grid_div[0], None, None), block_size_names
-    )
+    grid = get_grid_dimensions(problem_size, params, (grid_div[0], None, None), block_size_names)
 
     assert grid[0] == 25
     assert grid[1] == 1024
     assert grid[2] == 1
 
-    grid = get_grid_dimensions(
-        problem_size, params, (None, grid_div[1], None), block_size_names
-    )
+    grid = get_grid_dimensions(problem_size, params, (None, grid_div[1], None), block_size_names)
 
     assert grid[0] == 1024
     assert grid[1] == 28
@@ -67,9 +63,7 @@ def test_get_grid_dimensions2():
     grid_div_x = ["block_x*8"]
     grid_div_y = ["(block_y+2)/8"]
 
-    grid = get_grid_dimensions(
-        problem_size, params, (grid_div_x, grid_div_y, None), block_size_names
-    )
+    grid = get_grid_dimensions(problem_size, params, (grid_div_x, grid_div_y, None), block_size_names)
 
     assert grid[0] == 4
     assert grid[1] == 256
@@ -83,9 +77,7 @@ def test_get_grid_dimensions3():
     grid_div_y = ["(block_y+2)/8"]
 
     def assert_grid_dimensions(problem_size):
-        grid = get_grid_dimensions(
-            problem_size, params, (grid_div_x, grid_div_y, None), block_size_names
-        )
+        grid = get_grid_dimensions(problem_size, params, (grid_div_x, grid_div_y, None), block_size_names)
         assert grid[0] == 1
         assert grid[1] == 256
         assert grid[2] == 1
@@ -187,9 +179,7 @@ def test_prepare_kernel_string():
     # Throw exception on invalid name (for instance, a space in the name)
     invalid_defines = {"invalid name": "1"}
     with pytest.raises(ValueError):
-        prepare_kernel_string(
-            "this", kernel, params, grid, threads, block_size_names, "", invalid_defines
-        )
+        prepare_kernel_string("this", kernel, params, grid, threads, block_size_names, "", invalid_defines)
 
 
 def test_prepare_kernel_string_partial_loop_unrolling():
@@ -204,9 +194,7 @@ def test_prepare_kernel_string_partial_loop_unrolling():
     params = dict()
     params["loop_unroll_factor_monkey"] = 8
 
-    _, output = prepare_kernel_string(
-        "this", kernel, params, grid, threads, block_size_names, "CUDA", None
-    )
+    _, output = prepare_kernel_string("this", kernel, params, grid, threads, block_size_names, "CUDA", None)
     assert "constexpr int loop_unroll_factor_monkey = 8;" in output
 
     params["loop_unroll_factor_monkey"] = 0
@@ -214,6 +202,7 @@ def test_prepare_kernel_string_partial_loop_unrolling():
     assert "constexpr int loop_unroll_factor_monkey" not in output
     assert "#pragma unroll loop_unroll_factor_monkey" not in output
 
+
 def test_replace_param_occurrences():
     kernel = "this is a weird kernel"
     params = dict()
@@ -221,9 +210,7 @@ def test_replace_param_occurrences():
     params["weird"] = 14
 
     new_kernel = replace_param_occurrences(kernel, params)
-    assert (
-        new_kernel == "this 8 a 14 kernel"
-    )  # Note: The "is" in "this" should not be replaced
+    assert new_kernel == "this 8 a 14 kernel"  # Note: The "is" in "this" should not be replaced
 
     new_kernel = replace_param_occurrences(kernel, dict())
     assert kernel == new_kernel
@@ -351,9 +338,7 @@ def test_check_argument_list3():
         }
         """
     args = [np.uint16(42), np.float16([3, 4, 6]), np.int32([300])]
-    assert_user_warning(
-        check_argument_list, [kernel_name, kernel_string, args], "at position 2"
-    )
+    assert_user_warning(check_argument_list, [kernel_name, kernel_string, args], "at position 2")
 
 
 def test_check_argument_list4():
@@ -363,9 +348,7 @@ def test_check_argument_list4():
         }
         """
     args = [np.uint16(42), np.float16([3, 4, 6]), np.int64([300]), np.ubyte(32)]
-    assert_user_warning(
-        check_argument_list, [kernel_name, kernel_string, args], "do not match in size"
-    )
+    assert_user_warning(check_argument_list, [kernel_name, kernel_string, args], "do not match in size")
 
 
 def test_check_argument_list5():
@@ -483,18 +466,12 @@ def test_warnings(function, args, number, warning_type):
 
     # check warning does not triger when nondefault block size names are used correctly
     block_size_names = ["block_size_a", "block_size_b"]
-    tune_params = dict(
-        zip(["block_size_a", "block_size_b", "many_other_things"], [1, 2, 3])
-    )
-    test_warnings(
-        check_block_size_params_names_list, [block_size_names, tune_params], 0, None
-    )
+    tune_params = dict(zip(["block_size_a", "block_size_b", "many_other_things"], [1, 2, 3]))
+    test_warnings(check_block_size_params_names_list, [block_size_names, tune_params], 0, None)
 
     # check that a warning is issued when none of the default names are used and no alternative names are specified
     block_size_names = None
-    tune_params = dict(
-        zip(["block_size_a", "block_size_b", "many_other_things"], [1, 2, 3])
-    )
+    tune_params = dict(zip(["block_size_a", "block_size_b", "many_other_things"], [1, 2, 3]))
     test_warnings(
         check_block_size_params_names_list,
         [block_size_names, tune_params],
@@ -504,12 +481,8 @@ def test_warnings(function, args, number, warning_type):
 
     # check that no error is raised when any of the default block size names is being used
     block_size_names = None
-    tune_params = dict(
-        zip(["block_size_x", "several_other_things"], [[1, 2, 3, 4], [2, 4]])
-    )
-    test_warnings(
-        check_block_size_params_names_list, [block_size_names, tune_params], 0, None
-    )
+    tune_params = dict(zip(["block_size_x", "several_other_things"], [[1, 2, 3, 4], [2, 4]]))
+    test_warnings(check_block_size_params_names_list, [block_size_names, tune_params], 0, None)
 
 
 def test_get_kernel_string_func():
@@ -691,10 +664,7 @@ def test_process_metrics():
     # assert params["b"] == 15
 
     # test if a metric overrides any existing metrics
-    params = {
-        "x": 15,
-        "b": 12
-    }
+    params = {"x": 15, "b": 12}
     metrics = dict()
     metrics["b"] = "x"
     params = process_metrics(params, metrics)
@@ -704,7 +674,11 @@ def test_process_metrics():
 def test_parse_restrictions():
     tune_params = {"block_size_x": [50, 100], "use_padding": [0, 1]}
     restrict = ["block_size_x != 320"]
-    restrictions = ["block_size_x != 320", "use_padding == 0 or block_size_x % 32 != 0", "50 <= block_size_x * use_padding < 100"]
+    restrictions = [
+        "block_size_x != 320",
+        "use_padding == 0 or block_size_x % 32 != 0",
+        "50 <= block_size_x * use_padding < 100",
+    ]
 
     # test the monolithic parsed function
     parsed = parse_restrictions(restrict, tune_params, monolithic=True)[0]
@@ -746,11 +720,15 @@ def test_parse_restrictions():
     rw_tune_params = dict()
     rw_tune_params["tile_size_x"] = [1, 2, 3, 4, 5, 6, 7, 8]
     rw_tune_params["tile_size_y"] = [1, 2, 3, 4, 5, 6, 7, 8]
-    parsed_constraint, params_constraint = parse_restrictions(["tile_size_x*tile_size_y<30"], rw_tune_params, try_to_constraint=True)[0]
+    parsed_constraint, params_constraint = parse_restrictions(
+        ["tile_size_x*tile_size_y<30"], rw_tune_params, try_to_constraint=True
+    )[0]
     assert all(param in rw_tune_params for param in params_constraint)
     assert isinstance(parsed_constraint, MaxProdConstraint)
     assert parsed_constraint._maxprod == 29
-    parsed_constraint, params_constraint = parse_restrictions(["30<tile_size_x*tile_size_y"], rw_tune_params, try_to_constraint=True)[0]
+    parsed_constraint, params_constraint = parse_restrictions(
+        ["30<tile_size_x*tile_size_y"], rw_tune_params, try_to_constraint=True
+    )[0]
     assert all(param in rw_tune_params for param in params_constraint)
     assert isinstance(parsed_constraint, MinProdConstraint)
     assert parsed_constraint._minprod == 31