Skip to content

Commit 9739495

Browse files
committed
Reformat with black.
1 parent ceb0996 commit 9739495

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+826
-728
lines changed

kernel_tuner/accuracy.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,7 @@ def select_for_configuration(self, params):
4646

4747
if option not in self.data:
4848
list = ", ".join(map(str, self.data.keys()))
49-
raise KeyError(
50-
f"'{option}' is not a valid parameter value, should be one of: {list}"
51-
)
49+
raise KeyError(f"'{option}' is not a valid parameter value, should be one of: {list}")
5250

5351
return self.data[option]
5452

@@ -60,12 +58,14 @@ def _find_bfloat16_if_available():
6058
# Try to get bfloat16 if available.
6159
try:
6260
from bfloat16 import bfloat16
61+
6362
return bfloat16
6463
except ImportError:
6564
pass
6665

6766
try:
6867
from tensorflow import bfloat16
68+
6969
return bfloat16.as_numpy_dtype
7070
except ImportError:
7171
pass
@@ -102,9 +102,7 @@ def _to_float_dtype(x: str) -> np.dtype:
102102

103103

104104
class TunablePrecision(Tunable):
105-
def __init__(
106-
self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None
107-
):
105+
def __init__(self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None):
108106
"""The ``Tunable`` object can be used as an input argument when tuning
109107
kernels. It is a container that internally holds several arrays
110108
containing the same data, but stored in using different levels of
@@ -135,7 +133,6 @@ def __init__(
135133
if bfloat16 is not None:
136134
dtypes["bfloat16"] = bfloat16
137135

138-
139136
# If dtype is a list, convert it to a dictionary
140137
if isinstance(dtypes, (list, tuple)):
141138
dtypes = dict((name, _to_float_dtype(name)) for name in dtypes)
@@ -257,9 +254,7 @@ def metric(a, b):
257254
raise ValueError(f"invalid error metric provided: {user_key}")
258255

259256
# cast both arguments to f64 before passing them to the metric
260-
return lambda a, b: metric(
261-
a.astype(np.float64, copy=False), b.astype(np.float64, copy=False)
262-
)
257+
return lambda a, b: metric(a.astype(np.float64, copy=False), b.astype(np.float64, copy=False))
263258

264259

265260
class AccuracyObserver(OutputObserver):

kernel_tuner/backends/compiler.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
try:
3535
from hip._util.types import DeviceArray
3636
except ImportError:
37-
Pointer = Exception # using Exception here as a type that will never be among kernel arguments
37+
Pointer = Exception # using Exception here as a type that will never be among kernel arguments
3838
DeviceArray = Exception
3939

4040

@@ -157,7 +157,9 @@ def ready_argument_list(self, arguments):
157157

158158
for i, arg in enumerate(arguments):
159159
if not (isinstance(arg, (np.ndarray, np.number, DeviceArray)) or is_cupy_array(arg)):
160-
raise TypeError(f"Argument is not numpy or cupy ndarray or numpy scalar or HIP Python DeviceArray but a {type(arg)}")
160+
raise TypeError(
161+
f"Argument is not numpy or cupy ndarray or numpy scalar or HIP Python DeviceArray but a {type(arg)}"
162+
)
161163
dtype_str = arg.typestr if isinstance(arg, DeviceArray) else str(arg.dtype)
162164
if isinstance(arg, np.ndarray):
163165
if dtype_str in dtype_map.keys():
@@ -288,7 +290,7 @@ def compile(self, kernel_instance):
288290
stdout=subprocess.PIPE,
289291
stderr=subprocess.PIPE,
290292
text=True,
291-
check=True
293+
check=True,
292294
)
293295

294296
subprocess.run(
@@ -299,7 +301,7 @@ def compile(self, kernel_instance):
299301
stdout=subprocess.PIPE,
300302
stderr=subprocess.PIPE,
301303
text=True,
302-
check=True
304+
check=True,
303305
)
304306

305307
self.lib = np.ctypeslib.load_library(filename, ".")
@@ -439,7 +441,7 @@ def cleanup_lib(self):
439441
"""unload the previously loaded shared library"""
440442
if self.lib is None:
441443
return
442-
444+
443445
if not self.using_openmp and not self.using_openacc:
444446
# this if statement is necessary because shared libraries that use
445447
# OpenMP will core dump when unloaded, this is a well-known issue with OpenMP

kernel_tuner/backends/cupy.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,9 +70,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
7070
# collect environment information
7171
env = dict()
7272
cupy_info = str(cp._cupyx.get_runtime_info()).split("\n")[:-1]
73-
info_dict = {
74-
s.split(":")[0].strip(): s.split(":")[1].strip() for s in cupy_info
75-
}
73+
info_dict = {s.split(":")[0].strip(): s.split(":")[1].strip() for s in cupy_info}
7674
env["device_name"] = info_dict[f"Device {device} Name"]
7775

7876
env["cuda_version"] = cp.cuda.runtime.driverGetVersion()
@@ -129,9 +127,7 @@ def compile(self, kernel_instance):
129127

130128
options = tuple(compiler_options)
131129

132-
self.current_module = cp.RawModule(
133-
code=kernel_string, options=options, name_expressions=[kernel_name]
134-
)
130+
self.current_module = cp.RawModule(code=kernel_string, options=options, name_expressions=[kernel_name])
135131

136132
self.func = self.current_module.get_function(kernel_name)
137133
self.num_regs = self.func.num_regs

kernel_tuner/backends/nvcuda.py

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,9 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
5656
CudaFunctions.last_selected_device = device
5757

5858
# compute capabilities and device properties
59-
err, major = cudart.cudaDeviceGetAttribute(
60-
cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device
61-
)
59+
err, major = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device)
6260
cuda_error_check(err)
63-
err, minor = cudart.cudaDeviceGetAttribute(
64-
cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device
65-
)
61+
err, minor = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device)
6662
cuda_error_check(err)
6763
err, self.max_threads = cudart.cudaDeviceGetAttribute(
6864
cudart.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device
@@ -164,20 +160,14 @@ def compile(self, kernel_instance):
164160
if not any(["--std=" in opt for opt in self.compiler_options]):
165161
self.compiler_options.append("--std=c++11")
166162
if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]):
167-
compiler_options.append(
168-
f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8")
169-
)
163+
compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8"))
170164
if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]):
171165
self.compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}")
172166

173-
err, program = nvrtc.nvrtcCreateProgram(
174-
str.encode(kernel_string), b"CUDAProgram", 0, [], []
175-
)
167+
err, program = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"CUDAProgram", 0, [], [])
176168
try:
177169
cuda_error_check(err)
178-
err = nvrtc.nvrtcCompileProgram(
179-
program, len(compiler_options), compiler_options
180-
)
170+
err = nvrtc.nvrtcCompileProgram(program, len(compiler_options), compiler_options)
181171
cuda_error_check(err)
182172
err, size = nvrtc.nvrtcGetPTXSize(program)
183173
cuda_error_check(err)
@@ -189,9 +179,7 @@ def compile(self, kernel_instance):
189179
raise SkippableFailure("uses too much shared data")
190180
else:
191181
cuda_error_check(err)
192-
err, self.func = cuda.cuModuleGetFunction(
193-
self.current_module, str.encode(kernel_name)
194-
)
182+
err, self.func = cuda.cuModuleGetFunction(self.current_module, str.encode(kernel_name))
195183
cuda_error_check(err)
196184

197185
# get the number of registers per thread used in this kernel

kernel_tuner/backends/opencl.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,7 @@
1616
class OpenCLFunctions(GPUBackend):
1717
"""Class that groups the OpenCL functions and maintains some state about the device."""
1818

19-
def __init__(
20-
self, device=0, platform=0, iterations=7, compiler_options=None, observers=None
21-
):
19+
def __init__(self, device=0, platform=0, iterations=7, compiler_options=None, observers=None):
2220
"""Creates OpenCL device context and reads device properties.
2321
2422
:param device: The ID of the OpenCL device to use for benchmarking
@@ -37,14 +35,10 @@ def __init__(
3735
platforms = cl.get_platforms()
3836
self.ctx = cl.Context(devices=[platforms[platform].get_devices()[device]])
3937

40-
self.queue = cl.CommandQueue(
41-
self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE
42-
)
38+
self.queue = cl.CommandQueue(self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
4339
self.mf = cl.mem_flags
4440
# inspect device properties
45-
self.max_threads = self.ctx.devices[0].get_info(
46-
cl.device_info.MAX_WORK_GROUP_SIZE
47-
)
41+
self.max_threads = self.ctx.devices[0].get_info(cl.device_info.MAX_WORK_GROUP_SIZE)
4842
self.compiler_options = compiler_options or []
4943

5044
# observer stuff
@@ -108,9 +102,7 @@ def compile(self, kernel_instance):
108102
:returns: An OpenCL kernel that can be called directly.
109103
:rtype: pyopencl.Kernel
110104
"""
111-
prg = cl.Program(self.ctx, kernel_instance.kernel_string).build(
112-
options=self.compiler_options
113-
)
105+
prg = cl.Program(self.ctx, kernel_instance.kernel_string).build(options=self.compiler_options)
114106
func = getattr(prg, kernel_instance.name)
115107
return func
116108

kernel_tuner/backends/pycuda.py

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -97,13 +97,9 @@ def _finish_up():
9797
PyCudaFunctions.last_selected_context = self.context
9898

9999
# inspect device properties
100-
devprops = {
101-
str(k): v for (k, v) in self.context.get_device().get_attributes().items()
102-
}
100+
devprops = {str(k): v for (k, v) in self.context.get_device().get_attributes().items()}
103101
self.max_threads = devprops["MAX_THREADS_PER_BLOCK"]
104-
cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str(
105-
devprops.get("COMPUTE_CAPABILITY_MINOR", "0")
106-
)
102+
cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str(devprops.get("COMPUTE_CAPABILITY_MINOR", "0"))
107103
if cc == "00":
108104
cc = self.context.get_device().compute_capability()
109105
self.cc = str(cc[0]) + str(cc[1])
@@ -347,14 +343,7 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
347343
"""
348344
if stream is None:
349345
stream = self.stream
350-
func(
351-
*gpu_args,
352-
block=threads,
353-
grid=grid,
354-
stream=stream,
355-
shared=self.smem_size,
356-
texrefs=self.texrefs
357-
)
346+
func(*gpu_args, block=threads, grid=grid, stream=stream, shared=self.smem_size, texrefs=self.texrefs)
358347

359348
def memset(self, allocation, value, size):
360349
"""Set the memory in allocation to the value in value.

0 commit comments

Comments
 (0)