@@ -5,7 +5,8 @@
 
 from kernel_tuner.backends.backend import GPUBackend
 from kernel_tuner.observers.nvcuda import CudaRuntimeObserver
-from kernel_tuner.util import SkippableFailure, cuda_error_check, to_valid_nvrtc_gpu_arch_cc
+from kernel_tuner.util import SkippableFailure
+from kernel_tuner.utils.nvcuda import cuda_error_check, to_valid_nvrtc_gpu_arch_cc
 
 # embedded in try block to be able to generate documentation
 # and run tests without cuda-python installed
@@ -56,13 +57,9 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         CudaFunctions.last_selected_device = device
 
         # compute capabilities and device properties
-        err, major = runtime.cudaDeviceGetAttribute(
-            runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device
-        )
+        err, major = runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device)
         cuda_error_check(err)
-        err, minor = runtime.cudaDeviceGetAttribute(
-            runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device
-        )
+        err, minor = runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device)
         cuda_error_check(err)
         err, self.max_threads = runtime.cudaDeviceGetAttribute(
             runtime.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device
@@ -164,20 +161,14 @@ def compile(self, kernel_instance):
         if not any(["--std=" in opt for opt in self.compiler_options]):
             self.compiler_options.append("--std=c++11")
         if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]):
-            compiler_options.append(
-                f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8")
-            )
+            compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8"))
         if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]):
             self.compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}")
 
-        err, program = nvrtc.nvrtcCreateProgram(
-            str.encode(kernel_string), b"CUDAProgram", 0, [], []
-        )
+        err, program = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"CUDAProgram", 0, [], [])
         try:
             cuda_error_check(err)
-            err = nvrtc.nvrtcCompileProgram(
-                program, len(compiler_options), compiler_options
-            )
+            err = nvrtc.nvrtcCompileProgram(program, len(compiler_options), compiler_options)
             cuda_error_check(err)
             err, size = nvrtc.nvrtcGetPTXSize(program)
             cuda_error_check(err)
@@ -189,9 +180,7 @@ def compile(self, kernel_instance):
                 raise SkippableFailure("uses too much shared data")
             else:
                 cuda_error_check(err)
-                err, self.func = driver.cuModuleGetFunction(
-                    self.current_module, str.encode(kernel_name)
-                )
+                err, self.func = driver.cuModuleGetFunction(self.current_module, str.encode(kernel_name))
                 cuda_error_check(err)
 
                 # get the number of registers per thread used in this kernel
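The comment kept in the first hunk ("embedded in try block to be able to generate documentation and run tests without cuda-python installed") and the repeated cuda_error_check(err) calls reflect the two patterns this backend relies on: import cuda-python lazily so it stays an optional dependency, and check the status code that every cuda-python call returns. Below is a minimal standalone sketch of both ideas; it is not code from this commit, the helper names (check_nvrtc, compile_to_ptx) and the default architecture are invented, and the import path may differ between cuda-python releases.

# Sketch only: optional import plus NVRTC status checking, outside kernel_tuner.
try:
    # cuda-python is optional; docs builds and tests must survive its absence,
    # which is why the backend wraps its own imports in a try block.
    from cuda import nvrtc
except ImportError:
    nvrtc = None


def check_nvrtc(result):
    """Raise if an NVRTC call failed; accepts a bare status or a (status, ...) tuple."""
    err = result[0] if isinstance(result, tuple) else result
    if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
        raise RuntimeError(f"NVRTC error: {err}")


def compile_to_ptx(source, arch="compute_80"):
    """Compile CUDA C++ source to PTX, mirroring the flow in compile() above."""
    if nvrtc is None:
        raise ImportError("cuda-python is required: pip install cuda-python")
    err, program = nvrtc.nvrtcCreateProgram(source.encode(), b"sketch.cu", 0, [], [])
    check_nvrtc(err)
    options = [b"--std=c++11", f"--gpu-architecture={arch}".encode()]
    check_nvrtc(nvrtc.nvrtcCompileProgram(program, len(options), options))
    err, size = nvrtc.nvrtcGetPTXSize(program)
    check_nvrtc(err)
    ptx = b" " * size
    check_nvrtc(nvrtc.nvrtcGetPTX(program, ptx))
    return ptx

The tuple handling in check_nvrtc hedges against cuda-python's convention of returning the status code as the first element of a result tuple, which is also why the diff unpacks err, program and err, size before calling cuda_error_check.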