@@ -5,7 +5,8 @@
 
 from kernel_tuner.backends.backend import GPUBackend
 from kernel_tuner.observers.nvcuda import CudaRuntimeObserver
-from kernel_tuner.util import SkippableFailure, cuda_error_check, to_valid_nvrtc_gpu_arch_cc
+from kernel_tuner.util import SkippableFailure
+from kernel_tuner.utils.nvcuda import cuda_error_check, to_valid_nvrtc_gpu_arch_cc
 
 # embedded in try block to be able to generate documentation
 # and run tests without cuda-python installed
@@ -56,13 +57,9 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         CudaFunctions.last_selected_device = device
 
         # compute capabilities and device properties
-        err, major = runtime.cudaDeviceGetAttribute(
-            runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device
-        )
+        err, major = runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device)
         cuda_error_check(err)
-        err, minor = runtime.cudaDeviceGetAttribute(
-            runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device
-        )
+        err, minor = runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device)
         cuda_error_check(err)
         err, self.max_threads = runtime.cudaDeviceGetAttribute(
             runtime.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device
@@ -164,20 +161,14 @@ def compile(self, kernel_instance):
         if not any(["--std=" in opt for opt in self.compiler_options]):
             self.compiler_options.append("--std=c++11")
         if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]):
-            compiler_options.append(
-                f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8")
-            )
+            compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8"))
         if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]):
             self.compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}")
 
-        err, program = nvrtc.nvrtcCreateProgram(
-            str.encode(kernel_string), b"CUDAProgram", 0, [], []
-        )
+        err, program = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"CUDAProgram", 0, [], [])
         try:
             cuda_error_check(err)
-            err = nvrtc.nvrtcCompileProgram(
-                program, len(compiler_options), compiler_options
-            )
+            err = nvrtc.nvrtcCompileProgram(program, len(compiler_options), compiler_options)
             cuda_error_check(err)
             err, size = nvrtc.nvrtcGetPTXSize(program)
             cuda_error_check(err)
@@ -189,9 +180,7 @@ def compile(self, kernel_instance):
                 raise SkippableFailure("uses too much shared data")
             else:
                 cuda_error_check(err)
-                err, self.func = driver.cuModuleGetFunction(
-                    self.current_module, str.encode(kernel_name)
-                )
+                err, self.func = driver.cuModuleGetFunction(self.current_module, str.encode(kernel_name))
                 cuda_error_check(err)
 
                 # get the number of registers per thread used in this kernel
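The comment kept in the first hunk ("embedded in try block to be able to generate documentation and run tests without cuda-python installed") and the repeated cuda_error_check(err) calls reflect the two patterns this backend relies on: import cuda-python lazily so it stays an optional dependency, and check the status code that every cuda-python call returns. Below is a minimal standalone sketch of both ideas; it is not code from this commit, the helper names (check_nvrtc, compile_to_ptx) and the default architecture are invented, and the import path may differ between cuda-python releases.

# Sketch only: optional import plus NVRTC status checking, outside kernel_tuner.
try:
    # cuda-python is optional; docs builds and tests must survive its absence,
    # which is why the backend wraps its own imports in a try block.
    from cuda import nvrtc
except ImportError:
    nvrtc = None


def check_nvrtc(result):
    """Raise if an NVRTC call failed; accepts a bare status or a (status, ...) tuple."""
    err = result[0] if isinstance(result, tuple) else result
    if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
        raise RuntimeError(f"NVRTC error: {err}")


def compile_to_ptx(source, arch="compute_80"):
    """Compile CUDA C++ source to PTX, mirroring the flow in compile() above."""
    if nvrtc is None:
        raise ImportError("cuda-python is required: pip install cuda-python")
    err, program = nvrtc.nvrtcCreateProgram(source.encode(), b"sketch.cu", 0, [], [])
    check_nvrtc(err)
    options = [b"--std=c++11", f"--gpu-architecture={arch}".encode()]
    check_nvrtc(nvrtc.nvrtcCompileProgram(program, len(options), options))
    err, size = nvrtc.nvrtcGetPTXSize(program)
    check_nvrtc(err)
    ptx = b" " * size
    check_nvrtc(nvrtc.nvrtcGetPTX(program, ptx))
    return ptx

The tuple handling in check_nvrtc hedges against cuda-python's convention of returning the status code as the first element of a result tuple, which is also why the diff unpacks err, program and err, size before calling cuda_error_check.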