22import numpy as np
33
44from kernel_tuner .backends .backend import GPUBackend
5- from kernel_tuner .observers .observer import BenchmarkObserver
6- from kernel_tuner .util import SkippableFailure
5+ from kernel_tuner .observers .nvcuda import CudaRuntimeObserver
6+ from kernel_tuner .util import SkippableFailure , cuda_error_check
77
88#embedded in try block to be able to generate documentation
99#and run tests without cuda-python installed
1313 cuda = None
1414
1515
def error_check(error):
    """Raise RuntimeError if a CUDA driver, CUDA runtime, or NVRTC status code signals failure.

    :param error: status code returned by a cuda-python call; one of
        cuda.CUresult, cudart.cudaError_t, or nvrtc.nvrtcResult.
        Any other type is silently ignored.
    """
    # Guard-clause style: return early on success, raise with the decoded name otherwise.
    if isinstance(error, cuda.CUresult):
        if error == cuda.CUresult.CUDA_SUCCESS:
            return
        _, name = cuda.cuGetErrorName(error)
        raise RuntimeError(f"CUDA error: {name.decode()}")
    if isinstance(error, cudart.cudaError_t):
        if error == cudart.cudaError_t.cudaSuccess:
            return
        _, name = cudart.getErrorName(error)
        raise RuntimeError(f"CUDART error: {name.decode()}")
    if isinstance(error, nvrtc.nvrtcResult):
        if error == nvrtc.nvrtcResult.NVRTC_SUCCESS:
            return
        _, desc = nvrtc.nvrtcGetErrorString(error)
        raise RuntimeError(f"NVRTC error: {desc.decode()}")
31-
class CudaRuntimeObserver(BenchmarkObserver):
    """Observer that measures time using CUDA events during benchmarking"""

    def __init__(self, dev):
        # Keep references to the backend's stream and timing events so we can
        # query the elapsed time after each kernel launch.
        self.dev = dev
        self.stream = dev.stream
        self.start = dev.start
        self.end = dev.end
        self.times = []

    def after_finish(self):
        # cudaEventElapsedTime reports the start->end interval in milliseconds
        err, elapsed_ms = cudart.cudaEventElapsedTime(self.start, self.end)
        error_check(err)
        self.times.append(elapsed_ms)

    def get_results(self):
        # Report the mean plus the raw per-iteration samples, then reset
        # the accumulator for the next benchmark run.
        collected = self.times
        self.times = []
        return {"time": np.average(collected), "times": collected.copy()}
54-
55-
5616class CudaFunctions (GPUBackend ):
5717 """Class that groups the Cuda functions on maintains state about the device"""
5818
@@ -82,23 +42,23 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
8242
8343 # initialize and select device
8444 err = cuda .cuInit (0 )
85- error_check (err )
45+ cuda_error_check (err )
8646 err , self .device = cuda .cuDeviceGet (device )
87- error_check (err )
47+ cuda_error_check (err )
8848 err , self .context = cuda .cuDevicePrimaryCtxRetain (device )
89- error_check (err )
49+ cuda_error_check (err )
9050 if CudaFunctions .last_selected_device != device :
9151 err = cuda .cuCtxSetCurrent (self .context )
92- error_check (err )
52+ cuda_error_check (err )
9353 CudaFunctions .last_selected_device = device
9454
9555 # compute capabilities and device properties
9656 err , major = cudart .cudaDeviceGetAttribute (cudart .cudaDeviceAttr .cudaDevAttrComputeCapabilityMajor , device )
97- error_check (err )
57+ cuda_error_check (err )
9858 err , minor = cudart .cudaDeviceGetAttribute (cudart .cudaDeviceAttr .cudaDevAttrComputeCapabilityMinor , device )
99- error_check (err )
59+ cuda_error_check (err )
10060 err , self .max_threads = cudart .cudaDeviceGetAttribute (cudart .cudaDeviceAttr .cudaDevAttrMaxThreadsPerBlock , device )
101- error_check (err )
61+ cuda_error_check (err )
10262 self .cc = f"{ major } { minor } "
10363 self .iterations = iterations
10464 self .current_module = None
@@ -110,11 +70,11 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
11070
11171 # create a stream and events
11272 err , self .stream = cuda .cuStreamCreate (0 )
113- error_check (err )
73+ cuda_error_check (err )
11474 err , self .start = cuda .cuEventCreate (0 )
115- error_check (err )
75+ cuda_error_check (err )
11676 err , self .end = cuda .cuEventCreate (0 )
117- error_check (err )
77+ cuda_error_check (err )
11878
11979 # default dynamically allocated shared memory size, can be overwritten using smem_args
12080 self .smem_size = 0
@@ -127,7 +87,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
12787
12888 # collect environment information
12989 err , device_properties = cudart .cudaGetDeviceProperties (device )
130- error_check (err )
90+ cuda_error_check (err )
13191 env = dict ()
13292 env ["device_name" ] = device_properties .name .decode ()
13393 env ["cuda_version" ] = cuda .CUDA_VERSION
@@ -142,7 +102,7 @@ def __del__(self):
142102 for device_memory in self .allocations :
143103 if isinstance (device_memory , cuda .CUdeviceptr ):
144104 err = cuda .cuMemFree (device_memory )
145- error_check (err )
105+ cuda_error_check (err )
146106
147107 def ready_argument_list (self , arguments ):
148108 """ready argument list to be passed to the kernel, allocates gpu mem
@@ -160,7 +120,7 @@ def ready_argument_list(self, arguments):
160120 # if arg is a numpy array copy it to device
161121 if isinstance (arg , np .ndarray ):
162122 err , device_memory = cuda .cuMemAlloc (arg .nbytes )
163- error_check (err )
123+ cuda_error_check (err )
164124 self .allocations .append (device_memory )
165125 gpu_args .append (device_memory )
166126 self .memcpy_htod (device_memory , arg )
@@ -201,21 +161,21 @@ def compile(self, kernel_instance):
201161
202162 err , program = nvrtc .nvrtcCreateProgram (str .encode (kernel_string ), b"CUDAProgram" , 0 , [], [])
203163 try :
204- error_check (err )
164+ cuda_error_check (err )
205165 err = nvrtc .nvrtcCompileProgram (program , len (compiler_options ), compiler_options )
206- error_check (err )
166+ cuda_error_check (err )
207167 err , size = nvrtc .nvrtcGetPTXSize (program )
208- error_check (err )
168+ cuda_error_check (err )
209169 buff = b' ' * size
210170 err = nvrtc .nvrtcGetPTX (program , buff )
211- error_check (err )
171+ cuda_error_check (err )
212172 err , self .current_module = cuda .cuModuleLoadData (np .char .array (buff ))
213173 if err == cuda .CUresult .CUDA_ERROR_INVALID_PTX :
214174 raise SkippableFailure ("uses too much shared data" )
215175 else :
216- error_check (err )
176+ cuda_error_check (err )
217177 err , self .func = cuda .cuModuleGetFunction (self .current_module , str .encode (kernel_name ))
218- error_check (err )
178+ cuda_error_check (err )
219179
220180 except RuntimeError as re :
221181 _ , n = nvrtc .nvrtcGetProgramLogSize (program )
@@ -229,12 +189,12 @@ def compile(self, kernel_instance):
229189 def start_event (self ):
230190 """ Records the event that marks the start of a measurement """
231191 err = cudart .cudaEventRecord (self .start , self .stream )
232- error_check (err )
192+ cuda_error_check (err )
233193
234194 def stop_event (self ):
235195 """ Records the event that marks the end of a measurement """
236196 err = cudart .cudaEventRecord (self .end , self .stream )
237- error_check (err )
197+ cuda_error_check (err )
238198
239199 def kernel_finished (self ):
240200 """ Returns True if the kernel has finished, False otherwise """
@@ -248,7 +208,7 @@ def kernel_finished(self):
248208 def synchronize ():
249209 """ Halts execution until device has finished its tasks """
250210 err = cudart .cudaDeviceSynchronize ()
251- error_check (err )
211+ cuda_error_check (err )
252212
253213
254214 def copy_constant_memory_args (self , cmem_args ):
@@ -263,9 +223,9 @@ def copy_constant_memory_args(self, cmem_args):
263223 """
264224 for k , v in cmem_args .items ():
265225 err , symbol , _ = cuda .cuModuleGetGlobal (self .current_module , str .encode (k ))
266- error_check (err )
226+ cuda_error_check (err )
267227 err = cuda .cuMemcpyHtoD (symbol , v , v .nbytes )
268- error_check (err )
228+ cuda_error_check (err )
269229
270230 def copy_shared_memory_args (self , smem_args ):
271231 """add shared memory arguments to the kernel"""
@@ -307,7 +267,7 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
307267 arg_types .append (np .ctypeslib .as_ctypes_type (arg .dtype ))
308268 kernel_args = (tuple (gpu_args ), tuple (arg_types ))
309269 err = cuda .cuLaunchKernel (func , grid [0 ], grid [1 ], grid [2 ], threads [0 ], threads [1 ], threads [2 ], self .smem_size , stream , kernel_args , 0 )
310- error_check (err )
270+ cuda_error_check (err )
311271
312272 @staticmethod
313273 def memset (allocation , value , size ):
@@ -324,7 +284,7 @@ def memset(allocation, value, size):
324284
325285 """
326286 err = cudart .cudaMemset (allocation , value , size )
327- error_check (err )
287+ cuda_error_check (err )
328288
329289 @staticmethod
330290 def memcpy_dtoh (dest , src ):
@@ -337,7 +297,7 @@ def memcpy_dtoh(dest, src):
337297 :type src: cuda.CUdeviceptr
338298 """
339299 err = cuda .cuMemcpyDtoH (dest , src , dest .nbytes )
340- error_check (err )
300+ cuda_error_check (err )
341301
342302 @staticmethod
343303 def memcpy_htod (dest , src ):
@@ -350,7 +310,7 @@ def memcpy_htod(dest, src):
350310 :type src: numpy.ndarray
351311 """
352312 err = cuda .cuMemcpyHtoD (dest , src , src .nbytes )
353- error_check (err )
313+ cuda_error_check (err )
354314
355315 units = {'time' : 'ms' }
356316
0 commit comments