Skip to content

Commit d6acee3

Browse files
committed
Split backends and runtime observers.
1 parent 59d4006 commit d6acee3

File tree

11 files changed

+180
-150
lines changed

11 files changed

+180
-150
lines changed

kernel_tuner/backends/c.py

Lines changed: 1 addition & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import numpy.ctypeslib
1414

1515
from kernel_tuner.backends.backend import CompilerBackend
16-
from kernel_tuner.observers.observer import BenchmarkObserver
16+
from kernel_tuner.observers.c import CRuntimeObserver
1717
from kernel_tuner.util import get_temp_filename, delete_temp_file, write_file, SkippableFailure
1818

1919
dtype_map = {"int8": C.c_int8,
@@ -33,26 +33,6 @@
3333
Argument = namedtuple("Argument", ["numpy", "ctypes"])
3434

3535

36-
class CRuntimeObserver(BenchmarkObserver):
37-
""" Observer that collects results returned by benchmarking function in the C backend """
38-
39-
def __init__(self, dev):
40-
self.dev = dev
41-
self.objective = "time"
42-
self.times = []
43-
44-
def after_finish(self):
45-
self.times.append(self.dev.last_result)
46-
47-
def get_results(self):
48-
results = {
49-
self.objective: np.average(self.times),
50-
self.objective + "s": self.times.copy()
51-
}
52-
self.times = []
53-
return results
54-
55-
5636
class CFunctions(CompilerBackend):
5737
"""Class that groups the code for running and compiling C functions"""
5838

kernel_tuner/backends/cupy.py

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
import numpy as np
88

99
from kernel_tuner.backends.backend import GPUBackend
10-
from kernel_tuner.observers.observer import BenchmarkObserver
10+
from kernel_tuner.observers.cupy import CupyRuntimeObserver
11+
1112

1213
#embedded in try block to be able to generate documentation
1314
#and run tests without cupy installed
@@ -17,24 +18,6 @@
1718
cp = None
1819

1920

20-
class CupyRuntimeObserver(BenchmarkObserver):
21-
""" Observer that measures time using CUDA events during benchmarking in the CuPy backend """
22-
def __init__(self, dev):
23-
self.dev = dev
24-
self.stream = dev.stream
25-
self.start = dev.start
26-
self.end = dev.end
27-
self.times = []
28-
29-
def after_finish(self):
30-
self.times.append(cp.cuda.get_elapsed_time(self.start, self.end)) #ms
31-
32-
def get_results(self):
33-
results = {"time": np.average(self.times), "times": self.times.copy()}
34-
self.times = []
35-
return results
36-
37-
3821
class CupyFunctions(GPUBackend):
3922
"""Class that groups the Cupy functions on maintains state about the device"""
4023

kernel_tuner/backends/nvcuda.py

Lines changed: 30 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
import numpy as np
33

44
from kernel_tuner.backends.backend import GPUBackend
5-
from kernel_tuner.observers.observer import BenchmarkObserver
6-
from kernel_tuner.util import SkippableFailure
5+
from kernel_tuner.observers.nvcuda import CudaRuntimeObserver
6+
from kernel_tuner.util import SkippableFailure, cuda_error_check
77

88
#embedded in try block to be able to generate documentation
99
#and run tests without cuda-python installed
@@ -13,46 +13,6 @@
1313
cuda = None
1414

1515

16-
def error_check(error):
17-
""" Checking the status of CUDA calls """
18-
if isinstance(error, cuda.CUresult):
19-
if error != cuda.CUresult.CUDA_SUCCESS:
20-
_, name = cuda.cuGetErrorName(error)
21-
raise RuntimeError(f"CUDA error: {name.decode()}")
22-
elif isinstance(error, cudart.cudaError_t):
23-
if error != cudart.cudaError_t.cudaSuccess:
24-
_, name = cudart.getErrorName(error)
25-
raise RuntimeError(f"CUDART error: {name.decode()}")
26-
elif isinstance(error, nvrtc.nvrtcResult):
27-
if error != nvrtc.nvrtcResult.NVRTC_SUCCESS:
28-
_, desc = nvrtc.nvrtcGetErrorString(error)
29-
raise RuntimeError(f"NVRTC error: {desc.decode()}")
30-
31-
32-
class CudaRuntimeObserver(BenchmarkObserver):
33-
""" Observer that measures time using CUDA events during benchmarking """
34-
def __init__(self, dev):
35-
self.dev = dev
36-
self.stream = dev.stream
37-
self.start = dev.start
38-
self.end = dev.end
39-
self.times = []
40-
41-
def after_finish(self):
42-
# time in ms
43-
err, time = cudart.cudaEventElapsedTime(self.start, self.end)
44-
error_check(err)
45-
self.times.append(time)
46-
47-
def get_results(self):
48-
results = {
49-
"time": np.average(self.times),
50-
"times": self.times.copy()
51-
}
52-
self.times = []
53-
return results
54-
55-
5616
class CudaFunctions(GPUBackend):
5717
"""Class that groups the Cuda functions on maintains state about the device"""
5818

@@ -82,23 +42,23 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
8242

8343
# initialize and select device
8444
err = cuda.cuInit(0)
85-
error_check(err)
45+
cuda_error_check(err)
8646
err, self.device = cuda.cuDeviceGet(device)
87-
error_check(err)
47+
cuda_error_check(err)
8848
err, self.context = cuda.cuDevicePrimaryCtxRetain(device)
89-
error_check(err)
49+
cuda_error_check(err)
9050
if CudaFunctions.last_selected_device != device:
9151
err = cuda.cuCtxSetCurrent(self.context)
92-
error_check(err)
52+
cuda_error_check(err)
9353
CudaFunctions.last_selected_device = device
9454

9555
# compute capabilities and device properties
9656
err, major = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device)
97-
error_check(err)
57+
cuda_error_check(err)
9858
err, minor = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device)
99-
error_check(err)
59+
cuda_error_check(err)
10060
err, self.max_threads = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device)
101-
error_check(err)
61+
cuda_error_check(err)
10262
self.cc = f"{major}{minor}"
10363
self.iterations = iterations
10464
self.current_module = None
@@ -110,11 +70,11 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
11070

11171
# create a stream and events
11272
err, self.stream = cuda.cuStreamCreate(0)
113-
error_check(err)
73+
cuda_error_check(err)
11474
err, self.start = cuda.cuEventCreate(0)
115-
error_check(err)
75+
cuda_error_check(err)
11676
err, self.end = cuda.cuEventCreate(0)
117-
error_check(err)
77+
cuda_error_check(err)
11878

11979
# default dynamically allocated shared memory size, can be overwritten using smem_args
12080
self.smem_size = 0
@@ -127,7 +87,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
12787

12888
# collect environment information
12989
err, device_properties = cudart.cudaGetDeviceProperties(device)
130-
error_check(err)
90+
cuda_error_check(err)
13191
env = dict()
13292
env["device_name"] = device_properties.name.decode()
13393
env["cuda_version"] = cuda.CUDA_VERSION
@@ -142,7 +102,7 @@ def __del__(self):
142102
for device_memory in self.allocations:
143103
if isinstance(device_memory, cuda.CUdeviceptr):
144104
err = cuda.cuMemFree(device_memory)
145-
error_check(err)
105+
cuda_error_check(err)
146106

147107
def ready_argument_list(self, arguments):
148108
"""ready argument list to be passed to the kernel, allocates gpu mem
@@ -160,7 +120,7 @@ def ready_argument_list(self, arguments):
160120
# if arg is a numpy array copy it to device
161121
if isinstance(arg, np.ndarray):
162122
err, device_memory = cuda.cuMemAlloc(arg.nbytes)
163-
error_check(err)
123+
cuda_error_check(err)
164124
self.allocations.append(device_memory)
165125
gpu_args.append(device_memory)
166126
self.memcpy_htod(device_memory, arg)
@@ -201,21 +161,21 @@ def compile(self, kernel_instance):
201161

202162
err, program = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"CUDAProgram", 0, [], [])
203163
try:
204-
error_check(err)
164+
cuda_error_check(err)
205165
err = nvrtc.nvrtcCompileProgram(program, len(compiler_options), compiler_options)
206-
error_check(err)
166+
cuda_error_check(err)
207167
err, size = nvrtc.nvrtcGetPTXSize(program)
208-
error_check(err)
168+
cuda_error_check(err)
209169
buff = b' ' * size
210170
err = nvrtc.nvrtcGetPTX(program, buff)
211-
error_check(err)
171+
cuda_error_check(err)
212172
err, self.current_module = cuda.cuModuleLoadData(np.char.array(buff))
213173
if err == cuda.CUresult.CUDA_ERROR_INVALID_PTX:
214174
raise SkippableFailure("uses too much shared data")
215175
else:
216-
error_check(err)
176+
cuda_error_check(err)
217177
err, self.func = cuda.cuModuleGetFunction(self.current_module, str.encode(kernel_name))
218-
error_check(err)
178+
cuda_error_check(err)
219179

220180
except RuntimeError as re:
221181
_, n = nvrtc.nvrtcGetProgramLogSize(program)
@@ -229,12 +189,12 @@ def compile(self, kernel_instance):
229189
def start_event(self):
230190
""" Records the event that marks the start of a measurement """
231191
err = cudart.cudaEventRecord(self.start, self.stream)
232-
error_check(err)
192+
cuda_error_check(err)
233193

234194
def stop_event(self):
235195
""" Records the event that marks the end of a measurement """
236196
err = cudart.cudaEventRecord(self.end, self.stream)
237-
error_check(err)
197+
cuda_error_check(err)
238198

239199
def kernel_finished(self):
240200
""" Returns True if the kernel has finished, False otherwise """
@@ -248,7 +208,7 @@ def kernel_finished(self):
248208
def synchronize():
249209
""" Halts execution until device has finished its tasks """
250210
err = cudart.cudaDeviceSynchronize()
251-
error_check(err)
211+
cuda_error_check(err)
252212

253213

254214
def copy_constant_memory_args(self, cmem_args):
@@ -263,9 +223,9 @@ def copy_constant_memory_args(self, cmem_args):
263223
"""
264224
for k, v in cmem_args.items():
265225
err, symbol, _ = cuda.cuModuleGetGlobal(self.current_module, str.encode(k))
266-
error_check(err)
226+
cuda_error_check(err)
267227
err = cuda.cuMemcpyHtoD(symbol, v, v.nbytes)
268-
error_check(err)
228+
cuda_error_check(err)
269229

270230
def copy_shared_memory_args(self, smem_args):
271231
"""add shared memory arguments to the kernel"""
@@ -307,7 +267,7 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
307267
arg_types.append(np.ctypeslib.as_ctypes_type(arg.dtype))
308268
kernel_args = (tuple(gpu_args), tuple(arg_types))
309269
err = cuda.cuLaunchKernel(func, grid[0], grid[1], grid[2], threads[0], threads[1], threads[2], self.smem_size, stream, kernel_args, 0)
310-
error_check(err)
270+
cuda_error_check(err)
311271

312272
@staticmethod
313273
def memset(allocation, value, size):
@@ -324,7 +284,7 @@ def memset(allocation, value, size):
324284
325285
"""
326286
err = cudart.cudaMemset(allocation, value, size)
327-
error_check(err)
287+
cuda_error_check(err)
328288

329289
@staticmethod
330290
def memcpy_dtoh(dest, src):
@@ -337,7 +297,7 @@ def memcpy_dtoh(dest, src):
337297
:type src: cuda.CUdeviceptr
338298
"""
339299
err = cuda.cuMemcpyDtoH(dest, src, dest.nbytes)
340-
error_check(err)
300+
cuda_error_check(err)
341301

342302
@staticmethod
343303
def memcpy_htod(dest, src):
@@ -350,7 +310,7 @@ def memcpy_htod(dest, src):
350310
:type src: numpy.ndarray
351311
"""
352312
err = cuda.cuMemcpyHtoD(dest, src, src.nbytes)
353-
error_check(err)
313+
cuda_error_check(err)
354314

355315
units = {'time': 'ms'}
356316

kernel_tuner/backends/opencl.py

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import numpy as np
55

66
from kernel_tuner.backends.backend import GPUBackend
7-
from kernel_tuner.observers.observer import BenchmarkObserver
7+
from kernel_tuner.observers.opencl import OpenCLObserver
88

99
#embedded in try block to be able to generate documentation
1010
try:
@@ -13,22 +13,6 @@
1313
cl = None
1414

1515

16-
class OpenCLObserver(BenchmarkObserver):
17-
""" Observer that measures time using CUDA events during benchmarking """
18-
def __init__(self, dev):
19-
self.dev = dev
20-
self.times = []
21-
22-
def after_finish(self):
23-
event = self.dev.event
24-
self.times.append((event.profile.end - event.profile.start)*1e-6) #ms
25-
26-
def get_results(self):
27-
results = {"time": np.average(self.times), "times": self.times.copy()}
28-
self.times = []
29-
return results
30-
31-
3216
class OpenCLFunctions(GPUBackend):
3317
"""Class that groups the OpenCL functions on maintains some state about the device"""
3418

kernel_tuner/backends/pycuda.py

Lines changed: 1 addition & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import numpy as np
77

88
from kernel_tuner.backends.backend import GPUBackend
9-
from kernel_tuner.observers.observer import BenchmarkObserver
9+
from kernel_tuner.observers.pycuda import PyCudaRuntimeObserver
1010
from kernel_tuner.observers.nvml import nvml
1111
from kernel_tuner.util import TorchPlaceHolder, SkippableFailure
1212

@@ -52,28 +52,6 @@ def get_pointer(self):
5252
return self.t.data_ptr()
5353

5454

55-
class PyCudaRuntimeObserver(BenchmarkObserver):
56-
""" Observer that measures time using CUDA events during benchmarking """
57-
58-
def __init__(self, dev):
59-
self.dev = dev
60-
self.stream = dev.stream
61-
self.start = dev.start
62-
self.end = dev.end
63-
self.times = []
64-
65-
def after_finish(self):
66-
self.times.append(self.end.time_since(self.start)) #ms
67-
68-
def get_results(self):
69-
results = {
70-
"time": np.average(self.times),
71-
"times": self.times.copy()
72-
}
73-
self.times = []
74-
return results
75-
76-
7755
class PyCudaFunctions(GPUBackend):
7856
"""Class that groups the CUDA functions on maintains state about the device"""
7957

kernel_tuner/observers/c.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import numpy as np
2+
3+
from kernel_tuner.observers.observer import BenchmarkObserver
4+
5+
6+
class CRuntimeObserver(BenchmarkObserver):
    """Observer that collects results returned by benchmarking function in the C backend"""

    def __init__(self, dev):
        # The C backend stores each benchmark measurement on `dev.last_result`
        self.dev = dev
        self.objective = "time"
        self.times = []

    def after_finish(self):
        # Record the measurement the backend produced for this run
        self.times.append(self.dev.last_result)

    def get_results(self):
        # Report the mean alongside the raw samples, then reset for the next configuration
        collected = self.times
        self.times = []
        return {
            self.objective: np.average(collected),
            self.objective + "s": collected,
        }

0 commit comments

Comments
 (0)