
Commit 55ab074

Added a flush kernel to clear the L2 cache between runs
1 parent 00ac419 commit 55ab074
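
Repeated benchmark iterations of the same kernel leave its working set resident in the GPU's L2 cache, so every run after the first can look faster than a cold launch. Streaming a buffer at least as large as L2 between runs evicts that data; this commit does it with a dedicated flush kernel compiled alongside the tuned kernel. A minimal sketch of the underlying idea (not the code from this commit), assuming CuPy is available and that 128 MiB comfortably exceeds the device's L2:

import cupy as cp

def flush_l2(buffer_bytes=128 * 1024 * 1024):
    """Evict L2 contents by streaming a large buffer through the cache."""
    # cp.ones launches a fill kernel that writes the whole buffer, and the
    # in-place add reads and rewrites it, displacing whatever the previous
    # benchmark iteration left behind in L2.
    buf = cp.ones(buffer_bytes, dtype=cp.uint8)
    buf += 1
    cp.cuda.runtime.deviceSynchronize()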

File tree: 1 file changed

kernel_tuner/core.py (51 additions, 2 deletions)

@@ -340,14 +340,62 @@ def __init__(
         if not quiet:
             print("Using: " + self.dev.name)
 
-    def benchmark_default(self, func, gpu_args, threads, grid, result):
-        """Benchmark one kernel execution at a time"""
+        if lang.upper() not in ['OPENCL', 'C', 'FORTRAN']:
+            # flush the L2 cache, inspired by https://github.com/pytorch/FBGEMM/blob/eb3c304e6c213b81f2b2077813d3c6d16597aa97/fbgemm_gpu/bench/verify_fp16_stochastic_benchmark.cu#L130
+            flush_gpu_string = """
+            __global__ void flush_gpu(char* d_flush, char* d_flush2, bool do_write) {
+                const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                const char val = d_flush[idx];
+                if (do_write * val) {
+                    d_flush2[idx] = val;
+                }
+            }
+            """
+            cache_size = self.dev.cache_size_L2
+            d_flush = np.ones((cache_size), order='F').astype(np.float32)
+            d_flush2 = np.ones((cache_size), order='F').astype(np.float32)
+            self.flush_kernel_gpu_args = [d_flush, d_flush2, np.int32(True)]
+
+            from kernel_tuner.interface import Options
+            options = {
+                'kernel_name': 'flush_gpu',
+                'lang': 'CUDA',
+                'arguments': self.flush_kernel_gpu_args,
+                'problem_size': cache_size,
+                'grid_div_x': None,
+                'grid_div_y': None,
+                'grid_div_z': None,
+                'block_size_names': None,
+            }
+            options = Options(options)
+            flush_kernel_lang = lang.upper() if lang.upper() in ['CUDA', 'CUPY', 'NVCUDA'] else 'CUPY'
+            flush_kernel_source = KernelSource('flush_gpu', flush_gpu_string, flush_kernel_lang)
+            self.flush_kernel_instance = self.create_kernel_instance(flush_kernel_source, kernel_options=options, params=dict(), verbose=not quiet)
+            self.flush_kernel = self.compile_kernel(self.flush_kernel_instance, verbose=not quiet)
+            self.flush_kernel_gpu_args = self.ready_argument_list(self.flush_kernel_gpu_args)
+
+            # from kernel_tuner.kernelbuilder import PythonKernel
+            # self.flush_kernel = PythonKernel('flush_gpu', flush_gpu_string, cache_size, self.flush_kernel_gpu_args)
+
+    def flush_cache(self):
+        """This special function can be called to flush the L2 cache."""
+        if not hasattr(self, 'flush_kernel'):
+            return
+        self.dev.synchronize()
+        assert self.run_kernel(self.flush_kernel, self.flush_kernel_gpu_args, self.flush_kernel_instance)
+        # self.flush_kernel.run_kernel(self.flush_kernel.gpu_args)
+        self.dev.synchronize()
+
+    def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True):
+        """Benchmark one kernel execution at a time. Run with `flush_cache=True` to avoid caching effects between iterations."""
         observers = [
             obs for obs in self.dev.observers if not isinstance(obs, ContinuousObserver)
         ]
 
         self.dev.synchronize()
         for _ in range(self.iterations):
+            if flush_cache:
+                self.flush_cache()
             for obs in observers:
                 obs.before_start()
             self.dev.synchronize()
@@ -1008,3 +1056,4 @@ def wrap_templated_kernel(kernel_string, kernel_name):
     new_kernel_string += wrapper_function
 
     return new_kernel_string, name + "_wrapper"
+

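With this change, benchmark_default flushes the L2 cache before every timed iteration by default (flush_cache=True), so each measurement starts from a cold cache on backends other than OpenCL, C, and Fortran. The effect can also be demonstrated outside Kernel Tuner; a rough sketch (not part of this commit, CuPy assumed) that times a memory-bound copy kernel with and without an explicit flush between launches:

import cupy as cp
import numpy as np

copy_kernel = cp.RawKernel(r'''
extern "C" __global__
void copy_kernel(const float* src, float* dst, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) dst[i] = src[i];
}
''', 'copy_kernel')

n = 1 << 22                                              # 4M floats, 16 MiB per array
src = cp.random.rand(n, dtype=cp.float32)
dst = cp.empty_like(src)
flush_buf = cp.ones(128 * 1024 * 1024, dtype=cp.uint8)   # larger than any current L2

def time_once(flush):
    if flush:
        cp.add(flush_buf, 1, out=flush_buf)              # stream the buffer through L2
    start, end = cp.cuda.Event(), cp.cuda.Event()
    start.record()
    copy_kernel(((n + 255) // 256,), (256,), (src, dst, np.int32(n)))
    end.record()
    end.synchronize()
    return cp.cuda.get_elapsed_time(start, end)          # milliseconds

hot = [time_once(flush=False) for _ in range(10)]
cold = [time_once(flush=True) for _ in range(10)]
print("hot-cache mean: %.4f ms" % np.mean(hot[1:]))
print("flushed mean:   %.4f ms" % np.mean(cold[1:]))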