@@ -340,14 +340,62 @@ def __init__(
340
340
if not quiet :
341
341
print ("Using: " + self .dev .name )
342
342
343
- def benchmark_default (self , func , gpu_args , threads , grid , result ):
344
- """Benchmark one kernel execution at a time"""
343
+ if lang .upper () not in ['OPENCL' , 'C' , 'FORTRAN' ]:
344
+ # flush the L2 cache, inspired by https://github.com/pytorch/FBGEMM/blob/eb3c304e6c213b81f2b2077813d3c6d16597aa97/fbgemm_gpu/bench/verify_fp16_stochastic_benchmark.cu#L130
345
+ flush_gpu_string = """
346
+ __global__ void flush_gpu(char* d_flush, char* d_flush2, bool do_write) {
347
+ const int idx = blockIdx.x * blockDim.x + threadIdx.x;
348
+ const char val = d_flush[idx];
349
+ if (do_write * val) {
350
+ d_flush2[idx] = val;
351
+ }
352
+ }
353
+ """
354
+ cache_size = self .dev .cache_size_L2
355
+ d_flush = np .ones ((cache_size ), order = 'F' ).astype (np .float32 )
356
+ d_flush2 = np .ones ((cache_size ), order = 'F' ).astype (np .float32 )
357
+ self .flush_kernel_gpu_args = [d_flush , d_flush2 , np .int32 (True )]
358
+
359
+ from kernel_tuner .interface import Options
360
+ options = {
361
+ 'kernel_name' : 'flush_gpu' ,
362
+ 'lang' : 'CUDA' ,
363
+ 'arguments' : self .flush_kernel_gpu_args ,
364
+ 'problem_size' : cache_size ,
365
+ 'grid_div_x' : None ,
366
+ 'grid_div_y' : None ,
367
+ 'grid_div_z' : None ,
368
+ 'block_size_names' : None ,
369
+ }
370
+ options = Options (options )
371
+ flush_kernel_lang = lang .upper () if lang .upper () in ['CUDA' , 'CUPY' , 'NVCUDA' ] else 'CUPY'
372
+ flush_kernel_source = KernelSource ('flush_gpu' , flush_gpu_string , flush_kernel_lang )
373
+ self .flush_kernel_instance = self .create_kernel_instance (flush_kernel_source , kernel_options = options , params = dict (), verbose = not quiet )
374
+ self .flush_kernel = self .compile_kernel (self .flush_kernel_instance , verbose = not quiet )
375
+ self .flush_kernel_gpu_args = self .ready_argument_list (self .flush_kernel_gpu_args )
376
+
377
+ # from kernel_tuner.kernelbuilder import PythonKernel
378
+ # self.flush_kernel = PythonKernel('flush_gpu', flush_gpu_string, cache_size, self.flush_kernel_gpu_args)
379
+
380
+ def flush_cache (self ):
381
+ """This special function can be called to flush the L2 cache."""
382
+ if hasattr (self , 'flush_kernel' ):
383
+ return
384
+ self .dev .synchronize ()
385
+ assert self .run_kernel (self .flush_kernel , self .flush_kernel_gpu_args , self .flush_kernel_instance )
386
+ # self.flush_kernel.run_kernel(self.flush_kernel.gpu_args)
387
+ self .dev .synchronize ()
388
+
389
+ def benchmark_default (self , func , gpu_args , threads , grid , result , flush_cache = True ):
390
+ """Benchmark one kernel execution at a time. Run with `flush_cache=True` to avoid caching effects between iterations."""
345
391
observers = [
346
392
obs for obs in self .dev .observers if not isinstance (obs , ContinuousObserver )
347
393
]
348
394
349
395
self .dev .synchronize ()
350
396
for _ in range (self .iterations ):
397
+ if flush_cache :
398
+ self .flush_cache ()
351
399
for obs in observers :
352
400
obs .before_start ()
353
401
self .dev .synchronize ()
@@ -1008,3 +1056,4 @@ def wrap_templated_kernel(kernel_string, kernel_name):
1008
1056
new_kernel_string += wrapper_function
1009
1057
1010
1058
return new_kernel_string , name + "_wrapper"
1059
+