@@ -391,12 +391,8 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
391
391
for obs in self .continuous_observers :
392
392
result .update (obs .get_results ())
393
393
394
- def benchmark (self , func , gpu_args , instance , verbose , objective ):
395
- """benchmark the kernel instance"""
396
- logging .debug ("benchmark " + instance .name )
397
- logging .debug ("thread block dimensions x,y,z=%d,%d,%d" , * instance .threads )
398
- logging .debug ("grid dimensions x,y,z=%d,%d,%d" , * instance .grid )
399
-
394
+ def set_nvml_parameters (self , instance ):
395
+ """Set the NVML parameters. Avoids setting time leaking into benchmark time."""
400
396
if self .use_nvml :
401
397
if "nvml_pwr_limit" in instance .params :
402
398
new_limit = int (
@@ -409,6 +405,15 @@ def benchmark(self, func, gpu_args, instance, verbose, objective):
409
405
if "nvml_mem_clock" in instance .params :
410
406
self .nvml .mem_clock = instance .params ["nvml_mem_clock" ]
411
407
408
+ def benchmark (self , func , gpu_args , instance , verbose , objective , skip_nvml_setting = False ):
409
+ """Benchmark the kernel instance."""
410
+ logging .debug ("benchmark " + instance .name )
411
+ logging .debug ("thread block dimensions x,y,z=%d,%d,%d" , * instance .threads )
412
+ logging .debug ("grid dimensions x,y,z=%d,%d,%d" , * instance .grid )
413
+
414
+ if self .use_nvml and not skip_nvml_setting :
415
+ self .set_nvml_parameters (instance )
416
+
412
417
# Call the observers to register the configuration to be benchmarked
413
418
for obs in self .dev .observers :
414
419
obs .register_configuration (instance .params )
@@ -577,11 +582,15 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options,
577
582
578
583
# benchmark
579
584
if func :
585
+ # Set the NVML parameters here so that the time spent applying them does not leak into the benchmark time; it is counted as framework time instead.
586
+ if self .use_nvml :
587
+ self .set_nvml_parameters (instance )
580
588
start_benchmark = time .perf_counter ()
581
589
result .update (
582
- self .benchmark (func , gpu_args , instance , verbose , to .objective )
590
+ self .benchmark (func , gpu_args , instance , verbose , to .objective , skip_nvml_setting = False )
583
591
)
584
592
last_benchmark_time = 1000 * (time .perf_counter () - start_benchmark )
593
+ print (f"Benchmark time: { last_benchmark_time } " )
585
594
586
595
except Exception as e :
587
596
# dump kernel sources to temp file
0 commit comments