@@ -259,8 +259,6 @@ def __init__(self, kernel_source, device=0, platform=0, quiet=False, compiler=No
259259 self .units = dev .units
260260 self .name = dev .name
261261 self .max_threads = dev .max_threads
262- self .last_compilation_time = None
263- self .last_verification_time = None
264262 if not quiet :
265263 print ("Using: " + self .dev .name )
266264
@@ -317,7 +315,7 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
317315
318316
319317
320- def benchmark (self , func , gpu_args , instance , verbose ):
318+ def benchmark (self , func , gpu_args , instance , verbose , objective ):
321319 """benchmark the kernel instance"""
322320 logging .debug ('benchmark ' + instance .name )
323321 logging .debug ('thread block dimensions x,y,z=%d,%d,%d' , * instance .threads )
@@ -333,9 +331,8 @@ def benchmark(self, func, gpu_args, instance, verbose):
333331 if "nvml_mem_clock" in instance .params :
334332 self .nvml .mem_clock = instance .params ["nvml_mem_clock" ]
335333
336- result = None
334+ result = {}
337335 try :
338- result = dict ()
339336 self .benchmark_default (func , gpu_args , instance .threads , instance .grid , result )
340337
341338 if self .continuous_observers :
@@ -348,16 +345,16 @@ def benchmark(self, func, gpu_args, instance, verbose):
348345
349346
350347 except Exception as e :
351- #some launches may fail because too many registers are required
352- #to run the kernel given the current thread block size
353- #the desired behavior is to simply skip over this configuration
354- #and proceed to try the next one
348+ # some launches may fail because too many registers are required
349+ # to run the kernel given the current thread block size
350+ # the desired behavior is to simply skip over this configuration
351+ # and proceed to try the next one
355352 skippable_exceptions = ["too many resources requested for launch" , "OUT_OF_RESOURCES" , "INVALID_WORK_GROUP_SIZE" ]
356353 if any ([skip_str in str (e ) for skip_str in skippable_exceptions ]):
357354 logging .debug ('benchmark fails due to runtime failure too many resources required' )
358355 if verbose :
359356 print (f"skipping config { util .get_instance_string (instance .params )} reason: too many resources requested for launch" )
360- return util .RuntimeFailedConfig ()
357+ result [ objective ] = util .RuntimeFailedConfig ()
361358 else :
362359 logging .debug ('benchmark encountered runtime failure: ' + str (e ))
363360 print ("Error while benchmarking:" , instance .name )
@@ -408,61 +405,69 @@ def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, ve
408405 if not correct :
409406 raise RuntimeError ("Kernel result verification failed for: " + util .get_config_string (instance .params ))
410407
411- def compile_and_benchmark (self , kernel_source , gpu_args , params , kernel_options , tuning_options ):
408+ def compile_and_benchmark (self , kernel_source , gpu_args , params , kernel_options , to ):
412409 """ Compile and benchmark a kernel instance based on kernel strings and parameters """
413- start_compilation = time .perf_counter ()
414410 instance_string = util .get_instance_string (params )
415411
416412 # reset previous timers
417- self .last_compilation_time = None
418- self .last_verification_time = None
413+ last_compilation_time = None
414+ last_verification_time = None
415+ last_benchmark_time = None
419416
420417 logging .debug ('compile_and_benchmark ' + instance_string )
421418
422- verbose = tuning_options .verbose
419+ verbose = to .verbose
420+ result = {}
423421
424422 instance = self .create_kernel_instance (kernel_source , kernel_options , params , verbose )
425423 if isinstance (instance , util .ErrorConfig ):
426424 return instance
427425
428426 try :
429- #compile the kernel
427+ # compile the kernel
428+ start_compilation = time .perf_counter ()
430429 func = self .compile_kernel (instance , verbose )
431- if func is None :
432- return util .CompilationFailedConfig ()
433-
434- # add shared memory arguments to compiled module
435- if kernel_options .smem_args is not None :
436- self .dev .copy_shared_memory_args (util .get_smem_args (kernel_options .smem_args , params ))
437- # add constant memory arguments to compiled module
438- if kernel_options .cmem_args is not None :
439- self .dev .copy_constant_memory_args (kernel_options .cmem_args )
440- # add texture memory arguments to compiled module
441- if kernel_options .texmem_args is not None :
442- self .dev .copy_texture_memory_args (kernel_options .texmem_args )
430+ if not func :
431+ result [ to . objective ] = util .CompilationFailedConfig ()
432+ else :
433+ # add shared memory arguments to compiled module
434+ if kernel_options .smem_args is not None :
435+ self .dev .copy_shared_memory_args (util .get_smem_args (kernel_options .smem_args , params ))
436+ # add constant memory arguments to compiled module
437+ if kernel_options .cmem_args is not None :
438+ self .dev .copy_constant_memory_args (kernel_options .cmem_args )
439+ # add texture memory arguments to compiled module
440+ if kernel_options .texmem_args is not None :
441+ self .dev .copy_texture_memory_args (kernel_options .texmem_args )
443442
444443 # stop compilation stopwatch and convert to milliseconds
445- self . last_compilation_time = 1000 * (time .perf_counter () - start_compilation )
444+ last_compilation_time = 1000 * (time .perf_counter () - start_compilation )
446445
447- #test kernel for correctness and benchmark
448- start_verification = time .perf_counter ()
449- if tuning_options .answer is not None or tuning_options .verify is not None :
450- self .check_kernel_output (func , gpu_args , instance , tuning_options .answer , tuning_options .atol , tuning_options .verify , verbose )
451- # stop verification stopwatch and convert to miliseconds
452- self .last_verification_time = 1000 * (time .perf_counter () - start_verification )
446+ # test kernel for correctness
447+ if func and (to .answer or to .verify ):
448+ start_verification = time .perf_counter ()
449+ self .check_kernel_output (func , gpu_args , instance , to .answer , to .atol , to .verify , verbose )
450+ last_verification_time = 1000 * (time .perf_counter () - start_verification )
453451
454452 # benchmark
455- result = self .benchmark (func , gpu_args , instance , verbose )
453+ if func :
454+ start_benchmark = time .perf_counter ()
455+ result .update (self .benchmark (func , gpu_args , instance , verbose , to .objective ))
456+ last_benchmark_time = 1000 * (time .perf_counter () - start_benchmark )
456457
457458 except Exception as e :
458- #dump kernel_string to temp file
459+ # dump kernel sources to temp file
459460 temp_filenames = instance .prepare_temp_files_for_error_msg ()
460461 print ("Error while compiling or benchmarking, see source files: " + " " .join (temp_filenames ))
461462 raise e
462463
463464 #clean up any temporary files, if no error occurred
464465 instance .delete_temp_files ()
465466
467+ result ['compile_time' ] = last_compilation_time or 0
468+ result ['verification_time' ] = last_verification_time or 0
469+ result ['benchmark_time' ] = last_benchmark_time or 0
470+
466471 return result
467472
468473 def compile_kernel (self , instance , verbose ):
0 commit comments