KernelTuner
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 5 additions & 0 deletions
diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
Lines changed: 42 additions & 37 deletions
diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py
Lines changed: 3 additions & 2 deletions
diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py
Lines changed: 37 additions & 30 deletions
@@ -4,6 +4,11 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
 
+### Added
+- Support for using time_limit in simulation mode
+
+### Changed
+- Changed what timings are stored in cache files
 
 ## [0.4.3] - 2022-10-19
 ### Added
 
@@ -259,8 +259,6 @@ def __init__(self, kernel_source, device=0, platform=0, quiet=False, compiler=No
         self.units = dev.units
         self.name = dev.name
         self.max_threads = dev.max_threads
-        self.last_compilation_time = None
-        self.last_verification_time = None
         if not quiet:
             print("Using: " + self.dev.name)
 
@@ -317,7 +315,7 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
 
 
 
-    def benchmark(self, func, gpu_args, instance, verbose):
+    def benchmark(self, func, gpu_args, instance, verbose, objective):
         """benchmark the kernel instance"""
         logging.debug('benchmark ' + instance.name)
         logging.debug('thread block dimensions x,y,z=%d,%d,%d', *instance.threads)
@@ -333,9 +331,8 @@ def benchmark(self, func, gpu_args, instance, verbose):
             if "nvml_mem_clock" in instance.params:
                 self.nvml.mem_clock = instance.params["nvml_mem_clock"]
 
-        result = None
+        result = {}
         try:
-            result = dict()
             self.benchmark_default(func, gpu_args, instance.threads, instance.grid, result)
 
             if self.continuous_observers:
@@ -348,16 +345,16 @@ def benchmark(self, func, gpu_args, instance, verbose):
 
 
         except Exception as e:
-            #some launches may fail because too many registers are required
-            #to run the kernel given the current thread block size
-            #the desired behavior is to simply skip over this configuration
-            #and proceed to try the next one
+            # some launches may fail because too many registers are required
+            # to run the kernel given the current thread block size
+            # the desired behavior is to simply skip over this configuration
+            # and proceed to try the next one
             skippable_exceptions = ["too many resources requested for launch", "OUT_OF_RESOURCES", "INVALID_WORK_GROUP_SIZE"]
             if any([skip_str in str(e) for skip_str in skippable_exceptions]):
                 logging.debug('benchmark fails due to runtime failure too many resources required')
                 if verbose:
                     print(f"skipping config {util.get_instance_string(instance.params)} reason: too many resources requested for launch")
-                return util.RuntimeFailedConfig()
+                result[objective] = util.RuntimeFailedConfig()
             else:
                 logging.debug('benchmark encountered runtime failure: ' + str(e))
                 print("Error while benchmarking:", instance.name)
@@ -408,61 +405,69 @@ def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, ve
         if not correct:
             raise RuntimeError("Kernel result verification failed for: " + util.get_config_string(instance.params))
 
-    def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, tuning_options):
+    def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, to):
         """ Compile and benchmark a kernel instance based on kernel strings and parameters """
-        start_compilation = time.perf_counter()
         instance_string = util.get_instance_string(params)
 
         # reset previous timers
-        self.last_compilation_time = None
-        self.last_verification_time = None
+        last_compilation_time = None
+        last_verification_time = None
+        last_benchmark_time = None
 
         logging.debug('compile_and_benchmark ' + instance_string)
 
-        verbose = tuning_options.verbose
+        verbose = to.verbose
+        result = {}
 
         instance = self.create_kernel_instance(kernel_source, kernel_options, params, verbose)
         if isinstance(instance, util.ErrorConfig):
             return instance
 
         try:
-            #compile the kernel
+            # compile the kernel
+            start_compilation = time.perf_counter()
             func = self.compile_kernel(instance, verbose)
-            if func is None:
-                return util.CompilationFailedConfig()
-
-            #add shared memory arguments to compiled module
-            if kernel_options.smem_args is not None:
-                self.dev.copy_shared_memory_args(util.get_smem_args(kernel_options.smem_args, params))
-            #add constant memory arguments to compiled module
-            if kernel_options.cmem_args is not None:
-                self.dev.copy_constant_memory_args(kernel_options.cmem_args)
-            #add texture memory arguments to compiled module
-            if kernel_options.texmem_args is not None:
-                self.dev.copy_texture_memory_args(kernel_options.texmem_args)
+            if not func:
+                result[to.objective] = util.CompilationFailedConfig()
+            else:
+                # add shared memory arguments to compiled module
+                if kernel_options.smem_args is not None:
+                    self.dev.copy_shared_memory_args(util.get_smem_args(kernel_options.smem_args, params))
+                # add constant memory arguments to compiled module
+                if kernel_options.cmem_args is not None:
+                    self.dev.copy_constant_memory_args(kernel_options.cmem_args)
+                # add texture memory arguments to compiled module
+                if kernel_options.texmem_args is not None:
+                    self.dev.copy_texture_memory_args(kernel_options.texmem_args)
 
             # stop compilation stopwatch and convert to milliseconds
-            self.last_compilation_time = 1000 * (time.perf_counter() - start_compilation)
+            last_compilation_time = 1000 * (time.perf_counter() - start_compilation)
 
-            #test kernel for correctness and benchmark
-            start_verification = time.perf_counter()
-            if tuning_options.answer is not None or tuning_options.verify is not None:
-                self.check_kernel_output(func, gpu_args, instance, tuning_options.answer, tuning_options.atol, tuning_options.verify, verbose)
-            # stop verification stopwatch and convert to miliseconds
-            self.last_verification_time = 1000 * (time.perf_counter() - start_verification)
+            # test kernel for correctness
+            if func and (to.answer or to.verify):
+                start_verification = time.perf_counter()
+                self.check_kernel_output(func, gpu_args, instance, to.answer, to.atol, to.verify, verbose)
+                last_verification_time = 1000 * (time.perf_counter() - start_verification)
 
             # benchmark
-            result = self.benchmark(func, gpu_args, instance, verbose)
+            if func:
+                start_benchmark = time.perf_counter()
+                result.update(self.benchmark(func, gpu_args, instance, verbose, to.objective))
+                last_benchmark_time = 1000 * (time.perf_counter() - start_benchmark)
 
         except Exception as e:
-            #dump kernel_string to temp file
+            # dump kernel sources to temp file
             temp_filenames = instance.prepare_temp_files_for_error_msg()
             print("Error while compiling or benchmarking, see source files: " + " ".join(temp_filenames))
             raise e
 
         #clean up any temporary files, if no error occurred
         instance.delete_temp_files()
 
+        result['compile_time'] = last_compilation_time or 0
+        result['verification_time'] = last_verification_time or 0
+        result['benchmark_time'] = last_benchmark_time or 0
+
         return result
 
     def compile_kernel(self, instance, verbose):
 
@@ -352,7 +352,7 @@ def tune_kernel(kernel_name, kernel_source, problem_size, arguments, tune_params
     objective, objective_higher_is_better = get_objective_defaults(objective, objective_higher_is_better)
 
     # check for forbidden names in tune parameters
-    util.check_tune_params_list(tune_params)
+    util.check_tune_params_list(tune_params, observers)
 
     # check whether block_size_names are used as expected
     util.check_block_size_params_names_list(block_size_names, tune_params)
@@ -415,7 +415,8 @@ def tune_kernel(kernel_name, kernel_source, problem_size, arguments, tune_params
         strategy = brute_force
 
     # select the runner for this job based on input
-    selected_runner = SimulationRunner if simulation_mode is True else SequentialRunner
+    selected_runner = SimulationRunner if simulation_mode else SequentialRunner
+    tuning_options.simulated_time = 0
     runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers)
 
     # the user-specified function may or may not have an optional atol argument;
 
@@ -1,13 +1,14 @@
 """ The default runner for sequentially tuning the parameter space """
-from collections import OrderedDict
 import logging
+from collections import OrderedDict
 from time import perf_counter
 
-from kernel_tuner.util import get_config_string, store_cache, process_metrics, print_config_output, ErrorConfig
 from kernel_tuner.core import DeviceInterface
+from kernel_tuner.util import (ErrorConfig, print_config_output,
+                               process_metrics, store_cache)
 
 
-class SequentialRunner(object):
+class SequentialRunner:
     """ SequentialRunner is used for tuning with a single process/thread """
 
     def __init__(self, kernel_source, kernel_options, device_options, iterations, observers):
@@ -36,7 +37,9 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob
         self.kernel_source = kernel_source
         self.warmed_up = False
         self.simulation_mode = False
-        self.last_strategy_start_time = perf_counter()
+        self.start_time = perf_counter()
+        self.last_strategy_start_time = self.start_time
+        self.last_strategy_time = 0
 
         #move data to the GPU
         self.gpu_args = self.dev.ready_argument_list(kernel_options.arguments)
@@ -64,47 +67,51 @@ def run(self, parameter_space, kernel_options, tuning_options):
 
         results = []
 
-        #iterate over parameter space
+        # iterate over parameter space
         for element in parameter_space:
             params = OrderedDict(zip(tuning_options.tune_params.keys(), element))
 
-            #attempt to warmup the GPU by running the first config in the parameter space and ignoring the result
+            # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result
+            warmup_time = 0
             if not self.warmed_up:
+                warmup_time = perf_counter()
                 self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, kernel_options, tuning_options)
                 self.warmed_up = True
+                warmup_time = 1e3 * (perf_counter() - warmup_time)
+
+            result = None
 
-            #check if element is in the cache
+            # check if configuration is in the cache
             x_int = ",".join([str(i) for i in element])
             if tuning_options.cache and x_int in tuning_options.cache:
-                results.append(tuning_options.cache[x_int])
-                continue
-
-            result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, kernel_options, tuning_options)
+                params.update(tuning_options.cache[x_int])
+                params['compile_time'] = 0
+                params['verification_time'] = 0
+                params['benchmark_time'] = 0
+            else:
+                result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, kernel_options, tuning_options)
 
-            if self.dev.last_compilation_time is not None:
-                params['compile_time'] = self.dev.last_compilation_time
-            if self.dev.last_verification_time is not None:
-                params['verification_time'] = self.dev.last_verification_time
+                params.update(result)
 
-            if isinstance(result, ErrorConfig):
-                logging.debug('kernel configuration was skipped silently due to compile or runtime failure')
-                params.update({ tuning_options.objective: result })
-                store_cache(x_int, params, tuning_options)
-                results.append(params)
-                continue
+                # only compute metrics on configs that have not errored
+                if isinstance(result[tuning_options.objective], ErrorConfig):
+                    logging.debug('kernel configuration was skipped silently due to compile or runtime failure')
+                elif tuning_options.metrics:
+                    params = process_metrics(params, tuning_options.metrics)
 
-            # print and append to results
-            if not isinstance(result, dict):
-                params[tuning_options.objective] = result
-            else:
-                params.update(result)
+                # print configuration to the console
+                print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units)
 
-            if tuning_options.metrics:
-                params = process_metrics(params, tuning_options.metrics)
+            # get the framework time by estimating based on other times
+            total_time = 1000 * (perf_counter() - self.start_time) - warmup_time
+            params['strategy_time'] = self.last_strategy_time
+            params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0)
+            self.start_time = perf_counter()
 
-            print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units)
+            if result:
+                store_cache(x_int, params, tuning_options)
 
-            store_cache(x_int, params, tuning_options)
+            # all visited configurations are added to results to provide a trace for optimization strategies
             results.append(params)
 
         return results, self.dev.get_environment()