Change how timings are collected in all runners

stijnh · stijnh · commit d844c197b86b · 2026-02-10T18:15:05.000+01:00
diff --git a/doc/source/parallel.rst b/doc/source/parallel.rst
@@ -69,7 +69,7 @@ Setting up Ray
 --------------
 
 Kernel Tuner uses `Ray <https://docs.ray.io/en/latest/>`_ to distribute kernel evaluations across multiple GPUs.
-ay is an open-source framework for distributed computing in Python.
+Ray is an open-source framework for distributed computing in Python.
 
 To use parallel tuning, you must first install Ray itself:
 
diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py
@@ -68,16 +68,13 @@ def get_environment(self):
     def run(self, params):
         # TODO: logging.debug("sequential runner started for " + self.kernel_options.kernel_name)
         result = None
-        warmup_time = 0
 
         # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result
         if not self.warmed_up:
-            warmup_time = perf_counter()
             self.dev.compile_and_benchmark(
                 self.kernel_source, self.gpu_args, params, self.kernel_options, self.tuning_options
             )
             self.warmed_up = True
-            warmup_time = 1e3 * (perf_counter() - warmup_time)
 
         result = self.dev.compile_and_benchmark(
             self.kernel_source, self.gpu_args, params, self.kernel_options, self.tuning_options
@@ -173,6 +170,8 @@ def __init__(
         observers,
         num_workers=None,
     ):
+        super().__init__()
+
         if not ray.is_initialized():
             ray.init()
 
@@ -209,8 +208,6 @@ def __init__(
         # TODO: Get units from the device?
         self.units = {"time": "ms"}
         self.quiet = device_options.quiet
-        self.start_time = perf_counter()
-        self.last_strategy_time = 0
 
         # Print some debugging information
         if tuning_options.verbose:
@@ -332,7 +329,7 @@ def run(self, parameter_space, tuning_options) -> List[Optional[dict]]:
             # Collect total time spent by worker
             total_worker_time += (
                 result["compile_time"] + result["verification_time"] + result["benchmark_time"]
-            )
+            ) / 1000
 
             # only compute metrics on configs that have not errored
             if not isinstance(result.get(objective), ErrorConfig):
@@ -365,19 +362,18 @@ def run(self, parameter_space, tuning_options) -> List[Optional[dict]]:
 
         # If there are valid results, set timings
         if num_valid_results > 0:
-            total_time = 1000 * (perf_counter() - self.start_time)
-            self.start_time = perf_counter()
+            total_time = self.timer.get_and_reset()
 
-            strategy_time = self.last_strategy_time
-            self.last_strategy_time = 0
+            strategy_time = self.accumulated_strategy_time
+            self.accumulated_strategy_time = 0
 
             runner_time = total_time - strategy_time
             framework_time = max(runner_time * len(self.workers) - total_worker_time, 0)
 
-            # Post-process all the results
+            # Amortize the time over all the results
             for result in results:
-                # Amortize the time over all the results
                 if result:
-                    result["strategy_time"] = strategy_time / num_valid_results
-                    result["framework_time"] = framework_time / num_valid_results
+                    # Timer values are in seconds; stored timings must be in ms
+                    result["strategy_time"] = 1000 * strategy_time / num_valid_results
+                    result["framework_time"] = 1000 * framework_time / num_valid_results
 
diff --git a/kernel_tuner/runners/runner.py b/kernel_tuner/runners/runner.py
@@ -3,15 +3,18 @@
 
 from abc import ABC, abstractmethod
 
+from kernel_tuner.util import Timer
+
 
 class Runner(ABC):
     """Base class for kernel_tuner runners"""
 
-    @abstractmethod
-    def __init__(
-        self, kernel_source, kernel_options, device_options, iterations, observers
-    ):
-        pass
+    def __init__(self):
+        self.timer = Timer()  # starts now; runners read it with get_and_reset() in run()
+        self.accumulated_strategy_time = 0  # seconds, summed by add_strategy_time()
+
+    def add_strategy_time(self, seconds):
+        self.accumulated_strategy_time += seconds  # running total; runners zero it after reading it in run()
 
     def shutdown(self):
         pass
diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py
@@ -5,7 +5,7 @@
 
 from kernel_tuner.core import DeviceInterface
 from kernel_tuner.runners.runner import Runner
-from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache, disable_benchmark_timings
+from kernel_tuner.util import ErrorConfig, Timer, print_config_output, process_metrics, store_cache, disable_benchmark_timings
 
 
 class SequentialRunner(Runner):
@@ -27,16 +27,14 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob
         :type iterations: int
         """
         # detect language and create high-level device interface
-        self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options)
+        super().__init__()
 
+        self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options)
         self.units = self.dev.units
         self.quiet = device_options.quiet
         self.kernel_source = kernel_source
         self.warmed_up = False if self.dev.requires_warmup else True
         self.simulation_mode = False
-        self.start_time = perf_counter()
-        self.last_strategy_start_time = self.start_time
-        self.last_strategy_time = 0
         self.kernel_options = kernel_options
 
         # move data to the GPU
@@ -64,7 +62,7 @@ def run(self, parameter_space, tuning_options):
         logging.debug("sequential runner started for " + self.kernel_options.kernel_name)
 
         results = []
-        total_worker_time = 0
+        worker_time = 0
 
         # iterate over parameter space
         for element in parameter_space:
@@ -88,21 +86,21 @@ def run(self, parameter_space, tuning_options):
             else:
                 # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result
                 if not self.warmed_up:
-                    warmup_time = perf_counter()
+                    warmup_timer = Timer()
                     self.dev.compile_and_benchmark(
                         self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options
                     )
                     self.warmed_up = True
-                    warmup_time = 1e3 * (perf_counter() - warmup_time)
+                    warmup_time = warmup_timer.get()
 
                 result = self.dev.compile_and_benchmark(
                     self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options
                 )
 
-                # Collect total time spent by worker
+                # Collect total time spent by worker in seconds
                 worker_time += (
                     result["compile_time"] + result["verification_time"] + result["benchmark_time"]
-                )
+                ) / 1000
 
                 params.update(result)
 
@@ -128,20 +126,17 @@ def run(self, parameter_space, tuning_options):
         num_valid_results = sum(bool(r) for r in results)  # Count the number of valid results
 
         if num_valid_results > 0:
-            # get the framework time by estimating based on other times
-            total_time = 1000 * (perf_counter() - self.start_time)
-            self.start_time = perf_counter()
-
-            strategy_time = self.last_strategy_time
-            self.last_strategy_time = 0
+            strategy_time = self.accumulated_strategy_time
+            self.accumulated_strategy_time = 0
 
+            # get the framework time by estimating based on other times
+            total_time = self.timer.get_and_reset() - warmup_time
             framework_time = max(total_time - strategy_time - worker_time, 0)
 
-            # Post-process all the results
+            # Amortize the time over all the results
             for result in results:
-                # Amortize the time over all the results
                 if result:
-                    result["strategy_time"] = strategy_time / num_valid_results
-                    result["framework_time"] = framework_time / num_valid_results
+                    result["strategy_time"] = 1000 * strategy_time / num_valid_results
+                    result["framework_time"] = 1000 * framework_time / num_valid_results
 
         return results
diff --git a/kernel_tuner/runners/simulation.py b/kernel_tuner/runners/simulation.py
@@ -46,17 +46,15 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob
         :param iterations: The number of iterations used for benchmarking each kernel instance.
         :type iterations: int
         """
+        super().__init__()
         self.quiet = device_options.quiet
         self.dev = SimulationDevice(1024, dict(device_name="Simulation"), self.quiet)
 
         self.kernel_source = kernel_source
         self.simulation_mode = True
         self.kernel_options = kernel_options
 
-        self.start_time = perf_counter()
         self.total_simulated_time = 0
-        self.last_strategy_start_time = self.start_time
-        self.last_strategy_time = 0
         self.visited_results = set()
         self.units = {}
 
@@ -85,9 +83,6 @@ def run(self, parameter_space, tuning_options):
 
         results = []
 
-        # self.last_strategy_time is set by cost_func
-        strategy_time_per_config = self.last_strategy_time / len(parameter_space) if len(parameter_space) > 0 else 0
-
         # iterate over parameter space
         for element in parameter_space:
 
@@ -120,9 +115,6 @@ def run(self, parameter_space, tuning_options):
                     util.print_config_output(tuning_options.tune_params, result, self.quiet, tuning_options.metrics, self.units)
                     self.visited_results.add(key)
 
-                # Everything but the strategy time and framework time are simulated,
-                result["strategy_time"] = strategy_time_per_config
-
                 # Simulate the evaluation of this configuration
                 tuning_options.budget.add_evaluations(1)
                 tuning_options.budget.add_time(milliseconds=result["compile_time"])
@@ -136,10 +128,6 @@ def run(self, parameter_space, tuning_options):
                         "Cannot use simulation mode with a time limit on a cache file that does not have full compile, verification, and benchmark timings on all configurations"
                     )
 
-                total_time = 1000 * (perf_counter() - self.start_time)
-                self.start_time = perf_counter()
-                result["framework_time"] = total_time
-
                 results.append(result)
                 continue
 
@@ -148,12 +136,6 @@ def run(self, parameter_space, tuning_options):
             check = util.check_restrictions(tuning_options.restrictions, params_dict, True)
             if not check:
                 result = util.disable_benchmark_timings(params_dict) # Set timings to zero
-                result['strategy_time'] = strategy_time_per_config
-
-                total_time = 1000 * (perf_counter() - self.start_time)
-                self.start_time = perf_counter()
-                result['framework_time'] = total_time
-
                 result[tuning_options.objective] = util.InvalidConfig()
                 results.append(result)
                 warn(f"Configuration {element} not in cache, does not pass restrictions. Will be treated as an InvalidConfig, but make sure you are evaluating the correct cache file.")
@@ -164,4 +146,21 @@ def run(self, parameter_space, tuning_options):
             logging.debug(err_string)
             raise ValueError(f"{err_string} - in simulation mode, all configurations must be present in the cache")
 
+        num_valid_results = sum(bool(r) for r in results)
+        if num_valid_results:
+            total_time = self.timer.get_and_reset()
+
+            strategy_time = self.accumulated_strategy_time
+            self.accumulated_strategy_time = 0
+
+            framework_time = max(total_time - strategy_time, 0)
+
+            # Amortize the time over all the results
+            for result in results:
+                if result:
+                    # Timer values are in seconds; stored timings must be in ms
+                    result["strategy_time"] = 1000 * strategy_time / num_valid_results
+                    result["framework_time"] = 1000 * framework_time / num_valid_results
+
+
         return results
diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py
@@ -98,6 +98,7 @@ def __init__(
         self.results = []
         self.budget_spent_fraction = 0.0
         self.invalid_return_value = invalid_value
+        self.strategy_timer = util.Timer()
 
     def _normalize_and_validate_config(self, x, check_restrictions=True):
         # snap values in x to nearest actual value for each parameter, unscale x if needed
@@ -129,8 +130,8 @@ def _normalize_and_validate_config(self, x, check_restrictions=True):
 
     def _run_configs(self, xs, check_restrictions=True):
         """ Takes a list of Euclidian coordinates and evaluates the configurations at those points. """
-        self.runner.last_strategy_time += 1000 * (perf_counter() - self.runner.last_strategy_start_time)
-        self.runner.start_time = perf_counter() # start framework time
+        strategy_time = self.strategy_timer.get()
+        self.runner.add_strategy_time(strategy_time)
 
         # error value to return for numeric optimizers that need a numerical value
         logging.debug("_cost_func called")
@@ -176,9 +177,6 @@ def _run_configs(self, xs, check_restrictions=True):
             self.unique_results.setdefault(key, result)
             self.results.append(result)
 
-        # upon returning from this function control will be given back to the strategy, so reset the start time
-        self.runner.last_strategy_start_time = perf_counter()
-
         # this check is necessary because some strategies cannot handle partially completed requests
         # for example when only half of the configs in a population have been evaluated
         self.tuning_options.budget.raise_exception_if_done()
@@ -189,6 +187,9 @@ def _run_configs(self, xs, check_restrictions=True):
         if not all(final_results):
             raise util.StopCriterionReached("runner did not evaluate all given configurations")
 
+        # upon returning from this function control will be given back to the strategy, so reset the start time
+        self.strategy_timer.reset()
+
         return final_results
 
     def eval_all(self, xs, check_restrictions=True):
diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py
@@ -188,6 +188,26 @@ def check_argument_list(kernel_name, kernel_string, args):
         warnings.warn(errors[0], UserWarning)
 
 
+class Timer:
+    """Stopwatch on time.perf_counter_ns() that reports elapsed seconds."""
+
+    def __init__(self):
+        self._start_ns = time.perf_counter_ns()
+
+    def get(self) -> float:
+        """Seconds elapsed since construction or the most recent reset."""
+        return (time.perf_counter_ns() - self._start_ns) * 1e-9
+
+    def get_and_reset(self) -> float:
+        """Seconds elapsed so far; the timer restarts from the same reading."""
+        previous_ns, self._start_ns = self._start_ns, time.perf_counter_ns()
+        return (self._start_ns - previous_ns) * 1e-9
+
+    def reset(self) -> None:
+        """Restart the timer and discard the elapsed time."""
+        self.get_and_reset()
+
+
 class TuningBudget:
     def __init__(self, time_limit=None, max_fevals=None):
         if time_limit is not None and not isinstance(time_limit, timedelta):
@@ -199,7 +219,7 @@ def __init__(self, time_limit=None, max_fevals=None):
         if time_limit is not None and time_limit <= timedelta(seconds=0):
             raise ValueError("time_limit must be greater than zero")
 
-        self.start_time_seconds = time.perf_counter()
+        self.start_timer = Timer()
         self.time_spent_extra = timedelta()
         self.time_limit = time_limit
         self.num_fevals = 0
@@ -212,7 +232,7 @@ def add_time(self, seconds=0, milliseconds=0):
         self.time_spent_extra += timedelta(seconds=seconds, milliseconds=milliseconds)
     
     def get_time_spent(self) -> timedelta:
-        seconds_passed = time.perf_counter() - self.start_time_seconds
+        seconds_passed = self.start_timer.get()
         return timedelta(seconds=seconds_passed) + self.time_spent_extra
     
     def get_time_remaining(self) -> timedelta:
@@ -259,7 +279,6 @@ def get_fraction_consumed(self) -> float:
 
     
 
-
 def check_tune_params_list(tune_params, observers, simulation_mode=False):
     """Raise an exception if a tune parameter has a forbidden name."""
     forbidden_names = ("grid_size_x", "grid_size_y", "grid_size_z", "time")
diff --git a/test/strategies/test_common.py b/test/strategies/test_common.py
@@ -20,7 +20,6 @@ def fake_runner():
         'time': 5
     }
     runner = Mock()
-    runner.last_strategy_start_time = perf_counter()
     runner.run.return_value = [fake_result]
     return runner
 

Original file line number	Diff line number	Diff line change
`@@ -20,7 +20,6 @@ def fake_runner():`
`20`	`20`	`'time': 5`
`21`	`21`	`}`
`22`	`22`	`runner = Mock()`
`23`		`- runner.last_strategy_start_time = perf_counter()`
`24`	`23`	`runner.run.return_value = [fake_result]`
`25`	`24`	`return runner`
`26`	`25`