
Commit ceaa96c

Move check for non-unique configuration from CostFunc to runners

1 parent a1c87db

File tree

4 files changed: +93 -86 lines


kernel_tuner/runners/parallel.py

Lines changed: 44 additions & 29 deletions
```diff
@@ -291,43 +291,53 @@ def run(self, parameter_space, tuning_options) -> List[Optional[dict]]:
         jobs = []  # Jobs that need to be executed
         results = []  # Results that will be returned at the end
         key2index = dict()  # Used to insert job result back into `results`
-
-        total_worker_time = 0
+        duplicate_entries = []  # Stores (i, j) if `i` is a duplicate of `j`.
 
         # Select jobs which are not in the cache
         for index, config in enumerate(parameter_space):
             params = dict(zip(tuning_options.tune_params.keys(), config))
             key = ",".join([str(i) for i in config])
 
+            # Element is in cache
             if key in tuning_options.cache:
-                cache_entry = tuning_options.cache[key]
-
                 # We must disable the timings as otherwise these will be counted
                 # as part of the total_compile/benchmark/verification_time
-                results.append(disable_benchmark_timings(cache_entry))
+                result = disable_benchmark_timings(tuning_options.cache[key])
+
+                # recompute metrics for this entry
+                result = process_metrics(result, metrics)
+
+                results.append(result)
+
+            # Element is a duplicate entry in `parameter_space`
+            elif key in key2index:
+                duplicate_entries.append((index, key2index[key]))
+                results.append(None)
+
+            # Element must become a job
             else:
-                assert key not in key2index, "duplicate jobs submitted"
                 key2index[key] = index
-
                 jobs.append((key, params))
                 results.append(None)
 
+        total_worker_time = 0
+
         # Submit jobs and wait for them to finish
         for key, result in self.submit_jobs(jobs, tuning_options.budget):
             # `None` indicates that no result is available since the budget is exceeded.
             # We can skip it, meaning that `results` contains `None`s for these entries
             if result is None:
                 continue
 
-            # Store the result into the output array
-            results[key2index[key]] = result
-
             # Collect total time spent by worker
             total_worker_time += (
                 result["compile_time"] + result["verification_time"] + result["benchmark_time"]
             )
 
-            if isinstance(result.get(objective), ErrorConfig):
+            # only compute metrics on configs that have not errored
+            if not isinstance(result.get(objective), ErrorConfig):
+                result = process_metrics(result, metrics)
+            else:
                 logging.error(
                     "kernel configuration {key} was skipped silently due to compile or runtime failure",
                     key,
@@ -341,29 +351,34 @@ def run(self, parameter_space, tuning_options) -> List[Optional[dict]]:
             # add configuration to cache
             store_cache(key, result, tuning_options.cachefile, tuning_options.cache)
 
-        total_time = 1000 * (perf_counter() - self.start_time)
-        self.start_time = perf_counter()
+            # Store the result into the output array
+            results[key2index[key]] = result
 
-        strategy_time = self.last_strategy_time
-        self.last_strategy_time = 0
+        # Fix duplicate entries. Duplicate entries do not get benchmark timings
+        # as otherwise we would count them multiple times in the total
+        for i, j in duplicate_entries:
+            if results[j]:
+                results[i] = disable_benchmark_timings(results[j])
 
-        runner_time = total_time - strategy_time
-        framework_time = max(runner_time * len(self.workers) - total_worker_time, 0)
+        # Count the number of valid results
+        num_valid_results = sum(bool(r) for r in results)
 
-        num_valid_results = sum(bool(r) for r in results)  # Count the number of valid results
+        # If there are valid results, set timings
+        if num_valid_results > 0:
+            total_time = 1000 * (perf_counter() - self.start_time)
+            self.start_time = perf_counter()
 
-        # Post-process all the results
-        for result in results:
-            # Skip missing results
-            if not result:
-                continue
+            strategy_time = self.last_strategy_time
+            self.last_strategy_time = 0
 
-            # Amortize the time over all the results
-            result["strategy_time"] = strategy_time / num_valid_results
-            result["framework_time"] = framework_time / num_valid_results
+            runner_time = total_time - strategy_time
+            framework_time = max(runner_time * len(self.workers) - total_worker_time, 0)
 
-            # only compute metrics on configs that have not errored
-            if not isinstance(result.get(objective), ErrorConfig):
-                result = process_metrics(result, metrics)
+            # Post-process all the results
+            for result in results:
+                # Amortize the time over all the results
+                if result:
+                    result["strategy_time"] = strategy_time / num_valid_results
+                    result["framework_time"] = framework_time / num_valid_results
 
         return results
```

kernel_tuner/runners/sequential.py

Lines changed: 25 additions & 17 deletions
```diff
@@ -64,9 +64,7 @@ def run(self, parameter_space, tuning_options):
         logging.debug("sequential runner started for " + self.kernel_options.kernel_name)
 
         results = []
-
-        # self.last_strategy_time is set by cost_func
-        strategy_time_per_config = self.last_strategy_time / len(parameter_space) if len(parameter_space) > 0 else 0
+        total_worker_time = 0
 
         # iterate over parameter space
         for element in parameter_space:
@@ -101,15 +99,22 @@ def run(self, parameter_space, tuning_options):
                 self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options
             )
 
+            # Collect total time spent by worker
+            total_worker_time += (
+                result["compile_time"] + result["verification_time"] + result["benchmark_time"]
+            )
+
             params.update(result)
 
-            if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig):
+            if isinstance(result.get(tuning_options.objective), ErrorConfig):
                 logging.debug("kernel configuration was skipped silently due to compile or runtime failure")
 
             # only compute metrics on configs that have not errored
-            if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig):
+            if not isinstance(params.get(tuning_options.objective), ErrorConfig):
                 params = process_metrics(params, tuning_options.metrics)
 
+            params["timestamp"] = str(datetime.now(timezone.utc))
+
             if result:
                 # print configuration to the console
                 print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units)
@@ -120,20 +125,23 @@ def run(self, parameter_space, tuning_options):
             # all visited configurations are added to results to provide a trace for optimization strategies
             results.append(params)
 
+        num_valid_results = sum(bool(r) for r in results)  # Count the number of valid results
+
+        if num_valid_results > 0:
             # get the framework time by estimating based on other times
-            total_time = 1000 * (perf_counter() - self.start_time) - warmup_time
+            total_time = 1000 * (perf_counter() - self.start_time)
             self.start_time = perf_counter()
 
-            params["strategy_time"] = strategy_time_per_config
-            params["framework_time"] = max(
-                total_time
-                - (
-                    params["compile_time"]
-                    + params["verification_time"]
-                    + params["benchmark_time"]
-                ),
-                0,
-            )
-            params["timestamp"] = str(datetime.now(timezone.utc))
+            strategy_time = self.last_strategy_time
+            self.last_strategy_time = 0
+
+            framework_time = max(total_time - strategy_time - total_worker_time, 0)
+
+            # Post-process all the results
+            for result in results:
+                # Amortize the time over all the results
+                if result:
+                    result["strategy_time"] = strategy_time / num_valid_results
+                    result["framework_time"] = framework_time / num_valid_results
 
         return results
```
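
The timing model can be summarized as: whatever wall-clock time is not accounted for by the strategy or the workers is attributed to the framework, and both strategy and framework time are then amortized evenly over the valid results. A small sketch with made-up numbers (field names follow the diff; `total_time` and `strategy_time` are hypothetical stand-ins for the `perf_counter()`-derived values):

```python
results = [
    {"compile_time": 120.0, "verification_time": 5.0, "benchmark_time": 300.0},
    None,  # e.g. budget exhausted: no result for this slot
    {"compile_time": 90.0, "verification_time": 4.0, "benchmark_time": 250.0},
]

# Time the workers actually spent compiling, verifying, and benchmarking
total_worker_time = sum(
    r["compile_time"] + r["verification_time"] + r["benchmark_time"]
    for r in results if r
)

total_time = 900.0    # hypothetical wall-clock time (ms) since start_time
strategy_time = 50.0  # hypothetical accumulated strategy time (ms)

num_valid_results = sum(bool(r) for r in results)
if num_valid_results > 0:
    # Unexplained wall-clock time is attributed to the framework, clamped at 0
    framework_time = max(total_time - strategy_time - total_worker_time, 0)
    for result in results:
        if result:
            result["strategy_time"] = strategy_time / num_valid_results
            result["framework_time"] = framework_time / num_valid_results

print(results[0]["framework_time"])  # (900 - 50 - 769) / 2 = 40.5
```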

kernel_tuner/runners/simulation.py

Lines changed: 13 additions & 4 deletions
```diff
@@ -57,6 +57,7 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob
         self.total_simulated_time = 0
         self.last_strategy_start_time = self.start_time
         self.last_strategy_time = 0
+        self.visited_results = set()
         self.units = {}
 
     def get_device_info(self):
@@ -106,10 +107,18 @@ def run(self, parameter_space, tuning_options):
             if tuning_options.metrics and not isinstance(result.get(tuning_options.objective), util.ErrorConfig):
                 result = util.process_metrics(result, tuning_options.metrics)
 
-            # configuration is evaluated for the first time, print to the console
-            util.print_config_output(
-                tuning_options.tune_params, result, self.quiet, tuning_options.metrics, self.units
-            )
+            # Simulate the behavior of the sequential runner: when a configuration
+            # is served from the cache, the compile_time, verification_time, and
+            # benchmark_time are set to 0. In the simulation runner this step is
+            # only performed when a configuration is served from the cache beyond
+            # the first time, that is, when the configuration is already counted
+            # towards the unique_results.
+            if key in self.visited_results:
+                result = util.disable_benchmark_timings(result)
+            else:
+                # configuration is evaluated for the first time, print to the console
+                util.print_config_output(tuning_options.tune_params, result, self.quiet, tuning_options.metrics, self.units)
+                self.visited_results.add(key)
 
             # Everything but the strategy time and framework time are simulated,
             result["strategy_time"] = strategy_time_per_config
```

kernel_tuner/strategies/common.py

Lines changed: 11 additions & 36 deletions
```diff
@@ -129,7 +129,7 @@ def _normalize_and_validate_config(self, x, check_restrictions=True):
 
     def _run_configs(self, xs, check_restrictions=True):
         """ Takes a list of Euclidean coordinates and evaluates the configurations at those points. """
-        self.runner.last_strategy_time = 1000 * (perf_counter() - self.runner.last_strategy_start_time)
+        self.runner.last_strategy_time += 1000 * (perf_counter() - self.runner.last_strategy_start_time)
         self.runner.start_time = perf_counter()  # start framework time
 
         # error value to return for numeric optimizers that need a numerical value
@@ -138,68 +138,43 @@ def _run_configs(self, xs, check_restrictions=True):
         # check if max_fevals is reached or time limit is exceeded
         self.tuning_options.budget.raise_exception_if_done()
 
+        batch_indices = []  # Where to store the result in `final_results`
         batch_configs = []  # The configs to run
-        batch_keys = []  # The keys of the configs to run
-        pending_indices_by_key = dict()  # Maps key => where to store result in `final_results`
         final_results = []  # List returned to the user
-        legal_indices = []  # Indices in `final_results` that are legal
 
-        # Loop over all configurations. For each configuration there are four cases:
-        # 1. The configuration is invalid, we can skip it
-        # 2. The configuration is in `unique_results`, we can get it from there
-        # 3. The configuration is in `pending_indices_by_key`, it is a duplicate in `xs`
-        # 4. The configuration must be evaluated by the runner.
+        # Loop over all configurations.
         for index, x in enumerate(xs):
             config, is_legal = self._normalize_and_validate_config(x, check_restrictions=check_restrictions)
             logging.debug("normalize config: %s -> %s (legal: %s)", str(x), str(config), is_legal)
             key = ",".join([str(i) for i in config])
 
-            # 1. Not legal, just return `InvalidConfig`
+            # Not legal, just return `InvalidConfig`
             if not is_legal:
                 result = dict(zip(self.searchspace.tune_params.keys(), config))
                 result[self.objective] = util.InvalidConfig()
                 final_results.append(result)
 
-            # 2. Attempt to retrieve from `unique_results`
-            elif key in self.unique_results:
-                result = dict(self.unique_results[key])
-                legal_indices.append(index)
-                final_results.append(result)
-
-            # 3. We have already seen this config in the current batch
-            elif key in pending_indices_by_key:
-                pending_indices_by_key[key].append(index)
-                final_results.append(None)
-
-            # 4. A new config, we must evaluate this
+            # Legal config, we must evaluate this
             else:
-                batch_keys.append(key)
+                batch_indices.append(index)
                 batch_configs.append(config)
-                pending_indices_by_key[key] = [index]
                 final_results.append(None)
 
         # compile and benchmark the batch
         batch_results = self.runner.run(batch_configs, self.tuning_options)
 
-        for key, result in zip(batch_keys, batch_results):
+        for index, config, result in zip(batch_indices, batch_configs, batch_results):
             # Skip. Result is missing because the runner has exhausted the budget
             if result is None:
                 continue
 
             # set in the results array
-            for index in pending_indices_by_key[key]:
-                legal_indices.append(index)
-                final_results[index] = result
-
-            # Disable the timings. Only the first result must get these.
-            result = util.disable_benchmark_timings(result)
+            final_results[index] = result
 
             # Put result in `unique_results`
-            self.unique_results[key] = result
-
-        # Only things in `legal_indices` are valid results
-        for index in sorted(legal_indices):
-            self.results.append(final_results[index])
+            key = ",".join([str(i) for i in config])
+            self.unique_results.setdefault(key, result)
+            self.results.append(result)
 
         # upon returning from this function control will be given back to the strategy, so reset the start time
         self.runner.last_strategy_start_time = perf_counter()
```
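
One detail worth noting is the switch to `dict.setdefault` for `unique_results`: since the runners now hand back duplicate results with their timings zeroed, `setdefault` appears intended to keep the first (fully timed) result for a key and never overwrite it with a later duplicate. A minimal illustration with made-up values:

```python
unique_results = {}

first = {"time": 1.23, "benchmark_time": 10.0}  # first evaluation, timings kept
dup = {"time": 1.23, "benchmark_time": 0.0}     # duplicate, timings zeroed

unique_results.setdefault("16,1", first)
unique_results.setdefault("16,1", dup)  # no-op: key already present

assert unique_results["16,1"]["benchmark_time"] == 10.0
```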
