Commit dbcb89d

Completed merge with updated custom_strategies

2 parents: 72b615b + a4a69ae

6 files changed: +85 -57 lines
kernel_tuner/backends/hip.py (18 additions, 13 deletions)

@@ -19,7 +19,6 @@
     "bool": ctypes.c_bool,
     "int8": ctypes.c_int8,
     "int16": ctypes.c_int16,
-    "float16": ctypes.c_int16,
     "int32": ctypes.c_int32,
     "int64": ctypes.c_int64,
     "uint8": ctypes.c_uint8,
@@ -40,7 +39,9 @@ def hip_check(call_result):
     if len(result) == 1:
         result = result[0]
     if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
-        raise RuntimeError(str(err), hip.hipGetLastError())
+        _, error_name = hip.hipGetErrorName(err)
+        _, error_str = hip.hipGetErrorString(err)
+        raise RuntimeError(f"{error_name}: {error_str}")
     return result


@@ -120,25 +121,29 @@ def ready_argument_list(self, arguments):

             # Handle numpy arrays
             if isinstance(arg, np.ndarray):
-                if dtype_str in dtype_map.keys():
-                    # Allocate device memory
-                    device_ptr = hip_check(hip.hipMalloc(arg.nbytes))
+                # Allocate device memory
+                device_ptr = hip_check(hip.hipMalloc(arg.nbytes))

-                    # Copy data to device using hipMemcpy
-                    hip_check(hip.hipMemcpy(device_ptr, arg, arg.nbytes, hip.hipMemcpyKind.hipMemcpyHostToDevice))
+                # Copy data to device using hipMemcpy
+                hip_check(hip.hipMemcpy(device_ptr, arg, arg.nbytes, hip.hipMemcpyKind.hipMemcpyHostToDevice))

-                    prepared_args.append(device_ptr)
-                else:
-                    raise TypeError(f"Unknown dtype {dtype_str} for ndarray")
+                prepared_args.append(device_ptr)

             # Handle numpy scalar types
             elif isinstance(arg, np.generic):
                 # Convert numpy scalar to corresponding ctypes
-                ctype_arg = dtype_map[dtype_str](arg)
-                prepared_args.append(ctype_arg)
+                if dtype_str in dtype_map:
+                    ctype_arg = dtype_map[dtype_str](arg)
+                    prepared_args.append(ctype_arg)
+                # 16-bit float is not supported, view it as uint16
+                elif dtype_str in ("float16", "bfloat16"):
+                    ctype_arg = ctypes.c_uint16(arg.view(np.uint16))
+                    prepared_args.append(ctype_arg)
+                else:
+                    raise ValueError(f"Invalid argument type {dtype_str}: {arg}")

             else:
-                raise ValueError(f"Invalid argument type {type(arg)}, {arg}")
+                raise ValueError(f"Invalid argument type {type(arg)}: {arg}")

         return prepared_args
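Note on the float16 handling: ctypes has no 16-bit float type, so the new scalar path passes the raw bits of a half-precision value through as an unsigned 16-bit integer. A minimal standalone sketch of the same bit-view trick (plain numpy/ctypes, independent of the HIP backend):

import ctypes
import numpy as np

# ctypes lacks c_float16, so reinterpret the 16 bits as an unsigned integer.
# The kernel still receives the original bit pattern and can treat it as half.
half = np.float16(1.5)
bits = half.view(np.uint16)        # same 16 bits, integer dtype
ctype_arg = ctypes.c_uint16(bits)

print(hex(ctype_arg.value))        # 0x3e00, the IEEE-754 binary16 encoding of 1.5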

kernel_tuner/core.py (2 additions, 2 deletions)

@@ -509,7 +509,7 @@ def check_kernel_output(
         # run the kernel
         check = self.run_kernel(func, gpu_args, instance)
         if not check:
-            # runtime failure occured that should be ignored, skip correctness check
+            # runtime failure occurred that should be ignored, skip correctness check
             return

         # retrieve gpu results to host memory
@@ -908,7 +908,7 @@ def split_argument_list(argument_list):
         match = re.match(regex, arg, re.S)
         if not match:
             raise ValueError("error parsing templated kernel argument list")
-        type_list.append(re.sub(r"\s+", " ", match.group(1).strip(), re.S))
+        type_list.append(re.sub(r"\s+", " ", match.group(1).strip(), flags=re.S))
         name_list.append(match.group(2).strip())
     return type_list, name_list
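Note on the re.sub change: the fourth positional parameter of re.sub is count, not flags, so the old call passed re.S (an IntFlag with value 16) as a substitution limit instead of a matching flag. A small demonstration of the pitfall:

import re

text = "  ".join("abcdefghijklmnopqr")  # 18 letters -> 17 double-space gaps

buggy = re.sub(r"\s+", " ", text, re.S)        # re.S (== 16) lands in the count slot
fixed = re.sub(r"\s+", " ", text, flags=re.S)  # passed as an actual flag

print(buggy == fixed)  # False: the buggy call stops after 16 substitutions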

kernel_tuner/strategies/common.py (2 additions, 1 deletion)

@@ -100,6 +100,7 @@ def __init__(
             tuning_options["max_fevals"] if "max_fevals" in tuning_options else np.inf, searchspace.size
         )
         self.results = []
+        self.budget_spent_fraction = 0.0

         # if enabled, encode non-numeric parameter values as a numeric value
         if self.encode_non_numeric:
@@ -127,7 +128,7 @@ def __call__(self, x, check_restrictions=True):
         logging.debug("x: %s", str(x))

         # check if max_fevals is reached or time limit is exceeded
-        util.check_stop_criterion(self.tuning_options)
+        self.budget_spent_fraction = util.check_stop_criterion(self.tuning_options)

         # snap values in x to nearest actual value for each parameter, unscale x if needed
         if self.snap:
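With this change, every call through CostFunc first refreshes budget_spent_fraction, so a custom optimizer can poll the attribute between evaluations instead of keeping its own counter. A hedged sketch of that contract using a hypothetical stand-in class (FakeCostFunc is not part of Kernel Tuner; it only mirrors the bookkeeping):

class StopCriterionReached(Exception):
    """Mimics kernel_tuner.util.StopCriterionReached."""

class FakeCostFunc:
    """Hypothetical stand-in mirroring CostFunc's budget bookkeeping."""

    def __init__(self, max_fevals):
        self.max_fevals = max_fevals
        self.fevals = 0
        self.budget_spent_fraction = 0.0

    def __call__(self, x):
        # As in CostFunc.__call__: refresh the fraction, raise when exhausted.
        self.budget_spent_fraction = self.fevals / self.max_fevals
        if self.fevals >= self.max_fevals:
            raise StopCriterionReached("max_fevals reached")
        self.fevals += 1
        return sum(v * v for v in x)  # dummy objective

func = FakeCostFunc(max_fevals=5)
while func.budget_spent_fraction < 1.0:
    try:
        func((1.0, 2.0))
    except StopCriterionReached:
        break
print(func.fevals, func.budget_spent_fraction)  # 5 evaluations, fraction 1.0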

kernel_tuner/strategies/wrapper.py (24 additions, 3 deletions)

@@ -1,16 +1,37 @@
 """Wrapper intended for user-defined custom optimization methods"""

+from abc import ABC, abstractmethod
+
 from kernel_tuner import util
 from kernel_tuner.searchspace import Searchspace
 from kernel_tuner.strategies.common import CostFunc


+class OptAlg(ABC):
+    """Base class for user-defined optimization algorithms."""
+
+    def __init__(self):
+        self.costfunc_kwargs = {"scaling": True, "snap": True}
+
+    @abstractmethod
+    def __call__(self, func: CostFunc, searchspace: Searchspace) -> tuple[tuple, float]:
+        """Optimize the black box function `func` within the given `searchspace`.
+
+        Args:
+            func (CostFunc): Cost function to be optimized. Has a property `budget_spent_fraction` that indicates how much of the budget has been spent.
+            searchspace (Searchspace): Search space containing the parameters to be optimized.
+
+        Returns:
+            tuple[tuple, float]: tuple of the best parameters and the corresponding cost value.
+        """
+        pass
+
+
 class OptAlgWrapper:
     """Wrapper class for user-defined optimization algorithms"""

-    def __init__(self, optimizer):
-        self.optimizer = optimizer
-
+    def __init__(self, optimizer: OptAlg):
+        self.optimizer: OptAlg = optimizer

     def tune(self, searchspace: Searchspace, runner, tuning_options):
         cost_func = CostFunc(searchspace, tuning_options, runner, **self.optimizer.costfunc_kwargs)
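To illustrate the new interface, here is a minimal concrete OptAlg subclass, a plain random search. This is a sketch, not part of the commit, and it assumes that with scaling enabled (set in OptAlg.__init__), points drawn in [0, 1) per dimension are mapped onto valid configurations by CostFunc's snap=True:

import numpy as np

from kernel_tuner.strategies.wrapper import OptAlg
from kernel_tuner.util import StopCriterionReached


class RandomSearch(OptAlg):
    """Toy optimizer: sample uniformly until the budget runs out."""

    def __call__(self, func, searchspace):
        best_x, best_val = None, np.inf
        try:
            while func.budget_spent_fraction < 1.0:
                # Draw a point in the scaled space; snap=True maps it to the
                # nearest valid configuration before evaluation.
                x = np.random.uniform(0.0, 1.0, searchspace.num_params)
                val = func(x)
                if val < best_val:
                    best_x, best_val = x.copy(), val
        except StopCriterionReached:
            pass  # budget exhausted mid-evaluation
        return best_x, best_val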

kernel_tuner/util.py (22 additions, 6 deletions)

@@ -190,12 +190,28 @@ def check_argument_list(kernel_name, kernel_string, args):
         warnings.warn(errors[0], UserWarning)


-def check_stop_criterion(to):
-    """Checks if max_fevals is reached or time limit is exceeded."""
-    if "max_fevals" in to and len(to.unique_results) >= to.max_fevals:
-        raise StopCriterionReached(f"max_fevals reached ({len(to.unique_results)} >= {to.max_fevals})")
-    if "time_limit" in to and (((time.perf_counter() - to.start_time) + (to.simulated_time * 1e-3) + to.startup_time) > to.time_limit):
-        raise StopCriterionReached("time limit exceeded")
+def check_stop_criterion(to: dict) -> float:
+    """Check if the stop criterion is reached.
+
+    Args:
+        to (dict): tuning options.
+
+    Raises:
+        StopCriterionReached: if the max_fevals is reached or time limit is exceeded.
+
+    Returns:
+        float: fraction of budget spent.
+    """
+    if "max_fevals" in to:
+        if len(to.unique_results) >= to.max_fevals:
+            raise StopCriterionReached(f"max_fevals ({to.max_fevals}) reached")
+        return len(to.unique_results) / to.max_fevals
+    if "time_limit" in to:
+        time_spent = (time.perf_counter() - to.start_time) + (to.simulated_time * 1e-3) + to.startup_time
+        if time_spent > to.time_limit:
+            raise StopCriterionReached("time limit exceeded")
+        return time_spent / to.time_limit


 def check_tune_params_list(tune_params, observers, simulation_mode=False):
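The new contract: raise StopCriterionReached once the budget is exhausted, otherwise return the fraction spent, with max_fevals taking precedence over time_limit when both are set. A worked example of the max_fevals branch (hedged: the real `to` is Kernel Tuner's attribute-accessible options dict; a SimpleNamespace with hasattr stands in for the `in` check, and simulated_time/startup_time are omitted for brevity):

import time
from types import SimpleNamespace

class StopCriterionReached(Exception):
    pass

def check_stop_criterion(to):
    # Simplified mirror of the updated util.check_stop_criterion.
    if hasattr(to, "max_fevals"):
        if len(to.unique_results) >= to.max_fevals:
            raise StopCriterionReached(f"max_fevals ({to.max_fevals}) reached")
        return len(to.unique_results) / to.max_fevals
    if hasattr(to, "time_limit"):
        time_spent = time.perf_counter() - to.start_time
        if time_spent > to.time_limit:
            raise StopCriterionReached("time limit exceeded")
        return time_spent / to.time_limit

to = SimpleNamespace(max_fevals=20, unique_results={"cfg1": 1.0, "cfg2": 2.0})
print(check_stop_criterion(to))  # 0.1: 2 of 20 unique evaluations used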

test/test_custom_optimizer.py (17 additions, 32 deletions)

@@ -3,7 +3,9 @@

 import numpy as np

-class HybridDELocalRefinement:
+from kernel_tuner.strategies.wrapper import OptAlg
+
+class HybridDELocalRefinement(OptAlg):
     """
     A two-phase differential evolution with local refinement, intended for BBOB-type
     black box optimization problems in [-5,5]^dim.
@@ -12,21 +14,14 @@ class HybridDELocalRefinement:
     exploration and local exploitation under a strict function evaluation budget.
     """

-    def __init__(self, budget, dim):
-        """
-        Initialize the optimizer with:
-        - budget: total number of function evaluations allowed.
-        - dim: dimensionality of the search space.
-        """
-        self.budget = budget
-        self.dim = dim
+    def __init__(self):
+        super().__init__()
         # You can adjust these hyperparameters based on experimentation/tuning:
-        self.population_size = min(50, 10 * dim)  # Caps for extremely large dim
         self.F = 0.8  # Differential weight
         self.CR = 0.9  # Crossover probability
         self.local_search_freq = 10  # Local refinement frequency in generations

-    def __call__(self, func):
+    def __call__(self, func, searchspace):
         """
         Optimize the black box function `func` in [-5,5]^dim, using
         at most self.budget function evaluations.
@@ -35,9 +30,8 @@ def __call__(self, func):
             best_params: np.ndarray representing the best parameters found
             best_value: float representing the best objective value found
         """
-        # Check if we have a non-positive budget
-        if self.budget <= 0:
-            raise ValueError("Budget must be a positive integer.")
+        self.dim = searchspace.num_params
+        self.population_size = round(min(min(50, 10 * self.dim), np.ceil(searchspace.size / 3)))  # Caps for extremely large dim

         # 1. Initialize population
         lower_bound, upper_bound = -5.0, 5.0
@@ -49,8 +43,6 @@
         for i in range(self.population_size):
             fitness[i] = func(pop[i])
             evaluations += 1
-            if evaluations >= self.budget:
-                break

         # Track best solution
         best_idx = np.argmin(fitness)
@@ -59,7 +51,7 @@

         # 2. Main evolutionary loop
         gen = 0
-        while evaluations < self.budget:
+        while func.budget_spent_fraction < 1.0 and evaluations < searchspace.size:
             gen += 1
             for i in range(self.population_size):
                 # DE mutation: pick three distinct indices
@@ -78,7 +70,7 @@
                 # Evaluate trial
                 trial_fitness = func(trial)
                 evaluations += 1
-                if evaluations >= self.budget:
+                if func.budget_spent_fraction > 1.0:
                     # If out of budget, wrap up
                     if trial_fitness < fitness[i]:
                         pop[i] = trial
@@ -99,14 +91,11 @@
                         best_params = trial.copy()

             # Periodically refine best solution with a small local neighborhood search
-            if gen % self.local_search_freq == 0 and evaluations < self.budget:
+            if gen % self.local_search_freq == 0 and func.budget_spent_fraction < 1.0:
                 best_params, best_value, evaluations = self._local_refinement(
                     func, best_params, best_value, evaluations, lower_bound, upper_bound
                 )

-            if evaluations >= self.budget:
-                break
-
         return best_params, best_value

     def _local_refinement(self, func, best_params, best_value, evaluations, lb, ub):
@@ -115,11 +104,10 @@ def _local_refinement(self, func, best_params, best_value, evaluations, lb, ub):
         Uses a quick 'perturb-and-accept' approach in a shrinking neighborhood.
         """
         # Neighborhood size shrinks as the budget is consumed
-        frac_budget_used = evaluations / self.budget
-        step_size = 0.2 * (1.0 - frac_budget_used)
+        step_size = 0.2 * (1.0 - func.budget_spent_fraction)

         for _ in range(5):  # 5 refinements each time
-            if evaluations >= self.budget:
+            if func.budget_spent_fraction >= 1.0:
                 break
             candidate = best_params + np.random.uniform(-step_size, step_size, self.dim)
             candidate = np.clip(candidate, lb, ub)
@@ -138,26 +126,23 @@
 import os
 from kernel_tuner import tune_kernel
 from kernel_tuner.strategies.wrapper import OptAlgWrapper
-cache_filename = os.path.dirname(
-
-    os.path.realpath(__file__)) + "/test_cache_file.json"

 from .test_runners import env

+cache_filename = os.path.dirname(os.path.realpath(__file__)) + "/test_cache_file.json"

 def test_OptAlgWrapper(env):
     kernel_name, kernel_string, size, args, tune_params = env

     # Instantiate LLaMAE optimization algorithm
-    budget = int(15)
-    dim = len(tune_params)
-    optimizer = HybridDELocalRefinement(budget, dim)
+    optimizer = HybridDELocalRefinement()

     # Wrap the algorithm class in the OptAlgWrapper
     # for use in Kernel Tuner
     strategy = OptAlgWrapper(optimizer)
+    strategy_options = { 'max_fevals': 15 }

     # Call the tuner
     tune_kernel(kernel_name, kernel_string, size, args, tune_params,
-                strategy=strategy, cache=cache_filename,
+                strategy=strategy, strategy_options=strategy_options, cache=cache_filename,
                 simulation_mode=True, verbose=True)
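A note on the design choice visible in this test: the evaluation budget now comes in through strategy_options ({'max_fevals': 15}) rather than the optimizer's constructor, and the optimizer only reads func.budget_spent_fraction. The same optimizer instance therefore works unchanged whether the budget is expressed as max_fevals or as time_limit.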
