Tunable constrained optimization algorithms #324


Draft · wants to merge 286 commits into base: master

Commits (286)
eba03f8
Refactored BO BOTorch into class structure
fjwillemsen Nov 5, 2024
c6b243a
Switched to newer fit function, more efficient model initialization b…
fjwillemsen Nov 6, 2024
1581840
Added option to return invalid configurations in CostFunc
fjwillemsen Nov 6, 2024
620ee60
Added the handling of invalid configurations, training data is direct…
fjwillemsen Nov 6, 2024
009cf01
Setup structure for Tensorspace in Searchspace
fjwillemsen Nov 6, 2024
33983f7
Implemented mappings and conversions to and from tensor to parameter …
fjwillemsen Nov 7, 2024
f3fc81b
Improved efficiency of acquisition function by removing evaluated con…
fjwillemsen Nov 7, 2024
a5a0471
Removed Ax, added BOTorch as dependency
fjwillemsen Nov 7, 2024
9429539
Convenience script for benchmarking BO
fjwillemsen Nov 7, 2024
176b8f5
Added objective, tuning direction and hyperparameter tuning language …
fjwillemsen Nov 7, 2024
196af62
Completed implementation of mixed-type handling and handling of inval…
fjwillemsen Nov 8, 2024
55a5c1a
Added docstrings, improved formatting
fjwillemsen Nov 8, 2024
d64f783
Extended strategies test to test for ability to handle non-numeric an…
fjwillemsen Nov 8, 2024
e95ab30
Mixed-type parameters are not converted to numeric constraints
fjwillemsen Nov 8, 2024
10a6a5c
CostFunc can now encode and decode non-numeric configurations for str…
fjwillemsen Nov 8, 2024
6ae3ba6
Fixed logging statements, improved formatting
fjwillemsen Nov 8, 2024
4873a20
Improved the performance of get_bounds
fjwillemsen Nov 8, 2024
bae7e96
Applied non-numeric encoding in differential evolution to handle non-…
fjwillemsen Nov 8, 2024
7eb7ef7
Implemented automatic conversion to multiple types for encoded tensor…
fjwillemsen Nov 8, 2024
91d3ce4
Added tests for Searchspace tensor encoding and conversion
fjwillemsen Nov 8, 2024
80d514e
Separated strategies and runners test cache file
fjwillemsen Nov 8, 2024
a489252
Implemented handling of categorical parameters
fjwillemsen Nov 8, 2024
68aee14
Implemented variational GP and likelihood
fjwillemsen Nov 8, 2024
b9c012d
Using LogExpectedImprovement to avoid stability issues
fjwillemsen Nov 8, 2024
41ce663
Implemented tensor space bounds in searchspace
fjwillemsen Nov 9, 2024
07ef1d4
Implemented normalization for input features
fjwillemsen Nov 9, 2024
721d072
Tensorspace is reduced by removing inconsequential parameters
fjwillemsen Nov 9, 2024
2434b3b
Extended strategies tests to include single parameter value
fjwillemsen Nov 9, 2024
1679751
Fixed an indexing error for tensorspace bounds
fjwillemsen Nov 9, 2024
2b816a6
Extended searchspace tests to include single parameter value
fjwillemsen Nov 9, 2024
c417585
Implemented additional acquisition functions, reduced number of reini…
fjwillemsen Nov 10, 2024
3d53b29
Implemented division of tensorspace into chunks for faster optimization
fjwillemsen Nov 10, 2024
3ed43a6
Switch to fit_gpytorch_mll_torch for faster fitting, use approximate …
fjwillemsen Nov 12, 2024
559813f
Implemented running BO on GPU / Apple Silicon, settable precision
fjwillemsen Nov 12, 2024
c391428
Removed Apple Silicon MPS support as cholesky operation is not yet im…
fjwillemsen Nov 12, 2024
07925c5
Implemented discrete local search for cases where the tensorspace isn…
fjwillemsen Nov 12, 2024
4113513
Implemented standardization of output
fjwillemsen Nov 12, 2024
ed12b5a
Implemented unified optimization direction
fjwillemsen Nov 12, 2024
d62c941
Updated outcome standardization
fjwillemsen Nov 12, 2024
1c015cb
Using extra information from variance in BO for better fits
fjwillemsen Nov 13, 2024
cad10f8
Implemented gradual cooldown on multi-feval depending on number of fe…
fjwillemsen Nov 13, 2024
1ed0352
Adjusted the calculation of number of optimization spaces to be more …
fjwillemsen Nov 19, 2024
38f084c
Two different kernels as test files for BO
fjwillemsen Nov 19, 2024
c447dc2
Setup structure for BOTorch transfer learning strategy as separate st…
fjwillemsen Nov 21, 2024
7c2fd51
Implemented Rank-Weighted GP Ensemble for transfer learning
fjwillemsen Nov 21, 2024
fec0e65
Avoided import of whole util submodule
fjwillemsen Nov 21, 2024
091ef47
Simplified BO transfer run loop
fjwillemsen Nov 21, 2024
ee11757
Implemented transfer learning caches in interface to be read and pass…
fjwillemsen Nov 21, 2024
1162ece
Added BO transfer learning strategy
fjwillemsen Nov 22, 2024
62fa135
Implemented optionally constructing a searchspace from a cache dictio…
fjwillemsen Nov 22, 2024
57a262f
Implemented construction of Searchspaces from caches
fjwillemsen Nov 22, 2024
964a6ee
Transfer learning inputs and outcomes are represented in Tensors
fjwillemsen Nov 22, 2024
24c6767
More general approach to model and likelihood initialization to make …
fjwillemsen Nov 22, 2024
dc4b4c7
Fitting a model for each base transfer learning task
fjwillemsen Nov 22, 2024
e21a605
Account for invalid configurations in base task caches
fjwillemsen Nov 22, 2024
e3cfe91
Implement main RGPE BO loop
fjwillemsen Nov 22, 2024
2334214
Improved the efficiency of taking initial sample
fjwillemsen Nov 22, 2024
c78a18c
Use of state dictionary is made optional
fjwillemsen Nov 22, 2024
8416098
Renamed RGPE strategy
fjwillemsen Nov 22, 2024
dc000b7
Implemented new transfer learning strategy with multiple independent GPs
fjwillemsen Nov 22, 2024
aa30ec2
Removed redundant min/max results adjustment
fjwillemsen Nov 23, 2024
fd6f95e
Result registration must be optimization direction dependent
fjwillemsen Nov 26, 2024
6963feb
Transfer learning by direct transfer of best configurations
fjwillemsen Nov 26, 2024
a08953e
BO update
fjwillemsen Mar 5, 2025
ecd7802
Improved conversion of tunable parameter
fjwillemsen Mar 5, 2025
b7cda36
Extended and improved conversion of T1 arguments, improved error repo…
fjwillemsen Mar 6, 2025
8836ce2
Improved selection of transfer learning caches
fjwillemsen Mar 7, 2025
539aed3
Fixed an error with quotes in an f-string
fjwillemsen Mar 7, 2025
435b56b
Fixed torch import error due to Tensor type hint
fjwillemsen Mar 7, 2025
3c48b49
Fixed torch import error due to Tensor type hint
fjwillemsen Mar 7, 2025
388f325
Fixed torch import error due to Tensor type hint
fjwillemsen Mar 7, 2025
373782f
Fixed torch import error due to Tensor type hint
fjwillemsen Mar 7, 2025
874c390
Merge with searchspace_experiments
fjwillemsen Mar 7, 2025
db3abb3
Merge with searchspace_experiments
fjwillemsen Mar 7, 2025
c692ba6
Loosened required positional arguments
fjwillemsen Mar 7, 2025
fe113e6
Changed benchmarks location for hypertuner
fjwillemsen Mar 7, 2025
5e65abd
Used hip-python-fork package as hip-python is not available
fjwillemsen Mar 7, 2025
0ba00a0
Removed transfer learning references
fjwillemsen Mar 7, 2025
1517565
add support for user-defined optimization algorithms
benvanwerkhoven Mar 7, 2025
6633bed
Updated pyproject
fjwillemsen Mar 7, 2025
23f557f
Merge branch 'searchspace_experiments' into hyperparametertuning
fjwillemsen Mar 7, 2025
c39ac5a
Adjusted hyper.py for paper
fjwillemsen Mar 7, 2025
cc19515
Extended hypertuner with additional kernels, adjusted for benchmark_hub
fjwillemsen Mar 7, 2025
1433930
Merge branch 'hyperparametertuning' of https://github.com/benvanwerkh…
fjwillemsen Mar 7, 2025
638d216
Implemented passing strategy to hyperparametertune by CLI argument
fjwillemsen Mar 8, 2025
d36adb5
Extended hyperparameter tuning with 4 more strategies
fjwillemsen Mar 8, 2025
4e46459
Generate a unique filename for generated experiment files to avoid co…
fjwillemsen Mar 8, 2025
5760ea0
Merge branch 'master' of github.com:KernelTuner/kernel_tuner into cus…
benvanwerkhoven Mar 10, 2025
1c57201
add test for the optimization algorithm wrapper
benvanwerkhoven Mar 10, 2025
d28fdbe
Adjusted the test / train sets and number of repeats
fjwillemsen Mar 10, 2025
49fa92f
Added simulated_annealing to hyperparameter tuning, adjusted greedy_i…
fjwillemsen Mar 10, 2025
1056269
Updated hyperparameters
fjwillemsen Mar 13, 2025
7ce2234
Updated search spaces used in hyperparameter tuning and number of rep…
fjwillemsen Mar 13, 2025
1ed1893
Added bayes_opt to hyperparamtuning
fjwillemsen Mar 15, 2025
1e2532f
Fixed link with hyperparameter tuning attributes
fjwillemsen Mar 17, 2025
7b7bd8b
Merge branch 'hyperparametertuning' of https://github.com/benvanwerkh…
fjwillemsen Mar 17, 2025
afbf83e
Added support for evaluating T1 strings as a type
fjwillemsen Mar 17, 2025
84a2b1f
Added automatic scaling of random sample size if necessary
fjwillemsen Mar 17, 2025
9e80479
Formatting
fjwillemsen Mar 17, 2025
ce552d0
Minor update to hyperparameter tuning
fjwillemsen Mar 18, 2025
5da8845
Merge branch 'hyperparametertuning' of https://github.com/benvanwerkh…
fjwillemsen Mar 18, 2025
2714c28
Set new default hyperparameters for PSO, dual annealing and simulated…
fjwillemsen Mar 18, 2025
25d5202
Set new default hyperparameters for Genetic Algorithm and Differentia…
fjwillemsen Mar 18, 2025
651c42c
Avoid requesting more random samples than the searchspace size
fjwillemsen Mar 20, 2025
b953a69
Clearer message when exceeding the stop criterion
fjwillemsen Mar 20, 2025
a401008
Add soft maximum function evaluations limit to dual annealing
fjwillemsen Mar 20, 2025
425b4f4
Improved rounding of encoded parameter values
fjwillemsen Mar 20, 2025
0b7ec15
Merge of doc requirements files
fjwillemsen Mar 20, 2025
3bba923
Updated pyproject and requirements files
fjwillemsen Mar 20, 2025
64dfd95
Improved assertion error message
fjwillemsen Mar 20, 2025
5a83d36
Added logging in case default block size restriction is added
fjwillemsen Mar 20, 2025
5e3512b
Adjusted path to benchmarking kernels
fjwillemsen Mar 20, 2025
bff6d7b
Automatically adjust genetic algorithm popsize for smaller search spaces
fjwillemsen Mar 20, 2025
8ddce18
Updated poetry configuration fields to project configuration fields, …
fjwillemsen Mar 20, 2025
19470e4
Removed not yet fully implemented bayesian optimization references, m…
fjwillemsen Mar 20, 2025
d2bb76a
Avoid import of whole util module
fjwillemsen Mar 20, 2025
58f147f
Avoid import of whole util module
fjwillemsen Mar 20, 2025
a48394a
Avoid import of whole util module
fjwillemsen Mar 20, 2025
5dd3e4c
Updated dependencies, required python version and bumped version
fjwillemsen Mar 25, 2025
02833f3
Updated dependencies, required python version and bumped version
fjwillemsen Mar 25, 2025
b820419
Updated documentation dependencies
fjwillemsen Mar 25, 2025
11b378f
Added python version classifiers
fjwillemsen Mar 26, 2025
6550916
Improved code quality based on sonarcloud issues
fjwillemsen Mar 26, 2025
6770d3c
Removed PythonFunctions approach to hyperparameter tuning that is no …
fjwillemsen Mar 26, 2025
3dbe379
Removed bayes_opt_old as a strategy
fjwillemsen Mar 26, 2025
dcd102b
Report last HIP error on error
fjwillemsen Mar 26, 2025
1f935a1
Merge branch 'hyperparametertuning' of https://github.com/KernelTuner…
fjwillemsen Mar 26, 2025
290a860
Added docstring to ScoreObserver class
fjwillemsen Mar 26, 2025
496af94
Reduced cognitive complexity
fjwillemsen Mar 26, 2025
063fe97
Merge branch 'hyperparametertuning' of https://github.com/KernelTuner…
fjwillemsen Mar 26, 2025
c1c3a71
Improved development environment creation specification
fjwillemsen Mar 26, 2025
26914be
Merge with recent additions to searchspace_experiments
fjwillemsen Apr 3, 2025
54010b4
introduced repair technique in genetic algorithm
benvanwerkhoven Apr 30, 2025
71e3de8
added non-constraint-aware initialization and mutation for comparison
benvanwerkhoven Apr 30, 2025
67a5070
fix test_mutate
benvanwerkhoven May 1, 2025
939ea19
constraint-aware variants for pso, firefly, and sa
benvanwerkhoven May 12, 2025
b358265
remove unused variable
benvanwerkhoven May 12, 2025
2d24ae9
Added objective performance keys
fjwillemsen May 12, 2025
77676c8
Support for time-based cutoff with T1 format
fjwillemsen May 13, 2025
2e718f4
Merge with constrained_optimization
fjwillemsen May 13, 2025
9196266
Improvements to constraint-aware strategies
fjwillemsen May 13, 2025
83df948
Implemented passing settings to hyperparameter tuner, improved hyperp…
fjwillemsen May 13, 2025
f6811ab
Added firefly to hyperparameter tuning, various minor improvements
fjwillemsen May 14, 2025
e4af9f7
Added explicit restrictions definition to hyperparameter tuning
fjwillemsen May 15, 2025
5f3b6fc
Updated tune_kernel_T1 to be more broadly applicable
fjwillemsen May 16, 2025
7f3a4a3
Updated hyperparameters to newly tuned defaults
fjwillemsen May 24, 2025
80a5b62
Set default arguments if not provided
fjwillemsen May 28, 2025
79fe080
Merge with master branch
fjwillemsen May 28, 2025
e9797e2
Made Hypertuner backend compatible with changes to Backend ABC
fjwillemsen May 28, 2025
1a4c439
Adjusted GA popsize to only be adjusted when necessary
fjwillemsen May 28, 2025
8afc3d6
Merge remote-tracking branch 'origin/master' into custom_strategies
fjwillemsen May 30, 2025
04baf7d
Merge branch 'custom_strategies' into hyperparametertuning_custom_str…
fjwillemsen May 30, 2025
b8a9902
Implemented utility method to dynamically import class from file
fjwillemsen Jun 2, 2025
8f3744d
Implemented passing custom search method path and options using T1 format
fjwillemsen Jun 2, 2025
72b615b
Implemented passing arguments to costfunc from optimizer in OptAlgWra…
fjwillemsen Jun 4, 2025
83ab888
Merge remote-tracking branch 'origin/master' into custom_strategies
fjwillemsen Jun 4, 2025
5ce2495
Implemented abstract base class for custom optimization algorithm str…
fjwillemsen Jun 4, 2025
22a6e0e
returns the fraction of the budget that has been spent
fjwillemsen Jun 4, 2025
a4a69ae
Adjusted CostFunc and tests to use budget_spent_fraction
fjwillemsen Jun 4, 2025
dbcb89d
Completed merge with updated custom_strategies
fjwillemsen Jun 4, 2025
e8ff6e7
Improved dynamic import of modules for custom strategies
fjwillemsen Jun 5, 2025
160d4b7
Expanded custom optimizer test to T1 input format
fjwillemsen Jun 5, 2025
d121da6
Merge remote-tracking branch 'origin/master' into hyperparametertunin…
fjwillemsen Jun 12, 2025
f52d573
Improve GA constraint-awareness differentiation
fjwillemsen Jun 12, 2025
fbb1c19
Implemented simulation of configurations outside the restrictions for…
fjwillemsen Jun 12, 2025
7d93855
Ensure the time limit is still checked if we also have a fevals budget
fjwillemsen Jun 16, 2025
3c293a1
If the optimizer has a constraint_aware attribute, set it in the stra…
fjwillemsen Jun 16, 2025
268bf67
Initial version of using external package strategies
fjwillemsen Jun 17, 2025
20ea709
Implemented the pyatf_strategies, which enables using the pyATF strat…
fjwillemsen Jun 19, 2025
c2c9458
Improvements to the stop criterion and cost function return values
fjwillemsen Jun 20, 2025
62d98df
Various improvements to the handling of budget and return values in p…
fjwillemsen Jun 20, 2025
30de03e
Added deepcopy of unmodified restrictions for reconstructing Searchsp…
fjwillemsen Jun 20, 2025
49e786d
Search space construction can be deferred to a later time, split pyAT…
fjwillemsen Jun 20, 2025
36208d1
Implemented pyATF search space lookup of configs
fjwillemsen Jun 20, 2025
934be28
Implemented caching of PyATF searchspace object, both storage and ret…
fjwillemsen Jun 21, 2025
1d916bd
Implemented passing whether or not to use the searchspace cache as a …
fjwillemsen Jun 23, 2025
ebe872d
merged
benvanwerkhoven Jun 25, 2025
08a1029
changed SA to act similar with or without constraint awareness
benvanwerkhoven Jun 25, 2025
3d12130
Completed merge with recent changes to searchspace
fjwillemsen Jun 25, 2025
4394d13
Amended strategy tests to account for pyATF limitations
fjwillemsen Jun 25, 2025
e3980e2
Implemented context for pyATF tests to be skipped if not installed
fjwillemsen Jun 25, 2025
ecf7218
Lowered the cutoff percentile to have a faster test
fjwillemsen Jun 25, 2025
5c94c78
Basic implementation to use Searchspace in Bayesian Optimization
fjwillemsen Jun 25, 2025
6cd8029
Changed pyATF strategies to use invalid configs
fjwillemsen Jun 25, 2025
9821156
Merge remote-tracking branch 'origin/searchspace_experiments' into co…
fjwillemsen Jun 25, 2025
ec51b0a
Changed to cached neighbor lookup now multiple methods are cached sim…
fjwillemsen Jun 25, 2025
f2902b5
Implemented a fix for the non-constrained version of GA
fjwillemsen Jun 26, 2025
27226a9
Improvements to Simulated Annealing regarding invalid and error confi…
fjwillemsen Jun 26, 2025
5ec2d60
Merge remote-tracking branch 'origin/hyperparametertuning_custom_stra…
fjwillemsen Jun 26, 2025
8954a46
Implemented building multiple neighbor index caches, optional paramet…
fjwillemsen Jun 27, 2025
e2924e0
Improvements for Numpy 2.0 compatibility
fjwillemsen Jul 3, 2025
0d9b90b
Completed merge with master
fjwillemsen Jul 3, 2025
c39b87e
replace differential evolution strategy
benvanwerkhoven Jul 3, 2025
a81765a
formatted with black
benvanwerkhoven Jul 4, 2025
655fcc0
fix parsing diff_evo method argument
benvanwerkhoven Jul 4, 2025
2fdee80
add test for parse method
benvanwerkhoven Jul 4, 2025
539735c
add support for x0 starting point
benvanwerkhoven Jul 4, 2025
baf628d
add constraint-awareness
benvanwerkhoven Jul 4, 2025
962e5f9
merged master into custom_diff_evo
benvanwerkhoven Jul 4, 2025
5798286
Merge with custom_diff_evo
fjwillemsen Jul 4, 2025
7d3861d
Improved bounds check to deal with non-numericals, other improvements
fjwillemsen Jul 4, 2025
958f8bd
Replaced undefined references to self
fjwillemsen Jul 4, 2025
ea7a69d
LHS sampling, enforce trial population diversity, avoid getting stuck
benvanwerkhoven Jul 4, 2025
fd96b81
Moved cache_filename reference to top-level for import elsewhere
fjwillemsen Jul 4, 2025
26c8127
code quality improvements
benvanwerkhoven Jul 4, 2025
05caa5f
Merge with latest changed from diff_evo
fjwillemsen Jul 7, 2025
7b5cd29
Improved passing of restrictions
fjwillemsen Jul 10, 2025
e0aaf24
Improvements to how non-numeric configurations are handled
fjwillemsen Jul 10, 2025
2d41660
string values compatible
benvanwerkhoven Jul 10, 2025
7e4f38c
Improvements to how non-numeric configurations are handled
fjwillemsen Jul 10, 2025
8da11a7
Searchspace object improvements in checking for tensorspace and error…
fjwillemsen Jul 10, 2025
28a149e
Improved tests using restrictions and extended parameters where neces…
fjwillemsen Jul 10, 2025
cfb67ae
Merge remote-tracking branch 'origin/custom_diff_evo' into constraine…
fjwillemsen Jul 10, 2025
b170eef
string values compatible, for real this time
benvanwerkhoven Jul 10, 2025
7be2d86
Merge remote-tracking branch 'origin/custom_diff_evo' into constraine…
fjwillemsen Jul 10, 2025
19d5127
further reducing use of numpy arrays for representing configs
benvanwerkhoven Jul 11, 2025
ac87d79
Merge remote-tracking branch 'origin/custom_diff_evo' into constraine…
fjwillemsen Jul 11, 2025
214865f
Removed encoding of non-numeric parameters in favour of index-based
fjwillemsen Jul 11, 2025
c6917bd
Implemented get_random_neighbor and helper functions in Searchspace, …
fjwillemsen Jul 11, 2025
d1d653e
Implemented a test for the new get_random_neighbor method
fjwillemsen Jul 11, 2025
7fcfacb
Various improvements to random neighbor performance
fjwillemsen Jul 11, 2025
6e61fd6
Implemented getting a random neighbor over the full list where applic…
fjwillemsen Jul 11, 2025
83c4043
Adjusted costfunc_kwargs defaults
fjwillemsen Jul 11, 2025
623eca4
Implemented partial neighbor caching, adding more information to the …
fjwillemsen Jul 11, 2025
f671c0e
Fix to add neighbor method as argument
fjwillemsen Jul 11, 2025
b45f880
Extended differential evolution hyperparameters to fit new algorithm …
fjwillemsen Jul 17, 2025
ff64ae3
Merged the changes to valid param config checking and checking matchi…
fjwillemsen Jul 17, 2025
f31f67c
Implemented retrieving the true tunable parameters (those that are no…
fjwillemsen Jul 17, 2025
bb152f2
Implemented number of dimensions-dependent population size in diff_evo
fjwillemsen Jul 17, 2025
b7e779e
If there is a problem loading the cache, the path of the cachefile is…
fjwillemsen Jul 17, 2025
a89af66
Implemented passing keyword arguments like meta strategy and time lim…
fjwillemsen Jul 17, 2025
8bd5991
Improved matching problem size check
fjwillemsen Jul 18, 2025
89891a8
Implemented a discrete version of Latin Hypercube sampling that is (s…
fjwillemsen Jul 19, 2025
b91a650
Implemented efficiently representing the full search space as the ind…
fjwillemsen Jul 19, 2025
fa293cf
Implemented getting the numpy array as numerical values, where non-nu…
fjwillemsen Jul 21, 2025
0812b2b
Implemented retrieving the true parameter configuration from a mixed …
fjwillemsen Jul 21, 2025
21d63bc
Implemented parameter index-, numeric- and mixed- 2D views of the sea…
fjwillemsen Jul 21, 2025
66f3f69
Created additional tests for new and improved numpy views of the sear…
fjwillemsen Jul 21, 2025
3b1aeed
Implemented a fast distributed random sample technique suitable for t…
fjwillemsen Jul 22, 2025
ce55392
Various improvements to and other searchspace functions
fjwillemsen Jul 23, 2025
5ab83f8
Performance improvement to __get_random_neighbor_adjacent
fjwillemsen Jul 23, 2025
3a38865
Implemented test for get_distributed_random_sample_indices
fjwillemsen Jul 23, 2025
c43634b
Implemented a new neighbor method for getting the neighbors closest …
fjwillemsen Jul 23, 2025
0fe2ea5
Wrote test for closest param indices neighbor method
fjwillemsen Jul 24, 2025
d0a5ba6
Implemented an LHS sampler, automatic adjustment to smaller integer s…
fjwillemsen Jul 24, 2025
ce29740
Implemented tests for the LHS sampler and the true index bounds
fjwillemsen Jul 24, 2025
16c8245
Using LHS and closest-param-indices in new diff_evo for much better p…
fjwillemsen Jul 24, 2025
c044ef8
Disabled full validate on load for hyperparameter tuning
fjwillemsen Jul 25, 2025
cde7823
Extended hyperparameters for optimization algorithms in paper
fjwillemsen Jul 25, 2025
f56c1fd
Updated required python-constraint version
fjwillemsen Jul 25, 2025
Files changed
6 changes: 4 additions & 2 deletions .gitignore
@@ -2,7 +2,7 @@
poetry.lock
noxenv.txt
noxsettings.toml
-hyperparamtuning/
+hyperparamtuning*/*
*.prof

### Python ###
@@ -20,13 +20,15 @@ push_to_pypi.sh
*.json
!kernel_tuner/schema/T1/1.0.0/input-schema.json
!test/test_T1_input.json
!test_cache_file*.json
*.csv
.cache
*.ipynb_checkpoints
examples/cuda/output
deploy_key
*.mod
temp_*.*
.DS_Store
.python-version
.nox

@@ -41,4 +43,4 @@ temp_*.*
.LSOverride

.vscode
-.idea
\ No newline at end of file
+.idea
518 changes: 272 additions & 246 deletions doc/requirements_test.txt

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion doc/source/dev-environment.rst
@@ -78,7 +78,7 @@ Steps without :bash:`sudo` access (e.g. on a cluster):
* Verify that your development environment has no missing installs or updates with :bash:`poetry install --sync --dry-run --with test`.
#. Check if the environment is setup correctly by running :bash:`pytest`. All tests should pass, except if you're not on a GPU node, or one or more extras has been left out in the previous step, then these tests will skip gracefully.
#. Set Nox to use the correct backend and location:
-* Run :bash:`conda -- create-settings-file` to automatically create a settings file.
+* Run :bash:`nox -- create-settings-file` to automatically create a settings file.
* In this settings file :bash:`noxsettings.toml`, change the :bash:`venvbackend`:
* If you used Mamba in step 2, to :bash:`mamba`.
* If you used Miniconda or Anaconda in step 2, to :bash:`conda`.
Empty file modified (100755 → 100644): examples/c/vector_add.py
Empty file modified (100755 → 100644): examples/cuda-c++/vector_add.py
Empty file modified (100755 → 100644): examples/cuda-c++/vector_add_blocksize.py
Empty file modified (100755 → 100644): examples/cuda-c++/vector_add_cupy.py
Empty file modified (100755 → 100644): examples/cuda/convolution.py
Empty file modified (100755 → 100644): examples/cuda/convolution_correct.py
Empty file modified (100755 → 100644): examples/cuda/convolution_streams.py
Empty file modified (100755 → 100644): examples/cuda/expdist.py
Empty file modified (100755 → 100644): examples/cuda/matmul.py
Empty file modified (100755 → 100644): examples/cuda/pnpoly.py
Empty file modified (100755 → 100644): examples/cuda/python_kernel.py
Empty file modified (100755 → 100644): examples/cuda/reduction.py
Empty file modified (100755 → 100644): examples/cuda/sepconv.py
Empty file modified (100755 → 100644): examples/cuda/spmv.py
Empty file modified (100755 → 100644): examples/cuda/stencil.py
Empty file modified (100755 → 100644): examples/cuda/test_vector_add.py
Empty file modified (100755 → 100644): examples/cuda/test_vector_add_parameterized.py
Empty file modified (100755 → 100644): examples/cuda/vector_add.py
Empty file modified (100755 → 100644): examples/cuda/vector_add_codegen.py
Empty file modified (100755 → 100644): examples/cuda/vector_add_cupy.py
44 changes: 44 additions & 0 deletions examples/cuda/vector_add_custom_strategy.py
@@ -0,0 +1,44 @@
#!/usr/bin/env python
"""This is the minimal example from the README"""

import numpy
import kernel_tuner
from kernel_tuner import tune_kernel
from kernel_tuner.file_utils import store_output_file, store_metadata_file

def tune():

kernel_string = """
__global__ void vector_add(float *c, float *a, float *b, int n) {
int i = blockIdx.x * block_size_x + threadIdx.x;
if (i<n) {
c[i] = a[i] + b[i];
}
}
"""

size = 10000000

a = numpy.random.randn(size).astype(numpy.float32)
b = numpy.random.randn(size).astype(numpy.float32)
c = numpy.zeros_like(b)
n = numpy.int32(size)

args = [c, a, b, n]

tune_params = dict()
tune_params["block_size_x"] = [128+64*i for i in range(15)]

results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, strategy=kernel_tuner.strategies.minimize, verbose=True)

# Store the tuning results in an output file
store_output_file("vector_add.json", results, tune_params)

# Store the metadata of this run
store_metadata_file("vector_add-metadata.json")

return results


if __name__ == "__main__":
tune()
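
As an aside on the user-defined strategy support this example exercises: the strategy= argument receives one of Kernel Tuner's own strategy modules here, but the same mechanism accepts a user-supplied module. Below is a minimal sketch of such a module, assuming the convention followed by the built-in strategies (a tune(searchspace, runner, tuning_options) function) and the CostFunc helper from kernel_tuner.strategies.common; the module name and the sample size of 20 are illustrative, not part of this PR.

# minimal_random_strategy.py - hedged sketch of a user-defined strategy module
from kernel_tuner.strategies.common import CostFunc
from kernel_tuner.util import StopCriterionReached

def tune(searchspace, runner, tuning_options):
    """Evaluate a handful of random configurations (illustrative only)."""
    cost_func = CostFunc(searchspace, tuning_options, runner)
    try:
        # CostFunc records every evaluated configuration and raises
        # StopCriterionReached once the tuning budget is exhausted
        for config in searchspace.get_random_sample(20):
            cost_func(config)
    except StopCriterionReached as e:
        if tuning_options.verbose:
            print(e)
    return cost_func.results

Such a module could then be passed to tune_kernel via strategy= exactly as the built-in module is passed above.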
Empty file modified (100755 → 100644): examples/cuda/vector_add_jinja.py
Empty file modified (100755 → 100644): examples/cuda/vector_add_metric.py
Empty file modified (100755 → 100644): examples/cuda/vector_add_observers.py
Empty file modified (100755 → 100644): examples/cuda/zeromeanfilter.py
Empty file modified (100755 → 100644): examples/fortran/vector_add.py
Empty file modified (100755 → 100644): examples/opencl/convolution.py
Empty file modified (100755 → 100644): examples/opencl/convolution_correct.py
Empty file modified (100755 → 100644): examples/opencl/matmul.py
Empty file modified (100755 → 100644): examples/opencl/reduction.py
Empty file modified (100755 → 100644): examples/opencl/sepconv.py
Empty file modified (100755 → 100644): examples/opencl/stencil.py
Empty file modified (100755 → 100644): examples/opencl/vector_add.py
Empty file modified (100755 → 100644): examples/opencl/vector_add_codegen.py
Empty file modified (100755 → 100644): examples/opencl/vector_add_observers.py
1 change: 0 additions & 1 deletion kernel_tuner/backends/cupy.py
@@ -1,6 +1,5 @@
"""This module contains all Cupy specific kernel_tuner functions."""
from __future__ import print_function
-from warnings import warn

import numpy as np

109 changes: 82 additions & 27 deletions kernel_tuner/backends/hypertuner.py
@@ -17,6 +17,8 @@


class ScoreObserver(BenchmarkObserver):
"""BenchmarkObserver subclass for registering the hyperparameter tuning score."""

def __init__(self, dev):
self.dev = dev
self.scores = []
@@ -33,13 +35,70 @@ class HypertunerFunctions(Backend):
"""Class for executing hyperparameter tuning."""
units = {}

-def __init__(self, iterations):
+def __init__(self, iterations, compiler_options=None):
self.iterations = iterations
self.compiler_options = compiler_options
self.observers = [ScoreObserver(self)]
self.name = platform.processor()
self.max_threads = 1024
self.last_score = None

# set the defaults
self.gpus = ["A100", "A4000", "MI250X"]
folder = "../autotuning_methodology/benchmark_hub/kernels"
self.applications = [
{
"name": "dedispersion_milo",
"folder": folder,
"input_file": "dedispersion_milo.json",
"objective_performance_keys": ["time"]
},
{
"name": "hotspot_milo",
"folder": folder,
"input_file": "hotspot_milo.json",
"objective_performance_keys": ["GFLOP/s"]
},
{
"name": "convolution_milo",
"folder": folder,
"input_file": "convolution_milo.json",
"objective_performance_keys": ["time"]
},
{
"name": "gemm_milo",
"folder": folder,
"input_file": "gemm_milo.json",
"objective_performance_keys": ["time"]
}
]
# any additional settings
self.override = {
"experimental_groups_defaults": {
"repeats": 25,
"samples": self.iterations,
"minimum_fraction_of_budget_valid": 0.1,
"minimum_number_of_valid_search_iterations": 5,
},
"statistics_settings": {
"cutoff_percentile": 0.95,
"cutoff_percentile_start": 0.01,
"cutoff_type": "time",
"objective_time_keys": [
"all"
]
}
}

# override the defaults with compiler options if provided
if self.compiler_options is not None:
if "gpus" in self.compiler_options:
self.gpus = self.compiler_options["gpus"]
if "applications" in self.compiler_options:
self.applications = self.compiler_options["applications"]
if "override" in self.compiler_options:
self.override = self.compiler_options["override"]

# set the environment options
env = dict()
env["iterations"] = self.iterations
@@ -60,22 +119,6 @@ def compile(self, kernel_instance):
path = Path(__file__).parent.parent.parent / "hyperparamtuning"
path.mkdir(exist_ok=True)

-# TODO get applications & GPUs args from benchmark
-gpus = ["RTX_3090", "RTX_2080_Ti"]
-applications = None
-# applications = [
-# {
-# "name": "convolution",
-# "folder": "./cached_data_used/kernels",
-# "input_file": "convolution.json"
-# },
-# {
-# "name": "pnpoly",
-# "folder": "./cached_data_used/kernels",
-# "input_file": "pnpoly.json"
-# }
-# ]

# strategy settings
strategy: str = kernel_instance.arguments[0]
hyperparams = [{'name': k, 'value': v} for k, v in kernel_instance.params.items()]
@@ -88,16 +131,9 @@
'search_method_hyperparameters': hyperparams
}]

-# any additional settings
-override = {
-"experimental_groups_defaults": {
-"samples": self.iterations
-}
-}

name = kernel_instance.name if len(kernel_instance.name) > 0 else kernel_instance.kernel_source.kernel_name
-experiments_filepath = generate_experiment_file(name, path, searchspace_strategies, applications, gpus,
-override=override, overwrite_existing_file=True)
+experiments_filepath = generate_experiment_file(name, path, searchspace_strategies, self.applications, self.gpus,
+override=self.override, generate_unique_file=True, overwrite_existing_file=True)
return str(experiments_filepath)

def start_event(self):
@@ -114,12 +150,27 @@ def synchronize(self):
return super().synchronize()

def run_kernel(self, func, gpu_args=None, threads=None, grid=None, stream=None):
# from cProfile import Profile

# # generate the experiments file
# experiments_filepath = Path(func)

# # run the methodology to get a fitness score for this configuration
# with Profile() as pr:
# scores = get_strategy_scores(str(experiments_filepath), full_validate_on_load=False)
# pr.dump_stats('diff_evo_hypertune_hotspot.prof')
# self.last_score = scores[list(scores.keys())[0]]['score']
# raise ValueError(scores)

# generate the experiments file
experiments_filepath = Path(func)

# run the methodology to get a fitness score for this configuration
-scores = get_strategy_scores(str(experiments_filepath))
+scores = get_strategy_scores(str(experiments_filepath), full_validate_on_load=False)
self.last_score = scores[list(scores.keys())[0]]['score']

# remove the experiments file
experiments_filepath.unlink()

def memset(self, allocation, value, size):
return super().memset(allocation, value, size)
@@ -129,3 +180,7 @@ def memcpy_dtoh(self, dest, src):

def memcpy_htod(self, dest, src):
return super().memcpy_htod(dest, src)

def refresh_memory(self, device_memory, host_arguments, should_sync):
"""This is a no-op for the hypertuner backend, as it does not manage memory directly."""
pass
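
To illustrate the new compiler_options plumbing above: the hypertuner backend now takes its GPUs, applications, and methodology overrides from the compiler_options dictionary when provided, falling back to the defaults set in __init__. A hedged sketch of what such a dictionary could look like, with illustrative paths and values that are not prescribed by this PR:

# Hedged sketch: overriding the hypertuner defaults via compiler_options.
# The keys "gpus", "applications" and "override" mirror the checks in
# HypertunerFunctions.__init__; file names and values are illustrative.
compiler_options = {
    "gpus": ["A100", "MI250X"],
    "applications": [{
        "name": "gemm_milo",
        "folder": "../autotuning_methodology/benchmark_hub/kernels",
        "input_file": "gemm_milo.json",
        "objective_performance_keys": ["time"],
    }],
    "override": {
        # forwarded verbatim to the experiment file generator
        "experimental_groups_defaults": {"samples": 32},
    },
}

Such a dictionary would travel through the compiler_options argument into HypertunerFunctions via the core.py change below.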
7 changes: 5 additions & 2 deletions kernel_tuner/core.py
@@ -315,10 +315,13 @@ def __init__(
observers=observers,
)
elif lang.upper() == "HYPERTUNER":
-dev = HypertunerFunctions(iterations=iterations)
+dev = HypertunerFunctions(
+    iterations=iterations,
+    compiler_options=compiler_options
+)
self.requires_warmup = False
else:
-raise ValueError(
+raise NotImplementedError(
"Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet"
)
self.dev = dev
25 changes: 24 additions & 1 deletion kernel_tuner/file_utils.py
@@ -3,6 +3,7 @@
import json
import subprocess
from importlib.metadata import PackageNotFoundError, requires, version
from importlib.util import spec_from_file_location, module_from_spec
from pathlib import Path
from sys import platform

@@ -152,7 +153,7 @@

# write output_data to a JSON file
version, _ = output_file_schema("results")
output_json = dict(results=output_data, schema_version=version, metadata={'timeunit': 'miliseconds'})
-output_json = dict(results=output_data, schema_version=version, metadata={'timeunit': 'miliseconds'})
+output_json = dict(results=output_data, schema_version=version, metadata={'timeunit': 'milliseconds'})
return output_json

def store_output_file(output_filename: str, results, tune_params, objective="time"):
Expand Down Expand Up @@ -302,3 +303,25 @@ def store_metadata_file(metadata_filename: str):
with open(metadata_filenamepath, "w+") as fh:
json.dump(metadata_json, fh, indent=" ")

def import_class_from_file(file_path: Path, class_name):
"""Import a class from a file."""

def load_module(module_name):
spec = spec_from_file_location(module_name, file_path)
if spec is None:
raise ImportError(f"Could not load spec from {file_path}")

# create a module from the spec and execute it
module = module_from_spec(spec)
spec.loader.exec_module(module)
if not hasattr(module, class_name):
raise ImportError(f"Module '{module_name}' has no class '{class_name}'")
return module

try:
module = load_module(file_path.stem)
except ImportError:
module = load_module(f"{file_path.parent.stem}.{file_path.stem}")

# return the class from the module
return getattr(module, class_name)
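
A hedged usage sketch for the new helper; the file path and class name below are illustrative stand-ins, not names from this PR:

# Load a user-defined optimizer class from an arbitrary Python file.
from pathlib import Path
from kernel_tuner.file_utils import import_class_from_file

# both the path and the class name are hypothetical examples
OptimizerClass = import_class_from_file(Path("my_strategies/my_optimizer.py"), "MyOptimizer")
optimizer = OptimizerClass()

The try/except around load_module means a file that fails to import under its bare stem (for instance due to a module name clash) gets a second chance under a package-qualified name.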