Expanded custom optimizer test to T1 input format

fjwillemsen · fjwillemsen · commit 160d4b7db4b8 · 2025-06-05T10:02:22.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -20,7 +20,7 @@ push_to_pypi.sh
 *.json
 !kernel_tuner/schema/T1/1.0.0/input-schema.json
 !test/test_T1_input.json
-!test_cache_file.json
+!test_cache_file*.json
 *.csv
 .cache
 *.ipynb_checkpoints
diff --git a/test/test_cache_file_T1_input.json b/test/test_cache_file_T1_input.json
@@ -0,0 +1,82 @@
+{
+    "General": {
+        "BenchmarkName": "vector_add",
+        "OutputFormat": "JSON"
+    },
+    "ConfigurationSpace": {
+        "TuningParameters": [
+            {
+                "Name": "block_size_x",
+                "Type": "int",
+                "Values": "[128+64*i for i in range(15)]",
+                "Default": 512
+            }
+        ],
+        "Conditions": []
+    },
+    "KernelSpecification": {
+        "Language": "CUDA",
+        "CompilerOptions": [
+            "-std=c++11"
+        ],
+        "BenchmarkName": "vector_add",
+        "KernelName": "vector_add",
+        "KernelFile": "vector_add.cu",
+        "GlobalSizeType": "CUDA",
+        "LocalSize": {
+            "X": "block_size_x",
+            "Y": "1",
+            "Z": "1"
+        },
+        "GlobalSize": {
+            "X": "10000000 // block_size_x",
+            "Y": "1",
+            "Z": "1"
+        },
+        "GridDivX": [
+            "block_size_x"
+        ],
+        "GridDivY": [
+            "block_size_y"
+        ],
+        "ProblemSize": [],
+        "SharedMemory": 0,
+        "Stream": null,
+        "Arguments": [
+            {
+                "Name": "a",
+                "Type": "float",
+                "MemoryType": "Vector",
+                "AccessType": "ReadOnly",
+                "FillType": "Random",
+                "Size": 10000000,
+                "FillValue": 1.0
+            },
+            {
+                "Name": "b",
+                "Type": "float",
+                "MemoryType": "Vector",
+                "AccessType": "ReadOnly",
+                "FillType": "Random",
+                "Size": 10000000,
+                "FillValue": 1.0
+            },
+            {
+                "Name": "c",
+                "Type": "float",
+                "MemoryType": "Vector",
+                "AccessType": "WriteOnly",
+                "FillType": "Constant",
+                "Size": 10000000,
+                "FillValue": 0.0
+            },
+            {
+                "Name": "n",
+                "Type": "int32",
+                "MemoryType": "Scalar",
+                "AccessType": "ReadOnly",
+                "FillValue": 10000000
+            }
+        ]
+    }
+}
diff --git a/test/test_custom_optimizer.py b/test/test_custom_optimizer.py
@@ -120,16 +120,14 @@ def _local_refinement(self, func, best_params, best_value, evaluations, lb, ub):
         return best_params, best_value, evaluations
 
 
-
-
 ### Testing the Optimization Algorithm Wrapper in Kernel Tuner
-import os
-from kernel_tuner import tune_kernel
+from kernel_tuner import tune_kernel, tune_kernel_T1
 from kernel_tuner.strategies.wrapper import OptAlgWrapper
+from pathlib import Path
 
-from .test_runners import env
+from .test_runners import env   # noqa: F401
 
-cache_filename = os.path.dirname(os.path.realpath(__file__)) + "/test_cache_file.json"
+cache_filename = Path(__file__).parent.resolve() / "test_cache_file.json"
 
 def test_OptAlgWrapper(env):
     kernel_name, kernel_string, size, args, tune_params = env
@@ -143,6 +141,33 @@ def test_OptAlgWrapper(env):
     strategy_options = { 'max_fevals': 15 }
 
     # Call the tuner
-    tune_kernel(kernel_name, kernel_string, size, args, tune_params,
+    res, _ = tune_kernel(kernel_name, kernel_string, size, args, tune_params,
                 strategy=strategy, strategy_options=strategy_options, cache=cache_filename,
                 simulation_mode=True, verbose=True)
+    assert len(res) == strategy_options['max_fevals']
+
+def test_OptAlgWrapper_T1(env):
+    """Tune from the T1 input file, passing this file as `custom_search_method_path`."""
+
+    strategy = "HybridDELocalRefinement"
+    strategy_options = {
+        "max_fevals": 15,
+        "custom_search_method_path": Path(__file__).resolve(),
+        "constraint_aware": False,
+    }
+    # `env` is only requested as a fixture; the kernel is described by the T1 input file.
+
+    res, _ = tune_kernel_T1(
+        Path(__file__).parent.resolve() / "test_cache_file_T1_input.json",
+        cache_filename,
+        device="NVIDIA RTX A4000",
+        objective="time",
+        objective_higher_is_better=False,
+        simulation_mode=True,
+        output_T4=False,
+        iterations=1,
+        strategy=strategy,
+        strategy_options=strategy_options,
+    )
+
+    assert len(res) == strategy_options["max_fevals"]
diff --git a/test/vector_add.cu b/test/vector_add.cu
@@ -0,0 +1,6 @@
+__global__ void vector_add(float *c, float *a, float *b, int n) {
+    // block_size_x is not declared in this file; presumably injected at compile time (confirm).
+    const int idx = blockIdx.x * block_size_x + threadIdx.x;
+    if (idx >= n) return;  // threads past the end of the vectors do nothing
+    c[idx] = a[idx] + b[idx];
+}