Hyperparameter tuning #289


Draft: wants to merge 183 commits into base: master

Commits (183)
5442303
New Bayesian Optimization methods
fjwillemsen Dec 1, 2021
1c95770
Updated to latest Kernel Tuner version
fjwillemsen Dec 1, 2021
3273dd3
Completely new Bayesian Optimization implementation
fjwillemsen Jan 12, 2022
5e0bfde
Enormous improvement in both performance and speed with BO GPyTorch, …
fjwillemsen Jan 15, 2022
e355c58
Made experimental Python runner parallel, completely new hyperparamet…
fjwillemsen Feb 16, 2022
cf1d4e4
Reverted file permissions
fjwillemsen Feb 16, 2022
531627a
Search spaces are now generated much more efficiently using python-co…
fjwillemsen Mar 24, 2022
2bc55be
Brought branch up to date with master
fjwillemsen Mar 25, 2022
5983e24
Added backwards compatibility with most python-constraint Constraints…
fjwillemsen Mar 25, 2022
f4c8e0b
Added new minmax initial sampling
fjwillemsen Apr 5, 2022
79bc266
Merge with master
fjwillemsen Apr 5, 2022
85f244e
Completed merge with searchspace_experiments
fjwillemsen Oct 23, 2024
b33c6bd
Skip strategies that don't have their dependencies installed
fjwillemsen Oct 25, 2024
208fe7b
Tuning new optimization algorithm
fjwillemsen Oct 25, 2024
6281a0c
Added new BO strategies to interface
fjwillemsen Oct 25, 2024
5ab70df
Made BO GPyTorch implementations importable
fjwillemsen Oct 25, 2024
e407a84
Compatibility with optional dependencies
fjwillemsen Oct 26, 2024
e6c457d
Improved time unit conversion
fjwillemsen Oct 29, 2024
a9f8de4
Changed hyperparameter tuning setup
fjwillemsen Oct 29, 2024
4231999
Added the hyperparamtuning experiments file
fjwillemsen Oct 29, 2024
6fe94ca
Changed from BAT to HIP paper searchspaces
fjwillemsen Oct 29, 2024
05c39cb
Changed from BAT to HIP paper searchspaces
fjwillemsen Oct 29, 2024
b0e4573
Complex restrictions with tunable parameters provided are compiled
fjwillemsen Oct 30, 2024
0a2748d
Made original BO compatible with Searchspaces
fjwillemsen Oct 31, 2024
6354f4d
Implemented a new acquisition function that takes the ratio between p…
fjwillemsen Oct 31, 2024
5401519
Changed supported Python versions to include 3.13, updated dependencies
fjwillemsen Nov 4, 2024
04eacc4
Setup Searchspace to Ax SearchSpace conversion
fjwillemsen Nov 5, 2024
5f31dfc
Implemented Ax as a BO strategy
fjwillemsen Nov 5, 2024
2e4f490
Made BO compatible with StopCriterion
fjwillemsen Nov 5, 2024
705e724
Minor compatibility change to BO strategies
fjwillemsen Nov 5, 2024
6cde57e
Extended hyperparameter tuning benchmark
fjwillemsen Nov 5, 2024
aed5f0d
Implemented Bayesian Optimization using BOTorch
fjwillemsen Nov 5, 2024
8c0dc49
Automatically time out any PyTest that takes longer than 60 seconds
fjwillemsen Nov 5, 2024
b9b748d
Avoided inadvertent use of cache in hyperparametertuning tests
fjwillemsen Nov 5, 2024
1778026
Avoided inadvertent use of cache in hyperparametertuning tests
fjwillemsen Nov 5, 2024
034352f
Shallow copy if the restrictions are copiable
fjwillemsen Nov 5, 2024
eba03f8
Refactored BO BOTorch into class structure
fjwillemsen Nov 5, 2024
c6b243a
Switched to newer fit function, more efficient model initialization b…
fjwillemsen Nov 6, 2024
1581840
Added option to return invalid configurations in CostFunc
fjwillemsen Nov 6, 2024
620ee60
Added the handling of invalid configurations, training data is direct…
fjwillemsen Nov 6, 2024
009cf01
Setup structure for Tensorspace in Searchspace
fjwillemsen Nov 6, 2024
33983f7
Implemented mappings and conversions to and from tensor to parameter …
fjwillemsen Nov 7, 2024
f3fc81b
Improved efficiency of acquisition function by removing evaluated con…
fjwillemsen Nov 7, 2024
a5a0471
Removed Ax, added BOTorch as dependency
fjwillemsen Nov 7, 2024
9429539
Convenience script for benchmarking BO
fjwillemsen Nov 7, 2024
176b8f5
Added objective, tuning direction and hyperparameter tuning language …
fjwillemsen Nov 7, 2024
196af62
Completed implementation of mixed-type handling and handling of inval…
fjwillemsen Nov 8, 2024
55a5c1a
Added docstrings, improved formatting
fjwillemsen Nov 8, 2024
d64f783
Extended strategies test to test for ability to handle non-numeric an…
fjwillemsen Nov 8, 2024
e95ab30
Mixed-type parameters are not converted to numeric constraints
fjwillemsen Nov 8, 2024
10a6a5c
CostFunc can now encode and decode non-numeric configurations for str…
fjwillemsen Nov 8, 2024
6ae3ba6
Fixed logging statements, improved formatting
fjwillemsen Nov 8, 2024
4873a20
Improved the performance of get_bounds
fjwillemsen Nov 8, 2024
bae7e96
Applied non-numeric encoding in differential evolution to handle non-…
fjwillemsen Nov 8, 2024
7eb7ef7
Implemented automatic conversion to multiple types for encoded tensor…
fjwillemsen Nov 8, 2024
91d3ce4
Added tests for Searchspace tensor encoding and conversion
fjwillemsen Nov 8, 2024
80d514e
Separated strategies and runners test cache file
fjwillemsen Nov 8, 2024
a489252
Implemented handling of categorical parameters
fjwillemsen Nov 8, 2024
68aee14
Implemented variational GP and likelihood
fjwillemsen Nov 8, 2024
b9c012d
Using LogExpectedImprovement to avoid stability issues
fjwillemsen Nov 8, 2024
41ce663
Implemented tensor space bounds in searchspace
fjwillemsen Nov 9, 2024
07ef1d4
Implemented normalization for input features
fjwillemsen Nov 9, 2024
721d072
Tensorspace is reduced by removing inconsequential parameters
fjwillemsen Nov 9, 2024
2434b3b
Extended strategies tests to include single parameter value
fjwillemsen Nov 9, 2024
1679751
Fixed an indexing error for tensorspace bounds
fjwillemsen Nov 9, 2024
2b816a6
Extended searchspace tests to include single parameter value
fjwillemsen Nov 9, 2024
c417585
Implemented additional acquisition functions, reduced number of reini…
fjwillemsen Nov 10, 2024
3d53b29
Implemented division of tensorspace into chunks for faster optimization
fjwillemsen Nov 10, 2024
3ed43a6
Switch to fit_gpytorch_mll_torch for faster fitting, use approximate …
fjwillemsen Nov 12, 2024
559813f
Implemented running BO on GPU / Apple Silicon, settable precision
fjwillemsen Nov 12, 2024
c391428
Removed Apple Silicon MPS support as cholesky operation is not yet im…
fjwillemsen Nov 12, 2024
07925c5
Implemented discrete local search for cases where the tensorspace isn…
fjwillemsen Nov 12, 2024
4113513
Implemented standardization of output
fjwillemsen Nov 12, 2024
ed12b5a
Implemented unified optimization direction
fjwillemsen Nov 12, 2024
d62c941
Updated outcome standardization
fjwillemsen Nov 12, 2024
1c015cb
Using extra information from variance in BO for better fits
fjwillemsen Nov 13, 2024
cad10f8
Implemented gradual cooldown on multi-feval depending on number of fe…
fjwillemsen Nov 13, 2024
1ed0352
Adjusted the calculation of number of optimization spaces to be more …
fjwillemsen Nov 19, 2024
38f084c
Two different kernels as test files for BO
fjwillemsen Nov 19, 2024
c447dc2
Setup structure for BOTorch transfer learning strategy as separate st…
fjwillemsen Nov 21, 2024
7c2fd51
Implemented Rank-Weighted GP Ensemble for transferlearning
fjwillemsen Nov 21, 2024
fec0e65
Avoided import of whole util submodule
fjwillemsen Nov 21, 2024
091ef47
Simplified BO transfer run loop
fjwillemsen Nov 21, 2024
ee11757
Implemented transfer learning caches in interface to be read and pass…
fjwillemsen Nov 21, 2024
1162ece
Added BO transfer learning strategy
fjwillemsen Nov 22, 2024
62fa135
Implemented optionally constructing a searchspace from a cache dictio…
fjwillemsen Nov 22, 2024
57a262f
Implemented construction of Searchspaces from caches
fjwillemsen Nov 22, 2024
964a6ee
Transfer learning inputs and outcomes are represented in Tensors
fjwillemsen Nov 22, 2024
24c6767
More general approach to model and likelihood initialization to make …
fjwillemsen Nov 22, 2024
dc4b4c7
Fitting a model for each base transfer learning task
fjwillemsen Nov 22, 2024
e21a605
Account for invalid configurations in base task caches
fjwillemsen Nov 22, 2024
e3cfe91
Implement main RGPE BO loop
fjwillemsen Nov 22, 2024
2334214
Improved the efficiency of taking initial sample
fjwillemsen Nov 22, 2024
c78a18c
Use of state dictionary is made optional
fjwillemsen Nov 22, 2024
8416098
Renamed RGPE strategy
fjwillemsen Nov 22, 2024
dc000b7
Implemented new transfer learning strategy with multiple independent GPs
fjwillemsen Nov 22, 2024
aa30ec2
Removed redundant min/max results adjustment
fjwillemsen Nov 23, 2024
fd6f95e
Result registration must be optimization direction dependent
fjwillemsen Nov 26, 2024
6963feb
Transfer learning by direct transfer of best configurations
fjwillemsen Nov 26, 2024
a08953e
BO update
fjwillemsen Mar 5, 2025
ecd7802
Improved conversion of tunable parameter
fjwillemsen Mar 5, 2025
b7cda36
Extended and improved conversion of T1 arguments, improved error repo…
fjwillemsen Mar 6, 2025
8836ce2
Improved selection of transfer learning caches
fjwillemsen Mar 7, 2025
539aed3
Fixed an error with quotes in an f-string
fjwillemsen Mar 7, 2025
435b56b
Fixed torch import error due to Tensor type hint
fjwillemsen Mar 7, 2025
3c48b49
Fixed torch import error due to Tensor type hint
fjwillemsen Mar 7, 2025
388f325
Fixed torch import error due to Tensor type hint
fjwillemsen Mar 7, 2025
373782f
Fixed torch import error due to Tensor type hint
fjwillemsen Mar 7, 2025
874c390
Merge with searchspace_experiments
fjwillemsen Mar 7, 2025
db3abb3
Merge with searchspace_experiments
fjwillemsen Mar 7, 2025
c692ba6
Loosened required positional arguments
fjwillemsen Mar 7, 2025
fe113e6
Changed benchmarks location for hypertuner
fjwillemsen Mar 7, 2025
5e65abd
Used hip-python-fork package as hip-python is not available
fjwillemsen Mar 7, 2025
0ba00a0
Removed transfer learning references
fjwillemsen Mar 7, 2025
6633bed
Updated pyproject
fjwillemsen Mar 7, 2025
23f557f
Merge branch 'searchspace_experiments' into hyperparametertuning
fjwillemsen Mar 7, 2025
c39ac5a
Adjusted hyper.py for paper
fjwillemsen Mar 7, 2025
cc19515
Extended hypertuner with additional kernels, adjusted for benchmark_hub
fjwillemsen Mar 7, 2025
1433930
Merge branch 'hyperparametertuning' of https://github.com/benvanwerkh…
fjwillemsen Mar 7, 2025
638d216
Implemented passing strategy to hyperparametertune by CLI argument
fjwillemsen Mar 8, 2025
d36adb5
Extended hyperparameter tuning with 4 more strategies
fjwillemsen Mar 8, 2025
4e46459
Generate a unique filename for generated experiment files to avoid co…
fjwillemsen Mar 8, 2025
d28fdbe
Adjusted the test / train sets and number of repeats
fjwillemsen Mar 10, 2025
49fa92f
Added simulated_annealing to hyperparameter tuning, adjusted greedy_i…
fjwillemsen Mar 10, 2025
1056269
Updated hyperparameters
fjwillemsen Mar 13, 2025
7ce2234
Updated search spaces used in hyperparameter tuning and number of rep…
fjwillemsen Mar 13, 2025
1ed1893
Added bayes_opt to hyperparamtuning
fjwillemsen Mar 15, 2025
1e2532f
Fixed link with hyperparameter tuning attributes
fjwillemsen Mar 17, 2025
7b7bd8b
Merge branch 'hyperparametertuning' of https://github.com/benvanwerkh…
fjwillemsen Mar 17, 2025
afbf83e
Added support for evaluating T1 strings as a type
fjwillemsen Mar 17, 2025
84a2b1f
Added automatic scaling of random sample size if necessary
fjwillemsen Mar 17, 2025
9e80479
Formatting
fjwillemsen Mar 17, 2025
ce552d0
Minor update to hyperparameter tuning
fjwillemsen Mar 18, 2025
5da8845
Merge branch 'hyperparametertuning' of https://github.com/benvanwerkh…
fjwillemsen Mar 18, 2025
2714c28
Set new default hyperparameters for PSO, dual annealing and simulated…
fjwillemsen Mar 18, 2025
25d5202
Set new default hyperparameters for Genetic Algorithm and Differentia…
fjwillemsen Mar 18, 2025
651c42c
Avoid requesting more random samples than the searchspace size
fjwillemsen Mar 20, 2025
b953a69
Clearer message when exceeding the stop criterion
fjwillemsen Mar 20, 2025
a401008
Add soft maximum function evaluations limit to dual annealing
fjwillemsen Mar 20, 2025
425b4f4
Improved rounding of encoded parameter values
fjwillemsen Mar 20, 2025
0b7ec15
Merge of doc requirements files
fjwillemsen Mar 20, 2025
3bba923
Updated pyproject and requirements files
fjwillemsen Mar 20, 2025
64dfd95
Improved assertion error message
fjwillemsen Mar 20, 2025
5a83d36
Added logging in case default block size restriction is added
fjwillemsen Mar 20, 2025
5e3512b
Adjusted path to benchmarking kernels
fjwillemsen Mar 20, 2025
bff6d7b
Automatically adjust genetic algorithm popsize for smaller search spaces
fjwillemsen Mar 20, 2025
8ddce18
Updated poetry configuration fields to project configuration fields, …
fjwillemsen Mar 20, 2025
19470e4
Removed not yet fully implemented bayesian optimization references, m…
fjwillemsen Mar 20, 2025
d2bb76a
Avoid import of whole util module
fjwillemsen Mar 20, 2025
58f147f
Avoid import of whole util module
fjwillemsen Mar 20, 2025
a48394a
Avoid import of whole util module
fjwillemsen Mar 20, 2025
5dd3e4c
Updated dependencies, required python version and bumped version
fjwillemsen Mar 25, 2025
02833f3
Updated dependencies, required python version and bumped version
fjwillemsen Mar 25, 2025
b820419
Updated documentation dependencies
fjwillemsen Mar 25, 2025
11b378f
Added python version classifiers
fjwillemsen Mar 26, 2025
6550916
Improved code quality based on sonarcloud issues
fjwillemsen Mar 26, 2025
6770d3c
Removed PythonFunctions approach to hyperparameter tuning that is no …
fjwillemsen Mar 26, 2025
3dbe379
Removed bayes_opt_old as a strategy
fjwillemsen Mar 26, 2025
dcd102b
Report last HIP error on error
fjwillemsen Mar 26, 2025
1f935a1
Merge branch 'hyperparametertuning' of https://github.com/KernelTuner…
fjwillemsen Mar 26, 2025
290a860
Added docstring to ScoreObserver class
fjwillemsen Mar 26, 2025
496af94
Reduced cognitive complexity
fjwillemsen Mar 26, 2025
063fe97
Merge branch 'hyperparametertuning' of https://github.com/KernelTuner…
fjwillemsen Mar 26, 2025
c1c3a71
Improved development environment creation specification
fjwillemsen Mar 26, 2025
26914be
Merge with recent additions to searchspace_experiments
fjwillemsen Apr 3, 2025
54010b4
introduced repair technique in genetic algorithm
benvanwerkhoven Apr 30, 2025
71e3de8
added non-constraint-aware initialization and mutation for comparison
benvanwerkhoven Apr 30, 2025
67a5070
fix test_mutate
benvanwerkhoven May 1, 2025
939ea19
constraint-aware variants for pso, firefly, and sa
benvanwerkhoven May 12, 2025
b358265
remove unused variable
benvanwerkhoven May 12, 2025
2d24ae9
Added objective performance keys
fjwillemsen May 12, 2025
77676c8
Support for time-based cutoff with T1 format
fjwillemsen May 13, 2025
2e718f4
Merge with constrained_optimization
fjwillemsen May 13, 2025
9196266
Improvements to constraint-aware strategies
fjwillemsen May 13, 2025
83df948
Implemented passing settings to hyperparameter tuner, improved hyperp…
fjwillemsen May 13, 2025
f6811ab
Added firefly to hyperparameter tuning, various minor improvements
fjwillemsen May 14, 2025
e4af9f7
Added explicit restrictions definition to hyperparameter tuning
fjwillemsen May 15, 2025
5f3b6fc
Updated tune_kernel_T1 to be more broadly applicable
fjwillemsen May 16, 2025
7f3a4a3
Updated hyperparameters to newly tuned defaults
fjwillemsen May 24, 2025
80a5b62
Set default arguments if not provided
fjwillemsen May 28, 2025
79fe080
Merge with master branch
fjwillemsen May 28, 2025
e9797e2
Made Hypertuner backend compatible with changes to Backend ABC
fjwillemsen May 28, 2025
1a4c439
Adjusted GA popsize to only be adjusted when necessary
fjwillemsen May 28, 2025
6 changes: 4 additions & 2 deletions .gitignore
@@ -2,7 +2,7 @@
poetry.lock
noxenv.txt
noxsettings.toml
hyperparamtuning/
hyperparamtuning*/*
*.prof

### Python ###
@@ -20,13 +20,15 @@ push_to_pypi.sh
*.json
!kernel_tuner/schema/T1/1.0.0/input-schema.json
!test/test_T1_input.json
!test_cache_file.json
*.csv
.cache
*.ipynb_checkpoints
examples/cuda/output
deploy_key
*.mod
temp_*.*
.DS_Store
.python-version
.nox

@@ -41,4 +43,4 @@ temp_*.*
.LSOverride

.vscode
.idea
.idea
2 changes: 1 addition & 1 deletion README.md
@@ -16,7 +16,7 @@


Create optimized GPU applications in any mainstream GPU
programming language (CUDA, HIP, OpenCL, OpenACC).
programming language (CUDA, HIP, OpenCL, OpenACC, OpenMP).

What Kernel Tuner does:

179 changes: 92 additions & 87 deletions doc/requirements.txt

Large diffs are not rendered by default.

518 changes: 272 additions & 246 deletions doc/requirements_test.txt

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion doc/source/dev-environment.rst
@@ -78,7 +78,7 @@ Steps without :bash:`sudo` access (e.g. on a cluster):
* Verify that your development environment has no missing installs or updates with :bash:`poetry install --sync --dry-run --with test`.
#. Check if the environment is setup correctly by running :bash:`pytest`. All tests should pass, except if you're not on a GPU node, or one or more extras has been left out in the previous step, then these tests will skip gracefully.
#. Set Nox to use the correct backend and location:
* Run :bash:`conda -- create-settings-file` to automatically create a settings file.
* Run :bash:`nox -- create-settings-file` to automatically create a settings file.
* In this settings file :bash:`noxsettings.toml`, change the :bash:`venvbackend`:
* If you used Mamba in step 2, to :bash:`mamba`.
* If you used Miniconda or Anaconda in step 2, to :bash:`conda`.
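For orientation, a minimal sketch of what the generated noxsettings.toml could contain after running the step above. Only the venvbackend key is confirmed by this diff; the comment and layout are assumptions.

# noxsettings.toml (hypothetical contents; only venvbackend is confirmed by this change)
venvbackend = "conda"  # set to "mamba" or "conda", matching the environment manager used in step 2
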
7 changes: 3 additions & 4 deletions examples/c/vector_add.py
100755 → 100644
@@ -26,7 +26,7 @@
}
"""

size = 72*1024*1024
size = 72 * 1024 * 1024

a = numpy.random.randn(size).astype(numpy.float32)
b = numpy.random.randn(size).astype(numpy.float32)
@@ -39,7 +39,6 @@
tune_params["nthreads"] = [1, 2, 3, 4, 8, 12, 16, 24, 32]
tune_params["vecsize"] = [1, 2, 4, 8, 16]

answer = [a+b, None, None, None]
answer = [a + b, None, None, None]

tune_kernel("vector_add", kernel_string, size, args, tune_params,
answer=answer, compiler_options=['-O3'])
tune_kernel("vector_add", kernel_string, size, args, tune_params, answer=answer, compiler_options=["-fopenmp", "-O3"])
Empty file modified examples/cuda-c++/vector_add.py
100755 → 100644
Empty file.
Empty file modified examples/cuda-c++/vector_add_blocksize.py
100755 → 100644
Empty file.
Empty file modified examples/cuda-c++/vector_add_cupy.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/convolution.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/convolution_correct.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/convolution_streams.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/expdist.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/matmul.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/pnpoly.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/python_kernel.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/reduction.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/sepconv.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/spmv.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/stencil.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/test_vector_add.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/test_vector_add_parameterized.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/vector_add.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/vector_add_codegen.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/vector_add_cupy.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/vector_add_jinja.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/vector_add_metric.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/vector_add_observers.py
100755 → 100644
Empty file.
Empty file modified examples/cuda/zeromeanfilter.py
100755 → 100644
Empty file.
72 changes: 72 additions & 0 deletions examples/directives/histogram_c_openacc.py
@@ -0,0 +1,72 @@
#!/usr/bin/env python
"""This is a simple example for tuning C++ OpenACC code with the kernel tuner"""
import numpy as np

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives


# Naive Python histogram implementation
def histogram(vector, hist):
for i in range(0, len(vector)):
hist[vector[i]] += 1
return hist


code = """
#include <stdlib.h>

#define HIST_SIZE 256
#define VECTOR_SIZE 1000000

#pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE)
#if enable_reduction == 1
#pragma acc parallel num_gangs(ngangs) vector_length(nthreads) reduction(+:hist[:HIST_SIZE])
#else
#pragma acc parallel num_gangs(ngangs) vector_length(nthreads)
#endif
#pragma acc loop independent
for ( int i = 0; i < VECTOR_SIZE; i++ ) {
#if enable_atomic == 1
#pragma acc atomic update
#endif
hist[vector[i]] += 1;
}
#pragma tuner stop
"""

# Extract tunable directive
app = Code(OpenACC(), Cxx())
kernel_string, kernel_args = process_directives(app, code)

tune_params = dict()
tune_params["ngangs"] = [2**i for i in range(1, 11)]
tune_params["nthreads"] = [32 * i for i in range(1, 33)]
tune_params["enable_reduction"] = [0, 1]
tune_params["enable_atomic"] = [0, 1]
constraints = ["enable_reduction != enable_atomic"]
metrics = dict()
metrics["GB/s"] = (
lambda x: ((2 * 4 * len(kernel_args["histogram"][0])) + (4 * len(kernel_args["histogram"][0])))
/ (x["time"] / 10**3)
/ 10**9
)

kernel_args["histogram"][0] = np.random.randint(0, 256, len(kernel_args["histogram"][0]), dtype=np.int32)
kernel_args["histogram"][1] = np.zeros(len(kernel_args["histogram"][1])).astype(np.int32)
reference_hist = np.zeros_like(kernel_args["histogram"][1]).astype(np.int32)
reference_hist = histogram(kernel_args["histogram"][0], reference_hist)
answer = [None, reference_hist]

tune_kernel(
"histogram",
kernel_string["histogram"],
0,
kernel_args["histogram"],
tune_params,
restrictions=constraints,
metrics=metrics,
answer=answer,
compiler="nvc++",
compiler_options=["-fast", "-acc=gpu"],
)
71 changes: 71 additions & 0 deletions examples/directives/histogram_c_openmp.py
@@ -0,0 +1,71 @@
#!/usr/bin/env python
"""This is a simple example for tuning C++ OpenMP code with the kernel tuner"""
import numpy as np

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives


# Naive Python histogram implementation
def histogram(vector, hist):
for i in range(0, len(vector)):
hist[vector[i]] += 1
return hist


code = """
#include <stdlib.h>

#define HIST_SIZE 256
#define VECTOR_SIZE 1000000

#pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE)
#if enable_reduction == 1
#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads) reduction(+:hist[:HIST_SIZE])
#else
#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads)
#endif
for ( int i = 0; i < VECTOR_SIZE; i++ ) {
#if enable_atomic == 1
#pragma omp atomic update
#endif
hist[vector[i]] += 1;
}
#pragma tuner stop
"""

# Extract tunable directive
app = Code(OpenMP(), Cxx())
kernel_string, kernel_args = process_directives(app, code)

tune_params = dict()
tune_params["nteams"] = [2**i for i in range(1, 11)]
tune_params["nthreads"] = [32 * i for i in range(1, 33)]
tune_params["enable_reduction"] = [0, 1]
tune_params["enable_atomic"] = [0, 1]
constraints = ["enable_reduction != enable_atomic"]
metrics = dict()
metrics["GB/s"] = (
lambda x: ((2 * 4 * len(kernel_args["histogram"][0])) + (4 * len(kernel_args["histogram"][0])))
/ (x["time"] / 10**3)
/ 10**9
)

kernel_args["histogram"][0] = np.random.randint(0, 256, len(kernel_args["histogram"][0]), dtype=np.int32)
kernel_args["histogram"][1] = np.zeros(len(kernel_args["histogram"][1])).astype(np.int32)
reference_hist = np.zeros_like(kernel_args["histogram"][1]).astype(np.int32)
reference_hist = histogram(kernel_args["histogram"][0], reference_hist)
answer = [None, reference_hist]

tune_kernel(
"histogram",
kernel_string["histogram"],
0,
kernel_args["histogram"],
tune_params,
restrictions=constraints,
metrics=metrics,
answer=answer,
compiler="nvc++",
compiler_options=["-fast", "-mp=gpu"],
)
18 changes: 10 additions & 8 deletions examples/directives/matrix_multiply_c_openacc.py
@@ -1,13 +1,8 @@
#!/usr/bin/env python
"""This is an example tuning a naive matrix multiplication using the simplified directives interface"""

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import (
Code,
OpenACC,
Cxx,
process_directives
)
from kernel_tuner import tune_kernel, run_kernel
from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives

N = 4096

@@ -45,13 +40,20 @@
metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9
metrics["GFLOP/s"] = lambda x: (N**3 * 3) / x["time_s"] / 10**9

# compute reference solution from CPU
results = run_kernel(
"mm", kernel_string["mm"], 0, kernel_args["mm"], {"nthreads": 1}, compiler="nvc++", compiler_options=["-fast"]
)
answer = [None, None, results[2]]

tune_kernel(
"mm",
kernel_string["mm"],
0,
kernel_args["mm"],
tune_params,
metrics=metrics,
compiler_options=["-fast", "-acc=gpu"],
answer=answer,
compiler="nvc++",
compiler_options=["-fast", "-acc=gpu"],
)
59 changes: 59 additions & 0 deletions examples/directives/matrix_multiply_c_openmp.py
@@ -0,0 +1,59 @@
#!/usr/bin/env python
"""This is an example tuning a naive matrix multiplication using the simplified directives interface"""

from kernel_tuner import tune_kernel, run_kernel
from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives

N = 4096

code = """
#define N 4096

void matrix_multiply(float *A, float *B, float *C) {
#pragma tuner start mm A(float*:NN) B(float*:NN) C(float*:NN)
float temp_sum = 0.0f;
#pragma omp target
#pragma omp teams distribute collapse(2)
for ( int i = 0; i < N; i++) {
for ( int j = 0; j < N; j++ ) {
temp_sum = 0.0f;
#pragma omp parallel for num_threads(nthreads) reduction(+:temp_sum)
for ( int k = 0; k < N; k++ ) {
temp_sum += A[(i * N) + k] * B[(k * N) + j];
}
C[(i * N) + j] = temp_sum;
}
}
#pragma tuner stop
}
"""

# Extract tunable directive
app = Code(OpenMP(), Cxx())
dims = {"NN": N**2}
kernel_string, kernel_args = process_directives(app, code, user_dimensions=dims)

tune_params = dict()
tune_params["nthreads"] = [32 * i for i in range(1, 33)]
metrics = dict()
metrics["time_s"] = lambda x: x["time"] / 10**3
metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9
metrics["GFLOP/s"] = lambda x: (N**3 * 3) / x["time_s"] / 10**9

# compute reference solution from CPU
results = run_kernel(
"mm", kernel_string["mm"], 0, kernel_args["mm"], {"nthreads": 1}, compiler="nvc++", compiler_options=["-fast"]
)
answer = [None, None, results[2]]

tune_kernel(
"mm",
kernel_string["mm"],
0,
kernel_args["mm"],
tune_params,
metrics=metrics,
answer=answer,
compiler="nvc++",
compiler_options=["-fast", "-mp=gpu"],
)
2 changes: 1 addition & 1 deletion examples/directives/vector_add_c_openacc.py
@@ -67,6 +67,6 @@
tune_params,
metrics=metrics,
answer=answer,
compiler_options=["-fast", "-acc=gpu"],
compiler="nvc++",
compiler_options=["-fast", "-acc=gpu"],
)
57 changes: 57 additions & 0 deletions examples/directives/vector_add_c_openmp.py
@@ -0,0 +1,57 @@
#!/usr/bin/env python
"""This is a simple example for tuning C++ OpenMP code with the kernel tuner"""

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives

code = """
#include <stdlib.h>

#define VECTOR_SIZE 1000000

int main(void) {
int size = VECTOR_SIZE;
float * a = (float *) malloc(VECTOR_SIZE * sizeof(float));
float * b = (float *) malloc(VECTOR_SIZE * sizeof(float));
float * c = (float *) malloc(VECTOR_SIZE * sizeof(float));

#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)
#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads)
for ( int i = 0; i < size; i++ ) {
c[i] = a[i] + b[i];
}
#pragma tuner stop

free(a);
free(b);
free(c);
}
"""

# Extract tunable directive
app = Code(OpenMP(), Cxx())
kernel_string, kernel_args = process_directives(app, code)

tune_params = dict()
tune_params["nteams"] = [2**i for i in range(1, 11)]
tune_params["nthreads"] = [32 * i for i in range(1, 33)]
metrics = dict()
metrics["GB/s"] = (
lambda x: ((2 * 4 * len(kernel_args["vector_add"][0])) + (4 * len(kernel_args["vector_add"][0])))
/ (x["time"] / 10**3)
/ 10**9
)

answer = [None, None, kernel_args["vector_add"][0] + kernel_args["vector_add"][1], None]

tune_kernel(
"vector_add",
kernel_string["vector_add"],
0,
kernel_args["vector_add"],
tune_params,
metrics=metrics,
answer=answer,
compiler="nvc++",
compiler_options=["-fast", "-mp=gpu"],
)
2 changes: 1 addition & 1 deletion examples/directives/vector_add_fortran_openacc.py
@@ -62,6 +62,6 @@
tune_params,
metrics=metrics,
answer=answer,
compiler_options=["-fast", "-acc=gpu"],
compiler="nvfortran",
compiler_options=["-fast", "-acc=gpu"],
)