
Commit 8afc3d6

Merge remote-tracking branch 'origin/master' into custom_strategies
2 parents 1c57201 + 948c957

33 files changed: +635 −468 lines

INSTALL.rst

Lines changed: 1 addition & 1 deletion

@@ -125,7 +125,7 @@ Or you could install Kernel Tuner and PyOpenCL together if you haven't done so already
 If this fails, please see the PyOpenCL installation guide (https://wiki.tiker.net/PyOpenCL/Installation)
 
 HIP and HIP Python
--------------
+------------------
 
 Before we can install HIP Python, you'll need to have the HIP runtime and compiler installed on your system.
 The HIP compiler is included as part of the ROCm software stack. Here is AMD's installation guide:
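As this hunk notes, HIP Python sits on top of a working ROCm installation. A minimal pre-flight sketch, assuming AMD's hip-python package (whose documented entry point is "from hip import hip"), to check the bindings are in place before using Kernel Tuner's HIP backend:

    # Minimal sketch: confirm the HIP Python bindings are importable before
    # pointing Kernel Tuner at the HIP backend. Assumes AMD's hip-python
    # package installed on top of the ROCm runtime and compiler.
    try:
        from hip import hip  # noqa: F401
        print("HIP Python bindings found")
    except ImportError:
        print("hip-python missing: install ROCm first, then HIP Python")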

README.md

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@
 
 
 Create optimized GPU applications in any mainstream GPU
-programming language (CUDA, HIP, OpenCL, OpenACC).
+programming language (CUDA, HIP, OpenCL, OpenACC, OpenMP).
 
 What Kernel Tuner does:

doc/source/architecture.png

Binary file changed (10.6 KB)

doc/source/design.rst

Lines changed: 2 additions & 8 deletions

@@ -89,14 +89,8 @@ kernel_tuner.runners.sequential.SimulationRunner
     :members:
 
 
-Device Interfaces
------------------
-
-kernel_tuner.core.DeviceInterface
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: kernel_tuner.core.DeviceInterface
-    :special-members: __init__
-    :members:
+Backends
+--------
 
 kernel_tuner.backends.pycuda.PyCudaFunctions
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

examples/c/vector_add.py

Lines changed: 3 additions & 4 deletions

@@ -26,7 +26,7 @@
 }
 """
 
-size = 72*1024*1024
+size = 72 * 1024 * 1024
 
 a = numpy.random.randn(size).astype(numpy.float32)
 b = numpy.random.randn(size).astype(numpy.float32)
@@ -39,7 +39,6 @@
 tune_params["nthreads"] = [1, 2, 3, 4, 8, 12, 16, 24, 32]
 tune_params["vecsize"] = [1, 2, 4, 8, 16]
 
-answer = [a+b, None, None, None]
+answer = [a + b, None, None, None]
 
-tune_kernel("vector_add", kernel_string, size, args, tune_params,
-            answer=answer, compiler_options=['-O3'])
+tune_kernel("vector_add", kernel_string, size, args, tune_params, answer=answer, compiler_options=["-fopenmp", "-O3"])
New file

Lines changed: 72 additions & 0 deletions

@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+"""This is a simple example for tuning C++ OpenACC code with the kernel tuner"""
+import numpy as np
+
+from kernel_tuner import tune_kernel
+from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives
+
+
+# Naive Python histogram implementation
+def histogram(vector, hist):
+    for i in range(0, len(vector)):
+        hist[vector[i]] += 1
+    return hist
+
+
+code = """
+#include <stdlib.h>
+
+#define HIST_SIZE 256
+#define VECTOR_SIZE 1000000
+
+#pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE)
+#if enable_reduction == 1
+#pragma acc parallel num_gangs(ngangs) vector_length(nthreads) reduction(+:hist[:HIST_SIZE])
+#else
+#pragma acc parallel num_gangs(ngangs) vector_length(nthreads)
+#endif
+#pragma acc loop independent
+for ( int i = 0; i < VECTOR_SIZE; i++ ) {
+    #if enable_atomic == 1
+    #pragma acc atomic update
+    #endif
+    hist[vector[i]] += 1;
+}
+#pragma tuner stop
+"""
+
+# Extract tunable directive
+app = Code(OpenACC(), Cxx())
+kernel_string, kernel_args = process_directives(app, code)
+
+tune_params = dict()
+tune_params["ngangs"] = [2**i for i in range(1, 11)]
+tune_params["nthreads"] = [32 * i for i in range(1, 33)]
+tune_params["enable_reduction"] = [0, 1]
+tune_params["enable_atomic"] = [0, 1]
+constraints = ["enable_reduction != enable_atomic"]
+metrics = dict()
+metrics["GB/s"] = (
+    lambda x: ((2 * 4 * len(kernel_args["histogram"][0])) + (4 * len(kernel_args["histogram"][0])))
+    / (x["time"] / 10**3)
+    / 10**9
+)
+
+kernel_args["histogram"][0] = np.random.randint(0, 256, len(kernel_args["histogram"][0]), dtype=np.int32)
+kernel_args["histogram"][1] = np.zeros(len(kernel_args["histogram"][1])).astype(np.int32)
+reference_hist = np.zeros_like(kernel_args["histogram"][1]).astype(np.int32)
+reference_hist = histogram(kernel_args["histogram"][0], reference_hist)
+answer = [None, reference_hist]
+
+tune_kernel(
+    "histogram",
+    kernel_string["histogram"],
+    0,
+    kernel_args["histogram"],
+    tune_params,
+    restrictions=constraints,
+    metrics=metrics,
+    answer=answer,
+    compiler="nvc++",
+    compiler_options=["-fast", "-acc=gpu"],
+)
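The GB/s metric in this new example budgets three 4-byte accesses per input element (one vector load plus a read-modify-write of a histogram bin) and divides by the runtime, which Kernel Tuner reports in milliseconds. A worked check of that arithmetic, using a hypothetical kernel time:

    # Worked check of the GB/s metric above; time_ms stands in for the
    # hypothetical value Kernel Tuner would report in x["time"].
    n = 1_000_000                        # VECTOR_SIZE int32 elements
    bytes_moved = (2 * 4 * n) + (4 * n)  # 12 bytes accounted per element
    time_ms = 1.5
    print(bytes_moved / (time_ms / 10**3) / 10**9)  # 8.0 GB/s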
New file

Lines changed: 71 additions & 0 deletions

@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+"""This is a simple example for tuning C++ OpenMP code with the kernel tuner"""
+import numpy as np
+
+from kernel_tuner import tune_kernel
+from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives
+
+
+# Naive Python histogram implementation
+def histogram(vector, hist):
+    for i in range(0, len(vector)):
+        hist[vector[i]] += 1
+    return hist
+
+
+code = """
+#include <stdlib.h>
+
+#define HIST_SIZE 256
+#define VECTOR_SIZE 1000000
+
+#pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE)
+#if enable_reduction == 1
+#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads) reduction(+:hist[:HIST_SIZE])
+#else
+#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads)
+#endif
+for ( int i = 0; i < VECTOR_SIZE; i++ ) {
+    #if enable_atomic == 1
+    #pragma omp atomic update
+    #endif
+    hist[vector[i]] += 1;
+}
+#pragma tuner stop
+"""
+
+# Extract tunable directive
+app = Code(OpenMP(), Cxx())
+kernel_string, kernel_args = process_directives(app, code)
+
+tune_params = dict()
+tune_params["nteams"] = [2**i for i in range(1, 11)]
+tune_params["nthreads"] = [32 * i for i in range(1, 33)]
+tune_params["enable_reduction"] = [0, 1]
+tune_params["enable_atomic"] = [0, 1]
+constraints = ["enable_reduction != enable_atomic"]
+metrics = dict()
+metrics["GB/s"] = (
+    lambda x: ((2 * 4 * len(kernel_args["histogram"][0])) + (4 * len(kernel_args["histogram"][0])))
+    / (x["time"] / 10**3)
+    / 10**9
+)
+
+kernel_args["histogram"][0] = np.random.randint(0, 256, len(kernel_args["histogram"][0]), dtype=np.int32)
+kernel_args["histogram"][1] = np.zeros(len(kernel_args["histogram"][1])).astype(np.int32)
+reference_hist = np.zeros_like(kernel_args["histogram"][1]).astype(np.int32)
+reference_hist = histogram(kernel_args["histogram"][0], reference_hist)
+answer = [None, reference_hist]
+
+tune_kernel(
+    "histogram",
+    kernel_string["histogram"],
+    0,
+    kernel_args["histogram"],
+    tune_params,
+    restrictions=constraints,
+    metrics=metrics,
+    answer=answer,
+    compiler="nvc++",
+    compiler_options=["-fast", "-mp=gpu"],
+)
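This OpenMP variant mirrors the OpenACC example, with num_teams/num_threads standing in for num_gangs/vector_length. In both, the restriction "enable_reduction != enable_atomic" ensures every measured configuration uses exactly one update strategy. A conceptual sketch (not Kernel Tuner's internals) of how such a string restriction prunes the search space:

    # Conceptual sketch of restriction pruning, not Kernel Tuner internals:
    # the restriction string is evaluated against every candidate
    # configuration and only the passing ones are benchmarked.
    import itertools

    tune_params = {
        "nteams": [2**i for i in range(1, 11)],
        "nthreads": [32 * i for i in range(1, 33)],
        "enable_reduction": [0, 1],
        "enable_atomic": [0, 1],
    }
    keys = list(tune_params)
    survivors = [
        dict(zip(keys, values))
        for values in itertools.product(*tune_params.values())
        if eval("enable_reduction != enable_atomic", {}, dict(zip(keys, values)))
    ]
    print(len(survivors))  # 640 of the 1280 raw combinations remain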

examples/directives/matrix_multiply_c_openacc.py

Lines changed: 10 additions & 8 deletions

@@ -1,13 +1,8 @@
 #!/usr/bin/env python
 """This is an example tuning a naive matrix multiplication using the simplified directives interface"""
 
-from kernel_tuner import tune_kernel
-from kernel_tuner.utils.directives import (
-    Code,
-    OpenACC,
-    Cxx,
-    process_directives
-)
+from kernel_tuner import tune_kernel, run_kernel
+from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives
 
 N = 4096
 
@@ -45,13 +40,20 @@
 metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9
 metrics["GFLOP/s"] = lambda x: (N**3 * 3) / x["time_s"] / 10**9
 
+# compute reference solution from CPU
+results = run_kernel(
+    "mm", kernel_string["mm"], 0, kernel_args["mm"], {"nthreads": 1}, compiler="nvc++", compiler_options=["-fast"]
+)
+answer = [None, None, results[2]]
+
 tune_kernel(
     "mm",
     kernel_string["mm"],
     0,
     kernel_args["mm"],
     tune_params,
     metrics=metrics,
-    compiler_options=["-fast", "-acc=gpu"],
+    answer=answer,
     compiler="nvc++",
+    compiler_options=["-fast", "-acc=gpu"],
 )
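The new hunk runs the kernel once with run_kernel, which returns the kernel arguments as they are after execution, so results[2] is the C matrix from a single-threaded CPU reference run. During tuning, entries of answer that are None are skipped, so only C is verified. A small sketch of that convention (not Kernel Tuner's actual verifier):

    import numpy as np

    # Sketch of the answer convention: each non-None entry of `answer` is
    # compared against the corresponding kernel output, allclose-style.
    def verify(outputs, answer, atol=1e-6):
        return all(
            expected is None or np.allclose(result, expected, atol=atol)
            for result, expected in zip(outputs, answer)
        )

    C_ref = np.full(4, 2.0, dtype=np.float32)
    print(verify([None, None, C_ref.copy()], [None, None, C_ref]))  # True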
New file

Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+"""This is an example tuning a naive matrix multiplication using the simplified directives interface"""
+
+from kernel_tuner import tune_kernel, run_kernel
+from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives
+
+N = 4096
+
+code = """
+#define N 4096
+
+void matrix_multiply(float *A, float *B, float *C) {
+    #pragma tuner start mm A(float*:NN) B(float*:NN) C(float*:NN)
+    float temp_sum = 0.0f;
+    #pragma omp target
+    #pragma omp teams distribute collapse(2)
+    for ( int i = 0; i < N; i++) {
+        for ( int j = 0; j < N; j++ ) {
+            temp_sum = 0.0f;
+            #pragma omp parallel for num_threads(nthreads) reduction(+:temp_sum)
+            for ( int k = 0; k < N; k++ ) {
+                temp_sum += A[(i * N) + k] * B[(k * N) + j];
+            }
+            C[(i * N) + j] = temp_sum;
+        }
+    }
+    #pragma tuner stop
+}
+"""
+
+# Extract tunable directive
+app = Code(OpenMP(), Cxx())
+dims = {"NN": N**2}
+kernel_string, kernel_args = process_directives(app, code, user_dimensions=dims)
+
+tune_params = dict()
+tune_params["nthreads"] = [32 * i for i in range(1, 33)]
+metrics = dict()
+metrics["time_s"] = lambda x: x["time"] / 10**3
+metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9
+metrics["GFLOP/s"] = lambda x: (N**3 * 3) / x["time_s"] / 10**9
+
+# compute reference solution from CPU
+results = run_kernel(
+    "mm", kernel_string["mm"], 0, kernel_args["mm"], {"nthreads": 1}, compiler="nvc++", compiler_options=["-fast"]
+)
+answer = [None, None, results[2]]
+
+tune_kernel(
+    "mm",
+    kernel_string["mm"],
+    0,
+    kernel_args["mm"],
+    tune_params,
+    metrics=metrics,
+    answer=answer,
+    compiler="nvc++",
+    compiler_options=["-fast", "-mp=gpu"],
+)
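In this new OpenMP version, the #pragma tuner start line sizes each array with the symbol NN, which the C code never defines; user_dimensions={"NN": N**2} supplies it, so A, B, and C are each allocated as N*N float32 values (64 MiB at N = 4096). The metrics then convert the measured time into bandwidth and throughput; a worked check with a hypothetical runtime:

    # Worked check of the metrics above, using a hypothetical runtime.
    N = 4096
    time_s = 2.0  # stand-in for the measured x["time_s"]
    gb_s = ((N**3 * 2 * 4) + (N**2 * 4)) / time_s / 10**9
    gflop_s = (N**3 * 3) / time_s / 10**9
    print(f"{gb_s:.1f} GB/s, {gflop_s:.1f} GFLOP/s")  # ~274.9 GB/s, ~103.1 GFLOP/s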

examples/directives/vector_add_c_openacc.py

Lines changed: 1 addition & 1 deletion

@@ -67,6 +67,6 @@
     tune_params,
     metrics=metrics,
     answer=answer,
-    compiler_options=["-fast", "-acc=gpu"],
     compiler="nvc++",
+    compiler_options=["-fast", "-acc=gpu"],
 )
