
Commit 8afc3d6

Merge remote-tracking branch 'origin/master' into custom_strategies
2 parents 1c57201 + 948c957

33 files changed: +635 −468 lines

INSTALL.rst

Lines changed: 1 addition & 1 deletion

@@ -125,7 +125,7 @@ Or you could install Kernel Tuner and PyOpenCL together if you haven't done so already
 If this fails, please see the PyOpenCL installation guide (https://wiki.tiker.net/PyOpenCL/Installation)
 
 HIP and HIP Python
--------------
+------------------
 
 Before we can install HIP Python, you'll need to have the HIP runtime and compiler installed on your system.
 The HIP compiler is included as part of the ROCm software stack. Here is AMD's installation guide:
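As this hunk notes, HIP Python sits on top of a working ROCm installation. A minimal pre-flight sketch, assuming AMD's hip-python package (whose documented entry point is "from hip import hip"), to check the bindings are in place before using Kernel Tuner's HIP backend:

    # Minimal sketch: confirm the HIP Python bindings are importable before
    # pointing Kernel Tuner at the HIP backend. Assumes AMD's hip-python
    # package installed on top of the ROCm runtime and compiler.
    try:
        from hip import hip  # noqa: F401
        print("HIP Python bindings found")
    except ImportError:
        print("hip-python missing: install ROCm first, then HIP Python")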

README.md

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@
 
 
 Create optimized GPU applications in any mainstream GPU
-programming language (CUDA, HIP, OpenCL, OpenACC).
+programming language (CUDA, HIP, OpenCL, OpenACC, OpenMP).
 
 What Kernel Tuner does:

doc/source/architecture.png

Binary file changed (10.6 KB)

doc/source/design.rst

Lines changed: 2 additions & 8 deletions

@@ -89,14 +89,8 @@ kernel_tuner.runners.sequential.SimulationRunner
     :members:
 
 
-Device Interfaces
------------------
-
-kernel_tuner.core.DeviceInterface
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: kernel_tuner.core.DeviceInterface
-    :special-members: __init__
-    :members:
+Backends
+--------
 
 kernel_tuner.backends.pycuda.PyCudaFunctions
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

examples/c/vector_add.py

Lines changed: 3 additions & 4 deletions

@@ -26,7 +26,7 @@
 }
 """
 
-size = 72*1024*1024
+size = 72 * 1024 * 1024
 
 a = numpy.random.randn(size).astype(numpy.float32)
 b = numpy.random.randn(size).astype(numpy.float32)
@@ -39,7 +39,6 @@
 tune_params["nthreads"] = [1, 2, 3, 4, 8, 12, 16, 24, 32]
 tune_params["vecsize"] = [1, 2, 4, 8, 16]
 
-answer = [a+b, None, None, None]
+answer = [a + b, None, None, None]
 
-tune_kernel("vector_add", kernel_string, size, args, tune_params,
-            answer=answer, compiler_options=['-O3'])
+tune_kernel("vector_add", kernel_string, size, args, tune_params, answer=answer, compiler_options=["-fopenmp", "-O3"])
New file

Lines changed: 72 additions & 0 deletions

@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+"""This is a simple example for tuning C++ OpenACC code with the kernel tuner"""
+import numpy as np
+
+from kernel_tuner import tune_kernel
+from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives
+
+
+# Naive Python histogram implementation
+def histogram(vector, hist):
+    for i in range(0, len(vector)):
+        hist[vector[i]] += 1
+    return hist
+
+
+code = """
+#include <stdlib.h>
+
+#define HIST_SIZE 256
+#define VECTOR_SIZE 1000000
+
+#pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE)
+#if enable_reduction == 1
+#pragma acc parallel num_gangs(ngangs) vector_length(nthreads) reduction(+:hist[:HIST_SIZE])
+#else
+#pragma acc parallel num_gangs(ngangs) vector_length(nthreads)
+#endif
+#pragma acc loop independent
+for ( int i = 0; i < VECTOR_SIZE; i++ ) {
+    #if enable_atomic == 1
+    #pragma acc atomic update
+    #endif
+    hist[vector[i]] += 1;
+}
+#pragma tuner stop
+"""
+
+# Extract tunable directive
+app = Code(OpenACC(), Cxx())
+kernel_string, kernel_args = process_directives(app, code)
+
+tune_params = dict()
+tune_params["ngangs"] = [2**i for i in range(1, 11)]
+tune_params["nthreads"] = [32 * i for i in range(1, 33)]
+tune_params["enable_reduction"] = [0, 1]
+tune_params["enable_atomic"] = [0, 1]
+constraints = ["enable_reduction != enable_atomic"]
+metrics = dict()
+metrics["GB/s"] = (
+    lambda x: ((2 * 4 * len(kernel_args["histogram"][0])) + (4 * len(kernel_args["histogram"][0])))
+    / (x["time"] / 10**3)
+    / 10**9
+)
+
+kernel_args["histogram"][0] = np.random.randint(0, 256, len(kernel_args["histogram"][0]), dtype=np.int32)
+kernel_args["histogram"][1] = np.zeros(len(kernel_args["histogram"][1])).astype(np.int32)
+reference_hist = np.zeros_like(kernel_args["histogram"][1]).astype(np.int32)
+reference_hist = histogram(kernel_args["histogram"][0], reference_hist)
+answer = [None, reference_hist]
+
+tune_kernel(
+    "histogram",
+    kernel_string["histogram"],
+    0,
+    kernel_args["histogram"],
+    tune_params,
+    restrictions=constraints,
+    metrics=metrics,
+    answer=answer,
+    compiler="nvc++",
+    compiler_options=["-fast", "-acc=gpu"],
+)
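The GB/s metric in this new example budgets three 4-byte accesses per input element (one vector load plus a read-modify-write of a histogram bin) and divides by the runtime, which Kernel Tuner reports in milliseconds. A worked check of that arithmetic, using a hypothetical kernel time:

    # Worked check of the GB/s metric above; time_ms stands in for the
    # hypothetical value Kernel Tuner would report in x["time"].
    n = 1_000_000                        # VECTOR_SIZE int32 elements
    bytes_moved = (2 * 4 * n) + (4 * n)  # 12 bytes accounted per element
    time_ms = 1.5
    print(bytes_moved / (time_ms / 10**3) / 10**9)  # 8.0 GB/s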
New file

Lines changed: 71 additions & 0 deletions

@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+"""This is a simple example for tuning C++ OpenMP code with the kernel tuner"""
+import numpy as np
+
+from kernel_tuner import tune_kernel
+from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives
+
+
+# Naive Python histogram implementation
+def histogram(vector, hist):
+    for i in range(0, len(vector)):
+        hist[vector[i]] += 1
+    return hist
+
+
+code = """
+#include <stdlib.h>
+
+#define HIST_SIZE 256
+#define VECTOR_SIZE 1000000
+
+#pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE)
+#if enable_reduction == 1
+#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads) reduction(+:hist[:HIST_SIZE])
+#else
+#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads)
+#endif
+for ( int i = 0; i < VECTOR_SIZE; i++ ) {
+    #if enable_atomic == 1
+    #pragma omp atomic update
+    #endif
+    hist[vector[i]] += 1;
+}
+#pragma tuner stop
+"""
+
+# Extract tunable directive
+app = Code(OpenMP(), Cxx())
+kernel_string, kernel_args = process_directives(app, code)
+
+tune_params = dict()
+tune_params["nteams"] = [2**i for i in range(1, 11)]
+tune_params["nthreads"] = [32 * i for i in range(1, 33)]
+tune_params["enable_reduction"] = [0, 1]
+tune_params["enable_atomic"] = [0, 1]
+constraints = ["enable_reduction != enable_atomic"]
+metrics = dict()
+metrics["GB/s"] = (
+    lambda x: ((2 * 4 * len(kernel_args["histogram"][0])) + (4 * len(kernel_args["histogram"][0])))
+    / (x["time"] / 10**3)
+    / 10**9
+)
+
+kernel_args["histogram"][0] = np.random.randint(0, 256, len(kernel_args["histogram"][0]), dtype=np.int32)
+kernel_args["histogram"][1] = np.zeros(len(kernel_args["histogram"][1])).astype(np.int32)
+reference_hist = np.zeros_like(kernel_args["histogram"][1]).astype(np.int32)
+reference_hist = histogram(kernel_args["histogram"][0], reference_hist)
+answer = [None, reference_hist]
+
+tune_kernel(
+    "histogram",
+    kernel_string["histogram"],
+    0,
+    kernel_args["histogram"],
+    tune_params,
+    restrictions=constraints,
+    metrics=metrics,
+    answer=answer,
+    compiler="nvc++",
+    compiler_options=["-fast", "-mp=gpu"],
+)
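This OpenMP variant mirrors the OpenACC example, with num_teams/num_threads standing in for num_gangs/vector_length. In both, the restriction "enable_reduction != enable_atomic" ensures every measured configuration uses exactly one update strategy. A conceptual sketch (not Kernel Tuner's internals) of how such a string restriction prunes the search space:

    # Conceptual sketch of restriction pruning, not Kernel Tuner internals:
    # the restriction string is evaluated against every candidate
    # configuration and only the passing ones are benchmarked.
    import itertools

    tune_params = {
        "nteams": [2**i for i in range(1, 11)],
        "nthreads": [32 * i for i in range(1, 33)],
        "enable_reduction": [0, 1],
        "enable_atomic": [0, 1],
    }
    keys = list(tune_params)
    survivors = [
        dict(zip(keys, values))
        for values in itertools.product(*tune_params.values())
        if eval("enable_reduction != enable_atomic", {}, dict(zip(keys, values)))
    ]
    print(len(survivors))  # 640 of the 1280 raw combinations remain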

examples/directives/matrix_multiply_c_openacc.py

Lines changed: 10 additions & 8 deletions

@@ -1,13 +1,8 @@
 #!/usr/bin/env python
 """This is an example tuning a naive matrix multiplication using the simplified directives interface"""
 
-from kernel_tuner import tune_kernel
-from kernel_tuner.utils.directives import (
-    Code,
-    OpenACC,
-    Cxx,
-    process_directives
-)
+from kernel_tuner import tune_kernel, run_kernel
+from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives
 
 N = 4096
 
@@ -45,13 +40,20 @@
 metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9
 metrics["GFLOP/s"] = lambda x: (N**3 * 3) / x["time_s"] / 10**9
 
+# compute reference solution from CPU
+results = run_kernel(
+    "mm", kernel_string["mm"], 0, kernel_args["mm"], {"nthreads": 1}, compiler="nvc++", compiler_options=["-fast"]
+)
+answer = [None, None, results[2]]
+
 tune_kernel(
     "mm",
     kernel_string["mm"],
     0,
     kernel_args["mm"],
     tune_params,
     metrics=metrics,
-    compiler_options=["-fast", "-acc=gpu"],
+    answer=answer,
     compiler="nvc++",
+    compiler_options=["-fast", "-acc=gpu"],
 )
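The new hunk runs the kernel once with run_kernel, which returns the kernel arguments as they are after execution, so results[2] is the C matrix from a single-threaded CPU reference run. During tuning, entries of answer that are None are skipped, so only C is verified. A small sketch of that convention (not Kernel Tuner's actual verifier):

    import numpy as np

    # Sketch of the answer convention: each non-None entry of `answer` is
    # compared against the corresponding kernel output, allclose-style.
    def verify(outputs, answer, atol=1e-6):
        return all(
            expected is None or np.allclose(result, expected, atol=atol)
            for result, expected in zip(outputs, answer)
        )

    C_ref = np.full(4, 2.0, dtype=np.float32)
    print(verify([None, None, C_ref.copy()], [None, None, C_ref]))  # True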
New file

Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+"""This is an example tuning a naive matrix multiplication using the simplified directives interface"""
+
+from kernel_tuner import tune_kernel, run_kernel
+from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives
+
+N = 4096
+
+code = """
+#define N 4096
+
+void matrix_multiply(float *A, float *B, float *C) {
+    #pragma tuner start mm A(float*:NN) B(float*:NN) C(float*:NN)
+    float temp_sum = 0.0f;
+    #pragma omp target
+    #pragma omp teams distribute collapse(2)
+    for ( int i = 0; i < N; i++) {
+        for ( int j = 0; j < N; j++ ) {
+            temp_sum = 0.0f;
+            #pragma omp parallel for num_threads(nthreads) reduction(+:temp_sum)
+            for ( int k = 0; k < N; k++ ) {
+                temp_sum += A[(i * N) + k] * B[(k * N) + j];
+            }
+            C[(i * N) + j] = temp_sum;
+        }
+    }
+    #pragma tuner stop
+}
+"""
+
+# Extract tunable directive
+app = Code(OpenMP(), Cxx())
+dims = {"NN": N**2}
+kernel_string, kernel_args = process_directives(app, code, user_dimensions=dims)
+
+tune_params = dict()
+tune_params["nthreads"] = [32 * i for i in range(1, 33)]
+metrics = dict()
+metrics["time_s"] = lambda x: x["time"] / 10**3
+metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9
+metrics["GFLOP/s"] = lambda x: (N**3 * 3) / x["time_s"] / 10**9
+
+# compute reference solution from CPU
+results = run_kernel(
+    "mm", kernel_string["mm"], 0, kernel_args["mm"], {"nthreads": 1}, compiler="nvc++", compiler_options=["-fast"]
+)
+answer = [None, None, results[2]]
+
+tune_kernel(
+    "mm",
+    kernel_string["mm"],
+    0,
+    kernel_args["mm"],
+    tune_params,
+    metrics=metrics,
+    answer=answer,
+    compiler="nvc++",
+    compiler_options=["-fast", "-mp=gpu"],
+)
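In this new OpenMP version, the #pragma tuner start line sizes each array with the symbol NN, which the C code never defines; user_dimensions={"NN": N**2} supplies it, so A, B, and C are each allocated as N*N float32 values (64 MiB at N = 4096). The metrics then convert the measured time into bandwidth and throughput; a worked check with a hypothetical runtime:

    # Worked check of the metrics above, using a hypothetical runtime.
    N = 4096
    time_s = 2.0  # stand-in for the measured x["time_s"]
    gb_s = ((N**3 * 2 * 4) + (N**2 * 4)) / time_s / 10**9
    gflop_s = (N**3 * 3) / time_s / 10**9
    print(f"{gb_s:.1f} GB/s, {gflop_s:.1f} GFLOP/s")  # ~274.9 GB/s, ~103.1 GFLOP/s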

examples/directives/vector_add_c_openacc.py

Lines changed: 1 addition & 1 deletion

@@ -67,6 +67,6 @@
     tune_params,
     metrics=metrics,
     answer=answer,
-    compiler_options=["-fast", "-acc=gpu"],
     compiler="nvc++",
+    compiler_options=["-fast", "-acc=gpu"],
 )
