Commit 79fe080: Merge with master branch
2 parents: 80a5b62 + 948c957

22 files changed: +588 -429 lines

README.md

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@
 Create optimized GPU applications in any mainstream GPU
-programming language (CUDA, HIP, OpenCL, OpenACC).
+programming language (CUDA, HIP, OpenCL, OpenACC, OpenMP).
 
 What Kernel Tuner does:

examples/c/vector_add.py

Lines changed: 3 additions & 4 deletions

@@ -26,7 +26,7 @@
 }
 """
 
-size = 72*1024*1024
+size = 72 * 1024 * 1024
 
 a = numpy.random.randn(size).astype(numpy.float32)
 b = numpy.random.randn(size).astype(numpy.float32)
@@ -39,7 +39,6 @@
 tune_params["nthreads"] = [1, 2, 3, 4, 8, 12, 16, 24, 32]
 tune_params["vecsize"] = [1, 2, 4, 8, 16]
 
-answer = [a+b, None, None, None]
+answer = [a + b, None, None, None]
 
-tune_kernel("vector_add", kernel_string, size, args, tune_params,
-            answer=answer, compiler_options=['-O3'])
+tune_kernel("vector_add", kernel_string, size, args, tune_params, answer=answer, compiler_options=["-fopenmp", "-O3"])
New file: OpenACC histogram example

Lines changed: 72 additions & 0 deletions

#!/usr/bin/env python
"""This is a simple example for tuning C++ OpenACC code with the kernel tuner"""
import numpy as np

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives


# Naive Python histogram implementation
def histogram(vector, hist):
    for i in range(0, len(vector)):
        hist[vector[i]] += 1
    return hist


code = """
#include <stdlib.h>

#define HIST_SIZE 256
#define VECTOR_SIZE 1000000

#pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE)
#if enable_reduction == 1
#pragma acc parallel num_gangs(ngangs) vector_length(nthreads) reduction(+:hist[:HIST_SIZE])
#else
#pragma acc parallel num_gangs(ngangs) vector_length(nthreads)
#endif
#pragma acc loop independent
for ( int i = 0; i < VECTOR_SIZE; i++ ) {
    #if enable_atomic == 1
    #pragma acc atomic update
    #endif
    hist[vector[i]] += 1;
}
#pragma tuner stop
"""

# Extract tunable directive
app = Code(OpenACC(), Cxx())
kernel_string, kernel_args = process_directives(app, code)

tune_params = dict()
tune_params["ngangs"] = [2**i for i in range(1, 11)]
tune_params["nthreads"] = [32 * i for i in range(1, 33)]
tune_params["enable_reduction"] = [0, 1]
tune_params["enable_atomic"] = [0, 1]
constraints = ["enable_reduction != enable_atomic"]
metrics = dict()
metrics["GB/s"] = (
    lambda x: ((2 * 4 * len(kernel_args["histogram"][0])) + (4 * len(kernel_args["histogram"][0])))
    / (x["time"] / 10**3)
    / 10**9
)

kernel_args["histogram"][0] = np.random.randint(0, 256, len(kernel_args["histogram"][0]), dtype=np.int32)
kernel_args["histogram"][1] = np.zeros(len(kernel_args["histogram"][1])).astype(np.int32)
reference_hist = np.zeros_like(kernel_args["histogram"][1]).astype(np.int32)
reference_hist = histogram(kernel_args["histogram"][0], reference_hist)
answer = [None, reference_hist]

tune_kernel(
    "histogram",
    kernel_string["histogram"],
    0,
    kernel_args["histogram"],
    tune_params,
    restrictions=constraints,
    metrics=metrics,
    answer=answer,
    compiler="nvc++",
    compiler_options=["-fast", "-acc=gpu"],
)
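
Two details of this example deserve a note. The constraint "enable_reduction != enable_atomic" limits the search to configurations where exactly one update strategy is active, excluding both the racy variant with neither and the redundant variant with both. And the GB/s metric appears to charge 12 bytes per input element: a 4-byte read plus a 4-byte write for the histogram update, and a 4-byte read of the vector, divided by the kernel time converted from milliseconds to seconds. A worked instance of that arithmetic with a made-up timing:

    n = 1_000_000                    # VECTOR_SIZE
    bytes_moved = 2 * 4 * n + 4 * n  # 12 MB moved per kernel launch
    time_ms = 0.5                    # hypothetical time reported by Kernel Tuner
    gbs = bytes_moved / (time_ms / 10**3) / 10**9
    print(gbs)                       # 24.0 GB/s for this timing
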
New file: OpenMP histogram example

Lines changed: 71 additions & 0 deletions

#!/usr/bin/env python
"""This is a simple example for tuning C++ OpenMP code with the kernel tuner"""
import numpy as np

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives


# Naive Python histogram implementation
def histogram(vector, hist):
    for i in range(0, len(vector)):
        hist[vector[i]] += 1
    return hist


code = """
#include <stdlib.h>

#define HIST_SIZE 256
#define VECTOR_SIZE 1000000

#pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE)
#if enable_reduction == 1
#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads) reduction(+:hist[:HIST_SIZE])
#else
#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads)
#endif
for ( int i = 0; i < VECTOR_SIZE; i++ ) {
    #if enable_atomic == 1
    #pragma omp atomic update
    #endif
    hist[vector[i]] += 1;
}
#pragma tuner stop
"""

# Extract tunable directive
app = Code(OpenMP(), Cxx())
kernel_string, kernel_args = process_directives(app, code)

tune_params = dict()
tune_params["nteams"] = [2**i for i in range(1, 11)]
tune_params["nthreads"] = [32 * i for i in range(1, 33)]
tune_params["enable_reduction"] = [0, 1]
tune_params["enable_atomic"] = [0, 1]
constraints = ["enable_reduction != enable_atomic"]
metrics = dict()
metrics["GB/s"] = (
    lambda x: ((2 * 4 * len(kernel_args["histogram"][0])) + (4 * len(kernel_args["histogram"][0])))
    / (x["time"] / 10**3)
    / 10**9
)

kernel_args["histogram"][0] = np.random.randint(0, 256, len(kernel_args["histogram"][0]), dtype=np.int32)
kernel_args["histogram"][1] = np.zeros(len(kernel_args["histogram"][1])).astype(np.int32)
reference_hist = np.zeros_like(kernel_args["histogram"][1]).astype(np.int32)
reference_hist = histogram(kernel_args["histogram"][0], reference_hist)
answer = [None, reference_hist]

tune_kernel(
    "histogram",
    kernel_string["histogram"],
    0,
    kernel_args["histogram"],
    tune_params,
    restrictions=constraints,
    metrics=metrics,
    answer=answer,
    compiler="nvc++",
    compiler_options=["-fast", "-mp=gpu"],
)
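
Apart from the directives themselves, this file mirrors the OpenACC histogram above line for line: OpenACC's num_gangs and vector_length become OpenMP's num_teams and num_threads (with the ngangs tunable renamed to nteams), the reduction and atomic variants are switched by the same preprocessor guards, and the GPU offload flag changes from -acc=gpu to -mp=gpu.
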

examples/directives/matrix_multiply_c_openacc.py

Lines changed: 10 additions & 8 deletions

@@ -1,13 +1,8 @@
 #!/usr/bin/env python
 """This is an example tuning a naive matrix multiplication using the simplified directives interface"""
 
-from kernel_tuner import tune_kernel
-from kernel_tuner.utils.directives import (
-    Code,
-    OpenACC,
-    Cxx,
-    process_directives
-)
+from kernel_tuner import tune_kernel, run_kernel
+from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives
 
 N = 4096
 
@@ -45,13 +40,20 @@
 metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9
 metrics["GFLOP/s"] = lambda x: (N**3 * 3) / x["time_s"] / 10**9
 
+# compute reference solution from CPU
+results = run_kernel(
+    "mm", kernel_string["mm"], 0, kernel_args["mm"], {"nthreads": 1}, compiler="nvc++", compiler_options=["-fast"]
+)
+answer = [None, None, results[2]]
+
 tune_kernel(
     "mm",
     kernel_string["mm"],
     0,
     kernel_args["mm"],
     tune_params,
     metrics=metrics,
-    compiler_options=["-fast", "-acc=gpu"],
+    answer=answer,
     compiler="nvc++",
+    compiler_options=["-fast", "-acc=gpu"],
 )
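
The added block obtains the reference result by running the same generated kernel once on the host: compiled with "-fast" but without "-acc=gpu", nvc++ leaves the OpenACC directives inactive, so the run is effectively sequential CPU code. run_kernel returns the kernel arguments after execution, so results[2] is the computed C matrix, and the answer list asks tune_kernel to verify only that argument. A hypothetical NumPy cross-check of the same reference (not part of the commit; assumes the arguments are flat float32 arrays of length N * N):

    import numpy as np

    A = kernel_args["mm"][0].reshape(N, N)
    B = kernel_args["mm"][1].reshape(N, N)
    # Accumulate in float64 to sidestep float32 ordering differences.
    C_ref = (A.astype(np.float64) @ B.astype(np.float64)).ravel()
    print(np.allclose(results[2], C_ref, rtol=1e-3))
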
New file: OpenMP matrix multiplication example

Lines changed: 59 additions & 0 deletions

#!/usr/bin/env python
"""This is an example tuning a naive matrix multiplication using the simplified directives interface"""

from kernel_tuner import tune_kernel, run_kernel
from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives

N = 4096

code = """
#define N 4096

void matrix_multiply(float *A, float *B, float *C) {
    #pragma tuner start mm A(float*:NN) B(float*:NN) C(float*:NN)
    float temp_sum = 0.0f;
    #pragma omp target
    #pragma omp teams distribute collapse(2)
    for ( int i = 0; i < N; i++) {
        for ( int j = 0; j < N; j++ ) {
            temp_sum = 0.0f;
            #pragma omp parallel for num_threads(nthreads) reduction(+:temp_sum)
            for ( int k = 0; k < N; k++ ) {
                temp_sum += A[(i * N) + k] * B[(k * N) + j];
            }
            C[(i * N) + j] = temp_sum;
        }
    }
    #pragma tuner stop
}
"""

# Extract tunable directive
app = Code(OpenMP(), Cxx())
dims = {"NN": N**2}
kernel_string, kernel_args = process_directives(app, code, user_dimensions=dims)

tune_params = dict()
tune_params["nthreads"] = [32 * i for i in range(1, 33)]
metrics = dict()
metrics["time_s"] = lambda x: x["time"] / 10**3
metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9
metrics["GFLOP/s"] = lambda x: (N**3 * 3) / x["time_s"] / 10**9

# compute reference solution from CPU
results = run_kernel(
    "mm", kernel_string["mm"], 0, kernel_args["mm"], {"nthreads": 1}, compiler="nvc++", compiler_options=["-fast"]
)
answer = [None, None, results[2]]

tune_kernel(
    "mm",
    kernel_string["mm"],
    0,
    kernel_args["mm"],
    tune_params,
    metrics=metrics,
    answer=answer,
    compiler="nvc++",
    compiler_options=["-fast", "-mp=gpu"],
)
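
Compared with its OpenACC twin, this version tunes only nthreads (the teams distribute construct carries no num_teams clause here), accumulates each output element in temp_sum through a per-thread reduction over the k loop, and offloads with -mp=gpu instead of -acc=gpu; the CPU reference trick via run_kernel is identical.
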

examples/directives/vector_add_c_openacc.py

Lines changed: 1 addition & 1 deletion

@@ -67,6 +67,6 @@
     tune_params,
     metrics=metrics,
     answer=answer,
-    compiler_options=["-fast", "-acc=gpu"],
     compiler="nvc++",
+    compiler_options=["-fast", "-acc=gpu"],
 )
New file: OpenMP vector add example

Lines changed: 57 additions & 0 deletions

#!/usr/bin/env python
"""This is a simple example for tuning C++ OpenMP code with the kernel tuner"""

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives

code = """
#include <stdlib.h>

#define VECTOR_SIZE 1000000

int main(void) {
    int size = VECTOR_SIZE;
    float * a = (float *) malloc(VECTOR_SIZE * sizeof(float));
    float * b = (float *) malloc(VECTOR_SIZE * sizeof(float));
    float * c = (float *) malloc(VECTOR_SIZE * sizeof(float));

    #pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)
    #pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads)
    for ( int i = 0; i < size; i++ ) {
        c[i] = a[i] + b[i];
    }
    #pragma tuner stop

    free(a);
    free(b);
    free(c);
}
"""

# Extract tunable directive
app = Code(OpenMP(), Cxx())
kernel_string, kernel_args = process_directives(app, code)

tune_params = dict()
tune_params["nteams"] = [2**i for i in range(1, 11)]
tune_params["nthreads"] = [32 * i for i in range(1, 33)]
metrics = dict()
metrics["GB/s"] = (
    lambda x: ((2 * 4 * len(kernel_args["vector_add"][0])) + (4 * len(kernel_args["vector_add"][0])))
    / (x["time"] / 10**3)
    / 10**9
)

answer = [None, None, kernel_args["vector_add"][0] + kernel_args["vector_add"][1], None]

tune_kernel(
    "vector_add",
    kernel_string["vector_add"],
    0,
    kernel_args["vector_add"],
    tune_params,
    metrics=metrics,
    answer=answer,
    compiler="nvc++",
    compiler_options=["-fast", "-mp=gpu"],
)
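
No Python reference implementation is needed in this example: judging from the code, process_directives hands back initialized NumPy arrays for the arguments declared in the tuner start pragma, so the expected output is simply the sum of the first two arguments, checked against the third, while the scalar size argument is left unverified. As with every directive example in this commit, running the script requires the NVIDIA HPC SDK, since the compiler argument names nvc++ (or nvfortran for the Fortran variant).
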

examples/directives/vector_add_fortran_openacc.py

Lines changed: 1 addition & 1 deletion

@@ -62,6 +62,6 @@
     tune_params,
     metrics=metrics,
     answer=answer,
-    compiler_options=["-fast", "-acc=gpu"],
     compiler="nvfortran",
+    compiler_options=["-fast", "-acc=gpu"],
 )

examples/fortran/test_fortran_vector_add.py

Lines changed: 0 additions & 42 deletions
This file was deleted.
