Skip to content

Commit 8babcc1

Browse files
adarsh.yogaZzEeKkAa
authored and committed
Adding numba cuda framework and implementations
1 parent 1562421 commit 8babcc1

File tree

12 files changed

+410
-1
lines changed

12 files changed

+410
-1
lines changed
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from math import ceil, erf, exp, log, sqrt
6+
7+
from numba import cuda
8+
9+
10+
@cuda.jit
def _black_scholes_kernel(nopt, price, strike, t, rate, volatility, call, put):
    """Price one option per CUDA thread and store its call and put value.

    Thread ``i`` reads ``price[i]``, ``strike[i]`` and ``t[i]`` and writes
    ``call[i]`` and ``put[i]``. ``rate`` and ``volatility`` are used as
    scalars shared by every option.

    Args:
        nopt: Number of options; threads with an index at or beyond it exit.
        price: Per-option price array (its dtype drives the constants).
        strike: Per-option strike array.
        t: Per-option maturity array.
        rate: Scalar rate.
        volatility: Scalar volatility.
        call: Output array receiving the call value of each option.
        put: Output array receiving the put value of each option.
    """
    i = cuda.grid(1)

    # The grid is rounded up to whole blocks, so the last block may contain
    # threads that have no option to price; they must not touch the arrays.
    if i >= nopt:
        return

    dtype = price.dtype
    mr = -rate
    sig_sig_two = volatility * volatility * dtype.type(2)

    P = price[i]
    S = strike[i]
    T = t[i]

    a = log(P / S)
    b = T * mr

    z = T * sig_sig_two
    c = dtype.type(0.25) * z
    y = dtype.type(1.0) / sqrt(z)

    w1 = (a - b + c) * y
    w2 = (a - b - c) * y

    # 0.5 + 0.5 * erf(w) is the form in which the two normal CDF terms are
    # evaluated here.
    d1 = dtype.type(0.5) + dtype.type(0.5) * erf(w1)
    d2 = dtype.type(0.5) + dtype.type(0.5) * erf(w2)

    Se = exp(b) * S

    r = P * d1 - Se * d2
    call[i] = r
    # The put value is derived from the call value rather than recomputed.
    put[i] = r - P + Se


def black_scholes(nopt, price, strike, t, rate, volatility, call, put):
    """Launch the Black-Scholes kernel over ``nopt`` options.

    Args:
        nopt: Number of options to price.
        price, strike, t: Per-option input arrays.
        rate, volatility: Scalars forwarded to the kernel.
        call, put: Output arrays filled in place by the kernel.
    """
    # Nothing to price; also avoids requesting a launch with zero blocks.
    if nopt <= 0:
        return

    nthreads = 256
    # True division before ceil: the previous floor division dropped the
    # final partial block and produced zero blocks for nopt < nthreads.
    nblocks = ceil(nopt / nthreads)

    _black_scholes_kernel[nblocks, nthreads](
        nopt, price, strike, t, rate, volatility, call, put
    )
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from math import ceil
6+
7+
import cupy as cp
8+
from numba import cuda
9+
10+
11+
@cuda.jit
def count_weighted_pairs_3d_diff_ker(
    n, nbins, x1, y1, z1, w1, x2, y2, z2, w2, rbins_squared, result
):
    """Accumulate a weighted pair histogram for one point of the first set.

    Thread ``i`` takes point ``i`` of ``(x1, y1, z1, w1)``, compares it with
    the first ``n`` points of ``(x2, y2, z2, w2)`` and adds the weight
    product of each pair to row ``i`` of ``result``. The row is then turned
    into a running (cumulative) sum over the bins.

    Args:
        n: Number of points iterated in the second set; also used as the
            number of valid thread indices.
        nbins: Number of entries of ``rbins_squared`` / columns of ``result``
            that are used.
        x1, y1, z1, w1: Coordinates and weights of the first point set.
        x2, y2, z2, w2: Coordinates and weights of the second point set.
        rbins_squared: Bin edges compared against squared distances
            (presumably ascending - confirm against the caller).
        result: 2-D output; row ``i`` belongs to thread ``i`` only.
    """
    i = cuda.grid(1)

    # The launcher rounds the grid up to whole blocks; surplus threads have
    # no point and no result row of their own.
    if i >= n:
        return

    px = x1[i]
    py = y1[i]
    pz = z1[i]
    pw = w1[i]
    for j in range(n):
        qx = x2[j]
        qy = y2[j]
        qz = z2[j]
        qw = w2[j]
        dx = px - qx
        dy = py - qy
        dz = pz - qz
        wprod = pw * qw
        dsq = dx * dx + dy * dy + dz * dz

        # Pairs farther apart than the last edge are not counted at all.
        if dsq <= rbins_squared[nbins - 1]:
            # Scan edges from the largest down; the pair goes into the bin
            # just above the first edge it exceeds, or bin 0 if none.
            for k in range(nbins - 1, -1, -1):
                if dsq > rbins_squared[k]:
                    result[i, k + 1] += wprod
                    break
                if k == 0:
                    result[i, k] += wprod
                    break

    # Convert the per-bin sums of this row into cumulative sums.
    for j in range(nbins - 2, -1, -1):
        for k in range(j + 1, nbins, 1):
            result[i, k] += result[i, j]
44+
45+
46+
@cuda.jit
def count_weighted_pairs_3d_diff_agg_ker(nbins, result, n):
    """Sum rows ``1..n-1`` of ``result`` into row 0, one column per thread.

    Args:
        nbins: Number of columns to aggregate; threads beyond it exit.
        result: 2-D array whose row 0 receives the column totals in place.
        n: Number of rows to fold together.
    """
    col_id = cuda.grid(1)

    # The grid may hold more threads than there are columns when nbins is
    # not a multiple of the block size.
    if col_id >= nbins:
        return

    for i in range(1, n):
        result[0, col_id] += result[i, col_id]
52+
53+
54+
def gpairs(nopt, nbins, x1, y1, z1, w1, x2, y2, z2, w2, rbins, results):
    """Run the pair-counting kernel, reduce its rows, and fill ``results``.

    A scratch array with one row per work item is allocated, the compute
    kernel fills it, the aggregation kernel folds every row into row 0, and
    row 0 is finally copied into ``results``.

    Args:
        nopt: Number of work items (rows of the scratch array).
        nbins: Number of bins handed to both kernels.
        x1, y1, z1, w1: Coordinates and weights of the first point set.
        x2, y2, z2, w2: Coordinates and weights of the second point set.
        rbins: Bin edges, forwarded as the kernel's ``rbins_squared``
            argument (presumably already squared - confirm with the caller).
        results: Output vector overwritten with the aggregated histogram.
    """
    block_size = 256

    # Private result row for every work item, allocated like ``results``.
    per_item_hist = cp.zeros_like(results, shape=(nopt, rbins.shape[0]))

    # Stage 1: each work item fills its own row.
    compute_grid = ceil(nopt / block_size)
    count_weighted_pairs_3d_diff_ker[compute_grid, block_size](
        nopt, nbins, x1, y1, z1, w1, x2, y2, z2, w2, rbins, per_item_hist
    )

    # Stage 2: one thread per bin folds all rows into row 0.
    agg_threads = min(nbins, 256)
    agg_grid = ceil(nbins / 256)
    count_weighted_pairs_3d_diff_agg_ker[agg_grid, agg_threads](
        nbins, per_item_hist, nopt
    )

    # Stage 3: publish the aggregated row.
    results[:] = per_item_hist[0]
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from math import ceil, sqrt
6+
7+
import cupy as cp
8+
from numba import cuda
9+
10+
11+
@cuda.jit
def _knn_kernel(  # noqa: C901: TODO: can we simplify logic?
    train,
    train_labels,
    test,
    k,
    classes_num,
    train_size,
    predictions,
    votes_to_classes_lst,
    data_dim,
):
    """Classify one test sample per thread by majority vote of k neighbours.

    Thread ``i`` keeps the ``k`` closest training samples to ``test[i]`` in
    a small sorted (distance, label) queue, counts the labels of those
    neighbours in ``votes_to_classes_lst[i]`` and writes the winning class
    index to ``predictions[i]``.

    Args:
        train: Training samples, indexed ``train[j][jj]``.
        train_labels: Label of each training sample.
        test: Test samples, indexed ``test[i][jj]``.
        k: Number of neighbours; must not exceed 5, the fixed queue capacity.
        classes_num: Number of classes scanned when picking the winner.
        train_size: Number of training samples to consider.
        predictions: Output array, one predicted class index per test sample.
        votes_to_classes_lst: Per-sample vote counters, incremented in place
            (presumably zero-initialised by the caller - confirm).
        data_dim: Number of features compared per sample.
    """
    dtype = train.dtype

    i = cuda.grid(1)

    # The grid is rounded up to whole blocks; surplus threads have no test
    # sample. Assumes ``test`` has exactly one row per sample to classify.
    if i >= test.shape[0]:
        return

    # here k has to be 5 in order to match with numpy
    queue_neighbors = cuda.local.array(shape=(5, 2), dtype=dtype)

    # Seed the queue with the first k training samples.
    for j in range(k):
        x1 = train[j]
        x2 = test[i]

        distance = dtype.type(0.0)
        for jj in range(data_dim):
            diff = x1[jj] - x2[jj]
            distance += diff * diff
        dist = sqrt(distance)

        queue_neighbors[j, 0] = dist
        queue_neighbors[j, 1] = train_labels[j]

    # Insertion-sort the seeded queue by ascending distance.
    for j in range(k):
        new_distance = queue_neighbors[j, 0]
        new_neighbor_label = queue_neighbors[j, 1]
        index = j

        while index > 0 and new_distance < queue_neighbors[index - 1, 0]:
            queue_neighbors[index, 0] = queue_neighbors[index - 1, 0]
            queue_neighbors[index, 1] = queue_neighbors[index - 1, 1]

            index = index - 1

        queue_neighbors[index, 0] = new_distance
        queue_neighbors[index, 1] = new_neighbor_label

    # Stream the remaining training samples through the queue.
    for j in range(k, train_size):
        x1 = train[j]
        x2 = test[i]

        distance = dtype.type(0.0)
        for jj in range(data_dim):
            diff = x1[jj] - x2[jj]
            distance += diff * diff
        dist = sqrt(distance)

        # Only a sample closer than the current worst neighbour enters; it
        # replaces the last slot and is then shifted to its sorted position.
        if dist < queue_neighbors[k - 1, 0]:
            queue_neighbors[k - 1, 0] = dist
            queue_neighbors[k - 1, 1] = train_labels[j]
            new_distance = queue_neighbors[k - 1, 0]
            new_neighbor_label = queue_neighbors[k - 1, 1]
            index = k - 1

            while index > 0 and new_distance < queue_neighbors[index - 1, 0]:
                queue_neighbors[index, 0] = queue_neighbors[index - 1, 0]
                queue_neighbors[index, 1] = queue_neighbors[index - 1, 1]

                index = index - 1

            queue_neighbors[index, 0] = new_distance
            queue_neighbors[index, 1] = new_neighbor_label

    votes_to_classes = votes_to_classes_lst[i]

    # Count only the k filled slots; the queue's fixed capacity of 5 would
    # include uninitialised entries whenever k < 5.
    for j in range(k):
        votes_to_classes[int(queue_neighbors[j, 1])] += 1

    max_ind = 0
    max_value = dtype.type(0)

    # Strict '>' keeps the lowest class index on ties.
    for j in range(classes_num):
        if votes_to_classes[j] > max_value:
            max_value = votes_to_classes[j]
            max_ind = j

    predictions[i] = max_ind


def knn(
    x_train,
    y_train,
    x_test,
    k,
    classes_num,
    test_size,
    train_size,
    predictions,
    votes_to_classes,
    data_dim,
):
    """Launch the k-nearest-neighbours kernel with one thread per test sample.

    Args:
        x_train: Training samples.
        y_train: Training labels.
        x_test: Test samples.
        k: Number of neighbours (the kernel supports at most 5).
        classes_num: Number of classes.
        test_size: Number of test samples; determines the grid size.
        train_size: Number of training samples.
        predictions: Output array filled in place by the kernel.
        votes_to_classes: Per-sample vote counters updated by the kernel.
        data_dim: Number of features per sample.
    """
    # Nothing to classify; also avoids requesting a launch with zero blocks.
    if test_size <= 0:
        return

    nthreads = 256
    # True division before ceil: the previous floor division dropped the
    # final partial block and produced zero blocks for small test sets.
    nblocks = ceil(test_size / nthreads)

    _knn_kernel[nblocks, nthreads](
        x_train,
        y_train,
        x_test,
        k,
        classes_num,
        train_size,
        predictions,
        votes_to_classes,
        data_dim,
    )
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from math import ceil, sqrt
6+
7+
from numba import cuda
8+
9+
10+
@cuda.jit
def l2_norm_kernel(a, d):
    """Write the Euclidean norm of row ``i`` of ``a`` into ``d[i]``.

    Args:
        a: 2-D input array; one row per thread.
        d: 1-D output array receiving one norm per row of ``a``.
    """
    i = cuda.grid(1)

    # The grid is rounded up to whole blocks; surplus threads have no row.
    if i >= a.shape[0]:
        return

    n_cols = a.shape[1]
    # The sum of squares is accumulated directly in d[i], as before, so the
    # accumulation precision stays that of the output array.
    d[i] = 0.0
    for k in range(n_cols):
        d[i] += a[i, k] * a[i, k]
    d[i] = sqrt(d[i])


def l2_norm(a, d):
    """Launch the row-wise L2-norm kernel over every row of ``a``.

    Args:
        a: 2-D input array.
        d: Output array filled in place with one norm per row.
    """
    n_rows = a.shape[0]
    # Nothing to compute; also avoids requesting a launch with zero blocks.
    if n_rows <= 0:
        return

    nthreads = 256
    # True division before ceil: the previous floor division dropped the
    # final partial block and produced zero blocks for fewer than 256 rows.
    nblocks = ceil(n_rows / nthreads)

    l2_norm_kernel[nblocks, nthreads](a, d)
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from math import ceil, sqrt
6+
7+
from numba import cuda
8+
9+
10+
@cuda.jit
def _pairwise_distance_kernel(X1, X2, D):
    """Store the Euclidean distance between ``X1[i]`` and ``X2[j]`` in ``D[i, j]``.

    Each thread of the 2-D grid handles one ``(i, j)`` pair.

    Args:
        X1: 2-D array of points, one per row.
        X2: 2-D array of points, one per row; its column count bounds the
            feature loop.
        D: 2-D output array written at ``[i, j]``.
    """
    i, j = cuda.grid(2)

    # Both grid dimensions are rounded up to whole blocks by the launcher,
    # so edge blocks contain threads without a valid (i, j) pair.
    if i >= X1.shape[0] or j >= X2.shape[0]:
        return

    X2_cols = X2.shape[1]

    d = X1.dtype.type(0.0)
    for k in range(X2_cols):
        tmp = X1[i, k] - X2[j, k]
        d += tmp * tmp
    D[i, j] = sqrt(d)
21+
22+
23+
def pairwise_distance(X1, X2, D):
    """Launch the pairwise-distance kernel on a 2-D grid of 16x16 blocks.

    The grid has enough blocks along x to cover the rows of ``X1`` and
    enough along y to cover the rows of ``X2``.

    Args:
        X1: 2-D array of points, one per row.
        X2: 2-D array of points, one per row.
        D: Output array filled in place by the kernel.
    """
    tile = (16, 16)
    grid = (
        ceil(X1.shape[0] / tile[0]),
        ceil(X2.shape[0] / tile[1]),
    )

    _pairwise_distance_kernel[grid, tile](X1, X2, D)
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from math import ceil, cos, log, pi, sin, sqrt
6+
7+
from numba import cuda
8+
9+
10+
@cuda.jit
def _rambo(C1, F1, Q1, nout, output):
    """Fill ``output[i, j, 0:4]`` for every ``j < nout`` of event ``i``.

    For each ``(i, j)`` the kernel maps ``C1[i, j]`` to ``C = 2*C1 - 1``,
    derives ``S = sqrt(1 - C*C)``, an angle ``F = 2*pi*F1`` and a magnitude
    ``Q = -log(Q1)``, then stores ``(Q, Q*S*sin(F), Q*S*cos(F), Q*C)``.

    Args:
        C1, F1, Q1: 2-D input arrays indexed ``[i, j]`` (presumably uniform
            random numbers in (0, 1) - confirm against the caller).
        nout: Number of ``j`` entries processed per event.
        output: 3-D output array written at ``[i, j, 0..3]``.
    """
    dtype = C1.dtype
    i = cuda.grid(1)

    # The grid is rounded up to whole blocks; surplus threads have no event.
    # Assumes the inputs have exactly one row per event.
    if i >= C1.shape[0]:
        return

    for j in range(nout):
        C = dtype.type(2.0) * C1[i, j] - dtype.type(1.0)
        S = sqrt(dtype.type(1) - C * C)
        F = dtype.type(2.0 * pi) * F1[i, j]
        Q = -log(Q1[i, j])

        output[i, j, 0] = Q
        output[i, j, 1] = Q * S * sin(F)
        output[i, j, 2] = Q * S * cos(F)
        output[i, j, 3] = Q * C


def rambo(nevts, nout, C1, F1, Q1, output):
    """Launch the ``_rambo`` kernel with one thread per event.

    Args:
        nevts: Number of events; determines the grid size.
        nout: Number of entries generated per event.
        C1, F1, Q1: Input arrays forwarded to the kernel.
        output: Output array filled in place by the kernel.
    """
    # Nothing to generate; also avoids requesting a launch with zero blocks.
    if nevts <= 0:
        return

    nthreads = 256
    # True division before ceil: the previous floor division dropped the
    # final partial block and produced zero blocks for nevts < nthreads.
    nblocks = ceil(nevts / nthreads)

    _rambo[nblocks, nthreads](
        C1,
        F1,
        Q1,
        nout,
        output,
    )
37+
38+
# _rambo[nevts,](
39+
# C1,
40+
# F1,
41+
# Q1,
42+
# nout,
43+
# output,
44+
# )
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
[framework]
6+
simple_name = "numba_cuda"
7+
full_name = "Numba for CUDA GPUs"
8+
prefix = "nbcd"
9+
class = "NumbaCudaFramework"
10+
arch = "gpu"
11+
12+
[[framework.postfixes]]
13+
impl_postfix = "numba_cuda"
14+
description = "numba cuda jit"

dpbench/infrastructure/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
DpcppFramework,
1919
DpnpFramework,
2020
Framework,
21+
NumbaCudaFramework,
2122
NumbaDpexFramework,
2223
NumbaFramework,
2324
NumbaMlirFramework,
@@ -41,6 +42,7 @@
4142
"NumbaMlirFramework",
4243
"DpnpFramework",
4344
"CupyFramework",
45+
"NumbaCudaFramework",
4446
"DpcppFramework",
4547
"create_connection",
4648
"create_results_table",

dpbench/infrastructure/frameworks/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from .dpnp_framework import DpnpFramework
88
from .fabric import build_framework, build_framework_map
99
from .framework import Framework
10+
from .numba_cuda_framework import NumbaCudaFramework
1011
from .numba_dpex_framework import NumbaDpexFramework
1112
from .numba_framework import NumbaFramework
1213
from .numba_mlir_framework import NumbaMlirFramework
@@ -17,6 +18,7 @@
1718
"NumbaDpexFramework",
1819
"DpnpFramework",
1920
"CupyFramework",
21+
"NumbaCudaFramework",
2022
"DpcppFramework",
2123
"NumbaMlirFramework",
2224
"build_framework",

0 commit comments

Comments
 (0)