Merge pull request #323 from IntelPython/numba_cuda_impls

adarshyoga · web-flow · commit 9729df44d11d · 2024-01-02T11:16:01.000-08:00
Numba cuda implementations of dpbench workloads
diff --git a/dpbench/benchmarks/black_scholes/black_scholes_numba_cuda.py b/dpbench/benchmarks/black_scholes/black_scholes_numba_cuda.py
@@ -0,0 +1,70 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from math import ceil, erf, exp, log, sqrt
+
+from numba import cuda
+
+
+@cuda.jit
+def _black_scholes_kernel(nopt, price, strike, t, rate, volatility, call, put):
+    """Price one option per thread and store its call and put values.
+
+    Thread ``i`` reads ``price[i]``, ``strike[i]`` and ``t[i]`` and writes
+    ``call[i]`` and ``put[i]``. ``rate`` and ``volatility`` are used as
+    scalars shared by every option.
+    """
+    i = cuda.grid(1)
+
+    # The launch grid is rounded up to whole blocks, so the trailing threads
+    # of the last block have no option to price and must not touch memory.
+    if i >= nopt:
+        return
+
+    dtype = price.dtype
+    mr = -rate
+    sig_sig_two = volatility * volatility * dtype.type(2)
+
+    P = price[i]
+    S = strike[i]
+    T = t[i]
+
+    a = log(P / S)
+    b = T * mr
+
+    z = T * sig_sig_two
+    c = dtype.type(0.25) * z
+    y = dtype.type(1.0) / sqrt(z)
+
+    w1 = (a - b + c) * y
+    w2 = (a - b - c) * y
+
+    d1 = dtype.type(0.5) + dtype.type(0.5) * erf(w1)
+    d2 = dtype.type(0.5) + dtype.type(0.5) * erf(w2)
+
+    # Strike scaled by exp(-rate * T).
+    Se = exp(b) * S
+
+    r = P * d1 - Se * d2
+    call[i] = r
+    put[i] = r - P + Se
+
+
+def black_scholes(nopt, price, strike, t, rate, volatility, call, put):
+    """Launch the Black-Scholes kernel over ``nopt`` options.
+
+    Args:
+        nopt: Number of options; one thread is launched per option.
+        price, strike, t: Per-option input arrays indexed by the kernel.
+        rate, volatility: Scalars forwarded to the kernel.
+        call, put: Output arrays the kernel writes in place.
+    """
+    nthreads = 256
+    # True division before ceil: floor division would drop the final partial
+    # block (and yield zero blocks when nopt < nthreads).
+    nblocks = ceil(nopt / nthreads)
+
+    _black_scholes_kernel[nblocks, nthreads](
+        nopt, price, strike, t, rate, volatility, call, put
+    )
diff --git a/dpbench/benchmarks/gpairs/gpairs_numba_cuda.py b/dpbench/benchmarks/gpairs/gpairs_numba_cuda.py
@@ -0,0 +1,101 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from math import ceil
+
+import cupy as cp
+from numba import cuda
+
+
+@cuda.jit
+def count_weighted_pairs_3d_diff_ker(
+    n, nbins, x1, y1, z1, w1, x2, y2, z2, w2, rbins_squared, result
+):
+    """Build one thread-private weighted pair histogram per first-set point.
+
+    Thread ``i`` pairs point ``i`` of the first set with each of the ``n``
+    points of the second set, adds the product of the two weights to the bin
+    selected by the squared distance, and finally turns row ``i`` of
+    ``result`` into running (cumulative) totals over the bins.
+    """
+    i = cuda.grid(1)
+
+    # The grid is rounded up to whole blocks; surplus threads own no point and
+    # no row of ``result``.
+    if i >= n:
+        return
+
+    px = x1[i]
+    py = y1[i]
+    pz = z1[i]
+    pw = w1[i]
+    for j in range(n):
+        qx = x2[j]
+        qy = y2[j]
+        qz = z2[j]
+        qw = w2[j]
+        dx = px - qx
+        dy = py - qy
+        dz = pz - qz
+        wprod = pw * qw
+        dsq = dx * dx + dy * dy + dz * dz
+
+        if dsq <= rbins_squared[nbins - 1]:
+            # Scan from the widest bin down; the pair lands just above the
+            # first edge it exceeds, or in bin 0 if it exceeds none.
+            for k in range(nbins - 1, -1, -1):
+                if dsq > rbins_squared[k]:
+                    result[i, k + 1] += wprod
+                    break
+                if k == 0:
+                    result[i, k] += wprod
+                    break
+
+    # Make the row cumulative: each bin also counts every lower bin.
+    for j in range(nbins - 2, -1, -1):
+        for k in range(j + 1, nbins, 1):
+            result[i, k] += result[i, j]
+
+
+@cuda.jit
+def count_weighted_pairs_3d_diff_agg_ker(nbins, result, n):
+    """Sum rows ``1..n-1`` of ``result`` into row 0, one column per thread."""
+    col_id = cuda.grid(1)
+
+    # Threads past the last bin have no column to reduce.
+    if col_id >= nbins:
+        return
+
+    for i in range(1, n):
+        result[0, col_id] += result[i, col_id]
+
+
+def gpairs(nopt, nbins, x1, y1, z1, w1, x2, y2, z2, w2, rbins, results):
+    """Compute the weighted pair histogram and store it in ``results``.
+
+    Runs the per-point kernel into a scratch ``(nopt, rbins.shape[0])``
+    array, reduces the rows on the device, then copies row 0 to ``results``.
+    ``rbins`` is handed to the kernel as its squared bin edges.
+    """
+    # allocate per-work item private result vector in device global memory
+    results_disjoint = cp.zeros_like(results, shape=(nopt, rbins.shape[0]))
+
+    nthreads = 256
+    nblocks = ceil(nopt / nthreads)
+
+    # call gpairs compute kernel
+    count_weighted_pairs_3d_diff_ker[nblocks, nthreads](
+        nopt, nbins, x1, y1, z1, w1, x2, y2, z2, w2, rbins, results_disjoint
+    )
+
+    nthreads = nbins if nbins < 256 else 256
+    nblocks = ceil(nbins / 256)
+
+    # aggregate the results from the compute kernel
+    count_weighted_pairs_3d_diff_agg_ker[nblocks, nthreads](
+        nbins, results_disjoint, nopt
+    )
+
+    # copy to results vector
+    results[:] = results_disjoint[0]
diff --git a/dpbench/benchmarks/knn/knn_numba_cuda.py b/dpbench/benchmarks/knn/knn_numba_cuda.py
@@ -0,0 +1,158 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from math import ceil, sqrt
+
+import cupy as cp
+from numba import cuda
+
+# Capacity of the per-thread neighbour queue. CUDA local arrays need a
+# compile-time shape, so ``k`` can be at most this value.
+_MAX_K = 5
+
+
+@cuda.jit
+def _knn_kernel(  # noqa: C901: TODO: can we simplify logic?
+    train,
+    train_labels,
+    test,
+    k,
+    classes_num,
+    train_size,
+    predictions,
+    votes_to_classes_lst,
+    data_dim,
+):
+    """Classify one test sample per thread by a k-nearest-neighbour vote.
+
+    Thread ``i`` keeps the ``k`` closest training rows to ``test[i]`` in a
+    distance-sorted local queue, counts their labels in
+    ``votes_to_classes_lst[i]`` and writes the most voted class index to
+    ``predictions[i]``.
+    """
+    i = cuda.grid(1)
+
+    # The grid is rounded up to whole blocks; surplus threads have no sample.
+    # NOTE(review): assumes ``test`` has exactly ``test_size`` rows - confirm.
+    if i >= test.shape[0]:
+        return
+
+    dtype = train.dtype
+
+    # Column 0 holds the distance, column 1 the label of each neighbour.
+    queue_neighbors = cuda.local.array(shape=(_MAX_K, 2), dtype=dtype)
+
+    # Seed the queue with the first k training rows.
+    for j in range(k):
+        x1 = train[j]
+        x2 = test[i]
+
+        distance = dtype.type(0.0)
+        for jj in range(data_dim):
+            diff = x1[jj] - x2[jj]
+            distance += diff * diff
+        dist = sqrt(distance)
+
+        queue_neighbors[j, 0] = dist
+        queue_neighbors[j, 1] = train_labels[j]
+
+    # Insertion-sort the seeded entries by ascending distance.
+    for j in range(k):
+        new_distance = queue_neighbors[j, 0]
+        new_neighbor_label = queue_neighbors[j, 1]
+        index = j
+
+        while index > 0 and new_distance < queue_neighbors[index - 1, 0]:
+            queue_neighbors[index, 0] = queue_neighbors[index - 1, 0]
+            queue_neighbors[index, 1] = queue_neighbors[index - 1, 1]
+
+            index = index - 1
+
+            queue_neighbors[index, 0] = new_distance
+            queue_neighbors[index, 1] = new_neighbor_label
+
+    # Scan the remaining rows; a closer one replaces the current farthest
+    # neighbour and is shifted down to its sorted position.
+    for j in range(k, train_size):
+        x1 = train[j]
+        x2 = test[i]
+
+        distance = dtype.type(0.0)
+        for jj in range(data_dim):
+            diff = x1[jj] - x2[jj]
+            distance += diff * diff
+        dist = sqrt(distance)
+
+        if dist < queue_neighbors[k - 1][0]:
+            queue_neighbors[k - 1][0] = dist
+            queue_neighbors[k - 1][1] = train_labels[j]
+            new_distance = queue_neighbors[k - 1, 0]
+            new_neighbor_label = queue_neighbors[k - 1, 1]
+            index = k - 1
+
+            while index > 0 and new_distance < queue_neighbors[index - 1, 0]:
+                queue_neighbors[index, 0] = queue_neighbors[index - 1, 0]
+                queue_neighbors[index, 1] = queue_neighbors[index - 1, 1]
+
+                index = index - 1
+
+                queue_neighbors[index, 0] = new_distance
+                queue_neighbors[index, 1] = new_neighbor_label
+
+    votes_to_classes = votes_to_classes_lst[i]
+
+    # Only the first k queue slots were ever written.
+    for j in range(k):
+        votes_to_classes[int(queue_neighbors[j, 1])] += 1
+
+    max_ind = 0
+    max_value = dtype.type(0)
+
+    for j in range(classes_num):
+        if votes_to_classes[j] > max_value:
+            max_value = votes_to_classes[j]
+            max_ind = j
+
+    predictions[i] = max_ind
+
+
+def knn(
+    x_train,
+    y_train,
+    x_test,
+    k,
+    classes_num,
+    test_size,
+    train_size,
+    predictions,
+    votes_to_classes,
+    data_dim,
+):
+    """Predict a class for each of the ``test_size`` rows of ``x_test``.
+
+    Results are written in place to ``predictions``; ``votes_to_classes``
+    holds per-sample vote counters that the kernel increments (not reset here).
+
+    Raises:
+        ValueError: If ``k`` is outside ``1.._MAX_K``.
+    """
+    if k < 1 or k > _MAX_K:
+        raise ValueError(f"k must be between 1 and {_MAX_K}, got {k}")
+
+    nthreads = 256
+    # True division before ceil: floor division would drop the final partial
+    # block (and yield zero blocks when test_size < nthreads).
+    nblocks = ceil(test_size / nthreads)
+
+    _knn_kernel[nblocks, nthreads](
+        x_train,
+        y_train,
+        x_test,
+        k,
+        classes_num,
+        train_size,
+        predictions,
+        votes_to_classes,
+        data_dim,
+    )
diff --git a/dpbench/benchmarks/l2_norm/l2_norm_numba_cuda.py b/dpbench/benchmarks/l2_norm/l2_norm_numba_cuda.py
@@ -0,0 +1,33 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from math import ceil, sqrt
+
+from numba import cuda
+
+
+@cuda.jit
+def l2_norm_kernel(a, d):
+    """Write the Euclidean norm of row ``i`` of ``a`` to ``d[i]``."""
+    i = cuda.grid(1)
+
+    # The grid is rounded up to whole blocks; surplus threads own no row.
+    if i >= a.shape[0]:
+        return
+
+    n_cols = a.shape[1]
+    d[i] = 0.0
+    for k in range(n_cols):
+        d[i] += a[i, k] * a[i, k]
+    d[i] = sqrt(d[i])
+
+
+def l2_norm(a, d):
+    """Compute the row-wise L2 norms of 2-D ``a`` into ``d`` on the GPU."""
+    nthreads = 256
+    # True division before ceil: floor division would drop the final partial
+    # block (and yield zero blocks when a has fewer than nthreads rows).
+    nblocks = ceil(a.shape[0] / nthreads)
+
+    l2_norm_kernel[nblocks, nthreads](a, d)
diff --git a/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_cuda.py b/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_cuda.py
@@ -0,0 +1,39 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from math import ceil, sqrt
+
+from numba import cuda
+
+
+@cuda.jit
+def _pairwise_distance_kernel(X1, X2, D):
+    """Fill ``D[i, j]`` with the Euclidean distance of two rows.
+
+    Thread ``(i, j)`` compares row ``i`` of ``X1`` with row ``j`` of ``X2``.
+    """
+    i, j = cuda.grid(2)
+
+    # Both grid dimensions are rounded up to whole 2-D blocks; threads past
+    # either edge have no pair to compare.
+    if i >= X1.shape[0] or j >= X2.shape[0]:
+        return
+
+    X2_cols = X2.shape[1]
+
+    d = X1.dtype.type(0.0)
+    for k in range(X2_cols):
+        tmp = X1[i, k] - X2[j, k]
+        d += tmp * tmp
+    D[i, j] = sqrt(d)
+
+
+def pairwise_distance(X1, X2, D):
+    """Launch one thread per ``(X1 row, X2 row)`` pair to fill ``D``."""
+    threadsperblock = (16, 16)
+    blockspergrid_x = ceil(X1.shape[0] / threadsperblock[0])
+    blockspergrid_y = ceil(X2.shape[0] / threadsperblock[1])
+    blockspergrid = (blockspergrid_x, blockspergrid_y)
+
+    _pairwise_distance_kernel[blockspergrid, threadsperblock](X1, X2, D)
diff --git a/dpbench/benchmarks/rambo/rambo_numba_cuda.py b/dpbench/benchmarks/rambo/rambo_numba_cuda.py
@@ -0,0 +1,55 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from math import ceil, cos, log, pi, sin, sqrt
+
+from numba import cuda
+
+
+@cuda.jit
+def _rambo(C1, F1, Q1, nout, output):
+    """Fill ``output[i, j, 0:4]`` for the ``nout`` entries of event ``i``.
+
+    For each ``j`` the thread maps ``C1[i, j]`` to ``C = 2 * C1 - 1``,
+    ``F1[i, j]`` to the angle ``F = 2 * pi * F1`` and ``Q1[i, j]`` to
+    ``Q = -log(Q1)``, then stores ``Q``, ``Q * S * sin(F)``,
+    ``Q * S * cos(F)`` and ``Q * C`` with ``S = sqrt(1 - C * C)``.
+    """
+    i = cuda.grid(1)
+
+    # The grid is rounded up to whole blocks; surplus threads own no event.
+    if i >= C1.shape[0]:
+        return
+
+    dtype = C1.dtype
+    for j in range(nout):
+        C = dtype.type(2.0) * C1[i, j] - dtype.type(1.0)
+        S = sqrt(dtype.type(1) - C * C)
+        F = dtype.type(2.0 * pi) * F1[i, j]
+        Q = -log(Q1[i, j])
+
+        output[i, j, 0] = Q
+        output[i, j, 1] = Q * S * sin(F)
+        output[i, j, 2] = Q * S * cos(F)
+        output[i, j, 3] = Q * C
+
+
+def rambo(nevts, nout, C1, F1, Q1, output):
+    """Launch ``_rambo`` with one thread per event.
+
+    NOTE(review): the kernel bounds itself by ``C1.shape[0]``; this assumes
+    ``C1`` has ``nevts`` rows - confirm against the caller.
+    """
+    nthreads = 256
+    # True division before ceil: floor division would drop the final partial
+    # block (and yield zero blocks when nevts < nthreads).
+    nblocks = ceil(nevts / nthreads)
+
+    _rambo[nblocks, nthreads](
+        C1,
+        F1,
+        Q1,
+        nout,
+        output,
+    )
diff --git a/dpbench/configs/framework_info/numba_cuda.toml b/dpbench/configs/framework_info/numba_cuda.toml
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+[framework]
+simple_name = "numba_cuda"
+full_name = "Numba for CUDA GPUs"
+prefix = "nbcd"
+class = "NumbaCudaFramework"
+arch = "gpu"
+
+[[framework.postfixes]]
+impl_postfix = "numba_cuda"
+description = "numba cuda jit"
diff --git a/dpbench/infrastructure/__init__.py b/dpbench/infrastructure/__init__.py
@@ -18,6 +18,7 @@
     DpcppFramework,
     DpnpFramework,
     Framework,
+    NumbaCudaFramework,
     NumbaDpexFramework,
     NumbaFramework,
     NumbaMlirFramework,
@@ -41,6 +42,7 @@
     "NumbaMlirFramework",
     "DpnpFramework",
     "CupyFramework",
+    "NumbaCudaFramework",
     "DpcppFramework",
     "create_connection",
     "create_results_table",
diff --git a/dpbench/infrastructure/frameworks/__init__.py b/dpbench/infrastructure/frameworks/__init__.py
@@ -7,6 +7,7 @@
 from .dpnp_framework import DpnpFramework
 from .fabric import build_framework, build_framework_map
 from .framework import Framework
+from .numba_cuda_framework import NumbaCudaFramework
 from .numba_dpex_framework import NumbaDpexFramework
 from .numba_framework import NumbaFramework
 from .numba_mlir_framework import NumbaMlirFramework
@@ -17,6 +18,7 @@
     "NumbaDpexFramework",
     "DpnpFramework",
     "CupyFramework",
+    "NumbaCudaFramework",
     "DpcppFramework",
     "NumbaMlirFramework",
     "build_framework",
diff --git a/dpbench/infrastructure/frameworks/fabric.py b/dpbench/infrastructure/frameworks/fabric.py
diff --git a/dpbench/infrastructure/frameworks/framework.py b/dpbench/infrastructure/frameworks/framework.py
diff --git a/dpbench/infrastructure/frameworks/numba_cuda_framework.py b/dpbench/infrastructure/frameworks/numba_cuda_framework.py