diff --git a/arrayfire/array_object.py b/arrayfire/array_object.py index d44ee29..ea8c5cd 100755 --- a/arrayfire/array_object.py +++ b/arrayfire/array_object.py @@ -746,7 +746,7 @@ def __getitem__(self, key: IndexKey, /) -> Array: ---------- self : Array Array instance. - key : int | slice | tuple[int | slice, ...] | Array + key : int | slice | tuple[int | slice | Array, ...] | Array Index key. Returns @@ -754,19 +754,40 @@ def __getitem__(self, key: IndexKey, /) -> Array: out : Array An array containing the accessed value(s). The returned array must have the same data type as self. """ - # TODO - # API Specification - key: Union[int, slice, ellipsis, tuple[Union[int, slice, ellipsis], ...], array]. - # consider using af.span to replace ellipsis during refactoring out = Array() ndims = self.ndim - if isinstance(key, Array) and key == afbool.c_api_value: + indexing = key + + if isinstance(key, int | float | slice): # when indexing with one dimension, treat it as indexing a flat array + ndims = 1 + elif isinstance(key, Array): # when indexing with one array, treat it as indexing a flat array ndims = 1 - if wrapper.count_all(key.arr) == 0: # HACK was count() method before - return out + if key.is_bool: + indexing = wrapper.where(key.arr) + else: + indexing = key.arr + elif isinstance(key, tuple): + key_list = [] + for elem in key: + if isinstance(elem, Array): + if elem.is_bool: + key_list.append(wrapper.where(elem.arr)) + else: + key_list.append(elem.arr) + else: + key_list.append(elem) + indexing = tuple(key_list) + + out._arr = wrapper.index_gen(self._arr, ndims, wrapper.get_indices(indexing)) # type: ignore[arg-type] + + if isinstance(key, Array) and key.is_bool: + wrapper.release_array(indexing) + elif isinstance(key, tuple): + for i in range(len(key)): + if isinstance(key[i], Array) and key[i].is_bool: + wrapper.release_array(indexing[i]) - # HACK known issue - out._arr = wrapper.index_gen(self._arr, ndims, wrapper.get_indices(key)) # type: ignore[arg-type] return out def __index__(self) -> int: @@ -781,8 +802,19 @@ def __len__(self) -> int: return self.shape[0] if self.shape else 0 def __setitem__(self, key: IndexKey, value: int | float | bool | Array, /) -> None: - ndims = self.ndim + """ + Assigns self[key] = value + + Parameters + ---------- + self : Array + Array instance. + key : int | slice | tuple[int | slice | Array, ...] | Array + Index key. 
+        value : int | float | bool | Array
+            Value(s) to assign.
+        """
+        ndims = self.ndim

         is_array_with_bool = isinstance(key, Array) and type(key) is afbool

         if is_array_with_bool:
@@ -803,12 +835,41 @@ def __setitem__(self, key: IndexKey, value: int | float | bool | Array, /) -> No
             other_arr = value.arr
             del_other = False

-        indices = wrapper.get_indices(key)  # type: ignore[arg-type]  # FIXME
-        out = wrapper.assign_gen(self._arr, other_arr, ndims, indices)
+        indexing = key
+        if isinstance(key, int | float | slice):  # when indexing with one dimension, treat it as indexing a flat array
+            ndims = 1
+        elif isinstance(key, Array):  # when indexing with one array, treat it as indexing a flat array
+            ndims = 1
+            if key.is_bool:
+                indexing = wrapper.where(key.arr)
+            else:
+                indexing = key.arr
+        elif isinstance(key, tuple):
+            key_list = []
+            for elem in key:
+                if isinstance(elem, Array):
+                    if elem.is_bool:
+                        locs = wrapper.where(elem.arr)
+                        key_list.append(locs)
+                    else:
+                        key_list.append(elem.arr)
+                else:
+                    key_list.append(elem)
+            indexing = tuple(key_list)
+
+        out = wrapper.assign_gen(self._arr, other_arr, ndims, wrapper.get_indices(indexing))
+
+        if isinstance(key, Array) and key.is_bool:
+            wrapper.release_array(indexing)
+        elif isinstance(key, tuple):
+            for i in range(len(key)):
+                if isinstance(key[i], Array) and key[i].is_bool:
+                    wrapper.release_array(indexing[i])

         wrapper.release_array(self._arr)
         if del_other:
             wrapper.release_array(other_arr)
+
         self._arr = out

     def __str__(self) -> str:
@@ -1144,7 +1205,10 @@ def _get_processed_index(key: IndexKey, shape: tuple[int, ...]) -> tuple[int, ..
     if isinstance(key, tuple):
         return tuple(_index_to_afindex(key[i], shape[i]) for i in range(len(key)))

-    return (_index_to_afindex(key, shape[0]),) + shape[1:]
+    size = 1
+    for dim_size in shape:
+        size *= dim_size
+    return (_index_to_afindex(key, size),)


 def _index_to_afindex(key: int | float | complex | bool | slice | wrapper.ParallelRange | Array, axis: int) -> int:
@@ -1168,6 +1232,10 @@ def _index_to_afindex(key: int | float | complex | bool | slice | wrapper.Parall


 def _slice_to_length(key: slice, axis: int) -> int:
+    start = key.start
+    stop = key.stop
+    step = key.step
+
     if key.start is None:
         start = 0
     elif key.start < 0:
diff --git a/arrayfire/library/array_functions.py b/arrayfire/library/array_functions.py
index e8028e5..90956b3 100644
--- a/arrayfire/library/array_functions.py
+++ b/arrayfire/library/array_functions.py
@@ -954,6 +954,7 @@ def replace(lhs: Array, rhs: Array | int | float, conditional: Array, /) -> None
     wrapper.replace_scalar(lhs.arr, conditional.arr, rhs)


+@afarray_as_array
 def select(lhs: Array | int | float, rhs: Array | int | float, conditional: Array, /) -> Array:
     """
     Conditionally selects elements from one of two sources (ArrayFire arrays or scalars) based on a condition array.
diff --git a/arrayfire/library/linear_algebra.py b/arrayfire/library/linear_algebra.py
index ee1f09e..67109f2 100644
--- a/arrayfire/library/linear_algebra.py
+++ b/arrayfire/library/linear_algebra.py
@@ -92,6 +92,7 @@ def gemm(
     rhs_opts: MatProp = MatProp.NONE,
     alpha: int | float = 1.0,
     beta: int | float = 0.0,
+    accum: Array | None = None,
 ) -> Array:
     """
     Performs BLAS general matrix multiplication (GEMM) on two Array instances.
@@ -125,6 +126,10 @@
     beta : int | float, optional
         Scalar multiplier for the existing matrix C in the accumulation.
         Default is 0.0.
+    accum : Array, optional
+        A 2-dimensional, real or complex array representing the matrix C in the accumulation.
+        Default is None (no accumulation).
+
     Returns
     -------
     Array
@@ -135,7 +140,10 @@
     - The data types of `lhs` and `rhs` must be compatible.
     - Batch operations are not supported in this version.
     """
-    return cast(Array, wrapper.gemm(lhs.arr, rhs.arr, lhs_opts, rhs_opts, alpha, beta))
+    accumulator = None
+    if isinstance(accum, Array):
+        accumulator = accum.arr
+    return cast(Array, wrapper.gemm(lhs.arr, rhs.arr, lhs_opts, rhs_opts, alpha, beta, accumulator))


 @afarray_as_array
diff --git a/arrayfire/library/mathematical_functions.py b/arrayfire/library/mathematical_functions.py
index bc7b970..0d2e99b 100644
--- a/arrayfire/library/mathematical_functions.py
+++ b/arrayfire/library/mathematical_functions.py
@@ -261,12 +261,11 @@ def atan2(x1: int | float | Array, x2: int | float | Array, /) -> Array:
     return process_c_function(x1, x2, wrapper.atan2)


-@afarray_as_array
 def cplx(x1: int | float | Array, /, x2: int | float | Array | None = None) -> Array:
     if x2 is None:
         if not isinstance(x1, Array):
             raise TypeError("x1 can not be int or tuple when x2 is None.")
-        return cast(Array, wrapper.cplx(x1.arr))
+        return Array.from_afarray(wrapper.cplx(x1.arr))
     else:
         return process_c_function(x1, x2, wrapper.cplx2)
diff --git a/benchmarks/src/README.md b/benchmarks/src/README.md
new file mode 100644
index 0000000..ae84c4e
--- /dev/null
+++ b/benchmarks/src/README.md
@@ -0,0 +1,36 @@
+Benchmarks
+===========
+
+## Setting up environment
+
+```sh
+ python -m pip install -r requirements.txt
+```
+
+## Benchmark parameters
+
+The benchmark packages, rounds, array sizes, and numeric type can be set via the constants at the top of [pytest_benchmark/common.py](pytest_benchmark/common.py); see the example at the end of this README.
+
+Alternatively, they may be overridden individually at the top of each test file.
+
+
+## Running
+
+Follow these steps to run the benchmarks and produce the graphs.
+
+Run the benchmarks and store the results in `results.json`:
+```sh
+ pytest ./pytest_benchmark --benchmark-json=results.json
+```
+
+Once `results.json` exists, create the graphs and store the timing results:
+```sh
+ python graphs.py
+```
+
+- To change which tests are shown, edit the `TESTS` list at the top of `graphs.py`.
+- To change the displayed labels, edit `PKG_LABELS`.
+- To change the hardware string shown in the graph title, edit `HARDWARE`.
+
+## Notes
+When running with `dpnp`, set the environment variable `DPNP_RAISE_EXCEPION_ON_NUMPY_FALLBACK` to 0.
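+
+For example, in a bash-style shell (assumed; use `set` in cmd or `$env:` in PowerShell):
+
+```sh
+ export DPNP_RAISE_EXCEPION_ON_NUMPY_FALLBACK=0
+ pytest ./pytest_benchmark --benchmark-json=results.json
+```
+
+For reference, these are the defaults at the top of [pytest_benchmark/common.py](pytest_benchmark/common.py); lowering `NSIZE` or `ROUNDS` gives a faster but noisier run:
+
+```py
+ROUNDS = 30          # rounds per benchmark
+NSIZE = 2 ** 13      # matrices are NSIZE x NSIZE
+NNSIZE = NSIZE ** 2  # total element count
+DTYPE = "float32"
+```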
\ No newline at end of file diff --git a/benchmarks/src/graphs.py b/benchmarks/src/graphs.py new file mode 100644 index 0000000..cd19775 --- /dev/null +++ b/benchmarks/src/graphs.py @@ -0,0 +1,225 @@ +import matplotlib.pyplot as plt +import numpy as np +import json +import pandas as pd + +BENCHMARKS_JSON = 'results.json' + +# Hardware details shown in title +HARDWARE = "AMD Ryzen 9 9900X 12-Core Processor 63032 MB (fp64 fp16)\noneAPI 2025.1.3 Intel(R) OpenCL Graphics: Intel(R) Arc(TM) B580 Graphics, 11873 MB (fp64 fp16)" + +# Show speedup in graph +SHOW_NUMBERS = True + +# Round to digits after decimal +ROUND_NUMBERS = 1 + +# package list in graph order; arrayfire packages are added later +PKG_NAMES = [ + 'numpy', + 'dpnp', + 'cupy' +] + +# color used in graphs +PKG_COLOR = { + "numpy": "tab:blue", + "cupy": "tab:green", + "dpnp": "tab:red", + "afcpu": "tab:orange", + "afopencl": "tab:orange", + "afcuda": "tab:orange", + "afoneapi": "tab:orange" +} + +# labels displayed in the graph +PKG_LABELS = { + "numpy": "numpy[cpu]", + "dpnp": "dpnp[level_zero:gpu]", + "cupy": "cupy", + "afcpu": "afcpu", + "afcuda": "afcuda", + "afopencl": "afopencl[opencl:gpu]", + "afoneapi": "afoneapi[opencl:gpu]" +} + +AFBACKENDS = [ + 'afcpu', + 'afcuda', + 'afopencl', + 'afoneapi' +] + +# Tests to be shown in graphs +TESTS = [ + 'qr', + 'neural_network', + 'gemm', + 'mandelbrot', + 'nbody', + 'pi', + 'black_scholes', + 'fft', + 'normal', + 'group_elementwise', + + #Other tests + # 'svd + # 'cholesky', + # 'det', + # 'norm', + # 'uniform', + # 'inv' +] + +def get_benchmark_data(): + results = {} + descriptions = {} + with open(BENCHMARKS_JSON) as f: + js = json.load(f) + for bench in js['benchmarks']: + test_name = bench["name"] + test_name = test_name[test_name.find('_') + 1:test_name.find('[')] + + key = bench["param"] + val = bench["stats"]["ops"] + + if len(bench["extra_info"]) != 0 and (not test_name in descriptions): + descriptions[test_name] = bench["extra_info"]["description"] + + if test_name not in results: + results[test_name] = { key : val } + else: + results[test_name][key] = val + + return results, descriptions + +def create_graph(test_name, test_results): + names = [] + values = [] + for name in test_results: + names.append(name) + values.append(test_results[name]) + + bar = plt.bar(names, values) + plt.title(test_name) + + plt.savefig("img/" + test_name + ".png") + plt.close() + +def generate_individual_graphs(): + results, descriptions = get_benchmark_data() + + for test in results: + create_graph(test, results[test]) + +# Stores the timing results in a csv file +def store_csv(): + data_dict = {} + data_dict["Test(seconds)"] = [] + results = {} + for pkg in PKG_LABELS.keys(): + data_dict[pkg] = [] + results[pkg] = {} + + with open(BENCHMARKS_JSON) as f: + js = json.load(f) + for bench in js['benchmarks']: + test_name = bench["name"] + test_name = test_name[test_name.find('_') + 1:test_name.find('[')] + + pkg = bench["param"] + time = bench["stats"]["mean"] + + if not test_name in data_dict["Test(seconds)"]: + data_dict["Test(seconds)"].append(test_name) + + results[pkg][test_name] = time + + for test in data_dict["Test(seconds)"]: + for pkg in PKG_LABELS.keys(): + if test in results[pkg]: + data_dict[pkg].append(results[pkg][test]) + else: + data_dict[pkg].append(np.nan) + + df = pd.DataFrame(data_dict) + df.to_csv("summary.csv") + +def generate_group_graph(test_list = None, show_numbers = False, filename = "comparison"): + results, descriptions = get_benchmark_data() + + width = 1 / (1 + 
len(PKG_NAMES)) + multiplier = 0 + + tests = None + if test_list: + tests = test_list + else: + tests = results.keys() + + tests_values = {} + x = np.arange(len(tests)) + + for name in PKG_NAMES: + tests_values[name] = [] + + max_val = 1 + for test in tests: + for name in PKG_NAMES: + base_value = results[test]["numpy"] + if name in results[test]: + val = results[test][name] / base_value + + if ROUND_NUMBERS: + val = round(val, ROUND_NUMBERS) + + if max_val < val: + max_val = val + + tests_values[name].append(val) + else: + tests_values[name].append(np.nan) + + fig, ax = plt.subplots(layout='constrained') + + for name in PKG_NAMES: + offset = width * multiplier + rects = ax.barh(x + offset, tests_values[name], width, label=PKG_LABELS[name], color=PKG_COLOR[name]) + + if show_numbers: + ax.bar_label(rects, padding=3, rotation=0) + multiplier += 1 + + xlabels = [] + for test in tests: + xlabels.append(test + "\n" + descriptions[test]) + + ax.set_xlabel('Speedup') + ax.set_xscale('log') + ax.set_title(f'Runtime Comparison\n{HARDWARE}') + ax.set_yticks(x + width, xlabels, rotation=0) + xmin, xmax = ax.get_xlim() + ax.set_xlim(xmin, xmax * 2) + + ax.legend(loc='lower right', ncols=len(PKG_NAMES)) + fig.set_figheight(8) + fig.set_figwidth(13) + fig.savefig(f"img/{filename}.png") + plt.show() + +def main(): + store_csv() + for backend in AFBACKENDS: + try: + filename = f"comparison_{backend}" + if not backend in PKG_NAMES: + PKG_NAMES.insert(1, backend) + generate_group_graph(TESTS, SHOW_NUMBERS, filename) + PKG_NAMES.remove(backend) + except Exception as e: + print(e) + print("No data for", backend) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/benchmarks/src/pytest_benchmark/common.py b/benchmarks/src/pytest_benchmark/common.py new file mode 100644 index 0000000..58ee9d5 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/common.py @@ -0,0 +1,103 @@ +# cython: language_level=3 +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2016-2024, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import pytest +import math + +import arrayfire as af +import numpy as np +import dpnp +import dpctl +import cupy +import gc + +# modify parameters for most benchmarks +ROUNDS = 30 +NSIZE = 2 ** 13 +NNSIZE = NSIZE ** 2 +DTYPE = "float32" + +# comment a line to remove that package from testing +PKGDICT = { + "dpnp" : dpnp, + "numpy" : np, + "cupy": cupy, + # "afcpu": af, + "afopencl": af, + "afcuda" : af, + "afoneapi": af +} + +PKGS = [] +IDS = [] + +for key, value in PKGDICT.items(): + IDS.append(key) + PKGS.append(value) + +# Initialize packages and cleanup memory before each round +def initialize_package(PKG_ID): + pkg = PKGDICT[PKG_ID] + + try: + af.device_gc() + mempool = cupy.get_default_memory_pool() + mempool.free_all_blocks() + except: + pass + + if PKG_ID == "afcpu": + af.set_backend(af.BackendType.cpu) + af.device_gc() + af.info() + elif PKG_ID == "afopencl": + af.set_backend(af.BackendType.opencl) + af.device_gc() + af.info() + elif PKG_ID == "afcuda": + af.set_backend(af.BackendType.cuda) + af.device_gc() + af.info() + elif PKG_ID == "afoneapi": + af.set_backend(af.BackendType.oneapi) + af.device_gc() + af.info() + elif PKG_ID == "numpy": + np.random.seed(0) + elif PKG_ID == "dpnp": + dpnp.random.seed(0) + print(dpctl.get_devices()[0]) + elif PKG_ID == "cupy": + cupy.random.seed(0) + print(cupy.cuda.Device()) + mempool = cupy.get_default_memory_pool() + mempool.free_all_blocks() + else: + raise NotImplementedError() + + # Free all unused memory + gc.collect() \ No newline at end of file diff --git a/benchmarks/src/pytest_benchmark/sgemm.cu b/benchmarks/src/pytest_benchmark/sgemm.cu new file mode 100644 index 0000000..9e4791f --- /dev/null +++ b/benchmarks/src/pytest_benchmark/sgemm.cu @@ -0,0 +1,200 @@ +/* +Original works by: +-------------------------------------------------------- +MAGMA +Copyright (c) 2017 The University of Tennessee. All rights reserved. 
+Licensed under modified BSD license +*/ + + +// These parameters will be determined by utils.read_code +//#define DIM_X ${DIM_X} +//#define DIM_Y ${DIM_Y} +//#define BLK_M ${BLK_M} +//#define BLK_N ${BLK_N} +//#define BLK_K ${BLK_K} +//#define DIM_XA ${DIM_XA} +//#define DIM_YA ${DIM_YA} +//#define DIM_XB ${DIM_XB} +//#define DIM_YB ${DIM_YB} +//#define THR_N ${THR_N} +//#define THR_M ${THR_M} + +#define fetch(arr, col, m, n, bound) arr[min(n*col + m, bound)] + + +extern "C" __global__ +void sgemm( + int M, int N, int K, + const float* A, + const float* B, + float * C) +{ + int idx = threadIdx.x; + int idy = threadIdx.y; + + int idt = DIM_X * idy + idx; + + int idxA = idt % DIM_XA; + int idyA = idt / DIM_XA; + + int idxB = idt % DIM_XB; + int idyB = idt / DIM_XB; + + int blx = blockIdx.x; + int bly = blockIdx.y; + + __shared__ float sA[BLK_K][BLK_M + 1]; + __shared__ float sB[BLK_N][BLK_K + 1]; + + // registers for the innermost loop + float rC[THR_N][THR_M]; + float rA[THR_M]; + float rB[THR_N]; + + float ra[BLK_K / DIM_YA][BLK_M / DIM_XA]; + float rb[BLK_N / DIM_YB][BLK_K / DIM_XB]; + + const float* offs_dA = A + blx * BLK_M + idyA * M + idxA; + int boundA = (M * (K - 1) + M) - (blx * BLK_M + idyA * M + idxA) - 1; + const float* offs_dB = B + bly * BLK_N * K + idyB * K + idxB; + int boundB = (K * (N - 1) + K) - (bly * BLK_N * K + idyB * K + idxB) - 1; + + int m, n, k, kk; + + #pragma unroll + for (n = 0; n < THR_N; n++) { + #pragma unroll + for (m = 0 ; m < THR_M; m++) { + rC[n][m] = 0; + } + } + + // blockwise transpose to transpose load + #pragma unroll + for (n = 0; n < BLK_K; n += DIM_YA) { + #pragma unroll + for (m = 0; m < BLK_M; m += DIM_XA) { + sA[n + idyA][m + idxA] = fetch(offs_dA, M, m, n, boundA); + } + } + // blockwise transpose to transpose load + #pragma unroll + for (n = 0; n < BLK_N; n += DIM_YB) { + #pragma unroll + for (m = 0; m < BLK_K; m += DIM_XB) { + sB[n + idyB][m + idxB] = fetch(offs_dB, K, m, n, boundB); + } + } + __syncthreads(); + + for (kk = 0; kk < K - BLK_K; kk += BLK_K) + { + offs_dA += BLK_K * M; + boundA -= BLK_K * M; + offs_dB += BLK_K; + boundB -= BLK_K; + + #pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) { + #pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) { + ra[n][m] = fetch(offs_dA, M, m * DIM_XA, n * DIM_YA, boundA); + } + } + + #pragma unroll + for (n = 0; n < BLK_N / DIM_YB; n++) { + #pragma unroll + for (m = 0; m < BLK_K / DIM_XB; m++) { + rb[n][m] = fetch(offs_dB, K, m * DIM_XB, n * DIM_YB, boundB); + } + } + + // multiply + #pragma unroll + for (k = 0; k < BLK_K; k++) + { + #pragma unroll + for (m = 0; m < THR_M; m++) { + rA[m] = sA[k][m * DIM_X + idx]; + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + rB[n] = sB[n * DIM_Y + idy][k]; + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + #pragma unroll + for (m = 0; m < THR_M; m++) { + rC[n][m] += rA[m] * rB[n]; + } + } + } + __syncthreads(); + + // store A regs->smem + #pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) + { + #pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) + { + sA[n * DIM_YA + idyA][m * DIM_XA + idxA] = ra[n][m]; + } + } + + #pragma unroll + for (n = 0; n < BLK_N / DIM_YB; n++) + { + #pragma unroll + for (m = 0; m < BLK_K / DIM_XB; m++) + { + sB[n * DIM_YB + idyB][m * DIM_XB + idxB] = rb[n][m]; + } + } + __syncthreads(); + } + + // Multiply last full (BLK_K) or partial block of columns of A and + // rows of B. 
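+    // (After the main loop, kk is the starting column index of the final
+    // block, so K - kk below is its width: K % BLK_K, or a full BLK_K when
+    // K divides evenly.)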
+ // It's okay that m,n exceed matrix bounds as all work is in registers + // or shared memory, and out-of-bounds rC[n][m] will not be saved later. + + kk = K - kk; + #pragma unroll + for (k = 0; k < kk; k++) + { + #pragma unroll + for (m = 0; m < THR_M; m++) { + rA[m] = sA[k][m * DIM_X + idx]; + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + rB[n] = sB[n * DIM_Y + idy][k]; + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + #pragma unroll + for (m = 0; m < THR_M; m++) { + rC[n][m] += rA[m] * rB[n]; + } + } + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + int coord_dCn = bly * BLK_N + n * DIM_Y + idy; + #pragma unroll + for (m = 0; m < THR_M; m++) { + int coord_dCm = blx * BLK_M + m * DIM_X + idx; + if (coord_dCm < M && coord_dCn < N) { + C[coord_dCn * M + coord_dCm] = rC[n][m]; + } + } + } +} \ No newline at end of file diff --git a/benchmarks/src/pytest_benchmark/test_blackscholes.py b/benchmarks/src/pytest_benchmark/test_blackscholes.py new file mode 100644 index 0000000..f940ff9 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_blackscholes.py @@ -0,0 +1,146 @@ +#!/usr/bin/python + +####################################################### +# Copyright (c) 2015, ArrayFire +# All rights reserved. +# +# This file is distributed under 3-clause BSD license. +# The complete license agreement can be obtained at: +# http://arrayfire.com/licenses/BSD-3-Clause +######################################################## + +from common import * + +ITERATIONS = 1 + +sqrt2 = math.sqrt(2.0) + +@pytest.mark.parametrize( + "pkgid", IDS, ids=IDS +) +class TestBlackScholes: + def test_black_scholes(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 5), {}) + pkg = PKGDICT[pkgid] + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + result = benchmark.pedantic( + target=FUNCS[pkg.__name__], + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + +def black_scholes_numpy(S, X, R, V, T): + # S = Underlying stock price + # X = Strike Price + # R = Risk free rate of interest + # V = Volatility + # T = Time to maturity + def cnd(x): + temp = (x > 0) + erf = lambda arr : np.exp(-arr * arr) + return temp * (0.5 + erf(x/sqrt2)/2) + (1 - temp) * (0.5 - erf((-x)/sqrt2)/2) + + d1 = np.log(S / X) + d1 = d1 + (R + (V * V) * 0.5) * T + d1 = d1 / (V * np.sqrt(T)) + + d2 = d1 - (V * np.sqrt(T)) + cnd_d1 = cnd(d1) + cnd_d2 = cnd(d2) + + C = S * cnd_d1 - (X * np.exp((-R) * T) * cnd_d2) + P = X * np.exp((-R) * T) * (1 - cnd_d2) - (S * (1 -cnd_d1)) + + return (C, P) + +def black_scholes_dpnp(S, X, R, V, T): + def cnd(x): + temp = (x > 0) + return temp * (0.5 + dpnp.erf(x/sqrt2)/2) + (1 - temp) * (0.5 - dpnp.erf((-x)/sqrt2)/2) + + d1 = dpnp.log(S / X) + d1 = d1 + (R + (V * V) * 0.5) * T + d1 = d1 / (V * dpnp.sqrt(T)) + + d2 = d1 - (V * dpnp.sqrt(T)) + cnd_d1 = cnd(d1) + cnd_d2 = cnd(d2) + + C = S * cnd_d1 - (X * dpnp.exp((-R) * T) * cnd_d2) + P = X * dpnp.exp((-R) * T) * (1 - cnd_d2) - (S * (1 -cnd_d1)) + + return (C, P) + +def black_scholes_cupy(S, X, R, V, T): + def cnd(x): + temp = (x > 0) + erf = lambda arr : cupy.exp(-arr * arr) + return temp * (0.5 + erf(x/sqrt2)/2) + (1 - temp) * (0.5 - erf((-x)/sqrt2)/2) + + d1 = cupy.log(S / X) + d1 = d1 + (R + (V * V) * 0.5) * T + d1 = d1 / (V * cupy.sqrt(T)) + + d2 = d1 - (V * cupy.sqrt(T)) + cnd_d1 = cnd(d1) + cnd_d2 = cnd(d2) + + C = S * cnd_d1 - (X * cupy.exp((-R) * T) * cnd_d2) + P = X * cupy.exp((-R) * T) * (1 - cnd_d2) - (S * (1 -cnd_d1)) + + cupy.cuda.runtime.deviceSynchronize() + + 
return (C, P) + +def black_scholes_arrayfire(S, X, R, V, T): + def cnd(x): + temp = (x > 0) + return temp * (0.5 + af.erf(x/sqrt2)/2) + (1 - temp) * (0.5 - af.erf((-x)/sqrt2)/2) + + d1 = af.log(S / X) + d1 = d1 + (R + (V * V) * 0.5) * T + d1 = d1 / (V * af.sqrt(T)) + + d2 = d1 - (V * af.sqrt(T)) + cnd_d1 = cnd(d1) + cnd_d2 = cnd(d2) + + C = S * cnd_d1 - (X * af.exp((-R) * T) * cnd_d2) + P = X * af.exp((-R) * T) * (1 - cnd_d2) - (S * (1 -cnd_d1)) + + af.eval(C) + af.eval(P) + af.sync() + + return (C, P) + + +def generate_arrays(pkgid, count): + arr_list = [] + pkg = PKGDICT[pkgid] + pkg = pkg.__name__ + if "cupy" == pkg: + for i in range(count): + arr_list.append(cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE)) + cupy.cuda.runtime.deviceSynchronize() + elif "arrayfire" == pkg: + af.device_gc() + for i in range(count): + x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE)) + af.eval(x) + arr_list.append(x) + af.sync() + elif "dpnp" == pkg: + for i in range(count): + arr_list.append(dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE)) + elif "numpy" == pkg: + for i in range(count): + arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE)) + + return arr_list + +FUNCS = { "dpnp" : black_scholes_dpnp, "numpy" : black_scholes_numpy, \ + "cupy" : black_scholes_cupy , "arrayfire" : black_scholes_arrayfire} \ No newline at end of file diff --git a/benchmarks/src/pytest_benchmark/test_elementwise.py b/benchmarks/src/pytest_benchmark/test_elementwise.py new file mode 100644 index 0000000..ad6cbd6 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_elementwise.py @@ -0,0 +1,327 @@ +# cython: language_level=3 +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2016-2024, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from common import * + +ITERATIONS = 1 + +@pytest.mark.parametrize( + "pkgid", IDS, ids=IDS +) +class TestElementwise: + def test_group_elementwise(self, benchmark, pkgid): + initialize_package(pkgid) + + setup = lambda: ([generate_arrays(pkgid, 1)[0] / (NSIZE * NSIZE)], {}) + pkg = PKGDICT[pkgid] + + def func(arr): + return pkg.exp(pkg.cos(pkg.sinh(arr))) +\ + pkg.cbrt(pkg.log(arr) * pkg.expm1(-pkg.sqrt(arr))) + + def func_af(arr): + x = af.exp(af.cos(af.sinh(arr))) +\ + af.cbrt(af.log(arr) * af.expm1(-af.sqrt(arr))) + af.eval(x) + af.sync() + return x + + def func_cupy(arr): + x = pkg.exp(pkg.cos(pkg.sinh(arr))) +\ + pkg.cbrt(pkg.log(arr) * pkg.expm1(-pkg.sqrt(arr))) + cupy.cuda.runtime.deviceSynchronize() + return x + + GROUP_FUNCS = { + "numpy": func, + "cupy": func_cupy, + "arrayfire": func_af, + "dpnp": func + } + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + result = benchmark.pedantic( + target=GROUP_FUNCS[pkg.__name__], + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + ''' + def test_arccos(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.acos, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arccosh(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.acosh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arcsin(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.asin, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arcsinh(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.asinh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arctan(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.atan, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arctanh(self, benchmark, pkg): + setup = lambda: ([(generate_arrays(pkg, 1)[0] - 1) / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.atanh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_cbrt(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.cbrt, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_cos(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.cos, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_cosh(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.cosh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_sin(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.sin, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_sinh(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.sinh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_degrees(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + 
+ result = benchmark.pedantic( + target=pkg.degrees, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_exp(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.exp, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_exp2(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.exp2, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_expm1(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.expm1, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_log(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.log, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_log10(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.log10, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_log1p(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.log1p, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_sqrt(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.sqrt, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_square(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.square, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_tan(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.tan, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_tanh(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.tanh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_reciprocal(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.reciprocal, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + ''' + +def generate_arrays(pkgid, count): + arr_list = [] + pkg = PKGDICT[pkgid] + pkg = pkg.__name__ + if "cupy" == pkg: + for i in range(count): + arr_list.append(cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE)) + cupy.cuda.runtime.deviceSynchronize() + elif "arrayfire" == pkg: + for i in range(count): + x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE)) + af.eval(x) + arr_list.append(x) + af.sync() + elif "dpnp" == pkg: + for i in range(count): + arr_list.append(dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE)) + elif "numpy" == pkg: + for i in range(count): + arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE)) + + return arr_list \ No newline at end of file diff --git a/benchmarks/src/pytest_benchmark/test_fft.py b/benchmarks/src/pytest_benchmark/test_fft.py new file mode 100644 index 0000000..4c563a9 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_fft.py @@ -0,0 +1,91 @@ +# cython: language_level=3 +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2016-2024, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from common import * + +ITERATIONS = 1 + +def generate_arrays(pkgid, count): + arr_list = [] + pkg = PKGDICT[pkgid] + pkg = pkg.__name__ + if "cupy" == pkg: + for i in range(count): + arr_list.append(cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE)) + cupy.cuda.runtime.deviceSynchronize() + elif "arrayfire" == pkg: + for i in range(count): + x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE)) + af.eval(x) + arr_list.append(x) + af.sync() + elif "dpnp" == pkg: + for i in range(count): + arr_list.append(dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE)) + elif "numpy" == pkg: + for i in range(count): + arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE)) + + return arr_list + +@pytest.mark.parametrize( + "pkgid", IDS, ids=IDS +) +class TestFFT: + def test_fft(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + pkg = PKGDICT[pkgid] + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + result = benchmark.pedantic( + target=FUNCS[pkg.__name__], + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + +def fft_af(arr): + res = af.fft(arr) + af.eval(res) + af.sync() + + return res + +def fft_np(arr): + return np.fft.fft(arr) + +def fft_dpnp(arr): + return dpnp.fft.fft(arr) + +def fft_cupy(arr): + res = cupy.fft.fft(arr) + cupy.cuda.runtime.deviceSynchronize() + return res + +FUNCS = { "dpnp" : fft_dpnp , "numpy" : fft_np, \ + "cupy" : fft_cupy , "arrayfire" : fft_af } \ No newline at end of file diff --git a/benchmarks/src/pytest_benchmark/test_gemm.py b/benchmarks/src/pytest_benchmark/test_gemm.py new file mode 100644 index 0000000..3b449fe --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_gemm.py @@ -0,0 +1,116 @@ +from common import * + +import os + +ITERATIONS = 1 + +alpha = 0.4 +beta = 0.6 + +dim_x=16 +dim_y=16 +blk_m=64 +blk_n=64 +blk_k=4 +dim_xa=64 +dim_ya=4 +dim_xb=4 +dim_yb=64 +assert (dim_x * dim_y == dim_xa * dim_ya == dim_xb * dim_yb) +config = {'DIM_X': dim_x, 'DIM_Y': dim_y, + 'BLK_M': blk_m, 'BLK_N': blk_n, 'BLK_K': blk_k, + 'DIM_XA': dim_xa, 'DIM_YA': dim_ya, + 'DIM_XB': dim_xb, 'DIM_YB': dim_yb, + 'THR_M': blk_m // dim_x, 'THR_N': blk_n // dim_y} + 
+def create_cupy_kernel(params):
+    sgemm_file = os.path.join(os.path.dirname(__file__), 'sgemm.cu')
+    code = None
+    with open(sgemm_file, 'r') as f:
+        code = f.read()
+    for k, v in params.items():
+        code = '#define ' + k + ' ' + str(v) + '\n' + code
+
+    return cupy.RawKernel(code, 'sgemm')
+
+kern = create_cupy_kernel(config)
+
+@pytest.mark.parametrize(
+    "pkgid", IDS, ids=IDS
+)
+class TestGemm:
+    def test_gemm(self, benchmark, pkgid):
+        pkg = PKGDICT[pkgid]
+        initialize_package(pkgid)
+
+        setup = lambda: (generate_arrays(pkgid, 3), {})
+
+        benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix"
+        result = benchmark.pedantic(
+            target=FUNCS[pkg.__name__],
+            setup=setup,
+            rounds=ROUNDS,
+            iterations=ITERATIONS
+        )
+
+def generate_arrays(pkgid, count):
+    arr_list = []
+    pkg = PKGDICT[pkgid]
+    pkg = pkg.__name__
+    if "cupy" == pkg:
+        cupy.random.seed(1)
+        for i in range(count):
+            arr_list.append(cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE))
+        cupy.cuda.runtime.deviceSynchronize()
+    elif "arrayfire" == pkg:
+        for i in range(count):
+            x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE))
+            af.eval(x)
+            arr_list.append(x)
+        af.sync()
+    elif "dpnp" == pkg:
+        dpnp.random.seed(1)
+        for i in range(count):
+            arr_list.append(dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE))
+    elif "numpy" == pkg:
+        np.random.seed(1)
+        for i in range(count):
+            arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE))
+
+    return arr_list
+
+def gemm_np(A, B, C):
+    return alpha * np.matmul(A, B) + beta * C
+
+def gemm_af(A, B, C):
+    x = af.gemm(A, B, alpha=alpha, beta=beta, accum=C)
+    af.eval(x)
+    af.sync()
+    return x
+
+def gemm_dpnp(A, B, C):
+    return alpha * dpnp.matmul(A, B) + beta * C
+
+def gemm_cupy(A, B, C):
+    m, k = A.shape
+    k, n = B.shape
+
+    # Input matrices need to be in Fortran order.
+    # A = cupy.asfortranarray(A)
+    # B = cupy.asfortranarray(B)
+    # C = cupy.asfortranarray(C)
+
+    grid = (int(math.ceil(m / blk_m)), int(math.ceil(n / blk_n)), 1)
+    block = (dim_x, dim_y, 1)
+    args = (m, n, k, A, B, C)
+    shared_mem = blk_k * (blk_m + 1) * 4 + blk_n * (blk_k + 1) * 4
+    kern(grid, block, args=args, shared_mem=shared_mem)
+    cupy.cuda.runtime.deviceSynchronize()
+    return C
+
+FUNCS = {
+    "numpy": gemm_np,
+    "cupy": gemm_cupy,
+    "arrayfire": gemm_af,
+    "dpnp": gemm_dpnp
+}
\ No newline at end of file
diff --git a/benchmarks/src/pytest_benchmark/test_kmeans.py b/benchmarks/src/pytest_benchmark/test_kmeans.py
new file mode 100644
index 0000000..e102990
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/test_kmeans.py
@@ -0,0 +1,290 @@
+from common import *
+
+ITERATIONS = 10
+TOLERANCE = 1e-4
+NSAMPLES = NSIZE
+NFEATURES = 256
+K = 20
+
+@pytest.mark.parametrize(
+    "pkgid", IDS, ids=IDS
+)
+class TestKmeans:
+    def test_kmeans(self, benchmark, pkgid):
+        initialize_package(pkgid)
+        pkg = PKGDICT[pkgid]
+        kmean_class = { "dpnp" : kmeans_dpnp , "numpy" : kmeans_numpy, \
+                        "cupy" : kmeans_cupy , "arrayfire" : kmeans_af }
+        obj = kmean_class[pkg.__name__]()
+
+        benchmark.extra_info["description"] = f"{NSAMPLES}x{NFEATURES} over {K} centers"
+        result = benchmark.pedantic(
+            target=obj.kmeans,
+            rounds=ROUNDS,
+            iterations=1
+        )
+
+class kmeans_numpy:
+    def __init__(self):
+        self.data = np.random.random((NSAMPLES, NFEATURES))
+        self.centroid_indices = np.random.choice(self.data.shape[0], K, replace=False)
+
+    def initialize_centroids(self):
+        """
+        Randomly initializes k centroids from the data points.
+
+        Args:
+            data (np.ndarray): The input data points (n_samples, n_features).
+            k (int): The number of clusters.
+ + Returns: + np.ndarray: Initial centroids (k, n_features). + """ + + return self.data[self.centroid_indices, :] + + def assign_to_clusters(self, centroids): + """ + Assigns each data point to the closest centroid. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + centroids (np.ndarray): The current centroids (k, n_features). + + Returns: + np.ndarray: An array of cluster assignments for each data point (n_samples,). + """ + distances = np.sqrt(((self.data[:,np.newaxis,:] - centroids[np.newaxis,:,:])**2).sum(axis=2)) + cluster_assignments = np.argmin(distances, axis=1) + return cluster_assignments + + def update_centroids(self, cluster_assignments): + """ + Recalculates the centroids based on the mean of the assigned data points. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + cluster_assignments (np.ndarray): An array of cluster assignments. + k (int): The number of clusters. + + Returns: + np.ndarray: Updated centroids (k, n_features). + """ + new_centroids = np.zeros((K, self.data.shape[1])) + for i in range(K): + points_in_cluster = self.data[cluster_assignments == i] + if len(points_in_cluster) > 0: + new_centroids[i] = np.mean(points_in_cluster, axis=0) + return new_centroids + + def kmeans(self): + """ + Performs the K-Means clustering algorithm. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + k (int): The number of clusters. + max_iterations (int): Maximum number of iterations to run the algorithm. + tolerance (float): The tolerance for convergence (change in centroids). + + Returns: + tuple: A tuple containing: + - np.ndarray: Final centroids (k, n_features). + - np.ndarray: Final cluster assignments for each data point (n_samples,). + """ + centroids = self.initialize_centroids() + cluster_assignments = None + + for i in range(ITERATIONS): + old_centroids = np.copy(centroids) + + # E-step: Assign points to clusters + cluster_assignments = self.assign_to_clusters(centroids) + + # M-step: Update centroids + centroids = self.update_centroids(cluster_assignments) + + # Check for convergence + if np.linalg.norm(centroids - old_centroids) < TOLERANCE: + break + + return centroids, cluster_assignments + + +class kmeans_dpnp: + def __init__(self): + self.data = dpnp.random.random((NSAMPLES, NFEATURES)) + self.centroid_indices = dpnp.array(np.random.choice(self.data.shape[0], K, replace=False).tolist()) + + def initialize_centroids(self): + return self.data[self.centroid_indices, :] + + def assign_to_clusters(self, centroids): + distances = dpnp.sqrt(((self.data[:,dpnp.newaxis,:] - centroids[dpnp.newaxis,:,:])**2).sum(axis=2)) + cluster_assignments = dpnp.argmin(distances, axis=1) + return cluster_assignments + + def update_centroids(self, cluster_assignments): + new_centroids = dpnp.zeros((K, self.data.shape[1])) + for i in range(K): + points_in_cluster = self.data[cluster_assignments == i] + if len(points_in_cluster) > 0: + new_centroids[i] = dpnp.mean(points_in_cluster, axis=0) + return new_centroids + + def kmeans(self): + centroids = self.initialize_centroids() + cluster_assignments = None + for i in range(ITERATIONS): + old_centroids = dpnp.copy(centroids) + + # E-step: Assign points to clusters + cluster_assignments = self.assign_to_clusters(centroids) + + # M-step: Update centroids + centroids = self.update_centroids(cluster_assignments) + + # Check for convergence + if dpnp.linalg.norm(centroids - old_centroids) < TOLERANCE: + break + + return centroids, cluster_assignments + +class 
kmeans_cupy: + def __init__(self): + self.data = cupy.random.random((NSAMPLES, NFEATURES)) + self.centroid_indices = cupy.random.choice(self.data.shape[0], K, replace=False) + cupy.cuda.runtime.deviceSynchronize() + + def initialize_centroids(self): + return self.data[self.centroid_indices, :] + + def assign_to_clusters(self, centroids): + distances = cupy.sqrt(((self.data[:,cupy.newaxis,:] - centroids[cupy.newaxis,:,:])**2).sum(axis=2)) + cluster_assignments = cupy.argmin(distances, axis=1) + return cluster_assignments + + def update_centroids(self, cluster_assignments): + new_centroids = cupy.zeros((K, self.data.shape[1])) + for i in range(K): + points_in_cluster = self.data[cluster_assignments == i] + if len(points_in_cluster) > 0: + new_centroids[i] = cupy.mean(points_in_cluster, axis=0) + return new_centroids + + def kmeans(self): + centroids = self.initialize_centroids() + cluster_assignments = None + + for i in range(ITERATIONS): + old_centroids = cupy.copy(centroids) + + # E-step: Assign points to clusters + cluster_assignments = self.assign_to_clusters(centroids) + + # M-step: Update centroids + centroids = self.update_centroids(cluster_assignments) + + # Check for convergence + if cupy.linalg.norm(centroids - old_centroids) < TOLERANCE: + break + + cupy.cuda.runtime.deviceSynchronize() + return centroids, cluster_assignments + +class kmeans_af: + def __init__(self): + self.data = af.Array(np.random.random((NSAMPLES, NFEATURES)).flatten().tolist(), shape=(NSAMPLES, NFEATURES)) + self.centroid_indices = af.Array(np.random.choice(self.data.shape[0], K, replace=False).tolist()) + + af.eval(self.data) + af.eval(self.centroid_indices) + af.sync() + + def initialize_centroids(self): + """ + Randomly initializes k centroids from the data points. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + k (int): The number of clusters. + + Returns: + np.ndarray: Initial centroids (k, n_features). + """ + + return self.data[self.centroid_indices, :] + # return af.lookup(self.data, self.centroid_indices, axis=0) + + def assign_to_clusters(self, centroids): + """ + Assigns each data point to the closest centroid. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + centroids (np.ndarray): The current centroids (k, n_features). + + Returns: + np.ndarray: An array of cluster assignments for each data point (n_samples,). + """ + dist = (af.moddims(self.data, (NSAMPLES, 1, NFEATURES)) - af.moddims(centroids, (1, K, NFEATURES)))**2 + distances = af.sqrt(af.sum(dist, axis=2)) + cluster_assignments = af.where(distances == af.tile(af.min(distances, axis=1), (1, K))) + # cluster_assignments = af.range((NSAMPLES, K))[distances == af.min(distances, axis=1)] + + return cluster_assignments + + def update_centroids(self, cluster_assignments): + """ + Recalculates the centroids based on the mean of the assigned data points. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + cluster_assignments (np.ndarray): An array of cluster assignments. + k (int): The number of clusters. + + Returns: + np.ndarray: Updated centroids (k, n_features). + """ + new_centroids = af.constant(0, (K, self.data.shape[1])) + for i in range(K): + points_in_cluster = self.data[:, af.where(cluster_assignments == i)] + if len(points_in_cluster) > 0: + new_centroids[i] = af.mean(points_in_cluster, axis=0) + return new_centroids + + def kmeans(self): + """ + Performs the K-Means clustering algorithm. 
+ + Args: + data (np.ndarray): The input data points (n_samples, n_features). + k (int): The number of clusters. + max_iterations (int): Maximum number of iterations to run the algorithm. + tolerance (float): The tolerance for convergence (change in centroids). + + Returns: + tuple: A tuple containing: + - np.ndarray: Final centroids (k, n_features). + - np.ndarray: Final cluster assignments for each data point (n_samples,). + """ + centroids = self.initialize_centroids() + cluster_assignments = None + + for i in range(ITERATIONS): + old_centroids = af.copy_array(centroids) + + # E-step: Assign points to clusters + cluster_assignments = self.assign_to_clusters(centroids) + + # M-step: Update centroids + centroids = self.update_centroids(cluster_assignments) + + # Check for convergence + if af.norm(centroids - old_centroids) < TOLERANCE: + break + + af.eval(centroids) + af.eval(cluster_assignments) + af.sync() + return centroids, cluster_assignments \ No newline at end of file diff --git a/benchmarks/src/pytest_benchmark/test_linalg.py b/benchmarks/src/pytest_benchmark/test_linalg.py new file mode 100644 index 0000000..5077a96 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_linalg.py @@ -0,0 +1,297 @@ +# cython: language_level=3 +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2016-2024, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from common import * + +ITERATIONS = 1 + +eps = 1e-3 + +def generate_arrays(pkgid, count, posdef = False): + arr_list = [] + pkg = PKGDICT[pkgid] + pkg = pkg.__name__ + if "cupy" == pkg: + for i in range(count): + x = cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE) + if posdef: + x = x @ x.T + x.T @ x + eps + arr_list.append(x) + cupy.cuda.runtime.deviceSynchronize() + elif "arrayfire" == pkg: + for i in range(count): + x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE)) + if posdef: + x = af.matmul(x, x.T) + af.matmul(x.T, x) + eps + af.eval(x) + arr_list.append(x) + af.sync() + elif "dpnp" == pkg: + for i in range(count): + x = dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE) + if posdef: + x = x @ x.T + x.T @ x + eps + arr_list.append(x) + elif "numpy" == pkg: + for i in range(count): + x = np.random.rand(NSIZE, NSIZE).astype(DTYPE) + if posdef: + x = x @ x.T + x.T @ x + eps + arr_list.append(x) + + return arr_list + +def svd_np(arr): + return np.linalg.svd(arr) + +def svd_dpnp(arr): + return dpnp.linalg.svd(arr) + +def svd_af(arr): + x = af.svd(arr) + for r in x: + af.eval(r) + af.sync() + return x + +def svd_cupy(arr): + x = cupy.linalg.svd(arr) + cupy.cuda.runtime.deviceSynchronize() + return x + +def qr_np(arr): + return np.linalg.qr(arr) + +def qr_dpnp(arr): + return dpnp.linalg.qr(arr) + +def qr_af(arr): + x = af.qr(arr) + for r in x: + af.eval(r) + af.sync() + return x + +def qr_cupy(arr): + x = cupy.linalg.qr(arr) + cupy.cuda.runtime.deviceSynchronize() + return x + +def cholesky_np(arr): + return np.linalg.cholesky(arr) + +def cholesky_dpnp(arr): + return dpnp.linalg.cholesky(arr) + +def cholesky_af(arr): + x, info = af.cholesky(arr) + af.eval(x) + af.sync() + return x + +def cholesky_cupy(arr): + x = cupy.linalg.cholesky(arr) + cupy.cuda.runtime.deviceSynchronize() + return x + +def qr_cupy(arr): + x = cupy.linalg.qr(arr) + cupy.cuda.runtime.deviceSynchronize() + return x + +def inv_np(arr): + return np.linalg.inv(arr) + +def inv_dpnp(arr): + return dpnp.linalg.inv(arr) + +def inv_af(arr): + x, info = af.inverse(arr) + af.eval(x) + af.sync() + return x + +def inv_cupy(arr): + x = cupy.linalg.inv(arr) + cupy.cuda.runtime.deviceSynchronize() + return x + +def det_np(arr): + return np.linalg.det(arr) + +def det_dpnp(arr): + return dpnp.linalg.det(arr) + +def det_af(arr): + x = af.det(arr) + af.sync() + return x + +def det_cupy(arr): + x = cupy.linalg.det(arr) + cupy.cuda.runtime.deviceSynchronize() + return x + +def norm_np(arr): + return np.linalg.norm(arr) + +def norm_dpnp(arr): + return dpnp.linalg.norm(arr) + +def norm_af(arr): + x = af.norm(arr) + af.sync() + return x + +def norm_cupy(arr): + x = cupy.linalg.norm(arr) + cupy.cuda.runtime.deviceSynchronize() + return x + +@pytest.mark.parametrize( + "pkgid", IDS, ids=IDS +) +class TestLinalg: + def test_cholesky(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1, True), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + CHOLESKY_FUNCS = { + "numpy": cholesky_np, + "cupy": cholesky_cupy, + "arrayfire": cholesky_af, + "dpnp": cholesky_dpnp + } + result = benchmark.pedantic( + target=CHOLESKY_FUNCS[pkg.__name__], + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_svd(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + 
pkg = PKGDICT[pkgid] + + SVD_FUNCS = { + "numpy": svd_np, + "cupy": svd_cupy, + "arrayfire": svd_af, + "dpnp": svd_dpnp + } + result = benchmark.pedantic( + target=SVD_FUNCS[pkg.__name__], + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_qr(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + QR_FUNCS = { + "numpy": qr_np, + "cupy": qr_cupy, + "arrayfire": qr_af, + "dpnp": qr_dpnp + } + result = benchmark.pedantic( + target=QR_FUNCS[pkg.__name__], + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_inv(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + INV_FUNCS = { + "numpy": inv_np, + "cupy": inv_cupy, + "arrayfire": inv_af, + "dpnp": inv_dpnp + } + result = benchmark.pedantic( + target=INV_FUNCS[pkg.__name__], + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_det(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + DET_FUNCS = { + "numpy": det_np, + "cupy": det_cupy, + "arrayfire": det_af, + "dpnp": det_dpnp + } + result = benchmark.pedantic( + target=DET_FUNCS[pkg.__name__], + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_norm(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + NORM_FUNCS = { + "numpy": norm_np, + "cupy": norm_cupy, + "arrayfire": norm_af, + "dpnp": norm_dpnp + } + result = benchmark.pedantic( + target=NORM_FUNCS[pkg.__name__], + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) \ No newline at end of file diff --git a/benchmarks/src/pytest_benchmark/test_mandelbrot.py b/benchmarks/src/pytest_benchmark/test_mandelbrot.py new file mode 100644 index 0000000..60c730a --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_mandelbrot.py @@ -0,0 +1,187 @@ +# SPDX-FileCopyrightText: 2017 Nicolas P. 
+# SPDX-FileCopyrightText: 2021 ETH Zurich and the NPBench authors
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+# more information at https://github.com/rougier/numpy-book
+
+from common import *
+
+xmin = -2
+xmax = 2
+ymin = -2
+ymax = 2
+xn = NSIZE
+yn = NSIZE
+itermax = 20
+horizon = 2.0
+
+@pytest.mark.parametrize(
+    "pkgid", IDS, ids=IDS
+)
+class TestMandelbrot:
+    def test_mandelbrot(self, benchmark, pkgid):
+        initialize_package(pkgid)
+        pkg = PKGDICT[pkgid]
+
+        benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} grid iterated {itermax}x"
+        result = benchmark.pedantic(
+            target=FUNCS[pkg.__name__],
+            rounds=ROUNDS,
+            iterations=1
+        )
+
+def mandelbrot_np():
+    # Adapted from
+    # https://thesamovar.wordpress.com/2009/03/22/fast-fractals-with-python-and-numpy/
+    Xi, Yi = np.mgrid[0:xn, 0:yn]
+    X = np.linspace(xmin, xmax, xn, dtype=np.float64)[Xi]
+    Y = np.linspace(ymin, ymax, yn, dtype=np.float64)[Yi]
+    C = X + Y * 1j
+
+    N_ = np.zeros(C.shape, dtype=np.int64)
+    Z_ = np.zeros(C.shape, dtype=np.complex128)
+    Xi.shape = Yi.shape = C.shape = xn * yn
+
+    Z = np.zeros(C.shape, np.complex128)
+    for i in range(itermax):
+        if not len(Z):
+            break
+
+        # Compute for relevant points only
+        np.multiply(Z, Z, Z)
+        np.add(Z, C, Z)
+
+        # Failed convergence
+        I = abs(Z) > horizon  # noqa: E741 math variable
+        N_[Xi[I], Yi[I]] = i + 1
+        Z_[Xi[I], Yi[I]] = Z[I]
+
+        # Keep going with those who have not diverged yet
+        np.logical_not(I, I)  # np.negative(I, I) not working any longer
+        Z = Z[I]
+        Xi, Yi = Xi[I], Yi[I]
+        C = C[I]
+    return Z_.T, N_.T
+
+def mandelbrot_dpnp():
+    # Adapted from
+    # https://thesamovar.wordpress.com/2009/03/22/fast-fractals-with-python-and-numpy/
+    Xi, Yi = dpnp.mgrid[0:xn, 0:yn]
+    X = dpnp.linspace(xmin, xmax, xn, dtype=dpnp.float64)[Xi]
+    Y = dpnp.linspace(ymin, ymax, yn, dtype=dpnp.float64)[Yi]
+    C = X + Y * 1j
+    N_ = dpnp.zeros(C.shape, dtype=dpnp.int64)
+    Z_ = dpnp.zeros(C.shape, dtype=dpnp.complex128)
+    # reshape returns a new array, so the results must be assigned back
+    Xi = Xi.reshape(xn * yn)
+    Yi = Yi.reshape(xn * yn)
+    C = C.reshape(xn * yn)
+
+    Z = dpnp.zeros(C.shape, dtype=dpnp.complex128)
+    for i in range(itermax):
+        if not len(Z):
+            break
+
+        # Compute for relevant points only
+        dpnp.multiply(Z, Z, Z)
+        dpnp.add(Z, C, Z)
+
+        # Failed convergence
+        I = abs(Z) > horizon  # noqa: E741 math variable
+        N_[Xi[I], Yi[I]] = i + 1
+        Z_[Xi[I], Yi[I]] = Z[I]
+
+        # Keep going with those who have not diverged yet
+        dpnp.logical_not(I, I)  # dpnp.negative(I, I) not working any longer
+        Z = Z[I]
+        Xi, Yi = Xi[I], Yi[I]
+        C = C[I]
+
+    Z_ = Z_.T
+    N_ = N_.T
+
+    return Z_, N_
+
+def mandelbrot_cupy():
+    # Adapted from
+    # https://thesamovar.wordpress.com/2009/03/22/fast-fractals-with-python-and-numpy/
+    Xi, Yi = cupy.mgrid[0:xn, 0:yn]
+    X = cupy.linspace(xmin, xmax, xn, dtype=cupy.float64)[Xi]
+    Y = cupy.linspace(ymin, ymax, yn, dtype=cupy.float64)[Yi]
+    C = X + Y * 1j
+    N_ = cupy.zeros(C.shape, dtype=cupy.int64)
+    Z_ = cupy.zeros(C.shape, dtype=cupy.complex128)
+    Xi.shape = Yi.shape = C.shape = xn * yn
+
+    Z = cupy.zeros(C.shape, cupy.complex128)
+    for i in range(itermax):
+        if not len(Z):
+            break
+
+        # Compute for relevant points only
+        cupy.multiply(Z, Z, Z)
+        cupy.add(Z, C, Z)
+
+        # Failed convergence
+        I = abs(Z) > horizon  # noqa: E741 math variable
+        N_[Xi[I], Yi[I]] = i + 1
+        Z_[Xi[I], Yi[I]] = Z[I]
+
+        # Keep going with those who have not diverged yet
+        cupy.logical_not(I, I)  # cupy.negative(I, I) not working any longer
+        Z = Z[I]
+        Xi, Yi = Xi[I], Yi[I]
+        C = C[I]
+
+    Z_ = Z_.T
+    N_ = N_.T
+
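+    # Block until the GPU has finished so the benchmark measures the actual
+    # kernel work rather than just the asynchronous dispatch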
+    cupy.cuda.runtime.deviceSynchronize()
+    return Z_, N_
+
+def mandelbrot_af():
+    Xi = af.flat(af.range((xn, yn), axis=0, dtype=af.int64))
+    Yi = af.flat(af.range((xn, yn), axis=1, dtype=af.int64))
+    X = af.iota((xn, 1), tile_shape=(1, yn), dtype=af.float64) * (xmax - xmin) / (xn - 1) + xmin
+    Y = af.iota((1, yn), tile_shape=(xn, 1), dtype=af.float64) * (ymax - ymin) / (yn - 1) + ymin
+
+    C = af.cplx(X, Y)
+    N_ = af.constant(0, (xn, yn))
+    Z_ = af.constant(0, (xn, yn), dtype=af.complex64)
+    Z = af.constant(0, (xn, yn), dtype=af.complex64)
+    for i in range(itermax):
+        if not len(Z):
+            break
+
+        # Compute for relevant points only
+        Z = Z * Z
+        Z = Z + C
+
+        # Failed convergence
+        I = af.abs(Z) > horizon  # noqa: E741 math variable
+
+        if not af.any_true(I):
+            break
+
+        N_[Xi[I] * yn + Yi[I]] = i + 1
+        Z_[Xi[I] * yn + Yi[I]] = Z[I]
+
+        # Keep going with those who have not diverged yet
+        I = af.logical_not(I)
+        Z = Z[I]
+        Xi = Xi[I]
+        Yi = Yi[I]
+        C = C[I]
+
+    Z_ = Z_.T
+    N_ = N_.T
+    af.eval(Z_)
+    af.eval(N_)
+    af.sync()
+    return Z_, N_
+
+FUNCS = {
+    "dpnp": mandelbrot_dpnp,
+    "numpy": mandelbrot_np,
+    "cupy": mandelbrot_cupy,
+    "arrayfire": mandelbrot_af
+}
\ No newline at end of file
diff --git a/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py b/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py
new file mode 100644
index 0000000..389d4e4
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py
@@ -0,0 +1,52 @@
+from common import *
+
+ITERATIONS = 1
+
+@pytest.mark.parametrize(
+    "pkgid", IDS, ids=IDS
+)
+class TestPi:
+    def test_pi(self, benchmark, pkgid):
+        initialize_package(pkgid)
+        pkg = PKGDICT[pkgid]
+
+        benchmark.extra_info["description"] = f"{NNSIZE:.2e} Samples"
+        result = benchmark.pedantic(
+            target=FUNCS[pkg.__name__],
+            rounds=ROUNDS,
+            iterations=ITERATIONS,
+            args=[NNSIZE]
+        )
+
+# Defining the predicate at module level is faster than using a lambda inside
+# the benchmarked functions
+def in_circle(x, y):
+    return (x*x + y*y) < 1
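+
+# Monte-Carlo estimate: for (x, y) uniform on [0, 1)^2, the probability of
+# landing inside the quarter unit circle is pi/4, so
+#     pi ~= 4 * count(x^2 + y^2 < 1) / samples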
+
+def calc_pi_af(samples):
+    x = af.randu(samples)
+    y = af.randu(samples)
+    result = 4 * af.sum(in_circle(x, y)) / samples
+
+    af.sync()
+
+    return result
+
+def calc_pi_numpy(samples):
+    x = np.random.rand(samples).astype(np.float32)
+    y = np.random.rand(samples).astype(np.float32)
+    return 4. * np.sum(in_circle(x, y)) / samples
+
+def calc_pi_cupy(samples):
+    x = cupy.random.rand(samples, dtype=np.float32)
+    y = cupy.random.rand(samples, dtype=np.float32)
+    res = 4. * cupy.sum(in_circle(x, y)) / samples
+    cupy.cuda.runtime.deviceSynchronize()
+    return res
+
+def calc_pi_dpnp(samples):
+    x = dpnp.random.rand(samples).astype(dpnp.float32)
+    y = dpnp.random.rand(samples).astype(dpnp.float32)
+    return 4. * dpnp.sum(in_circle(x, y)) / samples
+
+FUNCS = {
+    "dpnp": calc_pi_dpnp,
+    "numpy": calc_pi_numpy,
+    "cupy": calc_pi_cupy,
+    "arrayfire": calc_pi_af
+}
\ No newline at end of file
diff --git a/benchmarks/src/pytest_benchmark/test_nbody.py b/benchmarks/src/pytest_benchmark/test_nbody.py
new file mode 100644
index 0000000..d523ee9
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/test_nbody.py
@@ -0,0 +1,113 @@
+from common import *
+
+ITERATIONS = 1
+G = 1
+dt = 1e-3
+softening = 1e-4
+M = 1
+
+@pytest.mark.parametrize(
+    "pkgid", IDS, ids=IDS
+)
+class TestNbody:
+    def test_nbody(self, benchmark, pkgid):
+        pkg = PKGDICT[pkgid]
+        setup = lambda: (generate_arrays(pkgid), {})
+
+        benchmark.extra_info["description"] = f"{NSIZE:.2e} Bodies"
+        result = benchmark.pedantic(
+            target=nbody,
+            setup=setup,
+            rounds=ROUNDS,
+            iterations=ITERATIONS
+        )
+
+def acceleration(pkg, mass, pos):
+    x = pos[:, 0:1]
+    y = pos[:, 1:2]
+    z = pos[:, 2:3]
+
+    # matrix that stores all pairwise particle separations: r_j - r_i
+    dx = x.T - x
+    dy = y.T - y
+    dz = z.T - z
+
+    # matrix that stores 1/r^3 for all pairwise particle separations
+    inv_r3 = dx**2 + dy**2 + dz**2 + softening**2
+    inv_r3[inv_r3 > 0] = inv_r3[inv_r3 > 0] ** (-1.5)
+
+    ax = G * (dx * inv_r3) @ mass
+    ay = G * (dy * inv_r3) @ mass
+    az = G * (dz * inv_r3) @ mass
+
+    return pkg.hstack((ax, ay, az))
+
+def acceleration_af(mass, pos):
+    x = pos[:, 0:1]
+    y = pos[:, 1:2]
+    z = pos[:, 2:3]
+
+    # matrix that stores all pairwise particle separations: r_j - r_i
+    dx = af.moddims(x, (1, x.shape[0])) - x
+    dy = af.moddims(y, (1, x.shape[0])) - y
+    dz = af.moddims(z, (1, x.shape[0])) - z
+
+    # matrix that stores 1/r^3 for all pairwise particle separations
+    # inv_r3 = dx**2 + dy**2 + dz**2 + softening**2
+    inv_r3 = dx*dx + dy*dy + dz*dz + softening*softening
+    inv_r3 = af.pow(inv_r3, -1.5)
+
+    ax = G * af.matmul((dx * inv_r3), mass)
+    ay = G * af.matmul((dy * inv_r3), mass)
+    az = G * af.matmul((dz * inv_r3), mass)
+
+    return af.join(1, ax, ay, az)
+
+def nbody(pkg, mass, pos, vel):
+    vel -= pkg.mean(mass * vel, axis=0) / pkg.mean(mass)
+
+    acc = None
+    if pkg.__name__ == 'arrayfire':
+        acc = acceleration_af(mass, pos)
+    else:
+        acc = acceleration(pkg, mass, pos)
+
+    # Kick-drift-kick step; note that the acceleration is not recomputed
+    # between the half-kicks, so every package performs the same fixed amount
+    # of work per step
+    for i in range(ITERATIONS):
+        vel += acc * dt / 2.0
+        pos += vel * dt
+        vel += acc * dt / 2.0
+
+    energy = 0.5 * pkg.sum(mass * vel ** 2)
+
+    return float(energy)
+
+def generate_arrays(pkgid):
+    arr_list = []
+    pkg = PKGDICT[pkgid]
+
+    initialize_package(pkgid)
+    pkgname = pkg.__name__
+    count = 2
+    if "cupy" == pkgname:
+        arr_list.append(M * cupy.ones((NSIZE, 1), dtype=DTYPE))
+        for i in range(count):
+            arr_list.append(cupy.random.rand(NSIZE, 3, dtype=DTYPE))
+        cupy.cuda.runtime.deviceSynchronize()
+    elif "arrayfire" == pkgname:
+        af.device_gc()
+        arr_list.append(M * af.constant(1, (NSIZE, 1), dtype=getattr(af, DTYPE)))
+        for i in range(count):
+            arr_list.append(af.randu((NSIZE, 3), dtype=getattr(af, DTYPE)))
+        for arr in arr_list:
+            af.eval(arr)
+        af.sync()
+    elif "dpnp" == pkgname:
+        arr_list.append(M * dpnp.ones((NSIZE, 1), dtype=DTYPE))
+        for i in range(count):
+            arr_list.append(dpnp.random.rand(NSIZE, 3).astype(DTYPE))
+    elif "numpy" == pkgname:
+        arr_list.append(M * np.ones((NSIZE, 1), dtype=DTYPE))
+        for i in range(count):
+            arr_list.append(np.random.rand(NSIZE, 3).astype(DTYPE))
+
+    return (pkg, arr_list[0], arr_list[1], arr_list[2])
\ No newline at end of file
diff --git a/benchmarks/src/pytest_benchmark/test_nn.py b/benchmarks/src/pytest_benchmark/test_nn.py
new file mode 100644
index 0000000..f7df176
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/test_nn.py
@@ -0,0 +1,371 @@
+from common import *
+
+INPUT_SIZE = 28 * 28
+HIDDEN_SIZE = 1000
+OUTPUT_SIZE = 10
+LEARNING_RATE = 0.01
+ITERATIONS = 10
+BATCH_SIZE = 2560
+SAMPLES = 25600
+
+@pytest.mark.parametrize(
+    "pkgid", IDS, ids=IDS
+)
+class TestNeuralNetwork:
+    def test_neural_network(self, benchmark, pkgid):
+        initialize_package(pkgid)
+        pkg = PKGDICT[pkgid]
+        nn = {
+            "dpnp": NeuralNetwork_dpnp,
+            "numpy": NeuralNetwork_numpy,
+            "cupy": NeuralNetwork_cupy,
+            "arrayfire": NeuralNetwork_af
+        }
+
+        obj = nn[pkg.__name__]()
+
+        benchmark.extra_info["description"] = f"{INPUT_SIZE}x{HIDDEN_SIZE}x{OUTPUT_SIZE} trained with {SAMPLES:.2e} samples"
+        result = benchmark.pedantic(
+            target=obj.train,
+            rounds=ROUNDS,
+            iterations=1
+        )
+
+
+class NeuralNetwork_numpy:
+    def __init__(self):
+        self.input_size = INPUT_SIZE
+        self.hidden_size = HIDDEN_SIZE
+        self.output_size = OUTPUT_SIZE
+        self.learning_rate = LEARNING_RATE
+
+        # Initialize weights and biases
+        # He initialization (for ReLU) is often a good choice
+        self.W1 = np.random.randn(self.input_size, self.hidden_size) * np.sqrt(2. / self.input_size)
+        self.b1 = np.zeros((1, self.hidden_size))
+        self.W2 = np.random.randn(self.hidden_size, self.output_size) * np.sqrt(2. / self.hidden_size)
+        self.b2 = np.zeros((1, self.output_size))
+
+        self.X_train = np.random.rand(SAMPLES, INPUT_SIZE)
+        self.y_train = np.zeros((SAMPLES * OUTPUT_SIZE))
+        self.y_train[np.arange(SAMPLES) * OUTPUT_SIZE + np.floor(np.random.rand(SAMPLES) * OUTPUT_SIZE).astype(int)] = 1
+        self.y_train = self.y_train.reshape((SAMPLES, OUTPUT_SIZE))
+
+    def relu(self, x):
+        return np.maximum(0, x)
+
+    def relu_derivative(self, x):
+        return (x > 0).astype(float)
+
+    def softmax(self, x):
+        exp_scores = np.exp(x - np.max(x, axis=1, keepdims=True))  # Subtract max for numerical stability
+        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
+
+    def forward(self, X):
+        # Hidden layer
+        self.z1 = np.dot(X, self.W1) + self.b1
+        self.a1 = self.relu(self.z1)
+
+        # Output layer
+        self.z2 = np.dot(self.a1, self.W2) + self.b2
+        self.a2 = self.softmax(self.z2)
+        return self.a2
+
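+    # Backprop for softmax + cross-entropy: the output-layer error simplifies
+    # to (a2 - y), giving dW2 = a1^T (a2 - y) and
+    # dW1 = X^T ((a2 - y) W2^T * relu'(z1)), with bias gradients summed over
+    # the batch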
+    def backward(self, X, y, output):
+        # Calculate gradients for the output layer
+        error_output = output - y  # Derivative of cross-entropy loss w.r.t. softmax input
+        dW2 = np.dot(self.a1.T, error_output)
+        db2 = np.sum(error_output, axis=0, keepdims=True)
+
+        # Calculate gradients for the hidden layer
+        error_hidden = np.dot(error_output, self.W2.T) * self.relu_derivative(self.z1)
+        dW1 = np.dot(X.T, error_hidden)
+        db1 = np.sum(error_hidden, axis=0, keepdims=True)
+
+        # Update weights and biases
+        self.W2 -= self.learning_rate * dW2
+        self.b2 -= self.learning_rate * db2
+        self.W1 -= self.learning_rate * dW1
+        self.b1 -= self.learning_rate * db1
+
+    def train(self):
+        X_train = self.X_train
+        y_train = self.y_train
+
+        num_samples = X_train.shape[0]
+
+        for epoch in range(ITERATIONS):
+            # Data is consumed in order; per-epoch shuffling is omitted so
+            # that every package performs identical work
+            X_shuffled = X_train
+            y_shuffled = y_train
+
+            for i in range(0, num_samples, BATCH_SIZE):
+                X_batch = X_shuffled[i:i + BATCH_SIZE, :]
+                y_batch = y_shuffled[i:i + BATCH_SIZE, :]
+
+                # Forward pass
+                output = self.forward(X_batch)
+
+                # Backward pass and update weights
+                self.backward(X_batch, y_batch, output)
+
+    def predict(self, X):
+        return np.argmax(self.forward(X), axis=1)
+
+
+class NeuralNetwork_dpnp:
+    def __init__(self):
+        self.input_size = INPUT_SIZE
+        self.hidden_size = HIDDEN_SIZE
+        self.output_size = OUTPUT_SIZE
+        self.learning_rate = LEARNING_RATE
+
+        # Initialize weights and biases
+        # He initialization (for ReLU) is often a good choice
+        self.W1 = dpnp.random.randn(self.input_size, self.hidden_size) * np.sqrt(2. / self.input_size)
+        self.b1 = dpnp.zeros((1, self.hidden_size))
+        self.W2 = dpnp.random.randn(self.hidden_size, self.output_size) * np.sqrt(2. / self.hidden_size)
+        self.b2 = dpnp.zeros((1, self.output_size))
+
+        self.X_train = dpnp.random.rand(SAMPLES, INPUT_SIZE)
+        self.y_train = dpnp.zeros((SAMPLES * OUTPUT_SIZE))
+        self.y_train[dpnp.arange(SAMPLES) * OUTPUT_SIZE + dpnp.floor(dpnp.random.rand(SAMPLES) * OUTPUT_SIZE).astype(int)] = 1
+        self.y_train = self.y_train.reshape((SAMPLES, OUTPUT_SIZE))
+
+    def relu(self, x):
+        return dpnp.maximum(0, x)
+
+    def relu_derivative(self, x):
+        return (x > 0).astype(float)
+
+    def softmax(self, x):
+        exp_scores = dpnp.exp(x - dpnp.max(x, axis=1, keepdims=True))  # Subtract max for numerical stability
+        return exp_scores / dpnp.sum(exp_scores, axis=1, keepdims=True)
+
+    def forward(self, X):
+        # Hidden layer
+        self.z1 = dpnp.dot(X, self.W1) + self.b1
+        self.a1 = self.relu(self.z1)
+
+        # Output layer
+        self.z2 = dpnp.dot(self.a1, self.W2) + self.b2
+        self.a2 = self.softmax(self.z2)
+        return self.a2
+
+    def backward(self, X, y, output):
+        # Calculate gradients for the output layer
+        error_output = output - y  # Derivative of cross-entropy loss w.r.t. softmax input
+        dW2 = dpnp.dot(self.a1.T, error_output)
+        db2 = dpnp.sum(error_output, axis=0, keepdims=True)
+
+        # Calculate gradients for the hidden layer
+        error_hidden = dpnp.dot(error_output, self.W2.T) * self.relu_derivative(self.z1)
+        dW1 = dpnp.dot(X.T, error_hidden)
+        db1 = dpnp.sum(error_hidden, axis=0, keepdims=True)
+
+        # Update weights and biases
+        self.W2 -= self.learning_rate * dW2
+        self.b2 -= self.learning_rate * db2
+        self.W1 -= self.learning_rate * dW1
+        self.b1 -= self.learning_rate * db1
+
+    def train(self):
+        X_train = self.X_train
+        y_train = self.y_train
+
+        num_samples = X_train.shape[0]
+
+        for epoch in range(ITERATIONS):
+            # Data is consumed in order; per-epoch shuffling is omitted so
+            # that every package performs identical work
+            X_shuffled = X_train
+            y_shuffled = y_train
+
+            for i in range(0, num_samples, BATCH_SIZE):
+                X_batch = X_shuffled[i:i + BATCH_SIZE, :]
+                y_batch = y_shuffled[i:i + BATCH_SIZE, :]
+
+                # Forward pass
+                output = self.forward(X_batch)
+
+                # Backward pass and update weights
+                self.backward(X_batch, y_batch, output)
+
+    def predict(self, X):
+        return dpnp.argmax(self.forward(X), axis=1)
+
+class NeuralNetwork_cupy:
+    def __init__(self):
+        self.input_size = INPUT_SIZE
+        self.hidden_size = HIDDEN_SIZE
+        self.output_size = OUTPUT_SIZE
+        self.learning_rate = LEARNING_RATE
+
+        # Initialize weights and biases
+        # He initialization (for ReLU) is often a good choice
+        self.W1 = cupy.random.randn(self.input_size, self.hidden_size) * np.sqrt(2. / self.input_size)
+        self.b1 = cupy.zeros((1, self.hidden_size))
+        self.W2 = cupy.random.randn(self.hidden_size, self.output_size) * np.sqrt(2. / self.hidden_size)
+        self.b2 = cupy.zeros((1, self.output_size))
+
+        self.X_train = cupy.random.rand(SAMPLES, INPUT_SIZE)
+        self.y_train = cupy.zeros((SAMPLES * OUTPUT_SIZE))
+        self.y_train[cupy.arange(SAMPLES) * OUTPUT_SIZE + cupy.floor(cupy.random.rand(SAMPLES) * OUTPUT_SIZE).astype(int)] = 1
+        self.y_train = self.y_train.reshape((SAMPLES, OUTPUT_SIZE))
+
+        cupy.cuda.runtime.deviceSynchronize()
+
+    def relu(self, x):
+        return cupy.maximum(0, x)
+
+    def relu_derivative(self, x):
+        return (x > 0).astype(float)
+
+    def softmax(self, x):
+        exp_scores = cupy.exp(x - cupy.max(x, axis=1, keepdims=True))  # Subtract max for numerical stability
+        return exp_scores / cupy.sum(exp_scores, axis=1, keepdims=True)
+
+    def forward(self, X):
+        # Hidden layer
+        self.z1 = cupy.dot(X, self.W1) + self.b1
+        self.a1 = self.relu(self.z1)
+
+        # Output layer
+        self.z2 = cupy.dot(self.a1, self.W2) + self.b2
+        self.a2 = self.softmax(self.z2)
+        return self.a2
+
+    def backward(self, X, y, output):
+        # Calculate gradients for the output layer
+        error_output = output - y  # Derivative of cross-entropy loss w.r.t. softmax input
+        dW2 = cupy.dot(self.a1.T, error_output)
+        db2 = cupy.sum(error_output, axis=0, keepdims=True)
+
+        # Calculate gradients for the hidden layer
+        error_hidden = cupy.dot(error_output, self.W2.T) * self.relu_derivative(self.z1)
+        dW1 = cupy.dot(X.T, error_hidden)
+        db1 = cupy.sum(error_hidden, axis=0, keepdims=True)
+
+        # Update weights and biases
+        self.W2 -= self.learning_rate * dW2
+        self.b2 -= self.learning_rate * db2
+        self.W1 -= self.learning_rate * dW1
+        self.b1 -= self.learning_rate * db1
+
+    def train(self):
+        X_train = self.X_train
+        y_train = self.y_train
+
+        num_samples = X_train.shape[0]
+
+        for epoch in range(ITERATIONS):
+            # Data is consumed in order; per-epoch shuffling is omitted so
+            # that every package performs identical work
+            X_shuffled = X_train
+            y_shuffled = y_train
+
+            for i in range(0, num_samples, BATCH_SIZE):
+                X_batch = X_shuffled[i:i + BATCH_SIZE, :]
+                y_batch = y_shuffled[i:i + BATCH_SIZE, :]
+
+                # Forward pass
+                output = self.forward(X_batch)
+
+                # Backward pass and update weights
+                self.backward(X_batch, y_batch, output)
+
+        cupy.cuda.runtime.deviceSynchronize()
+
+    def predict(self, X):
+        return cupy.argmax(self.forward(X), axis=1)
+
+
+class NeuralNetwork_af:
+    def __init__(self):
+        self.input_size = INPUT_SIZE
+        self.hidden_size = HIDDEN_SIZE
+        self.output_size = OUTPUT_SIZE
+        self.learning_rate = LEARNING_RATE
+
+        # Initialize weights and biases
+        # He initialization (for ReLU) is often a good choice
+        self.W1 = af.randn((self.input_size, self.hidden_size)) * np.sqrt(2. / self.input_size)
+        self.b1 = af.constant(0, (1, self.hidden_size))
+        self.W2 = af.randn((self.hidden_size, self.output_size)) * np.sqrt(2. / self.hidden_size)
+        self.b2 = af.constant(0, (1, self.output_size))
+
+        self.X_train = af.randu((SAMPLES, INPUT_SIZE))
+        self.y_train = af.constant(0, (SAMPLES, OUTPUT_SIZE))
+        self.y_train[af.iota(SAMPLES), af.floor(af.randu(SAMPLES) * OUTPUT_SIZE)] = 1
+
+        af.eval(self.X_train)
+        af.eval(self.y_train)
+        af.eval(self.W1)
+        af.eval(self.W2)
+        af.eval(self.b1)
+        af.eval(self.b2)
+        af.sync()
+
+    def relu(self, x):
+        selection = x > 0
+        return af.select(x, 0, selection)
+
+    def relu_derivative(self, x):
+        return af.cast(x > 0, getattr(af, DTYPE))
+
+    def softmax(self, x):
+        exp_scores = af.exp(x - af.max(x, axis=1))  # Subtract max for numerical stability
+        return exp_scores / af.sum(exp_scores, axis=1)
+
+    def forward(self, X):
+        # Hidden layer
+        self.z1 = af.matmul(X, self.W1) + self.b1
+        self.a1 = self.relu(self.z1)
+
+        # Output layer
+        self.z2 = af.matmul(self.a1, self.W2) + self.b2
+        self.a2 = self.softmax(self.z2)
+        return self.a2
+
+    def backward(self, X, y, output):
+        # Calculate gradients for the output layer
+        error_output = output - y  # Derivative of cross-entropy loss w.r.t. softmax input
+        dW2 = af.matmul(self.a1.T, error_output)
+        db2 = af.sum(error_output, axis=0)
+
+        # Calculate gradients for the hidden layer
+        error_hidden = af.matmul(error_output, self.W2.T) * self.relu_derivative(self.z1)
+        dW1 = af.matmul(X.T, error_hidden)
+        db1 = af.sum(error_hidden, axis=0)
+
+        # Update weights and biases
+        self.W2 -= self.learning_rate * dW2
+        self.b2 -= self.learning_rate * db2
+        self.W1 -= self.learning_rate * dW1
+        self.b1 -= self.learning_rate * db1
+
+    def train(self):
+        X_train = self.X_train
+        y_train = self.y_train
+
+        num_samples = X_train.shape[0]
+
+        for epoch in range(ITERATIONS):
+            # Data is consumed in order; per-epoch shuffling is omitted so
+            # that every package performs identical work
+            X_shuffled = X_train
+            y_shuffled = y_train
+
+            for i in range(0, num_samples, BATCH_SIZE):
+                X_batch = X_shuffled[i:i + BATCH_SIZE, :]
+                y_batch = y_shuffled[i:i + BATCH_SIZE, :]
+
+                # Forward pass
+                output = self.forward(X_batch)
+
+                # Backward pass and update weights
+                self.backward(X_batch, y_batch, output)
+
+        af.eval(self.W2)
+        af.eval(self.b2)
+        af.eval(self.W1)
+        af.eval(self.b1)
+        af.sync()
+
+    def predict(self, X):
+        # Flat indices of the per-row maxima of the network output
+        out = self.forward(X)
+        return af.where(out == af.tile(af.max(out, axis=1), (1, out.shape[1])))
\ No newline at end of file
diff --git a/benchmarks/src/pytest_benchmark/test_random.py b/benchmarks/src/pytest_benchmark/test_random.py
new file mode 100644
index 0000000..d74345f
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/test_random.py
@@ -0,0 +1,65 @@
+from common import *
+
+ITERATIONS = 20
+
+def randn_np():
+    arr = np.random.normal(size=(NNSIZE))
+
+def randn_dpnp():
+    arr = dpnp.random.normal(size=(NNSIZE))
+
+def randn_cupy():
+    arr = cupy.random.normal(size=(NNSIZE))
+    cupy.cuda.runtime.deviceSynchronize()
+
+def randn_af():
+    arr = af.randn((NNSIZE))
+    af.eval(arr)
+    af.sync()
+
+def randu_np():
+    arr = np.random.uniform(size=(NNSIZE))
+
+def randu_dpnp():
+    arr = dpnp.random.uniform(size=(NNSIZE))
+
+def randu_cupy():
+    arr = cupy.random.uniform(size=(NNSIZE))
+    cupy.cuda.runtime.deviceSynchronize()
+
+def randu_af():
+    arr = af.randu((NNSIZE))
+    af.eval(arr)
+    af.sync()
+
+@pytest.mark.parametrize(
+    "pkgid", IDS, ids=IDS
+)
+class TestRandom:
+    def test_normal(self, benchmark, pkgid):
+        initialize_package(pkgid)
+
+        pkg = PKGDICT[pkgid]
+        FUNCS = {
+            "dpnp": randn_dpnp,
+            "numpy": randn_np,
+            "cupy": randn_cupy,
+            "arrayfire": randn_af
+        }
+
+        benchmark.extra_info["description"] = f"{NNSIZE:.2e} Samples"
+        result = benchmark.pedantic(
+            target=FUNCS[pkg.__name__],
+            rounds=ROUNDS,
+            iterations=ITERATIONS
+        )
+
+
+    def test_uniform(self, benchmark, pkgid):
+        initialize_package(pkgid)
+
+        pkg = PKGDICT[pkgid]
+        FUNCS = {
+            "dpnp": randu_dpnp,
+            "numpy": randu_np,
+            "cupy": randu_cupy,
+            "arrayfire": randu_af
+        }
+
+        benchmark.extra_info["description"] = f"{NNSIZE:.2e} Samples"
+        result = benchmark.pedantic(
+            target=FUNCS[pkg.__name__],
+            rounds=ROUNDS,
+            iterations=ITERATIONS
+        )
diff --git a/benchmarks/src/requirements.txt b/benchmarks/src/requirements.txt
new file mode 100644
index 0000000..30b908b
--- /dev/null
+++ b/benchmarks/src/requirements.txt
@@ -0,0 +1,10 @@
+--extra-index-url https://software.repos.intel.com/python/pypi
+dpnp
+numpy
+cupy-cuda12x
+pytest
+pytest-benchmark
+matplotlib
+gdown
+pandas
+# arrayfire
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 4ea40fe..724a88b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "arrayfire"
 version = "0.1.0"
-dependencies = ["arrayfire-binary-python-wrapper == 0.7.0"]
+dependencies = ["arrayfire-binary-python-wrapper == 0.8.0+af3.10.0"]
 requires-python = ">=3.10"
 description = "ArrayFire Python"
 readme = "README.md"
diff --git a/requirements.txt b/requirements.txt
index d3f5294..ba9b100 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
-arrayfire-binary-python-wrapper==0.7.0+af3.9.0
+arrayfire-binary-python-wrapper==0.8.0+af3.10.0
diff --git a/tests/test_library/test_linear_algebra.py b/tests/test_library/test_linear_algebra.py
index 19b1cb2..38d7074 100644
--- a/tests/test_library/test_linear_algebra.py
+++ b/tests/test_library/test_linear_algebra.py
@@ -65,8 +65,8 @@ def test_gemm_basic(matrix_a: af.Array, matrix_b: af.Array) -> None:
 def test_gemm_alpha_beta(matrix_a: af.Array, matrix_b: af.Array) -> None:
     alpha = 0.5
     beta = 2.0
-    result = af.gemm(matrix_a, matrix_b, alpha=alpha, beta=beta)
-    expected = create_from_2d_nested(10.5, 12.0, 22.5, 26.0)
+    result = af.gemm(matrix_a, matrix_b, alpha=alpha, beta=beta, accum=matrix_a)
+    expected = create_from_2d_nested(11.5, 15.0, 27.5, 33.0)
     assert result == expected, f"Expected {expected}, got {result}"
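+    # Sanity check of the expected values, assuming the module's fixtures are
+    # matrix_a = [[1, 2], [3, 4]] and matrix_b = [[5, 6], [7, 8]] (inferred,
+    # not shown in this diff): alpha * (A @ B) + beta * accum
+    #   = 0.5 * [[19, 22], [43, 50]] + 2.0 * [[1, 2], [3, 4]]
+    #   = [[11.5, 15.0], [27.5, 33.0]]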