diff --git a/arrayfire/array_object.py b/arrayfire/array_object.py index d44ee29..ea8c5cd 100755 --- a/arrayfire/array_object.py +++ b/arrayfire/array_object.py @@ -746,7 +746,7 @@ def __getitem__(self, key: IndexKey, /) -> Array: ---------- self : Array Array instance. - key : int | slice | tuple[int | slice, ...] | Array + key : int | slice | tuple[int | slice | Array, ...] | Array Index key. Returns @@ -754,19 +754,40 @@ def __getitem__(self, key: IndexKey, /) -> Array: out : Array An array containing the accessed value(s). The returned array must have the same data type as self. """ - # TODO - # API Specification - key: Union[int, slice, ellipsis, tuple[Union[int, slice, ellipsis], ...], array]. - # consider using af.span to replace ellipsis during refactoring out = Array() ndims = self.ndim - if isinstance(key, Array) and key == afbool.c_api_value: + indexing = key + + if isinstance(key, int | float | slice): # when indexing with one dimension, treat it as indexing a flat array + ndims = 1 + elif isinstance(key, Array): # when indexing with one array, treat it as indexing a flat array ndims = 1 - if wrapper.count_all(key.arr) == 0: # HACK was count() method before - return out + if key.is_bool: + indexing = wrapper.where(key.arr) + else: + indexing = key.arr + elif isinstance(key, tuple): + key_list = [] + for elem in key: + if isinstance(elem, Array): + if elem.is_bool: + key_list.append(wrapper.where(elem.arr)) + else: + key_list.append(elem.arr) + else: + key_list.append(elem) + indexing = tuple(key_list) + + out._arr = wrapper.index_gen(self._arr, ndims, wrapper.get_indices(indexing)) # type: ignore[arg-type] + + if isinstance(key, Array) and key.is_bool: + wrapper.release_array(indexing) + elif isinstance(key, tuple): + for i in range(len(key)): + if isinstance(key[i], Array) and key[i].is_bool: + wrapper.release_array(indexing[i]) - # HACK known issue - out._arr = wrapper.index_gen(self._arr, ndims, wrapper.get_indices(key)) # type: ignore[arg-type] return out def __index__(self) -> int: @@ -781,8 +802,19 @@ def __len__(self) -> int: return self.shape[0] if self.shape else 0 def __setitem__(self, key: IndexKey, value: int | float | bool | Array, /) -> None: - ndims = self.ndim + """ + Assigns self[key] = value + + Parameters + ---------- + self : Array + Array instance. + key : int | slice | tuple[int | slice | Array, ...] | Array + Index key. 
+        value : int | float | bool | Array
+            Value(s) to assign.
+        """
+        ndims = self.ndim

         is_array_with_bool = isinstance(key, Array) and type(key) is afbool

         if is_array_with_bool:
@@ -803,12 +835,41 @@ def __setitem__(self, key: IndexKey, value: int | float | bool | Array, /) -> No
             other_arr = value.arr
             del_other = False

-        indices = wrapper.get_indices(key)  # type: ignore[arg-type]  # FIXME
-        out = wrapper.assign_gen(self._arr, other_arr, ndims, indices)
+        indexing = key
+        if isinstance(key, int | float | slice):  # when indexing with one dimension, treat it as indexing a flat array
+            ndims = 1
+        elif isinstance(key, Array):  # when indexing with one array, treat it as indexing a flat array
+            ndims = 1
+            if key.is_bool:
+                indexing = wrapper.where(key.arr)
+            else:
+                indexing = key.arr
+        elif isinstance(key, tuple):
+            key_list = []
+            for elem in key:
+                if isinstance(elem, Array):
+                    if elem.is_bool:
+                        locs = wrapper.where(elem.arr)
+                        key_list.append(locs)
+                    else:
+                        key_list.append(elem.arr)
+                else:
+                    key_list.append(elem)
+            indexing = tuple(key_list)
+
+        out = wrapper.assign_gen(self._arr, other_arr, ndims, wrapper.get_indices(indexing))
+
+        if isinstance(key, Array) and key.is_bool:
+            wrapper.release_array(indexing)
+        elif isinstance(key, tuple):
+            for i in range(len(key)):
+                if isinstance(key[i], Array) and key[i].is_bool:
+                    wrapper.release_array(indexing[i])

         wrapper.release_array(self._arr)
         if del_other:
             wrapper.release_array(other_arr)
+
         self._arr = out

     def __str__(self) -> str:
@@ -1144,7 +1205,10 @@ def _get_processed_index(key: IndexKey, shape: tuple[int, ...]) -> tuple[int, ..
     if isinstance(key, tuple):
         return tuple(_index_to_afindex(key[i], shape[i]) for i in range(len(key)))

-    return (_index_to_afindex(key, shape[0]),) + shape[1:]
+    size = 1
+    for dim_size in shape:
+        size *= dim_size
+    return (_index_to_afindex(key, size),)


 def _index_to_afindex(key: int | float | complex | bool | slice | wrapper.ParallelRange | Array, axis: int) -> int:
@@ -1168,6 +1232,10 @@ def _index_to_afindex(key: int | float | complex | bool | slice | wrapper.Parall


 def _slice_to_length(key: slice, axis: int) -> int:
+    start = key.start
+    stop = key.stop
+    step = key.step
+
     if key.start is None:
         start = 0
     elif key.start < 0:
diff --git a/arrayfire/library/array_functions.py b/arrayfire/library/array_functions.py
index e8028e5..90956b3 100644
--- a/arrayfire/library/array_functions.py
+++ b/arrayfire/library/array_functions.py
@@ -954,6 +954,7 @@ def replace(lhs: Array, rhs: Array | int | float, conditional: Array, /) -> None
     wrapper.replace_scalar(lhs.arr, conditional.arr, rhs)


+@afarray_as_array
 def select(lhs: Array | int | float, rhs: Array | int | float, conditional: Array, /) -> Array:
     """
     Conditionally selects elements from one of two sources (ArrayFire arrays or scalars) based on a condition array.
diff --git a/arrayfire/library/linear_algebra.py b/arrayfire/library/linear_algebra.py
index ee1f09e..67109f2 100644
--- a/arrayfire/library/linear_algebra.py
+++ b/arrayfire/library/linear_algebra.py
@@ -92,6 +92,7 @@ def gemm(
     rhs_opts: MatProp = MatProp.NONE,
     alpha: int | float = 1.0,
     beta: int | float = 0.0,
+    accum: Array | None = None,
 ) -> Array:
     """
     Performs BLAS general matrix multiplication (GEMM) on two Array instances.
@@ -125,6 +126,10 @@
     beta : int | float, optional
         Scalar multiplier for the existing matrix C in the accumulation.
         Default is 0.0.
+    accum : Array, optional
+        A 2-dimensional, real or complex array representing the matrix C in the accumulation.
+        Default is None (no accumulation).
+
     Returns
     -------
     Array
@@ -135,7 +140,10 @@
     - The data types of `lhs` and `rhs` must be compatible.
     - Batch operations are not supported in this version.
     """
-    return cast(Array, wrapper.gemm(lhs.arr, rhs.arr, lhs_opts, rhs_opts, alpha, beta))
+    accumulator = None
+    if isinstance(accum, Array):
+        accumulator = accum.arr
+    return cast(Array, wrapper.gemm(lhs.arr, rhs.arr, lhs_opts, rhs_opts, alpha, beta, accumulator))


 @afarray_as_array
diff --git a/arrayfire/library/mathematical_functions.py b/arrayfire/library/mathematical_functions.py
index bc7b970..0d2e99b 100644
--- a/arrayfire/library/mathematical_functions.py
+++ b/arrayfire/library/mathematical_functions.py
@@ -261,12 +261,11 @@ def atan2(x1: int | float | Array, x2: int | float | Array, /) -> Array:
     return process_c_function(x1, x2, wrapper.atan2)


-@afarray_as_array
 def cplx(x1: int | float | Array, /, x2: int | float | Array | None = None) -> Array:
     if x2 is None:
         if not isinstance(x1, Array):
             raise TypeError("x1 can not be int or tuple when x2 is None.")
-        return cast(Array, wrapper.cplx(x1.arr))
+        return Array.from_afarray(wrapper.cplx(x1.arr))
     else:
         return process_c_function(x1, x2, wrapper.cplx2)
diff --git a/benchmarks/src/README.md b/benchmarks/src/README.md
new file mode 100644
index 0000000..ae84c4e
--- /dev/null
+++ b/benchmarks/src/README.md
@@ -0,0 +1,36 @@
+Benchmarks
+===========
+
+## Setting up environment
+
+```sh
+ python -m pip install -r requirements.txt
+```
+
+## Benchmark parameters
+
+The benchmark packages, rounds, array sizes, and numeric type can be set via the constants at the top of [pytest_benchmark/common.py](pytest_benchmark/common.py); see the example at the end of this README.
+
+Alternatively, they may be overridden individually at the top of each test file.
+
+
+## Running
+
+Follow these steps to run the benchmarks and produce the graphs.
+
+Run the benchmarks and store the results in `results.json`:
+```sh
+ pytest ./pytest_benchmark --benchmark-json=results.json
+```
+
+Once `results.json` exists, create the graphs and store the timing results:
+```sh
+ python graphs.py
+```
+
+- To change which tests are shown, edit the `TESTS` list at the top of `graphs.py`.
+- To change the displayed labels, edit `PKG_LABELS`.
+- To change the hardware string shown in the graph title, edit `HARDWARE`.
+
+## Notes
+When running with `dpnp`, set the environment variable `DPNP_RAISE_EXCEPION_ON_NUMPY_FALLBACK` to 0.
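+
+For example, in a bash-style shell (assumed; use `set` in cmd or `$env:` in PowerShell):
+
+```sh
+ export DPNP_RAISE_EXCEPION_ON_NUMPY_FALLBACK=0
+ pytest ./pytest_benchmark --benchmark-json=results.json
+```
+
+For reference, these are the defaults at the top of [pytest_benchmark/common.py](pytest_benchmark/common.py); lowering `NSIZE` or `ROUNDS` gives a faster but noisier run:
+
+```py
+ROUNDS = 30          # rounds per benchmark
+NSIZE = 2 ** 13      # matrices are NSIZE x NSIZE
+NNSIZE = NSIZE ** 2  # total element count
+DTYPE = "float32"
+```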
\ No newline at end of file diff --git a/benchmarks/src/graphs.py b/benchmarks/src/graphs.py new file mode 100644 index 0000000..cd19775 --- /dev/null +++ b/benchmarks/src/graphs.py @@ -0,0 +1,225 @@ +import matplotlib.pyplot as plt +import numpy as np +import json +import pandas as pd + +BENCHMARKS_JSON = 'results.json' + +# Hardware details shown in title +HARDWARE = "AMD Ryzen 9 9900X 12-Core Processor 63032 MB (fp64 fp16)\noneAPI 2025.1.3 Intel(R) OpenCL Graphics: Intel(R) Arc(TM) B580 Graphics, 11873 MB (fp64 fp16)" + +# Show speedup in graph +SHOW_NUMBERS = True + +# Round to digits after decimal +ROUND_NUMBERS = 1 + +# package list in graph order; arrayfire packages are added later +PKG_NAMES = [ + 'numpy', + 'dpnp', + 'cupy' +] + +# color used in graphs +PKG_COLOR = { + "numpy": "tab:blue", + "cupy": "tab:green", + "dpnp": "tab:red", + "afcpu": "tab:orange", + "afopencl": "tab:orange", + "afcuda": "tab:orange", + "afoneapi": "tab:orange" +} + +# labels displayed in the graph +PKG_LABELS = { + "numpy": "numpy[cpu]", + "dpnp": "dpnp[level_zero:gpu]", + "cupy": "cupy", + "afcpu": "afcpu", + "afcuda": "afcuda", + "afopencl": "afopencl[opencl:gpu]", + "afoneapi": "afoneapi[opencl:gpu]" +} + +AFBACKENDS = [ + 'afcpu', + 'afcuda', + 'afopencl', + 'afoneapi' +] + +# Tests to be shown in graphs +TESTS = [ + 'qr', + 'neural_network', + 'gemm', + 'mandelbrot', + 'nbody', + 'pi', + 'black_scholes', + 'fft', + 'normal', + 'group_elementwise', + + #Other tests + # 'svd + # 'cholesky', + # 'det', + # 'norm', + # 'uniform', + # 'inv' +] + +def get_benchmark_data(): + results = {} + descriptions = {} + with open(BENCHMARKS_JSON) as f: + js = json.load(f) + for bench in js['benchmarks']: + test_name = bench["name"] + test_name = test_name[test_name.find('_') + 1:test_name.find('[')] + + key = bench["param"] + val = bench["stats"]["ops"] + + if len(bench["extra_info"]) != 0 and (not test_name in descriptions): + descriptions[test_name] = bench["extra_info"]["description"] + + if test_name not in results: + results[test_name] = { key : val } + else: + results[test_name][key] = val + + return results, descriptions + +def create_graph(test_name, test_results): + names = [] + values = [] + for name in test_results: + names.append(name) + values.append(test_results[name]) + + bar = plt.bar(names, values) + plt.title(test_name) + + plt.savefig("img/" + test_name + ".png") + plt.close() + +def generate_individual_graphs(): + results, descriptions = get_benchmark_data() + + for test in results: + create_graph(test, results[test]) + +# Stores the timing results in a csv file +def store_csv(): + data_dict = {} + data_dict["Test(seconds)"] = [] + results = {} + for pkg in PKG_LABELS.keys(): + data_dict[pkg] = [] + results[pkg] = {} + + with open(BENCHMARKS_JSON) as f: + js = json.load(f) + for bench in js['benchmarks']: + test_name = bench["name"] + test_name = test_name[test_name.find('_') + 1:test_name.find('[')] + + pkg = bench["param"] + time = bench["stats"]["mean"] + + if not test_name in data_dict["Test(seconds)"]: + data_dict["Test(seconds)"].append(test_name) + + results[pkg][test_name] = time + + for test in data_dict["Test(seconds)"]: + for pkg in PKG_LABELS.keys(): + if test in results[pkg]: + data_dict[pkg].append(results[pkg][test]) + else: + data_dict[pkg].append(np.nan) + + df = pd.DataFrame(data_dict) + df.to_csv("summary.csv") + +def generate_group_graph(test_list = None, show_numbers = False, filename = "comparison"): + results, descriptions = get_benchmark_data() + + width = 1 / (1 + 
len(PKG_NAMES)) + multiplier = 0 + + tests = None + if test_list: + tests = test_list + else: + tests = results.keys() + + tests_values = {} + x = np.arange(len(tests)) + + for name in PKG_NAMES: + tests_values[name] = [] + + max_val = 1 + for test in tests: + for name in PKG_NAMES: + base_value = results[test]["numpy"] + if name in results[test]: + val = results[test][name] / base_value + + if ROUND_NUMBERS: + val = round(val, ROUND_NUMBERS) + + if max_val < val: + max_val = val + + tests_values[name].append(val) + else: + tests_values[name].append(np.nan) + + fig, ax = plt.subplots(layout='constrained') + + for name in PKG_NAMES: + offset = width * multiplier + rects = ax.barh(x + offset, tests_values[name], width, label=PKG_LABELS[name], color=PKG_COLOR[name]) + + if show_numbers: + ax.bar_label(rects, padding=3, rotation=0) + multiplier += 1 + + xlabels = [] + for test in tests: + xlabels.append(test + "\n" + descriptions[test]) + + ax.set_xlabel('Speedup') + ax.set_xscale('log') + ax.set_title(f'Runtime Comparison\n{HARDWARE}') + ax.set_yticks(x + width, xlabels, rotation=0) + xmin, xmax = ax.get_xlim() + ax.set_xlim(xmin, xmax * 2) + + ax.legend(loc='lower right', ncols=len(PKG_NAMES)) + fig.set_figheight(8) + fig.set_figwidth(13) + fig.savefig(f"img/{filename}.png") + plt.show() + +def main(): + store_csv() + for backend in AFBACKENDS: + try: + filename = f"comparison_{backend}" + if not backend in PKG_NAMES: + PKG_NAMES.insert(1, backend) + generate_group_graph(TESTS, SHOW_NUMBERS, filename) + PKG_NAMES.remove(backend) + except Exception as e: + print(e) + print("No data for", backend) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/benchmarks/src/pytest_benchmark/common.py b/benchmarks/src/pytest_benchmark/common.py new file mode 100644 index 0000000..58ee9d5 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/common.py @@ -0,0 +1,103 @@ +# cython: language_level=3 +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2016-2024, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import pytest +import math + +import arrayfire as af +import numpy as np +import dpnp +import dpctl +import cupy +import gc + +# modify parameters for most benchmarks +ROUNDS = 30 +NSIZE = 2 ** 13 +NNSIZE = NSIZE ** 2 +DTYPE = "float32" + +# comment a line to remove that package from testing +PKGDICT = { + "dpnp" : dpnp, + "numpy" : np, + "cupy": cupy, + # "afcpu": af, + "afopencl": af, + "afcuda" : af, + "afoneapi": af +} + +PKGS = [] +IDS = [] + +for key, value in PKGDICT.items(): + IDS.append(key) + PKGS.append(value) + +# Initialize packages and cleanup memory before each round +def initialize_package(PKG_ID): + pkg = PKGDICT[PKG_ID] + + try: + af.device_gc() + mempool = cupy.get_default_memory_pool() + mempool.free_all_blocks() + except: + pass + + if PKG_ID == "afcpu": + af.set_backend(af.BackendType.cpu) + af.device_gc() + af.info() + elif PKG_ID == "afopencl": + af.set_backend(af.BackendType.opencl) + af.device_gc() + af.info() + elif PKG_ID == "afcuda": + af.set_backend(af.BackendType.cuda) + af.device_gc() + af.info() + elif PKG_ID == "afoneapi": + af.set_backend(af.BackendType.oneapi) + af.device_gc() + af.info() + elif PKG_ID == "numpy": + np.random.seed(0) + elif PKG_ID == "dpnp": + dpnp.random.seed(0) + print(dpctl.get_devices()[0]) + elif PKG_ID == "cupy": + cupy.random.seed(0) + print(cupy.cuda.Device()) + mempool = cupy.get_default_memory_pool() + mempool.free_all_blocks() + else: + raise NotImplementedError() + + # Free all unused memory + gc.collect() \ No newline at end of file diff --git a/benchmarks/src/pytest_benchmark/sgemm.cu b/benchmarks/src/pytest_benchmark/sgemm.cu new file mode 100644 index 0000000..9e4791f --- /dev/null +++ b/benchmarks/src/pytest_benchmark/sgemm.cu @@ -0,0 +1,200 @@ +/* +Original works by: +-------------------------------------------------------- +MAGMA +Copyright (c) 2017 The University of Tennessee. All rights reserved. 
+Licensed under modified BSD license +*/ + + +// These parameters will be determined by utils.read_code +//#define DIM_X ${DIM_X} +//#define DIM_Y ${DIM_Y} +//#define BLK_M ${BLK_M} +//#define BLK_N ${BLK_N} +//#define BLK_K ${BLK_K} +//#define DIM_XA ${DIM_XA} +//#define DIM_YA ${DIM_YA} +//#define DIM_XB ${DIM_XB} +//#define DIM_YB ${DIM_YB} +//#define THR_N ${THR_N} +//#define THR_M ${THR_M} + +#define fetch(arr, col, m, n, bound) arr[min(n*col + m, bound)] + + +extern "C" __global__ +void sgemm( + int M, int N, int K, + const float* A, + const float* B, + float * C) +{ + int idx = threadIdx.x; + int idy = threadIdx.y; + + int idt = DIM_X * idy + idx; + + int idxA = idt % DIM_XA; + int idyA = idt / DIM_XA; + + int idxB = idt % DIM_XB; + int idyB = idt / DIM_XB; + + int blx = blockIdx.x; + int bly = blockIdx.y; + + __shared__ float sA[BLK_K][BLK_M + 1]; + __shared__ float sB[BLK_N][BLK_K + 1]; + + // registers for the innermost loop + float rC[THR_N][THR_M]; + float rA[THR_M]; + float rB[THR_N]; + + float ra[BLK_K / DIM_YA][BLK_M / DIM_XA]; + float rb[BLK_N / DIM_YB][BLK_K / DIM_XB]; + + const float* offs_dA = A + blx * BLK_M + idyA * M + idxA; + int boundA = (M * (K - 1) + M) - (blx * BLK_M + idyA * M + idxA) - 1; + const float* offs_dB = B + bly * BLK_N * K + idyB * K + idxB; + int boundB = (K * (N - 1) + K) - (bly * BLK_N * K + idyB * K + idxB) - 1; + + int m, n, k, kk; + + #pragma unroll + for (n = 0; n < THR_N; n++) { + #pragma unroll + for (m = 0 ; m < THR_M; m++) { + rC[n][m] = 0; + } + } + + // blockwise transpose to transpose load + #pragma unroll + for (n = 0; n < BLK_K; n += DIM_YA) { + #pragma unroll + for (m = 0; m < BLK_M; m += DIM_XA) { + sA[n + idyA][m + idxA] = fetch(offs_dA, M, m, n, boundA); + } + } + // blockwise transpose to transpose load + #pragma unroll + for (n = 0; n < BLK_N; n += DIM_YB) { + #pragma unroll + for (m = 0; m < BLK_K; m += DIM_XB) { + sB[n + idyB][m + idxB] = fetch(offs_dB, K, m, n, boundB); + } + } + __syncthreads(); + + for (kk = 0; kk < K - BLK_K; kk += BLK_K) + { + offs_dA += BLK_K * M; + boundA -= BLK_K * M; + offs_dB += BLK_K; + boundB -= BLK_K; + + #pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) { + #pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) { + ra[n][m] = fetch(offs_dA, M, m * DIM_XA, n * DIM_YA, boundA); + } + } + + #pragma unroll + for (n = 0; n < BLK_N / DIM_YB; n++) { + #pragma unroll + for (m = 0; m < BLK_K / DIM_XB; m++) { + rb[n][m] = fetch(offs_dB, K, m * DIM_XB, n * DIM_YB, boundB); + } + } + + // multiply + #pragma unroll + for (k = 0; k < BLK_K; k++) + { + #pragma unroll + for (m = 0; m < THR_M; m++) { + rA[m] = sA[k][m * DIM_X + idx]; + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + rB[n] = sB[n * DIM_Y + idy][k]; + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + #pragma unroll + for (m = 0; m < THR_M; m++) { + rC[n][m] += rA[m] * rB[n]; + } + } + } + __syncthreads(); + + // store A regs->smem + #pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) + { + #pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) + { + sA[n * DIM_YA + idyA][m * DIM_XA + idxA] = ra[n][m]; + } + } + + #pragma unroll + for (n = 0; n < BLK_N / DIM_YB; n++) + { + #pragma unroll + for (m = 0; m < BLK_K / DIM_XB; m++) + { + sB[n * DIM_YB + idyB][m * DIM_XB + idxB] = rb[n][m]; + } + } + __syncthreads(); + } + + // Multiply last full (BLK_K) or partial block of columns of A and + // rows of B. 
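+    // (After the main loop, kk is the starting column index of the final
+    // block, so K - kk below is its width: K % BLK_K, or a full BLK_K when
+    // K divides evenly.)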
+ // It's okay that m,n exceed matrix bounds as all work is in registers + // or shared memory, and out-of-bounds rC[n][m] will not be saved later. + + kk = K - kk; + #pragma unroll + for (k = 0; k < kk; k++) + { + #pragma unroll + for (m = 0; m < THR_M; m++) { + rA[m] = sA[k][m * DIM_X + idx]; + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + rB[n] = sB[n * DIM_Y + idy][k]; + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + #pragma unroll + for (m = 0; m < THR_M; m++) { + rC[n][m] += rA[m] * rB[n]; + } + } + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + int coord_dCn = bly * BLK_N + n * DIM_Y + idy; + #pragma unroll + for (m = 0; m < THR_M; m++) { + int coord_dCm = blx * BLK_M + m * DIM_X + idx; + if (coord_dCm < M && coord_dCn < N) { + C[coord_dCn * M + coord_dCm] = rC[n][m]; + } + } + } +} \ No newline at end of file diff --git a/benchmarks/src/pytest_benchmark/test_blackscholes.py b/benchmarks/src/pytest_benchmark/test_blackscholes.py new file mode 100644 index 0000000..f940ff9 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_blackscholes.py @@ -0,0 +1,146 @@ +#!/usr/bin/python + +####################################################### +# Copyright (c) 2015, ArrayFire +# All rights reserved. +# +# This file is distributed under 3-clause BSD license. +# The complete license agreement can be obtained at: +# http://arrayfire.com/licenses/BSD-3-Clause +######################################################## + +from common import * + +ITERATIONS = 1 + +sqrt2 = math.sqrt(2.0) + +@pytest.mark.parametrize( + "pkgid", IDS, ids=IDS +) +class TestBlackScholes: + def test_black_scholes(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 5), {}) + pkg = PKGDICT[pkgid] + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + result = benchmark.pedantic( + target=FUNCS[pkg.__name__], + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + +def black_scholes_numpy(S, X, R, V, T): + # S = Underlying stock price + # X = Strike Price + # R = Risk free rate of interest + # V = Volatility + # T = Time to maturity + def cnd(x): + temp = (x > 0) + erf = lambda arr : np.exp(-arr * arr) + return temp * (0.5 + erf(x/sqrt2)/2) + (1 - temp) * (0.5 - erf((-x)/sqrt2)/2) + + d1 = np.log(S / X) + d1 = d1 + (R + (V * V) * 0.5) * T + d1 = d1 / (V * np.sqrt(T)) + + d2 = d1 - (V * np.sqrt(T)) + cnd_d1 = cnd(d1) + cnd_d2 = cnd(d2) + + C = S * cnd_d1 - (X * np.exp((-R) * T) * cnd_d2) + P = X * np.exp((-R) * T) * (1 - cnd_d2) - (S * (1 -cnd_d1)) + + return (C, P) + +def black_scholes_dpnp(S, X, R, V, T): + def cnd(x): + temp = (x > 0) + return temp * (0.5 + dpnp.erf(x/sqrt2)/2) + (1 - temp) * (0.5 - dpnp.erf((-x)/sqrt2)/2) + + d1 = dpnp.log(S / X) + d1 = d1 + (R + (V * V) * 0.5) * T + d1 = d1 / (V * dpnp.sqrt(T)) + + d2 = d1 - (V * dpnp.sqrt(T)) + cnd_d1 = cnd(d1) + cnd_d2 = cnd(d2) + + C = S * cnd_d1 - (X * dpnp.exp((-R) * T) * cnd_d2) + P = X * dpnp.exp((-R) * T) * (1 - cnd_d2) - (S * (1 -cnd_d1)) + + return (C, P) + +def black_scholes_cupy(S, X, R, V, T): + def cnd(x): + temp = (x > 0) + erf = lambda arr : cupy.exp(-arr * arr) + return temp * (0.5 + erf(x/sqrt2)/2) + (1 - temp) * (0.5 - erf((-x)/sqrt2)/2) + + d1 = cupy.log(S / X) + d1 = d1 + (R + (V * V) * 0.5) * T + d1 = d1 / (V * cupy.sqrt(T)) + + d2 = d1 - (V * cupy.sqrt(T)) + cnd_d1 = cnd(d1) + cnd_d2 = cnd(d2) + + C = S * cnd_d1 - (X * cupy.exp((-R) * T) * cnd_d2) + P = X * cupy.exp((-R) * T) * (1 - cnd_d2) - (S * (1 -cnd_d1)) + + cupy.cuda.runtime.deviceSynchronize() + + 
return (C, P) + +def black_scholes_arrayfire(S, X, R, V, T): + def cnd(x): + temp = (x > 0) + return temp * (0.5 + af.erf(x/sqrt2)/2) + (1 - temp) * (0.5 - af.erf((-x)/sqrt2)/2) + + d1 = af.log(S / X) + d1 = d1 + (R + (V * V) * 0.5) * T + d1 = d1 / (V * af.sqrt(T)) + + d2 = d1 - (V * af.sqrt(T)) + cnd_d1 = cnd(d1) + cnd_d2 = cnd(d2) + + C = S * cnd_d1 - (X * af.exp((-R) * T) * cnd_d2) + P = X * af.exp((-R) * T) * (1 - cnd_d2) - (S * (1 -cnd_d1)) + + af.eval(C) + af.eval(P) + af.sync() + + return (C, P) + + +def generate_arrays(pkgid, count): + arr_list = [] + pkg = PKGDICT[pkgid] + pkg = pkg.__name__ + if "cupy" == pkg: + for i in range(count): + arr_list.append(cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE)) + cupy.cuda.runtime.deviceSynchronize() + elif "arrayfire" == pkg: + af.device_gc() + for i in range(count): + x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE)) + af.eval(x) + arr_list.append(x) + af.sync() + elif "dpnp" == pkg: + for i in range(count): + arr_list.append(dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE)) + elif "numpy" == pkg: + for i in range(count): + arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE)) + + return arr_list + +FUNCS = { "dpnp" : black_scholes_dpnp, "numpy" : black_scholes_numpy, \ + "cupy" : black_scholes_cupy , "arrayfire" : black_scholes_arrayfire} \ No newline at end of file diff --git a/benchmarks/src/pytest_benchmark/test_elementwise.py b/benchmarks/src/pytest_benchmark/test_elementwise.py new file mode 100644 index 0000000..ad6cbd6 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_elementwise.py @@ -0,0 +1,327 @@ +# cython: language_level=3 +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2016-2024, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from common import * + +ITERATIONS = 1 + +@pytest.mark.parametrize( + "pkgid", IDS, ids=IDS +) +class TestElementwise: + def test_group_elementwise(self, benchmark, pkgid): + initialize_package(pkgid) + + setup = lambda: ([generate_arrays(pkgid, 1)[0] / (NSIZE * NSIZE)], {}) + pkg = PKGDICT[pkgid] + + def func(arr): + return pkg.exp(pkg.cos(pkg.sinh(arr))) +\ + pkg.cbrt(pkg.log(arr) * pkg.expm1(-pkg.sqrt(arr))) + + def func_af(arr): + x = af.exp(af.cos(af.sinh(arr))) +\ + af.cbrt(af.log(arr) * af.expm1(-af.sqrt(arr))) + af.eval(x) + af.sync() + return x + + def func_cupy(arr): + x = pkg.exp(pkg.cos(pkg.sinh(arr))) +\ + pkg.cbrt(pkg.log(arr) * pkg.expm1(-pkg.sqrt(arr))) + cupy.cuda.runtime.deviceSynchronize() + return x + + GROUP_FUNCS = { + "numpy": func, + "cupy": func_cupy, + "arrayfire": func_af, + "dpnp": func + } + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + result = benchmark.pedantic( + target=GROUP_FUNCS[pkg.__name__], + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + ''' + def test_arccos(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.acos, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arccosh(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.acosh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arcsin(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.asin, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arcsinh(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.asinh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arctan(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.atan, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arctanh(self, benchmark, pkg): + setup = lambda: ([(generate_arrays(pkg, 1)[0] - 1) / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.atanh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_cbrt(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.cbrt, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_cos(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.cos, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_cosh(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.cosh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_sin(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.sin, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_sinh(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.sinh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_degrees(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + 
+ result = benchmark.pedantic( + target=pkg.degrees, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_exp(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.exp, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_exp2(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.exp2, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_expm1(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.expm1, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_log(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.log, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_log10(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.log10, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_log1p(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.log1p, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_sqrt(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.sqrt, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_square(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.square, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_tan(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.tan, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_tanh(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.tanh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_reciprocal(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.reciprocal, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + ''' + +def generate_arrays(pkgid, count): + arr_list = [] + pkg = PKGDICT[pkgid] + pkg = pkg.__name__ + if "cupy" == pkg: + for i in range(count): + arr_list.append(cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE)) + cupy.cuda.runtime.deviceSynchronize() + elif "arrayfire" == pkg: + for i in range(count): + x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE)) + af.eval(x) + arr_list.append(x) + af.sync() + elif "dpnp" == pkg: + for i in range(count): + arr_list.append(dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE)) + elif "numpy" == pkg: + for i in range(count): + arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE)) + + return arr_list \ No newline at end of file diff --git a/benchmarks/src/pytest_benchmark/test_fft.py b/benchmarks/src/pytest_benchmark/test_fft.py new file mode 100644 index 0000000..4c563a9 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_fft.py @@ -0,0 +1,91 @@ +# cython: language_level=3 +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2016-2024, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from common import * + +ITERATIONS = 1 + +def generate_arrays(pkgid, count): + arr_list = [] + pkg = PKGDICT[pkgid] + pkg = pkg.__name__ + if "cupy" == pkg: + for i in range(count): + arr_list.append(cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE)) + cupy.cuda.runtime.deviceSynchronize() + elif "arrayfire" == pkg: + for i in range(count): + x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE)) + af.eval(x) + arr_list.append(x) + af.sync() + elif "dpnp" == pkg: + for i in range(count): + arr_list.append(dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE)) + elif "numpy" == pkg: + for i in range(count): + arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE)) + + return arr_list + +@pytest.mark.parametrize( + "pkgid", IDS, ids=IDS +) +class TestFFT: + def test_fft(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + pkg = PKGDICT[pkgid] + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + result = benchmark.pedantic( + target=FUNCS[pkg.__name__], + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + +def fft_af(arr): + res = af.fft(arr) + af.eval(res) + af.sync() + + return res + +def fft_np(arr): + return np.fft.fft(arr) + +def fft_dpnp(arr): + return dpnp.fft.fft(arr) + +def fft_cupy(arr): + res = cupy.fft.fft(arr) + cupy.cuda.runtime.deviceSynchronize() + return res + +FUNCS = { "dpnp" : fft_dpnp , "numpy" : fft_np, \ + "cupy" : fft_cupy , "arrayfire" : fft_af } \ No newline at end of file diff --git a/benchmarks/src/pytest_benchmark/test_gemm.py b/benchmarks/src/pytest_benchmark/test_gemm.py new file mode 100644 index 0000000..3b449fe --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_gemm.py @@ -0,0 +1,116 @@ +from common import * + +import os + +ITERATIONS = 1 + +alpha = 0.4 +beta = 0.6 + +dim_x=16 +dim_y=16 +blk_m=64 +blk_n=64 +blk_k=4 +dim_xa=64 +dim_ya=4 +dim_xb=4 +dim_yb=64 +assert (dim_x * dim_y == dim_xa * dim_ya == dim_xb * dim_yb) +config = {'DIM_X': dim_x, 'DIM_Y': dim_y, + 'BLK_M': blk_m, 'BLK_N': blk_n, 'BLK_K': blk_k, + 'DIM_XA': dim_xa, 'DIM_YA': dim_ya, + 'DIM_XB': dim_xb, 'DIM_YB': dim_yb, + 'THR_M': blk_m // dim_x, 'THR_N': blk_n // dim_y} + 
+def create_cupy_kernel(params):
+    sgemm_file = os.path.join(os.path.dirname(__file__), 'sgemm.cu')
+    code = None
+    with open(sgemm_file, 'r') as f:
+        code = f.read()
+    for k, v in params.items():
+        code = '#define ' + k + ' ' + str(v) + '\n' + code
+
+    return cupy.RawKernel(code, 'sgemm')
+
+kern = create_cupy_kernel(config)
+
+@pytest.mark.parametrize(
+    "pkgid", IDS, ids=IDS
+)
+class TestGemm:
+    def test_gemm(self, benchmark, pkgid):
+        pkg = PKGDICT[pkgid]
+        initialize_package(pkgid)
+
+        setup = lambda: (generate_arrays(pkgid, 3), {})
+
+        benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix"
+        result = benchmark.pedantic(
+            target=FUNCS[pkg.__name__],
+            setup=setup,
+            rounds=ROUNDS,
+            iterations=ITERATIONS
+        )
+
+def generate_arrays(pkgid, count):
+    arr_list = []
+    pkg = PKGDICT[pkgid]
+    pkg = pkg.__name__
+    if "cupy" == pkg:
+        cupy.random.seed(1)
+        for i in range(count):
+            arr_list.append(cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE))
+        cupy.cuda.runtime.deviceSynchronize()
+    elif "arrayfire" == pkg:
+        for i in range(count):
+            x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE))
+            af.eval(x)
+            arr_list.append(x)
+        af.sync()
+    elif "dpnp" == pkg:
+        dpnp.random.seed(1)
+        for i in range(count):
+            arr_list.append(dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE))
+    elif "numpy" == pkg:
+        np.random.seed(1)
+        for i in range(count):
+            arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE))
+
+    return arr_list
+
+def gemm_np(A, B, C):
+    return alpha * np.matmul(A, B) + beta * C
+
+def gemm_af(A, B, C):
+    x = af.gemm(A, B, alpha=alpha, beta=beta, accum=C)
+    af.eval(x)
+    af.sync()
+    return x
+
+def gemm_dpnp(A, B, C):
+    return alpha * dpnp.matmul(A, B) + beta * C
+
+def gemm_cupy(A, B, C):
+    m, k = A.shape
+    k, n = B.shape
+
+    # Input matrices need to be in Fortran order.
+    # A = cupy.asfortranarray(A)
+    # B = cupy.asfortranarray(B)
+    # C = cupy.asfortranarray(C)
+
+    grid = (int(math.ceil(m / blk_m)), int(math.ceil(n / blk_n)), 1)
+    block = (dim_x, dim_y, 1)
+    args = (m, n, k, A, B, C)
+    shared_mem = blk_k * (blk_m + 1) * 4 + blk_n * (blk_k + 1) * 4
+    kern(grid, block, args=args, shared_mem=shared_mem)
+    cupy.cuda.runtime.deviceSynchronize()
+    return C
+
+FUNCS = {
+    "numpy": gemm_np,
+    "cupy": gemm_cupy,
+    "arrayfire": gemm_af,
+    "dpnp": gemm_dpnp
+}
\ No newline at end of file
diff --git a/benchmarks/src/pytest_benchmark/test_kmeans.py b/benchmarks/src/pytest_benchmark/test_kmeans.py
new file mode 100644
index 0000000..e102990
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/test_kmeans.py
@@ -0,0 +1,290 @@
+from common import *
+
+ITERATIONS = 10
+TOLERANCE = 1e-4
+NSAMPLES = NSIZE
+NFEATURES = 256
+K = 20
+
+@pytest.mark.parametrize(
+    "pkgid", IDS, ids=IDS
+)
+class TestKmeans:
+    def test_kmeans(self, benchmark, pkgid):
+        initialize_package(pkgid)
+        pkg = PKGDICT[pkgid]
+        kmean_class = { "dpnp" : kmeans_dpnp , "numpy" : kmeans_numpy, \
+                        "cupy" : kmeans_cupy , "arrayfire" : kmeans_af }
+        obj = kmean_class[pkg.__name__]()
+
+        benchmark.extra_info["description"] = f"{NSAMPLES}x{NFEATURES} over {K} centers"
+        result = benchmark.pedantic(
+            target=obj.kmeans,
+            rounds=ROUNDS,
+            iterations=1
+        )
+
+class kmeans_numpy:
+    def __init__(self):
+        self.data = np.random.random((NSAMPLES, NFEATURES))
+        self.centroid_indices = np.random.choice(self.data.shape[0], K, replace=False)
+
+    def initialize_centroids(self):
+        """
+        Randomly initializes k centroids from the data points.
+
+        Args:
+            data (np.ndarray): The input data points (n_samples, n_features).
+            k (int): The number of clusters.
+ + Returns: + np.ndarray: Initial centroids (k, n_features). + """ + + return self.data[self.centroid_indices, :] + + def assign_to_clusters(self, centroids): + """ + Assigns each data point to the closest centroid. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + centroids (np.ndarray): The current centroids (k, n_features). + + Returns: + np.ndarray: An array of cluster assignments for each data point (n_samples,). + """ + distances = np.sqrt(((self.data[:,np.newaxis,:] - centroids[np.newaxis,:,:])**2).sum(axis=2)) + cluster_assignments = np.argmin(distances, axis=1) + return cluster_assignments + + def update_centroids(self, cluster_assignments): + """ + Recalculates the centroids based on the mean of the assigned data points. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + cluster_assignments (np.ndarray): An array of cluster assignments. + k (int): The number of clusters. + + Returns: + np.ndarray: Updated centroids (k, n_features). + """ + new_centroids = np.zeros((K, self.data.shape[1])) + for i in range(K): + points_in_cluster = self.data[cluster_assignments == i] + if len(points_in_cluster) > 0: + new_centroids[i] = np.mean(points_in_cluster, axis=0) + return new_centroids + + def kmeans(self): + """ + Performs the K-Means clustering algorithm. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + k (int): The number of clusters. + max_iterations (int): Maximum number of iterations to run the algorithm. + tolerance (float): The tolerance for convergence (change in centroids). + + Returns: + tuple: A tuple containing: + - np.ndarray: Final centroids (k, n_features). + - np.ndarray: Final cluster assignments for each data point (n_samples,). + """ + centroids = self.initialize_centroids() + cluster_assignments = None + + for i in range(ITERATIONS): + old_centroids = np.copy(centroids) + + # E-step: Assign points to clusters + cluster_assignments = self.assign_to_clusters(centroids) + + # M-step: Update centroids + centroids = self.update_centroids(cluster_assignments) + + # Check for convergence + if np.linalg.norm(centroids - old_centroids) < TOLERANCE: + break + + return centroids, cluster_assignments + + +class kmeans_dpnp: + def __init__(self): + self.data = dpnp.random.random((NSAMPLES, NFEATURES)) + self.centroid_indices = dpnp.array(np.random.choice(self.data.shape[0], K, replace=False).tolist()) + + def initialize_centroids(self): + return self.data[self.centroid_indices, :] + + def assign_to_clusters(self, centroids): + distances = dpnp.sqrt(((self.data[:,dpnp.newaxis,:] - centroids[dpnp.newaxis,:,:])**2).sum(axis=2)) + cluster_assignments = dpnp.argmin(distances, axis=1) + return cluster_assignments + + def update_centroids(self, cluster_assignments): + new_centroids = dpnp.zeros((K, self.data.shape[1])) + for i in range(K): + points_in_cluster = self.data[cluster_assignments == i] + if len(points_in_cluster) > 0: + new_centroids[i] = dpnp.mean(points_in_cluster, axis=0) + return new_centroids + + def kmeans(self): + centroids = self.initialize_centroids() + cluster_assignments = None + for i in range(ITERATIONS): + old_centroids = dpnp.copy(centroids) + + # E-step: Assign points to clusters + cluster_assignments = self.assign_to_clusters(centroids) + + # M-step: Update centroids + centroids = self.update_centroids(cluster_assignments) + + # Check for convergence + if dpnp.linalg.norm(centroids - old_centroids) < TOLERANCE: + break + + return centroids, cluster_assignments + +class 
kmeans_cupy: + def __init__(self): + self.data = cupy.random.random((NSAMPLES, NFEATURES)) + self.centroid_indices = cupy.random.choice(self.data.shape[0], K, replace=False) + cupy.cuda.runtime.deviceSynchronize() + + def initialize_centroids(self): + return self.data[self.centroid_indices, :] + + def assign_to_clusters(self, centroids): + distances = cupy.sqrt(((self.data[:,cupy.newaxis,:] - centroids[cupy.newaxis,:,:])**2).sum(axis=2)) + cluster_assignments = cupy.argmin(distances, axis=1) + return cluster_assignments + + def update_centroids(self, cluster_assignments): + new_centroids = cupy.zeros((K, self.data.shape[1])) + for i in range(K): + points_in_cluster = self.data[cluster_assignments == i] + if len(points_in_cluster) > 0: + new_centroids[i] = cupy.mean(points_in_cluster, axis=0) + return new_centroids + + def kmeans(self): + centroids = self.initialize_centroids() + cluster_assignments = None + + for i in range(ITERATIONS): + old_centroids = cupy.copy(centroids) + + # E-step: Assign points to clusters + cluster_assignments = self.assign_to_clusters(centroids) + + # M-step: Update centroids + centroids = self.update_centroids(cluster_assignments) + + # Check for convergence + if cupy.linalg.norm(centroids - old_centroids) < TOLERANCE: + break + + cupy.cuda.runtime.deviceSynchronize() + return centroids, cluster_assignments + +class kmeans_af: + def __init__(self): + self.data = af.Array(np.random.random((NSAMPLES, NFEATURES)).flatten().tolist(), shape=(NSAMPLES, NFEATURES)) + self.centroid_indices = af.Array(np.random.choice(self.data.shape[0], K, replace=False).tolist()) + + af.eval(self.data) + af.eval(self.centroid_indices) + af.sync() + + def initialize_centroids(self): + """ + Randomly initializes k centroids from the data points. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + k (int): The number of clusters. + + Returns: + np.ndarray: Initial centroids (k, n_features). + """ + + return self.data[self.centroid_indices, :] + # return af.lookup(self.data, self.centroid_indices, axis=0) + + def assign_to_clusters(self, centroids): + """ + Assigns each data point to the closest centroid. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + centroids (np.ndarray): The current centroids (k, n_features). + + Returns: + np.ndarray: An array of cluster assignments for each data point (n_samples,). + """ + dist = (af.moddims(self.data, (NSAMPLES, 1, NFEATURES)) - af.moddims(centroids, (1, K, NFEATURES)))**2 + distances = af.sqrt(af.sum(dist, axis=2)) + cluster_assignments = af.where(distances == af.tile(af.min(distances, axis=1), (1, K))) + # cluster_assignments = af.range((NSAMPLES, K))[distances == af.min(distances, axis=1)] + + return cluster_assignments + + def update_centroids(self, cluster_assignments): + """ + Recalculates the centroids based on the mean of the assigned data points. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + cluster_assignments (np.ndarray): An array of cluster assignments. + k (int): The number of clusters. + + Returns: + np.ndarray: Updated centroids (k, n_features). + """ + new_centroids = af.constant(0, (K, self.data.shape[1])) + for i in range(K): + points_in_cluster = self.data[:, af.where(cluster_assignments == i)] + if len(points_in_cluster) > 0: + new_centroids[i] = af.mean(points_in_cluster, axis=0) + return new_centroids + + def kmeans(self): + """ + Performs the K-Means clustering algorithm. 
+ + Args: + data (np.ndarray): The input data points (n_samples, n_features). + k (int): The number of clusters. + max_iterations (int): Maximum number of iterations to run the algorithm. + tolerance (float): The tolerance for convergence (change in centroids). + + Returns: + tuple: A tuple containing: + - np.ndarray: Final centroids (k, n_features). + - np.ndarray: Final cluster assignments for each data point (n_samples,). + """ + centroids = self.initialize_centroids() + cluster_assignments = None + + for i in range(ITERATIONS): + old_centroids = af.copy_array(centroids) + + # E-step: Assign points to clusters + cluster_assignments = self.assign_to_clusters(centroids) + + # M-step: Update centroids + centroids = self.update_centroids(cluster_assignments) + + # Check for convergence + if af.norm(centroids - old_centroids) < TOLERANCE: + break + + af.eval(centroids) + af.eval(cluster_assignments) + af.sync() + return centroids, cluster_assignments \ No newline at end of file diff --git a/benchmarks/src/pytest_benchmark/test_linalg.py b/benchmarks/src/pytest_benchmark/test_linalg.py new file mode 100644 index 0000000..5077a96 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_linalg.py @@ -0,0 +1,297 @@ +# cython: language_level=3 +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2016-2024, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from common import * + +ITERATIONS = 1 + +eps = 1e-3 + +def generate_arrays(pkgid, count, posdef = False): + arr_list = [] + pkg = PKGDICT[pkgid] + pkg = pkg.__name__ + if "cupy" == pkg: + for i in range(count): + x = cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE) + if posdef: + x = x @ x.T + x.T @ x + eps + arr_list.append(x) + cupy.cuda.runtime.deviceSynchronize() + elif "arrayfire" == pkg: + for i in range(count): + x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE)) + if posdef: + x = af.matmul(x, x.T) + af.matmul(x.T, x) + eps + af.eval(x) + arr_list.append(x) + af.sync() + elif "dpnp" == pkg: + for i in range(count): + x = dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE) + if posdef: + x = x @ x.T + x.T @ x + eps + arr_list.append(x) + elif "numpy" == pkg: + for i in range(count): + x = np.random.rand(NSIZE, NSIZE).astype(DTYPE) + if posdef: + x = x @ x.T + x.T @ x + eps + arr_list.append(x) + + return arr_list + +def svd_np(arr): + return np.linalg.svd(arr) + +def svd_dpnp(arr): + return dpnp.linalg.svd(arr) + +def svd_af(arr): + x = af.svd(arr) + for r in x: + af.eval(r) + af.sync() + return x + +def svd_cupy(arr): + x = cupy.linalg.svd(arr) + cupy.cuda.runtime.deviceSynchronize() + return x + +def qr_np(arr): + return np.linalg.qr(arr) + +def qr_dpnp(arr): + return dpnp.linalg.qr(arr) + +def qr_af(arr): + x = af.qr(arr) + for r in x: + af.eval(r) + af.sync() + return x + +def qr_cupy(arr): + x = cupy.linalg.qr(arr) + cupy.cuda.runtime.deviceSynchronize() + return x + +def cholesky_np(arr): + return np.linalg.cholesky(arr) + +def cholesky_dpnp(arr): + return dpnp.linalg.cholesky(arr) + +def cholesky_af(arr): + x, info = af.cholesky(arr) + af.eval(x) + af.sync() + return x + +def cholesky_cupy(arr): + x = cupy.linalg.cholesky(arr) + cupy.cuda.runtime.deviceSynchronize() + return x + +def qr_cupy(arr): + x = cupy.linalg.qr(arr) + cupy.cuda.runtime.deviceSynchronize() + return x + +def inv_np(arr): + return np.linalg.inv(arr) + +def inv_dpnp(arr): + return dpnp.linalg.inv(arr) + +def inv_af(arr): + x, info = af.inverse(arr) + af.eval(x) + af.sync() + return x + +def inv_cupy(arr): + x = cupy.linalg.inv(arr) + cupy.cuda.runtime.deviceSynchronize() + return x + +def det_np(arr): + return np.linalg.det(arr) + +def det_dpnp(arr): + return dpnp.linalg.det(arr) + +def det_af(arr): + x = af.det(arr) + af.sync() + return x + +def det_cupy(arr): + x = cupy.linalg.det(arr) + cupy.cuda.runtime.deviceSynchronize() + return x + +def norm_np(arr): + return np.linalg.norm(arr) + +def norm_dpnp(arr): + return dpnp.linalg.norm(arr) + +def norm_af(arr): + x = af.norm(arr) + af.sync() + return x + +def norm_cupy(arr): + x = cupy.linalg.norm(arr) + cupy.cuda.runtime.deviceSynchronize() + return x + +@pytest.mark.parametrize( + "pkgid", IDS, ids=IDS +) +class TestLinalg: + def test_cholesky(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1, True), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + CHOLESKY_FUNCS = { + "numpy": cholesky_np, + "cupy": cholesky_cupy, + "arrayfire": cholesky_af, + "dpnp": cholesky_dpnp + } + result = benchmark.pedantic( + target=CHOLESKY_FUNCS[pkg.__name__], + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_svd(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + 
pkg = PKGDICT[pkgid] + + SVD_FUNCS = { + "numpy": svd_np, + "cupy": svd_cupy, + "arrayfire": svd_af, + "dpnp": svd_dpnp + } + result = benchmark.pedantic( + target=SVD_FUNCS[pkg.__name__], + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_qr(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + QR_FUNCS = { + "numpy": qr_np, + "cupy": qr_cupy, + "arrayfire": qr_af, + "dpnp": qr_dpnp + } + result = benchmark.pedantic( + target=QR_FUNCS[pkg.__name__], + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_inv(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + INV_FUNCS = { + "numpy": inv_np, + "cupy": inv_cupy, + "arrayfire": inv_af, + "dpnp": inv_dpnp + } + result = benchmark.pedantic( + target=INV_FUNCS[pkg.__name__], + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_det(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + DET_FUNCS = { + "numpy": det_np, + "cupy": det_cupy, + "arrayfire": det_af, + "dpnp": det_dpnp + } + result = benchmark.pedantic( + target=DET_FUNCS[pkg.__name__], + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_norm(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + NORM_FUNCS = { + "numpy": norm_np, + "cupy": norm_cupy, + "arrayfire": norm_af, + "dpnp": norm_dpnp + } + result = benchmark.pedantic( + target=NORM_FUNCS[pkg.__name__], + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) \ No newline at end of file diff --git a/benchmarks/src/pytest_benchmark/test_mandelbrot.py b/benchmarks/src/pytest_benchmark/test_mandelbrot.py new file mode 100644 index 0000000..60c730a --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_mandelbrot.py @@ -0,0 +1,187 @@ +# SPDX-FileCopyrightText: 2017 Nicolas P. 
+# SPDX-FileCopyrightText: 2021 ETH Zurich and the NPBench authors
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+# more information at https://github.com/rougier/numpy-book
+
+from common import *
+
+xmin = -2
+xmax = 2
+ymin = -2
+ymax = 2
+xn = NSIZE
+yn = NSIZE
+itermax = 20
+horizon = 2.0
+
+@pytest.mark.parametrize(
+    "pkgid", IDS, ids=IDS
+)
+class TestMandelbrot:
+    def test_mandelbrot(self, benchmark, pkgid):
+        initialize_package(pkgid)
+        pkg = PKGDICT[pkgid]
+
+        benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} grid iterated {itermax}x"
+        result = benchmark.pedantic(
+            target=FUNCS[pkg.__name__],
+            rounds=ROUNDS,
+            iterations=1
+        )
+
+def mandelbrot_np():
+    # Adapted from
+    # https://thesamovar.wordpress.com/2009/03/22/fast-fractals-with-python-and-numpy/
+    Xi, Yi = np.mgrid[0:xn, 0:yn]
+    X = np.linspace(xmin, xmax, xn, dtype=np.float64)[Xi]
+    Y = np.linspace(ymin, ymax, yn, dtype=np.float64)[Yi]
+    C = X + Y * 1j
+
+    N_ = np.zeros(C.shape, dtype=np.int64)
+    Z_ = np.zeros(C.shape, dtype=np.complex128)
+    Xi.shape = Yi.shape = C.shape = xn * yn
+
+    Z = np.zeros(C.shape, np.complex128)
+    for i in range(itermax):
+        if not len(Z):
+            break
+
+        # Compute for relevant points only
+        np.multiply(Z, Z, Z)
+        np.add(Z, C, Z)
+
+        # Failed convergence
+        I = abs(Z) > horizon  # noqa: E741 math variable
+        N_[Xi[I], Yi[I]] = i + 1
+        Z_[Xi[I], Yi[I]] = Z[I]
+
+        # Keep going with those who have not diverged yet
+        np.logical_not(I, I)  # np.negative(I, I) not working any longer
+        Z = Z[I]
+        Xi, Yi = Xi[I], Yi[I]
+        C = C[I]
+    return Z_.T, N_.T
+
+def mandelbrot_dpnp():
+    # Adapted from
+    # https://thesamovar.wordpress.com/2009/03/22/fast-fractals-with-python-and-numpy/
+    Xi, Yi = dpnp.mgrid[0:xn, 0:yn]
+    X = dpnp.linspace(xmin, xmax, xn, dtype=dpnp.float64)[Xi]
+    Y = dpnp.linspace(ymin, ymax, yn, dtype=dpnp.float64)[Yi]
+    C = X + Y * 1j
+    N_ = dpnp.zeros(C.shape, dtype=dpnp.int64)
+    Z_ = dpnp.zeros(C.shape, dtype=dpnp.complex128)
+    # reshape returns a new array, so the results must be assigned back
+    Xi = Xi.reshape(xn * yn)
+    Yi = Yi.reshape(xn * yn)
+    C = C.reshape(xn * yn)
+
+    Z = dpnp.zeros(C.shape, dtype=dpnp.complex128)
+    for i in range(itermax):
+        if not len(Z):
+            break
+
+        # Compute for relevant points only
+        dpnp.multiply(Z, Z, Z)
+        dpnp.add(Z, C, Z)
+
+        # Failed convergence
+        I = abs(Z) > horizon  # noqa: E741 math variable
+        N_[Xi[I], Yi[I]] = i + 1
+        Z_[Xi[I], Yi[I]] = Z[I]
+
+        # Keep going with those who have not diverged yet
+        dpnp.logical_not(I, I)  # dpnp.negative(I, I) not working any longer
+        Z = Z[I]
+        Xi, Yi = Xi[I], Yi[I]
+        C = C[I]
+
+    Z_ = Z_.T
+    N_ = N_.T
+
+    return Z_, N_
+
+def mandelbrot_cupy():
+    # Adapted from
+    # https://thesamovar.wordpress.com/2009/03/22/fast-fractals-with-python-and-numpy/
+    Xi, Yi = cupy.mgrid[0:xn, 0:yn]
+    X = cupy.linspace(xmin, xmax, xn, dtype=cupy.float64)[Xi]
+    Y = cupy.linspace(ymin, ymax, yn, dtype=cupy.float64)[Yi]
+    C = X + Y * 1j
+    N_ = cupy.zeros(C.shape, dtype=cupy.int64)
+    Z_ = cupy.zeros(C.shape, dtype=cupy.complex128)
+    Xi.shape = Yi.shape = C.shape = xn * yn
+
+    Z = cupy.zeros(C.shape, cupy.complex128)
+    for i in range(itermax):
+        if not len(Z):
+            break
+
+        # Compute for relevant points only
+        cupy.multiply(Z, Z, Z)
+        cupy.add(Z, C, Z)
+
+        # Failed convergence
+        I = abs(Z) > horizon  # noqa: E741 math variable
+        N_[Xi[I], Yi[I]] = i + 1
+        Z_[Xi[I], Yi[I]] = Z[I]
+
+        # Keep going with those who have not diverged yet
+        cupy.logical_not(I, I)  # cupy.negative(I, I) not working any longer
+        Z = Z[I]
+        Xi, Yi = Xi[I], Yi[I]
+        C = C[I]
+
+    Z_ = Z_.T
+    N_ = N_.T
+
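+    # Block until the GPU has finished so the benchmark measures the actual
+    # kernel work rather than just the asynchronous dispatch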
+    cupy.cuda.runtime.deviceSynchronize()
+    return Z_, N_
+
+def mandelbrot_af():
+    Xi = af.flat(af.range((xn, yn), axis=0, dtype=af.int64))
+    Yi = af.flat(af.range((xn, yn), axis=1, dtype=af.int64))
+    X = af.iota((xn, 1), tile_shape=(1, yn), dtype=af.float64) * (xmax - xmin) / (xn - 1) + xmin
+    Y = af.iota((1, yn), tile_shape=(xn, 1), dtype=af.float64) * (ymax - ymin) / (yn - 1) + ymin
+
+    C = af.cplx(X, Y)
+    N_ = af.constant(0, (xn, yn))
+    Z_ = af.constant(0, (xn, yn), dtype=af.complex64)
+    Z = af.constant(0, (xn, yn), dtype=af.complex64)
+    for i in range(itermax):
+        if not len(Z):
+            break
+
+        # Compute for relevant points only
+        Z = Z * Z
+        Z = Z + C
+
+        # Failed convergence
+        I = af.abs(Z) > horizon  # noqa: E741 math variable
+
+        if not af.any_true(I):
+            break
+
+        N_[Xi[I] * yn + Yi[I]] = i + 1
+        Z_[Xi[I] * yn + Yi[I]] = Z[I]
+
+        # Keep going with those who have not diverged yet
+        I = af.logical_not(I)
+        Z = Z[I]
+        Xi = Xi[I]
+        Yi = Yi[I]
+        C = C[I]
+
+    Z_ = Z_.T
+    N_ = N_.T
+    af.eval(Z_)
+    af.eval(N_)
+    af.sync()
+    return Z_, N_
+
+FUNCS = {
+    "dpnp": mandelbrot_dpnp,
+    "numpy": mandelbrot_np,
+    "cupy": mandelbrot_cupy,
+    "arrayfire": mandelbrot_af
+}
\ No newline at end of file
diff --git a/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py b/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py
new file mode 100644
index 0000000..389d4e4
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py
@@ -0,0 +1,52 @@
+from common import *
+
+ITERATIONS = 1
+
+@pytest.mark.parametrize(
+    "pkgid", IDS, ids=IDS
+)
+class TestPi:
+    def test_pi(self, benchmark, pkgid):
+        initialize_package(pkgid)
+        pkg = PKGDICT[pkgid]
+
+        benchmark.extra_info["description"] = f"{NNSIZE:.2e} Samples"
+        result = benchmark.pedantic(
+            target=FUNCS[pkg.__name__],
+            rounds=ROUNDS,
+            iterations=ITERATIONS,
+            args=[NNSIZE]
+        )
+
+# Defining the predicate at module level is faster than using a lambda inside
+# the benchmarked functions
+def in_circle(x, y):
+    return (x*x + y*y) < 1
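+
+# Monte-Carlo estimate: for (x, y) uniform on [0, 1)^2, the probability of
+# landing inside the quarter unit circle is pi/4, so
+#     pi ~= 4 * count(x^2 + y^2 < 1) / samples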
+
+def calc_pi_af(samples):
+    x = af.randu(samples)
+    y = af.randu(samples)
+    result = 4 * af.sum(in_circle(x, y)) / samples
+
+    af.sync()
+
+    return result
+
+def calc_pi_numpy(samples):
+    x = np.random.rand(samples).astype(np.float32)
+    y = np.random.rand(samples).astype(np.float32)
+    return 4. * np.sum(in_circle(x, y)) / samples
+
+def calc_pi_cupy(samples):
+    x = cupy.random.rand(samples, dtype=np.float32)
+    y = cupy.random.rand(samples, dtype=np.float32)
+    res = 4. * cupy.sum(in_circle(x, y)) / samples
+    cupy.cuda.runtime.deviceSynchronize()
+    return res
+
+def calc_pi_dpnp(samples):
+    x = dpnp.random.rand(samples).astype(dpnp.float32)
+    y = dpnp.random.rand(samples).astype(dpnp.float32)
+    return 4. * dpnp.sum(in_circle(x, y)) / samples
+
+FUNCS = {
+    "dpnp": calc_pi_dpnp,
+    "numpy": calc_pi_numpy,
+    "cupy": calc_pi_cupy,
+    "arrayfire": calc_pi_af
+}
\ No newline at end of file
diff --git a/benchmarks/src/pytest_benchmark/test_nbody.py b/benchmarks/src/pytest_benchmark/test_nbody.py
new file mode 100644
index 0000000..d523ee9
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/test_nbody.py
@@ -0,0 +1,113 @@
+from common import *
+
+ITERATIONS = 1
+G = 1
+dt = 1e-3
+softening = 1e-4
+M = 1
+
+@pytest.mark.parametrize(
+    "pkgid", IDS, ids=IDS
+)
+class TestNbody:
+    def test_nbody(self, benchmark, pkgid):
+        pkg = PKGDICT[pkgid]
+        setup = lambda: (generate_arrays(pkgid), {})
+
+        benchmark.extra_info["description"] = f"{NSIZE:.2e} Bodies"
+        result = benchmark.pedantic(
+            target=nbody,
+            setup=setup,
+            rounds=ROUNDS,
+            iterations=ITERATIONS
+        )
+
+def acceleration(pkg, mass, pos):
+    x = pos[:, 0:1]
+    y = pos[:, 1:2]
+    z = pos[:, 2:3]
+
+    # matrix that stores all pairwise particle separations: r_j - r_i
+    dx = x.T - x
+    dy = y.T - y
+    dz = z.T - z
+
+    # matrix that stores 1/r^3 for all pairwise particle separations
+    inv_r3 = dx**2 + dy**2 + dz**2 + softening**2
+    inv_r3[inv_r3 > 0] = inv_r3[inv_r3 > 0] ** (-1.5)
+
+    ax = G * (dx * inv_r3) @ mass
+    ay = G * (dy * inv_r3) @ mass
+    az = G * (dz * inv_r3) @ mass
+
+    return pkg.hstack((ax, ay, az))
+
+def acceleration_af(mass, pos):
+    x = pos[:, 0:1]
+    y = pos[:, 1:2]
+    z = pos[:, 2:3]
+
+    # matrix that stores all pairwise particle separations: r_j - r_i
+    dx = af.moddims(x, (1, x.shape[0])) - x
+    dy = af.moddims(y, (1, x.shape[0])) - y
+    dz = af.moddims(z, (1, x.shape[0])) - z
+
+    # matrix that stores 1/r^3 for all pairwise particle separations
+    # inv_r3 = dx**2 + dy**2 + dz**2 + softening**2
+    inv_r3 = dx*dx + dy*dy + dz*dz + softening*softening
+    inv_r3 = af.pow(inv_r3, -1.5)
+
+    ax = G * af.matmul((dx * inv_r3), mass)
+    ay = G * af.matmul((dy * inv_r3), mass)
+    az = G * af.matmul((dz * inv_r3), mass)
+
+    return af.join(1, ax, ay, az)
+
+def nbody(pkg, mass, pos, vel):
+    vel -= pkg.mean(mass * vel, axis=0) / pkg.mean(mass)
+
+    acc = None
+    if pkg.__name__ == 'arrayfire':
+        acc = acceleration_af(mass, pos)
+    else:
+        acc = acceleration(pkg, mass, pos)
+
+    # Kick-drift-kick step; note that the acceleration is not recomputed
+    # between the half-kicks, so every package performs the same fixed amount
+    # of work per step
+    for i in range(ITERATIONS):
+        vel += acc * dt / 2.0
+        pos += vel * dt
+        vel += acc * dt / 2.0
+
+    energy = 0.5 * pkg.sum(mass * vel ** 2)
+
+    return float(energy)
+
+def generate_arrays(pkgid):
+    arr_list = []
+    pkg = PKGDICT[pkgid]
+
+    initialize_package(pkgid)
+    pkgname = pkg.__name__
+    count = 2
+    if "cupy" == pkgname:
+        arr_list.append(M * cupy.ones((NSIZE, 1), dtype=DTYPE))
+        for i in range(count):
+            arr_list.append(cupy.random.rand(NSIZE, 3, dtype=DTYPE))
+        cupy.cuda.runtime.deviceSynchronize()
+    elif "arrayfire" == pkgname:
+        af.device_gc()
+        arr_list.append(M * af.constant(1, (NSIZE, 1), dtype=getattr(af, DTYPE)))
+        for i in range(count):
+            arr_list.append(af.randu((NSIZE, 3), dtype=getattr(af, DTYPE)))
+        for arr in arr_list:
+            af.eval(arr)
+        af.sync()
+    elif "dpnp" == pkgname:
+        arr_list.append(M * dpnp.ones((NSIZE, 1), dtype=DTYPE))
+        for i in range(count):
+            arr_list.append(dpnp.random.rand(NSIZE, 3).astype(DTYPE))
+    elif "numpy" == pkgname:
+        arr_list.append(M * np.ones((NSIZE, 1), dtype=DTYPE))
+        for i in range(count):
+            arr_list.append(np.random.rand(NSIZE, 3).astype(DTYPE))
+
+    return (pkg, arr_list[0], arr_list[1], arr_list[2])
\ No newline at end of file
diff --git a/benchmarks/src/pytest_benchmark/test_nn.py b/benchmarks/src/pytest_benchmark/test_nn.py
new file mode 100644
index 0000000..f7df176
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/test_nn.py
@@ -0,0 +1,371 @@
+from common import *
+
+INPUT_SIZE = 28 * 28
+HIDDEN_SIZE = 1000
+OUTPUT_SIZE = 10
+LEARNING_RATE = 0.01
+ITERATIONS = 10
+BATCH_SIZE = 2560
+SAMPLES = 25600
+
+@pytest.mark.parametrize(
+    "pkgid", IDS, ids=IDS
+)
+class TestNeuralNetwork:
+    def test_neural_network(self, benchmark, pkgid):
+        initialize_package(pkgid)
+        pkg = PKGDICT[pkgid]
+        nn = {
+            "dpnp": NeuralNetwork_dpnp,
+            "numpy": NeuralNetwork_numpy,
+            "cupy": NeuralNetwork_cupy,
+            "arrayfire": NeuralNetwork_af
+        }
+
+        obj = nn[pkg.__name__]()
+
+        benchmark.extra_info["description"] = f"{INPUT_SIZE}x{HIDDEN_SIZE}x{OUTPUT_SIZE} trained with {SAMPLES:.2e} samples"
+        result = benchmark.pedantic(
+            target=obj.train,
+            rounds=ROUNDS,
+            iterations=1
+        )
+
+
+class NeuralNetwork_numpy:
+    def __init__(self):
+        self.input_size = INPUT_SIZE
+        self.hidden_size = HIDDEN_SIZE
+        self.output_size = OUTPUT_SIZE
+        self.learning_rate = LEARNING_RATE
+
+        # Initialize weights and biases
+        # He initialization (for ReLU) is often a good choice
+        self.W1 = np.random.randn(self.input_size, self.hidden_size) * np.sqrt(2. / self.input_size)
+        self.b1 = np.zeros((1, self.hidden_size))
+        self.W2 = np.random.randn(self.hidden_size, self.output_size) * np.sqrt(2. / self.hidden_size)
+        self.b2 = np.zeros((1, self.output_size))
+
+        self.X_train = np.random.rand(SAMPLES, INPUT_SIZE)
+        self.y_train = np.zeros((SAMPLES * OUTPUT_SIZE))
+        self.y_train[np.arange(SAMPLES) * OUTPUT_SIZE + np.floor(np.random.rand(SAMPLES) * OUTPUT_SIZE).astype(int)] = 1
+        self.y_train = self.y_train.reshape((SAMPLES, OUTPUT_SIZE))
+
+    def relu(self, x):
+        return np.maximum(0, x)
+
+    def relu_derivative(self, x):
+        return (x > 0).astype(float)
+
+    def softmax(self, x):
+        exp_scores = np.exp(x - np.max(x, axis=1, keepdims=True))  # Subtract max for numerical stability
+        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
+
+    def forward(self, X):
+        # Hidden layer
+        self.z1 = np.dot(X, self.W1) + self.b1
+        self.a1 = self.relu(self.z1)
+
+        # Output layer
+        self.z2 = np.dot(self.a1, self.W2) + self.b2
+        self.a2 = self.softmax(self.z2)
+        return self.a2
+
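+    # Backprop for softmax + cross-entropy: the output-layer error simplifies
+    # to (a2 - y), giving dW2 = a1^T (a2 - y) and
+    # dW1 = X^T ((a2 - y) W2^T * relu'(z1)), with bias gradients summed over
+    # the batch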
+    def backward(self, X, y, output):
+        # Calculate gradients for the output layer
+        error_output = output - y  # Derivative of cross-entropy loss w.r.t. softmax input
+        dW2 = np.dot(self.a1.T, error_output)
+        db2 = np.sum(error_output, axis=0, keepdims=True)
+
+        # Calculate gradients for the hidden layer
+        error_hidden = np.dot(error_output, self.W2.T) * self.relu_derivative(self.z1)
+        dW1 = np.dot(X.T, error_hidden)
+        db1 = np.sum(error_hidden, axis=0, keepdims=True)
+
+        # Update weights and biases
+        self.W2 -= self.learning_rate * dW2
+        self.b2 -= self.learning_rate * db2
+        self.W1 -= self.learning_rate * dW1
+        self.b1 -= self.learning_rate * db1
+
+    def train(self):
+        X_train = self.X_train
+        y_train = self.y_train
+
+        num_samples = X_train.shape[0]
+
+        for epoch in range(ITERATIONS):
+            # Data is consumed in order; per-epoch shuffling is omitted so
+            # that every package performs identical work
+            X_shuffled = X_train
+            y_shuffled = y_train
+
+            for i in range(0, num_samples, BATCH_SIZE):
+                X_batch = X_shuffled[i:i + BATCH_SIZE, :]
+                y_batch = y_shuffled[i:i + BATCH_SIZE, :]
+
+                # Forward pass
+                output = self.forward(X_batch)
+
+                # Backward pass and update weights
+                self.backward(X_batch, y_batch, output)
+
+    def predict(self, X):
+        return np.argmax(self.forward(X), axis=1)
+
+
+class NeuralNetwork_dpnp:
+    def __init__(self):
+        self.input_size = INPUT_SIZE
+        self.hidden_size = HIDDEN_SIZE
+        self.output_size = OUTPUT_SIZE
+        self.learning_rate = LEARNING_RATE
+
+        # Initialize weights and biases
+        # He initialization (for ReLU) is often a good choice
+        self.W1 = dpnp.random.randn(self.input_size, self.hidden_size) * np.sqrt(2. / self.input_size)
+        self.b1 = dpnp.zeros((1, self.hidden_size))
+        self.W2 = dpnp.random.randn(self.hidden_size, self.output_size) * np.sqrt(2. / self.hidden_size)
+        self.b2 = dpnp.zeros((1, self.output_size))
+
+        self.X_train = dpnp.random.rand(SAMPLES, INPUT_SIZE)
+        self.y_train = dpnp.zeros((SAMPLES * OUTPUT_SIZE))
+        self.y_train[dpnp.arange(SAMPLES) * OUTPUT_SIZE + dpnp.floor(dpnp.random.rand(SAMPLES) * OUTPUT_SIZE).astype(int)] = 1
+        self.y_train = self.y_train.reshape((SAMPLES, OUTPUT_SIZE))
+
+    def relu(self, x):
+        return dpnp.maximum(0, x)
+
+    def relu_derivative(self, x):
+        return (x > 0).astype(float)
+
+    def softmax(self, x):
+        exp_scores = dpnp.exp(x - dpnp.max(x, axis=1, keepdims=True))  # Subtract max for numerical stability
+        return exp_scores / dpnp.sum(exp_scores, axis=1, keepdims=True)
+
+    def forward(self, X):
+        # Hidden layer
+        self.z1 = dpnp.dot(X, self.W1) + self.b1
+        self.a1 = self.relu(self.z1)
+
+        # Output layer
+        self.z2 = dpnp.dot(self.a1, self.W2) + self.b2
+        self.a2 = self.softmax(self.z2)
+        return self.a2
+
+    def backward(self, X, y, output):
+        # Calculate gradients for the output layer
+        error_output = output - y  # Derivative of cross-entropy loss w.r.t. softmax input
+        dW2 = dpnp.dot(self.a1.T, error_output)
+        db2 = dpnp.sum(error_output, axis=0, keepdims=True)
+
+        # Calculate gradients for the hidden layer
+        error_hidden = dpnp.dot(error_output, self.W2.T) * self.relu_derivative(self.z1)
+        dW1 = dpnp.dot(X.T, error_hidden)
+        db1 = dpnp.sum(error_hidden, axis=0, keepdims=True)
+
+        # Update weights and biases
+        self.W2 -= self.learning_rate * dW2
+        self.b2 -= self.learning_rate * db2
+        self.W1 -= self.learning_rate * dW1
+        self.b1 -= self.learning_rate * db1
+
+    def train(self):
+        X_train = self.X_train
+        y_train = self.y_train
+
+        num_samples = X_train.shape[0]
+
+        for epoch in range(ITERATIONS):
+            # Data is consumed in order; per-epoch shuffling is omitted so
+            # that every package performs identical work
+            X_shuffled = X_train
+            y_shuffled = y_train
+
+            for i in range(0, num_samples, BATCH_SIZE):
+                X_batch = X_shuffled[i:i + BATCH_SIZE, :]
+                y_batch = y_shuffled[i:i + BATCH_SIZE, :]
+
+                # Forward pass
+                output = self.forward(X_batch)
+
+                # Backward pass and update weights
+                self.backward(X_batch, y_batch, output)
+
+    def predict(self, X):
+        return dpnp.argmax(self.forward(X), axis=1)
+
+class NeuralNetwork_cupy:
+    def __init__(self):
+        self.input_size = INPUT_SIZE
+        self.hidden_size = HIDDEN_SIZE
+        self.output_size = OUTPUT_SIZE
+        self.learning_rate = LEARNING_RATE
+
+        # Initialize weights and biases
+        # He initialization (for ReLU) is often a good choice
+        self.W1 = cupy.random.randn(self.input_size, self.hidden_size) * np.sqrt(2. / self.input_size)
+        self.b1 = cupy.zeros((1, self.hidden_size))
+        self.W2 = cupy.random.randn(self.hidden_size, self.output_size) * np.sqrt(2. / self.hidden_size)
+        self.b2 = cupy.zeros((1, self.output_size))
+
+        self.X_train = cupy.random.rand(SAMPLES, INPUT_SIZE)
+        self.y_train = cupy.zeros((SAMPLES * OUTPUT_SIZE))
+        self.y_train[cupy.arange(SAMPLES) * OUTPUT_SIZE + cupy.floor(cupy.random.rand(SAMPLES) * OUTPUT_SIZE).astype(int)] = 1
+        self.y_train = self.y_train.reshape((SAMPLES, OUTPUT_SIZE))
+
+        cupy.cuda.runtime.deviceSynchronize()
+
+    def relu(self, x):
+        return cupy.maximum(0, x)
+
+    def relu_derivative(self, x):
+        return (x > 0).astype(float)
+
+    def softmax(self, x):
+        exp_scores = cupy.exp(x - cupy.max(x, axis=1, keepdims=True))  # Subtract max for numerical stability
+        return exp_scores / cupy.sum(exp_scores, axis=1, keepdims=True)
+
+    def forward(self, X):
+        # Hidden layer
+        self.z1 = cupy.dot(X, self.W1) + self.b1
+        self.a1 = self.relu(self.z1)
+
+        # Output layer
+        self.z2 = cupy.dot(self.a1, self.W2) + self.b2
+        self.a2 = self.softmax(self.z2)
+        return self.a2
+
+    def backward(self, X, y, output):
+        # Calculate gradients for the output layer
+        error_output = output - y  # Derivative of cross-entropy loss w.r.t. softmax input
+        dW2 = cupy.dot(self.a1.T, error_output)
+        db2 = cupy.sum(error_output, axis=0, keepdims=True)
+
+        # Calculate gradients for the hidden layer
+        error_hidden = cupy.dot(error_output, self.W2.T) * self.relu_derivative(self.z1)
+        dW1 = cupy.dot(X.T, error_hidden)
+        db1 = cupy.sum(error_hidden, axis=0, keepdims=True)
+
+        # Update weights and biases
+        self.W2 -= self.learning_rate * dW2
+        self.b2 -= self.learning_rate * db2
+        self.W1 -= self.learning_rate * dW1
+        self.b1 -= self.learning_rate * db1
+
+    def train(self):
+        X_train = self.X_train
+        y_train = self.y_train
+
+        num_samples = X_train.shape[0]
+
+        for epoch in range(ITERATIONS):
+            # Data is consumed in order; per-epoch shuffling is omitted so
+            # that every package performs identical work
+            X_shuffled = X_train
+            y_shuffled = y_train
+
+            for i in range(0, num_samples, BATCH_SIZE):
+                X_batch = X_shuffled[i:i + BATCH_SIZE, :]
+                y_batch = y_shuffled[i:i + BATCH_SIZE, :]
+
+                # Forward pass
+                output = self.forward(X_batch)
+
+                # Backward pass and update weights
+                self.backward(X_batch, y_batch, output)
+
+        cupy.cuda.runtime.deviceSynchronize()
+
+    def predict(self, X):
+        return cupy.argmax(self.forward(X), axis=1)
+
+
+class NeuralNetwork_af:
+    def __init__(self):
+        self.input_size = INPUT_SIZE
+        self.hidden_size = HIDDEN_SIZE
+        self.output_size = OUTPUT_SIZE
+        self.learning_rate = LEARNING_RATE
+
+        # Initialize weights and biases
+        # He initialization (for ReLU) is often a good choice
+        self.W1 = af.randn((self.input_size, self.hidden_size)) * np.sqrt(2. / self.input_size)
+        self.b1 = af.constant(0, (1, self.hidden_size))
+        self.W2 = af.randn((self.hidden_size, self.output_size)) * np.sqrt(2. / self.hidden_size)
+        self.b2 = af.constant(0, (1, self.output_size))
+
+        self.X_train = af.randu((SAMPLES, INPUT_SIZE))
+        self.y_train = af.constant(0, (SAMPLES, OUTPUT_SIZE))
+        self.y_train[af.iota(SAMPLES), af.floor(af.randu(SAMPLES) * OUTPUT_SIZE)] = 1
+
+        af.eval(self.X_train)
+        af.eval(self.y_train)
+        af.eval(self.W1)
+        af.eval(self.W2)
+        af.eval(self.b1)
+        af.eval(self.b2)
+        af.sync()
+
+    def relu(self, x):
+        selection = x > 0
+        return af.select(x, 0, selection)
+
+    def relu_derivative(self, x):
+        return af.cast(x > 0, getattr(af, DTYPE))
+
+    def softmax(self, x):
+        exp_scores = af.exp(x - af.max(x, axis=1))  # Subtract max for numerical stability
+        return exp_scores / af.sum(exp_scores, axis=1)
+
+    def forward(self, X):
+        # Hidden layer
+        self.z1 = af.matmul(X, self.W1) + self.b1
+        self.a1 = self.relu(self.z1)
+
+        # Output layer
+        self.z2 = af.matmul(self.a1, self.W2) + self.b2
+        self.a2 = self.softmax(self.z2)
+        return self.a2
+
+    def backward(self, X, y, output):
+        # Calculate gradients for the output layer
+        error_output = output - y  # Derivative of cross-entropy loss w.r.t. softmax input
+        dW2 = af.matmul(self.a1.T, error_output)
+        db2 = af.sum(error_output, axis=0)
+
+        # Calculate gradients for the hidden layer
+        error_hidden = af.matmul(error_output, self.W2.T) * self.relu_derivative(self.z1)
+        dW1 = af.matmul(X.T, error_hidden)
+        db1 = af.sum(error_hidden, axis=0)
+
+        # Update weights and biases
+        self.W2 -= self.learning_rate * dW2
+        self.b2 -= self.learning_rate * db2
+        self.W1 -= self.learning_rate * dW1
+        self.b1 -= self.learning_rate * db1
+
+    def train(self):
+        X_train = self.X_train
+        y_train = self.y_train
+
+        num_samples = X_train.shape[0]
+
+        for epoch in range(ITERATIONS):
+            # Data is consumed in order; per-epoch shuffling is omitted so
+            # that every package performs identical work
+            X_shuffled = X_train
+            y_shuffled = y_train
+
+            for i in range(0, num_samples, BATCH_SIZE):
+                X_batch = X_shuffled[i:i + BATCH_SIZE, :]
+                y_batch = y_shuffled[i:i + BATCH_SIZE, :]
+
+                # Forward pass
+                output = self.forward(X_batch)
+
+                # Backward pass and update weights
+                self.backward(X_batch, y_batch, output)
+
+        af.eval(self.W2)
+        af.eval(self.b2)
+        af.eval(self.W1)
+        af.eval(self.b1)
+        af.sync()
+
+    def predict(self, X):
+        # Flat indices of the per-row maxima of the network output
+        out = self.forward(X)
+        return af.where(out == af.tile(af.max(out, axis=1), (1, out.shape[1])))
\ No newline at end of file
diff --git a/benchmarks/src/pytest_benchmark/test_random.py b/benchmarks/src/pytest_benchmark/test_random.py
new file mode 100644
index 0000000..d74345f
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/test_random.py
@@ -0,0 +1,65 @@
+from common import *
+
+ITERATIONS = 20
+
+def randn_np():
+    arr = np.random.normal(size=(NNSIZE))
+
+def randn_dpnp():
+    arr = dpnp.random.normal(size=(NNSIZE))
+
+def randn_cupy():
+    arr = cupy.random.normal(size=(NNSIZE))
+    cupy.cuda.runtime.deviceSynchronize()
+
+def randn_af():
+    arr = af.randn((NNSIZE))
+    af.eval(arr)
+    af.sync()
+
+def randu_np():
+    arr = np.random.uniform(size=(NNSIZE))
+
+def randu_dpnp():
+    arr = dpnp.random.uniform(size=(NNSIZE))
+
+def randu_cupy():
+    arr = cupy.random.uniform(size=(NNSIZE))
+    cupy.cuda.runtime.deviceSynchronize()
+
+def randu_af():
+    arr = af.randu((NNSIZE))
+    af.eval(arr)
+    af.sync()
+
+@pytest.mark.parametrize(
+    "pkgid", IDS, ids=IDS
+)
+class TestRandom:
+    def test_normal(self, benchmark, pkgid):
+        initialize_package(pkgid)
+
+        pkg = PKGDICT[pkgid]
+        FUNCS = {
+            "dpnp": randn_dpnp,
+            "numpy": randn_np,
+            "cupy": randn_cupy,
+            "arrayfire": randn_af
+        }
+
+        benchmark.extra_info["description"] = f"{NNSIZE:.2e} Samples"
+        result = benchmark.pedantic(
+            target=FUNCS[pkg.__name__],
+            rounds=ROUNDS,
+            iterations=ITERATIONS
+        )
+
+
+    def test_uniform(self, benchmark, pkgid):
+        initialize_package(pkgid)
+
+        pkg = PKGDICT[pkgid]
+        FUNCS = {
+            "dpnp": randu_dpnp,
+            "numpy": randu_np,
+            "cupy": randu_cupy,
+            "arrayfire": randu_af
+        }
+
+        benchmark.extra_info["description"] = f"{NNSIZE:.2e} Samples"
+        result = benchmark.pedantic(
+            target=FUNCS[pkg.__name__],
+            rounds=ROUNDS,
+            iterations=ITERATIONS
+        )
diff --git a/benchmarks/src/requirements.txt b/benchmarks/src/requirements.txt
new file mode 100644
index 0000000..30b908b
--- /dev/null
+++ b/benchmarks/src/requirements.txt
@@ -0,0 +1,10 @@
+--extra-index-url https://software.repos.intel.com/python/pypi
+dpnp
+numpy
+cupy-cuda12x
+pytest
+pytest-benchmark
+matplotlib
+gdown
+pandas
+# arrayfire
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 4ea40fe..724a88b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "arrayfire"
 version = "0.1.0"
-dependencies = ["arrayfire-binary-python-wrapper == 0.7.0"]
+dependencies = ["arrayfire-binary-python-wrapper == 0.8.0+af3.10.0"]
 requires-python = ">=3.10"
 description = "ArrayFire Python"
 readme = "README.md"
diff --git a/requirements.txt b/requirements.txt
index d3f5294..ba9b100 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
-arrayfire-binary-python-wrapper==0.7.0+af3.9.0
+arrayfire-binary-python-wrapper==0.8.0+af3.10.0
diff --git a/tests/test_library/test_linear_algebra.py b/tests/test_library/test_linear_algebra.py
index 19b1cb2..38d7074 100644
--- a/tests/test_library/test_linear_algebra.py
+++ b/tests/test_library/test_linear_algebra.py
@@ -65,8 +65,8 @@ def test_gemm_basic(matrix_a: af.Array, matrix_b: af.Array) -> None:
 def test_gemm_alpha_beta(matrix_a: af.Array, matrix_b: af.Array) -> None:
     alpha = 0.5
     beta = 2.0
-    result = af.gemm(matrix_a, matrix_b, alpha=alpha, beta=beta)
-    expected = create_from_2d_nested(10.5, 12.0, 22.5, 26.0)
+    result = af.gemm(matrix_a, matrix_b, alpha=alpha, beta=beta, accum=matrix_a)
+    expected = create_from_2d_nested(11.5, 15.0, 27.5, 33.0)
     assert result == expected, f"Expected {expected}, got {result}"
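+    # Sanity check of the expected values, assuming the module's fixtures are
+    # matrix_a = [[1, 2], [3, 4]] and matrix_b = [[5, 6], [7, 8]] (inferred,
+    # not shown in this diff): alpha * (A @ B) + beta * accum
+    #   = 0.5 * [[19, 22], [43, 50]] + 2.0 * [[1, 2], [3, 4]]
+    #   = [[11.5, 15.0], [27.5, 33.0]]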