diff --git a/CHANGELOG.md b/CHANGELOG.md index c58cb1a..6ea8d93 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,14 +4,15 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [dev] (MM/DD/YYYY) +## [dev] - YYYY-MM-DD ### Added * Added mkl implementation for floating point data-types of `exp2`, `log2`, `fabs`, `copysign`, `nextafter`, `fmax`, `fmin` and `remainder` functions [gh-81](https://github.com/IntelPython/mkl_umath/pull/81) * Added mkl implementation for complex data-types of `conjugate` and `absolute` functions [gh-86](https://github.com/IntelPython/mkl_umath/pull/86) * Enabled support of Python 3.13 [gh-101](https://github.com/IntelPython/mkl_umath/pull/101) +* Added mkl implementation for complex data-types of `add`, `subtract`, `multiply` and `divide` functions [gh-88](https://github.com/IntelPython/mkl_umath/pull/88) -## [0.2.0] (06/03/2025) +## [0.2.0] - 2025-06-03 This release updates `mkl_umath` to be aligned with both numpy-1.26.x and numpy-2.x.x. ### Added @@ -26,12 +27,12 @@ This release updates `mkl_umath` to be aligned with both numpy-1.26.x and numpy- * Fixed a bug for `mkl_umath.is_patched` function [gh-66](https://github.com/IntelPython/mkl_umath/pull/66) -## [0.1.5] (04/09/2025) +## [0.1.5] - 2025-04-09 ### Fixed * Fixed failures to import `mkl_umath` from virtual environment on Linux -## [0.1.4] (04/09/2025) +## [0.1.4] - 2025-04-09 ### Added * Added support for `mkl_umath` out-of-the-box in virtual environments on Windows @@ -39,7 +40,7 @@ This release updates `mkl_umath` to be aligned with both numpy-1.26.x and numpy- ### Fixed * Fixed a bug in in-place addition with negative zeros -## [0.1.2] (10/11/2024) +## [0.1.2] - 2024-10-11 ### Added * Added support for building with NumPy 2.0 and older diff --git a/mkl_umath/src/mkl_umath_loops.c.src b/mkl_umath/src/mkl_umath_loops.c.src index 81ee830..aa4b927 100644 --- a/mkl_umath/src/mkl_umath_loops.c.src +++ b/mkl_umath/src/mkl_umath_loops.c.src @@ -318,10 +318,11 @@ pairwise_sum_@TYPE@(char *a, npy_intp n, npy_intp stride) void mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { + const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)); + const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@)); + if (IS_BINARY_CONT(@type@, @type@)) { - if (dimensions[0] > VML_ASM_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)) && - DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) { + if (dimensions[0] > VML_ASM_THRESHOLD && disjoint_or_same1 && disjoint_or_same2) { CHUNKED_VML_CALL3(v@s@@VML@, dimensions[0], @type@, args[0], args[1], args[2]); /* v@s@@VML@(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */ } @@ -371,8 +372,7 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp } } else if (IS_BINARY_CONT_S1(@type@, @type@)) { - if (dimensions[0] > VML_ASM_THRESHOLD && - DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) { + if (dimensions[0] > VML_ASM_THRESHOLD && disjoint_or_same2) { CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[1], args[2], @OP@1.0, *(@type@*)args[0], 0.0, 1.0); /* v@s@LinearFrac(dimensions[0], (@type@*) args[1], (@type@*) args[1], @OP@1.0, *(@type@*)args[0], 0.0, 1.0, (@type@*) args[2]); */ } @@ -412,8 +412,7 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp } } else if (IS_BINARY_CONT_S2(@type@, @type@)) { - if (dimensions[0] > VML_ASM_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@))) { + if (dimensions[0] > VML_ASM_THRESHOLD && disjoint_or_same1) { CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[0], args[2], 1.0, @OP@(*(@type@*)args[1]), 0.0, 1.0); /* v@s@LinearFrac(dimensions[0], (@type@*) args[0], (@type@*) args[0], 1.0, @OP@(*(@type@*)args[1]), 0.0, 1.0, (@type@*) args[2]); */ } @@ -478,10 +477,11 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp void mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { + const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)); + const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@)); + if (IS_BINARY_CONT(@type@, @type@)) { - if (dimensions[0] > VML_ASM_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)) && - DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) { + if (dimensions[0] > VML_ASM_THRESHOLD && disjoint_or_same1 && disjoint_or_same2) { CHUNKED_VML_CALL3(v@s@Mul, dimensions[0], @type@, args[0], args[1], args[2]); /* v@s@Mul(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */ } @@ -531,8 +531,7 @@ mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_int } } else if (IS_BINARY_CONT_S1(@type@, @type@)) { - if (dimensions[0] > VML_ASM_THRESHOLD && - DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) { + if (dimensions[0] > VML_ASM_THRESHOLD && disjoint_or_same2) { CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[1], args[2], *(@type@*)args[0], 0.0, 0.0, 1.0); /* v@s@LinearFrac(dimensions[0], (@type@*) args[1], (@type@*) args[1], *(@type@*)args[0], 0.0, 0.0, 1.0, (@type@*) args[2]); */ } @@ -572,8 +571,7 @@ mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_int } } else if (IS_BINARY_CONT_S2(@type@, @type@)) { - if (dimensions[0] > VML_ASM_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@))) { + if (dimensions[0] > VML_ASM_THRESHOLD && disjoint_or_same1) { CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[0], args[2], *(@type@*)args[1], 0.0, 0.0, 1.0); /* v@s@LinearFrac(dimensions[0], (@type@*) args[0], (@type@*) args[0], *(@type@*)args[1], 0.0, 0.0, 1.0, (@type@*) args[2]); */ } @@ -630,10 +628,11 @@ mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_int void mkl_umath_@TYPE@_divide(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { + const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)); + const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@)); + if (IS_BINARY_CONT(@type@, @type@)) { - if (dimensions[0] > VML_D_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)) && - DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) { + if (dimensions[0] > VML_D_THRESHOLD && disjoint_or_same1 && disjoint_or_same2) { CHUNKED_VML_CALL3(v@s@Div, dimensions[0], @type@, args[0], args[1], args[2]); /* v@s@Div(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */ } @@ -1365,83 +1364,114 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n, npy_intp st } } -/* TODO: USE MKL */ /**begin repeat1 * #kind = add, subtract# * #OP = +, -# * #PW = 1, 0# + * #VML = Add, Sub# */ void mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if (IS_BINARY_REDUCE && @PW@) { - npy_intp n = dimensions[0]; - @ftype@ * or = ((@ftype@ *)args[0]); - @ftype@ * oi = ((@ftype@ *)args[0]) + 1; - @ftype@ rr, ri; + const int contig = IS_BINARY_CONT(@type@, @type@); + const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)); + const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same1 && disjoint_or_same2; - pairwise_sum_@TYPE@(&rr, &ri, args[1], n * 2, steps[1] / 2); - *or @OP@= rr; - *oi @OP@= ri; - return; + if (can_vectorize && dimensions[0] > VML_ASM_THRESHOLD) { + CHUNKED_VML_CALL3(v@s@@VML@, dimensions[0], @type@, args[0], args[1], args[2]); + /* v@s@@VML@(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */ } - else { - BINARY_LOOP { - const @ftype@ in1r = ((@ftype@ *)ip1)[0]; - const @ftype@ in1i = ((@ftype@ *)ip1)[1]; - const @ftype@ in2r = ((@ftype@ *)ip2)[0]; - const @ftype@ in2i = ((@ftype@ *)ip2)[1]; - ((@ftype@ *)op1)[0] = in1r @OP@ in2r; - ((@ftype@ *)op1)[1] = in1i @OP@ in2i; + else { + if (IS_BINARY_REDUCE && @PW@) { + npy_intp n = dimensions[0]; + @ftype@ * or = ((@ftype@ *)args[0]); + @ftype@ * oi = ((@ftype@ *)args[0]) + 1; + @ftype@ rr, ri; + + pairwise_sum_@TYPE@(&rr, &ri, args[1], n * 2, steps[1] / 2); + *or @OP@= rr; + *oi @OP@= ri; + return; + } + else { + BINARY_LOOP { + const @ftype@ in1r = ((@ftype@ *)ip1)[0]; + const @ftype@ in1i = ((@ftype@ *)ip1)[1]; + const @ftype@ in2r = ((@ftype@ *)ip2)[0]; + const @ftype@ in2i = ((@ftype@ *)ip2)[1]; + ((@ftype@ *)op1)[0] = in1r @OP@ in2r; + ((@ftype@ *)op1)[1] = in1i @OP@ in2i; + } } } } /**end repeat1**/ -/* TODO: USE MKL */ void mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - BINARY_LOOP { - const @ftype@ in1r = ((@ftype@ *)ip1)[0]; - const @ftype@ in1i = ((@ftype@ *)ip1)[1]; - const @ftype@ in2r = ((@ftype@ *)ip2)[0]; - const @ftype@ in2i = ((@ftype@ *)ip2)[1]; - ((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i; - ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r; + const int contig = IS_BINARY_CONT(@type@, @type@); + const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)); + const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same1 && disjoint_or_same2; + + if (can_vectorize && dimensions[0] > VML_ASM_THRESHOLD) { + CHUNKED_VML_CALL3(v@s@Mul, dimensions[0], @type@, args[0], args[1], args[2]); + /* v@s@Mul(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */ + } + else { + BINARY_LOOP { + const @ftype@ in1r = ((@ftype@ *)ip1)[0]; + const @ftype@ in1i = ((@ftype@ *)ip1)[1]; + const @ftype@ in2r = ((@ftype@ *)ip2)[0]; + const @ftype@ in2i = ((@ftype@ *)ip2)[1]; + ((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i; + ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r; + } } } -/* TODO: USE MKL */ void mkl_umath_@TYPE@_divide(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - BINARY_LOOP { - const @ftype@ in1r = ((@ftype@ *)ip1)[0]; - const @ftype@ in1i = ((@ftype@ *)ip1)[1]; - const @ftype@ in2r = ((@ftype@ *)ip2)[0]; - const @ftype@ in2i = ((@ftype@ *)ip2)[1]; - const @ftype@ in2r_abs = fabs@c@(in2r); - const @ftype@ in2i_abs = fabs@c@(in2i); - if (in2r_abs >= in2i_abs) { - if (in2r_abs == 0 && in2i_abs == 0) { - /* divide by zero should yield a complex inf or nan */ - ((@ftype@ *)op1)[0] = in1r/in2r_abs; - ((@ftype@ *)op1)[1] = in1i/in2i_abs; + const int contig = IS_BINARY_CONT(@type@, @type@); + const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)); + const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same1 && disjoint_or_same2; + + if (can_vectorize && dimensions[0] > VML_D_THRESHOLD) { + CHUNKED_VML_CALL3(v@s@Div, dimensions[0], @type@, args[0], args[1], args[2]); + /* v@s@Div(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */ + } + else { + BINARY_LOOP { + const @ftype@ in1r = ((@ftype@ *)ip1)[0]; + const @ftype@ in1i = ((@ftype@ *)ip1)[1]; + const @ftype@ in2r = ((@ftype@ *)ip2)[0]; + const @ftype@ in2i = ((@ftype@ *)ip2)[1]; + const @ftype@ in2r_abs = fabs@c@(in2r); + const @ftype@ in2i_abs = fabs@c@(in2i); + if (in2r_abs >= in2i_abs) { + if (in2r_abs == 0 && in2i_abs == 0) { + /* divide by zero should yield a complex inf or nan */ + ((@ftype@ *)op1)[0] = in1r/in2r_abs; + ((@ftype@ *)op1)[1] = in1i/in2i_abs; + } + else { + const @ftype@ rat = in2i/in2r; + const @ftype@ scl = 1.0@c@/(in2r + in2i*rat); + ((@ftype@ *)op1)[0] = (in1r + in1i*rat)*scl; + ((@ftype@ *)op1)[1] = (in1i - in1r*rat)*scl; + } } else { - const @ftype@ rat = in2i/in2r; - const @ftype@ scl = 1.0@c@/(in2r + in2i*rat); - ((@ftype@ *)op1)[0] = (in1r + in1i*rat)*scl; - ((@ftype@ *)op1)[1] = (in1i - in1r*rat)*scl; + const @ftype@ rat = in2r/in2i; + const @ftype@ scl = 1.0@c@/(in2i + in2r*rat); + ((@ftype@ *)op1)[0] = (in1r*rat + in1i)*scl; + ((@ftype@ *)op1)[1] = (in1i*rat - in1r)*scl; } } - else { - const @ftype@ rat = in2r/in2i; - const @ftype@ scl = 1.0@c@/(in2i + in2r*rat); - ((@ftype@ *)op1)[0] = (in1r*rat + in1i)*scl; - ((@ftype@ *)op1)[1] = (in1i*rat - in1r)*scl; - } } }