Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [dev] (MM/DD/YYYY)
## [dev] - YYYY-MM-DD

### Added
* Added mkl implementation for floating point data-types of `exp2`, `log2`, `fabs`, `copysign`, `nextafter`, `fmax`, `fmin` and `remainder` functions [gh-81](https://github.com/IntelPython/mkl_umath/pull/81)
* Added mkl implementation for complex data-types of `conjugate` and `absolute` functions [gh-86](https://github.com/IntelPython/mkl_umath/pull/86)
* Enabled support of Python 3.13 [gh-101](https://github.com/IntelPython/mkl_umath/pull/101)
* Added mkl implementation for complex data-types of `add`, `subtract`, `multiply` and `divide` functions [gh-88](https://github.com/IntelPython/mkl_umath/pull/88)

## [0.2.0] (06/03/2025)
## [0.2.0] - 2025-06-03
This release updates `mkl_umath` to be aligned with both numpy-1.26.x and numpy-2.x.x.

### Added
Expand All @@ -26,20 +27,20 @@ This release updates `mkl_umath` to be aligned with both numpy-1.26.x and numpy-
* Fixed a bug for `mkl_umath.is_patched` function [gh-66](https://github.com/IntelPython/mkl_umath/pull/66)


## [0.1.5] (04/09/2025)
## [0.1.5] - 2025-04-09

### Fixed
* Fixed failures to import `mkl_umath` from virtual environment on Linux

## [0.1.4] (04/09/2025)
## [0.1.4] - 2025-04-09

### Added
* Added support for `mkl_umath` out-of-the-box in virtual environments on Windows

### Fixed
* Fixed a bug in in-place addition with negative zeros

## [0.1.2] (10/11/2024)
## [0.1.2] - 2024-10-11

### Added
* Added support for building with NumPy 2.0 and older
Expand Down
162 changes: 96 additions & 66 deletions mkl_umath/src/mkl_umath_loops.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -318,10 +318,11 @@ pairwise_sum_@TYPE@(char *a, npy_intp n, npy_intp stride)
void
mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func))
{
const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@));
const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@));

if (IS_BINARY_CONT(@type@, @type@)) {
if (dimensions[0] > VML_ASM_THRESHOLD &&
DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)) &&
DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) {
if (dimensions[0] > VML_ASM_THRESHOLD && disjoint_or_same1 && disjoint_or_same2) {
CHUNKED_VML_CALL3(v@s@@VML@, dimensions[0], @type@, args[0], args[1], args[2]);
/* v@s@@VML@(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */
}
Expand Down Expand Up @@ -371,8 +372,7 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp
}
}
else if (IS_BINARY_CONT_S1(@type@, @type@)) {
if (dimensions[0] > VML_ASM_THRESHOLD &&
DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) {
if (dimensions[0] > VML_ASM_THRESHOLD && disjoint_or_same2) {
CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[1], args[2], @[email protected], *(@type@*)args[0], 0.0, 1.0);
/* v@s@LinearFrac(dimensions[0], (@type@*) args[1], (@type@*) args[1], @[email protected], *(@type@*)args[0], 0.0, 1.0, (@type@*) args[2]); */
}
Expand Down Expand Up @@ -412,8 +412,7 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp
}
}
else if (IS_BINARY_CONT_S2(@type@, @type@)) {
if (dimensions[0] > VML_ASM_THRESHOLD &&
DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@))) {
if (dimensions[0] > VML_ASM_THRESHOLD && disjoint_or_same1) {
CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[0], args[2], 1.0, @OP@(*(@type@*)args[1]), 0.0, 1.0);
/* v@s@LinearFrac(dimensions[0], (@type@*) args[0], (@type@*) args[0], 1.0, @OP@(*(@type@*)args[1]), 0.0, 1.0, (@type@*) args[2]); */
}
Expand Down Expand Up @@ -478,10 +477,11 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp
void
mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func))
{
const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@));
const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@));

if (IS_BINARY_CONT(@type@, @type@)) {
if (dimensions[0] > VML_ASM_THRESHOLD &&
DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)) &&
DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) {
if (dimensions[0] > VML_ASM_THRESHOLD && disjoint_or_same1 && disjoint_or_same2) {
CHUNKED_VML_CALL3(v@s@Mul, dimensions[0], @type@, args[0], args[1], args[2]);
/* v@s@Mul(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */
}
Expand Down Expand Up @@ -531,8 +531,7 @@ mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_int
}
}
else if (IS_BINARY_CONT_S1(@type@, @type@)) {
if (dimensions[0] > VML_ASM_THRESHOLD &&
DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) {
if (dimensions[0] > VML_ASM_THRESHOLD && disjoint_or_same2) {
CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[1], args[2], *(@type@*)args[0], 0.0, 0.0, 1.0);
/* v@s@LinearFrac(dimensions[0], (@type@*) args[1], (@type@*) args[1], *(@type@*)args[0], 0.0, 0.0, 1.0, (@type@*) args[2]); */
}
Expand Down Expand Up @@ -572,8 +571,7 @@ mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_int
}
}
else if (IS_BINARY_CONT_S2(@type@, @type@)) {
if (dimensions[0] > VML_ASM_THRESHOLD &&
DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@))) {
if (dimensions[0] > VML_ASM_THRESHOLD && disjoint_or_same1) {
CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[0], args[2], *(@type@*)args[1], 0.0, 0.0, 1.0);
/* v@s@LinearFrac(dimensions[0], (@type@*) args[0], (@type@*) args[0], *(@type@*)args[1], 0.0, 0.0, 1.0, (@type@*) args[2]); */
}
Expand Down Expand Up @@ -630,10 +628,11 @@ mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_int
void
mkl_umath_@TYPE@_divide(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func))
{
const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@));
const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@));

if (IS_BINARY_CONT(@type@, @type@)) {
if (dimensions[0] > VML_D_THRESHOLD &&
DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)) &&
DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) {
if (dimensions[0] > VML_D_THRESHOLD && disjoint_or_same1 && disjoint_or_same2) {
CHUNKED_VML_CALL3(v@s@Div, dimensions[0], @type@, args[0], args[1], args[2]);
/* v@s@Div(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */
}
Expand Down Expand Up @@ -1365,83 +1364,114 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n, npy_intp st
}
}

/* TODO: USE MKL */
/**begin repeat1
* #kind = add, subtract#
* #OP = +, -#
* #PW = 1, 0#
* #VML = Add, Sub#
*/
void
mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func))
{
if (IS_BINARY_REDUCE && @PW@) {
npy_intp n = dimensions[0];
@ftype@ * or = ((@ftype@ *)args[0]);
@ftype@ * oi = ((@ftype@ *)args[0]) + 1;
@ftype@ rr, ri;
const int contig = IS_BINARY_CONT(@type@, @type@);
const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@));
const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@));
const int can_vectorize = contig && disjoint_or_same1 && disjoint_or_same2;

pairwise_sum_@TYPE@(&rr, &ri, args[1], n * 2, steps[1] / 2);
*or @OP@= rr;
*oi @OP@= ri;
return;
if (can_vectorize && dimensions[0] > VML_ASM_THRESHOLD) {
CHUNKED_VML_CALL3(v@s@@VML@, dimensions[0], @type@, args[0], args[1], args[2]);
/* v@s@@VML@(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */
}
else {
BINARY_LOOP {
const @ftype@ in1r = ((@ftype@ *)ip1)[0];
const @ftype@ in1i = ((@ftype@ *)ip1)[1];
const @ftype@ in2r = ((@ftype@ *)ip2)[0];
const @ftype@ in2i = ((@ftype@ *)ip2)[1];
((@ftype@ *)op1)[0] = in1r @OP@ in2r;
((@ftype@ *)op1)[1] = in1i @OP@ in2i;
else {
if (IS_BINARY_REDUCE && @PW@) {
npy_intp n = dimensions[0];
@ftype@ * or = ((@ftype@ *)args[0]);
@ftype@ * oi = ((@ftype@ *)args[0]) + 1;
@ftype@ rr, ri;

pairwise_sum_@TYPE@(&rr, &ri, args[1], n * 2, steps[1] / 2);
*or @OP@= rr;
*oi @OP@= ri;
return;
}
else {
BINARY_LOOP {
const @ftype@ in1r = ((@ftype@ *)ip1)[0];
const @ftype@ in1i = ((@ftype@ *)ip1)[1];
const @ftype@ in2r = ((@ftype@ *)ip2)[0];
const @ftype@ in2i = ((@ftype@ *)ip2)[1];
((@ftype@ *)op1)[0] = in1r @OP@ in2r;
((@ftype@ *)op1)[1] = in1i @OP@ in2i;
}
}
}
}
/**end repeat1**/

/* TODO: USE MKL */
void
mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func))
{
BINARY_LOOP {
const @ftype@ in1r = ((@ftype@ *)ip1)[0];
const @ftype@ in1i = ((@ftype@ *)ip1)[1];
const @ftype@ in2r = ((@ftype@ *)ip2)[0];
const @ftype@ in2i = ((@ftype@ *)ip2)[1];
((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i;
((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r;
const int contig = IS_BINARY_CONT(@type@, @type@);
const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@));
const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@));
const int can_vectorize = contig && disjoint_or_same1 && disjoint_or_same2;

if (can_vectorize && dimensions[0] > VML_ASM_THRESHOLD) {
CHUNKED_VML_CALL3(v@s@Mul, dimensions[0], @type@, args[0], args[1], args[2]);
/* v@s@Mul(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */
}
else {
BINARY_LOOP {
const @ftype@ in1r = ((@ftype@ *)ip1)[0];
const @ftype@ in1i = ((@ftype@ *)ip1)[1];
const @ftype@ in2r = ((@ftype@ *)ip2)[0];
const @ftype@ in2i = ((@ftype@ *)ip2)[1];
((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i;
((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r;
}
}
}

/* TODO: USE MKL */
void
mkl_umath_@TYPE@_divide(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func))
{
BINARY_LOOP {
const @ftype@ in1r = ((@ftype@ *)ip1)[0];
const @ftype@ in1i = ((@ftype@ *)ip1)[1];
const @ftype@ in2r = ((@ftype@ *)ip2)[0];
const @ftype@ in2i = ((@ftype@ *)ip2)[1];
const @ftype@ in2r_abs = fabs@c@(in2r);
const @ftype@ in2i_abs = fabs@c@(in2i);
if (in2r_abs >= in2i_abs) {
if (in2r_abs == 0 && in2i_abs == 0) {
/* divide by zero should yield a complex inf or nan */
((@ftype@ *)op1)[0] = in1r/in2r_abs;
((@ftype@ *)op1)[1] = in1i/in2i_abs;
const int contig = IS_BINARY_CONT(@type@, @type@);
const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@));
const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@));
const int can_vectorize = contig && disjoint_or_same1 && disjoint_or_same2;

if (can_vectorize && dimensions[0] > VML_D_THRESHOLD) {
CHUNKED_VML_CALL3(v@s@Div, dimensions[0], @type@, args[0], args[1], args[2]);
/* v@s@Div(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */
}
else {
BINARY_LOOP {
const @ftype@ in1r = ((@ftype@ *)ip1)[0];
const @ftype@ in1i = ((@ftype@ *)ip1)[1];
const @ftype@ in2r = ((@ftype@ *)ip2)[0];
const @ftype@ in2i = ((@ftype@ *)ip2)[1];
const @ftype@ in2r_abs = fabs@c@(in2r);
const @ftype@ in2i_abs = fabs@c@(in2i);
if (in2r_abs >= in2i_abs) {
if (in2r_abs == 0 && in2i_abs == 0) {
/* divide by zero should yield a complex inf or nan */
((@ftype@ *)op1)[0] = in1r/in2r_abs;
((@ftype@ *)op1)[1] = in1i/in2i_abs;
}
else {
const @ftype@ rat = in2i/in2r;
const @ftype@ scl = 1.0@c@/(in2r + in2i*rat);
((@ftype@ *)op1)[0] = (in1r + in1i*rat)*scl;
((@ftype@ *)op1)[1] = (in1i - in1r*rat)*scl;
}
}
else {
const @ftype@ rat = in2i/in2r;
const @ftype@ scl = 1.0@c@/(in2r + in2i*rat);
((@ftype@ *)op1)[0] = (in1r + in1i*rat)*scl;
((@ftype@ *)op1)[1] = (in1i - in1r*rat)*scl;
const @ftype@ rat = in2r/in2i;
const @ftype@ scl = 1.0@c@/(in2i + in2r*rat);
((@ftype@ *)op1)[0] = (in1r*rat + in1i)*scl;
((@ftype@ *)op1)[1] = (in1i*rat - in1r)*scl;
}
}
else {
const @ftype@ rat = in2r/in2i;
const @ftype@ scl = 1.0@c@/(in2i + in2r*rat);
((@ftype@ *)op1)[0] = (in1r*rat + in1i)*scl;
((@ftype@ *)op1)[1] = (in1i*rat - in1r)*scl;
}
}
}

Expand Down
Loading