From 5e4e3ac74c530e9d16cbb57b1ae8a6b09f58d85d Mon Sep 17 00:00:00 2001 From: Vahid Tavanashad Date: Thu, 24 Jul 2025 15:34:18 -0700 Subject: [PATCH 1/2] clean up source files --- mkl_umath/src/mkl_umath_loops.c.src | 1926 +++++++-------------------- mkl_umath/src/mkl_umath_loops.h.src | 261 +--- mkl_umath/tests/test_basic.py | 70 +- 3 files changed, 527 insertions(+), 1730 deletions(-) diff --git a/mkl_umath/src/mkl_umath_loops.c.src b/mkl_umath/src/mkl_umath_loops.c.src index 09c6a86..81ee830 100644 --- a/mkl_umath/src/mkl_umath_loops.c.src +++ b/mkl_umath/src/mkl_umath_loops.c.src @@ -128,8 +128,8 @@ /* for pointers p1, and p2 pointing at contiguous arrays n-elements of size s, are arrays disjoint or same * when these conditions are not met VML functions may produce incorrect output */ -#define DISJOINT_OR_SAME(p1, p2, n, s) (((p1) == (p2)) || ((p2) + (n)*(s) < (p1)) || ((p1) + (n)*(s) < (p2)) ) -#define DISJOINT_OR_SAME_TWO_DTYPES(p1, p2, n, s1, s2) (((p1) == (p2)) || ((p2) + (n)*(s2) < (p1)) || ((p1) + (n)*(s1) < (p2)) ) +#define DISJOINT_OR_SAME(p1, p2, n, s) (((p1) == (p2)) || ((p2) + (n)*(s) < (p1)) || ((p1) + (n)*(s) < (p2))) +#define DISJOINT_OR_SAME_TWO_DTYPES(p1, p2, n, s1, s2) (((p1) == (p2)) || ((p2) + (n)*(s2) < (p1)) || ((p1) + (n)*(s1) < (p2))) /* * include vectorized functions and dispatchers @@ -224,890 +224,10 @@ divmod@c@(@type@ a, @type@ b, @type@ *modulus) /**begin repeat * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = sqrtf, sqrt# - */ - -void -mkl_umath_@TYPE@_sqrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Sqrt, dimensions[0], @type@, args[0], args[1]); - /* v@c@Sqrt(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #A = F, # - * #c = s, d# - * #scalarf = expf, exp# - */ - -void -mkl_umath_@TYPE@_exp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - int ignore_fpstatus = 0; - - if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - ignore_fpstatus = 1; - CHUNKED_VML_CALL2(v@c@Exp, dimensions[0], @type@, args[0], args[1]); - /* v@c@Exp(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - const int invalid_cases = npy_isnan(in1) || in1 == NPY_INFINITY || in1 == -NPY_INFINITY; - ignore_fpstatus |= invalid_cases; - *(@type@ *)op1 = @scalarf@(in1); - ) - } - if(ignore_fpstatus) { - feclearexcept(FE_OVERFLOW | FE_UNDERFLOW | FE_INVALID); - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = exp2f, exp2# - */ - -void -mkl_umath_@TYPE@_exp2(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - int ignore_fpstatus = 0; - - if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { - ignore_fpstatus = 1; - CHUNKED_VML_CALL2(v@c@Exp2, dimensions[0], @type@, args[0], args[1]); - /* v@c@Exp2(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - const int invalid_cases = npy_isnan(in1) || in1 == NPY_INFINITY || in1 == -NPY_INFINITY; - ignore_fpstatus |= invalid_cases; - *(@type@ *)op1 = @scalarf@(in1); - ) - } - if(ignore_fpstatus) { - feclearexcept(FE_OVERFLOW | FE_UNDERFLOW | FE_INVALID); - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = expm1f, expm1# - */ - -void -mkl_umath_@TYPE@_expm1(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { - CHUNKED_VML_CALL2(v@c@Expm1, dimensions[0], @type@, args[0], args[1]); - /* v@c@Expm1(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = logf, log# - */ - -void -mkl_umath_@TYPE@_log(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Ln, dimensions[0], @type@, args[0], args[1]); - /* v@c@Ln(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = log2f, log2# - */ - -void -mkl_umath_@TYPE@_log2(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - int ignore_fpstatus = 0; - - if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - ignore_fpstatus = 1; - CHUNKED_VML_CALL2(v@c@Log2, dimensions[0], @type@, args[0], args[1]); - /* v@c@Log2(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - const int invalid_cases = in1 < 0 || in1 == 0 || npy_isnan(in1) || in1 == -NPY_INFINITY; - ignore_fpstatus |= invalid_cases; - *(@type@ *)op1 = @scalarf@(in1); - ) - } - if(ignore_fpstatus) { - feclearexcept(FE_DIVBYZERO | FE_INVALID); - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = log10f, log10# - */ - -void -mkl_umath_@TYPE@_log10(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Log10, dimensions[0], @type@, args[0], args[1]); - /* v@c@Log10(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = log1pf, log1p# - */ - -void -mkl_umath_@TYPE@_log1p(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Log1p, dimensions[0], @type@, args[0], args[1]); - /* v@c@Log1p(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = cosf, cos# - */ - -void -mkl_umath_@TYPE@_cos(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Cos, dimensions[0], @type@, args[0], args[1]); - /* v@c@Cos(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = sinf, sin# - */ - -void -mkl_umath_@TYPE@_sin(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Sin, dimensions[0], @type@, args[0], args[1]); - /* v@c@Sin(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = tanf, tan# - */ - -void -mkl_umath_@TYPE@_tan(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Tan, dimensions[0], @type@, args[0], args[1]); - /* v@c@Tan(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = acosf, acos# - */ - -void -mkl_umath_@TYPE@_arccos(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Acos, dimensions[0], @type@, args[0], args[1]); - /* v@c@Acos(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = asinf, asin# - */ - -void -mkl_umath_@TYPE@_arcsin(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Asin, dimensions[0], @type@, args[0], args[1]); - /* v@c@Asin(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = atanf, atan# - */ - -void -mkl_umath_@TYPE@_arctan(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Atan, dimensions[0], @type@, args[0], args[1]); - /* v@c@Atan(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = coshf, cosh# - */ - -void -mkl_umath_@TYPE@_cosh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Cosh, dimensions[0], @type@, args[0], args[1]); - /* v@c@Cosh(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = sinhf, sinh# - */ - -void -mkl_umath_@TYPE@_sinh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Sinh, dimensions[0], @type@, args[0], args[1]); - /* v@c@Sinh(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = tanhf, tanh# - */ - -void -mkl_umath_@TYPE@_tanh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Tanh, dimensions[0], @type@, args[0], args[1]); - /* v@c@Tanh(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = acoshf, acosh# - */ - -void -mkl_umath_@TYPE@_arccosh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Acosh, dimensions[0], @type@, args[0], args[1]); - /* v@c@Acosh(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = asinhf, asinh# - */ - -void -mkl_umath_@TYPE@_arcsinh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Asinh, dimensions[0], @type@, args[0], args[1]); - /* v@c@Asinh(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = atanhf, atanh# - */ - -void -mkl_umath_@TYPE@_arctanh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Atanh, dimensions[0], @type@, args[0], args[1]); - /* v@c@Atanh(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = fabsf, fabs# - */ - -void -mkl_umath_@TYPE@_fabs(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Abs, dimensions[0], @type@, args[0], args[1]); - /* v@c@Abs(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = floorf, floor# - */ - -void -mkl_umath_@TYPE@_floor(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Floor, dimensions[0], @type@, args[0], args[1]); - /* v@c@Floor(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = ceilf, ceil# - */ - -void -mkl_umath_@TYPE@_ceil(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Ceil, dimensions[0], @type@, args[0], args[1]); - /* v@c@Ceil(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = rintf, rint# - */ - -void -mkl_umath_@TYPE@_rint(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Rint, dimensions[0], @type@, args[0], args[1]); - /* v@c@Rint(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = truncf, trunc# - */ - -void -mkl_umath_@TYPE@_trunc(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Trunc, dimensions[0], @type@, args[0], args[1]); - /* v@c@Trunc(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = s, d# - * #scalarf = cbrtf, cbrt# - */ - -void -mkl_umath_@TYPE@_cbrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) - { - CHUNKED_VML_CALL2(v@c@Cbrt, dimensions[0], @type@, args[0], args[1]); - /* v@c@Cbrt(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { - UNARY_LOOP_DISPATCH( - @type@, @type@ - , - can_vectorize - , - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@(in1); - ) - } -} - -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #dtype = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = f, # - * #C = F, # - * #trf = , # + * #type = npy_float, npy_double# + * #TYPE = FLOAT, DOUBLE# + * #c = f, # + * #s = s, d# */ /* @@ -1128,13 +248,13 @@ pairwise_sum_@TYPE@(char *a, npy_intp n, npy_intp stride) if (n < 8) { npy_intp i; /* - * Start with -0 to preserve -0 values. The reason is that summing + * Start with -0 to preserve -0 values. The reason is that summing * only -0 should return -0, but `0 + -0 == 0` while `-0 + -0 == -0`. */ @type@ res = -0.0; for (i = 0; i < n; i++) { - res += @trf@(*((@dtype@*)(a + i * stride))); + res += (*((@type@*)(a + i * stride))); } return res; } @@ -1147,26 +267,26 @@ pairwise_sum_@TYPE@(char *a, npy_intp n, npy_intp stride) * 8 times unroll reduces blocksize to 16 and allows vectorization with * avx without changing summation ordering */ - r[0] = @trf@(*((@dtype@ *)(a + 0 * stride))); - r[1] = @trf@(*((@dtype@ *)(a + 1 * stride))); - r[2] = @trf@(*((@dtype@ *)(a + 2 * stride))); - r[3] = @trf@(*((@dtype@ *)(a + 3 * stride))); - r[4] = @trf@(*((@dtype@ *)(a + 4 * stride))); - r[5] = @trf@(*((@dtype@ *)(a + 5 * stride))); - r[6] = @trf@(*((@dtype@ *)(a + 6 * stride))); - r[7] = @trf@(*((@dtype@ *)(a + 7 * stride))); + r[0] = (*((@type@ *)(a + 0 * stride))); + r[1] = (*((@type@ *)(a + 1 * stride))); + r[2] = (*((@type@ *)(a + 2 * stride))); + r[3] = (*((@type@ *)(a + 3 * stride))); + r[4] = (*((@type@ *)(a + 4 * stride))); + r[5] = (*((@type@ *)(a + 5 * stride))); + r[6] = (*((@type@ *)(a + 6 * stride))); + r[7] = (*((@type@ *)(a + 7 * stride))); for (i = 8; i < n - (n % 8); i += 8) { /* small blocksizes seems to mess with hardware prefetch */ - NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@dtype@))*stride, 0, 3); - r[0] += @trf@(*((@dtype@ *)(a + (i + 0) * stride))); - r[1] += @trf@(*((@dtype@ *)(a + (i + 1) * stride))); - r[2] += @trf@(*((@dtype@ *)(a + (i + 2) * stride))); - r[3] += @trf@(*((@dtype@ *)(a + (i + 3) * stride))); - r[4] += @trf@(*((@dtype@ *)(a + (i + 4) * stride))); - r[5] += @trf@(*((@dtype@ *)(a + (i + 5) * stride))); - r[6] += @trf@(*((@dtype@ *)(a + (i + 6) * stride))); - r[7] += @trf@(*((@dtype@ *)(a + (i + 7) * stride))); + NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@type@))*stride, 0, 3); + r[0] += (*((@type@ *)(a + (i + 0) * stride))); + r[1] += (*((@type@ *)(a + (i + 1) * stride))); + r[2] += (*((@type@ *)(a + (i + 2) * stride))); + r[3] += (*((@type@ *)(a + (i + 3) * stride))); + r[4] += (*((@type@ *)(a + (i + 4) * stride))); + r[5] += (*((@type@ *)(a + (i + 5) * stride))); + r[6] += (*((@type@ *)(a + (i + 6) * stride))); + r[7] += (*((@type@ *)(a + (i + 7) * stride))); } /* accumulate now to avoid stack spills for single peel loop */ @@ -1175,7 +295,7 @@ pairwise_sum_@TYPE@(char *a, npy_intp n, npy_intp stride) /* do non multiple of 8 rest */ for (; i < n; i++) { - res += @trf@(*((@dtype@ *)(a + i * stride))); + res += (*((@type@ *)(a + i * stride))); } return res; } @@ -1189,37 +309,23 @@ pairwise_sum_@TYPE@(char *a, npy_intp n, npy_intp stride) } } -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = f, # - * #s = s, d# - * #SUPPORTED_BY_VML = 1, 1# - */ - /**begin repeat1 - * Arithmetic - * # kind = add# - * # OP = +# - * # PW = 1# - * # VML = Add# + * # kind = add, subtract# + * # OP = +, -# + * # PW = 1, 0# + * # VML = Add, Sub# */ void mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_BINARY_CONT(@type@, @type@)) { -#if @SUPPORTED_BY_VML@ - if(dimensions[0] > VML_ASM_THRESHOLD && + if (IS_BINARY_CONT(@type@, @type@)) { + if (dimensions[0] > VML_ASM_THRESHOLD && DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)) && - DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@)) ) { + DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) { CHUNKED_VML_CALL3(v@s@@VML@, dimensions[0], @type@, args[0], args[1], args[2]); /* v@s@@VML@(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */ - } else -#endif - { + } + else { @type@ *ip1 = (@type@*)args[0]; @type@ *ip2 = (@type@*)args[1]; @type@ *op1 = (@type@*)args[2]; @@ -1240,14 +346,15 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp @type@ *op1_shifted = op1 + peel; @type@ *ip2_shifted = ip2 + peel; - if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && - DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { + if (DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && + DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1)) { NPY_ASSUME_ALIGNED(ip1_aligned, 64) NPY_PRAGMA_VECTOR for(j = 0; j < j_max; j++) { op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; } - } else { + } + else { NPY_ASSUME_ALIGNED(ip1_aligned, 64) for(j = 0; j < j_max; j++) { op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; @@ -1262,183 +369,14 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp op1[i] = ip1[i] @OP@ ip2[i]; } } - } else if(IS_BINARY_CONT_S1(@type@, @type@)) { -#if @SUPPORTED_BY_VML@ - if(dimensions[0] > VML_ASM_THRESHOLD && - DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@)) ) { - CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[1], args[2], 1.0, *(@type@*)args[0], 0.0, 1.0); - /* v@s@LinearFrac(dimensions[0], (@type@*) args[1], (@type@*) args[1], 1.0, *(@type@*)args[0], 0.0, 1.0, (@type@*) args[2]); */ - } else -#endif - { - @type@ *ip1 = (@type@*)args[0]; - @type@ *ip2 = (@type@*)args[1]; - @type@ *op1 = (@type@*)args[2]; - const npy_intp vsize = 64; - const npy_intp n = dimensions[0]; - const npy_intp peel = npy_aligned_block_offset(ip2, sizeof(@type@), vsize, n); - const npy_intp blocked_end = npy_blocked_end(peel, sizeof(@type@), vsize, n); - npy_intp i; - - const @type@ ip1c = ip1[0]; - for(i = 0; i < peel; i++) { - op1[i] = ip1c @OP@ ip2[i]; - } - - { - npy_intp j, j_max = blocked_end - peel; - if (j_max > 0) { - @type@ *ip2_aligned = ip2 + peel; - @type@ *op1_shifted = op1 + peel; - - NPY_ASSUME_ALIGNED(ip2_aligned, 64) - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1c @OP@ ip2_aligned[j]; - } - - i = blocked_end; - } - } - - for(; i < n; i++) { - op1[i] = ip1c @OP@ ip2[i]; - } - } - } else if(IS_BINARY_CONT_S2(@type@, @type@)) { -#if @SUPPORTED_BY_VML@ - if(dimensions[0] > VML_ASM_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)) ) { - CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[0], args[2], 1.0, *(@type@*)args[1], 0.0, 1.0); - /* v@s@LinearFrac(dimensions[0], (@type@*) args[0], (@type@*) args[0], 1.0, *(@type@*)args[1], 0.0, 1.0, (@type@*) args[2]); */ - } else -#endif - { - @type@ *ip1 = (@type@*)args[0]; - @type@ *ip2 = (@type@*)args[1]; - @type@ *op1 = (@type@*)args[2]; - const npy_intp vsize = 64; - const npy_intp n = dimensions[0]; - const npy_intp peel = npy_aligned_block_offset(ip1, sizeof(@type@), vsize, n); - const npy_intp blocked_end = npy_blocked_end(peel, sizeof(@type@), vsize, n); - npy_intp i; - - const @type@ ip2c = ip2[0]; - for(i = 0; i < peel; i++) { - op1[i] = ip1[i] @OP@ ip2c; - } - - if (blocked_end > peel) { - npy_intp j, j_max = blocked_end - peel; - if (j_max > 0) { - @type@ *op1_shifted = op1 + peel; - @type@ *ip1_aligned = ip1 + peel; - - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2c; - } - - i = blocked_end; - } - } - - for(; i < n; i++) { - op1[i] = ip1[i] @OP@ ip2c; - } - } - } else if (IS_BINARY_REDUCE) { -#if @PW@ - @type@ * iop1 = (@type@ *)args[0]; - npy_intp n = dimensions[0]; - - *iop1 @OP@= pairwise_sum_@TYPE@(args[1], n, steps[1]); -#else - BINARY_REDUCE_LOOP(@type@) { - io1 @OP@= *(@type@ *)ip2; - } - *((@type@ *)iop1) = io1; -#endif - } else { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((@type@ *)op1) = in1 @OP@ in2; - } - } -} -/**end repeat1**/ - -/**begin repeat1 - * Arithmetic - * # kind = subtract# - * # OP = -# - * # PW = 0# - * # VML = Sub# - */ -void -mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - if(IS_BINARY_CONT(@type@, @type@)) { -#if @SUPPORTED_BY_VML@ - if(dimensions[0] > VML_ASM_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)) && - DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@)) ) { - CHUNKED_VML_CALL3(v@s@@VML@, dimensions[0], @type@, args[0], args[1], args[2]); - /* v@s@@VML@(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */ - } else -#endif - { - @type@ *ip1 = (@type@*)args[0]; - @type@ *ip2 = (@type@*)args[1]; - @type@ *op1 = (@type@*)args[2]; - const npy_intp vsize = 64; - const npy_intp n = dimensions[0]; - const npy_intp peel = npy_aligned_block_offset(ip1, sizeof(@type@), vsize, n); - const npy_intp blocked_end = npy_blocked_end(peel, sizeof(@type@), vsize, n); - npy_intp i; - - for(i = 0; i < peel; i++) { - op1[i] = ip1[i] @OP@ ip2[i]; - } - - { - npy_intp j, j_max = blocked_end - peel; - if (j_max > 0) { - @type@ *ip1_aligned = ip1 + peel; - @type@ *ip2_shifted = ip2 + peel; - @type@ *op1_shifted = op1 + peel; - - if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && - DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - NPY_PRAGMA_VECTOR - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } else { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } - - i = blocked_end; - } - } - - for(; i < n; i++) { - op1[i] = ip1[i] @OP@ ip2[i]; - } - } - } else if(IS_BINARY_CONT_S1(@type@, @type@)) { -#if @SUPPORTED_BY_VML@ - if(dimensions[0] > VML_ASM_THRESHOLD && + } + else if (IS_BINARY_CONT_S1(@type@, @type@)) { + if (dimensions[0] > VML_ASM_THRESHOLD && DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) { - CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[1], args[2], -1.0, *(@type@*)args[0], 0.0, 1.0); - /* v@s@LinearFrac(dimensions[0], (@type@*) args[1], (@type@*) args[1], -1.0, *(@type@*)args[0], 0.0, 1.0, (@type@*) args[2]); */ - } else -#endif - { + CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[1], args[2], @OP@1.0, *(@type@*)args[0], 0.0, 1.0); + /* v@s@LinearFrac(dimensions[0], (@type@*) args[1], (@type@*) args[1], @OP@1.0, *(@type@*)args[0], 0.0, 1.0, (@type@*) args[2]); */ + } + else { @type@ *ip1 = (@type@*)args[0]; @type@ *ip2 = (@type@*)args[1]; @type@ *op1 = (@type@*)args[2]; @@ -1472,15 +410,14 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp op1[i] = ip1c @OP@ ip2[i]; } } - } else if(IS_BINARY_CONT_S2(@type@, @type@)) { -#if @SUPPORTED_BY_VML@ - if(dimensions[0] > VML_ASM_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)) ) { - CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[0], args[2], 1.0, -(*(@type@*)args[1]), 0.0, 1.0); - /* v@s@LinearFrac(dimensions[0], (@type@*) args[0], (@type@*) args[0], 1.0, -*(@type@*)args[1], 0.0, 1.0, (@type@*) args[2]); */ - } else -#endif - { + } + else if (IS_BINARY_CONT_S2(@type@, @type@)) { + if (dimensions[0] > VML_ASM_THRESHOLD && + DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@))) { + CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[0], args[2], 1.0, @OP@(*(@type@*)args[1]), 0.0, 1.0); + /* v@s@LinearFrac(dimensions[0], (@type@*) args[0], (@type@*) args[0], 1.0, @OP@(*(@type@*)args[1]), 0.0, 1.0, (@type@*) args[2]); */ + } + else { @type@ *ip1 = (@type@*)args[0]; @type@ *ip2 = (@type@*)args[1]; @type@ *op1 = (@type@*)args[2]; @@ -1514,7 +451,8 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp op1[i] = ip1[i] @OP@ ip2c; } } - } else if (IS_BINARY_REDUCE) { + } + else if (IS_BINARY_REDUCE) { #if @PW@ @type@ * iop1 = (@type@ *)args[0]; npy_intp n = dimensions[0]; @@ -1526,7 +464,8 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp } *((@type@ *)iop1) = io1; #endif - } else { + } + else { BINARY_LOOP { const @type@ in1 = *(@type@ *)ip1; const @type@ in2 = *(@type@ *)ip2; @@ -1536,26 +475,17 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp } /**end repeat1**/ -/**begin repeat1 - * Arithmetic - * # kind = multiply# - * # OP = *# - * # PW = 0# - * # VML = Mul# - */ void -mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) +mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_BINARY_CONT(@type@, @type@)) { -#if @SUPPORTED_BY_VML@ - if(dimensions[0] > VML_ASM_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)) && - DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@)) ) { - CHUNKED_VML_CALL3(v@s@@VML@, dimensions[0], @type@, args[0], args[1], args[2]); - /* v@s@@VML@(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */ - } else -#endif - { + if (IS_BINARY_CONT(@type@, @type@)) { + if (dimensions[0] > VML_ASM_THRESHOLD && + DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)) && + DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) { + CHUNKED_VML_CALL3(v@s@Mul, dimensions[0], @type@, args[0], args[1], args[2]); + /* v@s@Mul(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */ + } + else { @type@ *ip1 = (@type@*)args[0]; @type@ *ip2 = (@type@*)args[1]; @type@ *op1 = (@type@*)args[2]; @@ -1566,47 +496,47 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp npy_intp i; for(i = 0; i < peel; i++) { - op1[i] = ip1[i] @OP@ ip2[i]; + op1[i] = ip1[i] * ip2[i]; } { npy_intp j, j_max = blocked_end - peel; if (j_max > 0) { @type@ *ip1_aligned = ip1 + peel; - @type@ *ip2_shifted = ip2 + peel; @type@ *op1_shifted = op1 + peel; + @type@ *ip2_shifted = ip2 + peel; - if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && - DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - NPY_PRAGMA_VECTOR - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } else { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } + if ( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && + DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1)) { + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + NPY_PRAGMA_VECTOR + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] * ip2_shifted[j]; + } + } + else { + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] * ip2_shifted[j]; + } + } i = blocked_end; } } for(; i < n; i++) { - op1[i] = ip1[i] @OP@ ip2[i]; + op1[i] = ip1[i] * ip2[i]; } } - } else if(IS_BINARY_CONT_S1(@type@, @type@)) { -#if @SUPPORTED_BY_VML@ - if(dimensions[0] > VML_ASM_THRESHOLD && - DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@)) ) { + } + else if (IS_BINARY_CONT_S1(@type@, @type@)) { + if (dimensions[0] > VML_ASM_THRESHOLD && + DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) { CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[1], args[2], *(@type@*)args[0], 0.0, 0.0, 1.0); /* v@s@LinearFrac(dimensions[0], (@type@*) args[1], (@type@*) args[1], *(@type@*)args[0], 0.0, 0.0, 1.0, (@type@*) args[2]); */ - } else -#endif - { + } + else { @type@ *ip1 = (@type@*)args[0]; @type@ *ip2 = (@type@*)args[1]; @type@ *op1 = (@type@*)args[2]; @@ -1618,7 +548,7 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp const @type@ ip1c = ip1[0]; for(i = 0; i < peel; i++) { - op1[i] = ip1c @OP@ ip2[i]; + op1[i] = ip1c * ip2[i]; } { @@ -1629,7 +559,7 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp NPY_ASSUME_ALIGNED(ip2_aligned, 64) for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1c @OP@ ip2_aligned[j]; + op1_shifted[j] = ip1c * ip2_aligned[j]; } i = blocked_end; @@ -1637,18 +567,17 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp } for(; i < n; i++) { - op1[i] = ip1c @OP@ ip2[i]; + op1[i] = ip1c * ip2[i]; } } - } else if(IS_BINARY_CONT_S2(@type@, @type@)) { -#if @SUPPORTED_BY_VML@ - if(dimensions[0] > VML_ASM_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)) ) { + } + else if (IS_BINARY_CONT_S2(@type@, @type@)) { + if (dimensions[0] > VML_ASM_THRESHOLD && + DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@))) { CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[0], args[2], *(@type@*)args[1], 0.0, 0.0, 1.0); /* v@s@LinearFrac(dimensions[0], (@type@*) args[0], (@type@*) args[0], *(@type@*)args[1], 0.0, 0.0, 1.0, (@type@*) args[2]); */ - } else -#endif - { + } + else { @type@ *ip1 = (@type@*)args[0]; @type@ *ip2 = (@type@*)args[1]; @type@ *op1 = (@type@*)args[2]; @@ -1660,7 +589,7 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp const @type@ ip2c = ip2[0]; for(i = 0; i < peel; i++) { - op1[i] = ip1[i] @OP@ ip2c; + op1[i] = ip1[i] * ip2c; } { @@ -1671,7 +600,7 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp NPY_ASSUME_ALIGNED(ip1_aligned, 64) for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2c; + op1_shifted[j] = ip1_aligned[j] * ip2c; } i = blocked_end; @@ -1679,51 +608,36 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp } for(; i < n; i++) { - op1[i] = ip1[i] @OP@ ip2c; + op1[i] = ip1[i] * ip2c; } } - } else if (IS_BINARY_REDUCE) { -#if @PW@ - @type@ * iop1 = (@type@ *)args[0]; - npy_intp n = dimensions[0]; - - *iop1 @OP@= pairwise_sum_@TYPE@(args[1], n, steps[1]); -#else + } + else if (IS_BINARY_REDUCE) { BINARY_REDUCE_LOOP(@type@) { - io1 @OP@= *(@type@ *)ip2; + io1 *= *(@type@ *)ip2; } *((@type@ *)iop1) = io1; -#endif - } else { + } + else { BINARY_LOOP { const @type@ in1 = *(@type@ *)ip1; const @type@ in2 = *(@type@ *)ip2; - *((@type@ *)op1) = in1 @OP@ in2; + *((@type@ *)op1) = in1 * in2; } } } -/**end repeat1**/ -/**begin repeat1 - * Arithmetic - * # kind = divide# - * # OP = /# - * # PW = 0# - * # VML = Div# - */ void -mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) +mkl_umath_@TYPE@_divide(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_BINARY_CONT(@type@, @type@)) { -#if @SUPPORTED_BY_VML@ - if(dimensions[0] > VML_D_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)) && - DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@)) ) { - CHUNKED_VML_CALL3(v@s@@VML@, dimensions[0], @type@, args[0], args[1], args[2]); - /* v@s@@VML@(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */ - } else -#endif - { + if (IS_BINARY_CONT(@type@, @type@)) { + if (dimensions[0] > VML_D_THRESHOLD && + DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)) && + DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) { + CHUNKED_VML_CALL3(v@s@Div, dimensions[0], @type@, args[0], args[1], args[2]); + /* v@s@Div(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */ + } + else { @type@ *ip1 = (@type@*)args[0]; @type@ *ip2 = (@type@*)args[1]; @type@ *op1 = (@type@*)args[2]; @@ -1735,7 +649,7 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp NPY_PRAGMA_NOVECTOR for(i = 0; i < peel; i++) { - op1[i] = ip1[i] @OP@ ip2[i]; + op1[i] = ip1[i] / ip2[i]; } { @@ -1743,21 +657,24 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp j_max &= (~0xf); const npy_intp blocked_end = j_max + peel; if (j_max > 0) { - @type@ *ip1_aligned = ip1 + peel, *op1_shifted = op1 + peel, *ip2_shifted = ip2 + peel; - - if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && - DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - NPY_PRAGMA_VECTOR - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } else { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } + @type@ *ip1_aligned = ip1 + peel; + @type@ *op1_shifted = op1 + peel; + @type@ *ip2_shifted = ip2 + peel; + + if (DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && + DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1)) { + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + NPY_PRAGMA_VECTOR + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] / ip2_shifted[j]; + } + } + else { + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] / ip2_shifted[j]; + } + } i = blocked_end; } @@ -1765,10 +682,11 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp NPY_PRAGMA_NOVECTOR for(; i < n; i++) { - op1[i] = ip1[i] @OP@ ip2[i]; + op1[i] = ip1[i] / ip2[i]; } } - } else if(IS_BINARY_CONT_S1(@type@, @type@)) { + } + else if (IS_BINARY_CONT_S1(@type@, @type@)) { @type@ *ip1 = (@type@*)args[0]; @type@ *ip2 = (@type@*)args[1]; @type@ *op1 = (@type@*)args[2]; @@ -1781,7 +699,7 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp const @type@ ip1c = ip1[0]; NPY_PRAGMA_NOVECTOR for(i = 0; i < peel; i++) { - op1[i] = ip1c @OP@ ip2[i]; + op1[i] = ip1c / ip2[i]; } { @@ -1791,7 +709,7 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp NPY_ASSUME_ALIGNED(ip2_aligned, 64) for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1c @OP@ ip2_aligned[j]; + op1_shifted[j] = ip1c / ip2_aligned[j]; } i = blocked_end; @@ -1800,9 +718,10 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp NPY_PRAGMA_NOVECTOR for(; i < n; i++) { - op1[i] = ip1c @OP@ ip2[i]; + op1[i] = ip1c / ip2[i]; } - } else if(IS_BINARY_CONT_S2(@type@, @type@)) { + } + else if (IS_BINARY_CONT_S2(@type@, @type@)) { @type@ *ip1 = (@type@*)args[0]; @type@ *ip2 = (@type@*)args[1]; @type@ *op1 = (@type@*)args[2]; @@ -1815,48 +734,304 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp const @type@ ip2c = ip2[0]; NPY_PRAGMA_NOVECTOR for(i = 0; i < peel; i++) { - op1[i] = ip1[i] @OP@ ip2c; + op1[i] = ip1[i] / ip2c; + } + + { + npy_intp j, j_max = blocked_end - peel; + if (j_max > 0) { + @type@ *ip1_aligned = ip1 + peel, *op1_shifted = op1 + peel; + + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] / ip2c; + } + + i = blocked_end; + } + } + + NPY_PRAGMA_NOVECTOR + for(; i < n; i++) { + op1[i] = ip1[i] / ip2c; + } + } + else if (IS_BINARY_REDUCE) { + BINARY_REDUCE_LOOP(@type@) { + io1 /= *(@type@ *)ip2; + } + *((@type@ *)iop1) = io1; + } + else { + BINARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + *((@type@ *)op1) = in1 / in2; + } + } +} + +/**begin repeat1 + * #kind = fmax, fmin# + * #VML = Fmax, Fmin# + * #OP = >=, <=# + **/ +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) +{ + const int contig = IS_BINARY_CONT(@type@, @type@); + const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)); + const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same1 && disjoint_or_same2; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { + CHUNKED_VML_CALL3(v@s@@VML@, dimensions[0], @type@, args[0], args[1], args[2]); + /* v@s@@VML@(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */ + } + else { + if (IS_BINARY_REDUCE) { + BINARY_REDUCE_LOOP(@type@) { + const @type@ in2 = *(@type@ *)ip2; + /* Order of operations important for MSVC 2015 */ + io1 = (io1 @OP@ in2 || isnan(in2)) ? io1 : in2; + } + *((@type@ *)iop1) = io1; + } + else { + BINARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + /* Order of operations important for MSVC 2015 */ + *((@type@ *)op1) = (in1 @OP@ in2 || isnan(in2)) ? in1 : in2; + } + } + feclearexcept(FE_ALL_EXCEPT); /* clear floatstatus */ + } +} +/**end repeat1**/ + +/**begin repeat1 + * # kind = copysign, nextafter# + * # VML = CopySign, NextAfter# + */ +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) +{ + const int contig = IS_BINARY_CONT(@type@, @type@); + const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)); + const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same1 && disjoint_or_same2; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { + CHUNKED_VML_CALL3(v@s@@VML@, dimensions[0], @type@, args[0], args[1], args[2]); + /* v@s@@VML@(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */ + } + else { + BINARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + *((@type@ *)op1)= @kind@@c@(in1, in2); + } + } +} +/**end repeat1**/ + +void +mkl_umath_@TYPE@_remainder(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) +{ + const int contig = IS_BINARY_CONT(@type@, @type@); + const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)); + const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same1 && disjoint_or_same2; + int ignore_fpstatus = 0; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { + ignore_fpstatus = 1; + CHUNKED_VML_CALL3(v@s@Remainder, dimensions[0], @type@, args[0], args[1], args[2]); + /* v@s@Remainder(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */ + } + else { + BINARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + int invalid_cases = !npy_isnan(in1) && in2 == 0; + invalid_cases |= (in1 == NPY_INFINITY || in1 == -NPY_INFINITY) && !npy_isnan(in2); + invalid_cases |= (in1 != NPY_INFINITY && in1 != -NPY_INFINITY) && (in2 == NPY_INFINITY || in2 == -NPY_INFINITY); + ignore_fpstatus |= invalid_cases; + divmod@c@(in1, in2, (@type@ *)op1); } + } + if (ignore_fpstatus) { + feclearexcept(FE_UNDERFLOW | FE_INVALID); + } +} + +/**begin repeat1 + * # kind = cos, sin, tan, arccos, arcsin, arctan, cosh, sinh, tanh, arccosh, arcsinh, arctanh, fabs, floor, ceil, rint, trunc, cbrt, sqrt, expm1, log, log1p, log10# + * # func = cos, sin, tan, acos, asin, atan, cosh, sinh, tanh, acosh, asinh, atanh, fabs, floor, ceil, rint, trunc, cbrt, sqrt, expm1, log, log1p, log10# + * # VML = Cos, Sin, Tan, Acos, Asin, Atan, Cosh, Sinh, Tanh, Acosh, Asinh, Atanh, Abs, Floor, Ceil, Rint, Trunc, Cbrt, Sqrt, Expm1, Ln, Log1p, Log10# + */ +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) +{ + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { + CHUNKED_VML_CALL2(v@s@@VML@, dimensions[0], @type@, args[0], args[1]); + /* v@s@@VML@(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ + } + else { + UNARY_LOOP_DISPATCH( + @type@, @type@ + , + can_vectorize + , + const @type@ in1 = *(@type@ *)ip1; + *(@type@ *)op1 = @func@@c@(in1); + ) + } +} +/**end repeat1**/ + +/**begin repeat1 + * # kind = exp, exp2# + * # VML = Exp, Exp2# + */ +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) +{ + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + int ignore_fpstatus = 0; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { + ignore_fpstatus = 1; + CHUNKED_VML_CALL2(v@s@@VML@, dimensions[0], @type@, args[0], args[1]); + /* v@s@Exp(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ + } + else { + UNARY_LOOP_DISPATCH( + @type@, @type@ + , + can_vectorize + , + const @type@ in1 = *(@type@ *)ip1; + const int invalid_cases = npy_isnan(in1) || in1 == NPY_INFINITY || in1 == -NPY_INFINITY; + ignore_fpstatus |= invalid_cases; + *(@type@ *)op1 = @kind@@c@(in1); + ) + } + if (ignore_fpstatus) { + feclearexcept(FE_OVERFLOW | FE_UNDERFLOW | FE_INVALID); + } +} +/**end repeat1**/ + +void +mkl_umath_@TYPE@_log2(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) +{ + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + int ignore_fpstatus = 0; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { + ignore_fpstatus = 1; + CHUNKED_VML_CALL2(v@s@Log2, dimensions[0], @type@, args[0], args[1]); + /* v@s@Log2(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ + } + else { + UNARY_LOOP_DISPATCH( + @type@, @type@ + , + can_vectorize + , + const @type@ in1 = *(@type@ *)ip1; + const int invalid_cases = in1 < 0 || in1 == 0 || npy_isnan(in1) || in1 == -NPY_INFINITY; + ignore_fpstatus |= invalid_cases; + *(@type@ *)op1 = log2@c@(in1); + ) + } + if (ignore_fpstatus) { + feclearexcept(FE_DIVBYZERO | FE_INVALID); + } +} + +void +mkl_umath_@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) +{ + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; - { - npy_intp j, j_max = blocked_end - peel; - if (j_max > 0) { - @type@ *ip1_aligned = ip1 + peel, *op1_shifted = op1 + peel; + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { + CHUNKED_VML_CALL2(v@s@Abs, dimensions[0], @type@, args[0], args[1]); + /* v@s@Abs(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ + } + else { + UNARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ tmp = in1 > 0 ? in1 : -in1; + /* add 0 to clear -0.0 */ + *((@type@ *)op1) = tmp + 0; + } + } + feclearexcept(FE_ALL_EXCEPT); /* clear floatstatus */ +} - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2c; - } +void +mkl_umath_@TYPE@_reciprocal(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)) +{ + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; - i = blocked_end; - } + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { + CHUNKED_VML_CALL2(v@s@Inv, dimensions[0], @type@, args[0], args[1]); + /* v@s@Inv(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ + } + else { + UNARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + *((@type@ *)op1) = 1/in1; } + } +} - NPY_PRAGMA_NOVECTOR - for(; i < n; i++) { - op1[i] = ip1[i] @OP@ ip2c; - } - } else if (IS_BINARY_REDUCE) { -#if @PW@ - @type@ * iop1 = (@type@ *)args[0]; - npy_intp n = dimensions[0]; +void +mkl_umath_@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)) +{ + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; - *iop1 @OP@= pairwise_sum_@TYPE@(args[1], n, steps[1]); -#else - BINARY_REDUCE_LOOP(@type@) { - io1 @OP@= *(@type@ *)ip2; - } - *((@type@ *)iop1) = io1; -#endif - } else { - BINARY_LOOP { + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { + CHUNKED_VML_CALL2(v@s@Sqr, dimensions[0], @type@, args[0], args[1]); + /* v@s@Sqr(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ + } + else { + UNARY_LOOP { const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((@type@ *)op1) = in1 @OP@ in2; + *((@type@ *)op1) = in1*in1; } } } -/**end repeat1**/ + +/* TODO: USE MKL */ +void +mkl_umath_@TYPE@_modf(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_TWO_OUT { + const @type@ in1 = *(@type@ *)ip1; + *((@type@ *)op1) = modf@c@(in1, (@type@ *)op2); + } +} /**begin repeat1 * #kind = equal, not_equal, less, less_equal, greater, greater_equal, @@ -1912,69 +1087,13 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp } /**end repeat1**/ -void -mkl_umath_@TYPE@_spacing(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - *((@type@ *)op1) = (spacing@c@(in1)); - } -} - -void -mkl_umath_@TYPE@_copysign(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ -#if @SUPPORTED_BY_VML@ - const int contig = IS_BINARY_CONT(@type@, @type@); - const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)); - const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same1 && disjoint_or_same2; - - if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { - CHUNKED_VML_CALL3(v@s@CopySign, dimensions[0], @type@, args[0], args[1], args[2]); - /* v@s@CopySign(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */ - } else -#endif - { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((@type@ *)op1)= copysign@c@(in1, in2); - } - } -} - -void -mkl_umath_@TYPE@_nextafter(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ -#if @SUPPORTED_BY_VML@ - const int contig = IS_BINARY_CONT(@type@, @type@); - const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)); - const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same1 && disjoint_or_same2; - - if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { - CHUNKED_VML_CALL3(v@s@NextAfter, dimensions[0], @type@, args[0], args[1], args[2]); - /* v@s@NextAfter(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */ - } else -#endif - { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((@type@ *)op1)= nextafter@c@(in1, in2); - } - } -} - /**begin repeat1 * #kind = maximum, minimum# - * #OP = >=, <=# + * #OP = >=, <=# **/ void mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - /* */ if (IS_BINARY_REDUCE) { { BINARY_REDUCE_LOOP(@type@) { @@ -1998,130 +1117,6 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp } /**end repeat1**/ -/**begin repeat1 - * #kind = fmax, fmin# - * #VML = Fmax, Fmin# - * #OP = >=, <=# - **/ -void -mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ -#if @SUPPORTED_BY_VML@ - const int contig = IS_BINARY_CONT(@type@, @type@); - const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)); - const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same1 && disjoint_or_same2; - - if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { - CHUNKED_VML_CALL3(v@s@@VML@, dimensions[0], @type@, args[0], args[1], args[2]); - /* v@s@@VML@(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */ - } else -#endif - { - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - io1 = (io1 @OP@ in2 || isnan(in2)) ? io1 : in2; - } - *((@type@ *)iop1) = io1; - } - else { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - *((@type@ *)op1) = (in1 @OP@ in2 || isnan(in2)) ? in1 : in2; - } - } - feclearexcept(FE_ALL_EXCEPT); /* clear floatstatus */ - } -} -/**end repeat1**/ - -void -mkl_umath_@TYPE@_remainder(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ -#if @SUPPORTED_BY_VML@ - const int contig = IS_BINARY_CONT(@type@, @type@); - const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)); - const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same1 && disjoint_or_same2; - int ignore_fpstatus = 0; - - if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { - ignore_fpstatus = 1; - CHUNKED_VML_CALL3(v@s@Remainder, dimensions[0], @type@, args[0], args[1], args[2]); - /* v@s@Remainder(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */ - } else -#endif - { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - int invalid_cases = !npy_isnan(in1) && in2 == 0; - invalid_cases |= (in1 == NPY_INFINITY || in1 == -NPY_INFINITY) && !npy_isnan(in2); - invalid_cases |= (in1 != NPY_INFINITY && in1 != -NPY_INFINITY) && (in2 == NPY_INFINITY || in2 == -NPY_INFINITY); - ignore_fpstatus |= invalid_cases; - divmod@c@(in1, in2, (@type@ *)op1); - } - } - if(ignore_fpstatus) { - feclearexcept(FE_UNDERFLOW | FE_INVALID); - } -} - -void -mkl_umath_@TYPE@_divmod(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) -{ - BINARY_LOOP_TWO_OUT { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((@type@ *)op1) = divmod@c@(in1, in2, (@type@ *)op2); - } -} - -void -mkl_umath_@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)) -{ -#if @SUPPORTED_BY_VML@ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { - CHUNKED_VML_CALL2(v@s@Sqr, dimensions[0], @type@, args[0], args[1]); - /* v@s@Sqr(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else -#endif - { - UNARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - *((@type@ *)op1) = in1*in1; - } - } -} - -void -mkl_umath_@TYPE@_reciprocal(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)) -{ -#if @SUPPORTED_BY_VML@ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { - CHUNKED_VML_CALL2(v@s@Inv, dimensions[0], @type@, args[0], args[1]); - /* v@s@Inv(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else -#endif - { - UNARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - *((@type@ *)op1) = 1/in1; - } - } -} void mkl_umath_@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) @@ -2133,27 +1128,12 @@ mkl_umath_@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_in } void -mkl_umath_@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) +mkl_umath_@TYPE@_spacing(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { -#if @SUPPORTED_BY_VML@ - const int contig = IS_UNARY_CONT(@type@, @type@); - const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); - const int can_vectorize = contig && disjoint_or_same; - - if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { - CHUNKED_VML_CALL2(v@s@Abs, dimensions[0], @type@, args[0], args[1]); - /* v@s@Abs(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else -#endif - { - UNARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ tmp = in1 > 0 ? in1 : -in1; - /* add 0 to clear -0.0 */ - *((@type@ *)op1) = tmp + 0; - } + UNARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + *((@type@ *)op1) = (spacing@c@(in1)); } - feclearexcept(FE_ALL_EXCEPT); /* clear floatstatus */ } void @@ -2186,13 +1166,13 @@ mkl_umath_@TYPE@_sign(char **args, const npy_intp *dimensions, const npy_intp *s } } -/* TODO: USE MKL */ void -mkl_umath_@TYPE@_modf(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) +mkl_umath_@TYPE@_divmod(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - UNARY_LOOP_TWO_OUT { + BINARY_LOOP_TWO_OUT { const @type@ in1 = *(@type@ *)ip1; - *((@type@ *)op1) = modf@c@(in1, (@type@ *)op2); + const @type@ in2 = *(@type@ *)ip2; + *((@type@ *)op1) = divmod@c@(in1, in2, (@type@ *)op2); } } @@ -2276,7 +1256,6 @@ mkl_umath_@TYPE@_ldexp_long(char **args, const npy_intp *dimensions, const npy_i } } #endif - /**end repeat**/ /* @@ -2316,8 +1295,7 @@ mkl_umath_@TYPE@_ldexp_long(char **args, const npy_intp *dimensions, const npy_i #endif #endif static void -pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n, - npy_intp stride) +pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n, npy_intp stride) { assert(n % 2 == 0); if (n < 8) { @@ -2389,7 +1367,6 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n, /* TODO: USE MKL */ /**begin repeat1 - * arithmetic * #kind = add, subtract# * #OP = +, -# * #PW = 1, 0# @@ -2566,7 +1543,8 @@ mkl_umath_@TYPE@_reciprocal(char **args, const npy_intp *dimensions, const npy_i const @ftype@ d = in1r + in1i*r; ((@ftype@ *)op1)[0] = 1/d; ((@ftype@ *)op1)[1] = -r/d; - } else { + } + else { const @ftype@ r = in1r/in1i; const @ftype@ d = in1r*r + in1i; ((@ftype@ *)op1)[0] = r/d; @@ -2581,10 +1559,11 @@ mkl_umath_@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_in const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); const int can_vectorize = contig && disjoint_or_same; - if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { CHUNKED_VML_CALL2(v@s@Conj, dimensions[0], @type@, args[0], args[1]); /* v@s@Conj(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { + } + else { UNARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; const @ftype@ in1i = ((@ftype@ *)ip1)[1]; @@ -2602,18 +1581,19 @@ mkl_umath_@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_int const int can_vectorize = contig && disjoint_or_same; int ignore_fpstatus = 0; - if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { ignore_fpstatus = 1; CHUNKED_VML_CALL2(v@s@Abs, dimensions[0], @type@, args[0], args[1]); /* v@s@Abs(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ - } else { + } + else { UNARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; const @ftype@ in1i = ((@ftype@ *)ip1)[1]; *((@ftype@ *)op1) = hypot@c@(in1r, in1i); } } - if(ignore_fpstatus) { + if (ignore_fpstatus) { feclearexcept(FE_INVALID); } } @@ -2660,12 +1640,12 @@ mkl_umath_@TYPE@_sign(char **args, const npy_intp *dimensions, const npy_intp *s UNARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; const @ftype@ in1i = ((@ftype@ *)ip1)[1]; - ((@ftype@ *)op1)[0] = CGT(in1r, in1i, 0.0, 0.0) ? 1 : + ((@ftype@ *)op1)[0] = CGT(in1r, in1i, 0.0, 0.0) ? 1 : (CLT(in1r, in1i, 0.0, 0.0) ? -1 : - (CEQ(in1r, in1i, 0.0, 0.0) ? 0 : NPY_NAN@C@)); + (CEQ(in1r, in1i, 0.0, 0.0) ? 0 : NPY_NAN@C@)); ((@ftype@ *)op1)[1] = 0; } - feclearexcept(FE_INVALID); + feclearexcept(FE_INVALID); #endif } diff --git a/mkl_umath/src/mkl_umath_loops.h.src b/mkl_umath/src/mkl_umath_loops.h.src index be25745..71446ae 100644 --- a/mkl_umath/src/mkl_umath_loops.h.src +++ b/mkl_umath/src/mkl_umath_loops.h.src @@ -53,214 +53,19 @@ * #TYPE = FLOAT, DOUBLE# */ -MKL_UMATH_API -void -mkl_umath_@TYPE@_sqrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_exp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_exp2(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_expm1(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_log(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_log2(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_log10(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_log1p(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_cos(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_sin(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_tan(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_arccos(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_arcsin(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_arctan(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_cosh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_sinh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_tanh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_arccosh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_arcsinh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_arctanh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_fabs(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_floor(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_ceil(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_rint(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_trunc(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_cbrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - /**begin repeat1 - * Arithmetic - * # kind = add, subtract, multiply, divide# + * # kind = absolute, add, arccos, arccosh, arcsin, arcsinh, arctan, arctanh, cbrt, ceil, conjugate, + copysign, cos, cosh , divide, divmod, isfinite, isinf, isnan, equal, exp, exp2, expm1, fabs, + floor, fmax, fmin, frexp, greater, greater_equal, ldexp, less, less_equal, log, log2, log10, + log1p, logical_and, logical_not, logical_or, logical_xor, maximum, minimum, multiply, modf, + negative, nextafter, not_equal, positive, reciprocal, remainder, rint, sign, signbit, sin, + sinh, spacing, sqrt, square, subtract, tan, tanh, trunc# */ MKL_UMATH_API void mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); /**end repeat1**/ -/**begin repeat1 - * Arithmetic - * # kind = equal, not_equal, less, less_equal, greater, greater_equal, - * logical_and, logical_or# - */ -MKL_UMATH_API -void -mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -MKL_UMATH_API -void -mkl_umath_@TYPE@_logical_xor(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_logical_not(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -/**begin repeat1 - * #kind = isnan, isinf, isfinite, signbit# - **/ -MKL_UMATH_API -void -mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -MKL_UMATH_API -void -mkl_umath_@TYPE@_spacing(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - - -MKL_UMATH_API -void -mkl_umath_@TYPE@_copysign(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_nextafter(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -/**begin repeat1 - * #kind = maximum, minimum, fmax, fmin# - **/ -MKL_UMATH_API -void -mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -MKL_UMATH_API -void -mkl_umath_@TYPE@_remainder(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_divmod(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_reciprocal(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_negative(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_positive(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_sign(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_modf(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_frexp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_ldexp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - #ifdef USE_NUMPY_2 MKL_UMATH_API void @@ -292,57 +97,9 @@ mkl_umath_@TYPE@_ldexp_long(char **args, const npy_intp *dimensions, const npy_i */ /**begin repeat1 - * arithmetic - * #kind = add, subtract# - */ -MKL_UMATH_API -void -mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -MKL_UMATH_API -void -mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_divide(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); - - -/**begin repeat1 - * arithmetic - * #kind = greater, greater_equal, less, less_equal, equal, - not_equal, logical_and, logical_or, logical_xor, logical_not, - isnan, isinf, isfinite# - */ -MKL_UMATH_API -void -mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -MKL_UMATH_API -void -mkl_umath_@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_reciprocal(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); - -MKL_UMATH_API -void -mkl_umath_@TYPE@_sign(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); - -/**begin repeat1 - * arithmetic - * #kind = maximum, minimum, fmax, fmin# + * # kind = absolute, add, conjugate, divide, isfinite, isinf, isnan, equal, fmax, fmin, greater, greater_equal, + less, less_equal, logical_and, logical_not, logical_or, logical_xor, maximum, minimum, multiply, + not_equal, reciprocal, sign, square, subtract# */ MKL_UMATH_API void diff --git a/mkl_umath/tests/test_basic.py b/mkl_umath/tests/test_basic.py index af30e67..9c2133f 100644 --- a/mkl_umath/tests/test_basic.py +++ b/mkl_umath/tests/test_basic.py @@ -60,6 +60,7 @@ def get_args(args_str, size, low, high): for umath in umaths: mkl_umath = getattr(mu, umath) types = mkl_umath.types + size_mkl = 8192 + 1 for type_ in types: args_str = type_[:type_.find("->")] if umath in ["arccos", "arcsin", "arctanh"]: @@ -70,9 +71,12 @@ def get_args(args_str, size, low, high): low = 1; high = 10 else: low = -10; high = 10 - # when size > 8192, mkl is used (if supported), + + if umath in ["add", "subtract", "multiply"]: + size_mkl = 10**5 + 1 + # when size > size_mkl, mkl is used (if supported), # otherwise it falls back on numpy algorithm - args_mkl = get_args(args_str, 8200, low, high) + args_mkl = get_args(args_str, size_mkl, low, high) args_fall_back = get_args(args_str, 100, low, high) mkl_cases[(umath, type_)] = args_mkl fall_back_cases[(umath, type_)] = args_fall_back @@ -93,7 +97,7 @@ def test_mkl_umath(case): mkl_res = mkl_umath(*args) np_res = np_umath(*args) - assert np.allclose(mkl_res, np_res), f"Results for '{umath}': mkl_res: {mkl_res}, np_res: {np_res}" + assert np.allclose(mkl_res, np_res), f"Results for '{umath}' do not match" @pytest.mark.parametrize("case", test_fall_back, ids=get_id) @@ -105,8 +109,64 @@ def test_fall_back_umath(case): mkl_res = mkl_umath(*args) np_res = np_umath(*args) - - assert np.allclose(mkl_res, np_res), f"Results for '{umath}': mkl_res: {mkl_res}, np_res: {np_res}" + + assert np.allclose(mkl_res, np_res), f"Results for '{umath}' do not match" + + +@pytest.mark.parametrize("func", ["add", "subtract", "multiply", "divide"]) +@pytest.mark.parametrize("size", [10**5 + 1, 100]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_scalar(func, size, dtype): + a = np.random.uniform(-10, 10, size).astype(dtype) + + # testing implementation in IS_BINARY_CONT_S1 branch + mkl_res = getattr(mu, func)(a, 1.0) + np_res = getattr(np, func)(a, 1.0) + assert np.allclose(mkl_res, np_res), f"Results for '{func}(array, scalar)' do not match" + + # testing implementation in IS_BINARY_CONT_S2 branch + mkl_res = getattr(mu, func)(1.0, a) + np_res = getattr(np, func)(1.0, a) + assert np.allclose(mkl_res, np_res), f"Results for '{func}(scalar, array)' do not match" + + +@pytest.mark.parametrize("func", ["add", "subtract", "multiply", "divide"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_strided(func, dtype): + # testing implementation in rthe final else branch + a = np.random.uniform(-10, 10, 100)[::2].astype(dtype) + b = np.random.uniform(-10, 10, 100)[::2].astype(dtype) + + mkl_res = getattr(mu, func)(a, b) + np_res = getattr(np, func)(a, b) + assert np.allclose(mkl_res, np_res), f"Results for '{func}[strided]' do not match" + + +@pytest.mark.parametrize("func", ["add", "subtract", "multiply", "divide", "maximum", "minimum", "fmax", "fmin"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_reduce_float(func, dtype): + # testing implementation in IS_BINARY_REDUCE branch + a = np.random.uniform(-10, 10, 50).astype(dtype) + mkl_func = getattr(mu, func) + np_func = getattr(np, func) + + mkl_res = mkl_func.reduce(a) + np_res = np_func.reduce(a) + assert np.allclose(mkl_res, np_res), f"Results for '{func}[reduce]' do not match" + + +@pytest.mark.parametrize("func", ["add", "subtract"]) +@pytest.mark.parametrize("dtype", [np.complex64, np.complex128]) +def test_reduce_complex(func, dtype): + # testing implementation in IS_BINARY_REDUCE branch + a = np.random.uniform(-10, 10, 100) + 1j * np.random.uniform(-10, 10, 100) + a = a.astype(dtype) + mkl_func = getattr(mu, func) + np_func = getattr(np, func) + + mkl_res = mkl_func.reduce(a) + np_res = np_func.reduce(a) + assert np.allclose(mkl_res, np_res), f"Results for '{func}[reduce]' do not match" def test_patch(): From dd40c01cddb1b945668448c980ad9db5bb701367 Mon Sep 17 00:00:00 2001 From: Vahid Tavanashad <120411540+vtavana@users.noreply.github.com> Date: Mon, 4 Aug 2025 12:37:56 -0500 Subject: [PATCH 2/2] Update mkl_umath/src/mkl_umath_loops.h.src --- mkl_umath/src/mkl_umath_loops.h.src | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkl_umath/src/mkl_umath_loops.h.src b/mkl_umath/src/mkl_umath_loops.h.src index 71446ae..286f301 100644 --- a/mkl_umath/src/mkl_umath_loops.h.src +++ b/mkl_umath/src/mkl_umath_loops.h.src @@ -55,7 +55,7 @@ /**begin repeat1 * # kind = absolute, add, arccos, arccosh, arcsin, arcsinh, arctan, arctanh, cbrt, ceil, conjugate, - copysign, cos, cosh , divide, divmod, isfinite, isinf, isnan, equal, exp, exp2, expm1, fabs, + copysign, cos, cosh, divide, divmod, isfinite, isinf, isnan, equal, exp, exp2, expm1, fabs, floor, fmax, fmin, frexp, greater, greater_equal, ldexp, less, less_equal, log, log2, log10, log1p, logical_and, logical_not, logical_or, logical_xor, maximum, minimum, multiply, modf, negative, nextafter, not_equal, positive, reciprocal, remainder, rint, sign, signbit, sin,