Commit 1082661

Merge pull request numpy#27027 from seberg/sane-simd-steps

BUG: Fix simd loadable stride logic

2 parents 8351d3f + e27f419, commit 1082661

12 files changed: +173 -135 lines changed

numpy/_core/src/common/simd/simd.h

Lines changed: 30 additions & 9 deletions

@@ -1,5 +1,7 @@
 #ifndef _NPY_SIMD_H_
 #define _NPY_SIMD_H_
+
+#include <stdalign.h> /* for alignof until C23 */
 /**
  * the NumPy C SIMD vectorization interface "NPYV" are types and functions intended
  * to simplify vectorization of code on different platforms, currently supports
@@ -123,18 +125,19 @@ typedef double npyv_lanetype_f64;
  * acceptable limit of strides before using any of non-contiguous load/store intrinsics.
  *
  * For instance:
- *    npy_intp ld_stride = step[0] / sizeof(float);
- *    npy_intp st_stride = step[1] / sizeof(float);
  *
- *    if (npyv_loadable_stride_f32(ld_stride) && npyv_storable_stride_f32(st_stride)) {
+ *    if (npyv_loadable_stride_f32(steps[0]) && npyv_storable_stride_f32(steps[1])) {
+ *        // Strides are now guaranteed to be a multiple and compatible
+ *        npy_intp ld_stride = steps[0] / sizeof(float);
+ *        npy_intp st_stride = steps[1] / sizeof(float);
 *        for (;;)
 *            npyv_f32 a = npyv_loadn_f32(ld_pointer, ld_stride);
 *            // ...
 *            npyv_storen_f32(st_pointer, st_stride, a);
 *    }
 *    else {
 *        for (;;)
- *            // C scalars
+ *            // C scalars, use byte steps/strides.
 *    }
 */
 #ifndef NPY_SIMD_MAXLOAD_STRIDE32
@@ -149,11 +152,29 @@ typedef double npyv_lanetype_f64;
 #ifndef NPY_SIMD_MAXSTORE_STRIDE64
 #define NPY_SIMD_MAXSTORE_STRIDE64 0
 #endif
-#define NPYV_IMPL_MAXSTRIDE(SFX, MAXLOAD, MAXSTORE) \
-    NPY_FINLINE int npyv_loadable_stride_##SFX(npy_intp stride) \
-    { return MAXLOAD > 0 ? llabs(stride) <= MAXLOAD : 1; } \
-    NPY_FINLINE int npyv_storable_stride_##SFX(npy_intp stride) \
-    { return MAXSTORE > 0 ? llabs(stride) <= MAXSTORE : 1; }
+#define NPYV_IMPL_MAXSTRIDE(SFX, MAXLOAD, MAXSTORE) \
+    NPY_FINLINE int \
+    npyv_loadable_stride_##SFX(npy_intp stride) \
+    { \
+        if (alignof(npyv_lanetype_##SFX) != sizeof(npyv_lanetype_##SFX) && \
+                stride % sizeof(npyv_lanetype_##SFX) != 0) { \
+            /* stride not a multiple of itemsize, cannot handle. */ \
+            return 0; \
+        } \
+        stride = stride / sizeof(npyv_lanetype_##SFX); \
+        return MAXLOAD > 0 ? llabs(stride) <= MAXLOAD : 1; \
+    } \
+    NPY_FINLINE int \
+    npyv_storable_stride_##SFX(npy_intp stride) \
+    { \
+        if (alignof(npyv_lanetype_##SFX) != sizeof(npyv_lanetype_##SFX) && \
+                stride % sizeof(npyv_lanetype_##SFX) != 0) { \
+            /* stride not a multiple of itemsize, cannot handle. */ \
+            return 0; \
+        } \
+        stride = stride / sizeof(npyv_lanetype_##SFX); \
+        return MAXSTORE > 0 ? llabs(stride) <= MAXSTORE : 1; \
+    }
 #if NPY_SIMD
     NPYV_IMPL_MAXSTRIDE(u32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32)
    NPYV_IMPL_MAXSTRIDE(s32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32)
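
The heart of the fix is visible in the NPYV_IMPL_MAXSTRIDE rewrite above: npyv_loadable_stride_*/npyv_storable_stride_* now receive the raw byte stride, reject anything that is not a multiple of the itemsize, and only then divide down to an element stride. Previously every call site divided first, and C integer division truncates silently, so a byte step that was not a multiple of the itemsize produced a plausible-looking but wrong element stride instead of a rejection. A minimal standalone sketch of that hazard (plain C, no NumPy headers; the values are illustrative only):

    #include <stdio.h>

    int main(void)
    {
        /* a byte step that is not a multiple of the 8-byte double itemsize,
         * as can occur on platforms where alignof(double) < sizeof(double) */
        long long byte_step = 12;
        long long elem_stride = byte_step / (long long)sizeof(double);

        /* prints: byte step 12 -> element stride 1 (remainder 4 silently lost) */
        printf("byte step %lld -> element stride %lld (remainder %lld silently lost)\n",
               byte_step, elem_stride, byte_step % (long long)sizeof(double));
        /* old call sites fed a stride like this to npyv_loadn_* and gathered
         * from the wrong addresses; the new checkers take byte_step itself
         * and simply return 0 here */
        return 0;
    }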

numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src

Lines changed: 18 additions & 21 deletions

@@ -346,14 +346,17 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
         && __apple_build_version__ < 14030000
     goto loop_scalar;
 #endif // end affected Apple clang.
+
     if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) ||
         is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) ||
-        b_sdst % sizeof(@ftype@) != 0 || b_sdst == 0 ||
-        b_ssrc0 % sizeof(@ftype@) != 0 ||
-        b_ssrc1 % sizeof(@ftype@) != 0
+        !npyv_loadable_stride_@sfx@(b_ssrc0) ||
+        !npyv_loadable_stride_@sfx@(b_ssrc1) ||
+        !npyv_storable_stride_@sfx@(b_sdst) ||
+        b_sdst == 0
     ) {
         goto loop_scalar;
     }
+
     const @ftype@ *src0 = (@ftype@*)b_src0;
     const @ftype@ *src1 = (@ftype@*)b_src1;
     @ftype@ *dst = (@ftype@*)b_dst;
@@ -366,10 +369,6 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
     const int wstep = vstep * 2;
     const int hstep = vstep / 2;
 
-    const int loadable0 = npyv_loadable_stride_s64(ssrc0);
-    const int loadable1 = npyv_loadable_stride_s64(ssrc1);
-    const int storable = npyv_storable_stride_s64(sdst);
-
     // lots**lots of specializations, to squeeze out max performance
     // contig
     if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) {
@@ -414,7 +413,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
             }
         }
         // non-contig
-        else if (loadable1 && storable) {
+        else {
             for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) {
                 npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1);
                 npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1);
@@ -433,9 +432,6 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
                 npyv_storen2_till_@sfx@(dst, sdst, len, r);
             }
         }
-        else {
-            goto loop_scalar;
-        }
     }
     // scalar 1
     else if (ssrc1 == 0) {
@@ -460,7 +456,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
             }
         }
         // non-contig
-        else if (loadable0 && storable) {
+        else {
             for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) {
                 npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0);
                 npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0);
@@ -479,13 +475,10 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
                 npyv_storen2_till_@sfx@(dst, sdst, len, r);
             }
         }
-        else {
-            goto loop_scalar;
-        }
     }
 #if @is_mul@
     // non-contig
-    else if (loadable0 && loadable1 && storable) {
+    else {
         for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep,
                              src1 += ssrc1*vstep, dst += sdst*vstep
         ) {
@@ -512,12 +505,16 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
             npyv_storen2_till_@sfx@(dst, sdst, len, r);
         }
     }
-#endif
+#else /* @is_mul@ */
     else {
+        // Only multiply is vectorized for the generic non-contig case.
        goto loop_scalar;
     }
+#endif /* @is_mul@ */
+
     npyv_cleanup();
     return;
+
 loop_scalar:
 #endif
     for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) {
@@ -580,8 +577,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
     npy_intp b_ssrc = steps[0], b_sdst = steps[1];
 #if @VECTOR@
     if (is_mem_overlap(b_src, b_ssrc, b_dst, b_sdst, len) ||
-        b_sdst % sizeof(@ftype@) != 0 ||
-        b_ssrc % sizeof(@ftype@) != 0
+        !npyv_loadable_stride_@sfx@(b_ssrc) ||
+        !npyv_storable_stride_@sfx@(b_sdst)
     ) {
         goto loop_scalar;
     }
@@ -609,7 +606,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
             npyv_store2_till_@sfx@(dst, len, r);
         }
     }
-    else if (ssrc == 2 && npyv_storable_stride_s64(sdst)) {
+    else if (ssrc == 2) {
         for (; len >= vstep; len -= vstep, src += wstep, dst += sdst*vstep) {
             npyv_@sfx@ a0 = npyv_load_@sfx@(src);
             npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep);
@@ -624,7 +621,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
             npyv_storen2_till_@sfx@(dst, sdst, len, r);
        }
     }
-    else if (sdst == 2 && npyv_loadable_stride_s64(ssrc)) {
+    else if (sdst == 2) {
         for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += wstep) {
             npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src, ssrc);
            npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src + ssrc*hstep, ssrc);
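
Because the checkers now accept byte strides, this loop validates everything once in the gate at the top, and the per-branch loadable0/loadable1/storable guards with their goto loop_scalar fallbacks become dead weight and are deleted. A rough standalone sketch of the control-flow change, using hypothetical stand-ins (stride_ok, simd_contig, simd_strided, scalar_loop) rather than the real template expansion:

    #include <stdio.h>
    #include <stdlib.h>

    /* hypothetical stand-in for npyv_loadable/storable_stride_*: multiple of
     * itemsize and within an arbitrary gather/scatter limit */
    static int stride_ok(long long byte_stride)
    {
        return byte_stride % (long long)sizeof(float) == 0 && llabs(byte_stride) <= 64;
    }
    static void scalar_loop(long long len)  { printf("scalar path, len=%lld\n", len); }
    static void simd_contig(long long len)  { printf("contig SIMD, len=%lld\n", len); }
    static void simd_strided(long long len) { printf("strided SIMD, len=%lld\n", len); }

    static void binary_loop(long long s0, long long s1, long long sd, long long len)
    {
        /* single up-front gate: every branch below may assume usable strides */
        if (!stride_ok(s0) || !stride_ok(s1) || !stride_ok(sd)) {
            scalar_loop(len);   /* was: re-checked per non-contig branch */
            return;
        }
        if (s0 == (long long)sizeof(float) && s1 == s0 && sd == s0) {
            simd_contig(len);
        }
        else {
            simd_strided(len);  /* no longer needs its own goto loop_scalar */
        }
    }

    int main(void)
    {
        binary_loop(4, 4, 4, 100);  /* contiguous */
        binary_loop(8, 4, 4, 100);  /* strided but usable */
        binary_loop(6, 4, 4, 100);  /* byte step 6 rejected up front */
        return 0;
    }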

numpy/_core/src/umath/loops_exponent_log.dispatch.c.src

Lines changed: 9 additions & 9 deletions

@@ -1315,16 +1315,16 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
 #if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
-    const npy_double *src = (npy_double*)args[0];
-    npy_double *dst = (npy_double*)args[1];
-    const int lsize = sizeof(src[0]);
-    const npy_intp ssrc = steps[0] / lsize;
-    const npy_intp sdst = steps[1] / lsize;
     const npy_intp len = dimensions[0];
-    assert(steps[0] % lsize == 0 && steps[1] % lsize == 0);
-    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
-        npyv_loadable_stride_f64(ssrc) &&
-        npyv_storable_stride_f64(sdst)) {
+
+    if (!is_mem_overlap(args[0], steps[0], args[1], steps[1], len) &&
+            npyv_loadable_stride_f64(steps[0]) &&
+            npyv_storable_stride_f64(steps[1])) {
+        const npy_double *src = (npy_double*)args[0];
+        npy_double *dst = (npy_double*)args[1];
+        const npy_intp ssrc = steps[0] / sizeof(src[0]);
+        const npy_intp sdst = steps[1] / sizeof(src[0]);
+
         simd_@func@_f64(src, ssrc, dst, sdst, len);
         return;
     }
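
Worth noting in this hunk: the deleted assert(steps[0] % lsize == 0 && steps[1] % lsize == 0) was the only guard against non-multiple steps, and asserts compile away in release builds (-DNDEBUG), so the division ran unconditionally in practice. The new code derives src/dst/ssrc/sdst only inside the branch where the byte steps have already passed the checkers. A tiny illustration of why the assert alone was no protection (hypothetical values):

    /* build with -DNDEBUG, as release builds are, and the assert vanishes */
    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        long long step = 12;                            /* not a multiple of 8 */
        assert(step % (long long)sizeof(double) == 0);  /* no-op under NDEBUG */
        long long stride = step / (long long)sizeof(double);
        printf("stride = %lld\n", stride);              /* prints 1: silently wrong */
        return 0;
    }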

numpy/_core/src/umath/loops_hyperbolic.dispatch.c.src

Lines changed: 15 additions & 16 deletions

@@ -9,6 +9,8 @@
 #include "simd/simd.h"
 #include "loops_utils.h"
 #include "loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
 
 #if NPY_SIMD_FMA3 // native support
 /*
@@ -608,32 +610,29 @@ simd_tanh_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, npy_in
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
-    const @type@ *src = (@type@*)args[0];
-    @type@ *dst = (@type@*)args[1];
-
-    const int lsize = sizeof(src[0]);
-    const npy_intp ssrc = steps[0] / lsize;
-    const npy_intp sdst = steps[1] / lsize;
-    npy_intp len = dimensions[0];
-    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
 #if @simd@
-    if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
-        !npyv_loadable_stride_@sfx@(ssrc) || !npyv_storable_stride_@sfx@(sdst)
+    npy_intp len = dimensions[0];
+
+    if (is_mem_overlap(args[0], steps[0], args[1], steps[1], len) ||
+        !npyv_loadable_stride_@sfx@(steps[0]) ||
+        !npyv_storable_stride_@sfx@(steps[1])
     ) {
-        for (; len > 0; --len, src += ssrc, dst += sdst) {
-            simd_@func@_@sfx@(src, 1, dst, 1, 1);
+        UNARY_LOOP {
+            simd_@func@_@sfx@((@type@ *)ip1, 1, (@type@ *)op1, 1, 1);
         }
     } else {
-        simd_@func@_@sfx@(src, ssrc, dst, sdst, len);
+        npy_intp ssrc = steps[0] / sizeof(@type@);
+        npy_intp sdst = steps[1] / sizeof(@type@);
+        simd_@func@_@sfx@((@type@ *)args[0], ssrc, (@type@ *)args[1], sdst, len);
     }
     npyv_cleanup();
 #if @simd_req_clear@
     npy_clear_floatstatus_barrier((char*)dimensions);
 #endif
 #else
-    for (; len > 0; --len, src += ssrc, dst += sdst) {
-        const @type@ src0 = *src;
-        *dst = npy_@func@@ssfx@(src0);
+    UNARY_LOOP {
+        const @type@ in1 = *(@type@ *)ip1;
+        *(@type@ *)op1 = npy_@func@@ssfx@(in1);
     }
 #endif
 }
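
The scalar fallbacks above now go through UNARY_LOOP from the newly included fast_loop_macros.h, which advances char * pointers by the raw byte steps, so it handles exactly the strides the SIMD checkers reject. A self-contained sketch of the pattern; the macro body here is paraphrased, so treat the exact definition as an assumption:

    #include <stdio.h>

    typedef long long npy_intp;  /* stand-in for the real typedef */

    /* roughly the shape of UNARY_LOOP in fast_loop_macros.h:
     * a byte-step pointer walk over input ip1 and output op1 */
    #define UNARY_LOOP \
        char *ip1 = args[0], *op1 = args[1]; \
        npy_intp is1 = steps[0], os1 = steps[1]; \
        npy_intp n = dimensions[0]; \
        npy_intp i; \
        for (i = 0; i < n; i++, ip1 += is1, op1 += os1)

    /* hypothetical scalar loop in the style of the fallbacks above */
    static void tanh_like_loop(char **args, npy_intp const *dimensions, npy_intp const *steps)
    {
        UNARY_LOOP {
            const double in1 = *(double *)ip1;
            *(double *)op1 = in1;  /* identity stands in for npy_tanh */
        }
    }

    int main(void)
    {
        double in[3] = {0.5, 1.5, 2.5}, out[3] = {0};
        char *args[2] = {(char *)in, (char *)out};
        npy_intp dims[1] = {3};
        npy_intp steps[2] = {sizeof(double), sizeof(double)};
        tanh_like_loop(args, dims, steps);
        printf("%g %g %g\n", out[0], out[1], out[2]);
        return 0;
    }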

numpy/_core/src/umath/loops_minmax.dispatch.c.src

Lines changed: 3 additions & 3 deletions

@@ -352,9 +352,9 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
     }
     // unroll scalars faster than non-contiguous vector load/store on Arm
 #if !defined(NPY_HAVE_NEON) && @is_fp@
-    if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
-        TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
-        TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+    if (TO_SIMD_SFX(npyv_loadable_stride)(is1) &&
+        TO_SIMD_SFX(npyv_loadable_stride)(is2) &&
+        TO_SIMD_SFX(npyv_storable_stride)(os1)
     ) {
         TO_SIMD_SFX(simd_binary_@intrin@)(
             (STYPE*)ip1, is1/sizeof(STYPE),

numpy/_core/src/umath/loops_trigonometric.dispatch.cpp

Lines changed: 24 additions & 22 deletions

@@ -214,21 +214,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_sin)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
 #if NPY_SIMD_FMA3
-    const npy_float *src = (npy_float*)args[0];
-    npy_float *dst = (npy_float*)args[1];
-
-    const int lsize = sizeof(src[0]);
-    const npy_intp ssrc = steps[0] / lsize;
-    const npy_intp sdst = steps[1] / lsize;
     npy_intp len = dimensions[0];
-    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
-    if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
-        !npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)
+
+    if (is_mem_overlap(args[0], steps[0], args[1], steps[1], len) ||
+        !npyv_loadable_stride_f32(steps[0]) ||
+        !npyv_storable_stride_f32(steps[1])
     ) {
-        for (; len > 0; --len, src += ssrc, dst += sdst) {
-            simd_sincos_f32(src, 1, dst, 1, 1, SIMD_COMPUTE_SIN);
+        UNARY_LOOP {
+            simd_sincos_f32(
+                (npy_float *)ip1, 1, (npy_float *)op1, 1, 1, SIMD_COMPUTE_SIN);
         }
     } else {
+        const npy_float *src = (npy_float*)args[0];
+        npy_float *dst = (npy_float*)args[1];
+        const npy_intp ssrc = steps[0] / sizeof(npy_float);
+        const npy_intp sdst = steps[1] / sizeof(npy_float);
+
         simd_sincos_f32(src, ssrc, dst, sdst, len, SIMD_COMPUTE_SIN);
     }
 #else
@@ -243,21 +244,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_cos)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
 #if NPY_SIMD_FMA3
-    const npy_float *src = (npy_float*)args[0];
-    npy_float *dst = (npy_float*)args[1];
-
-    const int lsize = sizeof(src[0]);
-    const npy_intp ssrc = steps[0] / lsize;
-    const npy_intp sdst = steps[1] / lsize;
     npy_intp len = dimensions[0];
-    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
-    if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
-        !npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)
+
+    if (is_mem_overlap(args[0], steps[0], args[1], steps[1], len) ||
+        !npyv_loadable_stride_f32(steps[0]) ||
+        !npyv_storable_stride_f32(steps[1])
     ) {
-        for (; len > 0; --len, src += ssrc, dst += sdst) {
-            simd_sincos_f32(src, 1, dst, 1, 1, SIMD_COMPUTE_COS);
+        UNARY_LOOP {
+            simd_sincos_f32(
+                (npy_float *)ip1, 1, (npy_float *)op1, 1, 1, SIMD_COMPUTE_COS);
         }
     } else {
+        const npy_float *src = (npy_float*)args[0];
+        npy_float *dst = (npy_float*)args[1];
+        const npy_intp ssrc = steps[0] / sizeof(npy_float);
+        const npy_intp sdst = steps[1] / sizeof(npy_float);
+
         simd_sincos_f32(src, ssrc, dst, sdst, len, SIMD_COMPUTE_COS);
     }
 #else
