Commit 1082661

Merge pull request numpy#27027 from seberg/sane-simd-steps

BUG: Fix simd loadable stride logic

2 parents 8351d3f + e27f419, commit 1082661

12 files changed: +173 -135 lines changed

numpy/_core/src/common/simd/simd.h

Lines changed: 30 additions & 9 deletions

@@ -1,5 +1,7 @@
 #ifndef _NPY_SIMD_H_
 #define _NPY_SIMD_H_
+
+#include <stdalign.h> /* for alignof until C23 */
 /**
  * the NumPy C SIMD vectorization interface "NPYV" are types and functions intended
  * to simplify vectorization of code on different platforms, currently supports
@@ -123,18 +125,19 @@ typedef double npyv_lanetype_f64;
  * acceptable limit of strides before using any of non-contiguous load/store intrinsics.
  *
  * For instance:
- *    npy_intp ld_stride = step[0] / sizeof(float);
- *    npy_intp st_stride = step[1] / sizeof(float);
  *
- *    if (npyv_loadable_stride_f32(ld_stride) && npyv_storable_stride_f32(st_stride)) {
+ *    if (npyv_loadable_stride_f32(steps[0]) && npyv_storable_stride_f32(steps[1])) {
+ *        // Strides are now guaranteed to be a multiple and compatible
+ *        npy_intp ld_stride = steps[0] / sizeof(float);
+ *        npy_intp st_stride = steps[1] / sizeof(float);
 *        for (;;)
 *            npyv_f32 a = npyv_loadn_f32(ld_pointer, ld_stride);
 *            // ...
 *            npyv_storen_f32(st_pointer, st_stride, a);
 *    }
 *    else {
 *        for (;;)
- *            // C scalars
+ *            // C scalars, use byte steps/strides.
 *    }
 */
 #ifndef NPY_SIMD_MAXLOAD_STRIDE32
@@ -149,11 +152,29 @@ typedef double npyv_lanetype_f64;
 #ifndef NPY_SIMD_MAXSTORE_STRIDE64
 #define NPY_SIMD_MAXSTORE_STRIDE64 0
 #endif
-#define NPYV_IMPL_MAXSTRIDE(SFX, MAXLOAD, MAXSTORE) \
-    NPY_FINLINE int npyv_loadable_stride_##SFX(npy_intp stride) \
-    { return MAXLOAD > 0 ? llabs(stride) <= MAXLOAD : 1; } \
-    NPY_FINLINE int npyv_storable_stride_##SFX(npy_intp stride) \
-    { return MAXSTORE > 0 ? llabs(stride) <= MAXSTORE : 1; }
+#define NPYV_IMPL_MAXSTRIDE(SFX, MAXLOAD, MAXSTORE) \
+    NPY_FINLINE int \
+    npyv_loadable_stride_##SFX(npy_intp stride) \
+    { \
+        if (alignof(npyv_lanetype_##SFX) != sizeof(npyv_lanetype_##SFX) && \
+                stride % sizeof(npyv_lanetype_##SFX) != 0) { \
+            /* stride not a multiple of itemsize, cannot handle. */ \
+            return 0; \
+        } \
+        stride = stride / sizeof(npyv_lanetype_##SFX); \
+        return MAXLOAD > 0 ? llabs(stride) <= MAXLOAD : 1; \
+    } \
+    NPY_FINLINE int \
+    npyv_storable_stride_##SFX(npy_intp stride) \
+    { \
+        if (alignof(npyv_lanetype_##SFX) != sizeof(npyv_lanetype_##SFX) && \
+                stride % sizeof(npyv_lanetype_##SFX) != 0) { \
+            /* stride not a multiple of itemsize, cannot handle. */ \
+            return 0; \
+        } \
+        stride = stride / sizeof(npyv_lanetype_##SFX); \
+        return MAXSTORE > 0 ? llabs(stride) <= MAXSTORE : 1; \
+    }
 #if NPY_SIMD
     NPYV_IMPL_MAXSTRIDE(u32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32)
    NPYV_IMPL_MAXSTRIDE(s32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32)
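
The heart of the fix is visible in the NPYV_IMPL_MAXSTRIDE rewrite above: npyv_loadable_stride_*/npyv_storable_stride_* now receive the raw byte stride, reject anything that is not a multiple of the itemsize, and only then divide down to an element stride. Previously every call site divided first, and C integer division truncates silently, so a byte step that was not a multiple of the itemsize produced a plausible-looking but wrong element stride instead of a rejection. A minimal standalone sketch of that hazard (plain C, no NumPy headers; the values are illustrative only):

    #include <stdio.h>

    int main(void)
    {
        /* a byte step that is not a multiple of the 8-byte double itemsize,
         * as can occur on platforms where alignof(double) < sizeof(double) */
        long long byte_step = 12;
        long long elem_stride = byte_step / (long long)sizeof(double);

        /* prints: byte step 12 -> element stride 1 (remainder 4 silently lost) */
        printf("byte step %lld -> element stride %lld (remainder %lld silently lost)\n",
               byte_step, elem_stride, byte_step % (long long)sizeof(double));
        /* old call sites fed a stride like this to npyv_loadn_* and gathered
         * from the wrong addresses; the new checkers take byte_step itself
         * and simply return 0 here */
        return 0;
    }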

numpy/_core/src/umath/loops_arithm_fp.dispatch.c.src

Lines changed: 18 additions & 21 deletions

@@ -346,14 +346,17 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
         && __apple_build_version__ < 14030000
     goto loop_scalar;
 #endif // end affected Apple clang.
+
     if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) ||
         is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) ||
-        b_sdst % sizeof(@ftype@) != 0 || b_sdst == 0 ||
-        b_ssrc0 % sizeof(@ftype@) != 0 ||
-        b_ssrc1 % sizeof(@ftype@) != 0
+        !npyv_loadable_stride_@sfx@(b_ssrc0) ||
+        !npyv_loadable_stride_@sfx@(b_ssrc1) ||
+        !npyv_storable_stride_@sfx@(b_sdst) ||
+        b_sdst == 0
     ) {
         goto loop_scalar;
     }
+
     const @ftype@ *src0 = (@ftype@*)b_src0;
     const @ftype@ *src1 = (@ftype@*)b_src1;
     @ftype@ *dst = (@ftype@*)b_dst;
@@ -366,10 +369,6 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
     const int wstep = vstep * 2;
     const int hstep = vstep / 2;
 
-    const int loadable0 = npyv_loadable_stride_s64(ssrc0);
-    const int loadable1 = npyv_loadable_stride_s64(ssrc1);
-    const int storable = npyv_storable_stride_s64(sdst);
-
     // lots**lots of specializations, to squeeze out max performance
     // contig
     if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) {
@@ -414,7 +413,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
             }
         }
         // non-contig
-        else if (loadable1 && storable) {
+        else {
             for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) {
                 npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1);
                 npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1);
@@ -433,9 +432,6 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
                 npyv_storen2_till_@sfx@(dst, sdst, len, r);
             }
         }
-        else {
-            goto loop_scalar;
-        }
     }
     // scalar 1
     else if (ssrc1 == 0) {
@@ -460,7 +456,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
             }
         }
         // non-contig
-        else if (loadable0 && storable) {
+        else {
             for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) {
                 npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0);
                 npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0);
@@ -479,13 +475,10 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
                 npyv_storen2_till_@sfx@(dst, sdst, len, r);
             }
         }
-        else {
-            goto loop_scalar;
-        }
     }
 #if @is_mul@
     // non-contig
-    else if (loadable0 && loadable1 && storable) {
+    else {
         for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep,
                              src1 += ssrc1*vstep, dst += sdst*vstep
         ) {
@@ -512,12 +505,16 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
             npyv_storen2_till_@sfx@(dst, sdst, len, r);
         }
     }
-#endif
+#else /* @is_mul@ */
     else {
+        // Only multiply is vectorized for the generic non-contig case.
        goto loop_scalar;
     }
+#endif /* @is_mul@ */
+
     npyv_cleanup();
     return;
+
 loop_scalar:
 #endif
     for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) {
@@ -580,8 +577,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
     npy_intp b_ssrc = steps[0], b_sdst = steps[1];
 #if @VECTOR@
     if (is_mem_overlap(b_src, b_ssrc, b_dst, b_sdst, len) ||
-        b_sdst % sizeof(@ftype@) != 0 ||
-        b_ssrc % sizeof(@ftype@) != 0
+        !npyv_loadable_stride_@sfx@(b_ssrc) ||
+        !npyv_storable_stride_@sfx@(b_sdst)
     ) {
         goto loop_scalar;
     }
@@ -609,7 +606,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
             npyv_store2_till_@sfx@(dst, len, r);
         }
     }
-    else if (ssrc == 2 && npyv_storable_stride_s64(sdst)) {
+    else if (ssrc == 2) {
         for (; len >= vstep; len -= vstep, src += wstep, dst += sdst*vstep) {
             npyv_@sfx@ a0 = npyv_load_@sfx@(src);
             npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep);
@@ -624,7 +621,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
             npyv_storen2_till_@sfx@(dst, sdst, len, r);
        }
     }
-    else if (sdst == 2 && npyv_loadable_stride_s64(ssrc)) {
+    else if (sdst == 2) {
         for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += wstep) {
             npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src, ssrc);
            npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src + ssrc*hstep, ssrc);
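
Because the checkers now accept byte strides, this loop validates everything once in the gate at the top, and the per-branch loadable0/loadable1/storable guards with their goto loop_scalar fallbacks become dead weight and are deleted. A rough standalone sketch of the control-flow change, using hypothetical stand-ins (stride_ok, simd_contig, simd_strided, scalar_loop) rather than the real template expansion:

    #include <stdio.h>
    #include <stdlib.h>

    /* hypothetical stand-in for npyv_loadable/storable_stride_*: multiple of
     * itemsize and within an arbitrary gather/scatter limit */
    static int stride_ok(long long byte_stride)
    {
        return byte_stride % (long long)sizeof(float) == 0 && llabs(byte_stride) <= 64;
    }
    static void scalar_loop(long long len)  { printf("scalar path, len=%lld\n", len); }
    static void simd_contig(long long len)  { printf("contig SIMD, len=%lld\n", len); }
    static void simd_strided(long long len) { printf("strided SIMD, len=%lld\n", len); }

    static void binary_loop(long long s0, long long s1, long long sd, long long len)
    {
        /* single up-front gate: every branch below may assume usable strides */
        if (!stride_ok(s0) || !stride_ok(s1) || !stride_ok(sd)) {
            scalar_loop(len);   /* was: re-checked per non-contig branch */
            return;
        }
        if (s0 == (long long)sizeof(float) && s1 == s0 && sd == s0) {
            simd_contig(len);
        }
        else {
            simd_strided(len);  /* no longer needs its own goto loop_scalar */
        }
    }

    int main(void)
    {
        binary_loop(4, 4, 4, 100);  /* contiguous */
        binary_loop(8, 4, 4, 100);  /* strided but usable */
        binary_loop(6, 4, 4, 100);  /* byte step 6 rejected up front */
        return 0;
    }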

numpy/_core/src/umath/loops_exponent_log.dispatch.c.src

Lines changed: 9 additions & 9 deletions

@@ -1315,16 +1315,16 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
 #if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
-    const npy_double *src = (npy_double*)args[0];
-    npy_double *dst = (npy_double*)args[1];
-    const int lsize = sizeof(src[0]);
-    const npy_intp ssrc = steps[0] / lsize;
-    const npy_intp sdst = steps[1] / lsize;
     const npy_intp len = dimensions[0];
-    assert(steps[0] % lsize == 0 && steps[1] % lsize == 0);
-    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
-        npyv_loadable_stride_f64(ssrc) &&
-        npyv_storable_stride_f64(sdst)) {
+
+    if (!is_mem_overlap(args[0], steps[0], args[1], steps[1], len) &&
+            npyv_loadable_stride_f64(steps[0]) &&
+            npyv_storable_stride_f64(steps[1])) {
+        const npy_double *src = (npy_double*)args[0];
+        npy_double *dst = (npy_double*)args[1];
+        const npy_intp ssrc = steps[0] / sizeof(src[0]);
+        const npy_intp sdst = steps[1] / sizeof(src[0]);
+
         simd_@func@_f64(src, ssrc, dst, sdst, len);
         return;
     }
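
Worth noting in this hunk: the deleted assert(steps[0] % lsize == 0 && steps[1] % lsize == 0) was the only guard against non-multiple steps, and asserts compile away in release builds (-DNDEBUG), so the division ran unconditionally in practice. The new code derives src/dst/ssrc/sdst only inside the branch where the byte steps have already passed the checkers. A tiny illustration of why the assert alone was no protection (hypothetical values):

    /* build with -DNDEBUG, as release builds are, and the assert vanishes */
    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        long long step = 12;                            /* not a multiple of 8 */
        assert(step % (long long)sizeof(double) == 0);  /* no-op under NDEBUG */
        long long stride = step / (long long)sizeof(double);
        printf("stride = %lld\n", stride);              /* prints 1: silently wrong */
        return 0;
    }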

numpy/_core/src/umath/loops_hyperbolic.dispatch.c.src

Lines changed: 15 additions & 16 deletions

@@ -9,6 +9,8 @@
 #include "simd/simd.h"
 #include "loops_utils.h"
 #include "loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
 
 #if NPY_SIMD_FMA3 // native support
 /*
@@ -608,32 +610,29 @@ simd_tanh_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, npy_in
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
-    const @type@ *src = (@type@*)args[0];
-    @type@ *dst = (@type@*)args[1];
-
-    const int lsize = sizeof(src[0]);
-    const npy_intp ssrc = steps[0] / lsize;
-    const npy_intp sdst = steps[1] / lsize;
-    npy_intp len = dimensions[0];
-    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
 #if @simd@
-    if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
-        !npyv_loadable_stride_@sfx@(ssrc) || !npyv_storable_stride_@sfx@(sdst)
+    npy_intp len = dimensions[0];
+
+    if (is_mem_overlap(args[0], steps[0], args[1], steps[1], len) ||
+        !npyv_loadable_stride_@sfx@(steps[0]) ||
+        !npyv_storable_stride_@sfx@(steps[1])
     ) {
-        for (; len > 0; --len, src += ssrc, dst += sdst) {
-            simd_@func@_@sfx@(src, 1, dst, 1, 1);
+        UNARY_LOOP {
+            simd_@func@_@sfx@((@type@ *)ip1, 1, (@type@ *)op1, 1, 1);
         }
     } else {
-        simd_@func@_@sfx@(src, ssrc, dst, sdst, len);
+        npy_intp ssrc = steps[0] / sizeof(@type@);
+        npy_intp sdst = steps[1] / sizeof(@type@);
+        simd_@func@_@sfx@((@type@ *)args[0], ssrc, (@type@ *)args[1], sdst, len);
     }
     npyv_cleanup();
 #if @simd_req_clear@
     npy_clear_floatstatus_barrier((char*)dimensions);
 #endif
 #else
-    for (; len > 0; --len, src += ssrc, dst += sdst) {
-        const @type@ src0 = *src;
-        *dst = npy_@func@@ssfx@(src0);
+    UNARY_LOOP {
+        const @type@ in1 = *(@type@ *)ip1;
+        *(@type@ *)op1 = npy_@func@@ssfx@(in1);
     }
 #endif
 }
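
The scalar fallbacks above now go through UNARY_LOOP from the newly included fast_loop_macros.h, which advances char * pointers by the raw byte steps, so it handles exactly the strides the SIMD checkers reject. A self-contained sketch of the pattern; the macro body here is paraphrased, so treat the exact definition as an assumption:

    #include <stdio.h>

    typedef long long npy_intp;  /* stand-in for the real typedef */

    /* roughly the shape of UNARY_LOOP in fast_loop_macros.h:
     * a byte-step pointer walk over input ip1 and output op1 */
    #define UNARY_LOOP \
        char *ip1 = args[0], *op1 = args[1]; \
        npy_intp is1 = steps[0], os1 = steps[1]; \
        npy_intp n = dimensions[0]; \
        npy_intp i; \
        for (i = 0; i < n; i++, ip1 += is1, op1 += os1)

    /* hypothetical scalar loop in the style of the fallbacks above */
    static void tanh_like_loop(char **args, npy_intp const *dimensions, npy_intp const *steps)
    {
        UNARY_LOOP {
            const double in1 = *(double *)ip1;
            *(double *)op1 = in1;  /* identity stands in for npy_tanh */
        }
    }

    int main(void)
    {
        double in[3] = {0.5, 1.5, 2.5}, out[3] = {0};
        char *args[2] = {(char *)in, (char *)out};
        npy_intp dims[1] = {3};
        npy_intp steps[2] = {sizeof(double), sizeof(double)};
        tanh_like_loop(args, dims, steps);
        printf("%g %g %g\n", out[0], out[1], out[2]);
        return 0;
    }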

numpy/_core/src/umath/loops_minmax.dispatch.c.src

Lines changed: 3 additions & 3 deletions

@@ -352,9 +352,9 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
     }
     // unroll scalars faster than non-contiguous vector load/store on Arm
 #if !defined(NPY_HAVE_NEON) && @is_fp@
-    if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
-        TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
-        TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+    if (TO_SIMD_SFX(npyv_loadable_stride)(is1) &&
+        TO_SIMD_SFX(npyv_loadable_stride)(is2) &&
+        TO_SIMD_SFX(npyv_storable_stride)(os1)
     ) {
         TO_SIMD_SFX(simd_binary_@intrin@)(
             (STYPE*)ip1, is1/sizeof(STYPE),

numpy/_core/src/umath/loops_trigonometric.dispatch.cpp

Lines changed: 24 additions & 22 deletions

@@ -214,21 +214,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_sin)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
 #if NPY_SIMD_FMA3
-    const npy_float *src = (npy_float*)args[0];
-    npy_float *dst = (npy_float*)args[1];
-
-    const int lsize = sizeof(src[0]);
-    const npy_intp ssrc = steps[0] / lsize;
-    const npy_intp sdst = steps[1] / lsize;
     npy_intp len = dimensions[0];
-    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
-    if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
-        !npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)
+
+    if (is_mem_overlap(args[0], steps[0], args[1], steps[1], len) ||
+        !npyv_loadable_stride_f32(steps[0]) ||
+        !npyv_storable_stride_f32(steps[1])
     ) {
-        for (; len > 0; --len, src += ssrc, dst += sdst) {
-            simd_sincos_f32(src, 1, dst, 1, 1, SIMD_COMPUTE_SIN);
+        UNARY_LOOP {
+            simd_sincos_f32(
+                (npy_float *)ip1, 1, (npy_float *)op1, 1, 1, SIMD_COMPUTE_SIN);
         }
     } else {
+        const npy_float *src = (npy_float*)args[0];
+        npy_float *dst = (npy_float*)args[1];
+        const npy_intp ssrc = steps[0] / sizeof(npy_float);
+        const npy_intp sdst = steps[1] / sizeof(npy_float);
+
         simd_sincos_f32(src, ssrc, dst, sdst, len, SIMD_COMPUTE_SIN);
     }
 #else
@@ -243,21 +244,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_cos)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
 #if NPY_SIMD_FMA3
-    const npy_float *src = (npy_float*)args[0];
-    npy_float *dst = (npy_float*)args[1];
-
-    const int lsize = sizeof(src[0]);
-    const npy_intp ssrc = steps[0] / lsize;
-    const npy_intp sdst = steps[1] / lsize;
     npy_intp len = dimensions[0];
-    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
-    if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
-        !npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)
+
+    if (is_mem_overlap(args[0], steps[0], args[1], steps[1], len) ||
+        !npyv_loadable_stride_f32(steps[0]) ||
+        !npyv_storable_stride_f32(steps[1])
     ) {
-        for (; len > 0; --len, src += ssrc, dst += sdst) {
-            simd_sincos_f32(src, 1, dst, 1, 1, SIMD_COMPUTE_COS);
+        UNARY_LOOP {
+            simd_sincos_f32(
+                (npy_float *)ip1, 1, (npy_float *)op1, 1, 1, SIMD_COMPUTE_COS);
         }
     } else {
+        const npy_float *src = (npy_float*)args[0];
+        npy_float *dst = (npy_float*)args[1];
+        const npy_intp ssrc = steps[0] / sizeof(npy_float);
+        const npy_intp sdst = steps[1] / sizeof(npy_float);
+
         simd_sincos_f32(src, ssrc, dst, sdst, len, SIMD_COMPUTE_COS);
     }
 #else
