@@ -160,13 +160,12 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_@func@)
160
160
#if NPY_SIMD && defined(NPY_CAN_LINK_SVML)
161
161
const npy_half *src = (npy_half*)args[0];
162
162
npy_half *dst = (npy_half*)args[1];
163
- const int lsize = sizeof(src[0]);
164
- const npy_intp ssrc = steps[0] / lsize;
165
- const npy_intp sdst = steps[1] / lsize;
163
+
166
164
const npy_intp len = dimensions[0];
165
+
167
166
if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
168
- (ssrc == 1 ) &&
169
- (sdst == 1 )) {
167
+ (steps[0] == sizeof(npy_half) ) &&
168
+ (steps[1] == sizeof(npy_half) )) {
170
169
#if defined(NPY_HAVE_AVX512_SPR)
171
170
__svml_@intrin@s32(src, dst, len);
172
171
return;
@@ -199,14 +198,15 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
199
198
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
200
199
{
201
200
#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
201
+ const @type@ *src = (@type@*)args[0];
202
+ @type@ *dst = (@type@*)args[1];
203
+
202
204
const npy_intp len = dimensions[0];
203
205
204
206
if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
205
207
npyv_loadable_stride_@sfx@(steps[0]) &&
206
208
npyv_storable_stride_@sfx@(steps[1]))
207
209
{
208
- const @type@ *src = (@type@*)args[0];
209
- @type@ *dst = (@type@*)args[1];
210
210
const npy_intp ssrc = steps[0] / sizeof(@type@);
211
211
const npy_intp sdst = steps[1] / sizeof(@type@);
212
212
simd_@intrin@_@sfx@(src, ssrc, dst, sdst, len);
@@ -289,7 +289,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
289
289
@type@ *dst = (@type@*)args[2];
290
290
291
291
const npy_intp len = dimensions[0];
292
- assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
292
+
293
293
if (!is_mem_overlap(src1, steps[0], dst, steps[2], len) &&
294
294
!is_mem_overlap(src2, steps[1], dst, steps[2], len) &&
295
295
npyv_loadable_stride_@sfx@(steps[0]) &&
0 commit comments