Commit 28173db

Merge pull request numpy#20219 from seiko2plus/simd_msvc_broadcast_64bit
BUG, SIMD: Workaround broadcasting SIMD 64-bit integers on MSVC 32-bit
2 parents 383ef2e + 4971fef commit 28173db
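
In short: when MSVC targets 32-bit x86, the 64-bit set/broadcast intrinsics (_mm_set1_epi64x, _mm256_set1_epi64x, _mm512_set1_epi64 and their set/setr variants) can be miscompiled, so this commit routes every 64-bit broadcast and lane-set through npyv_* helpers that, on that target, assemble each 64-bit lane from two 32-bit halves. A minimal standalone sketch of the idea follows; it is not code from the commit, set1_epi64_via_epi32 is an illustrative name, and it assumes SSE2 plus the usual x86 behaviour (little-endian lanes, arithmetic right shift of signed values).

/*
 * Minimal sketch, not part of the commit: broadcast a 64-bit value
 * without any 64-bit set intrinsic by splitting it into 32-bit halves.
 */
#include <emmintrin.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>

static __m128i set1_epi64_via_epi32(int64_t a) /* illustrative name */
{
    /* lane 0 = low half, lane 1 = high half (little-endian lane layout) */
    return _mm_setr_epi32((int)a, (int)(a >> 32), (int)a, (int)(a >> 32));
}

int main(void)
{
    int64_t expect = 0x0123456789abcdefLL, got[2];
    __m128i v = set1_epi64_via_epi32(expect);
    memcpy(got, &v, sizeof got);
    assert(got[0] == expect && got[1] == expect);
    return 0;
}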

File tree

7 files changed: +116 -25 lines

numpy/core/src/common/simd/avx2/memory.h

Lines changed: 12 additions & 12 deletions
@@ -87,7 +87,7 @@ NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
 #if 0 // slower
 NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
 {
-    const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
+    const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
     return _mm256_i64gather_epi64((const void*)ptr, idx, 8);
 }
 NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
@@ -170,9 +170,9 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
 NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
 {
     assert(nlane > 0);
-    const __m256i vfill = _mm256_set1_epi64x(fill);
-    const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
-    __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+    const __m256i vfill = npyv_setall_s64(fill);
+    const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+    __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
     __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
     return _mm256_blendv_epi8(vfill, payload, mask);
@@ -181,8 +181,8 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
 NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
 {
     assert(nlane > 0);
-    const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
-    __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+    const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+    __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
     return _mm256_maskload_epi64((const void*)ptr, mask);
 }
@@ -211,10 +211,10 @@ NPY_FINLINE npyv_s64
 npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
 {
     assert(nlane > 0);
-    const __m256i vfill = _mm256_set1_epi64x(fill);
-    const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
-    const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
-    __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+    const __m256i vfill = npyv_setall_s64(fill);
+    const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
+    const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+    __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
     return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8);
 }
@@ -238,8 +238,8 @@ NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a
 NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
 {
     assert(nlane > 0);
-    const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
-    __m256i vnlane = _mm256_set1_epi64x(nlane > 8 ? 8 : (int)nlane);
+    const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+    __m256i vnlane = npyv_setall_s64(nlane > 8 ? 8 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
     _mm256_maskstore_epi64((void*)ptr, mask, a);
 }

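The partial load/store helpers changed above (npyv_load_till_s64, npyv_loadn_till_s64, npyv_store_till_s64) all build a per-lane mask by broadcasting nlane and comparing it against a step vector, which is exactly where the unsafe 64-bit set1/setr intrinsics used to appear. A standalone sketch of that masking pattern with plain AVX2 intrinsics is below; the function name is illustrative, it is not the commit's code, and on MSVC 32-bit the two set calls would instead go through the 32-bit-halves helpers shown later in this diff.

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Load up to 4 int64 lanes from ptr; lanes past nlane are set to fill.
 * Mirrors the pattern used by npyv_load_till_s64 in the diff above. */
static __m256i load_till_4xi64(const int64_t *ptr, size_t nlane, int64_t fill)
{
    __m256i vfill  = _mm256_set1_epi64x(fill);
    __m256i steps  = _mm256_setr_epi64x(0, 1, 2, 3);
    __m256i vnlane = _mm256_set1_epi64x((int64_t)(nlane > 4 ? 4 : nlane));
    __m256i mask   = _mm256_cmpgt_epi64(vnlane, steps);   /* lane i < nlane */
    __m256i data   = _mm256_maskload_epi64((const long long*)ptr, mask);
    return _mm256_blendv_epi8(vfill, data, mask);          /* fill the tail */
}

Compile with AVX2 enabled (/arch:AVX2 or -mavx2); the masked load also keeps the tail lanes from touching memory past the end of the buffer.
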
numpy/core/src/common/simd/avx2/misc.h

Lines changed: 25 additions & 2 deletions
@@ -24,11 +24,27 @@
 #define npyv_setall_s16(VAL) _mm256_set1_epi16((short)VAL)
 #define npyv_setall_u32(VAL) _mm256_set1_epi32((int)VAL)
 #define npyv_setall_s32(VAL) _mm256_set1_epi32(VAL)
-#define npyv_setall_u64(VAL) _mm256_set1_epi64x(VAL)
-#define npyv_setall_s64(VAL) _mm256_set1_epi64x(VAL)
 #define npyv_setall_f32(VAL) _mm256_set1_ps(VAL)
 #define npyv_setall_f64(VAL) _mm256_set1_pd(VAL)
 
+NPY_FINLINE __m256i npyv__setr_epi64(npy_int64, npy_int64, npy_int64, npy_int64);
+NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
+{
+    npy_int64 ai = (npy_int64)a;
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return npyv__setr_epi64(ai, ai, ai, ai);
+#else
+    return _mm256_set1_epi64x(ai);
+#endif
+}
+NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
+{
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return npyv__setr_epi64(a, a, a, a);
+#else
+    return _mm256_set1_epi64x(a);
+#endif
+}
 /*
  * vector with specific values set to each lane and
  * set a specific value to all remained lanes
@@ -59,7 +75,14 @@ NPY_FINLINE __m256i npyv__setr_epi32(int i0, int i1, int i2, int i3, int i4, int
 }
 NPY_FINLINE __m256i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3)
 {
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return _mm256_setr_epi32(
+        (int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32),
+        (int)i2, (int)(i2 >> 32), (int)i3, (int)(i3 >> 32)
+    );
+#else
     return _mm256_setr_epi64x(i0, i1, i2, i3);
+#endif
 }
 
 NPY_FINLINE __m256 npyv__setr_ps(float i0, float i1, float i2, float i3, float i4, float i5,

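Two details worth noting in this file: npyv__setr_epi64 is forward-declared so that npyv_setall_u64/npyv_setall_s64 can call it before its definition further down the header, and the MSVC 32-bit branch rebuilds each 64-bit lane from two 32-bit halves. That decomposition can be sanity-checked with a small test like the following (illustrative, not from the commit; assumes an AVX2 build on x86, where lanes are little-endian and right shifts of negative values are arithmetic):

#include <immintrin.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>

int main(void)
{
    int64_t v[4] = {-1, 0x0011223344556677LL, 42, -5};
    __m256i a = _mm256_setr_epi64x(v[0], v[1], v[2], v[3]);
    __m256i b = _mm256_setr_epi32(      /* same lanes, built from halves */
        (int)v[0], (int)(v[0] >> 32), (int)v[1], (int)(v[1] >> 32),
        (int)v[2], (int)(v[2] >> 32), (int)v[3], (int)(v[3] >> 32));
    assert(memcmp(&a, &b, sizeof a) == 0);
    return 0;
}
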
numpy/core/src/common/simd/avx512/math.h

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
     return _mm512_range_pd(a, a, 8);
 #else
     return npyv_and_f64(
-        a, _mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffffLL))
+        a, _mm512_castsi512_pd(npyv_setall_s64(0x7fffffffffffffffLL))
     );
 #endif
 }

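Context for the one-line change above: npyv_abs_f64 works by AND-ing the double's bit pattern with 0x7fffffffffffffff, which clears the IEEE-754 sign bit and leaves the magnitude untouched; the constant simply has to be broadcast safely, hence npyv_setall_s64. A scalar analogue of the trick (illustrative sketch, not from the commit):

#include <stdint.h>
#include <string.h>
#include <assert.h>

/* fabs() by clearing the sign bit of the IEEE-754 representation. */
static double abs_via_mask(double x)
{
    uint64_t bits;
    memcpy(&bits, &x, sizeof bits);
    bits &= 0x7fffffffffffffffULL;   /* drop the sign bit */
    memcpy(&x, &bits, sizeof x);
    return x;
}

int main(void)
{
    assert(abs_via_mask(-3.5) == 3.5);
    assert(abs_via_mask(2.0) == 2.0);
    return 0;
}
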
numpy/core/src/common/simd/avx512/memory.h

Lines changed: 6 additions & 6 deletions
@@ -110,7 +110,7 @@ NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
 //// 64
 NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
 {
-    const __m512i idx = _mm512_setr_epi64(
+    const __m512i idx = npyv_set_s64(
         0*stride, 1*stride, 2*stride, 3*stride,
         4*stride, 5*stride, 6*stride, 7*stride
     );
@@ -140,7 +140,7 @@ NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
 //// 64
 NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
 {
-    const __m512i idx = _mm512_setr_epi64(
+    const __m512i idx = npyv_set_s64(
         0*stride, 1*stride, 2*stride, 3*stride,
         4*stride, 5*stride, 6*stride, 7*stride
     );
@@ -173,7 +173,7 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
 NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
 {
     assert(nlane > 0);
-    const __m512i vfill = _mm512_set1_epi64(fill);
+    const __m512i vfill = npyv_setall_s64(fill);
     const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
     return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
 }
@@ -210,11 +210,11 @@ NPY_FINLINE npyv_s64
 npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
 {
     assert(nlane > 0);
-    const __m512i idx = _mm512_setr_epi64(
+    const __m512i idx = npyv_set_s64(
         0*stride, 1*stride, 2*stride, 3*stride,
         4*stride, 5*stride, 6*stride, 7*stride
     );
-    const __m512i vfill = _mm512_set1_epi64(fill);
+    const __m512i vfill = npyv_setall_s64(fill);
     const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
     return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
 }
@@ -258,7 +258,7 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp
 NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
 {
     assert(nlane > 0);
-    const __m512i idx = _mm512_setr_epi64(
+    const __m512i idx = npyv_set_s64(
         0*stride, 1*stride, 2*stride, 3*stride,
         4*stride, 5*stride, 6*stride, 7*stride
     );

numpy/core/src/common/simd/avx512/misc.h

Lines changed: 30 additions & 2 deletions
@@ -24,11 +24,30 @@
 #define npyv_setall_s16(VAL) _mm512_set1_epi16((short)VAL)
 #define npyv_setall_u32(VAL) _mm512_set1_epi32((int)VAL)
 #define npyv_setall_s32(VAL) _mm512_set1_epi32(VAL)
-#define npyv_setall_u64(VAL) _mm512_set1_epi64(VAL)
-#define npyv_setall_s64(VAL) _mm512_set1_epi64(VAL)
 #define npyv_setall_f32(VAL) _mm512_set1_ps(VAL)
 #define npyv_setall_f64(VAL) _mm512_set1_pd(VAL)
 
+NPY_FINLINE __m512i npyv__setr_epi64(
+    npy_int64, npy_int64, npy_int64, npy_int64,
+    npy_int64, npy_int64, npy_int64, npy_int64
+);
+NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
+{
+    npy_int64 ai = (npy_int64)a;
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return npyv__setr_epi64(ai, ai, ai, ai, ai, ai, ai, ai);
+#else
+    return _mm512_set1_epi64(ai);
+#endif
+}
+NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
+{
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return npyv__setr_epi64(a, a, a, a, a, a, a, a);
+#else
+    return _mm512_set1_epi64(a);
+#endif
+}
 /**
  * vector with specific values set to each lane and
  * set a specific value to all remained lanes
@@ -76,7 +95,16 @@ NPY_FINLINE __m512i npyv__setr_epi32(
 NPY_FINLINE __m512i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3,
                                      npy_int64 i4, npy_int64 i5, npy_int64 i6, npy_int64 i7)
 {
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return _mm512_setr_epi32(
+        (int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32),
+        (int)i2, (int)(i2 >> 32), (int)i3, (int)(i3 >> 32),
+        (int)i4, (int)(i4 >> 32), (int)i5, (int)(i5 >> 32),
+        (int)i6, (int)(i6 >> 32), (int)i7, (int)(i7 >> 32)
+    );
+#else
     return _mm512_setr_epi64(i0, i1, i2, i3, i4, i5, i6, i7);
+#endif
 }
 
 NPY_FINLINE __m512 npyv__setr_ps(

numpy/core/src/common/simd/simd.h

Lines changed: 19 additions & 0 deletions
@@ -27,6 +27,25 @@ typedef npy_int64 npyv_lanetype_s64;
 typedef float npyv_lanetype_f32;
 typedef double npyv_lanetype_f64;
 
+#if defined(_MSC_VER) && defined(_M_IX86)
+/*
+ * Avoid using any of the following intrinsics with MSVC 32-bit,
+ * even if they appear to work on newer versions.
+ * They have a bad impact on the generated instructions:
+ * sometimes the compiler handles them without respecting 32-bit mode,
+ * which leads to crashes from executing 64-bit instructions, and other
+ * times it generates badly emulated instructions.
+ */
+#undef _mm512_set1_epi64
+#undef _mm256_set1_epi64x
+#undef _mm_set1_epi64x
+#undef _mm512_setr_epi64x
+#undef _mm256_setr_epi64x
+#undef _mm_setr_epi64x
+#undef _mm512_set_epi64x
+#undef _mm256_set_epi64x
+#undef _mm_set_epi64x
+#endif
 #if defined(NPY_HAVE_AVX512F) && !defined(NPY_SIMD_FORCE_256) && !defined(NPY_SIMD_FORCE_128)
     #include "avx512/avx512.h"
 #elif defined(NPY_HAVE_AVX2) && !defined(NPY_SIMD_FORCE_128)

numpy/core/src/common/simd/sse/misc.h

Lines changed: 23 additions & 2 deletions
@@ -24,11 +24,28 @@
 #define npyv_setall_s16(VAL) _mm_set1_epi16((short)(VAL))
 #define npyv_setall_u32(VAL) _mm_set1_epi32((int)(VAL))
 #define npyv_setall_s32(VAL) _mm_set1_epi32((int)(VAL))
-#define npyv_setall_u64(VAL) _mm_set1_epi64x((npy_int64)(VAL))
-#define npyv_setall_s64(VAL) _mm_set1_epi64x((npy_int64)(VAL))
 #define npyv_setall_f32 _mm_set1_ps
 #define npyv_setall_f64 _mm_set1_pd
 
+NPY_FINLINE __m128i npyv__setr_epi64(npy_int64 i0, npy_int64 i1);
+
+NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
+{
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return npyv__setr_epi64((npy_int64)a, (npy_int64)a);
+#else
+    return _mm_set1_epi64x((npy_int64)a);
+#endif
+}
+NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
+{
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return npyv__setr_epi64(a, a);
+#else
+    return _mm_set1_epi64x((npy_int64)a);
+#endif
+}
+
 /**
  * vector with specific values set to each lane and
  * set a specific value to all remained lanes
@@ -53,7 +70,11 @@ NPY_FINLINE __m128i npyv__setr_epi32(int i0, int i1, int i2, int i3)
 }
 NPY_FINLINE __m128i npyv__setr_epi64(npy_int64 i0, npy_int64 i1)
 {
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return _mm_setr_epi32((int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32));
+#else
     return _mm_set_epi64x(i1, i0);
+#endif
 }
 NPY_FINLINE __m128 npyv__setr_ps(float i0, float i1, float i2, float i3)
 {
