Skip to content

Commit c3ce003

Browse files
authored
Merge pull request numpy#26281 from seiko2plus/issue_26197
BUG: Workaround for Intel Compiler mask conversion bug
2 parents 370f1e5 + 38bf07f commit c3ce003

File tree

1 file changed

+29
-5
lines changed

1 file changed

+29
-5
lines changed

numpy/_core/src/common/simd/avx512/conversion.h

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -131,20 +131,44 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
131131
__mmask16 gh = _mm512_kunpackb((__mmask16)h, (__mmask16)g);
132132
return npyv_pack_b8_b32(ab, cd, ef, gh);
133133
}
134-
134+
/*
135+
* Workaround for a compiler bug in Intel Compiler Classic.
136+
* The bug manifests specifically when the
137+
* scalar result of _cvtmask64_u64 is compared against the constant -1. Only
138+
* equality (==) and inequality (!=) comparisons trigger it; such checks are
139+
* typically used in reduction operations like
140+
* np.logical_or.
141+
*
142+
* The underlying issue arises from the compiler's optimizer. When the last
143+
* vector comparison instruction operates on zmm, the optimizer erroneously
144+
* emits a duplicate of this instruction but on the lower half register ymm. It
145+
* then performs a bitwise XOR operation between the mask produced by this
146+
* duplicated instruction and the mask from the original comparison instruction.
147+
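* This erroneous behavior leads to incorrect results. As a workaround, the
* converted mask is stored in a volatile temporary when building with the
* Intel compiler (see NPYV__VOLATILE_CVTMASK64 below).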
148+
*
149+
* See https://github.com/numpy/numpy/issues/26197#issuecomment-2056750975
150+
*/
151+
#ifdef __INTEL_COMPILER
152+
#define NPYV__VOLATILE_CVTMASK64 volatile
153+
#else
154+
#define NPYV__VOLATILE_CVTMASK64
155+
#endif
135156
// convert boolean vectors to integer bitfield
136-
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
137-
{
157+
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a) {
138158
#ifdef NPY_HAVE_AVX512BW_MASK
139-
return (npy_uint64)_cvtmask64_u64(a);
159+
npy_uint64 NPYV__VOLATILE_CVTMASK64 t = (npy_uint64)_cvtmask64_u64(a);
160+
return t;
140161
#elif defined(NPY_HAVE_AVX512BW)
141-
return (npy_uint64)a;
162+
npy_uint64 NPYV__VOLATILE_CVTMASK64 t = (npy_uint64)a;
163+
return t;
142164
#else
143165
int mask_lo = _mm256_movemask_epi8(npyv512_lower_si256(a));
144166
int mask_hi = _mm256_movemask_epi8(npyv512_higher_si256(a));
145167
return (unsigned)mask_lo | ((npy_uint64)(unsigned)mask_hi << 32);
146168
#endif
147169
}
170+
#undef NPYV__VOLATILE_CVTMASK64
171+
148172
NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
149173
{
150174
#ifdef NPY_HAVE_AVX512BW_MASK

0 commit comments

Comments
 (0)