@@ -131,20 +131,44 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
131
131
__mmask16 gh = _mm512_kunpackb ((__mmask16 )h , (__mmask16 )g );
132
132
return npyv_pack_b8_b32 (ab , cd , ef , gh );
133
133
}
134
-
134
+ /*
135
+ * Workaround for a compiler bug in Intel Compiler Classic.
136
+ * The bug manifests specifically when the
137
+ * scalar result of _cvtmask64_u64 is compared against the constant -1. Only
138
+ * equality (==) and inequality (!=) comparisons trigger it; such comparisons
139
+ * are typically used in reduction operations like
140
+ * np.logical_or.
141
+ *
142
+ * The underlying issue arises from the compiler's optimizer. When the last
143
+ * vector comparison instruction operates on zmm, the optimizer erroneously
144
+ * emits a duplicate of this instruction but on the lower half register ymm. It
145
+ * then performs a bitwise XOR operation between the mask produced by this
146
+ * duplicated instruction and the mask from the original comparison instruction.
147
+ * This erroneous behavior leads to incorrect results.
148
+ *
149
+ * See https://github.com/numpy/numpy/issues/26197#issuecomment-2056750975
150
+ */
151
+ #ifdef __INTEL_COMPILER
152
+ #define NPYV__VOLATILE_CVTMASK64 volatile
153
+ #else
154
+ #define NPYV__VOLATILE_CVTMASK64
155
+ #endif
135
156
// convert boolean vectors to integer bitfield
136
- NPY_FINLINE npy_uint64 npyv_tobits_b8 (npyv_b8 a )
137
- {
157
+ NPY_FINLINE npy_uint64 npyv_tobits_b8 (npyv_b8 a ) {
138
158
#ifdef NPY_HAVE_AVX512BW_MASK
139
- return (npy_uint64 )_cvtmask64_u64 (a );
159
+ npy_uint64 NPYV__VOLATILE_CVTMASK64 t = (npy_uint64 )_cvtmask64_u64 (a );
160
+ return t ;
140
161
#elif defined(NPY_HAVE_AVX512BW )
141
- return (npy_uint64 )a ;
162
+ npy_uint64 NPYV__VOLATILE_CVTMASK64 t = (npy_uint64 )a ;
163
+ return t ;
142
164
#else
143
165
int mask_lo = _mm256_movemask_epi8 (npyv512_lower_si256 (a ));
144
166
int mask_hi = _mm256_movemask_epi8 (npyv512_higher_si256 (a ));
145
167
return (unsigned )mask_lo | ((npy_uint64 )(unsigned )mask_hi << 32 );
146
168
#endif
147
169
}
170
+ #undef NPYV__VOLATILE_CVTMASK64
171
+
148
172
NPY_FINLINE npy_uint64 npyv_tobits_b16 (npyv_b16 a )
149
173
{
150
174
#ifdef NPY_HAVE_AVX512BW_MASK
0 commit comments