@@ -129,7 +129,8 @@ LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
                                              8, 9, 10, 11, 12, 13, 14, 15));
 }
 LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
-  return static_cast<uint16_t>(_mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
+  return static_cast<uint16_t>(
+      _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
 }
 template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
   const auto a = load<__m128i>(p1, offset);
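
For reference, a minimal scalar model of what the 16-byte `big_endian_cmp_mask` computes (illustrative only, not the library code; it assumes plain byte pointers instead of the `CPtr`/`load` helpers): the byte at the lowest address controls the most significant bit of the mask, which is what lets callers compare the masks as integers in memcmp order.

#include <cstdint>

// Illustrative scalar reference: bit (15 - i) is set when byte i of 'max'
// equals byte i of 'value', so earlier memory bytes map to higher bits.
static inline uint16_t big_endian_cmp_mask_ref(const unsigned char *max,
                                               const unsigned char *value) {
  uint16_t mask = 0;
  for (int i = 0; i < 16; ++i)
    mask |= static_cast<uint16_t>(max[i] == value[i]) << (15 - i);
  return mask;
}
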
@@ -180,15 +181,41 @@ template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
 LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
   return _mm256_max_epu8(a, b);
 }
-LIBC_INLINE __m256i bytewise_reverse(__m256i value) {
-  return _mm256_shuffle_epi8(value,
-                             _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
-                                             8, 9, 10, 11, 12, 13, 14, 15,   //
-                                             16, 17, 18, 19, 20, 21, 22, 23, //
-                                             24, 25, 26, 27, 28, 29, 30, 31));
-}
 LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
-  return _mm256_movemask_epi8(bytewise_reverse(_mm256_cmpeq_epi8(max, value)));
+  // Bytewise comparison of 'max' and 'value'.
+  const __m256i little_endian_byte_mask = _mm256_cmpeq_epi8(max, value);
+  // Because x86 is little endian, bytes in the vector must be reversed before
+  // using movemask.
+#if defined(__AVX512VBMI__) && defined(__AVX512VL__)
+  // When AVX512VBMI is available we can completely reverse the vector through
+  // VPERMB __m256i _mm256_permutexvar_epi8(__m256i idx, __m256i a);
+  const __m256i big_endian_byte_mask =
+      _mm256_permutexvar_epi8(_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
+                                              8, 9, 10, 11, 12, 13, 14, 15,   //
+                                              16, 17, 18, 19, 20, 21, 22, 23, //
+                                              24, 25, 26, 27, 28, 29, 30, 31),
+                              little_endian_byte_mask);
+  // And turn the byte vector mask into an 'uint32_t' for direct scalar
+  // comparison.
+  return _mm256_movemask_epi8(big_endian_byte_mask);
+#else
+  // We can't byte-reverse '__m256i' in a single instruction with AVX2.
+  // '_mm256_shuffle_epi8' can only shuffle within each 16-byte lane,
+  // leading to:
+  // ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+  //           31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
+  // So we first shuffle each 16-byte lane, leading to a half-reversed vector mask.
+  const __m256i half_reversed = _mm256_shuffle_epi8(
+      little_endian_byte_mask, _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,       //
+                                               8, 9, 10, 11, 12, 13, 14, 15, //
+                                               0, 1, 2, 3, 4, 5, 6, 7,       //
+                                               8, 9, 10, 11, 12, 13, 14, 15));
+  // Then we turn the vector into a 'uint32_t'.
+  const uint32_t half_reversed_scalar = _mm256_movemask_epi8(half_reversed);
+  // And swap the lower and upper parts. This is optimized into a single `rorx`
+  // instruction.
+  return (half_reversed_scalar << 16) | (half_reversed_scalar >> 16);
+#endif
 }
 template <>
 LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
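
The final `(half_reversed_scalar << 16) | (half_reversed_scalar >> 16)` above is simply a 32-bit rotate by 16, which swaps the two 16-bit lane masks; a tiny standalone sketch (C++20 for `std::rotl`, helper name hypothetical):

#include <bit>
#include <cstdint>

// Swapping the two 16-bit lane masks of the movemask result is a rotate by
// 16 bits; with BMI2, compilers typically lower this to a single 'rorx'.
static inline uint32_t swap_lane_masks(uint32_t half_reversed_scalar) {
  return (half_reversed_scalar << 16) | (half_reversed_scalar >> 16);
}

// The shift/or form is the standard rotate.
static_assert(std::rotl(0x1234ABCDu, 16) == 0xABCD1234u);
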
@@ -198,7 +225,7 @@ LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
   const auto le = big_endian_cmp_mask(vmax, b);
   const auto ge = big_endian_cmp_mask(vmax, a);
   static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
-  return cmp_uint32_t(ge, le);
+  return cmp_neq_uint64_t(ge, le);
 }
 #endif // __AVX2__
 
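
The switch to `cmp_neq_uint64_t` works because of how the masks are built: `ge` and `le` agree on every equal byte, and at the first differing byte (the highest differing bit, since earlier memory bytes map to higher bits) exactly one of them has the bit set, so an unsigned comparison of the two masks orders the blocks the same way memcmp orders the bytes. A hedged sketch of that last step (not the library's `cmp_neq_uint64_t`):

#include <cstdint>

// Hedged sketch: given the two big-endian masks of blocks known to differ,
// the larger mask belongs to the block whose first differing byte is larger.
static inline int cmp_from_big_endian_masks(uint64_t ge, uint64_t le) {
  return ge > le ? 1 : -1;
}
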
@@ -210,19 +237,48 @@ template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
 LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
   return _mm512_max_epu8(a, b);
 }
-LIBC_INLINE __m512i bytewise_reverse(__m512i value) {
-  return _mm512_shuffle_epi8(value,
-                             _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
-                                             8, 9, 10, 11, 12, 13, 14, 15,   //
-                                             16, 17, 18, 19, 20, 21, 22, 23, //
-                                             24, 25, 26, 27, 28, 29, 30, 31, //
-                                             32, 33, 34, 35, 36, 37, 38, 39, //
-                                             40, 41, 42, 43, 44, 45, 46, 47, //
-                                             48, 49, 50, 51, 52, 53, 54, 55, //
-                                             56, 57, 58, 59, 60, 61, 62, 63));
-}
 LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
-  return _mm512_cmpeq_epi8_mask(bytewise_reverse(max), bytewise_reverse(value));
+  // The AVX512VBMI version is disabled due to bad codegen.
+  // https://github.com/llvm/llvm-project/issues/77459
+  // https://github.com/llvm/llvm-project/pull/77081
+  // TODO: Re-enable once the minimum clang version includes the fix.
+#if false && defined(__AVX512VBMI__)
+  // When AVX512VBMI is available we can completely reverse the vector through
+  // VPERMB __m512i _mm512_permutexvar_epi8(__m512i idx, __m512i a);
+  const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,             //
+                                       8, 9, 10, 11, 12, 13, 14, 15,       //
+                                       16, 17, 18, 19, 20, 21, 22, 23,     //
+                                       24, 25, 26, 27, 28, 29, 30, 31,     //
+                                       32, 33, 34, 35, 36, 37, 38, 39,     //
+                                       40, 41, 42, 43, 44, 45, 46, 47,     //
+                                       48, 49, 50, 51, 52, 53, 54, 55,     //
+                                       56, 57, 58, 59, 60, 61, 62, 63);
+  // Then we compute the mask for equal bytes.
+  return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max),   //
+                                _mm512_permutexvar_epi8(indices, value));
+#else
+  // We can't byte-reverse '__m512i' in a single instruction with __AVX512BW__.
+  // '_mm512_shuffle_epi8' can only shuffle within each 16-byte lane.
+  // So we only reverse groups of 8 bytes; these groups necessarily lie within
+  // a 16-byte lane.
+  // zmm = |  16 bytes   |  16 bytes   |  16 bytes   |  16 bytes   |
+  // zmm = | <8> | <8> | <8> | <8> | <8> | <8> | <8> | <8> |
+  const __m512i indices = _mm512_set_epi8(56, 57, 58, 59, 60, 61, 62, 63, //
+                                          48, 49, 50, 51, 52, 53, 54, 55, //
+                                          40, 41, 42, 43, 44, 45, 46, 47, //
+                                          32, 33, 34, 35, 36, 37, 38, 39, //
+                                          24, 25, 26, 27, 28, 29, 30, 31, //
+                                          16, 17, 18, 19, 20, 21, 22, 23, //
+                                          8, 9, 10, 11, 12, 13, 14, 15,   //
+                                          0, 1, 2, 3, 4, 5, 6, 7);
+  // Then we compute the mask for equal bytes. In this mask the bits of each
+  // byte are already reversed but the bytes themselves should be reversed;
+  // this is done with a bswap instruction.
+  return __builtin_bswap64(
+      _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(max, indices),   //
+                             _mm512_shuffle_epi8(value, indices)));
+
+#endif
 }
 template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
   const auto a = load<__m512i>(p1, offset);
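
The `#else` branch above leans on a scalar identity: the per-group shuffle leaves the 64-bit comparison mask with the bits inside each byte reversed, and byte-swapping such a value then reverses all 64 bits. A small self-contained illustration of that identity (independent of the vector code, names illustrative):

#include <cstdint>

// SWAR reversal of the bits within each byte of a 64-bit value; this plays
// the role of the per-8-byte-group shuffle applied to the comparison mask.
static inline uint64_t reverse_bits_in_each_byte(uint64_t x) {
  x = ((x & 0x5555555555555555ULL) << 1) | ((x >> 1) & 0x5555555555555555ULL);
  x = ((x & 0x3333333333333333ULL) << 2) | ((x >> 2) & 0x3333333333333333ULL);
  x = ((x & 0x0F0F0F0F0F0F0F0FULL) << 4) | ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL);
  return x;
}

// Reversing bits within bytes and then swapping the bytes reverses all 64
// bits, mirroring the shuffle-then-bswap combination in big_endian_cmp_mask.
static inline uint64_t full_bit_reverse(uint64_t x) {
  return __builtin_bswap64(reverse_bits_in_each_byte(x));
}
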