Skip to content

Commit 08e8d38

Browse files
committed
Optimize batch_bool load in avx512f
1 parent 16825ef commit 08e8d38

File tree

2 files changed

+31
-8
lines changed

2 files changed

+31
-8
lines changed

include/xsimd/arch/xsimd_avx512bw.hpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -320,11 +320,15 @@ namespace xsimd
320320
XSIMD_INLINE batch_bool<T, A> load(bool const* mem, batch_bool<T, A>, requires_arch<avx512bw>) noexcept
321321
{
322322
using register_type = typename batch_bool<T, A>::register_type;
323-
constexpr auto size = batch_bool<T, A>::size;
324-
__mmask64 mask = size >= 64 ? ~(__mmask64)0 : (1ULL << size) - 1;
325-
__m512i zeros = _mm512_setzero_si512();
326-
__m512i bool_val = _mm512_mask_loadu_epi8(zeros, mask, (void*)mem);
327-
return (register_type)_mm512_cmpgt_epu8_mask(bool_val, zeros);
323+
XSIMD_IF_CONSTEXPR(batch_bool<T, A>::size == 64)
324+
{
325+
__m512i bool_val = _mm512_loadu_si512((__m512i const*)mem);
326+
return (register_type)_mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512());
327+
}
328+
else
329+
{
330+
return load(mem, batch_bool<T, A>(), avx512dq {});
331+
}
328332
}
329333

330334
// max

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1193,16 +1193,35 @@ namespace xsimd
11931193
return detail::compare_int_avx512f<A, T, _MM_CMPINT_LE>(self, other);
11941194
}
11951195

1196+
namespace detail
{
    // Pack eight byte-sized booleans into a single byte: bit i of the
    // result is set when unpacked[i] is non-zero.
    //
    // The eight bytes are copied into one 64-bit word, so the bit order
    // relies on a little-endian layout (byte i occupies bits
    // [8 * i, 8 * i + 7] of the word). `unpacked` must point to at least
    // 8 readable bytes; the memory is only read.
    XSIMD_INLINE unsigned char tobitset(unsigned char const* unpacked)
    {
        uint64_t data;
        memcpy(&data, unpacked, sizeof(uint64_t));

        // Collapse every byte to exactly 0 or 1 first. The multiplication
        // below is only carry-free when each byte holds a single low bit;
        // a byte such as 0xFF would otherwise leak into neighbouring bits.
        // (b & 0x7F) + 0x7F never exceeds 0xFE, so this per-byte non-zero
        // test cannot carry from one byte into the next.
        const uint64_t low7 = 0x7F7F7F7F7F7F7F7FULL;
        const uint64_t msb = 0x8080808080808080ULL;
        const uint64_t flags = ((((data & low7) + low7) | data) & msb) >> 7;

        // Multiplier with bits 7, 14, ..., 56 set: the partial product of
        // byte i (bit 8 * i) with multiplier bit 56 - 7 * i lands on bit
        // 56 + i, and every other partial product lands on a distinct bit
        // outside [56, 63], so the top byte holds the packed result.
        const uint64_t magic = (1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28)
            | (1ULL << 35) | (1ULL << 42) | (1ULL << 49) | (1ULL << 56);

        return static_cast<unsigned char>((flags * magic) >> 56);
    }
}
1210+
11961211
// load
11971212
template <class A, class T>
11981213
XSIMD_INLINE batch_bool<T, A> load(bool const* mem, batch_bool<T, A>, requires_arch<avx512f>) noexcept
11991214
{
12001215
using register_type = typename batch_bool<T, A>::register_type;
12011216
constexpr auto size = batch_bool<T, A>::size;
1217+
constexpr auto iter = size / 8;
1218+
assert((size % 8 == 0) && "ucorrectly size of bool batch");
12021219
register_type mask = 0;
1203-
for (std::size_t i = 0; i < size; ++i)
1204-
mask |= (register_type(mem[i] ? 1 : 0) << i);
1205-
1220+
for (std::size_t i = 0; i < iter; ++i)
1221+
{
1222+
unsigned char block = detail::tobitset((unsigned char*)mem + i * 8);
1223+
mask |= (register_type(block) << (i * 8));
1224+
}
12061225
return mask;
12071226
}
12081227

0 commit comments

Comments
 (0)