Skip to content

Commit a304547

Browse files
committed
Optimize batch_bool load in avx512f
1 parent 16825ef commit a304547

File tree

2 files changed

+32
-8
lines changed

2 files changed

+32
-8
lines changed

include/xsimd/arch/xsimd_avx512bw.hpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -320,11 +320,15 @@ namespace xsimd
320320
// Diff hunk for the avx512bw overload of load(): builds a batch_bool mask from an
// array of bool. The hunk begins at the signature; the `template` header that
// declares T and A is presumably on the preceding line, which is not shown.
// Lines following an `NNN-` marker are the old implementation, lines following an
// `NNN+` marker are the new one, and lines following a doubled number are unchanged.
XSIMD_INLINE batch_bool<T, A> load(bool const* mem, batch_bool<T, A>, requires_arch<avx512bw>) noexcept
321321
{
322322
using register_type = typename batch_bool<T, A>::register_type;
323-
constexpr auto size = batch_bool<T, A>::size; // old: number of lanes in the batch
324-
__mmask64 mask = size >= 64 ? ~(__mmask64)0 : (1ULL << size) - 1; // old: low `size` bits set (all 64 when size >= 64, avoiding a 64-bit shift by 64)
325-
__m512i zeros = _mm512_setzero_si512();
326-
__m512i bool_val = _mm512_mask_loadu_epi8(zeros, mask, (void*)mem); // old: masked byte load, with `zeros` as the fallback source for unselected lanes
327-
return (register_type)_mm512_cmpgt_epu8_mask(bool_val, zeros); // old: mask bit set for every byte that is > 0 (unsigned compare)
323+
XSIMD_IF_CONSTEXPR(batch_bool<T, A>::size == 64) // new: only the 64-lane case is handled here
324+
{
325+
__m512i bool_val = _mm512_loadu_si512((__m512i const*)mem); // new: plain unaligned 64-byte load, no load mask needed
326+
return (register_type)_mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512()); // mask bit i set when byte i is non-zero
327+
}
328+
else
329+
{
330+
return load(mem, batch_bool<T, A>(), avx512dq {}); // new: other sizes are delegated to the overload selected by the avx512dq tag (not visible in this hunk)
331+
}
328332
}
329333

330334
// max

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1193,16 +1193,36 @@ namespace xsimd
11931193
return detail::compare_int_avx512f<A, T, _MM_CMPINT_LE>(self, other);
11941194
}
11951195

1196+
namespace detail
{
    // Adapted from https://github.com/serge-sans-paille/fast-bitset-from-bool-array
    //
    // Packs eight booleans, stored one per byte, into a single byte:
    // bit i of the result holds the value of unpacked[i].
    //
    // Preconditions:
    //  - `unpacked` points to at least 8 readable bytes (no alignment required);
    //  - every byte is exactly 0 or 1. Any other value leaks into neighbouring
    //    result bits, because the multiplication below is not a per-byte test;
    //  - the target is little-endian, so that unpacked[0] ends up in the least
    //    significant byte of the 64-bit word.
    //
    // The input is only read, hence the pointer-to-const parameter: callers that
    // hold a `bool const*` do not need to cast constness away.
    XSIMD_INLINE unsigned char tobitset(unsigned char const unpacked[8])
    {
        // memcpy instead of a pointer cast: no alignment requirement and no
        // strict-aliasing violation.
        uint64_t data;
        memcpy(&data, unpacked, sizeof(uint64_t));

        // Bits 7, 14, 21, 28, 35, 42, 49 and 56 are set, i.e. bit (56 - 7 * j) for
        // j in 0..7. Input byte i contributes a single bit at position 8 * i, so
        // the product has one term at position 8 * i + 56 - 7 * j for each (i, j):
        //  - i == j lands on bit 56 + i, which is the wanted result bit;
        //  - i <  j lands below bit 56, at pairwise distinct positions (no carry);
        //  - i >  j lands at bit 64 or above and is discarded by the 64-bit wrap.
        const uint64_t magic = 0x0102040810204080ULL;

        // After shifting by 56 only the eight result bits remain.
        return static_cast<unsigned char>((data * magic) >> 56);
    }
}
1211+
11961212
// load
11971213
// Diff hunk for the avx512f overload of load(): builds the batch_bool bitmask
// from `batch_bool<T, A>::size` consecutive bool values starting at `mem`;
// bit i of the returned mask corresponds to mem[i].
// Lines following an `NNNN-` marker are the old implementation, lines following an
// `NNNN+` marker are the new one, and lines following a doubled number are unchanged.
template <class A, class T>
11981214
XSIMD_INLINE batch_bool<T, A> load(bool const* mem, batch_bool<T, A>, requires_arch<avx512f>) noexcept
11991215
{
12001216
using register_type = typename batch_bool<T, A>::register_type;
12011217
constexpr auto size = batch_bool<T, A>::size;
1218+
constexpr auto iter = size / 8; // new: number of 8-bool groups to pack
1219+
static_assert(size % 8 == 0, "incorrect size of bool batch"); // new: the grouped loop below only covers whole groups of 8
12021220
register_type mask = 0;
1203-
for (std::size_t i = 0; i < size; ++i) // old: one iteration per bool
1204-
mask |= (register_type(mem[i] ? 1 : 0) << i); // old: set bit i when mem[i] is true
1205-
1221+
for (std::size_t i = 0; i < iter; ++i) // new: one iteration per group of 8 bools
1222+
{
1223+
unsigned char block = detail::tobitset((unsigned char*)mem + i * 8); // pack bools [8*i, 8*i + 8) into one byte; the cast drops const because tobitset takes a non-const pointer
1224+
mask |= (register_type(block) << (i * 8)); // place that byte at bits [8*i, 8*i + 8) of the mask
1225+
}
12061226
return mask;
12071227
}
12081228

0 commit comments

Comments
 (0)