Skip to content

Commit ac12cbc

Browse files
Specialize avx512bw bool loader for aligned access
1 parent abd9a8a commit ac12cbc

File tree

4 files changed

+23
-18
lines changed

4 files changed

+23
-18
lines changed

include/xsimd/arch/generic/xsimd_generic_memory.hpp

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -262,7 +262,7 @@ namespace xsimd
262262

263263
// load
264264
template <class A, class T>
265-
XSIMD_INLINE batch_bool<T, A> load(bool const* mem, batch_bool<T, A>, requires_arch<generic>) noexcept
265+
XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<generic>) noexcept
266266
{
267267
using batch_type = batch<T, A>;
268268
batch_type ref(0);
@@ -273,6 +273,12 @@ namespace xsimd
273273
return ref != batch_type::load_aligned(&buffer[0]);
274274
}
275275

276+
// Generic fallback for loading a batch_bool from an array of bool through the
// aligned entry point. This overload has no alignment-specific code path: it
// simply forwards to load_unaligned, re-dispatching on the architecture tag A
// so that an architecture-specific load_unaligned overload can be selected.
//   mem - pointer to the bool values to load
//   b   - batch_bool value forwarded unchanged to load_unaligned
template <class A, class T>
277+
XSIMD_INLINE batch_bool<T, A> load_aligned(bool const* mem, batch_bool<T, A> b, requires_arch<generic>) noexcept
278+
{
279+
// Delegate with a fresh arch tag A {} rather than the generic tag received here.
return load_unaligned(mem, b, A {});
280+
}
281+
276282
// load_aligned
277283
namespace detail
278284
{

include/xsimd/arch/xsimd_avx512bw.hpp

Lines changed: 11 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -316,19 +316,18 @@ namespace xsimd
316316
}
317317

318318
// load
319-
template <class A, class T>
320-
XSIMD_INLINE batch_bool<T, A> load(bool const* mem, batch_bool<T, A>, requires_arch<avx512bw>) noexcept
319+
template <class A, class T, class = typename std::enable_if<batch_bool<T, A>::size == 64, void>::type>
320+
XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx512bw>) noexcept
321321
{
322-
using register_type = typename batch_bool<T, A>::register_type;
323-
XSIMD_IF_CONSTEXPR(batch_bool<T, A>::size == 64)
324-
{
325-
__m512i bool_val = _mm512_loadu_si512((__m512i const*)mem);
326-
return (register_type)_mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512());
327-
}
328-
else
329-
{
330-
return load(mem, batch_bool<T, A>(), avx512dq {});
331-
}
322+
__m512i bool_val = _mm512_loadu_si512((__m512i const*)mem);
323+
return _mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512());
324+
}
325+
326+
// AVX512BW aligned load of a batch_bool from an array of bool.
// The enable_if constraint keeps this overload out of overload resolution
// unless the batch_bool has exactly 64 lanes; other sizes must be served by a
// different load_aligned overload.
//   mem - pointer to the bool values; read as one 512-bit block
//         (assumes sizeof(bool) == 1, i.e. one byte per lane - TODO confirm)
template <class A, class T, class = typename std::enable_if<batch_bool<T, A>::size == 64, void>::type>
327+
XSIMD_INLINE batch_bool<T, A> load_aligned(bool const* mem, batch_bool<T, A>, requires_arch<avx512bw>) noexcept
328+
{
329+
// Aligned 512-bit load: per the Intel intrinsic's contract, mem must be 64-byte aligned.
__m512i bool_val = _mm512_load_si512((__m512i const*)mem);
330+
// Unsigned byte-wise "> 0" compare: every non-zero byte sets the matching mask bit.
return _mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512());
332331
}
333332

334333
// max

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1209,14 +1209,14 @@ namespace xsimd
12091209
}
12101210
}
12111211

1212-
// load
1212+
// load mask
12131213
template <class A, class T>
1214-
XSIMD_INLINE batch_bool<T, A> load(bool const* mem, batch_bool<T, A>, requires_arch<avx512f>) noexcept
1214+
XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx512f>) noexcept
12151215
{
12161216
using register_type = typename batch_bool<T, A>::register_type;
12171217
constexpr auto size = batch_bool<T, A>::size;
12181218
constexpr auto iter = size / 8;
1219-
static_assert(size % 8 == 0, "incorrect size of bool batch");
1219+
static_assert((size % 8) == 0, "incorrect size of bool batch");
12201220
register_type mask = 0;
12211221
for (std::size_t i = 0; i < iter; ++i)
12221222
{

include/xsimd/types/xsimd_batch.hpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -968,13 +968,13 @@ namespace xsimd
968968
template <class T, class A>
969969
XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::load_aligned(bool const* mem) noexcept
970970
{
971-
return kernel::load<A>(mem, batch_bool<T, A>(), A {});
971+
return kernel::load_aligned<A>(mem, batch_bool<T, A>(), A {});
972972
}
973973

974974
template <class T, class A>
975975
XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::load_unaligned(bool const* mem) noexcept
976976
{
977-
return load_aligned(mem);
977+
return kernel::load_unaligned<A>(mem, batch_bool<T, A>(), A {});
978978
}
979979

980980
/**

0 commit comments

Comments
 (0)