@@ -316,18 +316,64 @@ namespace xsimd
316316 }
317317
318318 // load
319- template <class A , class T , class = typename std::enable_if<batch_bool<T, A >::size == 64 , void >::type>
319+ template <class A , class T , class = typename std::enable_if<std::is_integral<T >::value , void >::type>
320320 XSIMD_INLINE batch_bool<T, A> load_unaligned (bool const * mem, batch_bool<T, A>, requires_arch<avx512bw>) noexcept
321321 {
322- __m512i bool_val = _mm512_loadu_si512 ((__m512i const *)mem);
323- return _mm512_cmpgt_epu8_mask (bool_val, _mm512_setzero_si512 ());
322+ XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 64 )
323+ {
324+ __m512i bool_val = _mm512_loadu_si512 ((__m512i const *)mem);
325+ return _mm512_cmpgt_epu8_mask (bool_val, _mm512_setzero_si512 ());
326+ }
327+ else XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 32 )
328+ {
329+ __m256i bpack = _mm256_loadu_si256 ((__m256i const *)mem);
330+ return _mm512_cmpgt_epu16_mask (_mm512_cvtepu8_epi16 (bpack), _mm512_setzero_si512 ());
331+ }
332+ else XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 16 )
333+ {
334+ __m128i bpack = _mm_loadu_si128 ((__m128i const *)mem);
335+ return _mm512_cmpgt_epu32 (_mm512_cvtepu8_epi32 (bpack), _mm512_setzero_si512 ());
336+ }
337+ else XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 8 )
338+ {
339+ __m128i bpack = _mm_loadl_epi64 ((__m128i const *)mem);
340+ return _mm512_cmpgt_epu64 (_mm512_cvtepu8_epi64 (bpack), _mm512_setzero_si512 ());
341+ }
342+ else
343+ {
344+ assert (false && " unexpected batch size" );
345+ return {};
346+ }
324347 }
325348
326- template <class A , class T , class = typename std::enable_if<batch_bool<T, A >::size == 64 , void >::type>
349+ template <class A , class T , class = typename std::enable_if<std::is_integral<T >::value , void >::type>
327350 XSIMD_INLINE batch_bool<T, A> load_aligned (bool const * mem, batch_bool<T, A>, requires_arch<avx512bw>) noexcept
328351 {
329- __m512i bool_val = _mm512_load_si512 ((__m512i const *)mem);
330- return _mm512_cmpgt_epu8_mask (bool_val, _mm512_setzero_si512 ());
352+ XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 64 )
353+ {
354+ __m512i bool_val = _mm512_load_si512 ((__m512i const *)mem);
355+ return _mm512_cmpgt_epu8_mask (bool_val, _mm512_setzero_si512 ());
356+ }
357+ else XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 32 )
358+ {
359+ __m256i bpack = _mm256_load_si256 ((__m256i const *)mem);
360+ return _mm512_cmpgt_epu16_mask (_mm512_cvtepu8_epi16 (bpack), _mm512_setzero_si512 ());
361+ }
362+ else XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 16 )
363+ {
364+ __m128i bpack = _mm_load_si128 ((__m128i const *)mem);
365+ return _mm512_cmpgt_epu32 (_mm512_cvtepu8_epi32 (bpack), _mm512_setzero_si512 ());
366+ }
367+ else XSIMD_IF_CONSTEXPR (batch_bool<T, A>::size == 8 )
368+ {
369+ __m128i bpack = _mm_loadl_epi64 ((__m128i const *)mem);
370+ return _mm512_cmpgt_epu64 (_mm512_cvtepu8_epi64 (bpack), _mm512_setzero_si512 ());
371+ }
372+ else
373+ {
374+ assert (false && " unexpected batch size" );
375+ return {};
376+ }
331377 }
332378
333379 // max
0 commit comments