@@ -574,36 +574,58 @@ namespace xsimd
         // load_unaligned<batch_bool>
         namespace detail {
             template <class T>
-            XSIMD_INLINE __m256i load_bool_avx2(bool const* mem, T) noexcept {
-                XSIMD_IF_CONSTEXPR(sizeof(T) == 1) {
+            XSIMD_INLINE __m256i load_bool_avx2(bool const* mem) noexcept
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
                     auto maskz = _mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i const*)mem), _mm256_set1_epi8(0));
                     return _mm256_xor_si256(maskz, _mm256_set1_epi8(-1));
                 }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) {
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
                     auto bpack = _mm_loadu_si128((__m128i const*)mem);
                     return _mm256_cmpgt_epi16(_mm256_cvtepu8_epi16(bpack), _mm256_set1_epi16(0));
                 }
                 // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.
                 // GCC/Clang/MSVC will turn it into the correct load.
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) {
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
                     uint64_t tmp;
                     memcpy(&tmp, mem, sizeof(tmp));
                     auto bpack = _mm_cvtsi64_si128(tmp);
                     return _mm256_cmpgt_epi32(_mm256_cvtepu8_epi32(bpack), _mm256_set1_epi32(0));
                 }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) {
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
                     uint32_t tmp;
                     memcpy(&tmp, mem, sizeof(tmp));
                     auto bpack = _mm_cvtsi32_si128(tmp);
                     return _mm256_cmpgt_epi64(_mm256_cvtepu8_epi64(bpack), _mm256_set1_epi64x(0));
                 }
+                else
+                {
+                    assert(false && "unsupported arch/op combination");
+                    return __m256i{};
+                }
             }
         }

-        template <class T, class A>
+        template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx2>) noexcept
         {
-            return batch_bool_cast<T, A>(detail::load_bool_avx2(mem, T{}), avx2{});
+            return batch_bool<T, A>(detail::load_bool_avx2<T>(mem));
+        }
+
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> load_unaligned(bool const* mem, batch_bool<float, A>, requires_arch<avx2>) noexcept
+        {
+            return batch_bool<float, A>(_mm256_castsi256_ps(detail::load_bool_avx2<float>(mem)));
+        }
+
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> load_unaligned(bool const* mem, batch_bool<double, A>, requires_arch<avx2>) noexcept
+        {
+            return batch_bool<double, A>(_mm256_castsi256_pd(detail::load_bool_avx2<double>(mem)));
         }

         // mask
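As a usage note (not part of the diff): the new overloads let a batch_bool be populated straight from an array of bool, with the float and double variants reusing the integer kernel through a bitcast of the mask register. A minimal sketch, assuming batch_bool<T, A> exposes a static load_unaligned(bool const*) that dispatches to this kernel; the function name any_flag_set and the explicit avx2 tag are illustrative only:

    #include <xsimd/xsimd.hpp>

    // Expects at least batch_bool<float, xsimd::avx2>::size (8) entries in flags.
    bool any_flag_set(bool const* flags)
    {
        auto mask = xsimd::batch_bool<float, xsimd::avx2>::load_unaligned(flags);
        return xsimd::any(mask); // true if any of the 8 loaded bools was true
    }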
@@ -965,36 +987,44 @@ namespace xsimd
             XSIMD_INLINE void store_bool_avx2(__m256i b, bool* mem, T) noexcept {
                 // GCC <12 have missing or buggy unaligned store intrinsics; use memcpy to work around this.
                 // GCC/Clang/MSVC will turn it into the correct store.
-                XSIMD_IF_CONSTEXPR(sizeof(T) == 1) {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
                     // negate mask to convert to 0 or 1
                     auto val = _mm256_sub_epi8(_mm256_set1_epi8(0), b);
                     memcpy(mem, &val, sizeof(val));
                 }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) {
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
                     auto packed = _mm256_castsi256_si128(_mm256_packs_epi16(b, b));
                     auto val = _mm_sub_epi8(_mm_set1_epi8(0), packed);
                     memcpy(mem, &val, sizeof(val));
                 }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) {
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
                     auto bmask = _mm256_set_epi8(
                         -1, -1, -1, -1, -1, -1, -1, -1,
                         -1, -1, -1, -1, -1, -1, -1, -1,
                         -1, -1, -1, -1, -1, -1, -1, -1,
                         28, 24, 20, 16, 12, 8, 4, 0);
                     auto packed = _mm256_castsi256_si128(_mm256_shuffle_epi8(b, bmask));
-                    auto val = _mm_extract_epi64(_mm_sub_epi8(_mm_set1_epi8(0), packed), 0);
+                    auto val = _mm_cvtsi128_si64(_mm_sub_epi8(_mm_set1_epi8(0), packed));
                     memcpy(mem, &val, sizeof(val));
                 }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) {
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
                     auto bmask = _mm256_set_epi8(
                         -1, -1, -1, -1, -1, -1, -1, -1,
                         -1, -1, -1, -1, -1, -1, -1, -1,
                         -1, -1, -1, -1, -1, -1, -1, -1,
                         -1, -1, -1, -1, 24, 16, 8, 0);
                     auto packed = _mm256_castsi256_si128(_mm256_shuffle_epi8(b, bmask));
-                    uint32_t val = _mm_extract_epi32(_mm_sub_epi8(_mm_set1_epi8(0), packed), 0);
+                    uint32_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), packed));
                     memcpy(mem, &val, sizeof(val));
                 }
+                else
+                {
+                    assert(false && "unsupported arch/op combination");
+                }
             }

             XSIMD_INLINE __m256i avx_to_i(__m256 x) { return _mm256_castps_si256(x); }
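For the store side, a round-trip sketch (illustrative only, not part of the diff): a comparison mask is written back as one 0/1 byte per lane. It assumes batch_bool provides store_unaligned(bool*) on top of this kernel; the function name flag_negative and the loop shape are mine:

    #include <xsimd/xsimd.hpp>
    #include <cstddef>

    void flag_negative(float const* in, bool* out, std::size_t n)
    {
        using batch = xsimd::batch<float, xsimd::avx2>;
        std::size_t i = 0;
        for (; i + batch::size <= n; i += batch::size)
        {
            auto mask = batch::load_unaligned(in + i) < batch(0.0f); // batch_bool<float, avx2>
            mask.store_unaligned(out + i);                           // one 0/1 byte per lane
        }
        for (; i < n; ++i) // scalar tail
            out[i] = in[i] < 0.0f;
    }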