@@ -1718,57 +1718,56 @@ namespace xsimd
         }
 
         // store<batch_bool>
-        namespace detail
+        template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<sse2>) noexcept
         {
-            template <class T>
-            XSIMD_INLINE void store_bool_sse2(__m128i b, bool* mem, T) noexcept
+            // GCC <12 has missing or buggy unaligned store intrinsics; use memcpy to work around this.
+            // GCC/Clang/MSVC will turn it into the correct store.
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
             {
-                // GCC <12 has missing or buggy unaligned store intrinsics; use memcpy to work around this.
-                // GCC/Clang/MSVC will turn it into the correct store.
-                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
-                {
-                    // negate mask to convert to 0 or 1
-                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), b);
-                    memcpy(mem, &val, sizeof(val));
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
-                {
-                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(b, b));
+                // negate mask to convert to 0 or 1
+                auto val = _mm_sub_epi8(_mm_set1_epi8(0), b);
+                memcpy(mem, &val, sizeof(val));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(b, b));
 #if defined(__x86_64__)
-                    auto val_lo = _mm_cvtsi128_si64(val);
-                    memcpy(mem, &val_lo, sizeof(val_lo));
+                auto val_lo = _mm_cvtsi128_si64(val);
+                memcpy(mem, &val_lo, sizeof(val_lo));
 #else
-                    memcpy(mem, &val, sizeof(uint64_t));
+                memcpy(mem, &val, sizeof(uint64_t));
 #endif
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
-                {
-                    auto pack_16 = _mm_packs_epi32(b, b);
-                    uint32_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16)));
-                    memcpy(mem, &val, sizeof(val));
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
-                {
-                    auto pack_32 = _mm_packs_epi32(b, b);
-                    auto pack_16 = _mm_packs_epi32(pack_32, pack_32);
-                    uint16_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16)));
-                    memcpy(mem, &val, sizeof(val));
-                }
-                else
-                {
-                    assert(false && "unsupported arch/op combination");
-                }
             }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                auto pack_16 = _mm_packs_epi32(b, b);
+                uint32_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16)));
+                memcpy(mem, &val, sizeof(val));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                auto pack_32 = _mm_packs_epi32(b, b);
+                auto pack_16 = _mm_packs_epi32(pack_32, pack_32);
+                uint16_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16)));
+                memcpy(mem, &val, sizeof(val));
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+            }
+        }
 
-            XSIMD_INLINE __m128i sse_to_i(__m128 x) { return _mm_castps_si128(x); }
-            XSIMD_INLINE __m128i sse_to_i(__m128d x) { return _mm_castpd_si128(x); }
-            XSIMD_INLINE __m128i sse_to_i(__m128i x) { return x; }
+        template <class A>
+        XSIMD_INLINE void store(batch_bool<float, A> b, bool* mem, requires_arch<sse2>) noexcept
+        {
+            store(batch_bool<uint32_t, A>(b.data), mem, A {});
         }
 
-        template <class T, class A>
-        XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<sse2>) noexcept
+        template <class A>
+        XSIMD_INLINE void store(batch_bool<double, A> b, bool* mem, requires_arch<sse2>) noexcept
         {
-            detail::store_bool_sse2(detail::sse_to_i(b), mem, T {});
+            store(batch_bool<uint64_t, A>(b.data), mem, A {});
         }
 
         // store_aligned
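
Two review notes on the tricks this kernel relies on, with standalone sketches.

First, an SSE2 compare mask holds all-ones bytes (0xFF) in true lanes and all-zeros in false lanes, while `bool` storage wants the bytes 0 and 1. The `sizeof(T) == 1` path converts one into the other with a single subtraction: in two's complement, `0 - 0xFF` wraps to `0x01` in each 8-bit lane. A minimal sketch of that conversion, assuming (as the kernel itself does) a platform where `bool` occupies one byte:

```cpp
#include <emmintrin.h>
#include <cstdio>
#include <cstring>

int main()
{
    // Build an 8-bit mask the way a compare would: 0xFF for true, 0x00 for false.
    __m128i mask = _mm_cmpeq_epi8(_mm_set1_epi8(42),
                                  _mm_setr_epi8(42, 0, 42, 0, 42, 0, 42, 0,
                                                0, 42, 0, 42, 0, 42, 0, 42));
    // Per-byte two's complement: 0 - 0xFF == 0x01 and 0 - 0x00 == 0x00,
    // so every true lane becomes exactly 1 and every false lane stays 0.
    __m128i val = _mm_sub_epi8(_mm_set1_epi8(0), mask);
    bool out[16];
    std::memcpy(out, &val, sizeof(val)); // same memcpy pattern as the kernel
    for (bool b : out)
        std::printf("%d", b); // prints 1010101001010101
    std::printf("\n");
}
```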
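Second, lanes wider than a byte have no direct mask-to-byte narrowing on SSE2, so the kernel funnels them through the signed saturating packs: an all-ones lane reads as -1, and -1 saturates to -1 at every halving of the lane width, so the mask pattern survives `_mm_packs_epi32`/`_mm_packs_epi16` intact before the same subtraction applies. A sketch of the `sizeof(T) == 4` path in isolation, with hypothetical mask values:

```cpp
#include <emmintrin.h>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
    // Four 32-bit mask lanes: true, false, false, true.
    __m128i mask32 = _mm_setr_epi32(-1, 0, 0, -1);
    // Signed saturation maps -1 -> -1 and 0 -> 0 while halving the lane width,
    // so packing the register with itself collapses the 4 lanes into
    // the low 4 bytes of the result.
    __m128i pack_16 = _mm_packs_epi32(mask32, mask32);  // 8 x 16-bit lanes
    __m128i pack_8 = _mm_packs_epi16(pack_16, pack_16); // 16 x 8-bit lanes
    // Negate to 0/1 and keep only the 4 meaningful bytes, as the kernel does.
    uint32_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), pack_8));
    bool out[4];
    std::memcpy(out, &val, sizeof(val));
    std::printf("%d%d%d%d\n", out[0], out[1], out[2], out[3]); // prints 1001
}
```

The `sizeof(T) == 8` path just adds one more `_mm_packs_epi32` round and stores two bytes instead of four.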
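The new `float`/`double` overloads are what let the old `sse_to_i` casting shim go away: a floating-point compare mask is bit-for-bit an all-ones/all-zeros pattern, so rebuilding it as a same-width integer `batch_bool` from the raw `data` register and re-dispatching with `A {}` reuses the integral kernel unchanged. From the public API, the kernel is reached through `batch_bool` stores; a usage sketch, assuming `store_unaligned(bool*)` on `batch_bool` forwards to this kernel on an SSE2 build:

```cpp
#include <xsimd/xsimd.hpp>
#include <cstdio>

int main()
{
    using batch = xsimd::batch<float, xsimd::sse2>;
    batch a = { 1.0f, 2.0f, 3.0f, 4.0f };
    // The comparison produces a batch_bool<float, sse2> mask.
    xsimd::batch_bool<float, xsimd::sse2> m = a > batch(2.5f);
    bool out[batch::size];
    m.store_unaligned(out); // dispatches to store(batch_bool<float, A>, ...)
    std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // 0 0 1 1
}
```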