@@ -730,24 +730,31 @@ namespace xsimd
// Stores the boolean batch `b` to `mem` on NEON; this overload is selected
// through detail::enable_sized_t<T, 1>.
// NOTE(review): this span is a diff hunk. The `733-` line is the removed form
// (0 - b.data) and the `733+` line is its replacement (b.data >> 7).
// Presumably every byte of b.data is either 0x00 or 0xFF, in which case both
// forms produce 0 or 1 per byte -- confirm against batch_bool's representation.
730730 template <class T , class A , detail::enable_sized_t <T, 1 > = 0 >
731731 XSIMD_INLINE void store (batch_bool<T, A> b, bool * mem, requires_arch<neon>) noexcept
732732 {
733- uint8x16_t val = vsubq_u8 ( vdupq_n_u8 ( 0 ), b.data );
// Shift each byte right by 7 so that only its top bit remains.
733+ uint8x16_t val = vshrq_n_u8 ( b.data , 7 );
// Write all 16 bytes of `val` to `mem`, viewed as uint8_t*.
734734 vst1q_u8 ((uint8_t *)mem, val);
735735 }
736736
// Stores the boolean batch `b` to `mem` on NEON; this overload is selected
// through detail::enable_sized_t<T, 2>.
// NOTE(review): this span is a diff hunk. The `740-` line is the removed form
// (0 - narrowed) and the `740+` line is its replacement (narrowed >> 7).
// Presumably every 16-bit lane of b.data is all-zeros or all-ones, so the
// narrowed bytes are 0x00 or 0xFF and both forms yield 0 or 1 -- confirm
// against batch_bool's representation.
737737 template <class T , class A , detail::enable_sized_t <T, 2 > = 0 >
738738 XSIMD_INLINE void store (batch_bool<T, A> b, bool * mem, requires_arch<neon>) noexcept
739739 {
740- uint8x8_t val = vsub_u8 ( vdup_n_u8 ( 0 ), vqmovn_u16 (b.data ));
// vqmovn_u16 narrows the 16-bit lanes to 8 bits (saturating, per the Arm
// intrinsics reference); the shift by 7 then keeps only the top bit.
740+ uint8x8_t val = vshr_n_u8 ( vqmovn_u16 (b.data ), 7 );
// Write the 8 resulting bytes to `mem`, viewed as uint8_t*.
741741 vst1_u8 ((uint8_t *)mem, val);
742742 }
743743
// Stores the boolean batch `b` to `mem` on NEON; this overload is selected
// through detail::enable_sized_t<T, 4>.
// NOTE(review): this span is a diff hunk. The `747-` line is the removed form
// (0 - narrowed) and the `747+` line is its replacement (narrowed >> 7).
// Presumably every 32-bit lane of b.data is all-zeros or all-ones, so both
// forms yield 0 or 1 per byte -- confirm against batch_bool's representation.
744744 template <class T , class A , detail::enable_sized_t <T, 4 > = 0 >
745745 XSIMD_INLINE void store (batch_bool<T, A> b, bool * mem, requires_arch<neon>) noexcept
746746 {
747- uint8x8_t val = vsub_u8 ( vdup_n_u8 ( 0 ), vqmovn_u16 (vcombine_u16 (vqmovn_u32 (b.data ), vdup_n_u16 (0 ))));
// Narrow 32 -> 16 bits, pad the upper half with zeros via vdup_n_u16(0) so the
// vector can be narrowed again 16 -> 8 bits, then shift each byte right by 7.
747+ uint8x8_t val = vshr_n_u8 ( vqmovn_u16 (vcombine_u16 (vqmovn_u32 (b.data ), vdup_n_u16 (0 ))), 7 );
// Only lane 0 of `val` viewed as 32-bit lanes is stored, i.e. its first 4 bytes;
// the zero padding is not written.
748748 vst1_lane_u32 ((uint32_t *)mem, vreinterpret_u32_u8 (val), 0 );
749749 }
750750
// Stores the boolean batch `b` to `mem` on NEON; this overload is selected
// through detail::enable_sized_t<T, 8>.
// NOTE(review): every line of this span carries a `+` marker, i.e. the whole
// overload is added by the diff this text was taken from.
// Presumably every 64-bit lane of b.data is all-zeros or all-ones, so each
// stored byte ends up 0 or 1 -- confirm against batch_bool's representation.
751+ template <class T , class A , detail::enable_sized_t <T, 8 > = 0 >
752+ XSIMD_INLINE void store (batch_bool<T, A> b, bool * mem, requires_arch<neon>) noexcept
753+ {
// Narrow 64 -> 32 -> 16 -> 8 bits, padding the upper half with zeros
// (vdup_n_u32(0), vdup_n_u16(0)) before each further narrowing step, then
// shift each byte right by 7 to keep only its top bit.
754+ uint8x8_t val = vshr_n_u8 (vqmovn_u16 (vcombine_u16 (vqmovn_u32 (vcombine_u32 (vqmovn_u64 (b.data ), vdup_n_u32 (0 ))), vdup_n_u16 (0 ))), 7 );
// Only lane 0 of `val` viewed as 16-bit lanes is stored, i.e. its first 2 bytes;
// the zero padding is not written.
755+ vst1_lane_u16 ((uint16_t *)mem, vreinterpret_u16_u8 (val), 0 );
756+ }
757+
751758 template <class A >
752759 XSIMD_INLINE void store (batch_bool<float , A> b, bool * mem, requires_arch<neon>) noexcept
753760 {
0 commit comments