++

serge-sans-paille · serge-sans-paille · commit 619d93bcf59f · 2025-10-12T00:44:17.000+02:00
diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp
@@ -730,24 +730,31 @@ namespace xsimd
         template <class T, class A, detail::enable_sized_t<T, 1> = 0>
         XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<neon>) noexcept
         {
-            uint8x16_t val = vsubq_u8(vdupq_n_u8(0), b.data);
+            uint8x16_t val = vshrq_n_u8(b.data, 7); // logical shift right by 7: each byte keeps only its top bit, so every stored byte is 0 or 1
             vst1q_u8((uint8_t*)mem, val);
         }
 
         template <class T, class A, detail::enable_sized_t<T, 2> = 0>
         XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<neon>) noexcept
         {
-            uint8x8_t val = vsub_u8(vdup_n_u8(0), vqmovn_u16(b.data));
+            uint8x8_t val = vshr_n_u8(vqmovn_u16(b.data), 7); // saturating-narrow the 16-bit lanes to bytes, then keep only each byte's top bit (0 or 1)
             vst1_u8((uint8_t*)mem, val);
         }
 
         template <class T, class A, detail::enable_sized_t<T, 4> = 0>
         XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<neon>) noexcept
         {
-            uint8x8_t val = vsub_u8(vdup_n_u8(0), vqmovn_u16(vcombine_u16(vqmovn_u32(b.data), vdup_n_u16(0))));
+            uint8x8_t val = vshr_n_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(b.data), vdup_n_u16(0))), 7); // saturating-narrow 32 -> 16 -> 8 bits (upper half zero-padded), then keep only each byte's top bit
             vst1_lane_u32((uint32_t*)mem, vreinterpret_u32_u8(val), 0);
         }
 
+        template <class T, class A, detail::enable_sized_t<T, 8> = 0> // presumably enabled only when sizeof(T) == 8 -- confirm against detail::enable_sized_t
+        XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<neon>) noexcept
+        {
+            uint8x8_t val = vshr_n_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(vcombine_u32(vqmovn_u64(b.data), vdup_n_u32(0))), vdup_n_u16(0))), 7); // saturating-narrow 64 -> 32 -> 16 -> 8 bits, zero-padding the upper half at each step, then keep only each byte's top bit
+            vst1_lane_u16((uint16_t*)mem, vreinterpret_u16_u8(val), 0); // stores only 16-bit lane 0, i.e. the first two result bytes
+        }
+
         template <class A>
         XSIMD_INLINE void store(batch_bool<float, A> b, bool* mem, requires_arch<neon>) noexcept
         {
diff --git a/include/xsimd/arch/xsimd_neon64.hpp b/include/xsimd/arch/xsimd_neon64.hpp
@@ -177,6 +177,16 @@ namespace xsimd
             return store_aligned<A>(dst, src, A {});
         }
 
+        /*********************
+         * store<batch_bool> *
+         *********************/
+
+        template <class A>
+        XSIMD_INLINE void store(batch_bool<double, A> b, bool* mem, requires_arch<neon>) noexcept // NOTE(review): tagged requires_arch<neon> although this hunk is in xsimd_neon64.hpp -- confirm this is intended
+        {
+            store(batch_bool<uint64_t, A>(b.data), mem, A {}); // rewraps b.data as a batch_bool<uint64_t, A> and forwards to the store overload chosen for that type and arch tag
+        }
+
         /****************
          * load_complex *
          ****************/

Original file line number	Diff line number	Diff line change
`@@ -730,24 +730,31 @@ namespace xsimd`
`730`	`730`	`template <class T, class A, detail::enable_sized_t<T, 1> = 0>`
`731`	`731`	`XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<neon>) noexcept`
`732`	`732`	`{`
`733`		`- uint8x16_t val = vsubq_u8(vdupq_n_u8(0), b.data);`
	`733`	`+ uint8x16_t val = vshrq_n_u8(b.data, 7);`
`734`	`734`	`vst1q_u8((uint8_t*)mem, val);`
`735`	`735`	`}`
`736`	`736`
`737`	`737`	`template <class T, class A, detail::enable_sized_t<T, 2> = 0>`
`738`	`738`	`XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<neon>) noexcept`
`739`	`739`	`{`
`740`		`- uint8x8_t val = vsub_u8(vdup_n_u8(0), vqmovn_u16(b.data));`
	`740`	`+ uint8x8_t val = vshr_n_u8(vqmovn_u16(b.data), 7);`
`741`	`741`	`vst1_u8((uint8_t*)mem, val);`
`742`	`742`	`}`
`743`	`743`
`744`	`744`	`template <class T, class A, detail::enable_sized_t<T, 4> = 0>`
`745`	`745`	`XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<neon>) noexcept`
`746`	`746`	`{`
`747`		`- uint8x8_t val = vsub_u8(vdup_n_u8(0), vqmovn_u16(vcombine_u16(vqmovn_u32(b.data), vdup_n_u16(0))));`
	`747`	`+ uint8x8_t val = vshr_n_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(b.data), vdup_n_u16(0))), 7);`
`748`	`748`	`vst1_lane_u32((uint32_t*)mem, vreinterpret_u32_u8(val), 0);`
`749`	`749`	`}`
`750`	`750`
	`751`	`+ template <class T, class A, detail::enable_sized_t<T, 8> = 0>`
	`752`	`+ XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<neon>) noexcept`
	`753`	`+ {`
	`754`	`+ uint8x8_t val = vshr_n_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(vcombine_u32(vqmovn_u64(b.data), vdup_n_u32(0))), vdup_n_u16(0))), 7);`
	`755`	`+ vst1_lane_u16((uint16_t*)mem, vreinterpret_u16_u8(val), 0);`
	`756`	`+ }`
	`757`	`+`
`751`	`758`	`template <class A>`
`752`	`759`	`XSIMD_INLINE void store(batch_bool<float, A> b, bool* mem, requires_arch<neon>) noexcept`
`753`	`760`	`{`