@@ -1517,13 +1517,35 @@ namespace xsimd
15171517 {
15181518 XSIMD_IF_CONSTEXPR ((8 * sizeof (T)) >= batch_bool<T, A>::size)
15191519 {
1520+ // (A) Easy case: the number of slots fits in T.
15201521 const auto zero = detail::broadcast<as_unsigned_integer_t <T>, types::detail::rvv_width_m1>(T (0 ));
15211522 auto ones = detail::broadcast<as_unsigned_integer_t <T>, A::width>(1 );
15221523 auto iota = detail::vindex<A, as_unsigned_integer_t <T>>();
15231524 auto upowers = detail::rvvsll (ones, iota);
15241525 auto r = __riscv_vredor (self.data .as_mask (), upowers, (typename decltype (zero)::register_type)zero, batch_bool<T, A>::size);
15251526 return detail::reduce_scalar<A, as_unsigned_integer_t <T>>(r);
15261527 }
1528+ else XSIMD_IF_CONSTEXPR ((2 * 8 * sizeof (T)) == batch_bool<T, A>::size)
1529+ {
1530+ // (B) We need two rounds, one for the low part, one for the high part.
1531+
1532+ // The low part is similar to the approach in (A).
1533+ const auto zero = detail::broadcast<as_unsigned_integer_t <T>, types::detail::rvv_width_m1>(T (0 ));
1534+ auto ones = detail::broadcast<as_unsigned_integer_t <T>, A::width>(1 );
1535+ auto iota = ::vindex<A, as_unsigned_integer_t <T>>();
1536+ auto upowers_low = detail::rvvsll (ones, iota);
1537+ auto low_mask = self & (batch<T, A>(detail::vindex<A, T>()) < batch_bool<T, A>::size / 2 );
1538+ auto r_low = __riscv_vredor (low_mask.as_mask (), upowers_low, (typename decltype (zero)::register_type)zero, batch_bool<T, A>::size);
1539+
1540+ // The high part requires a sub before the shift.
1541+ auto iota_high = __riscv_vsub (iota, 8 * sizeof (T), batch_bool<T, A>::size);
1542+ auto upowers_high = detail::rvvsll (ones, iota_high);
1543+ auto high_mask = self & (batch<T, A>(detail::vindex<A, T>()) >= batch_bool<T, A>::size / 2 );
1544+ auto r_high = __riscv_vredor (high_mask.as_mask (), upowers_high, (typename decltype (zero)::register_type)zero, batch_bool<T, A>::size);
1545+
1546+ // Agglomerate the two parts.
1547+ return detail::reduce_scalar<A, as_unsigned_integer_t <T>>(r_low) | (detail::reduce_scalar<A, as_unsigned_integer_t <T>>(r_high) << (8 * sizeof (T)));
1548+ }
15271549 else
15281550 {
15291551 return mask (self, common {});
0 commit comments