Skip to content

Commit 588fa1a

Browse files
extra
1 parent 99a534c commit 588fa1a

File tree

1 file changed

+19
-0
lines changed

1 file changed

+19
-0
lines changed

include/xsimd/arch/xsimd_rvv.hpp

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1517,13 +1517,32 @@ namespace xsimd
15171517
{
15181518
XSIMD_IF_CONSTEXPR((8 * sizeof(T)) >= batch_bool<T, A>::size)
15191519
{
1520+
// (A) Easy case: the number of slots fits in T.
15201521
const auto zero = detail::broadcast<as_unsigned_integer_t<T>, types::detail::rvv_width_m1>(T(0));
15211522
auto ones = detail::broadcast<as_unsigned_integer_t<T>, A::width>(1);
15221523
auto iota = detail::vindex<A, as_unsigned_integer_t<T>>();
15231524
auto upowers = detail::rvvsll(ones, iota);
15241525
auto r = __riscv_vredor(self.data.as_mask(), upowers, (typename decltype(zero)::register_type)zero, batch_bool<T, A>::size);
15251526
return detail::reduce_scalar<A, as_unsigned_integer_t<T>>(r);
15261527
}
1528+
else XSIMD_IF_CONSTEXPR((2 * 8 * sizeof(T)) == batch_bool<T, A>::size) {
1529+
// (B) We need two rounds, one for the low part, one for the high part.
1530+
1531+
// The low part is similar to the approach in (A).
1532+
const auto zero = detail::broadcast<as_unsigned_integer_t<T>, types::detail::rvv_width_m1>(T(0));
1533+
auto ones = detail::broadcast<as_unsigned_integer_t<T>, A::width>(1);
1534+
auto iota_low = detail::vindex<A, as_unsigned_integer_t<T>>();
1535+
auto upowers_low = detail::rvvsll(ones, iota_low);
1536+
auto r_low = __riscv_vredor(self.data.as_mask(), upowers_low, (typename decltype(zero)::register_type)zero, batch_bool<T, A>::size);
1537+
1538+
// The high part requires a sub before the shift. The lower part
1539+
// gets a negative number interpreted as a very high positive
1540+
// number because we work on unsigned numbers.
1541+
auto iota_high = __riscv_vsub(iota_low, 8 * sizeof(T), batch_bool<T, A>::size);
1542+
auto upowers_high = detail::rvvsll(ones, iota_high);
1543+
auto r_high = __riscv_vredor(self.data.as_mask(), upowers_high, (typename decltype(zero)::register_type)zero, batch_bool<T, A>::size);
1544+
return detail::reduce_scalar<A, as_unsigned_integer_t<T>>(r_low) | (detail::reduce_scalar<A, as_unsigned_integer_t<T>>(r_high) << 8 * sizeof(T));
1545+
}
15271546
else
15281547
{
15291548
return mask(self, common {});

0 commit comments

Comments
 (0)