Skip to content

Commit bdf169f

Browse files
extra
1 parent 99a534c commit bdf169f

File tree

1 file changed

+22
-0
lines changed

1 file changed

+22
-0
lines changed

include/xsimd/arch/xsimd_rvv.hpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1517,13 +1517,35 @@ namespace xsimd
15171517
{
15181518
XSIMD_IF_CONSTEXPR((8 * sizeof(T)) >= batch_bool<T, A>::size)
15191519
{
1520+
// (A) Easy case: one bit per slot fits in a single T, i.e. T has at least as many bits as there are slots.
15201521
const auto zero = detail::broadcast<as_unsigned_integer_t<T>, types::detail::rvv_width_m1>(T(0));
15211522
auto ones = detail::broadcast<as_unsigned_integer_t<T>, A::width>(1);
15221523
auto iota = detail::vindex<A, as_unsigned_integer_t<T>>();
15231524
auto upowers = detail::rvvsll(ones, iota);
15241525
auto r = __riscv_vredor(self.data.as_mask(), upowers, (typename decltype(zero)::register_type)zero, batch_bool<T, A>::size);
15251526
return detail::reduce_scalar<A, as_unsigned_integer_t<T>>(r);
15261527
}
1528+
else XSIMD_IF_CONSTEXPR((2 * 8 * sizeof(T)) == batch_bool<T, A>::size)
1529+
{
1530+
// (B) There are exactly twice as many slots as bits in T, so we need two rounds: one for the low part, one for the high part.
1531+
1532+
// The low part is similar to the approach in (A).
1533+
const auto zero = detail::broadcast<as_unsigned_integer_t<T>, types::detail::rvv_width_m1>(T(0));
1534+
auto ones = detail::broadcast<as_unsigned_integer_t<T>, A::width>(1);
1535+
auto iota = detail::vindex<A, as_unsigned_integer_t<T>>();
1536+
auto upowers_low = detail::rvvsll(ones, iota);
1537+
auto low_mask = self & (batch<T, A>(detail::vindex<A, T>()) < batch_bool<T, A>::size / 2);
1538+
auto r_low = __riscv_vredor(low_mask.data.as_mask(), upowers_low, (typename decltype(zero)::register_type)zero, batch_bool<T, A>::size);
1539+
1540+
// The high part subtracts 8 * sizeof(T) from each lane index before the shift.
1541+
auto iota_high = __riscv_vsub(iota, 8 * sizeof(T), batch_bool<T, A>::size);
1542+
auto upowers_high = detail::rvvsll(ones, iota_high);
1543+
auto high_mask = self & (batch<T, A>(detail::vindex<A, T>()) >= batch_bool<T, A>::size / 2);
1544+
auto r_high = __riscv_vredor(high_mask.data.as_mask(), upowers_high, (typename decltype(zero)::register_type)zero, batch_bool<T, A>::size);
1545+
1546+
// Combine the two parts: the high part is shifted above the low 8 * sizeof(T) bits.
1547+
return detail::reduce_scalar<A, as_unsigned_integer_t<T>>(r_low) | (detail::reduce_scalar<A, as_unsigned_integer_t<T>>(r_high) << (8 * sizeof(T)));
1548+
}
15271549
else
15281550
{
15291551
return mask(self, common {});

0 commit comments

Comments
 (0)