Skip to content

Commit 97dc14d

Browse files
Improve xsimd::expand common implementation
The previous implementation assumed that xsimd::insert is cheap, and it always generated batch::size inserts. This implementation can take advantage of a smaller popcount on the bitmask. Note to self: it would be great to have a good implementation for a constant mask.
1 parent 71a344e commit 97dc14d

File tree

1 file changed

+9
-17
lines changed

1 file changed

+9
-17
lines changed

include/xsimd/arch/common/xsimd_common_memory.hpp

Lines changed: 9 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -88,28 +88,20 @@ namespace xsimd
8888
}
8989

9090
// expand
91-
namespace detail
92-
{
93-
template <class IT, class A, class I, size_t... Is>
94-
XSIMD_INLINE batch<IT, A> create_expand_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence<Is...>)
95-
{
96-
batch<IT, A> swizzle_mask(IT(0));
97-
IT j = 0;
98-
(void)std::initializer_list<bool> { ((swizzle_mask = insert(swizzle_mask, j, index<Is>())), (j += ((bitmask >> Is) & 1u)), true)... };
99-
return swizzle_mask;
100-
}
101-
}
102-
10391
template <typename A, typename T>
10492
XSIMD_INLINE batch<T, A>
10593
expand(batch<T, A> const& x, batch_bool<T, A> const& mask,
10694
kernel::requires_arch<common>) noexcept
10795
{
108-
constexpr std::size_t size = batch_bool<T, A>::size;
109-
auto bitmask = mask.mask();
110-
auto swizzle_mask = detail::create_expand_swizzle_mask<as_unsigned_integer_t<T>, A>(bitmask, ::xsimd::detail::make_index_sequence<size>());
111-
auto z = swizzle(x, swizzle_mask);
112-
return select(mask, z, batch<T, A>(T(0)));
96+
constexpr auto size = x.size;
97+
alignas(A::alignment()) T x_in[size], x_out[size] = {T()};
98+
x.store_aligned(x_in);
99+
int i = 0, j = 0;
100+
// Walk the mask one bit per destination lane: `i` (destination lane) must
// advance on every bit, while `j` (next packed source element) advances only
// when the bit is set. Without `++i` every selected value lands in x_out[0].
for(auto bitmask = mask.mask(); bitmask; bitmask >>= 1, ++i) {
101+
if(bitmask & 1)
102+
x_out[i] = x_in[j++];
103+
}
104+
return xsimd::batch<T, A>::load_aligned(x_out);
113105
}
114106

115107
// extract_pair

0 commit comments

Comments
 (0)