Skip to content

Commit 14cd2f5

Browse files
Improve xsimd::expand common implementation
The previous implementation assumed that xsimd::insert is cheap, and it always generated batch::size inserts. This implementation can take advantage of a smaller popcount on the bitmask. Note to self: it would be great to have a good implementation for a constant mask.
1 parent 71a344e commit 14cd2f5

File tree

2 files changed

+10
-19
lines changed

2 files changed

+10
-19
lines changed

.github/workflows/cross-arm.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@ concurrency:
66
jobs:
77
build:
88
runs-on: ubuntu-latest
9-
name: '${{ matrix.target.arch }}, ${{ matrix.sys.compiler }} ${{ matrix.sys.version }}'
9+
name: '${{ matrix.target.arch }}, ${{ matrix.target.flags }}, ${{ matrix.sys.compiler }} ${{ matrix.sys.version }}'
1010
strategy:
1111
matrix:
1212
target:
1313
- { platform: 'arm', arch: 'armv7-a', dir: 'arm-linux-gnueabihf', flags: '-mfpu=neon', full: 'ON'}
14-
- { platform: 'arm', arch: 'armv7-a', dir: 'arm-linux-gnueabihf', flags: '-mfpu=vfpv3-d16', full: 'OFF' } # no neon
14+
- { platform: 'arm', arch: 'armv7-a', dir: 'arm-linux-gnueabihf', flags: '-mfpu=vfpv3-d16', full: 'OFF' } # no neon
1515
- { platform: 'aarch64', arch: 'armv8-a', dir: 'aarch64-linux-gnu', flags: '', full: 'ON' }
1616
sys:
1717
- { compiler: 'gcc', version: '10' }

include/xsimd/arch/common/xsimd_common_memory.hpp

Lines changed: 8 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -88,28 +88,19 @@ namespace xsimd
8888
}
8989

9090
// expand
91-
namespace detail
92-
{
93-
template <class IT, class A, class I, size_t... Is>
94-
XSIMD_INLINE batch<IT, A> create_expand_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence<Is...>)
95-
{
96-
batch<IT, A> swizzle_mask(IT(0));
97-
IT j = 0;
98-
(void)std::initializer_list<bool> { ((swizzle_mask = insert(swizzle_mask, j, index<Is>())), (j += ((bitmask >> Is) & 1u)), true)... };
99-
return swizzle_mask;
100-
}
101-
}
102-
10391
template <typename A, typename T>
10492
XSIMD_INLINE batch<T, A>
10593
expand(batch<T, A> const& x, batch_bool<T, A> const& mask,
10694
kernel::requires_arch<common>) noexcept
10795
{
108-
constexpr std::size_t size = batch_bool<T, A>::size;
109-
auto bitmask = mask.mask();
110-
auto swizzle_mask = detail::create_expand_swizzle_mask<as_unsigned_integer_t<T>, A>(bitmask, ::xsimd::detail::make_index_sequence<size>());
111-
auto z = swizzle(x, swizzle_mask);
112-
return select(mask, z, batch<T, A>(T(0)));
96+
alignas(A::alignment()) T x_in[x.size], x_out[x.size] = {T()};
97+
x.store_aligned(x_in);
98+
int i = 0, j = 0;
99+
for(auto bitmask = m.mask(); mask; mask >>= 1) {
100+
if(mask & 1)
101+
x_out[i] = x_int[j++];
102+
}
103+
return xsimd::batch<T, A>::load_aligned(x_out);
113104
}
114105

115106
// extract_pair

0 commit comments

Comments
 (0)