Skip to content

Commit 6e51bf1

Browse files
committed
using array is faster then xsimd::get
1 parent b2d7e3e commit 6e51bf1

File tree

1 file changed

+18
-7
lines changed

1 file changed

+18
-7
lines changed

src/spreadinterp.cpp

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ constexpr auto select_odd_mask =
5454
template<typename T, std::size_t N, std::size_t M, std::size_t PaddedM>
5555
constexpr std::array<std::array<T, PaddedM>, N> pad_2D_array_with_zeros(
5656
const std::array<std::array<T, M>, N> &input) noexcept;
57+
template<typename T> FINUFFT_ALWAYS_INLINE auto xsimd_to_array(const T &vec) noexcept;
58+
5759
FINUFFT_NEVER_INLINE
5860
void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3,
5961
UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, UBIGINT size3,
@@ -781,7 +783,6 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
781783
pad_2D_array_with_zeros<FLT, nc, w, padded_ns>(horner_coeffs);
782784

783785
const simd_type zv(z);
784-
785786
for (uint8_t i = 0; i < w; i += simd_size) {
786787
auto k = simd_type::load_aligned(padded_coeffs[0].data() + i);
787788
for (uint8_t j = 1; j < nc; ++j) {
@@ -904,9 +905,10 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker,
904905
// optimize the code better
905906
return res_low + res_hi;
906907
}();
908+
const auto res_array = xsimd_to_array(res);
907909
for (uint8_t i{0}; i < simd_size; i += 2) {
908-
out[0] += res.get(i);
909-
out[1] += res.get(i + 1);
910+
out[0] += res_array[i];
911+
out[1] += res_array[i + 1];
910912
}
911913
// this is where the code differs from spread_kernel, the interpolator does an extra
912914
// reduction step to SIMD elements down to 2 elements
@@ -1061,9 +1063,10 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
10611063
}
10621064
return res_low + res_hi;
10631065
}();
1066+
const auto res_array = xsimd_to_array(res);
10641067
for (uint8_t i{0}; i < simd_size; i += 2) {
1065-
out[0] += res.get(i);
1066-
out[1] += res.get(i + 1);
1068+
out[0] += res_array[i];
1069+
out[1] += res_array[i + 1];
10671070
}
10681071
} else { // wraps somewhere: use ptr list
10691072
// this is slower than above, but occurs much less often, with fractional
@@ -1227,9 +1230,10 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
12271230
}
12281231
return res_low + res_hi;
12291232
}();
1233+
const auto res_array = xsimd_to_array(res);
12301234
for (uint8_t i{0}; i < simd_size; i += 2) {
1231-
out[0] += res.get(i);
1232-
out[1] += res.get(i + 1);
1235+
out[0] += res_array[i];
1236+
out[1] += res_array[i + 1];
12331237
}
12341238
} else {
12351239
return interp_cube_wrapped<ns, simd_type>(target, du, ker1, ker2, ker3, i1, i2, i3,
@@ -2170,6 +2174,13 @@ struct select_odd {
21702174
}
21712175
};
21722176

2177+
template<typename T> auto xsimd_to_array(const T &vec) noexcept {
2178+
constexpr auto alignment = T::arch_type::alignment();
2179+
alignas(alignment) std::array<typename T::value_type, T::size> array{};
2180+
vec.store_aligned(array.data());
2181+
return array;
2182+
}
2183+
21732184
void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3,
21742185
UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, UBIGINT size3,
21752186
UBIGINT M0) {

0 commit comments

Comments
 (0)