@@ -54,6 +54,8 @@ constexpr auto select_odd_mask =
5454template <typename T, std::size_t N, std::size_t M, std::size_t PaddedM>
5555constexpr std::array<std::array<T, PaddedM>, N> pad_2D_array_with_zeros (
5656 const std::array<std::array<T, M>, N> &input) noexcept ;
57+ template <typename T> FINUFFT_ALWAYS_INLINE auto xsimd_to_array (const T &vec) noexcept ;
58+
5759FINUFFT_NEVER_INLINE
5860void print_subgrid_info (int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3,
5961 UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, UBIGINT size3,
@@ -781,7 +783,6 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
781783 pad_2D_array_with_zeros<FLT, nc, w, padded_ns>(horner_coeffs);
782784
783785 const simd_type zv (z);
784-
785786 for (uint8_t i = 0 ; i < w; i += simd_size) {
786787 auto k = simd_type::load_aligned (padded_coeffs[0 ].data () + i);
787788 for (uint8_t j = 1 ; j < nc; ++j) {
@@ -904,9 +905,10 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker,
904905 // optimize the code better
905906 return res_low + res_hi;
906907 }();
908+ const auto res_array = xsimd_to_array (res);
907909 for (uint8_t i{0 }; i < simd_size; i += 2 ) {
908- out[0 ] += res. get (i) ;
909- out[1 ] += res. get ( i + 1 ) ;
910+ out[0 ] += res_array[i] ;
911+ out[1 ] += res_array[ i + 1 ] ;
910912 }
911913 // this is where the code differs from spread_kernel, the interpolator does an extra
912914 // reduction step to SIMD elements down to 2 elements
@@ -1061,9 +1063,10 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
10611063 }
10621064 return res_low + res_hi;
10631065 }();
1066+ const auto res_array = xsimd_to_array (res);
10641067 for (uint8_t i{0 }; i < simd_size; i += 2 ) {
1065- out[0 ] += res. get (i) ;
1066- out[1 ] += res. get ( i + 1 ) ;
1068+ out[0 ] += res_array[i] ;
1069+ out[1 ] += res_array[ i + 1 ] ;
10671070 }
10681071 } else { // wraps somewhere: use ptr list
10691072 // this is slower than above, but occurs much less often, with fractional
@@ -1227,9 +1230,10 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
12271230 }
12281231 return res_low + res_hi;
12291232 }();
1233+ const auto res_array = xsimd_to_array (res);
12301234 for (uint8_t i{0 }; i < simd_size; i += 2 ) {
1231- out[0 ] += res. get (i) ;
1232- out[1 ] += res. get ( i + 1 ) ;
1235+ out[0 ] += res_array[i] ;
1236+ out[1 ] += res_array[ i + 1 ] ;
12331237 }
12341238 } else {
12351239 return interp_cube_wrapped<ns, simd_type>(target, du, ker1, ker2, ker3, i1, i2, i3,
@@ -2170,6 +2174,13 @@ struct select_odd {
21702174 }
21712175};
21722176
2177+ template <typename T> auto xsimd_to_array (const T &vec) noexcept {
2178+ constexpr auto alignment = T::arch_type::alignment ();
2179+ alignas (alignment) std::array<typename T::value_type, T::size> array{};
2180+ vec.store_aligned (array.data ());
2181+ return array;
2182+ }
2183+
21732184void print_subgrid_info (int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3,
21742185 UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, UBIGINT size3,
21752186 UBIGINT M0) {
0 commit comments