@@ -47,10 +47,10 @@ constexpr auto zip_hi_index =
4747 xsimd::make_batch_constant<xsimd::as_unsigned_integer_t <FLT>, arch_t , zip_hi>();
4848template <class arch_t >
4949constexpr auto select_even_mask =
50- xsimd::make_batch_bool_constant< FLT, arch_t , select_even>();
50+ xsimd::make_batch_constant<xsimd:: as_unsigned_integer_t < FLT> , arch_t , select_even>();
5151template <class arch_t >
5252constexpr auto select_odd_mask =
53- xsimd::make_batch_bool_constant< FLT, arch_t , select_odd>();
53+ xsimd::make_batch_constant<xsimd:: as_unsigned_integer_t < FLT> , arch_t , select_odd>();
// Forward declaration only; the definition is not in this part of the file.
// Takes an N x M array-of-arrays by const reference and returns an N x PaddedM
// array-of-arrays by value. PaddedM cannot be deduced from the argument, so
// callers must supply the template arguments explicitly.
// NOTE(review): the name suggests each row is extended from M to PaddedM
// entries with trailing zeros -- confirm against the definition.
template<typename T, std::size_t N, std::size_t M, std::size_t PaddedM>
constexpr std::array<std::array<T, PaddedM>, N> pad_2D_array_with_zeros(
    const std::array<std::array<T, M>, N> &input) noexcept;
@@ -817,7 +817,6 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker,
817817 Barnett 6/16/17.
818818*/
819819{
820-
821820 std::array<FLT, 2 > out{0 };
822821 BIGINT j = i1;
823822 if (FINUFFT_UNLIKELY (i1 < 0 )) { // wraps at left
@@ -847,21 +846,29 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker,
847846 static constexpr auto alignment = arch_t::alignment ();
848847 static constexpr auto simd_size = simd_type::size;
849848 static constexpr auto regular_part = (2 * ns + padding) & (-(2 * simd_size));
850- simd_type res {0 };
849+ simd_type res_low{ 0 }, res_hi {0 };
851850 for (uint8_t dx{0 }; dx < regular_part; dx += 2 * simd_size) {
852851 const auto ker_v = simd_type::load_aligned (ker + dx / 2 );
853852 const auto du_pt0 = simd_type::load_unaligned (du + dx);
854853 const auto du_pt1 = simd_type::load_unaligned (du + dx + simd_size);
855854 const auto ker0low = xsimd::swizzle (ker_v, zip_low_index<arch_t >);
856855 const auto ker0hi = xsimd::swizzle (ker_v, zip_hi_index<arch_t >);
857- res = xsimd::fma (ker0low, du_pt0, xsimd::fma (ker0hi, du_pt1, res));
856+ res_low = xsimd::fma (ker0low, du_pt0, res_low);
857+ res_hi = xsimd::fma (ker0hi, du_pt1, res_hi);
858858 }
859859 if constexpr (regular_part < 2 * ns) {
860860 const auto ker0 = simd_type::load_unaligned (ker + (regular_part / 2 ));
861861 const auto du_pt = simd_type::load_unaligned (du + regular_part);
862862 const auto ker0low = xsimd::swizzle (ker0, zip_low_index<arch_t >);
863- res = xsimd::fma (ker0low, du_pt, res );
863+ res_low = xsimd::fma (ker0low, du_pt, res_low );
864864 }
865+ // This is slower than summing and looping
866+ // const auto res_real = xsimd::shuffle(res_low, res_hi,
867+ // select_even_mask<arch_t>); const auto res_imag = xsimd::shuffle(res_low,
868+ // res_hi, select_odd_mask<arch_t>); out[0] = xsimd::reduce_add(res_real); out[1]
869+ // = xsimd::reduce_add(res_imag);
870+
871+ const auto res = res_low + res_hi;
865872 alignas (alignment) std::array<FLT, simd_size> res_array{};
866873 res.store_aligned (res_array.data ());
867874 for (uint8_t i{0 }; i < simd_size; i += 2 ) {
@@ -1957,10 +1964,12 @@ struct zip_hi {
19571964};
19581965
// Index generator for xsimd::make_batch_constant: output lane `index` is
// assigned the even position 2 * index. The second argument (the batch size)
// is unused.
struct select_even {
  static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index * 2; }
};
// Index generator for xsimd::make_batch_constant: output lane `index` is
// assigned the odd position 2 * index + 1. The second argument (the batch
// size) is unused.
struct select_odd {
  static constexpr unsigned get(unsigned index, unsigned /*size*/) {
    return index * 2 + 1;
  }
};
19651974
19661975void print_subgrid_info (int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3,
0 commit comments