Skip to content

Commit 88129a4

Browse files
committed
minor optimization
1 parent 1c3e40e commit 88129a4

File tree

2 files changed

+20
-8
lines changed

2 files changed

+20
-8
lines changed

include/finufft/defs.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,18 +45,21 @@
4545
#define FINUFFT_RESTRICT __restrict
4646
#define FINUFFT_UNREACHABLE __assume(0)
4747
#define FINUFFT_UNLIKELY(x) (x)
48+
#define FINUFFT_LIKELY(x) (x)
4849
#elif defined(__GNUC__) || defined(__clang__)
4950
#define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline
5051
#define FINUFFT_NEVER_INLINE __attribute__((noinline))
5152
#define FINUFFT_RESTRICT __restrict__
5253
#define FINUFFT_UNREACHABLE __builtin_unreachable()
5354
#define FINUFFT_UNLIKELY(x) __builtin_expect(!!(x), 0)
55+
#define FINUFFT_LIKELY(x) __builtin_expect(!!(x), 1)
5456
#else
5557
#define FINUFFT_ALWAYS_INLINE inline
5658
#define FINUFFT_NEVER_INLINE
5759
#define FINUFFT_RESTRICT
5860
#define FINUFFT_UNREACHABLE
5961
#define FINUFFT_UNLIKELY(x) (x)
62+
#define FINUFFT_LIKELY(x) (x)
6063
#endif
6164

6265
// ------------- Library-wide algorithm parameter settings ----------------

src/spreadinterp.cpp

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,10 @@ constexpr auto zip_hi_index =
4747
xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t, zip_hi>();
4848
template<class arch_t>
4949
constexpr auto select_even_mask =
50-
xsimd::make_batch_bool_constant<FLT, arch_t, select_even>();
50+
xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t, select_even>();
5151
template<class arch_t>
5252
constexpr auto select_odd_mask =
53-
xsimd::make_batch_bool_constant<FLT, arch_t, select_odd>();
53+
xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t, select_odd>();
5454
template<typename T, std::size_t N, std::size_t M, std::size_t PaddedM>
5555
constexpr std::array<std::array<T, PaddedM>, N> pad_2D_array_with_zeros(
5656
const std::array<std::array<T, M>, N> &input) noexcept;
@@ -817,7 +817,6 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker,
817817
Barnett 6/16/17.
818818
*/
819819
{
820-
821820
std::array<FLT, 2> out{0};
822821
BIGINT j = i1;
823822
if (FINUFFT_UNLIKELY(i1 < 0)) { // wraps at left
@@ -847,21 +846,29 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker,
847846
static constexpr auto alignment = arch_t::alignment();
848847
static constexpr auto simd_size = simd_type::size;
849848
static constexpr auto regular_part = (2 * ns + padding) & (-(2 * simd_size));
850-
simd_type res{0};
849+
simd_type res_low{0}, res_hi{0};
851850
for (uint8_t dx{0}; dx < regular_part; dx += 2 * simd_size) {
852851
const auto ker_v = simd_type::load_aligned(ker + dx / 2);
853852
const auto du_pt0 = simd_type::load_unaligned(du + dx);
854853
const auto du_pt1 = simd_type::load_unaligned(du + dx + simd_size);
855854
const auto ker0low = xsimd::swizzle(ker_v, zip_low_index<arch_t>);
856855
const auto ker0hi = xsimd::swizzle(ker_v, zip_hi_index<arch_t>);
857-
res = xsimd::fma(ker0low, du_pt0, xsimd::fma(ker0hi, du_pt1, res));
856+
res_low = xsimd::fma(ker0low, du_pt0, res_low);
857+
res_hi = xsimd::fma(ker0hi, du_pt1, res_hi);
858858
}
859859
if constexpr (regular_part < 2 * ns) {
860860
const auto ker0 = simd_type::load_unaligned(ker + (regular_part / 2));
861861
const auto du_pt = simd_type::load_unaligned(du + regular_part);
862862
const auto ker0low = xsimd::swizzle(ker0, zip_low_index<arch_t>);
863-
res = xsimd::fma(ker0low, du_pt, res);
863+
res_low = xsimd::fma(ker0low, du_pt, res_low);
864864
}
865+
// Disabled alternative (slower than summing the two accumulators and looping below):
866+
// const auto res_real = xsimd::shuffle(res_low, res_hi, select_even_mask<arch_t>);
867+
// const auto res_imag = xsimd::shuffle(res_low, res_hi, select_odd_mask<arch_t>);
868+
// out[0] = xsimd::reduce_add(res_real);
869+
// out[1] = xsimd::reduce_add(res_imag);
870+
871+
const auto res = res_low + res_hi;
865872
alignas(alignment) std::array<FLT, simd_size> res_array{};
866873
res.store_aligned(res_array.data());
867874
for (uint8_t i{0}; i < simd_size; i += 2) {
@@ -1957,10 +1964,12 @@ struct zip_hi {
19571964
};
19581965

19591966
struct select_even {
1960-
static constexpr bool get(unsigned index, unsigned /*size*/) { return index % 2 == 0; }
1967+
static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index * 2; }
19611968
};
19621969
struct select_odd {
1963-
static constexpr bool get(unsigned index, unsigned /*size*/) { return index % 2 == 1; }
1970+
static constexpr unsigned get(unsigned index, unsigned /*size*/) {
1971+
return index * 2 + 1;
1972+
}
19641973
};
19651974

19661975
void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3,

0 commit comments

Comments
 (0)