Commit dd5dd2a

clean up
1 parent 4d25c51 commit dd5dd2a

1 file changed (+42, -131 lines)


src/spreadinterp.cpp

Lines changed: 42 additions & 131 deletions
@@ -24,14 +24,8 @@ namespace { // anonymous namespace for internal structs equivalent to declaring
 // static
 struct zip_low;
 struct zip_hi;
-template<unsigned cap>
-struct reverse_index;
-template<unsigned cap>
-struct select_index;
-template<unsigned cap>
-struct reverse_index_tail;
-template<unsigned cap>
-struct shuffle_index;
+template<unsigned cap> struct reverse_index;
+template<unsigned cap> struct shuffle_index;
 // forward declaration to clean up the code and be able to use this everywhere in the file
 template<class T, uint8_t N, uint8_t K = N> static constexpr auto BestSIMDHelper();
 template<class T, uint8_t N> constexpr auto GetPaddedSIMDWidth();
@@ -752,8 +746,7 @@ void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts
     if (opts.kerpad) {
       // padded part should be zero, in spread_subproblem_nd_kernels, there are
       // out of bound writes to trg arrays
-      for (int i = N; i < Npad; ++i)
-        ker[i] = 0.0;
+      for (int i = N; i < Npad; ++i) ker[i] = 0.0;
     }
   } else {
     for (int i = 0; i < N; i++) // dummy for timing only
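
For context on the kerpad branch above: the spread subproblem kernels intentionally let their vectorized inner loops run past the true kernel width N up to the padded width Npad, so the padded tail of ker must hold zeros for those extra lanes to be harmless. A minimal scalar model of that invariant (illustrative only, not the FINUFFT code; N, Npad, and the stand-in values are made up):

    #include <array>
    #include <cstdio>

    int main() {
      constexpr int N = 7, Npad = 8;                 // true width and padded width (illustrative)
      std::array<double, Npad> ker{};                // value-initialised, so the tail starts at 0
      for (int i = 0; i < N; ++i) ker[i] = 1.0;      // stand-in kernel values
      for (int i = N; i < Npad; ++i) ker[i] = 0.0;   // the zero-fill kept by this commit

      std::array<double, Npad> trg{};                // padded target array
      const double strength = 2.0;
      for (int i = 0; i < Npad; ++i)                 // runs past N on purpose
        trg[i] += strength * ker[i];                 // lanes >= N add exactly 0
      std::printf("trg[N] = %g (untouched)\n", trg[N]);
    }
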
@@ -764,66 +757,6 @@ void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts
       if (abs(args[i]) >= (FLT)opts.ES_halfwidth) ker[i] = 0.0;
 }
 
-template<uint8_t w, class simd_type> // aka ns
-void eval_kernel_vec_Horner_unaligned_store(FLT *FINUFFT_RESTRICT ker, const FLT x,
-                                            const finufft_spread_opts &opts) noexcept
-/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at
-   x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns.
-   This is the current evaluation method, since it's faster (except i7 w=16).
-   Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
-
-{
-  const FLT z = std::fma(FLT(2.0), x, FLT(w - 1)); // scale so local grid offset z in
-                                                   // [-1,1]
-  if (opts.upsampfac == 2.0) { // floating point equality is fine here
-    static constexpr auto alignment = simd_type::arch_type::alignment();
-    static constexpr auto simd_size = simd_type::size;
-    static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1);
-    static constexpr auto nc = nc200<w>();
-    static constexpr auto horner_coeffs = get_horner_coeffs_200<FLT, w>();
-
-    alignas(alignment) static constexpr auto padded_coeffs =
-        pad_2D_array_with_zeros<FLT, nc, w, padded_ns>(horner_coeffs);
-
-    static constexpr uint8_t nvec = (w+simd_size-1)/simd_size;
-    static constexpr uint8_t nvec_eval = (nvec+1)/2;
-    static constexpr uint8_t n_eval = simd_size*nvec_eval;
-    static constexpr uint8_t if_odd_degree = ((nc+1) % 2);
-    static const simd_type zerov(0.0);
-    const simd_type zv(z);
-    const simd_type z2v = zv * zv;
-    alignas(alignment) std::array<FLT, simd_size> sym_{};
-
-    // process simd vecs
-    for (uint8_t i = 0; i < n_eval; i += simd_size) {
-      auto k_odd = if_odd_degree ? simd_type::load_aligned(padded_coeffs[0].data() + i) : zerov;
-      auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
-      for (uint8_t j = 1+if_odd_degree; j < nc; j += 2) {
-        const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i);
-        k_odd = xsimd::fma(k_odd, z2v, cji_odd);
-        const auto cji_even = simd_type::load_aligned(padded_coeffs[j+1].data() + i);
-        k_even = xsimd::fma(k_even, z2v, cji_even);
-      }
-
-      // left
-      xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);
-
-      // right
-      xsimd::fma(k_odd, -zv, k_even).store_aligned(sym_.data());
-      // let compiler optimize the store, probably unaligned?
-      for (uint8_t j=0, j2=w-1-i; (j<simd_size)&&(j2>=n_eval); ++j,--j2) {
-        ker[j2] = sym_[j];
-      }
-    }
-    return;
-  }
-  // insert the auto-generated code which expects z, w args, writes to ker...
-  if (opts.upsampfac == 1.25) {
-#include "ker_lowupsampfac_horner_allw_loop_constexpr.c"
-    return;
-  }
-}
-
 template<uint8_t w, class simd_type> // aka ns
 void eval_kernel_vec_Horner(FLT *FINUFFT_RESTRICT ker, const FLT x,
                             const finufft_spread_opts &opts) noexcept
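
The deleted eval_kernel_vec_Horner_unaligned_store and the surviving eval_kernel_vec_Horner share the same evaluation scheme: each kernel column polynomial p(z), with coefficients stored highest power first and padded to padded_ns = (w + simd_size - 1) & ~(simd_size - 1) so every coefficient row allows aligned full-width loads, is split as p(z) = E(z^2) + z*O(z^2). One Horner recurrence in z^2 builds E and O, and a final fma with +z or -z yields a sample and its mirror. A scalar sketch of that split, with made-up coefficients (the real ones come from get_horner_coeffs_200<FLT, w>()):

    #include <array>
    #include <cmath>
    #include <cstddef>
    #include <cstdio>

    int main() {
      // nc = 4 coefficients, highest power first: p(z) = c0*z^3 + c1*z^2 + c2*z + c3
      const std::array<double, 4> c = {0.1, -0.4, 0.9, 2.0};
      const double z = 0.37, z2 = z * z;

      double k_odd = c[0], k_even = c[1];             // if_odd_degree == 1 when nc is even
      for (std::size_t j = 2; j < c.size(); j += 2) {
        k_odd  = std::fma(k_odd, z2, c[j]);           // Horner in z^2 over odd-power coeffs
        k_even = std::fma(k_even, z2, c[j + 1]);      // Horner in z^2 over even-power coeffs
      }
      const double left  = std::fma(k_odd, z, k_even);   // p(+z): a left-half sample
      const double right = std::fma(k_odd, -z, k_even);  // p(-z): the mirrored sample
      const double check = ((0.1 * z - 0.4) * z + 0.9) * z + 2.0; // plain Horner at +z
      std::printf("p(+z)=%g p(-z)=%g check=%g\n", left, right, check);
    }
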
@@ -849,76 +782,68 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
 
     // use kernel symmetry trick if w > simd_size
     if constexpr (use_ker_sym) {
-      static constexpr uint8_t tail = w % simd_size;
-      static constexpr uint8_t if_odd_degree = ((nc+1) % 2);
+      static constexpr uint8_t tail          = w % simd_size;
+      static constexpr uint8_t if_odd_degree = ((nc + 1) % 2);
       static const simd_type zerov(0.0);
       const simd_type zv(z);
       const simd_type z2v = zv * zv;
 
       // no xsimd::select neeeded if tail is zero
       if constexpr (tail) {
-        // some xsimd constants for shuffle
-        //static constexpr auto reverse_batch_head = xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t, reverse_index<tail>>();
-        //static constexpr auto reverse_batch_tail = xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t, reverse_index_tail<tail>>();
-        //static constexpr auto select_batch = xsimd::make_batch_bool_constant<typename simd_type::value_type, arch_t, select_index<tail>>();
-        static constexpr auto shuffle_batch = xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t, shuffle_index<tail>>();
+        // some xsimd constant for shuffle
+        static constexpr auto shuffle_batch =
+            xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t,
+                                       shuffle_index<tail>>();
 
         // process simd vecs
         simd_type k_odd, k_even, k_prev, k_sym = zerov;
-        for (uint8_t i = 0, offset = w - tail; i < (w+1)/2; i += simd_size, offset -= simd_size) {
-          k_odd = if_odd_degree ? simd_type::load_aligned(padded_coeffs[0].data() + i) : zerov;
+        for (uint8_t i = 0, offset = w - tail; i < (w + 1) / 2;
+             i += simd_size, offset -= simd_size) {
+          k_odd  = if_odd_degree ? simd_type::load_aligned(padded_coeffs[0].data() + i)
+                                 : zerov;
           k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
-          for (uint8_t j = 1+if_odd_degree; j < nc; j += 2) {
-            const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i);
-            k_odd = xsimd::fma(k_odd, z2v, cji_odd);
-            const auto cji_even = simd_type::load_aligned(padded_coeffs[j+1].data() + i);
-            k_even = xsimd::fma(k_even, z2v, cji_even);
+          for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) {
+            const auto cji_odd  = simd_type::load_aligned(padded_coeffs[j].data() + i);
+            k_odd               = xsimd::fma(k_odd, z2v, cji_odd);
+            const auto cji_even =
+                simd_type::load_aligned(padded_coeffs[j + 1].data() + i);
+            k_even              = xsimd::fma(k_even, z2v, cji_even);
           }
           xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);
-          if (offset >= (w+1)/2) {
+          if (offset >= (w + 1) / 2) {
             k_prev = k_sym;
-            k_sym = xsimd::fma(k_odd, -zv, k_even);
+            k_sym  = xsimd::fma(k_odd, -zv, k_even);
             xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset);
-            /*
-            // the following is the equivalent code for the shuffle operation to avoid one swizzle in the first iteration
-            // seems not helping the performance
-            if (i==0) {
-              // save one xsimd::swizzle for the first iteration(k_prev is zerov)
-              // by assumption, ker is padded to be multiple of simd_size
-              // the padded part must be zero because in spread_subproblem_*d_kernel, trg has out of bound writes.
-              xsimd::select(select_batch, xsimd::swizzle(k_sym, reverse_batch_head), zerov).store_aligned(ker + offset);
-            }
-            else {
-              // xsimd::select of two xsimd::swizzle is the xsimd::shuffle for the general shuffle case
-              //xsimd::select(select_batch, xsimd::swizzle(k_sym, reverse_batch_head), xsimd::swizzle(k_prev, reverse_batch_tail)).store_aligned(ker + offset);
-              xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset);
-            }
-            */
           }
         }
-      }
-      else {
+      } else {
         // xsimd constants for reverse
-        static constexpr auto reverse_batch = xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t, reverse_index<simd_size>>();
+        static constexpr auto reverse_batch =
+            xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t,
+                                       reverse_index<simd_size>>();
 
         // process simd vecs
-        for (uint8_t i = 0, offset = w - simd_size; i < w/2; i += simd_size, offset -= simd_size) {
-          auto k_odd = if_odd_degree ? simd_type::load_aligned(padded_coeffs[0].data() + i) : zerov;
+        for (uint8_t i = 0, offset = w - simd_size; i < w / 2;
+             i += simd_size, offset -= simd_size) {
+          auto k_odd = if_odd_degree
+                           ? simd_type::load_aligned(padded_coeffs[0].data() + i)
+                           : zerov;
           auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
-          for (uint8_t j = 1+if_odd_degree; j < nc; j += 2) {
-            const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i);
-            k_odd = xsimd::fma(k_odd, z2v, cji_odd);
-            const auto cji_even = simd_type::load_aligned(padded_coeffs[j+1].data() + i);
-            k_even = xsimd::fma(k_even, z2v, cji_even);
+          for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) {
+            const auto cji_odd  = simd_type::load_aligned(padded_coeffs[j].data() + i);
+            k_odd               = xsimd::fma(k_odd, z2v, cji_odd);
+            const auto cji_even =
+                simd_type::load_aligned(padded_coeffs[j + 1].data() + i);
+            k_even              = xsimd::fma(k_even, z2v, cji_even);
           }
           xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);
-          if(offset >= w/2) {
-            xsimd::swizzle(xsimd::fma(k_odd, -zv, k_even), reverse_batch).store_aligned(ker + offset);
+          if (offset >= w / 2) {
+            xsimd::swizzle(xsimd::fma(k_odd, -zv, k_even), reverse_batch)
+                .store_aligned(ker + offset);
           }
         }
       }
-    }
-    else {
+    } else {
       const simd_type zv(z);
 
       for (uint8_t i = 0; i < w; i += simd_size) {
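
In the no-tail branch above (w a multiple of simd_size), each batch evaluated at +z fills ker[i .. i+simd_size), while the same batch evaluated at -z is reversed via reverse_index<simd_size> and stored at ker + offset with offset = w - simd_size - i, since lane l of the batch at column i corresponds to kernel position w - 1 - (i + l). A scalar model of that index bookkeeping (stand-in values, no xsimd):

    #include <array>
    #include <cstdio>

    int main() {
      constexpr unsigned w = 8, simd_size = 4;       // w a multiple of simd_size: no tail
      std::array<double, w> ker{};
      for (unsigned i = 0, offset = w - simd_size; i < w / 2;
           i += simd_size, offset -= simd_size) {
        std::array<double, simd_size> left{}, sym{};
        for (unsigned l = 0; l < simd_size; ++l) {
          left[l] = 100 + i + l;   // stands in for lane l of fma(k_odd, +zv, k_even)
          sym[l]  = 200 + i + l;   // stands in for lane l of fma(k_odd, -zv, k_even)
        }
        for (unsigned l = 0; l < simd_size; ++l) ker[i + l] = left[l];
        if (offset >= w / 2)       // reverse_index<simd_size> reverses the whole batch
          for (unsigned l = 0; l < simd_size; ++l) ker[offset + l] = sym[simd_size - 1 - l];
      }
      for (double v : ker) std::printf("%g ", v);    // 100 101 102 103 203 202 201 200
      std::printf("\n");
    }
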
@@ -2080,26 +2005,12 @@ struct zip_hi {
     return (size + index) / 2;
   }
 };
-template<unsigned cap>
-struct reverse_index {
+template<unsigned cap> struct reverse_index {
   static constexpr unsigned get(unsigned index, const unsigned size) {
     return index < cap ? (cap - 1 - index) : index;
   }
 };
-template<unsigned cap>
-struct select_index {
-  static constexpr bool get(unsigned index, const unsigned size) {
-    return index < cap ? 1 : 0;
-  }
-};
-template<unsigned cap>
-struct reverse_index_tail {
-  static constexpr unsigned get(unsigned index, const unsigned size) {
-    return index < cap ? index : size + cap - 1 - index;
-  }
-};
-template<unsigned cap>
-struct shuffle_index {
+template<unsigned cap> struct shuffle_index {
   static constexpr unsigned get(unsigned index, const unsigned size) {
     return index < cap ? (cap - 1 - index) : size + size + cap - 1 - index;
   }
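
reverse_index<cap> and shuffle_index<cap> are the index generators fed to xsimd::make_batch_constant: reverse_index<simd_size> simply reverses a batch, while shuffle_index<tail> is used with xsimd::shuffle(k_sym, k_prev, ...), whose mask picks lane m of the first operand when m < size and lane m - size of the second otherwise. The resulting store has its low tail lanes equal to the current symmetric batch reversed and its remaining lanes equal to the high lanes of the previous one, also reversed, which is how the mirrored half is stitched across batch boundaries when w is not a multiple of simd_size. A scalar model of that mapping for simd_size = 4, tail = 3 (values are stand-ins):

    #include <array>
    #include <cstdio>

    // same formula as shuffle_index<cap>::get above
    constexpr unsigned shuffle_get(unsigned cap, unsigned index, unsigned size) {
      return index < cap ? (cap - 1 - index) : size + size + cap - 1 - index;
    }

    int main() {
      constexpr unsigned size = 4, cap = 3;                // e.g. simd_size = 4, tail = w % 4 = 3
      std::array<double, size> k_sym  = {10, 11, 12, 13};  // current iteration, evaluated at -z
      std::array<double, size> k_prev = {20, 21, 22, 23};  // previous iteration's k_sym
      std::array<double, size> out{};
      for (unsigned l = 0; l < size; ++l) {
        const unsigned m = shuffle_get(cap, l, size);
        out[l] = (m < size) ? k_sym[m] : k_prev[m - size]; // shuffle over (k_sym, k_prev)
      }
      // out = {k_sym[2], k_sym[1], k_sym[0], k_prev[3]}: low `tail` lanes are the current
      // symmetric batch reversed, the rest are the previous batch's high lanes reversed.
      for (double v : out) std::printf("%g ", v);
      std::printf("\n");
    }
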
