@@ -24,14 +24,8 @@ namespace { // anonymous namespace for internal structs equivalent to declaring
 // static
 struct zip_low;
 struct zip_hi;
-template<unsigned cap>
-struct reverse_index;
-template<unsigned cap>
-struct select_index;
-template<unsigned cap>
-struct reverse_index_tail;
-template<unsigned cap>
-struct shuffle_index;
+template<unsigned cap> struct reverse_index;
+template<unsigned cap> struct shuffle_index;
 // forward declaration to clean up the code and be able to use this everywhere in the file
 template<class T, uint8_t N, uint8_t K = N> static constexpr auto BestSIMDHelper();
 template<class T, uint8_t N> constexpr auto GetPaddedSIMDWidth();
@@ -752,8 +746,7 @@ void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts
     if (opts.kerpad) {
       // padded part should be zero, in spread_subproblem_nd_kernels, there are
       // out of bound writes to trg arrays
-      for (int i = N; i < Npad; ++i)
-        ker[i] = 0.0;
+      for (int i = N; i < Npad; ++i) ker[i] = 0.0;
     }
   } else {
     for (int i = 0; i < N; i++) // dummy for timing only
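
The zeroed pad matters: the spread_subproblem kernels deliberately write past the true kernel width into padded target arrays, so any stale values in ker[N..Npad) would be folded into the result. A minimal standalone sketch of that invariant, with a hypothetical pad_to helper (not finufft's API):

```cpp
#include <array>
#include <cstdio>

// Round n up to the next multiple of the SIMD width; the same bit trick as the
// (w + simd_size - 1) & ~(simd_size - 1) padding used elsewhere in this patch.
constexpr unsigned pad_to(unsigned n, unsigned simd) {
  return (n + simd - 1) & ~(simd - 1);
}

int main() {
  constexpr unsigned N = 7, simd = 4, Npad = pad_to(N, simd); // Npad == 8
  std::array<double, Npad> ker{};
  for (unsigned i = 0; i < N; ++i) ker[i] = 1.0;    // true kernel values
  for (unsigned i = N; i < Npad; ++i) ker[i] = 0.0; // pad must stay inert
  double acc = 0;
  for (unsigned i = 0; i < Npad; ++i) acc += ker[i]; // full-width, vector-friendly
  std::printf("sum over padded width = %g\n", acc);  // pad contributes nothing
}
```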
@@ -764,66 +757,6 @@ void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts
     if (abs(args[i]) >= (FLT)opts.ES_halfwidth) ker[i] = 0.0;
 }
 
-template<uint8_t w, class simd_type> // aka ns
-void eval_kernel_vec_Horner_unaligned_store(FLT *FINUFFT_RESTRICT ker, const FLT x,
-                                            const finufft_spread_opts &opts) noexcept
-/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at
-   x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns.
-   This is the current evaluation method, since it's faster (except i7 w=16).
-   Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
-
-{
-  const FLT z = std::fma(FLT(2.0), x, FLT(w - 1)); // scale so local grid offset z in
-                                                   // [-1,1]
-  if (opts.upsampfac == 2.0) { // floating point equality is fine here
-    static constexpr auto alignment = simd_type::arch_type::alignment();
-    static constexpr auto simd_size = simd_type::size;
-    static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1);
-    static constexpr auto nc = nc200<w>();
-    static constexpr auto horner_coeffs = get_horner_coeffs_200<FLT, w>();
-
-    alignas(alignment) static constexpr auto padded_coeffs =
-        pad_2D_array_with_zeros<FLT, nc, w, padded_ns>(horner_coeffs);
-
-    static constexpr uint8_t nvec = (w+simd_size-1)/simd_size;
-    static constexpr uint8_t nvec_eval = (nvec+1)/2;
-    static constexpr uint8_t n_eval = simd_size*nvec_eval;
-    static constexpr uint8_t if_odd_degree = ((nc+1) % 2);
-    static const simd_type zerov(0.0);
-    const simd_type zv(z);
-    const simd_type z2v = zv * zv;
-    alignas(alignment) std::array<FLT, simd_size> sym_{};
-
-    // process simd vecs
-    for (uint8_t i = 0; i < n_eval; i += simd_size) {
-      auto k_odd = if_odd_degree ? simd_type::load_aligned(padded_coeffs[0].data() + i) : zerov;
-      auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
-      for (uint8_t j = 1+if_odd_degree; j < nc; j += 2) {
-        const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i);
-        k_odd = xsimd::fma(k_odd, z2v, cji_odd);
-        const auto cji_even = simd_type::load_aligned(padded_coeffs[j+1].data() + i);
-        k_even = xsimd::fma(k_even, z2v, cji_even);
-      }
-
-      // left
-      xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);
-
-      // right
-      xsimd::fma(k_odd, -zv, k_even).store_aligned(sym_.data());
-      // let compiler optimize the store, probably unaligned?
-      for (uint8_t j=0, j2=w-1-i; (j<simd_size)&&(j2>=n_eval); ++j,--j2) {
-        ker[j2] = sym_[j];
-      }
-    }
-    return;
-  }
-  // insert the auto-generated code which expects z, w args, writes to ker...
-  if (opts.upsampfac == 1.25) {
-#include "ker_lowupsampfac_horner_allw_loop_constexpr.c"
-    return;
-  }
-}
-
 template<uint8_t w, class simd_type> // aka ns
 void eval_kernel_vec_Horner(FLT *FINUFFT_RESTRICT ker, const FLT x,
                             const finufft_spread_opts &opts) noexcept
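
eval_kernel_vec_Horner, which the diff keeps, relies on an even/odd split of the Horner recurrence: coefficients are stored highest degree first, two FMA chains advance in powers of z², and one final fma in z recombines them, halving the serial dependency depth. A scalar sketch of the same recurrence (illustrative only, not the library routine):

```cpp
#include <cstdio>
#include <vector>

// Evaluate c[0]*z^(nc-1) + ... + c[nc-1] with the same even/odd split as the
// SIMD kernel: two Horner chains advance in z*z, then one fma in z recombines.
double horner_even_odd(const std::vector<double> &c, double z) {
  const int nc            = (int)c.size();
  const int if_odd_degree = (nc + 1) % 2; // 1 iff top coefficient starts the odd chain
  const double z2         = z * z;
  double k_odd  = if_odd_degree ? c[0] : 0.0;
  double k_even = c[if_odd_degree];
  for (int j = 1 + if_odd_degree; j < nc; j += 2) {
    k_odd  = k_odd * z2 + c[j];      // odd-power chain
    k_even = k_even * z2 + c[j + 1]; // even-power chain
  }
  return k_odd * z + k_even;
}

int main() {
  // 2z^3 - z^2 + 3z + 5 at z = 0.5 gives 6.5
  std::printf("%g\n", horner_even_odd({2, -1, 3, 5}, 0.5));
}
```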
@@ -849,76 +782,68 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
 
     // use kernel symmetry trick if w > simd_size
     if constexpr (use_ker_sym) {
-      static constexpr uint8_t tail = w % simd_size;
-      static constexpr uint8_t if_odd_degree = ((nc+1) % 2);
+      static constexpr uint8_t tail          = w % simd_size;
+      static constexpr uint8_t if_odd_degree = ((nc + 1) % 2);
       static const simd_type zerov(0.0);
       const simd_type zv(z);
       const simd_type z2v = zv * zv;
 
       // no xsimd::select needed if tail is zero
       if constexpr (tail) {
-        // some xsimd constants for shuffle
-        // static constexpr auto reverse_batch_head = xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t, reverse_index<tail>>();
-        // static constexpr auto reverse_batch_tail = xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t, reverse_index_tail<tail>>();
-        // static constexpr auto select_batch = xsimd::make_batch_bool_constant<typename simd_type::value_type, arch_t, select_index<tail>>();
-        static constexpr auto shuffle_batch = xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t, shuffle_index<tail>>();
+        // some xsimd constant for shuffle
+        static constexpr auto shuffle_batch =
+            xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t,
+                                       shuffle_index<tail>>();
 
         // process simd vecs
         simd_type k_odd, k_even, k_prev, k_sym = zerov;
-        for (uint8_t i = 0, offset = w - tail; i < (w+1)/2; i += simd_size, offset -= simd_size) {
-          k_odd = if_odd_degree ? simd_type::load_aligned(padded_coeffs[0].data() + i) : zerov;
+        for (uint8_t i = 0, offset = w - tail; i < (w + 1) / 2;
+             i += simd_size, offset -= simd_size) {
+          k_odd  = if_odd_degree ? simd_type::load_aligned(padded_coeffs[0].data() + i)
+                                 : zerov;
           k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
-          for (uint8_t j = 1+if_odd_degree; j < nc; j += 2) {
-            const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i);
-            k_odd = xsimd::fma(k_odd, z2v, cji_odd);
-            const auto cji_even = simd_type::load_aligned(padded_coeffs[j+1].data() + i);
-            k_even = xsimd::fma(k_even, z2v, cji_even);
+          for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) {
+            const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i);
+            k_odd              = xsimd::fma(k_odd, z2v, cji_odd);
+            const auto cji_even =
+                simd_type::load_aligned(padded_coeffs[j + 1].data() + i);
+            k_even = xsimd::fma(k_even, z2v, cji_even);
           }
           xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);
-          if (offset >= (w+1)/2) {
+          if (offset >= (w + 1) / 2) {
             k_prev = k_sym;
-            k_sym = xsimd::fma(k_odd, -zv, k_even);
+            k_sym  = xsimd::fma(k_odd, -zv, k_even);
             xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset);
-            /*
-            // the following is the equivalent code for the shuffle operation to avoid one swizzle in the first iteration
-            // seems not helping the performance
-            if (i==0) {
-              // save one xsimd::swizzle for the first iteration (k_prev is zerov)
-              // by assumption, ker is padded to be multiple of simd_size
-              // the padded part must be zero because in spread_subproblem_*d_kernel, trg has out of bound writes.
-              xsimd::select(select_batch, xsimd::swizzle(k_sym, reverse_batch_head), zerov).store_aligned(ker + offset);
-            }
-            else {
-              // xsimd::select of two xsimd::swizzle is the xsimd::shuffle for the general shuffle case
-              // xsimd::select(select_batch, xsimd::swizzle(k_sym, reverse_batch_head), xsimd::swizzle(k_prev, reverse_batch_tail)).store_aligned(ker + offset);
-              xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset);
-            }
-            */
           }
         }
-      }
-      else {
+      } else {
         // xsimd constants for reverse
-        static constexpr auto reverse_batch = xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t, reverse_index<simd_size>>();
+        static constexpr auto reverse_batch =
+            xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t,
+                                       reverse_index<simd_size>>();
 
         // process simd vecs
-        for (uint8_t i = 0, offset = w - simd_size; i < w/2; i += simd_size, offset -= simd_size) {
-          auto k_odd = if_odd_degree ? simd_type::load_aligned(padded_coeffs[0].data() + i) : zerov;
+        for (uint8_t i = 0, offset = w - simd_size; i < w / 2;
+             i += simd_size, offset -= simd_size) {
+          auto k_odd  = if_odd_degree
+                            ? simd_type::load_aligned(padded_coeffs[0].data() + i)
+                            : zerov;
           auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
-          for (uint8_t j = 1+if_odd_degree; j < nc; j += 2) {
-            const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i);
-            k_odd = xsimd::fma(k_odd, z2v, cji_odd);
-            const auto cji_even = simd_type::load_aligned(padded_coeffs[j+1].data() + i);
-            k_even = xsimd::fma(k_even, z2v, cji_even);
+          for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) {
+            const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i);
+            k_odd              = xsimd::fma(k_odd, z2v, cji_odd);
+            const auto cji_even =
+                simd_type::load_aligned(padded_coeffs[j + 1].data() + i);
+            k_even = xsimd::fma(k_even, z2v, cji_even);
          }
           xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);
-          if (offset >= w/2) {
-            xsimd::swizzle(xsimd::fma(k_odd, -zv, k_even), reverse_batch).store_aligned(ker + offset);
+          if (offset >= w / 2) {
+            xsimd::swizzle(xsimd::fma(k_odd, -zv, k_even), reverse_batch)
+                .store_aligned(ker + offset);
           }
         }
       }
-    }
-    else {
+    } else {
       const simd_type zv(z);
 
       for (uint8_t i = 0; i < w; i += simd_size) {
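
The use_ker_sym branch above runs Horner only on the left half of the kernel: with the odd/even split, fma(k_odd, -zv, k_even) is the same polynomial evaluated at -z, and the coefficient tables are built so that point w-1-i at offset z matches point i at -z. The mirrored batch just has to be stored in reversed lane order (swizzle with reverse_index when w is a multiple of simd_size; shuffle with shuffle_index to stitch two batches when there is a tail). A scalar model of the aligned case, with a hypothetical per-lane polynomial f in place of the ES kernel:

```cpp
#include <array>
#include <cstdio>

// Scalar model of the aligned (tail == 0) symmetry trick. f(i, z) stands in
// for the per-point Horner polynomial; the real coefficient tables satisfy
// f(w - 1 - i, z) == f(i, -z), which is what makes the mirrored store valid.
constexpr int w = 8, simd_size = 4;
double f(int i, double z) { return (i + 1) * z; } // hypothetical, odd in z

int main() {
  const double z = 0.25;
  std::array<double, w> ker{};
  for (int i = 0, offset = w - simd_size; i < w / 2;
       i += simd_size, offset -= simd_size) {
    double batch[simd_size], sym[simd_size];
    for (int lane = 0; lane < simd_size; ++lane) {
      batch[lane] = f(i + lane, z);  // left half, forward order
      sym[lane]   = f(i + lane, -z); // mirrored values for the right half
    }
    for (int lane = 0; lane < simd_size; ++lane) ker[i + lane] = batch[lane];
    if (offset >= w / 2) // reversed store = swizzle(..., reverse_batch) above
      for (int lane = 0; lane < simd_size; ++lane)
        ker[offset + lane] = sym[simd_size - 1 - lane];
  }
  for (double v : ker) std::printf("%g ", v); // mirror-symmetric halves
  std::printf("\n");
}
```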
@@ -2080,26 +2005,12 @@ struct zip_hi {
     return (size + index) / 2;
   }
 };
-template<unsigned cap>
-struct reverse_index {
+template<unsigned cap> struct reverse_index {
   static constexpr unsigned get(unsigned index, const unsigned size) {
     return index < cap ? (cap - 1 - index) : index;
   }
 };
-template<unsigned cap>
-struct select_index {
-  static constexpr bool get(unsigned index, const unsigned size) {
-    return index < cap ? 1 : 0;
-  }
-};
-template<unsigned cap>
-struct reverse_index_tail {
-  static constexpr unsigned get(unsigned index, const unsigned size) {
-    return index < cap ? index : size + cap - 1 - index;
-  }
-};
-template<unsigned cap>
-struct shuffle_index {
+template<unsigned cap> struct shuffle_index {
   static constexpr unsigned get(unsigned index, const unsigned size) {
     return index < cap ? (cap - 1 - index) : size + size + cap - 1 - index;
   }
 };
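
As a worked example of these index generators: reverse_index<simd_size> simply reverses one full batch, while for simd_size 8 and tail 5, shuffle_index<5> yields the mask 4 3 2 1 0 15 14 13. Assuming xsimd::shuffle's usual two-vector convention (indices below size pick from the first operand, the rest from the second at index - size), that writes the reversed first five lanes of k_sym followed by the reversed last three lanes of k_prev, exactly the stitched mirrored store the tail branch needs. A self-contained scalar check:

```cpp
#include <cstdio>

// shuffle_index copied from the patch; only the driver below is new.
template<unsigned cap> struct shuffle_index {
  static constexpr unsigned get(unsigned index, const unsigned size) {
    return index < cap ? (cap - 1 - index) : size + size + cap - 1 - index;
  }
};

int main() {
  constexpr unsigned size = 8, tail = 5; // e.g. w = 13 with 8-wide batches
  double a[size], b[size], out[size];    // a plays k_sym, b plays k_prev
  for (unsigned i = 0; i < size; ++i) { a[i] = i; b[i] = 10 + i; }
  for (unsigned i = 0; i < size; ++i) {
    const unsigned idx = shuffle_index<tail>::get(i, size);
    out[i] = idx < size ? a[idx] : b[idx - size]; // assumed shuffle convention
  }
  for (unsigned i = 0; i < size; ++i) std::printf("%g ", out[i]);
  std::printf("\n"); // prints: 4 3 2 1 0 17 16 15
}
```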