@@ -24,6 +24,8 @@ namespace { // anonymous namespace for internal structs equivalent to declaring
2424 // static
2525struct zip_low ;
2626struct zip_hi ;
27+ template <unsigned cap> struct reverse_index ;
28+ template <unsigned cap> struct shuffle_index ;
2729struct select_even ;
2830struct select_odd ;
2931// forward declaration to clean up the code and be able to use this everywhere in the file
@@ -777,23 +779,80 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
777779 const FLT z = std::fma (FLT (2.0 ), x, FLT (w - 1 )); // scale so local grid offset z in
778780 // [-1,1]
779781 if (opts.upsampfac == 2.0 ) { // floating point equality is fine here
780- static constexpr auto alignment = simd_type::arch_type::alignment ();
782+ using arch_t = typename simd_type::arch_type;
783+ static constexpr auto alignment = arch_t::alignment ();
781784 static constexpr auto simd_size = simd_type::size;
782785 static constexpr auto padded_ns = (w + simd_size - 1 ) & ~(simd_size - 1 );
783786 static constexpr auto nc = nc200<w>();
784787 static constexpr auto horner_coeffs = get_horner_coeffs_200<FLT, w>();
788+ static constexpr auto use_ker_sym = (simd_size < w);
785789
786790 alignas (alignment) static constexpr auto padded_coeffs =
787791 pad_2D_array_with_zeros<FLT, nc, w, padded_ns>(horner_coeffs);
788792
789- const simd_type zv (z);
790- for (uint8_t i = 0 ; i < w; i += simd_size) {
791- auto k = simd_type::load_aligned (padded_coeffs[0 ].data () + i);
792- for (uint8_t j = 1 ; j < nc; ++j) {
793- const auto cji = simd_type::load_aligned (padded_coeffs[j].data () + i);
794- k = xsimd::fma (k, zv, cji);
793+ // use kernel symmetry trick if w > simd_size
794+ if constexpr (use_ker_sym) {
795+ static constexpr uint8_t tail = w % simd_size;
796+ static constexpr uint8_t if_odd_degree = ((nc + 1 ) % 2 );
797+ static constexpr uint8_t offset_start = tail ? w - tail : w - simd_size;
798+ static constexpr uint8_t end_idx = (w + (tail > 0 )) / 2 ;
799+ const simd_type zv{z};
800+ const auto z2v = zv * zv;
801+
802+ // some xsimd constant for shuffle or inverse
803+ static constexpr auto shuffle_batch = []() constexpr noexcept {
804+ if constexpr (tail) {
805+ return xsimd::make_batch_constant<xsimd::as_unsigned_integer_t <FLT>, arch_t ,
806+ shuffle_index<tail>>();
807+ } else {
808+ return xsimd::make_batch_constant<xsimd::as_unsigned_integer_t <FLT>, arch_t ,
809+ reverse_index<simd_size>>();
810+ }
811+ }();
812+
813+ // process simd vecs
814+ simd_type k_prev, k_sym{0 };
815+ for (uint8_t i{0 }, offset = offset_start; i < end_idx;
816+ i += simd_size, offset -= simd_size) {
817+ auto k_odd = [i]() constexpr noexcept {
818+ if constexpr (if_odd_degree) {
819+ return simd_type::load_aligned (padded_coeffs[0 ].data () + i);
820+ } else {
821+ return simd_type{0 };
822+ }
823+ }();
824+ auto k_even = simd_type::load_aligned (padded_coeffs[if_odd_degree].data () + i);
825+ for (uint8_t j{1 + if_odd_degree}; j < nc; j += 2 ) {
826+ const auto cji_odd = simd_type::load_aligned (padded_coeffs[j].data () + i);
827+ const auto cji_even = simd_type::load_aligned (padded_coeffs[j + 1 ].data () + i);
828+ k_odd = xsimd::fma (k_odd, z2v, cji_odd);
829+ k_even = xsimd::fma (k_even, z2v, cji_even);
830+ }
831+ // left part
832+ xsimd::fma (k_odd, zv, k_even).store_aligned (ker + i);
833+ // right part symmetric to the left part
834+ if (offset >= end_idx) {
835+ if constexpr (tail) {
836+ // to use aligned store, we need shuffle the previous k_sym and current k_sym
837+ k_prev = k_sym;
838+ k_sym = xsimd::fnma (k_odd, zv, k_even);
839+ xsimd::shuffle (k_sym, k_prev, shuffle_batch).store_aligned (ker + offset);
840+ } else {
841+ xsimd::swizzle (xsimd::fnma (k_odd, zv, k_even), shuffle_batch)
842+ .store_aligned (ker + offset);
843+ }
844+ }
845+ }
846+ } else {
847+ const simd_type zv (z);
848+ for (uint8_t i = 0 ; i < w; i += simd_size) {
849+ auto k = simd_type::load_aligned (padded_coeffs[0 ].data () + i);
850+ for (uint8_t j = 1 ; j < nc; ++j) {
851+ const auto cji = simd_type::load_aligned (padded_coeffs[j].data () + i);
852+ k = xsimd::fma (k, zv, cji);
853+ }
854+ k.store_aligned (ker + i);
795855 }
796- k.store_aligned (ker + i);
797856 }
798857 return ;
799858 }
@@ -2168,6 +2227,16 @@ struct zip_hi {
21682227 return (size + index) / 2 ;
21692228 }
21702229};
template<unsigned cap> struct reverse_index {
  // Index generator for xsimd::make_batch_constant: reverses the first `cap`
  // lanes of a batch (index -> cap-1-index) and leaves any lane at or beyond
  // `cap` in place. Used with xsimd::swizzle to mirror the left half of the
  // kernel onto the right half when the width is a multiple of simd_size.
  //
  // The batch-size argument is unused here; keep it anonymous to match the
  // select_even/select_odd generators in this file and to avoid
  // -Wunused-parameter noise.
  static constexpr unsigned get(unsigned index, unsigned /* size */) {
    return index < cap ? (cap - 1 - index) : index;
  }
};
template<unsigned cap> struct shuffle_index {
  // Index generator for xsimd::make_batch_constant, intended for the
  // two-operand xsimd::shuffle(a, b, mask): lanes below `cap` read batch `a`
  // reversed (cap-1-i), while the remaining lanes read batch `b` reversed from
  // its top end. NOTE(review): relies on xsimd's convention that shuffle
  // indices in [size, 2*size) select from the second operand — confirm
  // against the xsimd shuffle documentation.
  static constexpr unsigned get(unsigned i, const unsigned n) {
    if (i < cap) return cap - 1 - i;
    return 2 * n + cap - 1 - i;
  }
};
21712240
21722241struct select_even {
21732242 static constexpr unsigned get (unsigned index, unsigned /* size*/ ) { return index * 2 ; }
0 commit comments