@@ -794,67 +794,51 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
   if constexpr (use_ker_sym) {
     static constexpr uint8_t tail = w % simd_size;
     static constexpr uint8_t if_odd_degree = ((nc + 1) % 2);
-    static const simd_type zerov(0.0);
+    static constexpr uint8_t offset_start = tail ? w - tail : w - simd_size;
+    static constexpr uint8_t end_idx = (w + (tail > 0)) / 2;
     const simd_type zv(z);
     const simd_type z2v = zv * zv;
 
-    // no xsimd::shuffle needed if tail is zero
-    if constexpr (tail) {
-      // some xsimd constant for shuffle
-      static constexpr auto shuffle_batch =
-          xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t,
-                                     shuffle_index<tail>>();
-
-      // process simd vecs
-      simd_type k_odd, k_even, k_prev, k_sym = zerov;
-      for (uint8_t i = 0, offset = w - tail; i < (w + 1) / 2;
-           i += simd_size, offset -= simd_size) {
-        k_odd = if_odd_degree ? simd_type::load_aligned(padded_coeffs[0].data() + i)
-                              : zerov;
-        k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
-        for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) {
-          const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i);
-          k_odd = xsimd::fma(k_odd, z2v, cji_odd);
-          const auto cji_even =
-              simd_type::load_aligned(padded_coeffs[j + 1].data() + i);
-          k_even = xsimd::fma(k_even, z2v, cji_even);
+    // some xsimd constant for shuffle or reverse
+    static constexpr auto shuffle_batch = []() constexpr noexcept {
+      if constexpr (tail) {
+        return xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t,
+                                          shuffle_index<tail>>();
+      } else {
+        return xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t,
+                                          reverse_index<simd_size>>();
+      }
+    }();
+
+    // process simd vecs
+    simd_type k_odd, k_even, k_prev, k_sym{0};
+    for (uint8_t i = 0, offset = offset_start; i < end_idx;
+         i += simd_size, offset -= simd_size) {
+      k_odd = [i]() constexpr noexcept {
+        if constexpr (if_odd_degree) {
+          return simd_type::load_aligned(padded_coeffs[0].data() + i);
+        } else {
+          return simd_type{0};
         }
-        // left part
-        xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);
-        // right part symmetric to the left part
-        if (offset >= (w + 1) / 2) {
+      }();
+      k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
+      for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) {
+        const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i);
+        k_odd = xsimd::fma(k_odd, z2v, cji_odd);
+        const auto cji_even = simd_type::load_aligned(padded_coeffs[j + 1].data() + i);
+        k_even = xsimd::fma(k_even, z2v, cji_even);
+      }
+      // left part
+      xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);
+      // right part symmetric to the left part
+      if (offset >= end_idx) {
+        if constexpr (tail) {
           // to use an aligned store, we need to shuffle the previous k_sym and the current k_sym
           k_prev = k_sym;
           k_sym = xsimd::fnma(k_odd, zv, k_even);
           xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset);
-        }
-      }
-    } else {
-      // xsimd constants for reverse
-      static constexpr auto reverse_batch =
-          xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t,
-                                     reverse_index<simd_size>>();
-
-      // process simd vecs
-      for (uint8_t i = 0, offset = w - simd_size; i < w / 2;
-           i += simd_size, offset -= simd_size) {
-        auto k_odd = if_odd_degree
-                         ? simd_type::load_aligned(padded_coeffs[0].data() + i)
-                         : zerov;
-        auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
-        for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) {
-          const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i);
-          k_odd = xsimd::fma(k_odd, z2v, cji_odd);
-          const auto cji_even =
-              simd_type::load_aligned(padded_coeffs[j + 1].data() + i);
-          k_even = xsimd::fma(k_even, z2v, cji_even);
-        }
-        // left part
-        xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);
-        // right part symmetric to the left part
-        if (offset >= w / 2) {
-          // reverse the order for symmetric part
-          xsimd::swizzle(xsimd::fnma(k_odd, zv, k_even), reverse_batch)
+        } else {
+          xsimd::swizzle(xsimd::fnma(k_odd, zv, k_even), shuffle_batch)
              .store_aligned(ker + offset);
         }
       }
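
What the unified loop computes, lane by lane: the kernel polynomial for each node is split into even-power and odd-power halves, each evaluated by Horner's rule in z^2, so the left-half store is p(z) = k_odd*z + k_even (the fma) while the mirrored right-half store reuses the same two accumulators as p(-z) = k_even - k_odd*z (the fnma), which is what kernel symmetry buys. Below is a minimal scalar sketch of that recurrence, not the library code; it assumes only the coefficient layout visible above (c[0] holds the highest-degree coefficient, c[nc-1] the constant term).

#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar model of the even/odd Horner split used in the SIMD loop above.
void horner_sym(const std::vector<double> &c, double z, double &left, double &right) {
  const std::uint8_t nc = static_cast<std::uint8_t>(c.size());
  const std::uint8_t if_odd_degree = (nc + 1) % 2; // 1 iff degree nc-1 is odd
  const double z2 = z * z;
  double k_odd = if_odd_degree ? c[0] : 0.0; // accumulates odd-power terms
  double k_even = c[if_odd_degree];          // accumulates even-power terms
  for (std::uint8_t j = 1 + if_odd_degree; j < nc; j += 2) {
    k_odd = k_odd * z2 + c[j];       // the xsimd::fma(k_odd, z2v, cji_odd)
    k_even = k_even * z2 + c[j + 1]; // the xsimd::fma(k_even, z2v, cji_even)
  }
  left = k_odd * z + k_even;  // p(z):  left-half store
  right = k_even - k_odd * z; // p(-z): mirrored right-half store (the fnma)
}

int main() {
  const std::vector<double> c{2.0, -3.0, 5.0, 7.0}; // p(z) = 2z^3 - 3z^2 + 5z + 7
  double l, r;
  horner_sym(c, 0.5, l, r);
  std::printf("p(0.5) = %g, p(-0.5) = %g\n", l, r); // prints 9 and 3.5
  return 0;
}

The mirrored half thus costs one multiply-subtract per lane instead of a second full Horner chain.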
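On the refactor device itself: the old tail/no-tail duplication collapses because the only compile-time difference, the batch constant, is now picked by an immediately invoked constexpr lambda. A plain ternary could not do this, since shuffle_index<tail> and reverse_index<simd_size> make the two make_batch_constant results different types; the same trick replaces the zerov ternary for k_odd, letting the dead branch vanish at compile time. A self-contained sketch of the pattern follows (make_index_pattern and the std::array stand-ins are hypothetical, not from the codebase):

#include <array>
#include <cstdio>

// Pick a differently-typed compile-time constant per template parameter,
// mirroring how shuffle_batch is chosen above.
template <int tail> constexpr auto make_index_pattern() {
  return []() constexpr {
    if constexpr (tail != 0) {
      return std::array<int, 2>{tail - 1, tail}; // stand-in for shuffle_index<tail>
    } else {
      return std::array<int, 4>{3, 2, 1, 0};     // stand-in for reverse_index<simd_size>
    }
  }(); // immediately invoked, as with shuffle_batch in the diff
}

int main() {
  constexpr auto rev = make_index_pattern<0>(); // gets the 4-lane reverse pattern
  constexpr auto shf = make_index_pattern<2>(); // gets the 2-lane shuffle pattern
  std::printf("sizes: %zu %zu\n", rev.size(), shf.size());
  return 0;
}

Note that the merged constant keeps the name shuffle_batch even when it holds the reverse pattern, which is why the no-tail branch now passes shuffle_batch to xsimd::swizzle.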