@@ -1089,22 +1089,37 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
10891089 // using lambda to limit the scope of the temporary variables
10901090 const auto res = [ker1](const auto &line) constexpr noexcept {
10911091 // apply x kernel to the (interleaved) line and add together
1092- simd_type res_low{0 }, res_hi{0 };
1093- for ( uint8_t i = 0 ; i < (line_vectors & ~ 1 ); // NOLINT(*-too-small-loop-variable)
1094- i += 2 ) {
1095- const auto ker1_v = simd_type::load_aligned (ker1 + i * simd_size / 2 );
1092+ simd_type res_low{}, res_hi{};
1093+ if constexpr (line_vectors > 1 ) {
1094+ // Peel the first iteration: a plain multiply initializes both accumulators, so no fma against a zero vector is needed
1095+ const auto ker1_v = simd_type::load_aligned (ker1);
10961096 const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
10971097 const auto ker1hi = xsimd::swizzle (ker1_v, zip_hi_index<arch_t >);
1098- res_low = xsimd::fma (ker1low, line[i], res_low);
1099- res_hi = xsimd::fma (ker1hi, line[i + 1 ], res_hi);
1098+ res_low = ker1low * line[0 ];
1099+ res_hi = ker1hi * line[1 ];
1100+ }
1101+ if constexpr (line_vectors > 3 ) {
1102+ // Accumulate the remaining vector pairs, resuming at i = 2 because the first pair was handled above
1103+ for (uint8_t i = 2 ; i < (line_vectors & ~1 ); // NOLINT(*-too-small-loop-variable)
1104+ i += 2 ) {
1105+ const auto ker1_v = simd_type::load_aligned (ker1 + i * simd_size / 2 );
1106+ const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
1107+ const auto ker1hi = xsimd::swizzle (ker1_v, zip_hi_index<arch_t >);
1108+ res_low = xsimd::fma (ker1low, line[i], res_low);
1109+ res_hi = xsimd::fma (ker1hi, line[i + 1 ], res_hi);
1110+ }
11001111 }
11011112 if constexpr (line_vectors % 2 ) {
11021113 const auto ker1_v =
11031114 simd_type::load_aligned (ker1 + (line_vectors - 1 ) * simd_size / 2 );
11041115 const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
11051116 res_low = xsimd::fma (ker1low, line.back (), res_low);
11061117 }
1107- return res_low + res_hi;
1118+ if constexpr (line_vectors > 1 ) {
1119+ return res_low + res_hi;
1120+ } else {
1121+ return res_low;
1122+ }
11081123 }(line);
11091124 alignas (alignment) std::array<FLT, simd_size> res_array{};
11101125 res.store_aligned (res_array.data ());
@@ -1261,22 +1276,39 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
12611276 // apply x kernel to the (interleaved) line and add together
12621277 const auto res_array = [ker1](const auto &line) constexpr noexcept {
12631278 const auto res = [ker1](const auto &line) constexpr noexcept {
1264- simd_type res_low{0 }, res_hi{0 };
1265- for (uint8_t i{0 }; i < (line_vectors & ~1 ); // NOLINT(*-too-small-loop-variable)
1266- i += 2 ) {
1267- const auto ker1_v = simd_type::load_aligned (i * simd_size / 2 + ker1);
1279+ // apply x kernel to the (interleaved) line and add together
1280+ simd_type res_low{}, res_hi{};
1281+ if constexpr (line_vectors > 1 ) {
1282+ // Peel the first iteration: a plain multiply initializes both accumulators, so no fma against a zero vector is needed
1283+ const auto ker1_v = simd_type::load_aligned (ker1);
12681284 const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
12691285 const auto ker1hi = xsimd::swizzle (ker1_v, zip_hi_index<arch_t >);
1270- res_low = xsimd::fma (ker1low, line[i], res_low);
1271- res_hi = xsimd::fma (ker1hi, line[i + 1 ], res_hi);
1286+ res_low = ker1low * line[0 ];
1287+ res_hi = ker1hi * line[1 ];
1288+ }
1289+ if constexpr (line_vectors > 3 ) {
1290+ // Accumulate the remaining vector pairs, resuming at i = 2 because the first pair was handled above
1291+ for (uint8_t i = 2 ;
1292+ i < (line_vectors & ~1 ); // NOLINT(*-too-small-loop-variable)
1293+ i += 2 ) {
1294+ const auto ker1_v = simd_type::load_aligned (ker1 + i * simd_size / 2 );
1295+ const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
1296+ const auto ker1hi = xsimd::swizzle (ker1_v, zip_hi_index<arch_t >);
1297+ res_low = xsimd::fma (ker1low, line[i], res_low);
1298+ res_hi = xsimd::fma (ker1hi, line[i + 1 ], res_hi);
1299+ }
12721300 }
12731301 if constexpr (line_vectors % 2 ) {
12741302 const auto ker1_v =
1275- simd_type::load_aligned ((line_vectors - 1 ) * simd_size / 2 + ker1 );
1303+ simd_type::load_aligned (ker1 + (line_vectors - 1 ) * simd_size / 2 );
12761304 const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
12771305 res_low = xsimd::fma (ker1low, line.back (), res_low);
12781306 }
1279- return res_low + res_hi;
1307+ if constexpr (line_vectors > 1 ) {
1308+ return res_low + res_hi;
1309+ } else {
1310+ return res_low;
1311+ }
12801312 }(line);
12811313 alignas (alignment) std::array<FLT, simd_size> res_array{};
12821314 res.store_aligned (res_array.data ());
0 commit comments