Skip to content

Commit 52150f0

Browse files
committed
minor optimisation
1 parent 92b4def commit 52150f0

File tree

1 file changed

+47
-15
lines changed

1 file changed

+47
-15
lines changed

src/spreadinterp.cpp

Lines changed: 47 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1089,22 +1089,37 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
10891089
// using lambda to limit the scope of the temporary variables
10901090
const auto res = [ker1](const auto &line) constexpr noexcept {
10911091
// apply x kernel to the (interleaved) line and add together
1092-
simd_type res_low{0}, res_hi{0};
1093-
for (uint8_t i = 0; i < (line_vectors & ~1); // NOLINT(*-too-small-loop-variable)
1094-
i += 2) {
1095-
const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2);
1092+
simd_type res_low{}, res_hi{};
1093+
if constexpr (line_vectors > 1) {
1094+
// Manually write out the first iteration
1095+
const auto ker1_v = simd_type::load_aligned(ker1);
10961096
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
10971097
const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t>);
1098-
res_low = xsimd::fma(ker1low, line[i], res_low);
1099-
res_hi = xsimd::fma(ker1hi, line[i + 1], res_hi);
1098+
res_low = ker1low * line[0];
1099+
res_hi = ker1hi * line[1];
1100+
}
1101+
if constexpr (line_vectors > 3) {
1102+
// Start the loop from the second iteration
1103+
for (uint8_t i = 2; i < (line_vectors & ~1); // NOLINT(*-too-small-loop-variable)
1104+
i += 2) {
1105+
const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2);
1106+
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
1107+
const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t>);
1108+
res_low = xsimd::fma(ker1low, line[i], res_low);
1109+
res_hi = xsimd::fma(ker1hi, line[i + 1], res_hi);
1110+
}
11001111
}
11011112
if constexpr (line_vectors % 2) {
11021113
const auto ker1_v =
11031114
simd_type::load_aligned(ker1 + (line_vectors - 1) * simd_size / 2);
11041115
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
11051116
res_low = xsimd::fma(ker1low, line.back(), res_low);
11061117
}
1107-
return res_low + res_hi;
1118+
if constexpr (line_vectors > 1) {
1119+
return res_low + res_hi;
1120+
} else {
1121+
return res_low;
1122+
}
11081123
}(line);
11091124
alignas(alignment) std::array<FLT, simd_size> res_array{};
11101125
res.store_aligned(res_array.data());
@@ -1261,22 +1276,39 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
12611276
// apply x kernel to the (interleaved) line and add together
12621277
const auto res_array = [ker1](const auto &line) constexpr noexcept {
12631278
const auto res = [ker1](const auto &line) constexpr noexcept {
1264-
simd_type res_low{0}, res_hi{0};
1265-
for (uint8_t i{0}; i < (line_vectors & ~1); // NOLINT(*-too-small-loop-variable)
1266-
i += 2) {
1267-
const auto ker1_v = simd_type::load_aligned(i * simd_size / 2 + ker1);
1279+
// apply x kernel to the (interleaved) line and add together
1280+
simd_type res_low{}, res_hi{};
1281+
if constexpr (line_vectors > 1) {
1282+
// Manually write out the first iteration
1283+
const auto ker1_v = simd_type::load_aligned(ker1);
12681284
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
12691285
const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t>);
1270-
res_low = xsimd::fma(ker1low, line[i], res_low);
1271-
res_hi = xsimd::fma(ker1hi, line[i + 1], res_hi);
1286+
res_low = ker1low * line[0];
1287+
res_hi = ker1hi * line[1];
1288+
}
1289+
if constexpr (line_vectors > 3) {
1290+
// Start the loop from the second iteration
1291+
for (uint8_t i = 2;
1292+
i < (line_vectors & ~1); // NOLINT(*-too-small-loop-variable)
1293+
i += 2) {
1294+
const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2);
1295+
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
1296+
const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t>);
1297+
res_low = xsimd::fma(ker1low, line[i], res_low);
1298+
res_hi = xsimd::fma(ker1hi, line[i + 1], res_hi);
1299+
}
12721300
}
12731301
if constexpr (line_vectors % 2) {
12741302
const auto ker1_v =
1275-
simd_type::load_aligned((line_vectors - 1) * simd_size / 2 + ker1);
1303+
simd_type::load_aligned(ker1 + (line_vectors - 1) * simd_size / 2);
12761304
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
12771305
res_low = xsimd::fma(ker1low, line.back(), res_low);
12781306
}
1279-
return res_low + res_hi;
1307+
if constexpr (line_vectors > 1) {
1308+
return res_low + res_hi;
1309+
} else {
1310+
return res_low;
1311+
}
12801312
}(line);
12811313
alignas(alignment) std::array<FLT, simd_size> res_array{};
12821314
res.store_aligned(res_array.data());

0 commit comments

Comments
 (0)