Skip to content

Commit 9176518

Browse files
committed
reverted extra vectorization
1 parent d2cc988 commit 9176518

File tree

1 file changed

+80
-182
lines changed

1 file changed

+80
-182
lines changed

src/spreadinterp.cpp

Lines changed: 80 additions & 182 deletions
Original file line numberDiff line numberDiff line change
@@ -800,9 +800,8 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
800800
}
801801

802802
template<uint8_t ns>
803-
FINUFFT_NEVER_INLINE static void interp_line_wrap(FLT *FINUFFT_RESTRICT target,
804-
const FLT *du, const FLT *ker,
805-
const BIGINT i1, const UBIGINT N1) {
803+
static void interp_line_wrap(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker,
804+
const BIGINT i1, const UBIGINT N1) {
806805
/* This function is called when the kernel wraps around the grid. It is
807806
slower than interp_line.
808807
M. Barbone July 2024: - moved the logic to a separate function
@@ -879,22 +878,10 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker,
879878
} else { // doesn't wrap
880879
// logic largely similar to spread 1D kernel, please see the explanation there
881880
// for the first part of this code
882-
const auto du_ptr = du + 2 * j;
883-
simd_type res_low{}, res_hi{};
884-
// Handle the first iteration only if regular_part is greater than 0
885-
// unrolling the first iteration to avoid initializing the variables to 0
886-
if constexpr (regular_part > 0) {
887-
const auto ker_v = simd_type::load_aligned(ker);
888-
const auto du_pt0 = simd_type::load_unaligned(du_ptr);
889-
const auto du_pt1 = simd_type::load_unaligned(du_ptr + simd_size);
890-
const auto ker0low = xsimd::swizzle(ker_v, zip_low_index<arch_t>);
891-
const auto ker0hi = xsimd::swizzle(ker_v, zip_hi_index<arch_t>);
892-
res_low = ker0low * du_pt0;
893-
res_hi = ker0hi * du_pt1;
894-
}
895-
if constexpr (regular_part > 3 * simd_size) {
896-
// Now loop over the remaining elements
897-
for (uint8_t dx = 2 * simd_size; dx < regular_part; dx += 2 * simd_size) {
881+
const auto res = [du, j, ker]() constexpr noexcept {
882+
const auto du_ptr = du + 2 * j;
883+
simd_type res_low{0}, res_hi{0};
884+
for (uint8_t dx{0}; dx < regular_part; dx += 2 * simd_size) {
898885
const auto ker_v = simd_type::load_aligned(ker + dx / 2);
899886
const auto du_pt0 = simd_type::load_unaligned(du_ptr + dx);
900887
const auto du_pt1 = simd_type::load_unaligned(du_ptr + dx + simd_size);
@@ -903,19 +890,24 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker,
903890
res_low = xsimd::fma(ker0low, du_pt0, res_low);
904891
res_hi = xsimd::fma(ker0hi, du_pt1, res_hi);
905892
}
906-
}
907-
if constexpr (regular_part < 2 * ns) {
908-
const auto ker0 = simd_type::load_unaligned(ker + (regular_part / 2));
909-
const auto du_pt = simd_type::load_unaligned(du_ptr + regular_part);
910-
const auto ker0low = xsimd::swizzle(ker0, zip_low_index<arch_t>);
911-
if constexpr (regular_part > 0) {
912-
res_low = xsimd::fma(ker0low, du_pt, res_low);
913-
} else {
914-
// handle the case where res_hi is unused: when regular_part is 0 it is un-initialized
915-
res_low = ker0low * du_pt;
893+
894+
if constexpr (regular_part < 2 * ns) {
895+
const auto ker0 = simd_type::load_unaligned(ker + (regular_part / 2));
896+
const auto du_pt = simd_type::load_unaligned(du_ptr + regular_part);
897+
const auto ker0low = xsimd::swizzle(ker0, zip_low_index<arch_t>);
898+
res_low = xsimd::fma(ker0low, du_pt, res_low);
916899
}
917-
}
918900

901+
// This does a horizontal sum using a loop instead of relying on SIMD instructions
902+
// this is faster than the code below but less elegant.
903+
// lambdas here to limit the scope of temporary variables and have the compiler
904+
// optimize the code better
905+
return res_low + res_hi;
906+
}();
907+
for (uint8_t i{0}; i < simd_size; i += 2) {
908+
out[0] += res.get(i);
909+
out[1] += res.get(i + 1);
910+
}
919911
// this is where the code differs from spread_kernel, the interpolator does an extra
920912
// reduction step to SIMD elements down to 2 elements
921913
// This is known as horizontal sum in SIMD terminology
@@ -928,36 +920,15 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker,
928920
// out[0] = xsimd::reduce_add(res_real);
929921
// out[1] = xsimd::reduce_add(res_imag);
930922
// clang-format on
931-
932-
// This does a horizontal sum using a loop instead of relying on SIMD instructions
933-
// this is faster than the above code but less elegant.
934-
// lambdas here to limit the scope of temporary variables and have the compiler
935-
// optimize the code better
936-
alignas(alignment) const auto res_array = [](const auto &res_low,
937-
const auto &res_hi) constexpr noexcept {
938-
alignas(alignment) std::array<FLT, simd_size> res_array{};
939-
if constexpr (regular_part > 0) {
940-
const auto res = res_low + res_hi;
941-
res.store_aligned(res_array.data());
942-
} else {
943-
// handle the case where res_hi is unused because now is not 0 is un-initialized
944-
res_low.store_aligned(res_array.data());
945-
}
946-
return res_array;
947-
}(res_low, res_hi);
948-
for (uint8_t i{0}; i < simd_size; i += 2) {
949-
out[0] += res_array[i];
950-
out[1] += res_array[i + 1];
951-
}
952923
}
953924
target[0] = out[0];
954925
target[1] = out[1];
955926
}
956927

957928
template<uint8_t ns, class simd_type>
958-
FINUFFT_NEVER_INLINE static void interp_square_wrap(
959-
FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, const FLT *ker2,
960-
const BIGINT i1, const BIGINT i2, const UBIGINT N1, const UBIGINT N2) {
929+
static void interp_square_wrap(FLT *FINUFFT_RESTRICT target, const FLT *du,
930+
const FLT *ker1, const FLT *ker2, const BIGINT i1,
931+
const BIGINT i2, const UBIGINT N1, const UBIGINT N2) {
961932
/*
962933
* This function is called when the kernel wraps around the grid. It is slower than
963934
* the non wrapping version.
@@ -969,16 +940,9 @@ FINUFFT_NEVER_INLINE static void interp_square_wrap(
969940
static constexpr auto alignment = arch_t::alignment();
970941
if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2) {
971942
// store a horiz line (interleaved real,imag)
972-
alignas(alignment) std::array<FLT, 2 * MAX_NSPREAD> line{};
973-
// block for first y line, to avoid explicitly initializing line with zeros
974-
{
975-
const auto l_ptr = du + 2 * (N1 * i2 + i1); // ptr to horiz line start in du
976-
for (uint8_t l{0}; l < 2 * ns; ++l) { // l is like dx but for ns interleaved
977-
line[l] = ker2[0] * l_ptr[l];
978-
}
979-
}
943+
alignas(alignment) std::array<FLT, 2 * ns> line{0};
980944
// add remaining const-y lines to the line (expensive inner loop)
981-
for (uint8_t dy{1}; dy < ns; ++dy) {
945+
for (uint8_t dy{0}; dy < ns; ++dy) {
982946
const auto *l_ptr = du + 2 * (N1 * (i2 + dy) + i1); // (see above)
983947
for (uint8_t l{0}; l < 2 * ns; ++l) {
984948
line[l] = std::fma(ker2[dy], l_ptr[l], line[l]);
@@ -1059,77 +1023,47 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
10591023
static constexpr uint8_t line_vectors = (2 * ns + padding) / simd_size;
10601024
if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2 &&
10611025
(i1 + ns + (padding + 1) / 2 < N1)) {
1062-
const auto line = [du, N1, i2, i1, ker2]() constexpr noexcept {
1026+
const auto line = [du, N1, i1 = UBIGINT(i1), i2 = UBIGINT(i2),
1027+
ker2]() constexpr noexcept {
10631028
// new array du_pts to store the du values for the current y line
1064-
std::array<simd_type, line_vectors> line{}, du_pts{};
1029+
std::array<simd_type, line_vectors> line{0};
10651030
// block for first y line, to avoid explicitly initializing line with zeros
1066-
const auto l_ptr_base = du + 2 * UBIGINT(N1 * i2 + i1); // ptr to horiz line start
1067-
// in du
1068-
for (uint8_t l{0}; l < line_vectors; ++l) {
1069-
du_pts[l] = simd_type::load_unaligned(l * simd_size + l_ptr_base);
1070-
}
1071-
for (uint8_t l{0}; l < line_vectors; ++l) {
1072-
// l is like dx but for ns interleaved
1073-
// no fancy trick needed to multiply real,imag by ker2
1074-
line[l] = du_pts[l] * simd_type{ker2[0]};
1075-
}
10761031
// add remaining const-y lines to the line (expensive inner loop)
1077-
for (uint8_t dy{1}; dy < ns; dy++) {
1078-
const auto l_ptr = l_ptr_base + 2 * (N1 * dy); // (see above)
1032+
for (uint8_t dy{0}; dy < ns; dy++) {
1033+
const auto l_ptr = du + 2 * (N1 * (i2 + dy) + i1); // (see above)
10791034
const simd_type ker2_v{ker2[dy]};
1080-
// vectorize over the fast axis of the du array
1081-
// First loop: Load all du_pt into the du_pts array
1082-
for (uint8_t l{0}; l < line_vectors; ++l) {
1083-
du_pts[l] = simd_type::load_unaligned(l * simd_size + l_ptr);
1084-
}
1085-
// Second loop: Perform the FMA operation
10861035
for (uint8_t l{0}; l < line_vectors; ++l) {
1087-
line[l] = xsimd::fma(ker2_v, du_pts[l], line[l]);
1036+
const auto du_pt = simd_type::load_unaligned(l * simd_size + l_ptr);
1037+
line[l] = xsimd::fma(ker2_v, du_pt, line[l]);
10881038
}
10891039
}
10901040
return line;
10911041
}();
10921042
// This is the same as 1D interpolation
10931043
// using lambda to limit the scope of the temporary variables
1094-
const auto res = [ker1](const auto &line) constexpr noexcept {
1044+
const auto res = [ker1, &line]() constexpr noexcept {
10951045
// apply x kernel to the (interleaved) line and add together
1096-
simd_type res_low{}, res_hi{};
1097-
if constexpr (line_vectors > 1) {
1098-
// Manually write out the first iteration
1099-
const auto ker1_v = simd_type::load_aligned(ker1);
1046+
simd_type res_low{0}, res_hi{0};
1047+
// Start the loop from the second iteration
1048+
for (uint8_t i{0}; i < (line_vectors & ~1); // NOLINT(*-too-small-loop-variable)
1049+
i += 2) {
1050+
const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2);
11001051
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
11011052
const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t>);
1102-
res_low = ker1low * line[0];
1103-
res_hi = ker1hi * line[1];
1104-
}
1105-
if constexpr (line_vectors > 3) {
1106-
// Start the loop from the second iteration
1107-
for (uint8_t i = 2; i < (line_vectors & ~1); // NOLINT(*-too-small-loop-variable)
1108-
i += 2) {
1109-
const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2);
1110-
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
1111-
const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t>);
1112-
res_low = xsimd::fma(ker1low, line[i], res_low);
1113-
res_hi = xsimd::fma(ker1hi, line[i + 1], res_hi);
1114-
}
1053+
res_low = xsimd::fma(ker1low, line[i], res_low);
1054+
res_hi = xsimd::fma(ker1hi, line[i + 1], res_hi);
11151055
}
11161056
if constexpr (line_vectors % 2) {
11171057
const auto ker1_v =
11181058
simd_type::load_aligned(ker1 + (line_vectors - 1) * simd_size / 2);
11191059
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
11201060
res_low = xsimd::fma(ker1low, line.back(), res_low);
11211061
}
1122-
if constexpr (line_vectors > 1) {
1123-
return res_low + res_hi;
1124-
} else {
1125-
return res_low;
1126-
}
1127-
}(line);
1128-
alignas(alignment) std::array<FLT, simd_size> res_array{};
1129-
res.store_aligned(res_array.data());
1062+
return res_low + res_hi;
1063+
}();
11301064
for (uint8_t i{0}; i < simd_size; i += 2) {
1131-
out[0] += res_array[i];
1132-
out[1] += res_array[i + 1];
1065+
out[0] += res.get(i);
1066+
out[1] += res.get(i + 1);
11331067
}
11341068
} else { // wraps somewhere: use ptr list
11351069
// this is slower than above, but occurs much less often, with fractional
@@ -1141,10 +1075,10 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
11411075
}
11421076

11431077
template<uint8_t ns, class simd_type>
1144-
FINUFFT_NEVER_INLINE static void interp_cube_wrapped(
1145-
FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, const FLT *ker2,
1146-
const FLT *ker3, const BIGINT i1, const BIGINT i2, const BIGINT i3, const UBIGINT N1,
1147-
const UBIGINT N2, const UBIGINT N3) {
1078+
static void interp_cube_wrapped(FLT *FINUFFT_RESTRICT target, const FLT *du,
1079+
const FLT *ker1, const FLT *ker2, const FLT *ker3,
1080+
const BIGINT i1, const BIGINT i2, const BIGINT i3,
1081+
const UBIGINT N1, const UBIGINT N2, const UBIGINT N3) {
11481082
/*
11491083
* This function is called when the kernel wraps around the cube.
11501084
* Similarly to 2D and 1D wrapping, this is slower than the non wrapping version.
@@ -1257,81 +1191,45 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
12571191
const auto in_bounds_3 = (i3 >= 0) & (i3 + ns <= N3);
12581192
std::array<FLT, 2> out{0};
12591193
if (in_bounds_1 && in_bounds_2 && in_bounds_3 && (i1 + ns + (padding + 1) / 2 < N1)) {
1260-
const auto line = [N1, N2, i1, i2, i3, ker2, ker3, du]() constexpr noexcept {
1261-
std::array<simd_type, line_vectors> line{0}, du_pts{};
1262-
std::array<simd_type, ns> ker23_array{};
1263-
const auto base_oz = N1 * N2 * UBIGINT(i3); // Move invariant part outside the loop
1194+
const auto line = [N1, N2, i1 = UBIGINT(i1), i2 = UBIGINT(i2), i3 = UBIGINT(i3), ker2,
1195+
ker3, du]() constexpr noexcept {
1196+
std::array<simd_type, line_vectors> line{0};
12641197
for (uint8_t dz{0}; dz < ns; ++dz) {
1265-
const auto oz = base_oz + N1 * N2 * dz; // Only the dz part is inside the loop
1266-
const auto base_du_ptr = du + 2 * UBIGINT(oz + N1 * i2 + i1);
1267-
{
1268-
alignas(alignment) std::array<FLT, ker23_size> ker23_scalar{};
1269-
const simd_type ker3_v{ker3[dz]};
1270-
for (uint8_t dy{0}; dy < ns; dy += simd_size) {
1271-
const auto ker2_v = simd_type::load_aligned(ker2 + dy);
1272-
const auto ker23_v = ker2_v * ker3_v;
1273-
ker23_v.store_aligned(ker23_scalar.data() + dy);
1274-
}
1275-
for (uint8_t dy{0}; dy < ns; ++dy) {
1276-
ker23_array[dy] = ker23_scalar[dy];
1277-
}
1278-
}
1198+
const auto oz = N1 * N2 * (i3 + dz); // offset due to z
12791199
for (uint8_t dy{0}; dy < ns; ++dy) {
1280-
const auto du_ptr = base_du_ptr + 2 * N1 * dy; // (see above)
1200+
const auto l_ptr = du + 2 * (oz + N1 * (i2 + dy) + i1); // ptr start of line
1201+
const simd_type ker23{ker2[dy] * ker3[dz]};
12811202
for (uint8_t l{0}; l < line_vectors; ++l) {
1282-
du_pts[l] = simd_type::load_unaligned(l * simd_size + du_ptr);
1283-
}
1284-
for (uint8_t l{0}; l < line_vectors; ++l) {
1285-
line[l] = xsimd::fma(ker23_array[dy], du_pts[l], line[l]);
1203+
const auto du_pt = simd_type::load_unaligned(l * simd_size + l_ptr);
1204+
line[l] = xsimd::fma(ker23, du_pt, line[l]);
12861205
}
12871206
}
12881207
}
12891208
return line;
12901209
}();
1291-
// apply x kernel to the (interleaved) line and add together
1292-
const auto res_array = [ker1](const auto &line) constexpr noexcept {
1293-
const auto res = [ker1](const auto &line) constexpr noexcept {
1294-
// apply x kernel to the (interleaved) line and add together
1295-
simd_type res_low{}, res_hi{};
1296-
if constexpr (line_vectors > 1) {
1297-
// Manually write out the first iteration
1298-
const auto ker1_v = simd_type::load_aligned(ker1);
1299-
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
1300-
const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t>);
1301-
res_low = ker1low * line[0];
1302-
res_hi = ker1hi * line[1];
1303-
}
1304-
if constexpr (line_vectors > 3) {
1305-
// Start the loop from the second iteration
1306-
for (uint8_t i = 2;
1307-
i < (line_vectors & ~1); // NOLINT(*-too-small-loop-variable)
1308-
i += 2) {
1309-
const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2);
1310-
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
1311-
const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t>);
1312-
res_low = xsimd::fma(ker1low, line[i], res_low);
1313-
res_hi = xsimd::fma(ker1hi, line[i + 1], res_hi);
1314-
}
1315-
}
1316-
if constexpr (line_vectors % 2) {
1317-
const auto ker1_v =
1318-
simd_type::load_aligned(ker1 + (line_vectors - 1) * simd_size / 2);
1319-
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
1320-
res_low = xsimd::fma(ker1low, line.back(), res_low);
1321-
}
1322-
if constexpr (line_vectors > 1) {
1323-
return res_low + res_hi;
1324-
} else {
1325-
return res_low;
1326-
}
1327-
}(line);
1328-
alignas(alignment) std::array<FLT, simd_size> res_array{};
1329-
res.store_aligned(res_array.data());
1330-
return res_array;
1331-
}(line);
1210+
const auto res = [ker1, &line]() constexpr noexcept {
1211+
// apply x kernel to the (interleaved) line and add together
1212+
simd_type res_low{0}, res_hi{0};
1213+
// Start the loop from the second iteration
1214+
for (uint8_t i{0}; i < (line_vectors & ~1); // NOLINT(*-too-small-loop-variable)
1215+
i += 2) {
1216+
const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2);
1217+
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
1218+
const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t>);
1219+
res_low = xsimd::fma(ker1low, line[i], res_low);
1220+
res_hi = xsimd::fma(ker1hi, line[i + 1], res_hi);
1221+
}
1222+
if constexpr (line_vectors % 2) {
1223+
const auto ker1_v =
1224+
simd_type::load_aligned(ker1 + (line_vectors - 1) * simd_size / 2);
1225+
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
1226+
res_low = xsimd::fma(ker1low, line.back(), res_low);
1227+
}
1228+
return res_low + res_hi;
1229+
}();
13321230
for (uint8_t i{0}; i < simd_size; i += 2) {
1333-
out[0] += res_array[i];
1334-
out[1] += res_array[i + 1];
1231+
out[0] += res.get(i);
1232+
out[1] += res.get(i + 1);
13351233
}
13361234
} else {
13371235
return interp_cube_wrapped<ns, simd_type>(target, du, ker1, ker2, ker3, i1, i2, i3,

0 commit comments

Comments
 (0)