@@ -800,9 +800,8 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
800800}
801801
802802template <uint8_t ns>
803- FINUFFT_NEVER_INLINE static void interp_line_wrap (FLT *FINUFFT_RESTRICT target,
804- const FLT *du, const FLT *ker,
805- const BIGINT i1, const UBIGINT N1) {
803+ static void interp_line_wrap (FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker,
804+ const BIGINT i1, const UBIGINT N1) {
806805 /* This function is called when the kernel wraps around the grid. It is
807806 slower than interp_line.
808807 M. Barbone July 2024: - moved the logic to a separate function
@@ -879,22 +878,10 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker,
879878 } else { // doesn't wrap
880879 // logic largely similar to spread 1D kernel, please see the explanation there
881880 // for the first part of this code
882- const auto du_ptr = du + 2 * j;
883- simd_type res_low{}, res_hi{};
884- // Handle the first iteration only if regular_part is greater than 0
885- // unrolling the first iteration to avoid initializing the variables to 0
886- if constexpr (regular_part > 0 ) {
887- const auto ker_v = simd_type::load_aligned (ker);
888- const auto du_pt0 = simd_type::load_unaligned (du_ptr);
889- const auto du_pt1 = simd_type::load_unaligned (du_ptr + simd_size);
890- const auto ker0low = xsimd::swizzle (ker_v, zip_low_index<arch_t >);
891- const auto ker0hi = xsimd::swizzle (ker_v, zip_hi_index<arch_t >);
892- res_low = ker0low * du_pt0;
893- res_hi = ker0hi * du_pt1;
894- }
895- if constexpr (regular_part > 3 * simd_size) {
896- // Now loop over the remaining elements
897- for (uint8_t dx = 2 * simd_size; dx < regular_part; dx += 2 * simd_size) {
881+ const auto res = [du, j, ker]() constexpr noexcept {
882+ const auto du_ptr = du + 2 * j;
883+ simd_type res_low{0 }, res_hi{0 };
884+ for (uint8_t dx{0 }; dx < regular_part; dx += 2 * simd_size) {
898885 const auto ker_v = simd_type::load_aligned (ker + dx / 2 );
899886 const auto du_pt0 = simd_type::load_unaligned (du_ptr + dx);
900887 const auto du_pt1 = simd_type::load_unaligned (du_ptr + dx + simd_size);
@@ -903,19 +890,24 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker,
903890 res_low = xsimd::fma (ker0low, du_pt0, res_low);
904891 res_hi = xsimd::fma (ker0hi, du_pt1, res_hi);
905892 }
906- }
907- if constexpr (regular_part < 2 * ns) {
908- const auto ker0 = simd_type::load_unaligned (ker + (regular_part / 2 ));
909- const auto du_pt = simd_type::load_unaligned (du_ptr + regular_part);
910- const auto ker0low = xsimd::swizzle (ker0, zip_low_index<arch_t >);
911- if constexpr (regular_part > 0 ) {
912- res_low = xsimd::fma (ker0low, du_pt, res_low);
913- } else {
914- // handle the case where res_hi is unused because now is not 0 is un-initialized
915- res_low = ker0low * du_pt;
893+
894+ if constexpr (regular_part < 2 * ns) {
895+ const auto ker0 = simd_type::load_unaligned (ker + (regular_part / 2 ));
896+ const auto du_pt = simd_type::load_unaligned (du_ptr + regular_part);
897+ const auto ker0low = xsimd::swizzle (ker0, zip_low_index<arch_t >);
898+ res_low = xsimd::fma (ker0low, du_pt, res_low);
916899 }
917- }
918900
901+ // This does a horizontal sum using a loop instead of relying on SIMD instructions
902+ // this is faster than the code below but less elegant.
903+ // lambdas here to limit the scope of temporary variables and have the compiler
904+ // optimize the code better
905+ return res_low + res_hi;
906+ }();
907+ for (uint8_t i{0 }; i < simd_size; i += 2 ) {
908+ out[0 ] += res.get (i);
909+ out[1 ] += res.get (i + 1 );
910+ }
919911 // this is where the code differs from spread_kernel, the interpolator does an extra
920912 // reduction step to SIMD elements down to 2 elements
921913 // This is known as horizontal sum in SIMD terminology
@@ -928,36 +920,15 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker,
928920 // out[0] = xsimd::reduce_add(res_real);
929921 // out[1] = xsimd::reduce_add(res_imag);
930922 // clang-format on
931-
932- // This does a horizontal sum using a loop instead of relying on SIMD instructions
933- // this is faster than the above code but less elegant.
934- // lambdas here to limit the scope of temporary variables and have the compiler
935- // optimize the code better
936- alignas (alignment) const auto res_array = [](const auto &res_low,
937- const auto &res_hi) constexpr noexcept {
938- alignas (alignment) std::array<FLT, simd_size> res_array{};
939- if constexpr (regular_part > 0 ) {
940- const auto res = res_low + res_hi;
941- res.store_aligned (res_array.data ());
942- } else {
943- // handle the case where res_hi is unused because now is not 0 is un-initialized
944- res_low.store_aligned (res_array.data ());
945- }
946- return res_array;
947- }(res_low, res_hi);
948- for (uint8_t i{0 }; i < simd_size; i += 2 ) {
949- out[0 ] += res_array[i];
950- out[1 ] += res_array[i + 1 ];
951- }
952923 }
953924 target[0 ] = out[0 ];
954925 target[1 ] = out[1 ];
955926}
956927
957928template <uint8_t ns, class simd_type >
958- FINUFFT_NEVER_INLINE static void interp_square_wrap (
959- FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, const FLT *ker2,
960- const BIGINT i1, const BIGINT i2, const UBIGINT N1, const UBIGINT N2) {
929+ static void interp_square_wrap (FLT *FINUFFT_RESTRICT target, const FLT *du,
930+ const FLT *ker1, const FLT *ker2, const BIGINT i1 ,
931+ const BIGINT i2, const UBIGINT N1, const UBIGINT N2) {
961932 /*
962933 * This function is called when the kernel wraps around the grid. It is slower than
963934 * the non wrapping version.
@@ -969,16 +940,9 @@ FINUFFT_NEVER_INLINE static void interp_square_wrap(
969940 static constexpr auto alignment = arch_t::alignment ();
970941 if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2) {
971942 // store a horiz line (interleaved real,imag)
972- alignas (alignment) std::array<FLT, 2 * MAX_NSPREAD> line{};
973- // block for first y line, to avoid explicitly initializing line with zeros
974- {
975- const auto l_ptr = du + 2 * (N1 * i2 + i1); // ptr to horiz line start in du
976- for (uint8_t l{0 }; l < 2 * ns; ++l) { // l is like dx but for ns interleaved
977- line[l] = ker2[0 ] * l_ptr[l];
978- }
979- }
943+ alignas (alignment) std::array<FLT, 2 * ns> line{0 };
980944 // add remaining const-y lines to the line (expensive inner loop)
981- for (uint8_t dy{1 }; dy < ns; ++dy) {
945+ for (uint8_t dy{0 }; dy < ns; ++dy) {
982946 const auto *l_ptr = du + 2 * (N1 * (i2 + dy) + i1); // (see above)
983947 for (uint8_t l{0 }; l < 2 * ns; ++l) {
984948 line[l] = std::fma (ker2[dy], l_ptr[l], line[l]);
@@ -1059,77 +1023,47 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
10591023 static constexpr uint8_t line_vectors = (2 * ns + padding) / simd_size;
10601024 if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2 &&
10611025 (i1 + ns + (padding + 1 ) / 2 < N1)) {
1062- const auto line = [du, N1, i2, i1, ker2]() constexpr noexcept {
1026+ const auto line = [du, N1, i1 = UBIGINT (i1), i2 = UBIGINT (i2),
1027+ ker2]() constexpr noexcept {
10631028 // new array du_pts to store the du values for the current y line
1064- std::array<simd_type, line_vectors> line{}, du_pts{ };
1029+ std::array<simd_type, line_vectors> line{0 };
10651030 // block for first y line, to avoid explicitly initializing line with zeros
1066- const auto l_ptr_base = du + 2 * UBIGINT (N1 * i2 + i1); // ptr to horiz line start
1067- // in du
1068- for (uint8_t l{0 }; l < line_vectors; ++l) {
1069- du_pts[l] = simd_type::load_unaligned (l * simd_size + l_ptr_base);
1070- }
1071- for (uint8_t l{0 }; l < line_vectors; ++l) {
1072- // l is like dx but for ns interleaved
1073- // no fancy trick needed to multiply real,imag by ker2
1074- line[l] = du_pts[l] * simd_type{ker2[0 ]};
1075- }
10761031 // add remaining const-y lines to the line (expensive inner loop)
1077- for (uint8_t dy{1 }; dy < ns; dy++) {
1078- const auto l_ptr = l_ptr_base + 2 * (N1 * dy ); // (see above)
1032+ for (uint8_t dy{0 }; dy < ns; dy++) {
1033+ const auto l_ptr = du + 2 * (N1 * (i2 + dy) + i1 ); // (see above)
10791034 const simd_type ker2_v{ker2[dy]};
1080- // vectorize over the fast axis of the du array
1081- // First loop: Load all du_pt into the du_pts array
1082- for (uint8_t l{0 }; l < line_vectors; ++l) {
1083- du_pts[l] = simd_type::load_unaligned (l * simd_size + l_ptr);
1084- }
1085- // Second loop: Perform the FMA operation
10861035 for (uint8_t l{0 }; l < line_vectors; ++l) {
1087- line[l] = xsimd::fma (ker2_v, du_pts[l], line[l]);
1036+ const auto du_pt = simd_type::load_unaligned (l * simd_size + l_ptr);
1037+ line[l] = xsimd::fma (ker2_v, du_pt, line[l]);
10881038 }
10891039 }
10901040 return line;
10911041 }();
10921042 // This is the same as 1D interpolation
10931043 // using lambda to limit the scope of the temporary variables
1094- const auto res = [ker1]( const auto &line) constexpr noexcept {
1044+ const auto res = [ker1, &line]( ) constexpr noexcept {
10951045 // apply x kernel to the (interleaved) line and add together
1096- simd_type res_low{}, res_hi{};
1097- if constexpr (line_vectors > 1 ) {
1098- // Manually write out the first iteration
1099- const auto ker1_v = simd_type::load_aligned (ker1);
1046+ simd_type res_low{0 }, res_hi{0 };
1047+ // Start the loop from the second iteration
1048+ for (uint8_t i{0 }; i < (line_vectors & ~1 ); // NOLINT(*-too-small-loop-variable)
1049+ i += 2 ) {
1050+ const auto ker1_v = simd_type::load_aligned (ker1 + i * simd_size / 2 );
11001051 const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
11011052 const auto ker1hi = xsimd::swizzle (ker1_v, zip_hi_index<arch_t >);
1102- res_low = ker1low * line[0 ];
1103- res_hi = ker1hi * line[1 ];
1104- }
1105- if constexpr (line_vectors > 3 ) {
1106- // Start the loop from the second iteration
1107- for (uint8_t i = 2 ; i < (line_vectors & ~1 ); // NOLINT(*-too-small-loop-variable)
1108- i += 2 ) {
1109- const auto ker1_v = simd_type::load_aligned (ker1 + i * simd_size / 2 );
1110- const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
1111- const auto ker1hi = xsimd::swizzle (ker1_v, zip_hi_index<arch_t >);
1112- res_low = xsimd::fma (ker1low, line[i], res_low);
1113- res_hi = xsimd::fma (ker1hi, line[i + 1 ], res_hi);
1114- }
1053+ res_low = xsimd::fma (ker1low, line[i], res_low);
1054+ res_hi = xsimd::fma (ker1hi, line[i + 1 ], res_hi);
11151055 }
11161056 if constexpr (line_vectors % 2 ) {
11171057 const auto ker1_v =
11181058 simd_type::load_aligned (ker1 + (line_vectors - 1 ) * simd_size / 2 );
11191059 const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
11201060 res_low = xsimd::fma (ker1low, line.back (), res_low);
11211061 }
1122- if constexpr (line_vectors > 1 ) {
1123- return res_low + res_hi;
1124- } else {
1125- return res_low;
1126- }
1127- }(line);
1128- alignas (alignment) std::array<FLT, simd_size> res_array{};
1129- res.store_aligned (res_array.data ());
1062+ return res_low + res_hi;
1063+ }();
11301064 for (uint8_t i{0 }; i < simd_size; i += 2 ) {
1131- out[0 ] += res_array[i] ;
1132- out[1 ] += res_array[ i + 1 ] ;
1065+ out[0 ] += res. get (i) ;
1066+ out[1 ] += res. get ( i + 1 ) ;
11331067 }
11341068 } else { // wraps somewhere: use ptr list
11351069 // this is slower than above, but occurs much less often, with fractional
@@ -1141,10 +1075,10 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
11411075}
11421076
11431077template <uint8_t ns, class simd_type >
1144- FINUFFT_NEVER_INLINE static void interp_cube_wrapped (
1145- FLT *FINUFFT_RESTRICT target, const FLT *du , const FLT *ker1 , const FLT *ker2 ,
1146- const FLT *ker3, const BIGINT i1, const BIGINT i2, const BIGINT i3, const UBIGINT N1 ,
1147- const UBIGINT N2, const UBIGINT N3) {
1078+ static void interp_cube_wrapped (FLT *FINUFFT_RESTRICT target, const FLT *du,
1079+ const FLT *ker1 , const FLT *ker2 , const FLT *ker3 ,
1080+ const BIGINT i1, const BIGINT i2, const BIGINT i3,
1081+ const UBIGINT N1, const UBIGINT N2, const UBIGINT N3) {
11481082 /*
11491083 * This function is called when the kernel wraps around the cube.
11501084 * Similarly to 2D and 1D wrapping, this is slower than the non wrapping version.
@@ -1257,81 +1191,45 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
12571191 const auto in_bounds_3 = (i3 >= 0 ) & (i3 + ns <= N3);
12581192 std::array<FLT, 2 > out{0 };
12591193 if (in_bounds_1 && in_bounds_2 && in_bounds_3 && (i1 + ns + (padding + 1 ) / 2 < N1)) {
1260- const auto line = [N1, N2, i1, i2, i3, ker2, ker3, du]() constexpr noexcept {
1261- std::array<simd_type, line_vectors> line{0 }, du_pts{};
1262- std::array<simd_type, ns> ker23_array{};
1263- const auto base_oz = N1 * N2 * UBIGINT (i3); // Move invariant part outside the loop
1194+ const auto line = [N1, N2, i1 = UBIGINT (i1), i2 = UBIGINT (i2), i3 = UBIGINT (i3), ker2,
1195+ ker3, du]() constexpr noexcept {
1196+ std::array<simd_type, line_vectors> line{0 };
12641197 for (uint8_t dz{0 }; dz < ns; ++dz) {
1265- const auto oz = base_oz + N1 * N2 * dz; // Only the dz part is inside the loop
1266- const auto base_du_ptr = du + 2 * UBIGINT (oz + N1 * i2 + i1);
1267- {
1268- alignas (alignment) std::array<FLT, ker23_size> ker23_scalar{};
1269- const simd_type ker3_v{ker3[dz]};
1270- for (uint8_t dy{0 }; dy < ns; dy += simd_size) {
1271- const auto ker2_v = simd_type::load_aligned (ker2 + dy);
1272- const auto ker23_v = ker2_v * ker3_v;
1273- ker23_v.store_aligned (ker23_scalar.data () + dy);
1274- }
1275- for (uint8_t dy{0 }; dy < ns; ++dy) {
1276- ker23_array[dy] = ker23_scalar[dy];
1277- }
1278- }
1198+ const auto oz = N1 * N2 * (i3 + dz); // offset due to z
12791199 for (uint8_t dy{0 }; dy < ns; ++dy) {
1280- const auto du_ptr = base_du_ptr + 2 * N1 * dy; // (see above)
1200+ const auto l_ptr = du + 2 * (oz + N1 * (i2 + dy) + i1); // ptr start of line
1201+ const simd_type ker23{ker2[dy] * ker3[dz]};
12811202 for (uint8_t l{0 }; l < line_vectors; ++l) {
1282- du_pts[l] = simd_type::load_unaligned (l * simd_size + du_ptr);
1283- }
1284- for (uint8_t l{0 }; l < line_vectors; ++l) {
1285- line[l] = xsimd::fma (ker23_array[dy], du_pts[l], line[l]);
1203+ const auto du_pt = simd_type::load_unaligned (l * simd_size + l_ptr);
1204+ line[l] = xsimd::fma (ker23, du_pt, line[l]);
12861205 }
12871206 }
12881207 }
12891208 return line;
12901209 }();
1291- // apply x kernel to the (interleaved) line and add together
1292- const auto res_array = [ker1](const auto &line) constexpr noexcept {
1293- const auto res = [ker1](const auto &line) constexpr noexcept {
1294- // apply x kernel to the (interleaved) line and add together
1295- simd_type res_low{}, res_hi{};
1296- if constexpr (line_vectors > 1 ) {
1297- // Manually write out the first iteration
1298- const auto ker1_v = simd_type::load_aligned (ker1);
1299- const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
1300- const auto ker1hi = xsimd::swizzle (ker1_v, zip_hi_index<arch_t >);
1301- res_low = ker1low * line[0 ];
1302- res_hi = ker1hi * line[1 ];
1303- }
1304- if constexpr (line_vectors > 3 ) {
1305- // Start the loop from the second iteration
1306- for (uint8_t i = 2 ;
1307- i < (line_vectors & ~1 ); // NOLINT(*-too-small-loop-variable)
1308- i += 2 ) {
1309- const auto ker1_v = simd_type::load_aligned (ker1 + i * simd_size / 2 );
1310- const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
1311- const auto ker1hi = xsimd::swizzle (ker1_v, zip_hi_index<arch_t >);
1312- res_low = xsimd::fma (ker1low, line[i], res_low);
1313- res_hi = xsimd::fma (ker1hi, line[i + 1 ], res_hi);
1314- }
1315- }
1316- if constexpr (line_vectors % 2 ) {
1317- const auto ker1_v =
1318- simd_type::load_aligned (ker1 + (line_vectors - 1 ) * simd_size / 2 );
1319- const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
1320- res_low = xsimd::fma (ker1low, line.back (), res_low);
1321- }
1322- if constexpr (line_vectors > 1 ) {
1323- return res_low + res_hi;
1324- } else {
1325- return res_low;
1326- }
1327- }(line);
1328- alignas (alignment) std::array<FLT, simd_size> res_array{};
1329- res.store_aligned (res_array.data ());
1330- return res_array;
1331- }(line);
1210+ const auto res = [ker1, &line]() constexpr noexcept {
1211+ // apply x kernel to the (interleaved) line and add together
1212+ simd_type res_low{0 }, res_hi{0 };
1213+ // Start the loop from the second iteration
1214+ for (uint8_t i{0 }; i < (line_vectors & ~1 ); // NOLINT(*-too-small-loop-variable)
1215+ i += 2 ) {
1216+ const auto ker1_v = simd_type::load_aligned (ker1 + i * simd_size / 2 );
1217+ const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
1218+ const auto ker1hi = xsimd::swizzle (ker1_v, zip_hi_index<arch_t >);
1219+ res_low = xsimd::fma (ker1low, line[i], res_low);
1220+ res_hi = xsimd::fma (ker1hi, line[i + 1 ], res_hi);
1221+ }
1222+ if constexpr (line_vectors % 2 ) {
1223+ const auto ker1_v =
1224+ simd_type::load_aligned (ker1 + (line_vectors - 1 ) * simd_size / 2 );
1225+ const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
1226+ res_low = xsimd::fma (ker1low, line.back (), res_low);
1227+ }
1228+ return res_low + res_hi;
1229+ }();
13321230 for (uint8_t i{0 }; i < simd_size; i += 2 ) {
1333- out[0 ] += res_array[i] ;
1334- out[1 ] += res_array[ i + 1 ] ;
1231+ out[0 ] += res. get (i) ;
1232+ out[1 ] += res. get ( i + 1 ) ;
13351233 }
13361234 } else {
13371235 return interp_cube_wrapped<ns, simd_type>(target, du, ker1, ker2, ker3, i1, i2, i3,
0 commit comments