@@ -81,7 +81,7 @@ static void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *
8181template <uint8_t ns, class simd_type = PaddedSIMD<FLT, 2 * ns>>
8282static void interp_square (FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
8383 const FLT *ker2, BIGINT i1, BIGINT i2, BIGINT N1, BIGINT N2);
84- template <uint8_t ns>
84+ template <uint8_t ns, class simd_type = PaddedSIMD<FLT, 2 * ns> >
8585static void interp_cube (FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
8686 const FLT *ker2, const FLT *ker3, BIGINT i1, BIGINT i2, BIGINT i3,
8787 BIGINT N1, BIGINT N2, BIGINT N3);
@@ -540,8 +540,8 @@ FINUFFT_NEVER_INLINE static int interpSorted_kernel(
540540 case 3 :
541541 ker_eval<ns, kerevalmeth, FLT, simd_type>(kernel_values.data (), opts, x1, x2,
542542 x3);
543- interp_cube<ns>(target, data_uniform, ker1, ker2, ker3, i1, i2, i3, N1, N2 ,
544- N3);
543+ interp_cube<ns, simd_type >(target, data_uniform, ker1, ker2, ker3, i1, i2, i3,
544+ N1, N2, N3);
545545 break ;
546546 default : // can't get here
547547 FINUFFT_UNREACHABLE;
@@ -818,37 +818,42 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker,
818818 Barnett 6/16/17.
819819*/
820820{
821+ using arch_t = typename simd_type::arch_type;
822+ static constexpr auto padding = get_padding<FLT, 2 * ns>();
823+ static constexpr auto alignment = arch_t::alignment ();
824+ static constexpr auto simd_size = simd_type::size;
825+ static constexpr auto regular_part = (2 * ns + padding) & (-(2 * simd_size));
821826 std::array<FLT, 2 > out{0 };
822827 BIGINT j = i1;
823828 // removing the wrapping leads up to 10% speedup in certain cases
824829 if (FINUFFT_UNLIKELY (i1 < 0 )) { // wraps at left
825830 j += N1;
826- for (UBIGINT dx = 0 ; dx < -i1; ++dx, ++j) {
831+ for (uint8_t dx = 0 ; dx < -i1; ++dx, ++j) {
827832 out[0 ] = xsimd::fma (du[2 * j], ker[dx], out[0 ]);
828833 out[1 ] = xsimd::fma (du[2 * j + 1 ], ker[dx], out[1 ]);
829834 }
830835 j -= N1;
831- for (UBIGINT dx = -i1; dx < ns; ++dx, ++j) {
836+ for (uint8_t dx = -i1; dx < ns; ++dx, ++j) {
832837 out[0 ] = xsimd::fma (du[2 * j], ker[dx], out[0 ]);
833838 out[1 ] = xsimd::fma (du[2 * j + 1 ], ker[dx], out[1 ]);
834839 }
835840 } else if (FINUFFT_UNLIKELY (i1 + ns >= N1)) { // wraps at right
836- for (UBIGINT dx = 0 ; dx < N1 - i1; ++dx, ++j) {
841+ for (uint8_t dx = 0 ; dx < N1 - i1; ++dx, ++j) {
837842 out[0 ] = xsimd::fma (du[2 * j], ker[dx], out[0 ]);
838843 out[1 ] = xsimd::fma (du[2 * j + 1 ], ker[dx], out[1 ]);
839844 }
840845 j -= N1;
841- for (UBIGINT dx = N1 - i1; dx < ns; ++dx, ++j) {
846+ for (uint8_t dx = N1 - i1; dx < ns; ++dx, ++j) {
847+ out[0 ] = xsimd::fma (du[2 * j], ker[dx], out[0 ]);
848+ out[1 ] = xsimd::fma (du[2 * j + 1 ], ker[dx], out[1 ]);
849+ }
850+ } else if (FINUFFT_UNLIKELY (i1 + ns + (padding + 1 ) / 2 >= N1)) {
851+ for (uint8_t dx = 0 ; dx < ns; ++dx, ++j) {
842852 out[0 ] = xsimd::fma (du[2 * j], ker[dx], out[0 ]);
843853 out[1 ] = xsimd::fma (du[2 * j + 1 ], ker[dx], out[1 ]);
844854 }
845855 } else { // doesn't wrap
846- using arch_t = typename simd_type::arch_type;
847- static constexpr auto padding = get_padding<FLT, 2 * ns>();
848- static constexpr auto alignment = arch_t::alignment ();
849- static constexpr auto simd_size = simd_type::size;
850- static constexpr auto regular_part = (2 * ns + padding) & (-(2 * simd_size));
851- const auto du_ptr = du + 2 * j;
856+ const auto du_ptr = du + 2 * j;
852857 simd_type res_low{0 }, res_hi{0 };
853858 for (uint8_t dx{0 }; dx < regular_part; dx += 2 * simd_size) {
854859 const auto ker_v = simd_type::load_aligned (ker + dx / 2 );
@@ -916,65 +921,90 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
916921 it."
917922*/
918923{
919- FLT out[] = {0.0 , 0.0 };
920- if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2) { // no wrapping: avoid ptrs
921- using arch_t = typename simd_type::arch_type;
922- static constexpr auto padding = get_padding<FLT, 2 * ns>();
923- static constexpr auto alignment = arch_t::alignment ();
924- static constexpr auto simd_size = simd_type::size;
925- static constexpr uint8_t regular_part = (2 * ns + padding) & (-(2 * simd_size));
926- static constexpr uint8_t line_vectors = (2 * ns + padding) / simd_size;
927- const auto line = [du, N1, i2, i1, ker2]() {
928- std::array<simd_type, line_vectors> line{};
924+ std::array<FLT, 2 > out{0 };
925+ // no wrapping: avoid ptrs
926+ using arch_t = typename simd_type::arch_type;
927+ static constexpr auto padding = get_padding<FLT, 2 * ns>();
928+ static constexpr auto alignment = arch_t::alignment ();
929+ static constexpr auto simd_size = simd_type::size;
930+ static constexpr uint8_t regular_part = (2 * ns + padding) & (-(2 * simd_size));
931+ static constexpr uint8_t line_vectors = (2 * ns + padding) / simd_size;
932+ if (FINUFFT_LIKELY (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2)) {
933+ if (FINUFFT_LIKELY (i1 + ns + (padding + 1 ) / 2 < N1)) {
934+ const auto line = [du, N1, i2, i1, ker2]() constexpr noexcept {
935+ std::array<simd_type, line_vectors> line{};
936+ // block for first y line, to avoid explicitly initializing line with zeros
937+ {
938+ const auto l_ptr = du + 2 * (N1 * i2 + i1); // ptr to horiz line start in du
939+ const simd_type ker2_v{ker2[0 ]};
940+ for (uint8_t l{0 }; l < line_vectors; ++l) {
941+ // l is like dx but for ns interleaved
942+ line[l] = ker2_v * simd_type::load_unaligned (l * simd_size + l_ptr);
943+ }
944+ }
945+ // add remaining const-y lines to the line (expensive inner loop)
946+ for (uint8_t dy{1 }; dy < ns; dy++) {
947+ const auto l_ptr = du + 2 * (N1 * (i2 + dy) + i1); // (see above)
948+ const simd_type ker2_v{ker2[dy]};
949+ for (uint8_t l{0 }; l < line_vectors; ++l) {
950+ line[l] = xsimd::fma (ker2_v, simd_type::load_unaligned (l * simd_size + l_ptr),
951+ line[l]);
952+ }
953+ }
954+ return line;
955+ }();
956+ // apply x kernel to the (interleaved) line and add together
957+ simd_type res_low{0 }, res_hi{0 };
958+ for (uint8_t i = 0 ; i < (line_vectors & ~1 ); // NOLINT(*-too-small-loop-variable)
959+ i += 2 ) {
960+ const auto ker1_v = simd_type::load_aligned (ker1 + i * simd_size / 2 );
961+ const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
962+ const auto ker1hi = xsimd::swizzle (ker1_v, zip_hi_index<arch_t >);
963+ res_low = xsimd::fma (ker1low, line[i], res_low);
964+ res_hi = xsimd::fma (ker1hi, line[i + 1 ], res_hi);
965+ }
966+ if constexpr (line_vectors % 2 ) {
967+ const auto ker1_v =
968+ simd_type::load_aligned (ker1 + (line_vectors - 1 ) * simd_size / 2 );
969+ const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
970+ res_low = xsimd::fma (ker1low, line.back (), res_low);
971+ }
972+ const auto res = res_low + res_hi;
973+ alignas (alignment) std::array<FLT, simd_size> res_array{};
974+ res.store_aligned (res_array.data ());
975+ for (uint8_t i{0 }; i < simd_size; i += 2 ) {
976+ out[0 ] += res_array[i];
977+ out[1 ] += res_array[i + 1 ];
978+ }
979+ } else {
980+ // store a horiz line (interleaved real,imag)
981+ alignas (alignment) std::array<FLT, 2 * MAX_NSPREAD> line{};
929982 // block for first y line, to avoid explicitly initializing line with zeros
930983 {
931984 const auto l_ptr = du + 2 * (N1 * i2 + i1); // ptr to horiz line start in du
932- const simd_type ker2_v{ker2[0 ]};
933- for (uint8_t l{0 }; l < line_vectors; ++l) {
934- // l is like dx but for ns interleaved
935- line[l] = ker2_v * simd_type::load_unaligned (l * simd_size + l_ptr);
985+ for (uint8_t l{0 }; l < 2 * ns; ++l) { // l is like dx but for ns interleaved
986+ line[l] = ker2[0 ] * l_ptr[l];
936987 }
937988 }
938989 // add remaining const-y lines to the line (expensive inner loop)
939- for (uint8_t dy{1 }; dy < ns; dy++) {
940- const auto l_ptr = du + 2 * (N1 * (i2 + dy) + i1); // (see above)
941- const simd_type ker2_v{ker2[dy]};
942- for (uint8_t l{0 }; l < line_vectors; ++l) {
943- line[l] = xsimd::fma (ker2_v, simd_type::load_unaligned (l * simd_size + l_ptr),
944- line[l]);
990+ for (uint8_t dy{1 }; dy < ns; ++dy) {
991+ const auto *l_ptr = du + 2 * (N1 * (i2 + dy) + i1); // (see above)
992+ for (uint8_t l{0 }; l < 2 * ns; ++l) {
993+ line[l] = xsimd::fma (ker2[dy], l_ptr[l], line[l]);
945994 }
946995 }
947- return line;
948- }();
949- // apply x kernel to the (interleaved) line and add together
950- simd_type res_low{0 }, res_hi{0 };
951- for (uint8_t i = 0 ; i < (line_vectors & ~1 ); // NOLINT(*-too-small-loop-variable)
952- i += 2 ) {
953- const auto ker1_v = simd_type::load_aligned (ker1 + i * simd_size / 2 );
954- const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
955- const auto ker1hi = xsimd::swizzle (ker1_v, zip_hi_index<arch_t >);
956- res_low = xsimd::fma (ker1low, line[i], res_low);
957- res_hi = xsimd::fma (ker1hi, line[i + 1 ], res_hi);
958- }
959- if constexpr (line_vectors % 2 ) {
960- const auto ker1_v =
961- simd_type::load_aligned (ker1 + (line_vectors - 1 ) * simd_size / 2 );
962- const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
963- res_low = xsimd::fma (ker1low, line.back (), res_low);
964- }
965- const auto res = res_low + res_hi;
966- alignas (alignment) std::array<FLT, simd_size> res_array{};
967- res.store_aligned (res_array.data ());
968- for (uint8_t i{0 }; i < simd_size; i += 2 ) {
969- out[0 ] += res_array[i];
970- out[1 ] += res_array[i + 1 ];
996+ // apply x kernel to the (interleaved) line and add together
997+ for (uint8_t dx{0 }; dx < ns; dx++) {
998+ out[0 ] = xsimd::fma (line[2 * dx], ker1[dx], out[0 ]);
999+ out[1 ] = xsimd::fma (line[2 * dx + 1 ], ker1[dx], out[1 ]);
1000+ }
9711001 }
9721002 } else { // wraps somewhere: use ptr list
9731003 // this is slower than above, but occurs much less often, with fractional
9741004 // rate O(ns/min(N1,N2)). Thus this code doesn't need to be so optimized.
975- BIGINT j1[MAX_NSPREAD] , j2[MAX_NSPREAD] ; // 1d ptr lists
976- BIGINT x = i1, y = i2; // initialize coords
977- for (uint8_t d{0 }; d < ns; d++) { // set up ptr lists
1005+ std::array<UBIGINT, ns> j1{} , j2{} ; // 1d ptr lists
1006+ auto x = i1, y = i2; // initialize coords
1007+ for (uint8_t d{0 }; d < ns; d++) { // set up ptr lists
9781008 if (x < 0 ) x += N1;
9791009 if (x >= N1) x -= N1;
9801010 j1[d] = x++;
@@ -983,10 +1013,10 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
9831013 j2[d] = y++;
9841014 }
9851015 for (uint8_t dy{0 }; dy < ns; dy++) { // use the pts lists
986- BIGINT oy = N1 * j2[dy]; // offset due to y
1016+ const auto oy = N1 * j2[dy]; // offset due to y
9871017 for (uint8_t dx{0 }; dx < ns; dx++) {
988- FLT k = ker1[dx] * ker2[dy];
989- BIGINT j = oy + j1[dx];
1018+ const auto k = ker1[dx] * ker2[dy];
1019+ const UBIGINT j = oy + j1[dx];
9901020 out[0 ] += du[2 * j] * k;
9911021 out[1 ] += du[2 * j + 1 ] * k;
9921022 }
@@ -996,7 +1026,7 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
9961026 target[1 ] = out[1 ];
9971027}
9981028
999- template <uint8_t ns>
1029+ template <uint8_t ns, class simd_type >
10001030void interp_cube (FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
10011031 const FLT *ker2, const FLT *ker3, const BIGINT i1, const BIGINT i2,
10021032 const BIGINT i3, const BIGINT N1, const BIGINT N2, const BIGINT N3)
@@ -1024,37 +1054,86 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
10241054 (see above note in interp_square)
10251055*/
10261056{
1027- FLT out[] = {0.0 , 0.0 };
1028- if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2 && i3 >= 0 && i3 + ns <= N3) {
1029- // no wrapping: avoid ptrs (by far the most common case)
1030- FLT line[2 * MAX_NSPREAD]; // store a horiz line (interleaved real,imag)
1031- // initialize line with zeros; hard to avoid here, but overhead small in 3D
1032- for (int l = 0 ; l < 2 * ns; l++) {
1033- line[l] = 0 ;
1034- }
1035- // co-add y and z contributions to line in x; do not apply x kernel yet
1036- // This is expensive innermost loop
1037- for (int dz = 0 ; dz < ns; dz++) {
1038- BIGINT oz = N1 * N2 * (i3 + dz); // offset due to z
1039- for (int dy = 0 ; dy < ns; dy++) {
1040- const FLT *lptr = du + 2 * (oz + N1 * (i2 + dy) + i1); // ptr start of line
1041- FLT ker23 = ker2[dy] * ker3[dz];
1042- for (int l = 0 ; l < 2 * ns; ++l) { // loop over ns interleaved (R,I) pairs
1043- line[l] += lptr[l] * ker23;
1057+ std::array<FLT, 2 > out{0 };
1058+ using arch_t = typename simd_type::arch_type;
1059+ static constexpr auto padding = get_padding<FLT, 2 * ns>();
1060+ static constexpr auto alignment = arch_t::alignment ();
1061+ static constexpr auto simd_size = simd_type::size;
1062+ static constexpr uint8_t line_vectors = (2 * ns + padding) / simd_size;
1063+ const auto in_bounds_1 = (i1 >= 0 ) & (i1 + ns <= N1);
1064+ const auto in_bounds_2 = (i2 >= 0 ) & (i2 + ns <= N2);
1065+ const auto in_bounds_3 = (i3 >= 0 ) & (i3 + ns <= N3);
1066+ if (FINUFFT_LIKELY (in_bounds_1 && in_bounds_2 && in_bounds_3)) {
1067+ if (FINUFFT_LIKELY (i1 + ns + (padding + 1 ) / 2 < N1)) {
1068+ std::array<simd_type, line_vectors> line{0 };
1069+ for (uint8_t dz{0 }; dz < ns; ++dz) {
1070+ const UBIGINT oz = N1 * N2 * (i3 + dz);
1071+ for (uint8_t dy{0 }; dy < ns; ++dy) {
1072+ const auto du_ptr = du + 2 * (oz + N1 * (i2 + dy) + i1); // (see above)
1073+ const simd_type ker23_v{ker2[dy] * ker3[dz]};
1074+ for (uint8_t l{0 }; l < line_vectors; ++l) {
1075+ const auto du_pt = simd_type::load_unaligned (l * simd_size + du_ptr);
1076+ line[l] = xsimd::fma (ker23_v, du_pt, line[l]);
1077+ }
10441078 }
10451079 }
1046- }
1047- // apply x kernel to the (interleaved) line and add together (cheap)
1048- for (int dx = 0 ; dx < ns; dx++) {
1049- out[0 ] += line[2 * dx] * ker1[dx];
1050- out[1 ] += line[2 * dx + 1 ] * ker1[dx];
1080+ // apply x kernel to the (interleaved) line and add together
1081+ const auto res_array = [ker1](const auto &line) constexpr noexcept {
1082+ const auto res = [ker1](const auto &line) constexpr noexcept {
1083+ simd_type res_low{0 }, res_hi{0 };
1084+ for (uint8_t i{0 }; i < (line_vectors & ~1 ); // NOLINT(*-too-small-loop-variable)
1085+ i += 2 ) {
1086+ const auto ker1_v = simd_type::load_aligned (i * simd_size / 2 + ker1);
1087+ const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
1088+ const auto ker1hi = xsimd::swizzle (ker1_v, zip_hi_index<arch_t >);
1089+ res_low = xsimd::fma (ker1low, line[i], res_low);
1090+ res_hi = xsimd::fma (ker1hi, line[i + 1 ], res_hi);
1091+ }
1092+ if constexpr (line_vectors % 2 ) {
1093+ const auto ker1_v =
1094+ simd_type::load_aligned ((line_vectors - 1 ) * simd_size / 2 + ker1);
1095+ const auto ker1low = xsimd::swizzle (ker1_v, zip_low_index<arch_t >);
1096+ res_low = xsimd::fma (ker1low, line.back (), res_low);
1097+ }
1098+ return res_low + res_hi;
1099+ }(line);
1100+ alignas (alignment) std::array<FLT, simd_size> res_array{};
1101+ res.store_aligned (res_array.data ());
1102+ return res_array;
1103+ }(line);
1104+ for (uint8_t i{0 }; i < simd_size; i += 2 ) {
1105+ out[0 ] += res_array[i];
1106+ out[1 ] += res_array[i + 1 ];
1107+ }
1108+ } else {
1109+ // no wrapping: avoid ptrs (by far the most common case)
1110+ // store a horiz line (interleaved real,imag)
1111+ // initialize line with zeros; hard to avoid here, but overhead small in 3D
1112+ std::array<FLT, 2 * MAX_NSPREAD> line{0 };
1113+ // co-add y and z contributions to line in x; do not apply x kernel yet
1114+ // This is expensive innermost loop
1115+ for (uint8_t dz{0 }; dz < ns; ++dz) {
1116+ const auto oz = N1 * N2 * (i3 + dz); // offset due to z
1117+ for (uint8_t dy{0 }; dy < ns; ++dy) {
1118+ const auto l_ptr = du + 2 * (oz + N1 * (i2 + dy) + i1); // ptr start of line
1119+ const auto ker23 = ker2[dy] * ker3[dz];
1120+ for (uint8_t l{0 }; l < 2 * ns; ++l) { // loop over ns interleaved (R,I) pairs
1121+ line[l] = xsimd::fma (l_ptr[l], ker23, line[l]);
1122+ }
1123+ }
1124+ }
1125+ // apply x kernel to the (interleaved) line and add together (cheap)
1126+ for (uint8_t dx{0 }; dx < ns; ++dx) {
1127+ out[0 ] += line[2 * dx] * ker1[dx];
1128+ out[1 ] += line[2 * dx + 1 ] * ker1[dx];
1129+ }
10511130 }
10521131 } else { // wraps somewhere: use ptr list
10531132 // ...can be slower since this case only happens with probability
10541133 // O(ns/min(N1,N2,N3))
1055- BIGINT j1[MAX_NSPREAD] , j2[MAX_NSPREAD] , j3[MAX_NSPREAD] ; // 1d ptr lists
1056- BIGINT x = i1, y = i2, z = i3; // initialize coords
1057- for (int d = 0 ; d < ns; d++) { // set up ptr lists
1134+ alignas (alignment) std::array<UBIGINT, ns> j1{} , j2{} , j3{} ; // 1d ptr lists
1135+ auto x = i1, y = i2, z = i3; // initialize coords
1136+ for (uint8_t d{ 0 } ; d < ns; d++) { // set up ptr lists
10581137 if (x < 0 ) x += N1;
10591138 if (x >= N1) x -= N1;
10601139 j1[d] = x++;
@@ -1065,14 +1144,14 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
10651144 if (z >= N3) z -= N3;
10661145 j3[d] = z++;
10671146 }
1068- for (int dz = 0 ; dz < ns; dz++) { // use the pts lists
1069- BIGINT oz = N1 * N2 * j3[dz]; // offset due to z
1070- for (int dy = 0 ; dy < ns; dy++) {
1071- BIGINT oy = oz + N1 * j2[dy]; // offset due to y & z
1072- FLT ker23 = ker2[dy] * ker3[dz];
1073- for (int dx = 0 ; dx < ns; dx++) {
1074- FLT k = ker1[dx] * ker23;
1075- BIGINT j = oy + j1[dx];
1147+ for (uint8_t dz{ 0 } ; dz < ns; dz++) { // use the pts lists
1148+ const auto oz = N1 * N2 * j3[dz]; // offset due to z
1149+ for (uint8_t dy{ 0 } ; dy < ns; dy++) {
1150+ const auto oy = oz + N1 * j2[dy]; // offset due to y & z
1151+ const auto ker23 = ker2[dy] * ker3[dz];
1152+ for (uint8_t dx{ 0 } ; dx < ns; dx++) {
1153+ const auto k = ker1[dx] * ker23;
1154+ const auto j = oy + j1[dx];
10761155 out[0 ] += du[2 * j] * k;
10771156 out[1 ] += du[2 * j + 1 ] * k;
10781157 }
0 commit comments