Skip to content

Commit 339b02c

Browse files
committed
Vectorized 1D, 2D, 3D and fixed out-of-bound memory error
1 parent 5b9731f commit 339b02c

File tree

1 file changed

+176
-97
lines changed

1 file changed

+176
-97
lines changed

src/spreadinterp.cpp

Lines changed: 176 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ static void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *
8181
template<uint8_t ns, class simd_type = PaddedSIMD<FLT, 2 * ns>>
8282
static void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
8383
const FLT *ker2, BIGINT i1, BIGINT i2, BIGINT N1, BIGINT N2);
84-
template<uint8_t ns>
84+
template<uint8_t ns, class simd_type = PaddedSIMD<FLT, 2 * ns>>
8585
static void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
8686
const FLT *ker2, const FLT *ker3, BIGINT i1, BIGINT i2, BIGINT i3,
8787
BIGINT N1, BIGINT N2, BIGINT N3);
@@ -540,8 +540,8 @@ FINUFFT_NEVER_INLINE static int interpSorted_kernel(
540540
case 3:
541541
ker_eval<ns, kerevalmeth, FLT, simd_type>(kernel_values.data(), opts, x1, x2,
542542
x3);
543-
interp_cube<ns>(target, data_uniform, ker1, ker2, ker3, i1, i2, i3, N1, N2,
544-
N3);
543+
interp_cube<ns, simd_type>(target, data_uniform, ker1, ker2, ker3, i1, i2, i3,
544+
N1, N2, N3);
545545
break;
546546
default: // can't get here
547547
FINUFFT_UNREACHABLE;
@@ -818,37 +818,42 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker,
818818
Barnett 6/16/17.
819819
*/
820820
{
821+
using arch_t = typename simd_type::arch_type;
822+
static constexpr auto padding = get_padding<FLT, 2 * ns>();
823+
static constexpr auto alignment = arch_t::alignment();
824+
static constexpr auto simd_size = simd_type::size;
825+
static constexpr auto regular_part = (2 * ns + padding) & (-(2 * simd_size));
821826
std::array<FLT, 2> out{0};
822827
BIGINT j = i1;
823828
// removing the wrapping leads up to 10% speedup in certain cases
824829
if (FINUFFT_UNLIKELY(i1 < 0)) { // wraps at left
825830
j += N1;
826-
for (UBIGINT dx = 0; dx < -i1; ++dx, ++j) {
831+
for (uint8_t dx = 0; dx < -i1; ++dx, ++j) {
827832
out[0] = xsimd::fma(du[2 * j], ker[dx], out[0]);
828833
out[1] = xsimd::fma(du[2 * j + 1], ker[dx], out[1]);
829834
}
830835
j -= N1;
831-
for (UBIGINT dx = -i1; dx < ns; ++dx, ++j) {
836+
for (uint8_t dx = -i1; dx < ns; ++dx, ++j) {
832837
out[0] = xsimd::fma(du[2 * j], ker[dx], out[0]);
833838
out[1] = xsimd::fma(du[2 * j + 1], ker[dx], out[1]);
834839
}
835840
} else if (FINUFFT_UNLIKELY(i1 + ns >= N1)) { // wraps at right
836-
for (UBIGINT dx = 0; dx < N1 - i1; ++dx, ++j) {
841+
for (uint8_t dx = 0; dx < N1 - i1; ++dx, ++j) {
837842
out[0] = xsimd::fma(du[2 * j], ker[dx], out[0]);
838843
out[1] = xsimd::fma(du[2 * j + 1], ker[dx], out[1]);
839844
}
840845
j -= N1;
841-
for (UBIGINT dx = N1 - i1; dx < ns; ++dx, ++j) {
846+
for (uint8_t dx = N1 - i1; dx < ns; ++dx, ++j) {
847+
out[0] = xsimd::fma(du[2 * j], ker[dx], out[0]);
848+
out[1] = xsimd::fma(du[2 * j + 1], ker[dx], out[1]);
849+
}
850+
} else if (FINUFFT_UNLIKELY(i1 + ns + (padding + 1) / 2 >= N1)) {
851+
for (uint8_t dx = 0; dx < ns; ++dx, ++j) {
842852
out[0] = xsimd::fma(du[2 * j], ker[dx], out[0]);
843853
out[1] = xsimd::fma(du[2 * j + 1], ker[dx], out[1]);
844854
}
845855
} else { // doesn't wrap
846-
using arch_t = typename simd_type::arch_type;
847-
static constexpr auto padding = get_padding<FLT, 2 * ns>();
848-
static constexpr auto alignment = arch_t::alignment();
849-
static constexpr auto simd_size = simd_type::size;
850-
static constexpr auto regular_part = (2 * ns + padding) & (-(2 * simd_size));
851-
const auto du_ptr = du + 2 * j;
856+
const auto du_ptr = du + 2 * j;
852857
simd_type res_low{0}, res_hi{0};
853858
for (uint8_t dx{0}; dx < regular_part; dx += 2 * simd_size) {
854859
const auto ker_v = simd_type::load_aligned(ker + dx / 2);
@@ -916,65 +921,90 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
916921
it."
917922
*/
918923
{
919-
FLT out[] = {0.0, 0.0};
920-
if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2) { // no wrapping: avoid ptrs
921-
using arch_t = typename simd_type::arch_type;
922-
static constexpr auto padding = get_padding<FLT, 2 * ns>();
923-
static constexpr auto alignment = arch_t::alignment();
924-
static constexpr auto simd_size = simd_type::size;
925-
static constexpr uint8_t regular_part = (2 * ns + padding) & (-(2 * simd_size));
926-
static constexpr uint8_t line_vectors = (2 * ns + padding) / simd_size;
927-
const auto line = [du, N1, i2, i1, ker2]() {
928-
std::array<simd_type, line_vectors> line{};
924+
std::array<FLT, 2> out{0};
925+
// no wrapping: avoid ptrs
926+
using arch_t = typename simd_type::arch_type;
927+
static constexpr auto padding = get_padding<FLT, 2 * ns>();
928+
static constexpr auto alignment = arch_t::alignment();
929+
static constexpr auto simd_size = simd_type::size;
930+
static constexpr uint8_t regular_part = (2 * ns + padding) & (-(2 * simd_size));
931+
static constexpr uint8_t line_vectors = (2 * ns + padding) / simd_size;
932+
if (FINUFFT_LIKELY(i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2)) {
933+
if (FINUFFT_LIKELY(i1 + ns + (padding + 1) / 2 < N1)) {
934+
const auto line = [du, N1, i2, i1, ker2]() constexpr noexcept {
935+
std::array<simd_type, line_vectors> line{};
936+
// block for first y line, to avoid explicitly initializing line with zeros
937+
{
938+
const auto l_ptr = du + 2 * (N1 * i2 + i1); // ptr to horiz line start in du
939+
const simd_type ker2_v{ker2[0]};
940+
for (uint8_t l{0}; l < line_vectors; ++l) {
941+
// l is like dx but for ns interleaved
942+
line[l] = ker2_v * simd_type::load_unaligned(l * simd_size + l_ptr);
943+
}
944+
}
945+
// add remaining const-y lines to the line (expensive inner loop)
946+
for (uint8_t dy{1}; dy < ns; dy++) {
947+
const auto l_ptr = du + 2 * (N1 * (i2 + dy) + i1); // (see above)
948+
const simd_type ker2_v{ker2[dy]};
949+
for (uint8_t l{0}; l < line_vectors; ++l) {
950+
line[l] = xsimd::fma(ker2_v, simd_type::load_unaligned(l * simd_size + l_ptr),
951+
line[l]);
952+
}
953+
}
954+
return line;
955+
}();
956+
// apply x kernel to the (interleaved) line and add together
957+
simd_type res_low{0}, res_hi{0};
958+
for (uint8_t i = 0; i < (line_vectors & ~1); // NOLINT(*-too-small-loop-variable)
959+
i += 2) {
960+
const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2);
961+
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
962+
const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t>);
963+
res_low = xsimd::fma(ker1low, line[i], res_low);
964+
res_hi = xsimd::fma(ker1hi, line[i + 1], res_hi);
965+
}
966+
if constexpr (line_vectors % 2) {
967+
const auto ker1_v =
968+
simd_type::load_aligned(ker1 + (line_vectors - 1) * simd_size / 2);
969+
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
970+
res_low = xsimd::fma(ker1low, line.back(), res_low);
971+
}
972+
const auto res = res_low + res_hi;
973+
alignas(alignment) std::array<FLT, simd_size> res_array{};
974+
res.store_aligned(res_array.data());
975+
for (uint8_t i{0}; i < simd_size; i += 2) {
976+
out[0] += res_array[i];
977+
out[1] += res_array[i + 1];
978+
}
979+
} else {
980+
// store a horiz line (interleaved real,imag)
981+
alignas(alignment) std::array<FLT, 2 * MAX_NSPREAD> line{};
929982
// block for first y line, to avoid explicitly initializing line with zeros
930983
{
931984
const auto l_ptr = du + 2 * (N1 * i2 + i1); // ptr to horiz line start in du
932-
const simd_type ker2_v{ker2[0]};
933-
for (uint8_t l{0}; l < line_vectors; ++l) {
934-
// l is like dx but for ns interleaved
935-
line[l] = ker2_v * simd_type::load_unaligned(l * simd_size + l_ptr);
985+
for (uint8_t l{0}; l < 2 * ns; ++l) { // l is like dx but for ns interleaved
986+
line[l] = ker2[0] * l_ptr[l];
936987
}
937988
}
938989
// add remaining const-y lines to the line (expensive inner loop)
939-
for (uint8_t dy{1}; dy < ns; dy++) {
940-
const auto l_ptr = du + 2 * (N1 * (i2 + dy) + i1); // (see above)
941-
const simd_type ker2_v{ker2[dy]};
942-
for (uint8_t l{0}; l < line_vectors; ++l) {
943-
line[l] = xsimd::fma(ker2_v, simd_type::load_unaligned(l * simd_size + l_ptr),
944-
line[l]);
990+
for (uint8_t dy{1}; dy < ns; ++dy) {
991+
const auto *l_ptr = du + 2 * (N1 * (i2 + dy) + i1); // (see above)
992+
for (uint8_t l{0}; l < 2 * ns; ++l) {
993+
line[l] = xsimd::fma(ker2[dy], l_ptr[l], line[l]);
945994
}
946995
}
947-
return line;
948-
}();
949-
// apply x kernel to the (interleaved) line and add together
950-
simd_type res_low{0}, res_hi{0};
951-
for (uint8_t i = 0; i < (line_vectors & ~1); // NOLINT(*-too-small-loop-variable)
952-
i += 2) {
953-
const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2);
954-
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
955-
const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t>);
956-
res_low = xsimd::fma(ker1low, line[i], res_low);
957-
res_hi = xsimd::fma(ker1hi, line[i + 1], res_hi);
958-
}
959-
if constexpr (line_vectors % 2) {
960-
const auto ker1_v =
961-
simd_type::load_aligned(ker1 + (line_vectors - 1) * simd_size / 2);
962-
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
963-
res_low = xsimd::fma(ker1low, line.back(), res_low);
964-
}
965-
const auto res = res_low + res_hi;
966-
alignas(alignment) std::array<FLT, simd_size> res_array{};
967-
res.store_aligned(res_array.data());
968-
for (uint8_t i{0}; i < simd_size; i += 2) {
969-
out[0] += res_array[i];
970-
out[1] += res_array[i + 1];
996+
// apply x kernel to the (interleaved) line and add together
997+
for (uint8_t dx{0}; dx < ns; dx++) {
998+
out[0] = xsimd::fma(line[2 * dx], ker1[dx], out[0]);
999+
out[1] = xsimd::fma(line[2 * dx + 1], ker1[dx], out[1]);
1000+
}
9711001
}
9721002
} else { // wraps somewhere: use ptr list
9731003
// this is slower than above, but occurs much less often, with fractional
9741004
// rate O(ns/min(N1,N2)). Thus this code doesn't need to be so optimized.
975-
BIGINT j1[MAX_NSPREAD], j2[MAX_NSPREAD]; // 1d ptr lists
976-
BIGINT x = i1, y = i2; // initialize coords
977-
for (uint8_t d{0}; d < ns; d++) { // set up ptr lists
1005+
std::array<UBIGINT, ns> j1{}, j2{}; // 1d ptr lists
1006+
auto x = i1, y = i2; // initialize coords
1007+
for (uint8_t d{0}; d < ns; d++) { // set up ptr lists
9781008
if (x < 0) x += N1;
9791009
if (x >= N1) x -= N1;
9801010
j1[d] = x++;
@@ -983,10 +1013,10 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
9831013
j2[d] = y++;
9841014
}
9851015
for (uint8_t dy{0}; dy < ns; dy++) { // use the pts lists
986-
BIGINT oy = N1 * j2[dy]; // offset due to y
1016+
const auto oy = N1 * j2[dy]; // offset due to y
9871017
for (uint8_t dx{0}; dx < ns; dx++) {
988-
FLT k = ker1[dx] * ker2[dy];
989-
BIGINT j = oy + j1[dx];
1018+
const auto k = ker1[dx] * ker2[dy];
1019+
const UBIGINT j = oy + j1[dx];
9901020
out[0] += du[2 * j] * k;
9911021
out[1] += du[2 * j + 1] * k;
9921022
}
@@ -996,7 +1026,7 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
9961026
target[1] = out[1];
9971027
}
9981028

999-
template<uint8_t ns>
1029+
template<uint8_t ns, class simd_type>
10001030
void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
10011031
const FLT *ker2, const FLT *ker3, const BIGINT i1, const BIGINT i2,
10021032
const BIGINT i3, const BIGINT N1, const BIGINT N2, const BIGINT N3)
@@ -1024,37 +1054,86 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
10241054
(see above note in interp_square)
10251055
*/
10261056
{
1027-
FLT out[] = {0.0, 0.0};
1028-
if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2 && i3 >= 0 && i3 + ns <= N3) {
1029-
// no wrapping: avoid ptrs (by far the most common case)
1030-
FLT line[2 * MAX_NSPREAD]; // store a horiz line (interleaved real,imag)
1031-
// initialize line with zeros; hard to avoid here, but overhead small in 3D
1032-
for (int l = 0; l < 2 * ns; l++) {
1033-
line[l] = 0;
1034-
}
1035-
// co-add y and z contributions to line in x; do not apply x kernel yet
1036-
// This is expensive innermost loop
1037-
for (int dz = 0; dz < ns; dz++) {
1038-
BIGINT oz = N1 * N2 * (i3 + dz); // offset due to z
1039-
for (int dy = 0; dy < ns; dy++) {
1040-
const FLT *lptr = du + 2 * (oz + N1 * (i2 + dy) + i1); // ptr start of line
1041-
FLT ker23 = ker2[dy] * ker3[dz];
1042-
for (int l = 0; l < 2 * ns; ++l) { // loop over ns interleaved (R,I) pairs
1043-
line[l] += lptr[l] * ker23;
1057+
std::array<FLT, 2> out{0};
1058+
using arch_t = typename simd_type::arch_type;
1059+
static constexpr auto padding = get_padding<FLT, 2 * ns>();
1060+
static constexpr auto alignment = arch_t::alignment();
1061+
static constexpr auto simd_size = simd_type::size;
1062+
static constexpr uint8_t line_vectors = (2 * ns + padding) / simd_size;
1063+
const auto in_bounds_1 = (i1 >= 0) & (i1 + ns <= N1);
1064+
const auto in_bounds_2 = (i2 >= 0) & (i2 + ns <= N2);
1065+
const auto in_bounds_3 = (i3 >= 0) & (i3 + ns <= N3);
1066+
if (FINUFFT_LIKELY(in_bounds_1 && in_bounds_2 && in_bounds_3)) {
1067+
if (FINUFFT_LIKELY(i1 + ns + (padding + 1) / 2 < N1)) {
1068+
std::array<simd_type, line_vectors> line{0};
1069+
for (uint8_t dz{0}; dz < ns; ++dz) {
1070+
const UBIGINT oz = N1 * N2 * (i3 + dz);
1071+
for (uint8_t dy{0}; dy < ns; ++dy) {
1072+
const auto du_ptr = du + 2 * (oz + N1 * (i2 + dy) + i1); // (see above)
1073+
const simd_type ker23_v{ker2[dy] * ker3[dz]};
1074+
for (uint8_t l{0}; l < line_vectors; ++l) {
1075+
const auto du_pt = simd_type::load_unaligned(l * simd_size + du_ptr);
1076+
line[l] = xsimd::fma(ker23_v, du_pt, line[l]);
1077+
}
10441078
}
10451079
}
1046-
}
1047-
// apply x kernel to the (interleaved) line and add together (cheap)
1048-
for (int dx = 0; dx < ns; dx++) {
1049-
out[0] += line[2 * dx] * ker1[dx];
1050-
out[1] += line[2 * dx + 1] * ker1[dx];
1080+
// apply x kernel to the (interleaved) line and add together
1081+
const auto res_array = [ker1](const auto &line) constexpr noexcept {
1082+
const auto res = [ker1](const auto &line) constexpr noexcept {
1083+
simd_type res_low{0}, res_hi{0};
1084+
for (uint8_t i{0}; i < (line_vectors & ~1); // NOLINT(*-too-small-loop-variable)
1085+
i += 2) {
1086+
const auto ker1_v = simd_type::load_aligned(i * simd_size / 2 + ker1);
1087+
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
1088+
const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t>);
1089+
res_low = xsimd::fma(ker1low, line[i], res_low);
1090+
res_hi = xsimd::fma(ker1hi, line[i + 1], res_hi);
1091+
}
1092+
if constexpr (line_vectors % 2) {
1093+
const auto ker1_v =
1094+
simd_type::load_aligned((line_vectors - 1) * simd_size / 2 + ker1);
1095+
const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
1096+
res_low = xsimd::fma(ker1low, line.back(), res_low);
1097+
}
1098+
return res_low + res_hi;
1099+
}(line);
1100+
alignas(alignment) std::array<FLT, simd_size> res_array{};
1101+
res.store_aligned(res_array.data());
1102+
return res_array;
1103+
}(line);
1104+
for (uint8_t i{0}; i < simd_size; i += 2) {
1105+
out[0] += res_array[i];
1106+
out[1] += res_array[i + 1];
1107+
}
1108+
} else {
1109+
// no wrapping: avoid ptrs (by far the most common case)
1110+
// store a horiz line (interleaved real,imag)
1111+
// initialize line with zeros; hard to avoid here, but overhead small in 3D
1112+
std::array<FLT, 2 * MAX_NSPREAD> line{0};
1113+
// co-add y and z contributions to line in x; do not apply x kernel yet
1114+
// This is expensive innermost loop
1115+
for (uint8_t dz{0}; dz < ns; ++dz) {
1116+
const auto oz = N1 * N2 * (i3 + dz); // offset due to z
1117+
for (uint8_t dy{0}; dy < ns; ++dy) {
1118+
const auto l_ptr = du + 2 * (oz + N1 * (i2 + dy) + i1); // ptr start of line
1119+
const auto ker23 = ker2[dy] * ker3[dz];
1120+
for (uint8_t l{0}; l < 2 * ns; ++l) { // loop over ns interleaved (R,I) pairs
1121+
line[l] = xsimd::fma(l_ptr[l], ker23, line[l]);
1122+
}
1123+
}
1124+
}
1125+
// apply x kernel to the (interleaved) line and add together (cheap)
1126+
for (uint8_t dx{0}; dx < ns; ++dx) {
1127+
out[0] += line[2 * dx] * ker1[dx];
1128+
out[1] += line[2 * dx + 1] * ker1[dx];
1129+
}
10511130
}
10521131
} else { // wraps somewhere: use ptr list
10531132
// ...can be slower since this case only happens with probability
10541133
// O(ns/min(N1,N2,N3))
1055-
BIGINT j1[MAX_NSPREAD], j2[MAX_NSPREAD], j3[MAX_NSPREAD]; // 1d ptr lists
1056-
BIGINT x = i1, y = i2, z = i3; // initialize coords
1057-
for (int d = 0; d < ns; d++) { // set up ptr lists
1134+
alignas(alignment) std::array<UBIGINT, ns> j1{}, j2{}, j3{}; // 1d ptr lists
1135+
auto x = i1, y = i2, z = i3; // initialize coords
1136+
for (uint8_t d{0}; d < ns; d++) { // set up ptr lists
10581137
if (x < 0) x += N1;
10591138
if (x >= N1) x -= N1;
10601139
j1[d] = x++;
@@ -1065,14 +1144,14 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
10651144
if (z >= N3) z -= N3;
10661145
j3[d] = z++;
10671146
}
1068-
for (int dz = 0; dz < ns; dz++) { // use the pts lists
1069-
BIGINT oz = N1 * N2 * j3[dz]; // offset due to z
1070-
for (int dy = 0; dy < ns; dy++) {
1071-
BIGINT oy = oz + N1 * j2[dy]; // offset due to y & z
1072-
FLT ker23 = ker2[dy] * ker3[dz];
1073-
for (int dx = 0; dx < ns; dx++) {
1074-
FLT k = ker1[dx] * ker23;
1075-
BIGINT j = oy + j1[dx];
1147+
for (uint8_t dz{0}; dz < ns; dz++) { // use the pts lists
1148+
const auto oz = N1 * N2 * j3[dz]; // offset due to z
1149+
for (uint8_t dy{0}; dy < ns; dy++) {
1150+
const auto oy = oz + N1 * j2[dy]; // offset due to y & z
1151+
const auto ker23 = ker2[dy] * ker3[dz];
1152+
for (uint8_t dx{0}; dx < ns; dx++) {
1153+
const auto k = ker1[dx] * ker23;
1154+
const auto j = oy + j1[dx];
10761155
out[0] += du[2 * j] * k;
10771156
out[1] += du[2 * j + 1] * k;
10781157
}

0 commit comments

Comments
 (0)