Skip to content

Commit 363a8bd

Browse files
committed
3D is now faster than master
1 parent 223edc6 commit 363a8bd

File tree

1 file changed

+11
-10
lines changed

1 file changed

+11
-10
lines changed

src/spreadinterp.cpp

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1256,29 +1256,30 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
12561256
if (in_bounds_1 && in_bounds_2 && in_bounds_3 && (i1 + ns + (padding + 1) / 2 < N1)) {
12571257
const auto line = [N1, N2, i1, i2, i3, ker2, ker3, du]() constexpr noexcept {
12581258
std::array<simd_type, line_vectors> line{0}, du_pts{};
1259-
alignas(alignment) std::array<FLT, ker23_size> ker23_array{};
1260-
const UBIGINT base_oz = N1 * N2 * i3; // Move invariant part outside the loop
1259+
std::array<simd_type, ns> ker23_array{};
1260+
const auto base_oz = N1 * N2 * UBIGINT(i3); // Move invariant part outside the loop
12611261
for (uint8_t dz{0}; dz < ns; ++dz) {
1262-
const auto oz = base_oz + N1 * N2 * dz; // Only the dz part is inside the loop
1263-
const auto base_du_ptr = du + 2 * UBIGINT(oz + N1 * i2 + i1);
1262+
const auto oz = base_oz + N1 * N2 * dz; // Only the dz part is inside the loop
1263+
const auto base_du_ptr = du + 2 * (oz + N1 * i2 + UBIGINT(i1));
12641264
{
1265+
alignas(alignment) std::array<FLT, ker23_size> ker23_scalar{};
12651266
const simd_type ker3_v{ker3[dz]};
12661267
for (uint8_t dy{0}; dy < ns; dy += simd_size) {
12671268
const auto ker2_v = simd_type::load_aligned(ker2 + dy);
12681269
const auto ker23_v = ker2_v * ker3_v;
1269-
ker23_v.store_aligned(ker23_array.data() + dy);
1270+
ker23_v.store_aligned(ker23_scalar.data() + dy);
1271+
}
1272+
for (uint8_t dy{0}; dy < ns; ++dy) {
1273+
ker23_array[dy] = ker23_scalar[dy];
12701274
}
12711275
}
12721276
for (uint8_t dy{0}; dy < ns; ++dy) {
12731277
const auto du_ptr = base_du_ptr + 2 * N1 * dy; // (see above)
1274-
const simd_type ker23_v{ker23_array[dy]};
1275-
// First loop: Load all du_pt into the du_pts array
12761278
for (uint8_t l{0}; l < line_vectors; ++l) {
1277-
du_pts[l] = simd_type::load_unaligned(l * simd_size + du_ptr);
1279+
du_pts[l] = simd_type::load_unaligned(l * simd_size + base_du_ptr);
12781280
}
1279-
// Second loop: Perform the multiplication
12801281
for (uint8_t l{0}; l < line_vectors; ++l) {
1281-
line[l] = xsimd::fma(ker23_v, du_pts[l], line[l]);
1282+
line[l] = xsimd::fma(ker23_array[dy], du_pts[l], line[l]);
12821283
}
12831284
}
12841285
}

0 commit comments

Comments (0)