@@ -1256,29 +1256,30 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
12561256 if (in_bounds_1 && in_bounds_2 && in_bounds_3 && (i1 + ns + (padding + 1 ) / 2 < N1)) {
12571257 const auto line = [N1, N2, i1, i2, i3, ker2, ker3, du]() constexpr noexcept {
12581258 std::array<simd_type, line_vectors> line{0 }, du_pts{};
1259- alignas (alignment) std::array<FLT, ker23_size > ker23_array{};
1260- const UBIGINT base_oz = N1 * N2 * i3; // Move invariant part outside the loop
1259+ std::array<simd_type, ns > ker23_array{};
1260+ const auto base_oz = N1 * N2 * UBIGINT (i3); // Move invariant part outside the loop
12611261 for (uint8_t dz{0 }; dz < ns; ++dz) {
1262- const auto oz = base_oz + N1 * N2 * dz; // Only the dz part is inside the loop
1263- const auto base_du_ptr = du + 2 * UBIGINT (oz + N1 * i2 + i1 );
1262+ const auto oz = base_oz + N1 * N2 * dz; // Only the dz part is inside the loop
1263+ const auto base_du_ptr = du + 2 * (oz + N1 * i2 + UBIGINT (i1) );
12641264 {
1265+ alignas (alignment) std::array<FLT, ker23_size> ker23_scalar{};
12651266 const simd_type ker3_v{ker3[dz]};
12661267 for (uint8_t dy{0 }; dy < ns; dy += simd_size) {
12671268 const auto ker2_v = simd_type::load_aligned (ker2 + dy);
12681269 const auto ker23_v = ker2_v * ker3_v;
1269- ker23_v.store_aligned (ker23_array.data () + dy);
1270+ ker23_v.store_aligned (ker23_scalar.data () + dy);
1271+ }
1272+ for (uint8_t dy{0 }; dy < ns; ++dy) {
1273+ ker23_array[dy] = ker23_scalar[dy];
12701274 }
12711275 }
12721276 for (uint8_t dy{0 }; dy < ns; ++dy) {
12731277 const auto du_ptr = base_du_ptr + 2 * N1 * dy; // (see above)
1274- const simd_type ker23_v{ker23_array[dy]};
1275- // First loop: Load all du_pt into the du_pts array
12761278 for (uint8_t l{0 }; l < line_vectors; ++l) {
1277- du_pts[l] = simd_type::load_unaligned (l * simd_size + du_ptr );
1279+ du_pts[l] = simd_type::load_unaligned (l * simd_size + base_du_ptr );
12781280 }
1279- // Second loop: Perform the multiplication
12801281 for (uint8_t l{0 }; l < line_vectors; ++l) {
1281- line[l] = xsimd::fma (ker23_v , du_pts[l], line[l]);
1282+ line[l] = xsimd::fma (ker23_array[dy] , du_pts[l], line[l]);
12821283 }
12831284 }
12841285 }
0 commit comments