@@ -1063,12 +1063,15 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
10631063 // new array du_pts to store the du values for the current y line
10641064 std::array<simd_type, line_vectors> line{}, du_pts{};
10651065 // block for first y line, to avoid explicitly initializing line with zeros
1066- const auto l_ptr_base = du + 2 * (N1 * i2 + i1); // ptr to horiz line start in du
1066+ const auto l_ptr_base = du + 2 * UBIGINT (N1 * i2 + i1); // ptr to horiz line start
1067+ // in du
1068+ for (uint8_t l{0 }; l < line_vectors; ++l) {
1069+ du_pts[l] = simd_type::load_unaligned (l * simd_size + l_ptr_base);
1070+ }
10671071 for (uint8_t l{0 }; l < line_vectors; ++l) {
10681072 // l is like dx but for ns interleaved
10691073 // no fancy trick needed to multiply real,imag by ker2
1070- const auto du_pt = simd_type::load_unaligned (l * simd_size + l_ptr_base);
1071- line[l] = du_pt * simd_type{ker2[0 ]};
1074+ line[l] = du_pts[l] * simd_type{ker2[0 ]};
10721075 }
10731076 // add remaining const-y lines to the line (expensive inner loop)
10741077 for (uint8_t dy{1 }; dy < ns; dy++) {
@@ -1260,7 +1263,7 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1,
12601263 const auto base_oz = N1 * N2 * UBIGINT (i3); // Move invariant part outside the loop
12611264 for (uint8_t dz{0 }; dz < ns; ++dz) {
12621265 const auto oz = base_oz + N1 * N2 * dz; // Only the dz part is inside the loop
1263- const auto base_du_ptr = du + 2 * (oz + N1 * i2 + UBIGINT (i1) );
1266+ const auto base_du_ptr = du + 2 * UBIGINT (oz + N1 * i2 + i1 );
12641267 {
12651268 alignas (alignment) std::array<FLT, ker23_size> ker23_scalar{};
12661269 const simd_type ker3_v{ker3[dz]};
0 commit comments