@@ -1036,7 +1036,11 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel(
10361036 // +-----------------------+
10371037 // |re|im|re|im|re|im|re|im|
10381038 // +-----------------------+
1039- const auto dd_pt = initialize_complex_batch<batch_t >(dd[i * 2 ], dd[i * 2 + 1 ]);
1039+ // const auto dd_pt = initialize_complex_batch<batch_t>(dd[i * 2], dd[i * 2 + 1]);
1040+ const auto dd_pt = [dd, i]() constexpr noexcept {
1041+ const batch_t ddi{dd[i * 2 ]}, ddj{dd[i * 2 + 1 ]};
1042+ return xsimd::zip_lo (ddi, ddj);
1043+ }();
10401044 // ceil offset, hence rounding, must match that in get_subgrid...
10411045 const auto i1 = BIGINT (std::ceil (kx[i] - ns2)); // fine grid start index
10421046 // FLT(i1) has different semantics and results in an extra cast
@@ -1205,7 +1209,12 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel(
12051209 static constexpr auto ns2 = ns * FLT (0.5 ); // half spread width
12061210 std::fill (du, du + 2 * size1 * size2, 0 ); // initialized to 0 due to the padding
12071211 for (uint64_t pt = 0 ; pt < M; pt++) { // loop over NU pts
1208- const auto dd_pt = initialize_complex_batch<batch_t >(dd[pt * 2 ], dd[pt * 2 + 1 ]);
1212+ // const auto dd_pt = initialize_complex_batch<batch_t>(dd[pt * 2], dd[pt * 2 +
1213+ // 1]);
1214+ const auto dd_pt = [dd, pt]() constexpr noexcept {
1215+ const batch_t ddi{dd[pt * 2 ]}, ddj{dd[pt * 2 + 1 ]};
1216+ return xsimd::zip_lo (ddi, ddj);
1217+ }();
12091218 // ceil offset, hence rounding, must match that in get_subgrid...
12101219 const auto i1 = (BIGINT)std::ceil (kx[pt] - ns2); // fine grid start indices
12111220 const auto i2 = (BIGINT)std::ceil (ky[pt] - ns2);
@@ -1309,7 +1318,12 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel(
13091318 static constexpr auto ns2 = ns * FLT (0.5 ); // half spread width
13101319 std::fill (du, du + 2 * size1 * size2 * size3, 0 );
13111320 for (uint64_t pt = 0 ; pt < M; pt++) { // loop over NU pts
1312- const auto dd_pt = initialize_complex_batch<batch_t >(dd[pt * 2 ], dd[pt * 2 + 1 ]);
1321+ // const auto dd_pt = initialize_complex_batch<batch_t>(dd[pt * 2], dd[pt * 2 +
1322+ // 1]);
1323+ const auto dd_pt = [dd, pt]() constexpr noexcept {
1324+ const batch_t ddi{dd[pt * 2 ]}, ddj{dd[pt * 2 + 1 ]};
1325+ return xsimd::zip_lo (ddi, ddj);
1326+ }();
13131327 // ceil offset, hence rounding, must match that in get_subgrid...
13141328 const auto i1 = (BIGINT)std::ceil (kx[pt] - ns2); // fine grid start indices
13151329 const auto i2 = (BIGINT)std::ceil (ky[pt] - ns2);
0 commit comments