Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 180 additions & 4 deletions src/spreadinterp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

using namespace std;
Expand Down Expand Up @@ -73,8 +74,8 @@ static FINUFFT_ALWAYS_INLINE auto ker_eval(FLT *FINUFFT_RESTRICT ker,
const V... elems) noexcept;
static FINUFFT_ALWAYS_INLINE FLT fold_rescale(FLT x, UBIGINT N) noexcept;
template<class simd_type>
FINUFFT_ALWAYS_INLINE static simd_type fold_rescale(const simd_type &x,
UBIGINT N) noexcept;
static FINUFFT_ALWAYS_INLINE simd_type fold_rescale(const simd_type &x,
const BIGINT N) noexcept;
static FINUFFT_ALWAYS_INLINE void set_kernel_args(
FLT *args, FLT x, const finufft_spread_opts &opts) noexcept;
static FINUFFT_ALWAYS_INLINE void evaluate_kernel_vector(
Expand Down Expand Up @@ -114,6 +115,10 @@ static void bin_sort_singlethread(BIGINT *ret, UBIGINT M, const FLT *kx, const F
const FLT *kz, UBIGINT N1, UBIGINT N2, UBIGINT N3,
double bin_size_x, double bin_size_y, double bin_size_z,
int debug);
static void bin_sort_singlethread_vector(BIGINT *ret, UBIGINT M, const FLT *kx,
const FLT *ky, const FLT *kz, UBIGINT N1,
UBIGINT N2, UBIGINT N3, double bin_size_x,
double bin_size_y, double bin_size_z, int debug);
void bin_sort_multithread(BIGINT *ret, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, UBIGINT N1,
UBIGINT N2, UBIGINT N3, double bin_size_x, double bin_size_y,
double bin_size_z, int debug, int nthr);
Expand Down Expand Up @@ -296,8 +301,8 @@ int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT
if (sort_nthr == 0) // multithreaded auto choice: when N>>M, one thread is better!
sort_nthr = (10 * M > N) ? maxnthr : 1; // heuristic
if (sort_nthr == 1)
bin_sort_singlethread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x,
bin_size_y, bin_size_z, sort_debug);
bin_sort_singlethread_vector(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x,
bin_size_y, bin_size_z, sort_debug);
else // sort_nthr>1, user fixes # threads (>=2)
bin_sort_multithread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x,
bin_size_y, bin_size_z, sort_debug, sort_nthr);
Expand Down Expand Up @@ -1841,6 +1846,177 @@ void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3,
}
}

template<typename simd_type, std::size_t... Is>
static FINUFFT_ALWAYS_INLINE simd_type make_incremented_vectors(
std::index_sequence<Is...>) {
return simd_type{Is...}; // Creates a SIMD vector with the index sequence
}

template<std::size_t N, typename simd_type>
static FINUFFT_ALWAYS_INLINE bool has_duplicates_impl(const simd_type &vec) noexcept {

if constexpr (N == simd_type::size) {
return false;
} else {
const auto duplicates =
(xsimd::rotate_right<N>(vec) == vec) != xsimd::batch_bool<bool>(false);
if (duplicates) {
return true;
}
return has_duplicates_impl<N + 1>(vec);
}
}

template<typename simd_type>
static FINUFFT_ALWAYS_INLINE bool has_duplicates(const simd_type &vec) noexcept {
return has_duplicates_impl<1, simd_type>(vec);
}

void bin_sort_singlethread_vector(
BIGINT *ret, const UBIGINT M, const FLT *kx, const FLT *ky, const FLT *kz,
const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, const double bin_size_x,
const double bin_size_y, const double bin_size_z, const int debug)
/* Returns permutation of all nonuniform points with good RAM access,
* ie less cache misses for spreading, in 1D, 2D, or 3D. Single-threaded version
*
* This is achieved by binning into cuboids (of given bin_size within the
* overall box domain), then reading out the indices within
* these bins in a Cartesian cuboid ordering (x fastest, y med, z slowest).
* Finally the permutation is inverted, so that the good ordering is: the
* NU pt of index ret[0], the NU pt of index ret[1],..., NU pt of index ret[M-1]
*
* Inputs: M - number of input NU points.
* kx,ky,kz - length-M arrays of real coords of NU pts in [-pi, pi).
* Points outside this range are folded into it.
* N1,N2,N3 - integer sizes of overall box (N2=N3=1 for 1D, N3=1 for 2D)
* bin_size_x,y,z - what binning box size to use in each dimension
* (in rescaled coords where ranges are [0,Ni] ).
* For 1D, only bin_size_x is used; for 2D, it & bin_size_y.
* Output:
* writes to ret a vector list of indices, each in the range 0,..,M-1.
* Thus, ret must have been preallocated for M BIGINTs.
*
* Notes: I compared RAM usage against declaring an internal vector and passing
* back; the latter used more RAM and was slower.
* Avoided the bins array, as in JFM's spreader of 2016,
* tidied up, early 2017, Barnett.
* Timings (2017): 3s for M=1e8 NU pts on 1 core of i7; 5s on 1 core of xeon.
* Simplified by Martin Reinecke, 6/19/23 (no apparent effect on speed).
*/
{
using simd_type = xsimd::batch<FLT>;
using int_simd_type = xsimd::batch<xsimd::as_integer_t<FLT>>;
using arch_t = simd_type::arch_type;
static constexpr auto simd_size = simd_type::size;
static constexpr auto alignment = arch_t::alignment();

static constexpr auto to_array = [](const auto &vec) constexpr noexcept {
using T = decltype(std::decay_t<decltype(vec)>());
alignas(alignment) std::array<typename T::value_type, T::size> array{};
vec.store_aligned(array.data());
return array;
};

const auto isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not)
// here the +1 is needed to allow round-off error causing i1=N1/bin_size_x,
// for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1).
// Note that round-off near kx=-pi stably rounds negative to i1=0.
const auto nbins1 = BIGINT(FLT(N1) / bin_size_x + 1);
const auto nbins2 = isky ? BIGINT(FLT(N2) / bin_size_y + 1) : 1;
const auto nbins3 = iskz ? BIGINT(FLT(N3) / bin_size_z + 1) : 1;
const auto nbins = nbins1 * nbins2 * nbins3;
const auto inv_bin_size_x = FLT(1.0 / bin_size_x);
const auto inv_bin_size_y = FLT(1.0 / bin_size_y);
const auto inv_bin_size_z = FLT(1.0 / bin_size_z);
const auto inv_bin_size_x_vec = simd_type(1.0 / bin_size_x);
const auto inv_bin_size_y_vec = simd_type(1.0 / bin_size_y);
const auto inv_bin_size_z_vec = simd_type(1.0 / bin_size_z);
const auto zero = to_int(simd_type(0));
const auto increment =
to_int(make_incremented_vectors<simd_type>(std::make_index_sequence<simd_size>{}));

// count how many pts in each bin
alignas(alignment) std::vector<xsimd::as_integer_t<FLT>> counts(nbins + simd_size, 0);
const auto simd_M = M & (-simd_size); // round down to simd_size multiple
UBIGINT i{};
for (i = 0; i < simd_M; i += simd_size) {
const auto i1 = xsimd::to_int(
fold_rescale(simd_type::load_unaligned(kx + i), N1) * inv_bin_size_x_vec);
const auto i2 =
isky ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(ky + i), N2) *
inv_bin_size_y_vec)
: zero;
const auto i3 =
iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) *
inv_bin_size_z_vec)
: zero;
const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
if (has_duplicates(bin)) {
const auto bin_array = to_array(bin);
for (int j = 0; j < simd_size; j++) {
++counts[bin_array[j]];
}
} else {
const auto bins = int_simd_type::gather(counts.data(), bin);
const auto incr_bins = xsimd::incr(bins);
incr_bins.scatter(counts.data(), bin);
}
}

for (; i < M; i++) {
// find the bin index in however many dims are needed
const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
++counts[bin];
}

// compute the offsets directly in the counts array (no offset array)
BIGINT current_offset = 0;
for (i = 0; i < nbins; i++) {
BIGINT tmp = counts[i];
counts[i] = current_offset; // Reinecke's cute replacement of counts[i]
current_offset += tmp;
} // (counts now contains the index offsets for each bin)

for (i = 0; i < simd_M; i += simd_size) {
const auto i1 = xsimd::to_int(
fold_rescale(simd_type::load_unaligned(kx + i), N1) * inv_bin_size_x_vec);
const auto i2 =
isky ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(ky + i), N2) *
inv_bin_size_y_vec)
: zero;
const auto i3 =
iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) *
inv_bin_size_z_vec)
: zero;
const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
if (has_duplicates(bin)) {
const auto bin_array = to_array(to_int(bin));
for (int j = 0; j < simd_size; j++) {
ret[counts[bin_array[j]]] = j + i;
counts[bin_array[j]]++;
}
} else {
const auto bins = decltype(bin)::gather(counts.data(), bin);
const auto incr_bins = xsimd::incr(bins);
incr_bins.scatter(counts.data(), bin);
const auto result = increment + int_simd_type(i);
result.scatter(ret, bins);
}
}
for (; i < M; i++) {
// find the bin index (again! but better than using RAM)
const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
ret[counts[bin]] = i; // fill the inverse map on the fly
++counts[bin]; // update the offsets
}
}

void bin_sort_singlethread(
BIGINT *ret, const UBIGINT M, const FLT *kx, const FLT *ky, const FLT *kz,
const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, const double bin_size_x,
Expand Down