Skip to content

Commit 28022fe

Browse files
committed
scalar-bin-sort
1 parent 53606d8 commit 28022fe

File tree

1 file changed

+19
-204
lines changed

1 file changed

+19
-204
lines changed

src/spreadinterp.cpp

Lines changed: 19 additions & 204 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,6 @@ FINUFFT_NEVER_INLINE
5050
void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3,
5151
BIGINT padded_size1, BIGINT size1, BIGINT size2, BIGINT size3,
5252
BIGINT M0);
53-
FINUFFT_ALWAYS_INLINE xsimd::batch<FLT> fold_rescale_vec(xsimd::batch<FLT> x, FLT N);
54-
FINUFFT_ALWAYS_INLINE xsimd::batch<FLT> fold(xsimd::batch<FLT> x);
5553
} // namespace
5654
// declarations of purely internal functions... (thus need not be in .h)
5755
template<uint8_t ns, uint8_t kerevalmeth, class T,
@@ -1475,80 +1473,6 @@ void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3,
14751473
}
14761474
}
14771475

1478-
// void bin_sort_singlethread(
1479-
// BIGINT *ret, const BIGINT M, const FLT *kx, const FLT *ky, const FLT *kz,
1480-
// const BIGINT N1, const BIGINT N2, const BIGINT N3, const double bin_size_x,
1481-
// const double bin_size_y, const double bin_size_z, const int debug)
1482-
///* Returns permutation of all nonuniform points with good RAM access,
1483-
// * ie less cache misses for spreading, in 1D, 2D, or 3D. Single-threaded version
1484-
// *
1485-
// * This is achieved by binning into cuboids (of given bin_size within the
1486-
// * overall box domain), then reading out the indices within
1487-
// * these bins in a Cartesian cuboid ordering (x fastest, y med, z slowest).
1488-
// * Finally the permutation is inverted, so that the good ordering is: the
1489-
// * NU pt of index ret[0], the NU pt of index ret[1],..., NU pt of index ret[M-1]
1490-
// *
1491-
// * Inputs: M - number of input NU points.
1492-
// * kx,ky,kz - length-M arrays of real coords of NU pts in [-pi, pi).
1493-
// * Points outside this range are folded into it.
1494-
// * N1,N2,N3 - integer sizes of overall box (N2=N3=1 for 1D, N3=1 for 2D)
1495-
// * bin_size_x,y,z - what binning box size to use in each dimension
1496-
// * (in rescaled coords where ranges are [0,Ni] ).
1497-
// * For 1D, only bin_size_x is used; for 2D, it & bin_size_y.
1498-
// * Output:
1499-
// * writes to ret a vector list of indices, each in the range 0,..,M-1.
1500-
// * Thus, ret must have been preallocated for M BIGINTs.
1501-
// *
1502-
// * Notes: I compared RAM usage against declaring an internal vector and passing
1503-
// * back; the latter used more RAM and was slower.
1504-
// * Avoided the bins array, as in JFM's spreader of 2016,
1505-
// * tidied up, early 2017, Barnett.
1506-
// * Timings (2017): 3s for M=1e8 NU pts on 1 core of i7; 5s on 1 core of xeon.
1507-
// * Simplified by Martin Reinecke, 6/19/23 (no apparent effect on speed).
1508-
// */
1509-
//{
1510-
// const auto isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not)
1511-
// // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x,
1512-
// // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1).
1513-
// // Note that round-off near kx=-pi stably rounds negative to i1=0.
1514-
// const auto nbins1 = BIGINT(FLT(N1) / bin_size_x + 1);
1515-
// const auto nbins2 = isky ? BIGINT(FLT(N2) / bin_size_y + 1) : 1;
1516-
// const auto nbins3 = iskz ? BIGINT(FLT(N3) / bin_size_z + 1) : 1;
1517-
// const auto nbins = nbins1 * nbins2 * nbins3;
1518-
// const auto inv_bin_size_x = FLT(1.0 / bin_size_x);
1519-
// const auto inv_bin_size_y = FLT(1.0 / bin_size_y);
1520-
// const auto inv_bin_size_z = FLT(1.0 / bin_size_z);
1521-
// // count how many pts in each bin
1522-
// std::vector<BIGINT> counts(nbins, 0);
1523-
//
1524-
// for (auto i = 0; i < M; i++) {
1525-
// // find the bin index in however many dims are needed
1526-
// const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
1527-
// const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
1528-
// const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
1529-
// const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
1530-
// ++counts[bin];
1531-
// }
1532-
//
1533-
// // compute the offsets directly in the counts array (no offset array)
1534-
// BIGINT current_offset = 0;
1535-
// for (BIGINT i = 0; i < nbins; i++) {
1536-
// BIGINT tmp = counts[i];
1537-
// counts[i] = current_offset; // Reinecke's cute replacement of counts[i]
1538-
// current_offset += tmp;
1539-
// } // (counts now contains the index offsets for each bin)
1540-
//
1541-
// for (auto i = 0; i < M; i++) {
1542-
// // find the bin index (again! but better than using RAM)
1543-
// const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
1544-
// const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
1545-
// const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
1546-
// const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
1547-
// ret[counts[bin]] = BIGINT(i); // fill the inverse map on the fly
1548-
// ++counts[bin]; // update the offsets
1549-
// }
1550-
//}
1551-
15521476
void bin_sort_singlethread(
15531477
BIGINT *ret, const BIGINT M, const FLT *kx, const FLT *ky, const FLT *kz,
15541478
const BIGINT N1, const BIGINT N2, const BIGINT N3, const double bin_size_x,
@@ -1581,135 +1505,45 @@ void bin_sort_singlethread(
15811505
* Simplified by Martin Reinecke, 6/19/23 (no apparent effect on speed).
15821506
*/
15831507
{
1584-
static constexpr auto alignment = xsimd::batch<FLT>::arch_type::alignment();
15851508
const auto isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not)
15861509
// here the +1 is needed to allow round-off error causing i1=N1/bin_size_x,
15871510
// for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1).
15881511
// Note that round-off near kx=-pi stably rounds negative to i1=0.
1589-
const auto nbins1 = UBIGINT(FLT(N1) / bin_size_x + 1);
1590-
const auto nbins2 = isky ? UBIGINT(FLT(N2) / bin_size_y + 1) : 1;
1591-
const auto nbins3 = iskz ? UBIGINT(FLT(N3) / bin_size_z + 1) : 1;
1592-
const auto nbins12 = nbins1 * nbins2;
1512+
const auto nbins1 = BIGINT(FLT(N1) / bin_size_x + 1);
1513+
const auto nbins2 = isky ? BIGINT(FLT(N2) / bin_size_y + 1) : 1;
1514+
const auto nbins3 = iskz ? BIGINT(FLT(N3) / bin_size_z + 1) : 1;
15931515
const auto nbins = nbins1 * nbins2 * nbins3;
15941516
const auto inv_bin_size_x = FLT(1.0 / bin_size_x);
15951517
const auto inv_bin_size_y = FLT(1.0 / bin_size_y);
15961518
const auto inv_bin_size_z = FLT(1.0 / bin_size_z);
1597-
const auto fN1 = FLT(N1);
1598-
const auto fN2 = FLT(N2);
1599-
const auto fN3 = FLT(N3);
1600-
const auto rescale1 = fN1 * inv_bin_size_x;
1601-
const auto rescale2 = fN2 * inv_bin_size_y;
1602-
const auto rescale3 = fN3 * inv_bin_size_z;
1603-
1604-
static constexpr auto avx_width = xsimd::batch<FLT>::size;
1605-
const auto regular_part = M & (-avx_width);
1606-
16071519
// count how many pts in each bin
1608-
std::vector<UBIGINT> counts(nbins, 0);
1609-
1610-
static constexpr auto to_array = [](const auto bins) noexcept {
1611-
using contained_t = typename decltype(bins)::value_type;
1612-
alignas(alignment) std::array<contained_t, decltype(bins)::size> result{};
1613-
bins.store_aligned(result.data());
1614-
return result;
1615-
};
1616-
1617-
static constexpr auto to_uint = [](const xsimd::batch<FLT> bins) noexcept {
1618-
return xsimd::batch_cast<xsimd::as_unsigned_integer_t<FLT>>(bins);
1619-
};
1620-
1621-
const auto compute_bins = [=](const auto... args) constexpr noexcept {
1622-
const std::array<const FLT *, sizeof...(args)> k_arr = {args...};
1623-
//
1624-
auto bins0 = to_uint(fold(xsimd::load_unaligned(k_arr[0])) * rescale1);
1625-
auto bins1 = to_uint(FLT(0));
1626-
auto bins2 = to_uint(FLT(0));
1627-
if constexpr (sizeof...(args) > 1) {
1628-
const auto i2 = to_uint(fold(xsimd::load_unaligned(k_arr[1])) * rescale2);
1629-
bins1 = nbins1 * i2;
1630-
} else {
1631-
return bins0;
1632-
}
1633-
if constexpr (sizeof...(args) > 2) {
1634-
const auto i3 = to_uint(fold(xsimd::load_unaligned(k_arr[2])) * rescale3);
1635-
bins2 = nbins12 * i3;
1636-
} else {
1637-
return bins0 + bins1;
1638-
}
1639-
return bins0 + bins1 + bins2;
1640-
};
1641-
1642-
const auto increment_bins = [&counts](const auto bins) constexpr noexcept {
1643-
const auto bin_array = to_array(bins);
1644-
for (const auto bin : bin_array) {
1645-
++counts[bin];
1646-
}
1647-
};
1648-
1649-
const auto accumulate_bins = [&counts, &ret](const auto bins,
1650-
const auto i) constexpr noexcept {
1651-
const auto bin_array = to_array(bins);
1652-
for (uint8_t j{0}; j < avx_width; ++j) {
1653-
const auto bin = bin_array[j];
1654-
// fill the inverse map on the fly, careful of indexes errors
1655-
ret[counts[bin]] = i + j;
1656-
++counts[bin];
1657-
}
1658-
};
1520+
std::vector<BIGINT> counts(nbins, 0);
16591521

1660-
UBIGINT i{0};
1661-
if (iskz) {
1662-
for (; i < regular_part; i += avx_width) {
1663-
increment_bins(compute_bins(kx + i, ky + i, kz + i));
1664-
}
1665-
} else if (isky) {
1666-
for (; i < regular_part; i += avx_width) {
1667-
increment_bins(compute_bins(kx + i, ky + i));
1668-
}
1669-
} else {
1670-
for (; i < regular_part; i += avx_width) {
1671-
increment_bins(compute_bins(kx + i));
1672-
}
1673-
}
1674-
1675-
for (; i < M; ++i) {
1522+
for (auto i = 0; i < M; i++) {
16761523
// find the bin index in however many dims are needed
1677-
const auto i1 = UBIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
1678-
const auto i2 = isky ? UBIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
1679-
const auto i3 = iskz ? UBIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
1524+
const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
1525+
const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
1526+
const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
16801527
const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
16811528
++counts[bin];
16821529
}
16831530

16841531
// compute the offsets directly in the counts array (no offset array)
1685-
UBIGINT current_offset{0}; // Reinecke's cute replacement of counts[i]
1686-
for (i = 0; i < nbins; ++i) {
1687-
counts[i] = std::exchange(current_offset, current_offset + counts[i]);
1532+
BIGINT current_offset = 0;
1533+
for (BIGINT i = 0; i < nbins; i++) {
1534+
BIGINT tmp = counts[i];
1535+
counts[i] = current_offset; // Reinecke's cute replacement of counts[i]
1536+
current_offset += tmp;
16881537
} // (counts now contains the index offsets for each bin)
16891538

1690-
i = 0; // we need to redo the loop so variable should be zeroed here
1691-
if (iskz) {
1692-
for (; i < regular_part; i += avx_width) {
1693-
accumulate_bins(compute_bins(kx + i, ky + i, kz + i), i);
1694-
}
1695-
} else if (isky) {
1696-
for (; i < regular_part; i += avx_width) {
1697-
accumulate_bins(compute_bins(kx + i, ky + i), i);
1698-
}
1699-
} else {
1700-
for (; i < regular_part; i += avx_width) {
1701-
accumulate_bins(compute_bins(kx + i), i);
1702-
}
1703-
}
1704-
1705-
for (; i < M; ++i) {
1539+
for (auto i = 0; i < M; i++) {
17061540
// find the bin index (again! but better than using RAM)
1707-
const auto i1 = UBIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
1708-
const auto i2 = isky ? UBIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
1709-
const auto i3 = iskz ? UBIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
1541+
const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
1542+
const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
1543+
const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
17101544
const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
1711-
ret[counts[bin]] = i; // fill the inverse map on the fly
1712-
++counts[bin]; // update the offsets
1545+
ret[counts[bin]] = BIGINT(i); // fill the inverse map on the fly
1546+
++counts[bin]; // update the offsets
17131547
}
17141548
}
17151549

@@ -2049,24 +1883,5 @@ void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset
20491883
break;
20501884
}
20511885
}
2052-
2053-
// since xsimd is not constexpr this slows down the loop due to the guard variables
2054-
// hence moved them here
2055-
2056-
FINUFFT_ALWAYS_INLINE xsimd::batch<FLT> fold_rescale_vec(const xsimd::batch<FLT> x,
2057-
const FLT N) {
2058-
const xsimd::batch<FLT> x2pi{FLT(M_1_2PI)};
2059-
const xsimd::batch<FLT> half{FLT(0.5)};
2060-
auto result = xsimd::fma(x, x2pi, half);
2061-
result -= xsimd::floor(result);
2062-
return result * N;
2063-
}
2064-
FINUFFT_ALWAYS_INLINE xsimd::batch<FLT> fold(xsimd::batch<FLT> x) {
2065-
const xsimd::batch<FLT> x2pi{FLT(M_1_2PI)};
2066-
const xsimd::batch<FLT> half{FLT(0.5)};
2067-
auto result = xsimd::fma(x, x2pi, half);
2068-
result -= xsimd::floor(result);
2069-
return result;
2070-
}
20711886
} // namespace
20721887
} // namespace finufft::spreadinterp

0 commit comments

Comments
 (0)