@@ -50,8 +50,6 @@ FINUFFT_NEVER_INLINE
5050void print_subgrid_info (int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3,
5151 BIGINT padded_size1, BIGINT size1, BIGINT size2, BIGINT size3,
5252 BIGINT M0);
53- FINUFFT_ALWAYS_INLINE xsimd::batch<FLT> fold_rescale_vec (xsimd::batch<FLT> x, FLT N);
54- FINUFFT_ALWAYS_INLINE xsimd::batch<FLT> fold (xsimd::batch<FLT> x);
5553} // namespace
5654// declarations of purely internal functions... (thus need not be in .h)
5755template <uint8_t ns, uint8_t kerevalmeth, class T ,
@@ -1475,80 +1473,6 @@ void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3,
14751473 }
14761474}
14771475
1478- // void bin_sort_singlethread(
1479- // BIGINT *ret, const BIGINT M, const FLT *kx, const FLT *ky, const FLT *kz,
1480- // const BIGINT N1, const BIGINT N2, const BIGINT N3, const double bin_size_x,
1481- // const double bin_size_y, const double bin_size_z, const int debug)
1482- // /* Returns permutation of all nonuniform points with good RAM access,
1483- // * ie less cache misses for spreading, in 1D, 2D, or 3D. Single-threaded version
1484- // *
1485- // * This is achieved by binning into cuboids (of given bin_size within the
1486- // * overall box domain), then reading out the indices within
1487- // * these bins in a Cartesian cuboid ordering (x fastest, y med, z slowest).
1488- // * Finally the permutation is inverted, so that the good ordering is: the
1489- // * NU pt of index ret[0], the NU pt of index ret[1],..., NU pt of index ret[M-1]
1490- // *
1491- // * Inputs: M - number of input NU points.
1492- // * kx,ky,kz - length-M arrays of real coords of NU pts in [-pi, pi).
1493- // * Points outside this range are folded into it.
1494- // * N1,N2,N3 - integer sizes of overall box (N2=N3=1 for 1D, N3=1 for 2D)
1495- // * bin_size_x,y,z - what binning box size to use in each dimension
1496- // * (in rescaled coords where ranges are [0,Ni] ).
1497- // * For 1D, only bin_size_x is used; for 2D, it & bin_size_y.
1498- // * Output:
1499- // * writes to ret a vector list of indices, each in the range 0,..,M-1.
1500- // * Thus, ret must have been preallocated for M BIGINTs.
1501- // *
1502- // * Notes: I compared RAM usage against declaring an internal vector and passing
1503- // * back; the latter used more RAM and was slower.
1504- // * Avoided the bins array, as in JFM's spreader of 2016,
1505- // * tidied up, early 2017, Barnett.
1506- // * Timings (2017): 3s for M=1e8 NU pts on 1 core of i7; 5s on 1 core of xeon.
1507- // * Simplified by Martin Reinecke, 6/19/23 (no apparent effect on speed).
1508- // */
1509- // {
1510- // const auto isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not)
1511- // // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x,
1512- // // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1).
1513- // // Note that round-off near kx=-pi stably rounds negative to i1=0.
1514- // const auto nbins1 = BIGINT(FLT(N1) / bin_size_x + 1);
1515- // const auto nbins2 = isky ? BIGINT(FLT(N2) / bin_size_y + 1) : 1;
1516- // const auto nbins3 = iskz ? BIGINT(FLT(N3) / bin_size_z + 1) : 1;
1517- // const auto nbins = nbins1 * nbins2 * nbins3;
1518- // const auto inv_bin_size_x = FLT(1.0 / bin_size_x);
1519- // const auto inv_bin_size_y = FLT(1.0 / bin_size_y);
1520- // const auto inv_bin_size_z = FLT(1.0 / bin_size_z);
1521- // // count how many pts in each bin
1522- // std::vector<BIGINT> counts(nbins, 0);
1523- //
1524- // for (auto i = 0; i < M; i++) {
1525- // // find the bin index in however many dims are needed
1526- // const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
1527- // const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
1528- // const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
1529- // const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
1530- // ++counts[bin];
1531- // }
1532- //
1533- // // compute the offsets directly in the counts array (no offset array)
1534- // BIGINT current_offset = 0;
1535- // for (BIGINT i = 0; i < nbins; i++) {
1536- // BIGINT tmp = counts[i];
1537- // counts[i] = current_offset; // Reinecke's cute replacement of counts[i]
1538- // current_offset += tmp;
1539- // } // (counts now contains the index offsets for each bin)
1540- //
1541- // for (auto i = 0; i < M; i++) {
1542- // // find the bin index (again! but better than using RAM)
1543- // const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
1544- // const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
1545- // const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
1546- // const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
1547- // ret[counts[bin]] = BIGINT(i); // fill the inverse map on the fly
1548- // ++counts[bin]; // update the offsets
1549- // }
1550- // }
1551-
15521476void bin_sort_singlethread (
15531477 BIGINT *ret, const BIGINT M, const FLT *kx, const FLT *ky, const FLT *kz,
15541478 const BIGINT N1, const BIGINT N2, const BIGINT N3, const double bin_size_x,
@@ -1581,135 +1505,45 @@ void bin_sort_singlethread(
15811505 * Simplified by Martin Reinecke, 6/19/23 (no apparent effect on speed).
15821506 */
15831507{
1584- static constexpr auto alignment = xsimd::batch<FLT>::arch_type::alignment ();
15851508 const auto isky = (N2 > 1 ), iskz = (N3 > 1 ); // ky,kz avail? (cannot access if not)
15861509 // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x,
15871510 // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1).
15881511 // Note that round-off near kx=-pi stably rounds negative to i1=0.
1589- const auto nbins1 = UBIGINT (FLT (N1) / bin_size_x + 1 );
1590- const auto nbins2 = isky ? UBIGINT (FLT (N2) / bin_size_y + 1 ) : 1 ;
1591- const auto nbins3 = iskz ? UBIGINT (FLT (N3) / bin_size_z + 1 ) : 1 ;
1592- const auto nbins12 = nbins1 * nbins2;
1512+ const auto nbins1 = BIGINT (FLT (N1) / bin_size_x + 1 );
1513+ const auto nbins2 = isky ? BIGINT (FLT (N2) / bin_size_y + 1 ) : 1 ;
1514+ const auto nbins3 = iskz ? BIGINT (FLT (N3) / bin_size_z + 1 ) : 1 ;
15931515 const auto nbins = nbins1 * nbins2 * nbins3;
15941516 const auto inv_bin_size_x = FLT (1.0 / bin_size_x);
15951517 const auto inv_bin_size_y = FLT (1.0 / bin_size_y);
15961518 const auto inv_bin_size_z = FLT (1.0 / bin_size_z);
1597- const auto fN1 = FLT (N1);
1598- const auto fN2 = FLT (N2);
1599- const auto fN3 = FLT (N3);
1600- const auto rescale1 = fN1 * inv_bin_size_x;
1601- const auto rescale2 = fN2 * inv_bin_size_y;
1602- const auto rescale3 = fN3 * inv_bin_size_z;
1603-
1604- static constexpr auto avx_width = xsimd::batch<FLT>::size;
1605- const auto regular_part = M & (-avx_width);
1606-
16071519 // count how many pts in each bin
1608- std::vector<UBIGINT> counts (nbins, 0 );
1609-
1610- static constexpr auto to_array = [](const auto bins) noexcept {
1611- using contained_t = typename decltype (bins)::value_type;
1612- alignas (alignment) std::array<contained_t , decltype (bins)::size> result{};
1613- bins.store_aligned (result.data ());
1614- return result;
1615- };
1616-
1617- static constexpr auto to_uint = [](const xsimd::batch<FLT> bins) noexcept {
1618- return xsimd::batch_cast<xsimd::as_unsigned_integer_t <FLT>>(bins);
1619- };
1620-
1621- const auto compute_bins = [=](const auto ... args) constexpr noexcept {
1622- const std::array<const FLT *, sizeof ...(args)> k_arr = {args...};
1623- //
1624- auto bins0 = to_uint (fold (xsimd::load_unaligned (k_arr[0 ])) * rescale1);
1625- auto bins1 = to_uint (FLT (0 ));
1626- auto bins2 = to_uint (FLT (0 ));
1627- if constexpr (sizeof ...(args) > 1 ) {
1628- const auto i2 = to_uint (fold (xsimd::load_unaligned (k_arr[1 ])) * rescale2);
1629- bins1 = nbins1 * i2;
1630- } else {
1631- return bins0;
1632- }
1633- if constexpr (sizeof ...(args) > 2 ) {
1634- const auto i3 = to_uint (fold (xsimd::load_unaligned (k_arr[2 ])) * rescale3);
1635- bins2 = nbins12 * i3;
1636- } else {
1637- return bins0 + bins1;
1638- }
1639- return bins0 + bins1 + bins2;
1640- };
1641-
1642- const auto increment_bins = [&counts](const auto bins) constexpr noexcept {
1643- const auto bin_array = to_array (bins);
1644- for (const auto bin : bin_array) {
1645- ++counts[bin];
1646- }
1647- };
1648-
1649- const auto accumulate_bins = [&counts, &ret](const auto bins,
1650- const auto i) constexpr noexcept {
1651- const auto bin_array = to_array (bins);
1652- for (uint8_t j{0 }; j < avx_width; ++j) {
1653- const auto bin = bin_array[j];
1654- // fill the inverse map on the fly, careful of indexes errors
1655- ret[counts[bin]] = i + j;
1656- ++counts[bin];
1657- }
1658- };
1520+ std::vector<BIGINT> counts (nbins, 0 );
16591521
1660- UBIGINT i{0 };
1661- if (iskz) {
1662- for (; i < regular_part; i += avx_width) {
1663- increment_bins (compute_bins (kx + i, ky + i, kz + i));
1664- }
1665- } else if (isky) {
1666- for (; i < regular_part; i += avx_width) {
1667- increment_bins (compute_bins (kx + i, ky + i));
1668- }
1669- } else {
1670- for (; i < regular_part; i += avx_width) {
1671- increment_bins (compute_bins (kx + i));
1672- }
1673- }
1674-
1675- for (; i < M; ++i) {
1522+ for (auto i = 0 ; i < M; i++) {
16761523 // find the bin index in however many dims are needed
1677- const auto i1 = UBIGINT (fold_rescale (kx[i], N1) * inv_bin_size_x);
1678- const auto i2 = isky ? UBIGINT (fold_rescale (ky[i], N2) * inv_bin_size_y) : 0 ;
1679- const auto i3 = iskz ? UBIGINT (fold_rescale (kz[i], N3) * inv_bin_size_z) : 0 ;
1524+ const auto i1 = BIGINT (fold_rescale (kx[i], N1) * inv_bin_size_x);
1525+ const auto i2 = isky ? BIGINT (fold_rescale (ky[i], N2) * inv_bin_size_y) : 0 ;
1526+ const auto i3 = iskz ? BIGINT (fold_rescale (kz[i], N3) * inv_bin_size_z) : 0 ;
16801527 const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
16811528 ++counts[bin];
16821529 }
16831530
16841531 // compute the offsets directly in the counts array (no offset array)
1685- UBIGINT current_offset{0 }; // Reinecke's cute replacement of counts[i]
1686- for (i = 0 ; i < nbins; ++i) {
1687- counts[i] = std::exchange (current_offset, current_offset + counts[i]);
1532+ BIGINT current_offset = 0 ;
1533+ for (BIGINT i = 0 ; i < nbins; i++) {
1534+ BIGINT tmp = counts[i];
1535+ counts[i] = current_offset; // Reinecke's cute replacement of counts[i]
1536+ current_offset += tmp;
16881537 } // (counts now contains the index offsets for each bin)
16891538
1690- i = 0 ; // we need to redo the loop so variable should be zeroed here
1691- if (iskz) {
1692- for (; i < regular_part; i += avx_width) {
1693- accumulate_bins (compute_bins (kx + i, ky + i, kz + i), i);
1694- }
1695- } else if (isky) {
1696- for (; i < regular_part; i += avx_width) {
1697- accumulate_bins (compute_bins (kx + i, ky + i), i);
1698- }
1699- } else {
1700- for (; i < regular_part; i += avx_width) {
1701- accumulate_bins (compute_bins (kx + i), i);
1702- }
1703- }
1704-
1705- for (; i < M; ++i) {
1539+ for (auto i = 0 ; i < M; i++) {
17061540 // find the bin index (again! but better than using RAM)
1707- const auto i1 = UBIGINT (fold_rescale (kx[i], N1) * inv_bin_size_x);
1708- const auto i2 = isky ? UBIGINT (fold_rescale (ky[i], N2) * inv_bin_size_y) : 0 ;
1709- const auto i3 = iskz ? UBIGINT (fold_rescale (kz[i], N3) * inv_bin_size_z) : 0 ;
1541+ const auto i1 = BIGINT (fold_rescale (kx[i], N1) * inv_bin_size_x);
1542+ const auto i2 = isky ? BIGINT (fold_rescale (ky[i], N2) * inv_bin_size_y) : 0 ;
1543+ const auto i3 = iskz ? BIGINT (fold_rescale (kz[i], N3) * inv_bin_size_z) : 0 ;
17101544 const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
1711- ret[counts[bin]] = i ; // fill the inverse map on the fly
1712- ++counts[bin]; // update the offsets
1545+ ret[counts[bin]] = BIGINT (i) ; // fill the inverse map on the fly
1546+ ++counts[bin]; // update the offsets
17131547 }
17141548}
17151549
@@ -2049,24 +1883,5 @@ void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset
20491883 break ;
20501884 }
20511885}
2052-
2053- // since xsimd is not constexpr this slows down the loop due to the guard variables
2054- // hence moved them here
2055-
2056- FINUFFT_ALWAYS_INLINE xsimd::batch<FLT> fold_rescale_vec (const xsimd::batch<FLT> x,
2057- const FLT N) {
2058- const xsimd::batch<FLT> x2pi{FLT (M_1_2PI)};
2059- const xsimd::batch<FLT> half{FLT (0.5 )};
2060- auto result = xsimd::fma (x, x2pi, half);
2061- result -= xsimd::floor (result);
2062- return result * N;
2063- }
2064- FINUFFT_ALWAYS_INLINE xsimd::batch<FLT> fold (xsimd::batch<FLT> x) {
2065- const xsimd::batch<FLT> x2pi{FLT (M_1_2PI)};
2066- const xsimd::batch<FLT> half{FLT (0.5 )};
2067- auto result = xsimd::fma (x, x2pi, half);
2068- result -= xsimd::floor (result);
2069- return result;
2070- }
20711886} // namespace
20721887} // namespace finufft::spreadinterp
0 commit comments