GW Spectrum computation done fully on GPUs now

MSABuschmann · MSABuschmann · commit 4180ec30d66f · 2025-03-17T06:34:15.000-07:00
diff --git a/source/gravitational_waves.cpp b/source/gravitational_waves.cpp
@@ -2,6 +2,7 @@
 #include "fft.h"
 #include "hdf5_utils.h"
 #include "sledgehamr_utils.h"
+#include <AMReX_ParallelReduce.H>
 
 namespace sledgehamr {
 
@@ -67,6 +68,9 @@ void GravitationalWaves::ComputeSpectrum(
     int comps[6];
     modifier->SelectComponents(comps);
 
+    std::chrono::steady_clock::time_point start_time =
+        std::chrono::steady_clock::now();
+
     for (int i = 0; i < 6; ++i) {
 #ifdef OLD_FFT
         const amrex::BoxArray &ba = ld.boxArray();
@@ -78,6 +82,14 @@ void GravitationalWaves::ComputeSpectrum(
                    sim->geom[lev], false, zero_padding);
     }
 
+    std::chrono::steady_clock::time_point end_time =
+        std::chrono::steady_clock::now();
+    // Elapsed FFT time as fractional milliseconds, as the variable name says.
+    double duration_ms =
+        std::chrono::duration<double, std::milli>(end_time - start_time)
+            .count();
+    // amrex::Print() << "FFTs: " << duration_ms << std::endl;
+
     double dk = 2. * M_PI / L;
     double dimN6 = pow(dimN, 6);
 
@@ -93,15 +105,105 @@ void GravitationalWaves::ComputeSpectrum(
              k <= std::sqrt(3.) / 2. * static_cast<double>(dimN) + .5; ++k) {
             ks.push_back(k * k);
         }
-        NTHREADS = 256;
+        NTHREADS = 16;
     }
-
     int kmax = ks.size();
+    amrex::Gpu::AsyncArray<double> async_index_to_k(sim->index_to_k.data(),
+                                                    sim->index_to_k.size());
+    double *index_to_k = async_index_to_k.data();
+
+    start_time = std::chrono::steady_clock::now();
+
+#ifdef AMREX_USE_GPU
+    // Local copy of the flag; the device lambda below captures it by value.
+    const bool l_unbinned = unbinned;
+    if (l_unbinned) {
+        amrex::Abort(
+            "Unbinned gw spectra on GPUs are currently not supported!");
+    }
+
+    unsigned long SpecLen = kmax;
+    std::vector<double> gw_spectrum(SpecLen, 0.0);
+    amrex::Gpu::DeviceVector<double> d_data(SpecLen, 0.0);
+    double *const AMREX_RESTRICT dptr_data = d_data.dataPtr();
+
+    for (amrex::MFIter mfi(du_real[0], false); mfi.isValid(); ++mfi) {
+        const amrex::Box &bx = mfi.tilebox();
+
+        const amrex::Array4<double> du_real_arr[6] = {
+            du_real[0].array(mfi), du_real[1].array(mfi),
+            du_real[2].array(mfi), du_real[3].array(mfi),
+            du_real[4].array(mfi), du_real[5].array(mfi)};
+
+        const amrex::Array4<double> du_imag_arr[6] = {
+            du_imag[0].array(mfi), du_imag[1].array(mfi),
+            du_imag[2].array(mfi), du_imag[3].array(mfi),
+            du_imag[4].array(mfi), du_imag[5].array(mfi)};
+
+        amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int a, int b,
+                                                    int c) noexcept {
+            // To account of negative frequencies
+            double multpl = (a == 0 || a == dimN / 2) ? 1. : 2.;
+            int abc[3] = {a, b, c};
+            double running_sum = 0;
+
+            int li = a >= dimN / 2 ? a - dimN : a;
+            int lj = b >= dimN / 2 ? b - dimN : b;
+            int lk = c >= dimN / 2 ? c - dimN : c;
+            unsigned int sq = li * li + lj * lj + lk * lk;
+            unsigned long index;
+            if (l_unbinned) {
+                // index = std::lower_bound(ks.begin(), ks.end(), sq) -
+                // ks.begin();
+            } else {
+                index =
+                    static_cast<long>(std::sqrt(static_cast<double>(sq)) + .5);
+            }
+
+            for (int i = 0; i < 3; ++i) {
+                for (int j = 0; j < 3; ++j) {
+                    for (int l = 0; l < 3; ++l) {
+                        for (int m = 0; m < 3; ++m) {
+                            running_sum +=
+                                gw_GetLambda(i, j, l, m, abc, index_to_k) *
+                                multpl *
+                                (du_real_arr[mat[i][j]].operator()(a, b, c) *
+                                     du_real_arr[mat[l][m]].operator()(a, b,
+                                                                       c) +
+                                 du_imag_arr[mat[i][j]].operator()(a, b, c) *
+                                     du_imag_arr[mat[l][m]].operator()(a, b,
+                                                                       c));
+                        }
+                    }
+                }
+            }
+            amrex::HostDevice::Atomic::Add(&dptr_data[index],
+                                           running_sum / dimN6);
+        });
+    }
+
+    end_time = std::chrono::steady_clock::now();
+    // Store fractional milliseconds so the value matches the variable name.
+    duration_ms =
+        std::chrono::duration<double, std::milli>(end_time - start_time)
+            .count();
+    // amrex::Print() << "Sum: " << duration_ms << std::endl;
+    start_time = std::chrono::steady_clock::now();
+
+    // blocking copy from device to host
+    amrex::Gpu::copy(amrex::Gpu::deviceToHost, d_data.begin(), d_data.end(),
+                     gw_spectrum.begin());
+
+    // reduced sum over mpi ranks
+    amrex::ParallelDescriptor::ReduceRealSum(
+        gw_spectrum.data(), static_cast<int>(gw_spectrum.size()),
+        amrex::ParallelDescriptor::IOProcessorNumber());
+
+#else // ifdef AMREX_USE_GPU
+
     unsigned long SpecLen = kmax * NTHREADS;
-    std::unique_ptr<double[]> gw_spectrum(new double[SpecLen]);
-    std::fill_n(gw_spectrum.get(), SpecLen, 0.0);
+    std::vector<double> gw_spectrum(SpecLen, 0.0);
 
-// Non-trivial load-balancing here. Not sure what wins.
 #pragma omp parallel num_threads(std::min(NTHREADS, omp_get_max_threads()))
     for (amrex::MFIter mfi(du_real[0], true); mfi.isValid(); ++mfi) {
         const amrex::Box &bx = mfi.tilebox();
@@ -156,7 +258,8 @@ void GravitationalWaves::ComputeSpectrum(
                             for (int l = 0; l < 3; ++l) {
                                 for (int m = 0; m < 3; ++m) {
                                     running_sum +=
-                                        GetLambda(i, j, l, m, abc, dimN) *
+                                        gw_GetLambda(i, j, l, m, abc,
+                                                     index_to_k) *
                                         multpl *
                                         (du_real_arr[mat[i][j]]->operator()(
                                              a, b, c) *
@@ -175,94 +278,47 @@ void GravitationalWaves::ComputeSpectrum(
             }
         }
     }
+
+    end_time = std::chrono::steady_clock::now();
+    // Store fractional milliseconds so the value matches the variable name.
+    duration_ms =
+        std::chrono::duration<double, std::milli>(end_time - start_time)
+            .count();
+    // amrex::Print() << "Sum: " << duration_ms << std::endl;
+    start_time = std::chrono::steady_clock::now();
+
     for (int a = 1; a < NTHREADS; ++a) {
         for (int c = 0; c < kmax; ++c) {
             gw_spectrum[c] += gw_spectrum[a * kmax + c];
         }
     }
 
     amrex::ParallelDescriptor::ReduceRealSum(
-        gw_spectrum.get(), kmax,
+        &(gw_spectrum[0]), kmax,
         amrex::ParallelDescriptor::IOProcessorNumber());
 
 #pragma omp parallel for
     for (int c = 0; c < kmax; ++c) {
         gw_spectrum[c] /= dimN6;
     }
 
+#endif // AMREX_USE_GPU
+    end_time = std::chrono::steady_clock::now();
+    // Store fractional milliseconds so the value matches the variable name.
+    duration_ms =
+        std::chrono::duration<double, std::milli>(end_time - start_time)
+            .count();
+    // amrex::Print() << "Reduce: " << duration_ms << std::endl;
+
     if (amrex::ParallelDescriptor::IOProcessor()) {
         const int nparams = 6;
         double header_data[nparams] = {
             ld.t, (double)dimN,         (double)kmax,
             L,    (double)zero_padding, (double)unbinned};
         utils::hdf5::Write(file_id, "Header", header_data, nparams);
         utils::hdf5::Write(file_id, "k", &(ks[0]), kmax);
-        utils::hdf5::Write(file_id, "Spectrum", gw_spectrum.get(), kmax);
+        utils::hdf5::Write(file_id, "Spectrum", &gw_spectrum[0], kmax);
     }
 }
 
-/** @brief Converts a given index to k-space given a projection type.
- * @param   a   Index to be converted.
- * @param   N   Total index length.
- * @return k-value.
- */
-inline double GravitationalWaves::IndexToK(int a, int N) {
-    return sim->index_to_k[a];
-    /*
-        double n_tilde = a - N <= -N / 2 - 1 ? a : a - N;
-        double two_pi_n_tilde = 2. * M_PI / static_cast<double>(N) *
-       n_tilde;
-
-        if (projection_type == 1) {
-            return sin(two_pi_n_tilde);
-        } else if (projection_type == 2) {
-            return (8. * sin(two_pi_n_tilde) - sin(2. * two_pi_n_tilde))
-       / 6.;
-        }
-
-        return 0.;
-    */
-}
-
-/** @brief Projects all indicies.
- * @param   i   i-index.
- * @param   j   j-index.
- * @param   abc a-, b-, and c- index.
- * @param   N   Total index length.
- * @return Projected value.
- */
-inline double GravitationalWaves::GetProjection(int i, int j, int abc[3],
-                                                int N) {
-    if (abc[0] == 0 && abc[1] == 0 && abc[2] == 0)
-        return 0.;
-
-    double abc_d[3];
-    abc_d[0] = IndexToK(abc[0], N);
-    abc_d[1] = IndexToK(abc[1], N);
-    abc_d[2] = IndexToK(abc[2], N);
-
-    double norm =
-        abc_d[0] * abc_d[0] + abc_d[1] * abc_d[1] + abc_d[2] * abc_d[2];
-    int l = amrex::min(i, j);
-    int m = amrex::max(i, j);
-
-    double proj = abc_d[l] * abc_d[m];
-
-    return static_cast<double>(l == m) - proj / norm;
-}
-
-/** @brief Computes lambda from projections.
- * @param   i   i-index.
- * @param   j   j-index.
- * @param   l   l-index.
- * @param   m   m-index.
- * @param   abc a-, b-, and c-index.
- * @param   N   Total index length.
- * @return Lambda
- */
-inline double GravitationalWaves::GetLambda(int i, int j, int l, int m,
-                                            int abc[3], int N) {
-    return GetProjection(i, l, abc, N) * GetProjection(j, m, abc, N) -
-           GetProjection(i, j, abc, N) * GetProjection(l, m, abc, N) / 2.;
-}
 }; // namespace sledgehamr
diff --git a/source/gravitational_waves.h b/source/gravitational_waves.h
@@ -43,10 +43,6 @@ class GravitationalWaves {
     };
 
   private:
-    double IndexToK(int a, int N);
-    double GetProjection(int i, int j, int abc[3], int N);
-    double GetLambda(int i, int j, int l, int m, int abc[3], int N);
-
     /** @brief Pointer to the simulation.
      */
     Sledgehamr *sim;
@@ -92,9 +88,50 @@ struct GravitationalWavesSpectrumModifier {
 
     virtual void FourierSpaceModifications(amrex::MultiFab du_real[6],
                                            amrex::MultiFab du_imag[6],
-                                           const double dk, const int dimN){};
+                                           const double dk, const int dimN) {};
 };
 
+/** @brief Evaluates the projector delta_ij - k_i * k_j / |k|^2.
+ * @param   i   First tensor index, used to address abc (0, 1 or 2).
+ * @param   j   Second tensor index, used to address abc (0, 1 or 2).
+ * @param   abc The three k-space components the projector is built from.
+ * @return Projected value. The sum of squares of abc is used as divisor, so
+ *         the caller has to make sure abc is not the zero vector.
+ */
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE double gw_GetProjection(int i, int j,
+                                                            double abc[3]) {
+    const int lo = amrex::min(i, j);
+    const int hi = amrex::max(i, j);
+    const double k_sq = abc[0] * abc[0] + abc[1] * abc[1] + abc[2] * abc[2];
+
+    const double delta = (lo == hi) ? 1. : 0.;
+
+    return delta - abc[lo] * abc[hi] / k_sq;
+}
+
+/** @brief Computes lambda, P_il * P_jm - P_ij * P_lm / 2, from projections.
+ * @param   i   i-index.
+ * @param   j   j-index.
+ * @param   l   l-index.
+ * @param   m   m-index.
+ * @param   abc a-, b-, and c-index; each one is used to address index_to_k.
+ * @param   index_to_k  Table that maps an index to its k-space value.
+ * @return Lambda, or 0 if a, b and c are all zero.
+ */
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE double
+gw_GetLambda(int i, int j, int l, int m, int abc[3], const double *index_to_k) {
+    if (abc[0] != 0 || abc[1] != 0 || abc[2] != 0) {
+        // Translate the integer indices into their k-space components.
+        double k_vec[3] = {index_to_k[abc[0]], index_to_k[abc[1]],
+                           index_to_k[abc[2]]};
+        return gw_GetProjection(i, l, k_vec) * gw_GetProjection(j, m, k_vec) -
+               gw_GetProjection(i, j, k_vec) * gw_GetProjection(l, m, k_vec) /
+                   2.;
+    }
+
+    return 0.;
+}
+
 }; // namespace sledgehamr
 
 #endif // SLEDGEHAMR_GRAVITATIONAL_WAVES_H_