22#include " fft.h"
33#include " hdf5_utils.h"
44#include " sledgehamr_utils.h"
5+ #include < AMReX_ParallelReduce.H>
56
67namespace sledgehamr {
78
@@ -67,6 +68,9 @@ void GravitationalWaves::ComputeSpectrum(
6768 int comps[6 ];
6869 modifier->SelectComponents (comps);
6970
71+ std::chrono::steady_clock::time_point start_time =
72+ std::chrono::steady_clock::now ();
73+
7074 for (int i = 0 ; i < 6 ; ++i) {
7175#ifdef OLD_FFT
7276 const amrex::BoxArray &ba = ld.boxArray ();
@@ -78,6 +82,14 @@ void GravitationalWaves::ComputeSpectrum(
7882 sim->geom [lev], false , zero_padding);
7983 }
8084
85+ std::chrono::steady_clock::time_point end_time =
86+ std::chrono::steady_clock::now ();
87+ double duration_ms = static_cast <double >(
88+ std::chrono::duration_cast<std::chrono::microseconds>(end_time -
89+ start_time)
90+ .count ());
91+ // amrex::Print() << "FFTs: " << duration_ms << std::endl;
92+
8193 double dk = 2 . * M_PI / L;
8294 double dimN6 = pow (dimN, 6 );
8395
@@ -93,15 +105,105 @@ void GravitationalWaves::ComputeSpectrum(
93105 k <= std::sqrt (3 .) / 2 . * static_cast <double >(dimN) + .5 ; ++k) {
94106 ks.push_back (k * k);
95107 }
96- NTHREADS = 256 ;
108+ NTHREADS = 16 ;
97109 }
98-
99110 int kmax = ks.size ();
111+ amrex::Gpu::AsyncArray<double > async_index_to_k (sim->index_to_k .data (),
112+ sim->index_to_k .size ());
113+ double *index_to_k = async_index_to_k.data ();
114+
115+ start_time = std::chrono::steady_clock::now ();
116+
117+ #ifdef AMREX_USE_GPU
118+ if (l_unbinned) {
119+ amrex::Abort (
120+ " Unbinned gw spectra on GPUs are currently not supported!" );
121+ }
122+
123+ bool l_unbinned = unbinned;
124+
125+ unsigned long SpecLen = kmax;
126+ std::vector<double > gw_spectrum (SpecLen, 0.0 );
127+ amrex::Gpu::DeviceVector<double > d_data (SpecLen, 0.0 );
128+ double *const AMREX_RESTRICT dptr_data = d_data.dataPtr ();
129+
130+ for (amrex::MFIter mfi (du_real[0 ], false ); mfi.isValid (); ++mfi) {
131+ const amrex::Box &bx = mfi.tilebox ();
132+
133+ const amrex::Array4<double > du_real_arr[6 ] = {
134+ du_real[0 ].array (mfi), du_real[1 ].array (mfi),
135+ du_real[2 ].array (mfi), du_real[3 ].array (mfi),
136+ du_real[4 ].array (mfi), du_real[5 ].array (mfi)};
137+
138+ const amrex::Array4<double > du_imag_arr[6 ] = {
139+ du_imag[0 ].array (mfi), du_imag[1 ].array (mfi),
140+ du_imag[2 ].array (mfi), du_imag[3 ].array (mfi),
141+ du_imag[4 ].array (mfi), du_imag[5 ].array (mfi)};
142+
143+ amrex::ParallelFor (bx, [=] AMREX_GPU_DEVICE (int a, int b,
144+ int c) noexcept {
145+ // To account of negative frequencies
146+ double multpl = (a == 0 || a == dimN / 2 ) ? 1 . : 2 .;
147+ int abc[3 ] = {a, b, c};
148+ double running_sum = 0 ;
149+
150+ int li = a >= dimN / 2 ? a - dimN : a;
151+ int lj = b >= dimN / 2 ? b - dimN : b;
152+ int lk = c >= dimN / 2 ? c - dimN : c;
153+ unsigned int sq = li * li + lj * lj + lk * lk;
154+ unsigned long index;
155+ if (l_unbinned) {
156+ // index = std::lower_bound(ks.begin(), ks.end(), sq) -
157+ // ks.begin();
158+ } else {
159+ index =
160+ static_cast <long >(std::sqrt (static_cast <double >(sq)) + .5 );
161+ }
162+
163+ for (int i = 0 ; i < 3 ; ++i) {
164+ for (int j = 0 ; j < 3 ; ++j) {
165+ for (int l = 0 ; l < 3 ; ++l) {
166+ for (int m = 0 ; m < 3 ; ++m) {
167+ running_sum +=
168+ gw_GetLambda (i, j, l, m, abc, index_to_k) *
169+ multpl *
170+ (du_real_arr[mat[i][j]].operator ()(a, b, c) *
171+ du_real_arr[mat[l][m]].operator ()(a, b,
172+ c) +
173+ du_imag_arr[mat[i][j]].operator ()(a, b, c) *
174+ du_imag_arr[mat[l][m]].operator ()(a, b,
175+ c));
176+ }
177+ }
178+ }
179+ }
180+ amrex::HostDevice::Atomic::Add (&dptr_data[index],
181+ running_sum / dimN6);
182+ });
183+ }
184+
185+ end_time = std::chrono::steady_clock::now ();
186+ duration_ms = static_cast <double >(
187+ std::chrono::duration_cast<std::chrono::microseconds>(end_time -
188+ start_time)
189+ .count ());
190+ // amrex::Print() << "Sum: " << duration_ms << std::endl;
191+ start_time = std::chrono::steady_clock::now ();
192+
193+ // blocking copy from device to host
194+ amrex::Gpu::copy (amrex::Gpu::deviceToHost, d_data.begin (), d_data.end (),
195+ gw_spectrum.begin ());
196+
197+ // reduced sum over mpi ranks
198+ amrex::ParallelDescriptor::ReduceRealSum (
199+ gw_spectrum.data (), static_cast <int >(gw_spectrum.size ()),
200+ amrex::ParallelDescriptor::IOProcessorNumber ());
201+
202+ #else // ifdef AMREX_USE_GPU
203+
100204 unsigned long SpecLen = kmax * NTHREADS;
101- std::unique_ptr<double []> gw_spectrum (new double [SpecLen]);
102- std::fill_n (gw_spectrum.get (), SpecLen, 0.0 );
205+ std::vector<double > gw_spectrum (SpecLen, 0.0 );
103206
104- // Non-trivial load-balancing here. Not sure what wins.
105207#pragma omp parallel num_threads(std::min(NTHREADS, omp_get_max_threads()))
106208 for (amrex::MFIter mfi (du_real[0 ], true ); mfi.isValid (); ++mfi) {
107209 const amrex::Box &bx = mfi.tilebox ();
@@ -156,7 +258,8 @@ void GravitationalWaves::ComputeSpectrum(
156258 for (int l = 0 ; l < 3 ; ++l) {
157259 for (int m = 0 ; m < 3 ; ++m) {
158260 running_sum +=
159- GetLambda (i, j, l, m, abc, dimN) *
261+ gw_GetLambda (i, j, l, m, abc,
262+ index_to_k) *
160263 multpl *
161264 (du_real_arr[mat[i][j]]->operator ()(
162265 a, b, c) *
@@ -175,94 +278,47 @@ void GravitationalWaves::ComputeSpectrum(
175278 }
176279 }
177280 }
281+
282+ end_time = std::chrono::steady_clock::now ();
283+ duration_ms = static_cast <double >(
284+ std::chrono::duration_cast<std::chrono::microseconds>(end_time -
285+ start_time)
286+ .count ());
287+ // amrex::Print() << "Sum: " << duration_ms << std::endl;
288+ start_time = std::chrono::steady_clock::now ();
289+
178290 for (int a = 1 ; a < NTHREADS; ++a) {
179291 for (int c = 0 ; c < kmax; ++c) {
180292 gw_spectrum[c] += gw_spectrum[a * kmax + c];
181293 }
182294 }
183295
184296 amrex::ParallelDescriptor::ReduceRealSum (
185- gw_spectrum. get ( ), kmax,
297+ &(gw_spectrum[ 0 ] ), kmax,
186298 amrex::ParallelDescriptor::IOProcessorNumber ());
187299
188300#pragma omp parallel for
189301 for (int c = 0 ; c < kmax; ++c) {
190302 gw_spectrum[c] /= dimN6;
191303 }
192304
305+ #endif // AMREX_USE_GPU
306+ end_time = std::chrono::steady_clock::now ();
307+ duration_ms = static_cast <double >(
308+ std::chrono::duration_cast<std::chrono::microseconds>(end_time -
309+ start_time)
310+ .count ());
311+ // amrex::Print() << "Reduce: " << duration_ms << std::endl;
312+
193313 if (amrex::ParallelDescriptor::IOProcessor ()) {
194314 const int nparams = 6 ;
195315 double header_data[nparams] = {
196316 ld.t , (double )dimN, (double )kmax,
197317 L, (double )zero_padding, (double )unbinned};
198318 utils::hdf5::Write (file_id, " Header" , header_data, nparams);
199319 utils::hdf5::Write (file_id, " k" , &(ks[0 ]), kmax);
200- utils::hdf5::Write (file_id, " Spectrum" , gw_spectrum. get () , kmax);
320+ utils::hdf5::Write (file_id, " Spectrum" , & gw_spectrum[ 0 ] , kmax);
201321 }
202322}
203323
204- /* * @brief Converts a given index to k-space given a projection type.
205- * @param a Index to be converted.
206- * @param N Total index length.
207- * @return k-value.
208- */
209- inline double GravitationalWaves::IndexToK (int a, int N) {
210- return sim->index_to_k [a];
211- /*
212- double n_tilde = a - N <= -N / 2 - 1 ? a : a - N;
213- double two_pi_n_tilde = 2. * M_PI / static_cast<double>(N) *
214- n_tilde;
215-
216- if (projection_type == 1) {
217- return sin(two_pi_n_tilde);
218- } else if (projection_type == 2) {
219- return (8. * sin(two_pi_n_tilde) - sin(2. * two_pi_n_tilde))
220- / 6.;
221- }
222-
223- return 0.;
224- */
225- }
226-
227- /* * @brief Projects all indicies.
228- * @param i i-index.
229- * @param j j-index.
230- * @param abc a-, b-, and c- index.
231- * @param N Total index length.
232- * @return Projected value.
233- */
234- inline double GravitationalWaves::GetProjection (int i, int j, int abc[3 ],
235- int N) {
236- if (abc[0 ] == 0 && abc[1 ] == 0 && abc[2 ] == 0 )
237- return 0 .;
238-
239- double abc_d[3 ];
240- abc_d[0 ] = IndexToK (abc[0 ], N);
241- abc_d[1 ] = IndexToK (abc[1 ], N);
242- abc_d[2 ] = IndexToK (abc[2 ], N);
243-
244- double norm =
245- abc_d[0 ] * abc_d[0 ] + abc_d[1 ] * abc_d[1 ] + abc_d[2 ] * abc_d[2 ];
246- int l = amrex::min (i, j);
247- int m = amrex::max (i, j);
248-
249- double proj = abc_d[l] * abc_d[m];
250-
251- return static_cast <double >(l == m) - proj / norm;
252- }
253-
254- /* * @brief Computes lambda from projections.
255- * @param i i-index.
256- * @param j j-index.
257- * @param l l-index.
258- * @param m m-index.
259- * @param abc a-, b-, and c-index.
260- * @param N Total index length.
261- * @return Lambda
262- */
263- inline double GravitationalWaves::GetLambda (int i, int j, int l, int m,
264- int abc[3 ], int N) {
265- return GetProjection (i, l, abc, N) * GetProjection (j, m, abc, N) -
266- GetProjection (i, j, abc, N) * GetProjection (l, m, abc, N) / 2 .;
267- }
268324}; // namespace sledgehamr
0 commit comments