@@ -237,47 +237,119 @@ void ABLStats::compute_zi()
237
237
.create_scratch_field (3 , m_temperature.num_grow ()[0 ]);
238
238
fvm::gradient (*gradT, m_temperature);
239
239
240
- // Only compute zi using coarsest level
240
+ // // Only compute zi using coarsest level
241
+ // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_a", ab);
241
242
const int lev = 0 ;
242
243
const int dir = m_normal_dir;
243
244
const auto & geom = (this ->m_sim .repo ()).mesh ().Geom (lev);
244
245
auto const & domain_box = geom.Domain ();
245
- const auto & gradT_arrs = (*gradT)(lev).const_arrays ();
246
- auto device_tg_fab = amrex::ReduceToPlane<
247
- amrex::ReduceOpMax, amrex::KeyValuePair<amrex::Real, int >>(
248
- dir, domain_box, m_temperature (lev),
249
- [=] AMREX_GPU_DEVICE (int nbx, int i, int j, int k)
250
- -> amrex::KeyValuePair<amrex::Real, int > {
251
- const amrex::IntVect iv (i, j, k);
252
- return {gradT_arrs[nbx](i, j, k, dir), iv[dir]};
246
+ // const auto& gradT_arrs = (*gradT)(lev).const_arrays();
247
+ // auto device_tg_fab = amrex::ReduceToPlane<
248
+ // amrex::ReduceOpMax, amrex::KeyValuePair<amrex::Real, int>>(
249
+ // dir, domain_box, m_temperature(lev),
250
+ // [=] AMREX_GPU_DEVICE(int nbx, int i, int j, int k)
251
+ // -> amrex::KeyValuePair<amrex::Real, int> {
252
+ // const amrex::IntVect iv(i, j, k);
253
+ // return {gradT_arrs[nbx](i, j, k, dir), iv[dir]};
254
+ // });
255
+ // BL_PROFILE_VAR_STOP(ab);
256
+
257
+ // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_b", bb);
258
+ // #ifdef AMREX_USE_GPU
259
+ // amrex::BaseFab<amrex::KeyValuePair<amrex::Real, int>> pinned_tg_fab(
260
+ // device_tg_fab.box(), device_tg_fab.nComp(), amrex::The_Pinned_Arena());
261
+ // amrex::Gpu::dtoh_memcpy(
262
+ // pinned_tg_fab.dataPtr(), device_tg_fab.dataPtr(),
263
+ // pinned_tg_fab.nBytes());
264
+ // #else
265
+ // auto& pinned_tg_fab = device_tg_fab;
266
+ // #endif
267
+ // BL_PROFILE_VAR_STOP(bb);
268
+
269
+ // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_barrier", barrier);
270
+ // amrex::ParallelDescriptor::Barrier();
271
+ // BL_PROFILE_VAR_STOP(barrier);
272
+
273
+ // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_c", cb);
274
+ // amrex::ParallelReduce::Max(
275
+ // pinned_tg_fab.dataPtr(), static_cast<int>(pinned_tg_fab.size()),
276
+ // amrex::ParallelDescriptor::IOProcessorNumber(),
277
+ // amrex::ParallelDescriptor::Communicator());
278
+ // BL_PROFILE_VAR_STOP(cb);
279
+
280
+ // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_d", db);
281
+ // if (amrex::ParallelDescriptor::IOProcessor()) {
282
+ // const auto dnval = m_dn;
283
+ // auto* p = pinned_tg_fab.dataPtr();
284
+ // m_zi = amrex::Reduce::Sum<amrex::Real>(
285
+ // pinned_tg_fab.size(),
286
+ // [=] AMREX_GPU_DEVICE(int i) noexcept -> amrex::Real {
287
+ // return (p[i].second() + 0.5) * dnval;
288
+ // },
289
+ // 0.0);
290
+ // m_zi /= static_cast<amrex::Real>(pinned_tg_fab.size());
291
+ // }
292
+ // BL_PROFILE_VAR_STOP(db);
293
+
294
+ AMREX_ALWAYS_ASSERT (domain_box.smallEnd () == 0 ); // We could relax this if necessary.
295
+ amrex::Array<bool ,AMREX_SPACEDIM> decomp{AMREX_D_DECL (true ,true ,true )};
296
+ decomp[dir] = false ; // no domain decompose in the dir direction.
297
+ auto new_ba = amrex::decompose (domain_box, amrex::ParallelDescriptor::NProcs (), decomp);
298
+
299
+ amrex::Vector<int > pmap (new_ba.size ());
300
+ std::iota (pmap.begin (), pmap.end (), 0 );
301
+ amrex::DistributionMapping new_dm (std::move (pmap));
302
+
303
+ amrex::MultiFab new_mf (new_ba, new_dm, 1 , 0 );
304
+ new_mf.ParallelCopy ((*gradT)(lev), dir, 0 , 1 );
305
+
306
+ amrex::Real zi_sum = 0 ;
307
+ int myproc = amrex::ParallelDescriptor::MyProc ();
308
+ if (myproc < new_mf.size ()) {
309
+ auto const & a = new_mf.const_array (myproc);
310
+ amrex::Box box2d = amrex::makeSlab (amrex::Box (a), dir, 0 );
311
+ AMREX_ALWAYS_ASSERT (dir == 2 ); // xxxxx TODO: we can support other directions later
312
+ // xxxxx TODO: sycl can be supported in the future.
313
+ // xxxxx TODO: we can support CPU later.
314
+ const int nblocks = box2d.numPts ();
315
+ constexpr int nthreads = 128 ;
316
+ const int lenx = box2d.length (0 );
317
+ const int lenz = domain_box.length (2 );
318
+ const int lox = box2d.smallEnd (0 );
319
+ const int loy = box2d.smallEnd (1 );
320
+ amrex::Gpu::DeviceVector<int > tmp (nblocks);
321
+ auto * ptmp = tmp.data ();
322
+ amrex::launch<nthreads>(nblocks, amrex::Gpu::gpuStream (),
323
+ [=] AMREX_GPU_DEVICE ()
324
+ {
325
+ const int j = int (blockIdx.x ) / lenx + loy;
326
+ const int i = int (blockIdx.x ) - j*lenx +lox;
327
+ amrex::KeyValuePair<amrex::Real,int > r{std::numeric_limits<amrex::Real>::lowest (),0 };
328
+ for (int k = threadIdx.x ; k < lenz; k += nthreads) {
329
+ if (a (i,j,k) > r.first ()) { r.second () = k; r.first () = a (i,j,k);}
330
+ }
331
+ r = amrex::Gpu::blockReduceMax<nthreads>(r);
332
+ if (threadIdx.x == 0 ) {
333
+ ptmp[blockIdx.x ] = r.second ();
334
+ }
253
335
});
254
336
255
- #ifdef AMREX_USE_GPU
256
- amrex::BaseFab<amrex::KeyValuePair<amrex::Real, int >> pinned_tg_fab (
257
- device_tg_fab.box (), device_tg_fab.nComp (), amrex::The_Pinned_Arena ());
258
- amrex::Gpu::dtoh_memcpy (
259
- pinned_tg_fab.dataPtr (), device_tg_fab.dataPtr (),
260
- pinned_tg_fab.nBytes ());
261
- #else
262
- auto & pinned_tg_fab = device_tg_fab;
263
- #endif
337
+ const auto dnval = m_dn;
338
+ zi_sum = amrex::Reduce::Sum<amrex::Real>
339
+ (nblocks, [=] AMREX_GPU_DEVICE (int iblock)
340
+ {
341
+ return (ptmp[iblock] + amrex::Real (0.5 )) * dnval;
342
+ });
343
+ }
264
344
265
- amrex::ParallelReduce::Max (
266
- pinned_tg_fab.dataPtr (), static_cast <int >(pinned_tg_fab.size ()),
267
- amrex::ParallelDescriptor::IOProcessorNumber (),
268
- amrex::ParallelDescriptor::Communicator ());
345
+ amrex::ParallelReduce::Sum (zi_sum, amrex::ParallelDescriptor::IOProcessorNumber (),
346
+ amrex::ParallelDescriptor::Communicator ());
269
347
270
- if (amrex::ParallelDescriptor::IOProcessor ()) {
271
- const auto dnval = m_dn;
272
- auto * p = pinned_tg_fab.dataPtr ();
273
- m_zi = amrex::Reduce::Sum<amrex::Real>(
274
- pinned_tg_fab.size (),
275
- [=] AMREX_GPU_DEVICE (int i) noexcept -> amrex::Real {
276
- return (p[i].second () + 0.5 ) * dnval;
277
- },
278
- 0.0 );
279
- m_zi /= static_cast <amrex::Real>(pinned_tg_fab.size ());
348
+ amrex::Long npts = 1 ;
349
+ for (int idim = 0 ; idim < AMREX_SPACEDIM; ++idim) {
350
+ if (idim != dir) { npts *= domain_box.length (idim); }
280
351
}
352
+ m_zi = zi_sum / static_cast <amrex::Real>(npts);
281
353
}
282
354
283
355
void ABLStats::process_output ()
0 commit comments