
Commit 51374e3

improve comp zi formulation
1 parent a844050 commit 51374e3

2 files changed: 107 additions & 32 deletions


amr-wind/utilities/SecondMomentAveraging.cpp

Lines changed: 3 additions & 0 deletions
@@ -146,6 +146,8 @@ void SecondMomentAveraging::compute_average(
     const int ncomp2 = m_plane_average2.ncomp();
     const int nmoments = m_num_moments;

+    BL_PROFILE_VAR("amr-wind::SecondMomentAveraging::compute_average_a", ab);
+
 #ifdef AMREX_USE_OMP
 #pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
 #endif
@@ -197,6 +199,7 @@ void SecondMomentAveraging::compute_average(
             }
         });
     }
+    BL_PROFILE_VAR_STOP(ab);

     lfluc.copyToHost(
         m_second_moments_line.data(), m_second_moments_line.size());
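For reference, the BL_PROFILE_VAR / BL_PROFILE_VAR_STOP pair added above are AMReX's profiling macros (declared in AMReX_BLProfiler.H): the first declares and starts a named timer, the second stops it, and the bracketed region then appears as its own entry in the TinyProfiler report. A minimal sketch of the pattern, assuming an AMReX build with profiling enabled; the function name and the dummy loop are illustrative only, not part of this commit:

    #include <AMReX.H>
    #include <AMReX_BLProfiler.H>

    // Sketch only: bracket a region of interest with AMReX profiler markers.
    void timed_section()
    {
        // Declare and start a named timer; "tv" is the local handle used to stop it.
        BL_PROFILE_VAR("example::timed_section_a", tv);
        double s = 0.0;
        for (int i = 0; i < 1000; ++i) { s += static_cast<double>(i); } // stand-in for the measured work
        (void)s; // keep the compiler from warning about the unused result
        // Stop the timer; work after this point is not attributed to "example::timed_section_a".
        BL_PROFILE_VAR_STOP(tv);
    }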

amr-wind/wind_energy/ABLStats.cpp

Lines changed: 104 additions & 32 deletions
@@ -237,47 +237,119 @@ void ABLStats::compute_zi()
         .create_scratch_field(3, m_temperature.num_grow()[0]);
     fvm::gradient(*gradT, m_temperature);

-    // Only compute zi using coarsest level
+    // // Only compute zi using coarsest level
+    // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_a", ab);
     const int lev = 0;
     const int dir = m_normal_dir;
     const auto& geom = (this->m_sim.repo()).mesh().Geom(lev);
     auto const& domain_box = geom.Domain();
-    const auto& gradT_arrs = (*gradT)(lev).const_arrays();
-    auto device_tg_fab = amrex::ReduceToPlane<
-        amrex::ReduceOpMax, amrex::KeyValuePair<amrex::Real, int>>(
-        dir, domain_box, m_temperature(lev),
-        [=] AMREX_GPU_DEVICE(int nbx, int i, int j, int k)
-            -> amrex::KeyValuePair<amrex::Real, int> {
-            const amrex::IntVect iv(i, j, k);
-            return {gradT_arrs[nbx](i, j, k, dir), iv[dir]};
+    // const auto& gradT_arrs = (*gradT)(lev).const_arrays();
+    // auto device_tg_fab = amrex::ReduceToPlane<
+    //     amrex::ReduceOpMax, amrex::KeyValuePair<amrex::Real, int>>(
+    //     dir, domain_box, m_temperature(lev),
+    //     [=] AMREX_GPU_DEVICE(int nbx, int i, int j, int k)
+    //         -> amrex::KeyValuePair<amrex::Real, int> {
+    //         const amrex::IntVect iv(i, j, k);
+    //         return {gradT_arrs[nbx](i, j, k, dir), iv[dir]};
+    //     });
+    // BL_PROFILE_VAR_STOP(ab);
+
+    // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_b", bb);
+    // #ifdef AMREX_USE_GPU
+    // amrex::BaseFab<amrex::KeyValuePair<amrex::Real, int>> pinned_tg_fab(
+    //     device_tg_fab.box(), device_tg_fab.nComp(), amrex::The_Pinned_Arena());
+    // amrex::Gpu::dtoh_memcpy(
+    //     pinned_tg_fab.dataPtr(), device_tg_fab.dataPtr(),
+    //     pinned_tg_fab.nBytes());
+    // #else
+    // auto& pinned_tg_fab = device_tg_fab;
+    // #endif
+    // BL_PROFILE_VAR_STOP(bb);
+
+    // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_barrier", barrier);
+    // amrex::ParallelDescriptor::Barrier();
+    // BL_PROFILE_VAR_STOP(barrier);
+
+    // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_c", cb);
+    // amrex::ParallelReduce::Max(
+    //     pinned_tg_fab.dataPtr(), static_cast<int>(pinned_tg_fab.size()),
+    //     amrex::ParallelDescriptor::IOProcessorNumber(),
+    //     amrex::ParallelDescriptor::Communicator());
+    // BL_PROFILE_VAR_STOP(cb);
+
+    // BL_PROFILE_VAR("amr-wind::ABLStats::compute_zi_d", db);
+    // if (amrex::ParallelDescriptor::IOProcessor()) {
+    //     const auto dnval = m_dn;
+    //     auto* p = pinned_tg_fab.dataPtr();
+    //     m_zi = amrex::Reduce::Sum<amrex::Real>(
+    //         pinned_tg_fab.size(),
+    //         [=] AMREX_GPU_DEVICE(int i) noexcept -> amrex::Real {
+    //             return (p[i].second() + 0.5) * dnval;
+    //         },
+    //         0.0);
+    //     m_zi /= static_cast<amrex::Real>(pinned_tg_fab.size());
+    // }
+    // BL_PROFILE_VAR_STOP(db);
+
+    AMREX_ALWAYS_ASSERT(domain_box.smallEnd() == 0); // We could relax this if necessary.
+    amrex::Array<bool,AMREX_SPACEDIM> decomp{AMREX_D_DECL(true,true,true)};
+    decomp[dir] = false; // no domain decompose in the dir direction.
+    auto new_ba = amrex::decompose(domain_box, amrex::ParallelDescriptor::NProcs(), decomp);
+
+    amrex::Vector<int> pmap(new_ba.size());
+    std::iota(pmap.begin(), pmap.end(), 0);
+    amrex::DistributionMapping new_dm(std::move(pmap));
+
+    amrex::MultiFab new_mf(new_ba, new_dm, 1, 0);
+    new_mf.ParallelCopy((*gradT)(lev), dir, 0, 1);
+
+    amrex::Real zi_sum = 0;
+    int myproc = amrex::ParallelDescriptor::MyProc();
+    if (myproc < new_mf.size()) {
+        auto const& a = new_mf.const_array(myproc);
+        amrex::Box box2d = amrex::makeSlab(amrex::Box(a), dir, 0);
+        AMREX_ALWAYS_ASSERT(dir == 2); // xxxxx TODO: we can support other directions later
+        // xxxxx TODO: sycl can be supported in the future.
+        // xxxxx TODO: we can support CPU later.
+        const int nblocks = box2d.numPts();
+        constexpr int nthreads = 128;
+        const int lenx = box2d.length(0);
+        const int lenz = domain_box.length(2);
+        const int lox = box2d.smallEnd(0);
+        const int loy = box2d.smallEnd(1);
+        amrex::Gpu::DeviceVector<int> tmp(nblocks);
+        auto* ptmp = tmp.data();
+        amrex::launch<nthreads>(nblocks, amrex::Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE()
+        {
+            const int j = int(blockIdx.x) / lenx + loy;
+            const int i = int(blockIdx.x) - j*lenx +lox;
+            amrex::KeyValuePair<amrex::Real,int> r{std::numeric_limits<amrex::Real>::lowest(),0};
+            for (int k = threadIdx.x; k < lenz; k += nthreads) {
+                if (a(i,j,k) > r.first()) { r.second() = k; r.first() = a(i,j,k);}
+            }
+            r = amrex::Gpu::blockReduceMax<nthreads>(r);
+            if (threadIdx.x == 0) {
+                ptmp[blockIdx.x] = r.second();
+            }
         });

-#ifdef AMREX_USE_GPU
-    amrex::BaseFab<amrex::KeyValuePair<amrex::Real, int>> pinned_tg_fab(
-        device_tg_fab.box(), device_tg_fab.nComp(), amrex::The_Pinned_Arena());
-    amrex::Gpu::dtoh_memcpy(
-        pinned_tg_fab.dataPtr(), device_tg_fab.dataPtr(),
-        pinned_tg_fab.nBytes());
-#else
-    auto& pinned_tg_fab = device_tg_fab;
-#endif
+        const auto dnval = m_dn;
+        zi_sum = amrex::Reduce::Sum<amrex::Real>
+            (nblocks, [=] AMREX_GPU_DEVICE (int iblock)
+        {
+            return (ptmp[iblock] + amrex::Real(0.5)) * dnval;
+        });
+    }

-    amrex::ParallelReduce::Max(
-        pinned_tg_fab.dataPtr(), static_cast<int>(pinned_tg_fab.size()),
-        amrex::ParallelDescriptor::IOProcessorNumber(),
-        amrex::ParallelDescriptor::Communicator());
+    amrex::ParallelReduce::Sum(zi_sum, amrex::ParallelDescriptor::IOProcessorNumber(),
+                               amrex::ParallelDescriptor::Communicator());

-    if (amrex::ParallelDescriptor::IOProcessor()) {
-        const auto dnval = m_dn;
-        auto* p = pinned_tg_fab.dataPtr();
-        m_zi = amrex::Reduce::Sum<amrex::Real>(
-            pinned_tg_fab.size(),
-            [=] AMREX_GPU_DEVICE(int i) noexcept -> amrex::Real {
-                return (p[i].second() + 0.5) * dnval;
-            },
-            0.0);
-        m_zi /= static_cast<amrex::Real>(pinned_tg_fab.size());
+    amrex::Long npts = 1;
+    for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+        if (idim != dir) { npts *= domain_box.length(idim); }
     }
+    m_zi = zi_sum / static_cast<amrex::Real>(npts);
 }

 void ABLStats::process_output()
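In ABLStats, zi is the boundary-layer (capping inversion) height diagnostic: the height of the strongest temperature gradient along the normal direction, averaged over every (i, j) column of the coarsest level. The reformulated path above re-decomposes the domain with amrex::decompose while leaving the normal direction undivided, so each rank owns whole columns of the copied gradient component; one GPU block per column then locates the index of the maximum gradient, each index is converted to a cell-centered height (k + 0.5) * dz, and the per-column heights are summed, reduced to the I/O rank, and divided by the number of columns. A serial reference sketch of that reduction, using a plain array and illustrative names rather than the AMReX containers in the commit:

    #include <cstddef>
    #include <limits>
    #include <vector>

    // Illustration only: the column-wise reduction performed by the kernel above,
    // written serially for an nx*ny*nz array of dT/dz values stored with i fastest,
    // then j, then k (index = (k*ny + j)*nx + i).
    double compute_zi_reference(
        const std::vector<double>& dTdz, int nx, int ny, int nz, double dz)
    {
        double zi_sum = 0.0;
        for (int j = 0; j < ny; ++j) {
            for (int i = 0; i < nx; ++i) {
                // Find the k index with the largest vertical temperature gradient in this column.
                int kmax = 0;
                double gmax = std::numeric_limits<double>::lowest();
                for (int k = 0; k < nz; ++k) {
                    const double g = dTdz[(static_cast<std::size_t>(k) * ny + j) * nx + i];
                    if (g > gmax) { gmax = g; kmax = k; }
                }
                // Cell-centered height of that cell, as in (ptmp[iblock] + 0.5) * dnval.
                zi_sum += (kmax + 0.5) * dz;
            }
        }
        // Average over all columns, as in the final division by npts.
        return zi_sum / (static_cast<double>(nx) * ny);
    }

Here dz plays the role of m_dn and the nx*ny columns correspond to npts; the MPI sum across ranks is omitted because the sketch is single-process.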
