Skip to content

Commit fab0411

Browse files
authored
GMRESMLMG: Support Multi-level composite solve (#4271)
We also need to use a smoother at the bottom level of the MLMG preconditioner, because our CG solvers are non-deterministic. This means that with CG solvers the preconditioner matrix is not fixed, which can result in wrong solutions in GMRES.
1 parent 275f55f commit fab0411

24 files changed

+1002
-274
lines changed

Src/Base/AMReX_FabArrayUtility.H

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1602,6 +1602,207 @@ Dot (FabArray<FAB> const& x, int xcomp, FabArray<FAB> const& y, int ycomp, int n
16021602
return sm;
16031603
}
16041604

1605+
/**
1606+
* \brief Compute dot product of FabArray with itself
1607+
*
1608+
* \param x FabArray
1609+
* \param xcomp starting component of x
1610+
* \param ncomp number of components
1611+
* \param nghost number of ghost cells
1612+
* \param local If true, MPI communication is skipped.
1613+
*/
1614+
template <typename FAB, std::enable_if_t<IsBaseFab<FAB>::value,int> FOO = 0>
1615+
typename FAB::value_type
1616+
Dot (FabArray<FAB> const& x, int xcomp, int ncomp, IntVect const& nghost, bool local = false)
1617+
{
1618+
BL_ASSERT(x.nGrowVect().allGE(nghost));
1619+
1620+
BL_PROFILE("amrex::Dot()");
1621+
1622+
using T = typename FAB::value_type;
1623+
auto sm = T(0.0);
1624+
#ifdef AMREX_USE_GPU
1625+
if (Gpu::inLaunchRegion()) {
1626+
auto const& xma = x.const_arrays();
1627+
sm = ParReduce(TypeList<ReduceOpSum>{}, TypeList<T>{}, x, nghost,
1628+
[=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<T>
1629+
{
1630+
auto t = T(0.0);
1631+
auto const& xfab = xma[box_no];
1632+
for (int n = 0; n < ncomp; ++n) {
1633+
auto v = xfab(i,j,k,xcomp+n);
1634+
t += v*v;
1635+
}
1636+
return t;
1637+
});
1638+
} else
1639+
#endif
1640+
{
1641+
#ifdef AMREX_USE_OMP
1642+
#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm)
1643+
#endif
1644+
for (MFIter mfi(x,true); mfi.isValid(); ++mfi)
1645+
{
1646+
Box const& bx = mfi.growntilebox(nghost);
1647+
auto const& xfab = x.const_array(mfi);
1648+
AMREX_LOOP_4D(bx, ncomp, i, j, k, n,
1649+
{
1650+
auto v = xfab(i,j,k,xcomp+n);
1651+
sm += v*v;
1652+
});
1653+
}
1654+
}
1655+
1656+
if (!local) {
1657+
ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub());
1658+
}
1659+
1660+
return sm;
1661+
}
1662+
1663+
/**
1664+
* \brief Compute dot product of two FabArrays in region that mask is true
1665+
*
1666+
* \param mask mask
1667+
* \param x first FabArray
1668+
* \param xcomp starting component of x
1669+
* \param y second FabArray
1670+
* \param ycomp starting component of y
1671+
* \param ncomp number of components
1672+
* \param nghost number of ghost cells
1673+
* \param local If true, MPI communication is skipped.
1674+
*/
1675+
template <typename IFAB, typename FAB,
1676+
std::enable_if_t<IsBaseFab<FAB>::value && IsBaseFab<IFAB>::value,int> FOO = 0>
1677+
typename FAB::value_type
1678+
Dot (FabArray<IFAB> const& mask, FabArray<FAB> const& x, int xcomp,
1679+
FabArray<FAB> const& y, int ycomp, int ncomp, IntVect const& nghost,
1680+
bool local = false)
1681+
{
1682+
BL_ASSERT(x.boxArray() == y.boxArray() && x.boxArray() == mask.boxArray());
1683+
BL_ASSERT(x.DistributionMap() == y.DistributionMap() && x.DistributionMap() == mask.DistributionMap());
1684+
BL_ASSERT(x.nGrowVect().allGE(nghost) && y.nGrowVect().allGE(nghost) &&
1685+
mask.nGrowVect().allGE(nghost));
1686+
1687+
BL_PROFILE("amrex::Dot()");
1688+
1689+
using T = typename FAB::value_type;
1690+
auto sm = T(0.0);
1691+
#ifdef AMREX_USE_GPU
1692+
if (Gpu::inLaunchRegion()) {
1693+
auto const& mma = mask.const_arrays();
1694+
auto const& xma = x.const_arrays();
1695+
auto const& yma = y.const_arrays();
1696+
sm = ParReduce(TypeList<ReduceOpSum>{}, TypeList<T>{}, x, nghost,
1697+
[=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<T>
1698+
{
1699+
auto t = T(0.0);
1700+
auto m = T(mma[box_no](i,j,k));
1701+
if (m != 0) {
1702+
auto const& xfab = xma[box_no];
1703+
auto const& yfab = yma[box_no];
1704+
for (int n = 0; n < ncomp; ++n) {
1705+
t += xfab(i,j,k,xcomp+n) * yfab(i,j,k,ycomp+n);
1706+
}
1707+
}
1708+
return t*m;
1709+
});
1710+
} else
1711+
#endif
1712+
{
1713+
#ifdef AMREX_USE_OMP
1714+
#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm)
1715+
#endif
1716+
for (MFIter mfi(x,true); mfi.isValid(); ++mfi)
1717+
{
1718+
Box const& bx = mfi.growntilebox(nghost);
1719+
auto const& mfab = mask.const_array(mfi);
1720+
auto const& xfab = x.const_array(mfi);
1721+
auto const& yfab = y.const_array(mfi);
1722+
AMREX_LOOP_4D(bx, ncomp, i, j, k, n,
1723+
{
1724+
auto m = T(mfab(i,j,k));
1725+
sm += m * xfab(i,j,k,xcomp+n) * yfab(i,j,k,ycomp+n);
1726+
});
1727+
}
1728+
}
1729+
1730+
if (!local) {
1731+
ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub());
1732+
}
1733+
1734+
return sm;
1735+
}
1736+
1737+
/**
1738+
* \brief Compute dot product of FabArray with itself in region that mask is true
1739+
*
1740+
* \param mask mask
1741+
* \param x FabArray
1742+
* \param xcomp starting component of x
1743+
* \param ncomp number of components
1744+
* \param nghost number of ghost cells
1745+
* \param local If true, MPI communication is skipped.
1746+
*/
1747+
template <typename IFAB, typename FAB,
1748+
std::enable_if_t<IsBaseFab<FAB>::value && IsBaseFab<IFAB>::value,int> FOO = 0>
1749+
typename FAB::value_type
1750+
Dot (FabArray<IFAB> const& mask, FabArray<FAB> const& x, int xcomp, int ncomp,
1751+
IntVect const& nghost, bool local = false)
1752+
{
1753+
BL_ASSERT(x.boxArray() == mask.boxArray());
1754+
BL_ASSERT(x.DistributionMap() == mask.DistributionMap());
1755+
BL_ASSERT(x.nGrowVect().allGE(nghost) && mask.nGrowVect().allGE(nghost));
1756+
1757+
BL_PROFILE("amrex::Dot()");
1758+
1759+
using T = typename FAB::value_type;
1760+
auto sm = T(0.0);
1761+
#ifdef AMREX_USE_GPU
1762+
if (Gpu::inLaunchRegion()) {
1763+
auto const& mma = mask.const_arrays();
1764+
auto const& xma = x.const_arrays();
1765+
sm = ParReduce(TypeList<ReduceOpSum>{}, TypeList<T>{}, x, nghost,
1766+
[=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<T>
1767+
{
1768+
auto t = T(0.0);
1769+
auto m = T(mma[box_no](i,j,k));
1770+
if (m != 0) {
1771+
auto const& xfab = xma[box_no];
1772+
for (int n = 0; n < ncomp; ++n) {
1773+
auto v = xfab(i,j,k,xcomp+n);
1774+
t += v*v;
1775+
}
1776+
}
1777+
return t*m;
1778+
});
1779+
} else
1780+
#endif
1781+
{
1782+
#ifdef AMREX_USE_OMP
1783+
#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm)
1784+
#endif
1785+
for (MFIter mfi(x,true); mfi.isValid(); ++mfi)
1786+
{
1787+
Box const& bx = mfi.growntilebox(nghost);
1788+
auto const& mfab = mask.const_array(mfi);
1789+
auto const& xfab = x.const_array(mfi);
1790+
AMREX_LOOP_4D(bx, ncomp, i, j, k, n,
1791+
{
1792+
auto m = T(mfab(i,j,k));
1793+
auto v = xfab(i,j,k,xcomp+n);
1794+
sm += m*v*v;
1795+
});
1796+
}
1797+
}
1798+
1799+
if (!local) {
1800+
ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub());
1801+
}
1802+
1803+
return sm;
1804+
}
1805+
16051806
//! dst = val
16061807
template <class MF, std::enable_if_t<IsMultiFabLike_v<MF>,int> = 0>
16071808
void setVal (MF& dst, typename MF::value_type val)

Src/Base/AMReX_Vector.H

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,15 @@ namespace amrex
6666
return r;
6767
}
6868

69+
template <class T, std::size_t N, typename = typename T::FABType>
70+
[[nodiscard]] Vector<Array<T,N>*> GetVecOfPtrs (Vector<Array<T,N>>& a)
71+
{
72+
Vector<Array<T,N>*> r;
73+
r.reserve(a.size());
74+
for (auto& x : a) { r.push_back(&x); }
75+
return r;
76+
}
77+
6978
template <class T>
7079
[[nodiscard]] Vector<T*> GetVecOfPtrs (const Vector<std::unique_ptr<T> >& a)
7180
{
@@ -86,6 +95,15 @@ namespace amrex
8695
return r;
8796
}
8897

98+
template <class T, std::size_t N, typename = typename T::FABType>
99+
[[nodiscard]] Vector<Array<T,N> const*> GetVecOfConstPtrs (Vector<Array<T,N>> const& a)
100+
{
101+
Vector<Array<T,N> const*> r;
102+
r.reserve(a.size());
103+
for (auto& x : a) { r.push_back(&x); }
104+
return r;
105+
}
106+
89107
template <class T>
90108
[[nodiscard]] Vector<const T*> GetVecOfConstPtrs (const Vector<std::unique_ptr<T> >& a)
91109
{

0 commit comments

Comments
 (0)