@@ -1602,6 +1602,207 @@ Dot (FabArray<FAB> const& x, int xcomp, FabArray<FAB> const& y, int ycomp, int n
16021602 return sm;
16031603}
16041604
1605+ /* *
1606+ * \brief Compute dot product of FabArray with itself
1607+ *
1608+ * \param x FabArray
1609+ * \param xcomp starting component of x
1610+ * \param ncomp number of components
1611+ * \param nghost number of ghost cells
1612+ * \param local If true, MPI communication is skipped.
1613+ */
1614+ template <typename FAB, std::enable_if_t <IsBaseFab<FAB>::value,int > FOO = 0 >
1615+ typename FAB::value_type
1616+ Dot (FabArray<FAB> const & x, int xcomp, int ncomp, IntVect const & nghost, bool local = false )
1617+ {
1618+ BL_ASSERT (x.nGrowVect ().allGE (nghost));
1619+
1620+ BL_PROFILE (" amrex::Dot()" );
1621+
1622+ using T = typename FAB::value_type;
1623+ auto sm = T (0.0 );
1624+ #ifdef AMREX_USE_GPU
1625+ if (Gpu::inLaunchRegion ()) {
1626+ auto const & xma = x.const_arrays ();
1627+ sm = ParReduce (TypeList<ReduceOpSum>{}, TypeList<T>{}, x, nghost,
1628+ [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<T>
1629+ {
1630+ auto t = T (0.0 );
1631+ auto const & xfab = xma[box_no];
1632+ for (int n = 0 ; n < ncomp; ++n) {
1633+ auto v = xfab (i,j,k,xcomp+n);
1634+ t += v*v;
1635+ }
1636+ return t;
1637+ });
1638+ } else
1639+ #endif
1640+ {
1641+ #ifdef AMREX_USE_OMP
1642+ #pragma omp parallel if (!system::regtest_reduction) reduction(+:sm)
1643+ #endif
1644+ for (MFIter mfi (x,true ); mfi.isValid (); ++mfi)
1645+ {
1646+ Box const & bx = mfi.growntilebox (nghost);
1647+ auto const & xfab = x.const_array (mfi);
1648+ AMREX_LOOP_4D (bx, ncomp, i, j, k, n,
1649+ {
1650+ auto v = xfab (i,j,k,xcomp+n);
1651+ sm += v*v;
1652+ });
1653+ }
1654+ }
1655+
1656+ if (!local) {
1657+ ParallelAllReduce::Sum (sm, ParallelContext::CommunicatorSub ());
1658+ }
1659+
1660+ return sm;
1661+ }
1662+
1663+ /* *
1664+ * \brief Compute dot product of two FabArrays in region that mask is true
1665+ *
1666+ * \param mask mask
1667+ * \param x first FabArray
1668+ * \param xcomp starting component of x
1669+ * \param y second FabArray
1670+ * \param ycomp starting component of y
1671+ * \param ncomp number of components
1672+ * \param nghost number of ghost cells
1673+ * \param local If true, MPI communication is skipped.
1674+ */
1675+ template <typename IFAB, typename FAB,
1676+ std::enable_if_t <IsBaseFab<FAB>::value && IsBaseFab<IFAB>::value,int > FOO = 0 >
1677+ typename FAB::value_type
1678+ Dot (FabArray<IFAB> const & mask, FabArray<FAB> const & x, int xcomp,
1679+ FabArray<FAB> const & y, int ycomp, int ncomp, IntVect const & nghost,
1680+ bool local = false )
1681+ {
1682+ BL_ASSERT (x.boxArray () == y.boxArray () && x.boxArray () == mask.boxArray ());
1683+ BL_ASSERT (x.DistributionMap () == y.DistributionMap () && x.DistributionMap () == mask.DistributionMap ());
1684+ BL_ASSERT (x.nGrowVect ().allGE (nghost) && y.nGrowVect ().allGE (nghost) &&
1685+ mask.nGrowVect ().allGE (nghost));
1686+
1687+ BL_PROFILE (" amrex::Dot()" );
1688+
1689+ using T = typename FAB::value_type;
1690+ auto sm = T (0.0 );
1691+ #ifdef AMREX_USE_GPU
1692+ if (Gpu::inLaunchRegion ()) {
1693+ auto const & mma = mask.const_arrays ();
1694+ auto const & xma = x.const_arrays ();
1695+ auto const & yma = y.const_arrays ();
1696+ sm = ParReduce (TypeList<ReduceOpSum>{}, TypeList<T>{}, x, nghost,
1697+ [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<T>
1698+ {
1699+ auto t = T (0.0 );
1700+ auto m = T (mma[box_no](i,j,k));
1701+ if (m != 0 ) {
1702+ auto const & xfab = xma[box_no];
1703+ auto const & yfab = yma[box_no];
1704+ for (int n = 0 ; n < ncomp; ++n) {
1705+ t += xfab (i,j,k,xcomp+n) * yfab (i,j,k,ycomp+n);
1706+ }
1707+ }
1708+ return t*m;
1709+ });
1710+ } else
1711+ #endif
1712+ {
1713+ #ifdef AMREX_USE_OMP
1714+ #pragma omp parallel if (!system::regtest_reduction) reduction(+:sm)
1715+ #endif
1716+ for (MFIter mfi (x,true ); mfi.isValid (); ++mfi)
1717+ {
1718+ Box const & bx = mfi.growntilebox (nghost);
1719+ auto const & mfab = mask.const_array (mfi);
1720+ auto const & xfab = x.const_array (mfi);
1721+ auto const & yfab = y.const_array (mfi);
1722+ AMREX_LOOP_4D (bx, ncomp, i, j, k, n,
1723+ {
1724+ auto m = T (mfab (i,j,k));
1725+ sm += m * xfab (i,j,k,xcomp+n) * yfab (i,j,k,ycomp+n);
1726+ });
1727+ }
1728+ }
1729+
1730+ if (!local) {
1731+ ParallelAllReduce::Sum (sm, ParallelContext::CommunicatorSub ());
1732+ }
1733+
1734+ return sm;
1735+ }
1736+
1737+ /* *
1738+ * \brief Compute dot product of FabArray with itself in region that mask is true
1739+ *
1740+ * \param mask mask
1741+ * \param x FabArray
1742+ * \param xcomp starting component of x
1743+ * \param ncomp number of components
1744+ * \param nghost number of ghost cells
1745+ * \param local If true, MPI communication is skipped.
1746+ */
1747+ template <typename IFAB, typename FAB,
1748+ std::enable_if_t <IsBaseFab<FAB>::value && IsBaseFab<IFAB>::value,int > FOO = 0 >
1749+ typename FAB::value_type
1750+ Dot (FabArray<IFAB> const & mask, FabArray<FAB> const & x, int xcomp, int ncomp,
1751+ IntVect const & nghost, bool local = false )
1752+ {
1753+ BL_ASSERT (x.boxArray () == mask.boxArray ());
1754+ BL_ASSERT (x.DistributionMap () == mask.DistributionMap ());
1755+ BL_ASSERT (x.nGrowVect ().allGE (nghost) && mask.nGrowVect ().allGE (nghost));
1756+
1757+ BL_PROFILE (" amrex::Dot()" );
1758+
1759+ using T = typename FAB::value_type;
1760+ auto sm = T (0.0 );
1761+ #ifdef AMREX_USE_GPU
1762+ if (Gpu::inLaunchRegion ()) {
1763+ auto const & mma = mask.const_arrays ();
1764+ auto const & xma = x.const_arrays ();
1765+ sm = ParReduce (TypeList<ReduceOpSum>{}, TypeList<T>{}, x, nghost,
1766+ [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<T>
1767+ {
1768+ auto t = T (0.0 );
1769+ auto m = T (mma[box_no](i,j,k));
1770+ if (m != 0 ) {
1771+ auto const & xfab = xma[box_no];
1772+ for (int n = 0 ; n < ncomp; ++n) {
1773+ auto v = xfab (i,j,k,xcomp+n);
1774+ t += v*v;
1775+ }
1776+ }
1777+ return t*m;
1778+ });
1779+ } else
1780+ #endif
1781+ {
1782+ #ifdef AMREX_USE_OMP
1783+ #pragma omp parallel if (!system::regtest_reduction) reduction(+:sm)
1784+ #endif
1785+ for (MFIter mfi (x,true ); mfi.isValid (); ++mfi)
1786+ {
1787+ Box const & bx = mfi.growntilebox (nghost);
1788+ auto const & mfab = mask.const_array (mfi);
1789+ auto const & xfab = x.const_array (mfi);
1790+ AMREX_LOOP_4D (bx, ncomp, i, j, k, n,
1791+ {
1792+ auto m = T (mfab (i,j,k));
1793+ auto v = xfab (i,j,k,xcomp+n);
1794+ sm += m*v*v;
1795+ });
1796+ }
1797+ }
1798+
1799+ if (!local) {
1800+ ParallelAllReduce::Sum (sm, ParallelContext::CommunicatorSub ());
1801+ }
1802+
1803+ return sm;
1804+ }
1805+
16051806// ! dst = val
16061807template <class MF , std::enable_if_t <IsMultiFabLike_v<MF>,int > = 0 >
16071808void setVal (MF& dst, typename MF::value_type val)
0 commit comments