Fix GPU communication for non-arithmetic types (#4515)

WeiqunZhang · web-flow · commit a21af25a11ca · 2025-06-21T15:43:29.000-07:00
Tested on Nvidia GTX 1060, GV100, V100, A100 and H100, AMD MI250X and
MI300A, and Intel PVC.
diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H
diff --git a/Src/Base/AMReX_FabArrayUtility.H b/Src/Base/AMReX_FabArrayUtility.H
@@ -2123,6 +2123,76 @@ DistributionMap (Array<MF,N> const& mf)
     return mf[0].DistributionMap();
 }
 
+/**
+ * \brief Return a mask indicating how many duplicates are in each point
+ *        (1 where covered once, plus 1 per other overlapping, possibly shifted, box).
+ * \param fa     input FabArray (only its BoxArray and DistributionMapping are used)
+ * \param nghost number of ghost cells included in counting
+ * \param period periodicity
+ */
+template <class FAB>
+FabArray<BaseFab<int>>
+OverlapMask (FabArray<FAB> const& fa, IntVect const& nghost, Periodicity const& period)
+{
+    BL_PROFILE("OverlapMask()");
+
+    const BoxArray& ba = fa.boxArray();
+    const DistributionMapping& dm = fa.DistributionMap();
+
+    FabArray<BaseFab<int>> mask(ba, dm, 1, nghost);
+    mask.setVal(1); // every point starts with a count of 1
+
+    const std::vector<IntVect>& pshifts = period.shiftIntVect();
+
+    Vector<Array4BoxTag<int> > tags;
+
+    bool run_on_gpu = Gpu::inLaunchRegion();
+    amrex::ignore_unused(run_on_gpu, tags);
+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (!run_on_gpu)
+#endif
+    {
+        std::vector< std::pair<int,Box> > isects;
+
+        for (MFIter mfi(mask); mfi.isValid(); ++mfi)
+        {
+            const Box& bx = mask[mfi].box();
+            auto const& arr = mask.array(mfi);
+
+            for (const auto& iv : pshifts)
+            {
+                ba.intersections(bx+iv, isects, false, nghost);
+                for (const auto& is : isects)
+                {
+                    Box const& b = is.second-iv; // shift the intersection back into this fab's index space
+                    if (iv == 0 && b == bx) { continue; } // skip the unshifted intersection that equals this fab's own box
+#ifdef AMREX_USE_GPU
+                    if (run_on_gpu) {
+                        tags.push_back({arr,b}); // increment is deferred to the ParallelFor over tags below
+                    } else
+#endif
+                    {
+                        amrex::LoopConcurrentOnCpu(b, [=] (int i, int j, int k) noexcept
+                        {
+                            arr(i,j,k) += 1;
+                        });
+                    }
+                }
+            }
+        }
+    }
+
+#ifdef AMREX_USE_GPU
+    amrex::ParallelFor(tags, 1,
+    [=] AMREX_GPU_DEVICE (int i, int j, int k, int n, Array4BoxTag<int> const& tag) noexcept
+    {
+        Gpu::Atomic::AddNoRet(tag.dfab.ptr(i,j,k,n), 1);
+    });
+#endif
+
+    return mask;
+}
+
 }
 
 #endif
diff --git a/Src/Base/AMReX_MultiFab.cpp b/Src/Base/AMReX_MultiFab.cpp
@@ -1529,8 +1529,8 @@ MultiFab::OverlapMask (const Periodicity& period) const
     amrex::ParallelFor(tags, 1,
     [=] AMREX_GPU_DEVICE (int i, int j, int k, int n, Array4BoxTag<Real> const& tag) noexcept
     {
-        Real* p = tag.dfab.ptr(i,j,k,n);
-        Gpu::Atomic::AddNoRet(p, Real(1.0));
+        Real* ptr = tag.dfab.ptr(i,j,k,n);
+        Gpu::Atomic::AddNoRet(ptr, Real(1.0));
     });
 #endif
 
diff --git a/Src/Base/AMReX_PCI.H b/Src/Base/AMReX_PCI.H
@@ -97,13 +97,15 @@ FabArray<FAB>::PC_local_gpu (const CPC& thecpc, FabArray<FAB> const& src,
     loc_copy_tags.reserve(N_locs);
 
     Vector<BaseFab<int> > maskfabs;
+    Vector<Array4Tag<int> > masks_unique;
     Vector<Array4Tag<int> > masks;
     if (!is_thread_safe)
     {
         if ((op == FabArrayBase::COPY && !amrex::IsStoreAtomic<value_type>::value) ||
             (op == FabArrayBase::ADD  && !amrex::HasAtomicAdd <value_type>::value))
         {
             maskfabs.resize(this->local_size());
+            masks_unique.reserve(this->local_size());
             masks.reserve(N_locs);
         }
     }
@@ -122,14 +124,15 @@ FabArray<FAB>::PC_local_gpu (const CPC& thecpc, FabArray<FAB> const& src,
             if (maskfabs.size() > 0) {
                 if (!maskfabs[li].isAllocated()) {
                     maskfabs[li].resize(this->atLocalIdx(li).box());
+                    masks_unique.emplace_back(Array4Tag<int>{maskfabs[li].array()});
                 }
                 masks.emplace_back(Array4Tag<int>{maskfabs[li].array()});
             }
         }
     }
 
     if (maskfabs.size() > 0) {
-        amrex::ParallelFor(masks,
+        amrex::ParallelFor(masks_unique,
         [=] AMREX_GPU_DEVICE (int i, int j, int k, Array4Tag<int> const& msk) noexcept
         {
             msk.dfab(i,j,k) = 0;
diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt
@@ -125,7 +125,7 @@ else()
    #
    # List of subdirectories to search for CMakeLists.
    #
-   set( AMREX_TESTS_SUBDIRS Amr AsyncOut CLZ CTOParFor DeviceGlobal Enum
+   set( AMREX_TESTS_SUBDIRS Amr AsyncOut CLZ CommType CTOParFor DeviceGlobal Enum
                             MultiBlock MultiPeriod ParmParse Parser Parser2 ParserUserFn Reinit
                             RoundoffDomain SmallMatrix)
 
diff --git a/Tests/CommType/CMakeLists.txt b/Tests/CommType/CMakeLists.txt
@@ -0,0 +1,9 @@
+foreach(D IN LISTS AMReX_SPACEDIM)
+    set(_sources     main.cpp)
+    set(_input_files)
+
+    setup_test(${D} _sources _input_files)
+
+    unset(_sources)
+    unset(_input_files)
+endforeach()
diff --git a/Tests/CommType/GNUmakefile b/Tests/CommType/GNUmakefile
@@ -0,0 +1,22 @@
+AMREX_HOME = ../../
+
+DEBUG	= FALSE
+DIM	= 3
+COMP    = gcc
+
+USE_MPI   = TRUE
+USE_OMP   = FALSE
+USE_CUDA  = FALSE
+USE_HIP   = FALSE
+USE_SYCL  = FALSE
+
+BL_NO_FORT = TRUE
+
+TINY_PROFILE = FALSE
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+
+include ./Make.package
+include $(AMREX_HOME)/Src/Base/Make.package
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.rules
diff --git a/Tests/CommType/Make.package b/Tests/CommType/Make.package
@@ -0,0 +1 @@
+CEXE_sources += main.cpp
diff --git a/Tests/CommType/main.cpp b/Tests/CommType/main.cpp
@@ -0,0 +1,168 @@
+#include <AMReX.H>
+#include <AMReX_Print.H>
+#include <AMReX_MultiFab.H>
+#include <AMReX_GpuComplex.H>
+
+using namespace amrex;
+
+int main(int argc, char* argv[])
+{
+    amrex::Initialize(argc,argv);
+
+    int ret_code = EXIT_SUCCESS;
+
+    {
+        int ncells = 128;
+        BoxArray ba(Box(IntVect(0), IntVect(ncells-1)));
+        ba.maxSize(32);
+        ba.convert(IntVect(1));
+        DistributionMapping dm(ba);
+
+        constexpr int ncomp = 2;
+        IntVect nghost(2);
+        Periodicity period{IntVect(ncells)};
+
+        auto value = [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) -> Real // reference value at (i,j,k,n); indices outside [0,ncells) are wrapped by ncells
+        {
+            if (i < 0) {
+                i += ncells;
+            } else if (i >= ncells) {
+                i -= ncells;
+            }
+            if (j < 0) {
+                j += ncells;
+            } else if (j >= ncells) {
+                j -= ncells;
+            }
+            if (k < 0) {
+                k += ncells;
+            } else if (k >= ncells) {
+                k -= ncells;
+            }
+            return n + i*ncomp + j*ncomp*ncells + k*ncomp*ncells*ncells;
+        };
+
+        // Test GpuArray: check FillBoundary (fa), ParallelCopy (fa2) and ParallelAdd (fa3) against value()
+        {
+            using T = GpuArray<Real,ncomp>;
+            FabArray<BaseFab<T>> fa(ba,dm,1,nghost);
+            FabArray<BaseFab<T>> fa2(ba,dm,1,nghost);
+            FabArray<BaseFab<T>> fa3(ba,dm,1,nghost);
+            auto const& ma = fa.arrays();
+            auto const& ma2 = fa2.arrays();
+            auto const& ma3 = fa3.arrays();
+
+            ParallelFor(fa, IntVect(0),
+                        [=] AMREX_GPU_DEVICE (int b, int i, int j, int k)
+            {
+                auto const& a = ma[b];
+                for (int n = 0; n < ncomp; ++n) {
+                    a(i,j,k)[n] = value(i,j,k,n);
+                }
+            });
+
+            fa.FillBoundary(period);
+
+            fa2.ParallelCopy(fa, 0, 0, 1, IntVect(0), nghost, period);
+
+            fa3.setVal(T{});
+            fa3.ParallelAdd(fa, 0, 0, 1, nghost, nghost, period);
+
+            auto mask = OverlapMask(fa3,nghost,period);
+            auto const& mma = mask.const_arrays();
+
+            auto err = ParReduce(TypeList<ReduceOpMax,ReduceOpMax,ReduceOpMax>{},
+                                 TypeList<Real,Real,Real>{},
+                                 fa, nghost,
+            [=] AMREX_GPU_DEVICE (int b, int i, int j, int k)
+                                 -> GpuTuple<Real,Real,Real>
+            {
+                Real r1 = 0, r2 = 0, r3 = 0;
+                auto const& a1 = ma[b];
+                auto const& a2 = ma2[b];
+                auto const& a3 = ma3[b];
+                auto const& m = mma[b];
+                for (int n = 0; n < ncomp; ++n) {
+                    auto v = value(i,j,k,n);
+                    r1 = std::max(r1, std::abs(a1(i,j,k)[n] - v));
+                    r2 = std::max(r2, std::abs(a2(i,j,k)[n] - v));
+                    r3 = std::max(r3, std::abs(a3(i,j,k)[n] - v*m(i,j,k))); // fa3 is compared against value scaled by the overlap mask
+                }
+                return {r1, r2, r3};
+            });
+
+            AMREX_ALWAYS_ASSERT(amrex::get<0>(err) == 0);
+            AMREX_ALWAYS_ASSERT(amrex::get<1>(err) == 0);
+            AMREX_ALWAYS_ASSERT(amrex::get<2>(err) == 0);
+
+            Real errmax = std::max({amrex::get<0>(err),
+                                    amrex::get<1>(err),
+                                    amrex::get<2>(err)});
+            ParallelDescriptor::ReduceRealSum(errmax);
+            if (errmax != 0) {
+                ret_code = EXIT_FAILURE;
+            }
+        }
+
+        // Test GpuComplex: same three checks, with real part value(..,0) and imaginary part value(..,1)
+        {
+            using T = GpuComplex<Real>;
+            FabArray<BaseFab<T>> fa(ba,dm,1,nghost);
+            FabArray<BaseFab<T>> fa2(ba,dm,1,nghost);
+            FabArray<BaseFab<T>> fa3(ba,dm,1,nghost);
+            auto const& ma = fa.arrays();
+            auto const& ma2 = fa2.arrays();
+            auto const& ma3 = fa3.arrays();
+
+            ParallelFor(fa, IntVect(0),
+                        [=] AMREX_GPU_DEVICE (int b, int i, int j, int k)
+            {
+                auto const& a = ma[b];
+                a(i,j,k) = T{value(i,j,k,0),value(i,j,k,1)};
+            });
+
+            fa.FillBoundary(period);
+
+            fa2.ParallelCopy(fa, 0, 0, 1, IntVect(0), nghost, period);
+
+            fa3.setVal(T{});
+            fa3.ParallelAdd(fa, 0, 0, 1, nghost, nghost, period);
+
+            auto mask = OverlapMask(fa3,nghost,period);
+            auto const& mma = mask.const_arrays();
+
+            auto err = ParReduce(TypeList<ReduceOpMax,ReduceOpMax,ReduceOpMax>{},
+                                 TypeList<Real,Real,Real>{},
+                                 fa, nghost,
+            [=] AMREX_GPU_DEVICE (int b, int i, int j, int k)
+                                 -> GpuTuple<Real,Real,Real>
+            {
+                Real r1 = 0, r2 = 0, r3 = 0;
+                auto const& a1 = ma[b];
+                auto const& a2 = ma2[b];
+                auto const& a3 = ma3[b];
+                auto const& m = mma[b];
+                auto v = GpuComplex{value(i,j,k,0), value(i,j,k,1)};
+                r1 = std::max(r1, amrex::norm(a1(i,j,k) - v));
+                r2 = std::max(r2, amrex::norm(a2(i,j,k) - v));
+                r3 = std::max(r3, amrex::norm(a3(i,j,k) - v*Real(m(i,j,k)))); // fa3 is compared against value scaled by the overlap mask
+                return {r1, r2, r3};
+            });
+
+            AMREX_ALWAYS_ASSERT(amrex::get<0>(err) == 0);
+            AMREX_ALWAYS_ASSERT(amrex::get<1>(err) == 0);
+            AMREX_ALWAYS_ASSERT(amrex::get<2>(err) == 0);
+
+            Real errmax = std::max({amrex::get<0>(err),
+                                    amrex::get<1>(err),
+                                    amrex::get<2>(err)});
+            ParallelDescriptor::ReduceRealSum(errmax);
+            if (errmax != 0) {
+                ret_code = EXIT_FAILURE;
+            }
+        }
+    }
+    amrex::Finalize();
+
+    return ret_code;
+}

Original file line number	Diff line number	Diff line change
`@@ -97,13 +97,15 @@ FabArray<FAB>::PC_local_gpu (const CPC& thecpc, FabArray<FAB> const& src,`
`97`	`97`	`loc_copy_tags.reserve(N_locs);`
`98`	`98`
`99`	`99`	`Vector<BaseFab<int> > maskfabs;`
	`100`	`+ Vector<Array4Tag<int> > masks_unique;`
`100`	`101`	`Vector<Array4Tag<int> > masks;`
`101`	`102`	`if (!is_thread_safe)`
`102`	`103`	`{`
`103`	`104`	`if ((op == FabArrayBase::COPY && !amrex::IsStoreAtomic<value_type>::value) \|\|`
`104`	`105`	`(op == FabArrayBase::ADD && !amrex::HasAtomicAdd <value_type>::value))`
`105`	`106`	`{`
`106`	`107`	`maskfabs.resize(this->local_size());`
	`108`	`+ masks_unique.reserve(this->local_size());`
`107`	`109`	`masks.reserve(N_locs);`
`108`	`110`	`}`
`109`	`111`	`}`
`@@ -122,14 +124,15 @@ FabArray<FAB>::PC_local_gpu (const CPC& thecpc, FabArray<FAB> const& src,`
`122`	`124`	`if (maskfabs.size() > 0) {`
`123`	`125`	`if (!maskfabs[li].isAllocated()) {`
`124`	`126`	`maskfabs[li].resize(this->atLocalIdx(li).box());`
	`127`	`+ masks_unique.emplace_back(Array4Tag<int>{maskfabs[li].array()});`
`125`	`128`	`}`
`126`	`129`	`masks.emplace_back(Array4Tag<int>{maskfabs[li].array()});`
`127`	`130`	`}`
`128`	`131`	`}`
`129`	`132`	`}`
`130`	`133`
`131`	`134`	`if (maskfabs.size() > 0) {`
`132`		`- amrex::ParallelFor(masks,`
	`135`	`+ amrex::ParallelFor(masks_unique,`
`133`	`136`	`[=] AMREX_GPU_DEVICE (int i, int j, int k, Array4Tag<int> const& msk) noexcept`
`134`	`137`	`{`
`135`	`138`	`msk.dfab(i,j,k) = 0;`
Original file line number	Diff line number	Diff line change
`@@ -125,7 +125,7 @@ else()`
`125`	`125`	`#`
`126`	`126`	`# List of subdirectories to search for CMakeLists.`
`127`	`127`	`#`
`128`		`- set( AMREX_TESTS_SUBDIRS Amr AsyncOut CLZ CTOParFor DeviceGlobal Enum`
	`128`	`+ set( AMREX_TESTS_SUBDIRS Amr AsyncOut CLZ CommType CTOParFor DeviceGlobal Enum`
`129`	`129`	`MultiBlock MultiPeriod ParmParse Parser Parser2 ParserUserFn Reinit`
`130`	`130`	`RoundoffDomain SmallMatrix)`
`131`	`131`