Skip to content

Commit f9ad2c6

Browse files
Improve usability of amrex::Gpu::Buffer (#4697)
## Summary

This PR expands the capabilities of `amrex::Gpu::Buffer<T>` so that it can be first default constructed, then modified by the CPU, and lastly copied over to the GPU, instead of everything happening in the constructor. Example:

```C++
amrex::Gpu::Buffer<int> buf;
buf.resize(n);
for (int i=0; i<n; ++i) {
    buf[i] = i*i;
}
buf.copyToDeviceAsync();
int * ptr = buf.data();
// Use ptr inside ParallelFor
// optional:
// Change values of ptr inside ParallelFor
buf.copyToHost();
// Use buf.hostData() or buf[] on the CPU
```

## Additional background

Follow-up to discussion in #4640
1 parent 36aa3e9 commit f9ad2c6

File tree

1 file changed

+122
-58
lines changed

1 file changed

+122
-58
lines changed

Src/Base/AMReX_GpuBuffer.H

Lines changed: 122 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <AMReX_Arena.H>
66
#include <AMReX_TypeTraits.H>
77
#include <AMReX_GpuDevice.H>
8+
#include <AMReX_GpuContainers.H>
89
#include <cstring>
910
#include <cstdlib>
1011
#include <initializer_list>
@@ -17,87 +18,150 @@ class Buffer
1718
{
1819
public:
1920

20-
Buffer (std::initializer_list<T> init)
21-
: m_size(init.size())
22-
{
23-
if (m_size == 0) { return; }
24-
#ifdef AMREX_USE_GPU
25-
h_data = static_cast<T*>(The_Pinned_Arena()->alloc(m_size*sizeof(T)));
26-
#else
27-
h_data = static_cast<T*>(std::malloc(m_size*sizeof(T)));
28-
#endif
29-
std::memcpy(h_data, init.begin(), m_size*sizeof(T));
30-
#ifdef AMREX_USE_GPU
31-
if (Gpu::inLaunchRegion())
32-
{
33-
d_data = static_cast<T*>(The_Arena()->alloc(m_size*sizeof(T)));
34-
Gpu::htod_memcpy_async(d_data, h_data, m_size*sizeof(T));
21+
Buffer (std::initializer_list<T> init) {
22+
resize(init.size());
23+
24+
if (init.size() > 0) {
25+
std::memcpy(h_vect.data(), init.begin(), init.size()*sizeof(T));
26+
copyToDeviceAsync();
3527
}
36-
#endif
3728
}
3829

39-
Buffer (T const* h_p, const std::size_t n)
40-
: m_size(n)
41-
{
42-
if (m_size == 0) { return; }
43-
#ifdef AMREX_USE_GPU
44-
h_data = static_cast<T*>(The_Pinned_Arena()->alloc(m_size*sizeof(T)));
45-
#else
46-
h_data = static_cast<T*>(std::malloc(m_size*sizeof(T)));
47-
#endif
48-
std::memcpy(h_data, h_p, m_size*sizeof(T));
49-
#ifdef AMREX_USE_GPU
50-
if (Gpu::inLaunchRegion())
51-
{
52-
d_data = static_cast<T*>(The_Arena()->alloc(m_size*sizeof(T)));
53-
Gpu::htod_memcpy_async(d_data, h_data, m_size*sizeof(T));
30+
Buffer (T const* h_p, const std::size_t n) {
31+
resize(n);
32+
33+
if (n > 0 && h_p != nullptr) {
34+
std::memcpy(h_vect.data(), h_p, n*sizeof(T));
35+
copyToDeviceAsync();
5436
}
55-
#endif
5637
}
5738

58-
~Buffer () { clear(); }
39+
Buffer (const std::size_t n) {
40+
resize(n);
41+
}
5942

60-
Buffer (Buffer const&) = delete;
61-
Buffer (Buffer &&) = delete;
62-
void operator= (Buffer const&) = delete;
63-
void operator= (Buffer &&) = delete;
43+
Buffer () = default;
6444

65-
[[nodiscard]] T const* data () const noexcept { return (d_data != nullptr) ? d_data : h_data; }
66-
[[nodiscard]] T* data () noexcept { return (d_data != nullptr) ? d_data : h_data; }
45+
[[nodiscard]] T const* data () const noexcept {
46+
return (useDVect() && !d_vect.empty()) ? d_vect.data() : h_vect.data();
47+
}
48+
[[nodiscard]] T* data () noexcept {
49+
return (useDVect() && !d_vect.empty()) ? d_vect.data() : h_vect.data();
50+
}
6751

68-
[[nodiscard]] T const* hostData () const noexcept { return h_data; }
69-
[[nodiscard]] T* hostData () noexcept { return h_data; }
52+
[[nodiscard]] T const* hostData () const noexcept { return h_vect.data(); }
53+
[[nodiscard]] T* hostData () noexcept { return h_vect.data(); }
54+
55+
/**
56+
* \brief Changes the value of an element of the host (CPU) vector.
57+
* Does not update the device (GPU) vector, so copyToDeviceAsync()
58+
* needs to be called before accessing the data on the GPU.
59+
* \code{.cpp}
60+
* amrex::Gpu::Buffer<int> buf;
61+
* buf.resize(n);
62+
* for (int i=0; i<n; ++i) {
63+
* buf[i] = i*i;
64+
* }
65+
* buf.copyToDeviceAsync();
66+
* int * ptr = buf.data();
67+
* // Use ptr inside ParallelFor
68+
* // optional:
69+
* // Change values of ptr inside ParallelFor
70+
* buf.copyToHost();
71+
* // Use buf.hostData() or buf[] on the CPU
72+
* \endcode
73+
*/
74+
[[nodiscard]] T& operator[] (const std::size_t i) noexcept {
75+
return h_vect[i];
76+
}
77+
78+
[[nodiscard]] const T& operator[] (const std::size_t i) const noexcept {
79+
return h_vect[i];
80+
}
7081

71-
[[nodiscard]] std::size_t size () const noexcept { return m_size; }
82+
[[nodiscard]] std::size_t size () const noexcept { return h_vect.size(); }
7283

73-
void clear ()
74-
{
84+
[[nodiscard]] bool empty () const noexcept { return h_vect.size() == 0; }
85+
86+
void resize (const std::size_t n) noexcept {
87+
h_vect.resize(n);
88+
if (useDVect()) {
89+
d_vect.resize(n);
90+
}
91+
}
92+
93+
void clear () noexcept {
94+
h_vect.clear();
95+
d_vect.clear();
96+
}
97+
98+
void shrink_to_fit () noexcept {
99+
h_vect.shrink_to_fit();
100+
d_vect.shrink_to_fit();
101+
}
102+
103+
void reserve (const std::size_t n) noexcept {
104+
h_vect.reserve(n);
105+
if (useDVect()) {
106+
d_vect.reserve(n);
107+
}
108+
}
109+
110+
/**
111+
* \brief Adds an element to the back of the host (CPU) vector.
112+
* Does not update the device (GPU) vector, so copyToDeviceAsync()
113+
* needs to be called before accessing the data on the GPU.
114+
* \code{.cpp}
115+
* amrex::Gpu::Buffer<int> buf;
116+
* buf.reserve(n);
117+
* for (int i=0; i<n; ++i) {
118+
* buf.push_back(i*i);
119+
* }
120+
* buf.copyToDeviceAsync();
121+
* int * ptr = buf.data();
122+
* // Use ptr inside ParallelFor
123+
* // optional:
124+
* // Change values of ptr inside ParallelFor
125+
* buf.copyToHost();
126+
* // Use buf.hostData() or buf[] on the CPU
127+
* \endcode
128+
*/
129+
void push_back (const T& value) noexcept {
130+
h_vect.push_back(value);
131+
}
132+
133+
T* copyToDeviceAsync () noexcept {
75134
#ifdef AMREX_USE_GPU
76-
if (d_data) { The_Arena()->free(d_data); }
77-
if (h_data) { The_Pinned_Arena()->free(h_data); }
78-
#else
79-
std::free(h_data);
135+
if (useDVect() && !h_vect.empty())
136+
{
137+
d_vect.resize(h_vect.size());
138+
Gpu::htod_memcpy_async(d_vect.data(), h_vect.data(), h_vect.size()*sizeof(T));
139+
return d_vect.data();
140+
}
80141
#endif
81-
d_data = nullptr;
82-
h_data = nullptr;
142+
return h_vect.data();
83143
}
84144

85-
T* copyToHost ()
86-
{
145+
T* copyToHost () noexcept {
87146
#ifdef AMREX_USE_GPU
88-
if (d_data)
147+
if (useDVect() && !d_vect.empty())
89148
{
90-
Gpu::dtoh_memcpy_async(h_data, d_data, m_size*sizeof(T));
149+
h_vect.resize(d_vect.size());
150+
Gpu::dtoh_memcpy_async(h_vect.data(), d_vect.data(), d_vect.size()*sizeof(T));
91151
Gpu::streamSynchronize();
92152
}
93153
#endif
94-
return h_data;
154+
return h_vect.data();
95155
}
96156

97157
private:
98-
std::size_t m_size;
99-
T* d_data = nullptr;
100-
T* h_data = nullptr;
158+
159+
[[nodiscard]] bool useDVect () const noexcept {
160+
return Gpu::inLaunchRegion() /* && !use_unified_gpu_memory */;
161+
}
162+
163+
DeviceVector<T> d_vect;
164+
PinnedVector<T> h_vect;
101165
};
102166

103167
}

0 commit comments

Comments
 (0)