Skip to content

Commit f9ad2c6

Browse files
Improve usability of amrex::Gpu::Buffer (#4697)
## Summary

This PR expands the capabilities of `amrex::Gpu::Buffer<T>` so that it can be first default constructed, then modified by the CPU, and lastly copied over to the GPU, instead of everything happening in the constructor. Example:

```C++
amrex::Gpu::Buffer<int> buf;
buf.resize(n);
for (int i=0; i<n; ++i) {
    buf[i] = i*i;
}
buf.copyToDeviceAsync();
int * ptr = buf.data();
// Use ptr inside ParallelFor
// optional:
// Change values of ptr inside ParallelFor
buf.copyToHost();
// Use buf.hostData() or buf[] on the CPU
```

## Additional background

Follow-up to discussion in #4640
1 parent 36aa3e9 commit f9ad2c6

File tree

1 file changed

+122
-58
lines changed

1 file changed

+122
-58
lines changed

Src/Base/AMReX_GpuBuffer.H

Lines changed: 122 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <AMReX_Arena.H>
66
#include <AMReX_TypeTraits.H>
77
#include <AMReX_GpuDevice.H>
8+
#include <AMReX_GpuContainers.H>
89
#include <cstring>
910
#include <cstdlib>
1011
#include <initializer_list>
@@ -17,87 +18,150 @@ class Buffer
1718
{
1819
public:
1920

20-
Buffer (std::initializer_list<T> init)
21-
: m_size(init.size())
22-
{
23-
if (m_size == 0) { return; }
24-
#ifdef AMREX_USE_GPU
25-
h_data = static_cast<T*>(The_Pinned_Arena()->alloc(m_size*sizeof(T)));
26-
#else
27-
h_data = static_cast<T*>(std::malloc(m_size*sizeof(T)));
28-
#endif
29-
std::memcpy(h_data, init.begin(), m_size*sizeof(T));
30-
#ifdef AMREX_USE_GPU
31-
if (Gpu::inLaunchRegion())
32-
{
33-
d_data = static_cast<T*>(The_Arena()->alloc(m_size*sizeof(T)));
34-
Gpu::htod_memcpy_async(d_data, h_data, m_size*sizeof(T));
21+
Buffer (std::initializer_list<T> init) {
22+
resize(init.size());
23+
24+
if (init.size() > 0) {
25+
std::memcpy(h_vect.data(), init.begin(), init.size()*sizeof(T));
26+
copyToDeviceAsync();
3527
}
36-
#endif
3728
}
3829

39-
Buffer (T const* h_p, const std::size_t n)
40-
: m_size(n)
41-
{
42-
if (m_size == 0) { return; }
43-
#ifdef AMREX_USE_GPU
44-
h_data = static_cast<T*>(The_Pinned_Arena()->alloc(m_size*sizeof(T)));
45-
#else
46-
h_data = static_cast<T*>(std::malloc(m_size*sizeof(T)));
47-
#endif
48-
std::memcpy(h_data, h_p, m_size*sizeof(T));
49-
#ifdef AMREX_USE_GPU
50-
if (Gpu::inLaunchRegion())
51-
{
52-
d_data = static_cast<T*>(The_Arena()->alloc(m_size*sizeof(T)));
53-
Gpu::htod_memcpy_async(d_data, h_data, m_size*sizeof(T));
30+
Buffer (T const* h_p, const std::size_t n) {
31+
resize(n);
32+
33+
if (n > 0 && h_p != nullptr) {
34+
std::memcpy(h_vect.data(), h_p, n*sizeof(T));
35+
copyToDeviceAsync();
5436
}
55-
#endif
5637
}
5738

58-
~Buffer () { clear(); }
39+
Buffer (const std::size_t n) {
40+
resize(n);
41+
}
5942

60-
Buffer (Buffer const&) = delete;
61-
Buffer (Buffer &&) = delete;
62-
void operator= (Buffer const&) = delete;
63-
void operator= (Buffer &&) = delete;
43+
Buffer () = default;
6444

65-
[[nodiscard]] T const* data () const noexcept { return (d_data != nullptr) ? d_data : h_data; }
66-
[[nodiscard]] T* data () noexcept { return (d_data != nullptr) ? d_data : h_data; }
45+
[[nodiscard]] T const* data () const noexcept {
46+
return (useDVect() && !d_vect.empty()) ? d_vect.data() : h_vect.data();
47+
}
48+
[[nodiscard]] T* data () noexcept {
49+
return (useDVect() && !d_vect.empty()) ? d_vect.data() : h_vect.data();
50+
}
6751

68-
[[nodiscard]] T const* hostData () const noexcept { return h_data; }
69-
[[nodiscard]] T* hostData () noexcept { return h_data; }
52+
[[nodiscard]] T const* hostData () const noexcept { return h_vect.data(); }
53+
[[nodiscard]] T* hostData () noexcept { return h_vect.data(); }
54+
55+
/**
56+
* \brief Changes the value of an element of the host (CPU) vector.
57+
* Does not update the device (GPU) vector, so copyToDeviceAsync()
58+
* needs to be called before accessing the data on the GPU.
59+
* \code{.cpp}
60+
* amrex::Gpu::Buffer<int> buf;
61+
* buf.resize(n);
62+
* for (int i=0; i<n; ++i) {
63+
* buf[i] = i*i;
64+
* }
65+
* buf.copyToDeviceAsync();
66+
* int * ptr = buf.data();
67+
* // Use ptr inside ParallelFor
68+
* // optional:
69+
* // Change values of ptr inside ParallelFor
70+
* buf.copyToHost();
71+
* // Use buf.hostData() or buf[] on the CPU
72+
* \endcode
73+
*/
74+
[[nodiscard]] T& operator[] (const std::size_t i) noexcept {
75+
return h_vect[i];
76+
}
77+
78+
[[nodiscard]] const T& operator[] (const std::size_t i) const noexcept {
79+
return h_vect[i];
80+
}
7081

71-
[[nodiscard]] std::size_t size () const noexcept { return m_size; }
82+
[[nodiscard]] std::size_t size () const noexcept { return h_vect.size(); }
7283

73-
void clear ()
74-
{
84+
[[nodiscard]] bool empty () const noexcept { return h_vect.size() == 0; }
85+
86+
void resize (const std::size_t n) noexcept {
87+
h_vect.resize(n);
88+
if (useDVect()) {
89+
d_vect.resize(n);
90+
}
91+
}
92+
93+
void clear () noexcept {
94+
h_vect.clear();
95+
d_vect.clear();
96+
}
97+
98+
void shrink_to_fit () noexcept {
99+
h_vect.shrink_to_fit();
100+
d_vect.shrink_to_fit();
101+
}
102+
103+
void reserve (const std::size_t n) noexcept {
104+
h_vect.reserve(n);
105+
if (useDVect()) {
106+
d_vect.reserve(n);
107+
}
108+
}
109+
110+
/**
111+
* \brief Adds an element to the back of the host (CPU) vector.
112+
* Does not update the device (GPU) vector, so copyToDeviceAsync()
113+
* needs to be called before accessing the data on the GPU.
114+
* \code{.cpp}
115+
* amrex::Gpu::Buffer<int> buf;
116+
* buf.reserve(n);
117+
* for (int i=0; i<n; ++i) {
118+
* buf.push_back(i*i);
119+
* }
120+
* buf.copyToDeviceAsync();
121+
* int * ptr = buf.data();
122+
* // Use ptr inside ParallelFor
123+
* // optional:
124+
* // Change values of ptr inside ParallelFor
125+
* buf.copyToHost();
126+
* // Use buf.hostData() or buf[] on the CPU
127+
* \endcode
128+
*/
129+
void push_back (const T& value) noexcept {
130+
h_vect.push_back(value);
131+
}
132+
133+
T* copyToDeviceAsync () noexcept {
75134
#ifdef AMREX_USE_GPU
76-
if (d_data) { The_Arena()->free(d_data); }
77-
if (h_data) { The_Pinned_Arena()->free(h_data); }
78-
#else
79-
std::free(h_data);
135+
if (useDVect() && !h_vect.empty())
136+
{
137+
d_vect.resize(h_vect.size());
138+
Gpu::htod_memcpy_async(d_vect.data(), h_vect.data(), h_vect.size()*sizeof(T));
139+
return d_vect.data();
140+
}
80141
#endif
81-
d_data = nullptr;
82-
h_data = nullptr;
142+
return h_vect.data();
83143
}
84144

85-
T* copyToHost ()
86-
{
145+
T* copyToHost () noexcept {
87146
#ifdef AMREX_USE_GPU
88-
if (d_data)
147+
if (useDVect() && !d_vect.empty())
89148
{
90-
Gpu::dtoh_memcpy_async(h_data, d_data, m_size*sizeof(T));
149+
h_vect.resize(d_vect.size());
150+
Gpu::dtoh_memcpy_async(h_vect.data(), d_vect.data(), d_vect.size()*sizeof(T));
91151
Gpu::streamSynchronize();
92152
}
93153
#endif
94-
return h_data;
154+
return h_vect.data();
95155
}
96156

97157
private:
98-
std::size_t m_size;
99-
T* d_data = nullptr;
100-
T* h_data = nullptr;
158+
159+
[[nodiscard]] bool useDVect () const noexcept {
160+
return Gpu::inLaunchRegion() /* && !use_unified_gpu_memory */;
161+
}
162+
163+
DeviceVector<T> d_vect;
164+
PinnedVector<T> h_vect;
101165
};
102166

103167
}

0 commit comments

Comments
 (0)