@@ -41,7 +41,7 @@ class DeviceBuffer
 
     /// @brief Initialize buffer
     /// @param size byte size of buffer to be initialized
-    /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU
+    /// @param device id of the device on which to initialize the buffer
     /// @note All existing buffers are first cleared
     /// @warning size is expected to be non-zero. Use clear() to clear the buffer!
     void init(uint64_t size, int device, cudaStream_t stream);
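The reworked `init()` takes a device id instead of a host flag. A minimal usage sketch (not part of the diff; it assumes the enclosing namespace is `nanovdb::cuda`, as suggested by the closing `}// namespace cuda` at the bottom of the file, a default constructor producing an empty buffer, and that device 0 exists):

```cpp
nanovdb::cuda::DeviceBuffer buffer;         // assumption: default-constructed, empty buffer
cudaStream_t stream = 0;                    // default CUDA stream
buffer.init(4096, /*device=*/0, stream);    // allocate 4 KiB directly on device 0
buffer.init(4096, cudaCpuDeviceId, stream); // or: pinned host allocation (existing buffers are cleared first)
```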
@@ -127,6 +127,20 @@ class DeviceBuffer
         other.mSize = other.mDeviceCount = other.mManaged = 0;
     }
 
+    /// @brief Copy-constructor from a HostBuffer
+    /// @param buffer host buffer from which to copy data
+    /// @param device id of the device on which to initialize the buffer
+    /// @param stream cuda stream
+    DeviceBuffer(const HostBuffer& buffer, int device = cudaCpuDeviceId, cudaStream_t stream = 0)
+        : DeviceBuffer(buffer.size(), device, stream)
+    {
+        if (mCpuData) {
+            cudaCheck(cudaMemcpy(mCpuData, buffer.data(), mSize, cudaMemcpyHostToHost));
+        } else if (mGpuData[device]) {
+            cudaCheck(cudaMemcpyAsync(mGpuData[device], buffer.data(), mSize, cudaMemcpyHostToDevice, stream));
+        }
+    }
+
     /// @brief Destructor frees memory on both the host and device
     ~DeviceBuffer() { this->clear(); };
 
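A hedged sketch of how the new HostBuffer copy-constructor might be used; the include paths and the `nanovdb::` prefix are assumptions, not taken from this diff:

```cpp
#include <nanovdb/HostBuffer.h>        // assumed location of HostBuffer
#include <nanovdb/cuda/DeviceBuffer.h> // assumed location of this header

void copyToDevice(const nanovdb::HostBuffer& host, cudaStream_t stream)
{
    // Deep-copies the host data onto device 0; with the default device = cudaCpuDeviceId
    // the data would instead be copied into pinned host memory.
    nanovdb::cuda::DeviceBuffer devBuffer(host, /*device=*/0, stream);
    cudaStreamSynchronize(stream); // the host-to-device memcpy above is asynchronous
}
```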
@@ -138,6 +152,11 @@ class DeviceBuffer
     /// @return An instance of this class using move semantics
     static DeviceBuffer create(uint64_t size, const DeviceBuffer* dummy, bool host, void* stream){return DeviceBuffer(size, host, stream);}
 
+    /// @brief Static factory method that returns an instance of this buffer
+    /// @param size byte size of buffer to be initialized
+    /// @param dummy this argument is currently ignored but required to match the API of the HostBuffer
+    /// @param device id of the device on which to initialize the buffer
+    /// @param stream cuda stream
     static DeviceBuffer create(uint64_t size, const DeviceBuffer* dummy = nullptr, int device = cudaCpuDeviceId, cudaStream_t stream = 0){return DeviceBuffer(size, device, stream);}
 
     /// @brief Static factory method that returns an instance of this buffer that wraps externally managed memory
@@ -153,13 +172,20 @@ class DeviceBuffer
     /// @param list list of device IDs and device memory pointers
     static DeviceBuffer create(uint64_t size, void* cpuData, std::initializer_list<std::pair<int,void*>> list) {return DeviceBuffer(size, cpuData, list);}
 
+    /// @brief Static factory method that returns an instance of this buffer constructed from a HostBuffer
+    /// @param buffer host buffer from which to copy data
+    /// @param device id of the device on which to initialize the buffer
+    /// @param stream cuda stream
+    static DeviceBuffer create(const HostBuffer& buffer, int device = cudaCpuDeviceId, cudaStream_t stream = 0) {return DeviceBuffer(buffer, device, stream);}
+
     /////////////////////////////////////////////////////////////////////
 
     /// @{
     /// @brief Factory methods that create a shared pointer to a DeviceBuffer instance
     static PtrT createPtr(uint64_t size, const DeviceBuffer* = nullptr, int device = cudaCpuDeviceId, cudaStream_t stream = 0) {return std::make_shared<DeviceBuffer>(size, device, stream);}
     static PtrT createPtr(uint64_t size, void* cpuData, void* gpuData) {return std::make_shared<DeviceBuffer>(size, cpuData, gpuData);}
     static PtrT createPtr(uint64_t size, void* cpuData, std::initializer_list<std::pair<int,void*>> list) {return std::make_shared<DeviceBuffer>(size, cpuData, list);}
+    static PtrT createPtr(const HostBuffer& buffer, int device = cudaCpuDeviceId, cudaStream_t stream = 0) {return std::make_shared<DeviceBuffer>(buffer, device, stream);}
     /// @}
 
     /////////////////////////////////////////////////////////////////////
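The matching factory overloads added above could be exercised as follows (sketch only; `PtrT` is assumed to be `std::shared_ptr<DeviceBuffer>` based on the `std::make_shared` calls):

```cpp
void makeBuffers(const nanovdb::HostBuffer& host, cudaStream_t stream)
{
    // By-value factory, returned via move semantics.
    auto byValue = nanovdb::cuda::DeviceBuffer::create(host, /*device=*/0, stream);
    // Shared-pointer factory.
    auto shared = nanovdb::cuda::DeviceBuffer::createPtr(host, /*device=*/0, stream);
}
```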
@@ -168,18 +194,7 @@ class DeviceBuffer
     DeviceBuffer& operator=(const DeviceBuffer&) = delete;
 
     /// @brief Move copy assignment operation
-    DeviceBuffer& operator=(DeviceBuffer&& other) noexcept
-    {
-        mSize = other.mSize;
-        mCpuData = other.mCpuData;
-        delete [] mGpuData;
-        mGpuData = other.mGpuData;
-        mDeviceCount = other.mDeviceCount;
-        mManaged = other.mManaged;
-        other.mCpuData = other.mGpuData = nullptr;
-        other.mSize = other.mDeviceCount = other.mManaged = 0;
-        return *this;
-    }
+    DeviceBuffer& operator=(DeviceBuffer&& other) noexcept;
 
     /////////////////////////////////////////////////////////////////////
 
@@ -190,7 +205,7 @@ class DeviceBuffer
     /// @brief Returns an offset pointer of a specific type from the allocated host or device memory
     /// @tparam T Type of the pointer returned
     /// @param count Number of elements of type @c T to skip
-    /// @warning assumes that this instance is not empty!
+    /// @warning might return NULL if no buffer has been allocated for the specified device
     template<typename T>
     T* data(ptrdiff_t count = 0, int device = cudaCpuDeviceId) const
     {
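Since `data()` may now return NULL, callers presumably need to check the pointer for the requested device; a sketch using the `deviceUpload()` signature that appears further down in this diff:

```cpp
float* devicePointer(nanovdb::cuda::DeviceBuffer& buffer, cudaStream_t stream)
{
    float* ptr = buffer.data<float>(/*count=*/0, /*device=*/0);
    if (ptr == nullptr) { // no allocation on device 0 yet
        buffer.deviceUpload(/*device=*/0, stream, /*sync=*/true); // mirror the host data
        ptr = buffer.data<float>(0, /*device=*/0);
    }
    return ptr;
}
```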
@@ -293,6 +308,26 @@ class DeviceBuffer
 
 // --------------------------> Implementations below <------------------------------------
 
+inline DeviceBuffer& DeviceBuffer::operator=(DeviceBuffer&& other) noexcept
+{
+    if (mManaged) {// first free all the managed data buffers
+        cudaCheck(cudaFreeHost(mCpuData));
+        for (int i=0; i<mDeviceCount; ++i) cudaCheck(util::cuda::freeAsync(mGpuData[i], 0));
+    }
+    delete [] mGpuData;
+    mSize = other.mSize;
+    mCpuData = other.mCpuData;
+    mGpuData = other.mGpuData;
+    mDeviceCount = other.mDeviceCount;
+    mManaged = other.mManaged;
+    other.mCpuData = nullptr;
+    other.mGpuData = nullptr;
+    other.mSize = 0;
+    other.mDeviceCount = 0;
+    other.mManaged = 0;
+    return *this;
+}
+
 inline void DeviceBuffer::init(uint64_t size, int device, cudaStream_t stream)
 {
     if (size==0) return;
@@ -303,7 +338,7 @@ inline void DeviceBuffer::init(uint64_t size, int device, cudaStream_t stream)
         cudaCheck(cudaMallocHost((void**)&mCpuData, size)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned
         checkPtr(mCpuData, "cuda::DeviceBuffer::init: failed to allocate host buffer");
     } else {
-        cudaCheck(cudaMallocAsync(mGpuData+device, size, stream)); // un-managed memory on the device, always 32B aligned!
+        cudaCheck(util::cuda::mallocAsync(mGpuData+device, size, stream)); // un-managed memory on the device, always 32B aligned!
         checkPtr(mGpuData[device], "cuda::DeviceBuffer::init: failed to allocate device buffer");
     }
     mSize = size;
@@ -316,7 +351,7 @@ inline void DeviceBuffer::deviceUpload(int device, cudaStream_t stream, bool syn
     checkPtr(mCpuData, "uninitialized cpu source data");
     if (mGpuData[device] == nullptr) {
         if (mManaged==0) throw std::runtime_error("DeviceBuffer::deviceUpload called on externally managed memory that wasn't allocated.");
-        cudaCheck(cudaMallocAsync(mGpuData+device, mSize, stream)); // un-managed memory on the device, always 32B aligned!
+        cudaCheck(util::cuda::mallocAsync(mGpuData+device, mSize, stream)); // un-managed memory on the device, always 32B aligned!
     }
     checkPtr(mGpuData[device], "uninitialized gpu destination data");
     cudaCheck(cudaMemcpyAsync(mGpuData[device], mCpuData, mSize, cudaMemcpyHostToDevice, stream));
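For context, a possible end-to-end flow through `create()`, `data()` and `deviceUpload()` (the fill loop and the 1 MiB size are illustrative assumptions, not taken from the diff):

```cpp
nanovdb::cuda::DeviceBuffer roundTrip(cudaStream_t stream)
{
    const uint64_t size = 1 << 20;                             // 1 MiB, illustrative
    auto buffer = nanovdb::cuda::DeviceBuffer::create(size);   // pinned host allocation (device defaults to cudaCpuDeviceId)
    uint8_t* h_ptr = buffer.data<uint8_t>();                   // host-side pointer
    for (uint64_t i = 0; i < size; ++i) h_ptr[i] = 0;          // fill on the CPU
    buffer.deviceUpload(/*device=*/0, stream, /*sync=*/true);  // mirror onto device 0 and synchronize
    return buffer;                                             // moved out; owns both the host and device copies
}
```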
@@ -352,13 +387,16 @@ inline void DeviceBuffer::deviceDownload(void* stream, bool sync)
 
 inline void DeviceBuffer::clear(cudaStream_t stream)
 {
-    if (mManaged != 0) {// free all the managed data buffers
+    if (mManaged) {// free all the managed data buffers
         cudaCheck(cudaFreeHost(mCpuData));
-        for (int i=0; i<mDeviceCount; ++i) cudaCheck(cudaFreeAsync(mGpuData[i], stream));
+        for (int i=0; i<mDeviceCount; ++i) cudaCheck(util::cuda::freeAsync(mGpuData[i], stream));
     }
     delete [] mGpuData;
-    mCpuData = mGpuData = nullptr;
-    mSize = mDeviceCount = mManaged = 0;
+    mCpuData = nullptr;
+    mGpuData = nullptr;
+    mSize = 0;
+    mDeviceCount = 0;
+    mManaged = 0;
 } // DeviceBuffer::clear
 
 }// namespace cuda