
Commit cbda14e

Merge pull request #2070 from matthewdcong/master_develop_merge
Update NanoVDB with the latest round of changes
2 parents 62b95b4 + 556a326 commit cbda14e

18 files changed, +974 -364 lines


nanovdb/nanovdb/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -184,9 +184,11 @@ set(NANOVDB_INCLUDE_FILES
 set(NANOVDB_INCLUDE_CUDA_FILES
   cuda/DeviceBuffer.h
   cuda/DeviceMesh.h
+  cuda/DeviceResource.h
   cuda/DeviceStreamMap.h
   cuda/GridHandle.cuh
   cuda/NodeManager.cuh
+  cuda/TempPool.h
   cuda/UnifiedBuffer.h
 )

nanovdb/nanovdb/HostBuffer.h

Lines changed: 7 additions & 0 deletions
@@ -192,6 +192,13 @@ class HostBuffer
     void* data() { return mData; }
     //@}

+    /// @brief Returns an offset pointer of a specific type from the allocated host memory
+    /// @tparam T Type of the pointer returned
+    /// @param count Number of elements of type @c T to skip
+    /// @warning might return NULL
+    template <typename T>
+    T* data(ptrdiff_t count = 0) const {return mData ? reinterpret_cast<T*>(mData) + count : nullptr;}
+
     //@{
     /// @brief Returns the size in bytes associated with this buffer.
     uint64_t bufferSize() const { return mSize; }
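
The new overload saves a reinterpret_cast at every call site. A minimal usage sketch (buffer size and element type are illustrative, assuming the existing HostBuffer::create factory):

    nanovdb::HostBuffer buffer = nanovdb::HostBuffer::create(4096); // 4 KiB on the host
    float* begin = buffer.data<float>();    // pointer to the first float
    float* mid   = buffer.data<float>(512); // skip 512 floats, i.e. 2 KiB
    if (begin) begin[0] = 1.0f;             // data<T>() may return NULL, so check first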

nanovdb/nanovdb/cuda/DeviceBuffer.h

Lines changed: 58 additions & 20 deletions
@@ -41,7 +41,7 @@ class DeviceBuffer

     /// @brief Initialize buffer
     /// @param size byte size of buffer to be initialized
-    /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU
+    /// @param device id of the device on which to initialize the buffer
     /// @note All existing buffers are first cleared
     /// @warning size is expected to be non-zero. Use clear() to clear the buffer!
     void init(uint64_t size, int device, cudaStream_t stream);
@@ -127,6 +127,20 @@ class DeviceBuffer
         other.mSize = other.mDeviceCount = other.mManaged = 0;
     }

+    /// @brief Copy-constructor from a HostBuffer
+    /// @param buffer host buffer from which to copy data
+    /// @param device id of the device on which to initialize the buffer
+    /// @param stream cuda stream
+    DeviceBuffer(const HostBuffer& buffer, int device = cudaCpuDeviceId, cudaStream_t stream = 0)
+        : DeviceBuffer(buffer.size(), device, stream)
+    {
+        if (mCpuData) {
+            cudaCheck(cudaMemcpy(mCpuData, buffer.data(), mSize, cudaMemcpyHostToHost));
+        } else if (mGpuData[device]) {
+            cudaCheck(cudaMemcpyAsync(mGpuData[device], buffer.data(), mSize, cudaMemcpyHostToDevice, stream));
+        }
+    }
+
     /// @brief Destructor frees memory on both the host and device
     ~DeviceBuffer() { this->clear(); };
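
In practice this gives a one-call host-to-device copy. A minimal sketch, assuming a valid device 0 and a HostBuffer filled elsewhere (names are illustrative):

    cudaStream_t stream;
    cudaCheck(cudaStreamCreate(&stream));
    nanovdb::HostBuffer host = nanovdb::HostBuffer::create(1 << 20); // 1 MiB, filled elsewhere
    nanovdb::cuda::DeviceBuffer dev(host, 0, stream); // allocate on device 0 and schedule the copy
    cudaCheck(cudaStreamSynchronize(stream)); // the host-to-device copy is asynchronous
    cudaCheck(cudaStreamDestroy(stream));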

@@ -138,6 +152,11 @@ class DeviceBuffer
     /// @return An instance of this class using move semantics
     static DeviceBuffer create(uint64_t size, const DeviceBuffer* dummy, bool host, void* stream){return DeviceBuffer(size, host, stream);}

+    /// @brief Static factory method that returns an instance of this buffer
+    /// @param size byte size of buffer to be initialized
+    /// @param dummy this argument is currently ignored but required to match the API of the HostBuffer
+    /// @param device id of the device on which to initialize the buffer
+    /// @param stream cuda stream
     static DeviceBuffer create(uint64_t size, const DeviceBuffer* dummy = nullptr, int device = cudaCpuDeviceId, cudaStream_t stream = 0){return DeviceBuffer(size, device, stream);}

     /// @brief Static factory method that returns an instance of this buffer that wraps externally managed memory
@@ -153,13 +172,20 @@ class DeviceBuffer
     /// @param list list of device IDs and device memory pointers
     static DeviceBuffer create(uint64_t size, void* cpuData, std::initializer_list<std::pair<int,void*>> list) {return DeviceBuffer(size, cpuData, list);}

+    /// @brief Static factory method that returns an instance of this buffer constructed from a HostBuffer
+    /// @param buffer host buffer from which to copy data
+    /// @param device id of the device on which to initialize the buffer
+    /// @param stream cuda stream
+    static DeviceBuffer create(const HostBuffer& buffer, int device = cudaCpuDeviceId, cudaStream_t stream = 0) {return DeviceBuffer(buffer, device, stream);}
+
     ///////////////////////////////////////////////////////////////////////

     /// @{
     /// @brief Factory methods that create a shared pointer to a DeviceBuffer instance
     static PtrT createPtr(uint64_t size, const DeviceBuffer* = nullptr, int device = cudaCpuDeviceId, cudaStream_t stream = 0) {return std::make_shared<DeviceBuffer>(size, device, stream);}
     static PtrT createPtr(uint64_t size, void* cpuData, void* gpuData) {return std::make_shared<DeviceBuffer>(size, cpuData, gpuData);}
     static PtrT createPtr(uint64_t size, void* cpuData, std::initializer_list<std::pair<int,void*>> list) {return std::make_shared<DeviceBuffer>(size, cpuData, list);}
+    static PtrT createPtr(const HostBuffer& buffer, int device = cudaCpuDeviceId, cudaStream_t stream = 0) {return std::make_shared<DeviceBuffer>(buffer, device, stream);}
     /// @}

     ///////////////////////////////////////////////////////////////////////
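
The factory overloads mirror the new constructor; a short sketch continuing the example above (host and stream are illustrative):

    // Value-returning factory: a buffer on device 0 initialized from host.
    auto dev = nanovdb::cuda::DeviceBuffer::create(host, 0, stream);
    // Shared-pointer factory, for when several owners must share one buffer.
    auto devPtr = nanovdb::cuda::DeviceBuffer::createPtr(host, 0, stream);
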
@@ -168,18 +194,7 @@ class DeviceBuffer
     DeviceBuffer& operator=(const DeviceBuffer&) = delete;

     /// @brief Move copy assignment operation
-    DeviceBuffer& operator=(DeviceBuffer&& other) noexcept
-    {
-        mSize = other.mSize;
-        mCpuData = other.mCpuData;
-        delete [] mGpuData;
-        mGpuData = other.mGpuData;
-        mDeviceCount = other.mDeviceCount;
-        mManaged = other.mManaged;
-        other.mCpuData = other.mGpuData = nullptr;
-        other.mSize = other.mDeviceCount = other.mManaged = 0;
-        return *this;
-    }
+    DeviceBuffer& operator=(DeviceBuffer&& other) noexcept;

     ///////////////////////////////////////////////////////////////////////

@@ -190,7 +205,7 @@ class DeviceBuffer
     /// @brief Returns an offset pointer of a specific type from the allocated host memory
     /// @tparam T Type of the pointer returned
     /// @param count Number of elements of type @c T to skip
-    /// @warning assumes that this instance is not empty!
+    /// @warning might return NULL
     template <typename T>
     T* data(ptrdiff_t count = 0, int device = cudaCpuDeviceId) const
     {
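
Since data<T>() may now return NULL rather than assuming a non-empty instance, call sites should branch on the result. A sketch, assuming a DeviceBuffer dev with an allocation on device 0 (names are illustrative):

    float* d_ptr = dev.data<float>(0, 0);                // typed pointer on device 0
    float* h_ptr = dev.data<float>(0, cudaCpuDeviceId);  // pinned host mirror, NULL if absent
    if (h_ptr == nullptr) { /* no host copy of this buffer exists */ }
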
@@ -293,6 +308,26 @@ class DeviceBuffer

 // --------------------------> Implementations below <------------------------------------

+inline DeviceBuffer& DeviceBuffer::operator=(DeviceBuffer&& other) noexcept
+{
+    if (mManaged) {// first free all the managed data buffers
+        cudaCheck(cudaFreeHost(mCpuData));
+        for (int i=0; i<mDeviceCount; ++i) cudaCheck(util::cuda::freeAsync(mGpuData[i], 0));
+    }
+    delete [] mGpuData;
+    mSize = other.mSize;
+    mCpuData = other.mCpuData;
+    mGpuData = other.mGpuData;
+    mDeviceCount = other.mDeviceCount;
+    mManaged = other.mManaged;
+    other.mCpuData = nullptr;
+    other.mGpuData = nullptr;
+    other.mSize = 0;
+    other.mDeviceCount = 0;
+    other.mManaged = 0;
+    return *this;
+}
+
 inline void DeviceBuffer::init(uint64_t size, int device, cudaStream_t stream)
 {
     if (size==0) return;
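
Moving the definition out of line also fixes a leak: the old in-class version only deleted the mGpuData pointer array, so a managed destination leaked its pinned host block and its per-device allocations on assignment. The rewritten operator releases those first. The observable behavior, as a sketch (sizes are illustrative):

    nanovdb::cuda::DeviceBuffer a(1024, 0, stream); // owns a managed allocation on device 0
    nanovdb::cuda::DeviceBuffer b(2048, 0, stream);
    a = std::move(b); // a's old 1024-byte allocation is freed before a steals b's pointers
    // b is left empty: null pointers, zero size, zero device count, managed flag cleared
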
@@ -303,7 +338,7 @@ inline void DeviceBuffer::init(uint64_t size, int device, cudaStream_t stream)
         cudaCheck(cudaMallocHost((void**)&mCpuData, size)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned
         checkPtr(mCpuData, "cuda::DeviceBuffer::init: failed to allocate host buffer");
     } else {
-        cudaCheck(cudaMallocAsync(mGpuData+device, size, stream)); // un-managed memory on the device, always 32B aligned!
+        cudaCheck(util::cuda::mallocAsync(mGpuData+device, size, stream)); // un-managed memory on the device, always 32B aligned!
         checkPtr(mGpuData[device], "cuda::DeviceBuffer::init: failed to allocate device buffer");
     }
     mSize = size;
@@ -316,7 +351,7 @@ inline void DeviceBuffer::deviceUpload(int device, cudaStream_t stream, bool syn
     checkPtr(mCpuData, "uninitialized cpu source data");
     if (mGpuData[device] == nullptr) {
         if (mManaged==0) throw std::runtime_error("DeviceBuffer::deviceUpload called on externally managed memory that wasn\'t allocated.");
-        cudaCheck(cudaMallocAsync(mGpuData+device, mSize, stream)); // un-managed memory on the device, always 32B aligned!
+        cudaCheck(util::cuda::mallocAsync(mGpuData+device, mSize, stream)); // un-managed memory on the device, always 32B aligned!
     }
     checkPtr(mGpuData[device], "uninitialized gpu destination data");
     cudaCheck(cudaMemcpyAsync(mGpuData[device], mCpuData, mSize, cudaMemcpyHostToDevice, stream));
@@ -352,13 +387,16 @@ inline void DeviceBuffer::deviceDownload(void* stream, bool sync)

 inline void DeviceBuffer::clear(cudaStream_t stream)
 {
-    if (mManaged!=0) {// free all the managed data buffers
+    if (mManaged) {// free all the managed data buffers
         cudaCheck(cudaFreeHost(mCpuData));
-        for (int i=0; i<mDeviceCount; ++i) cudaCheck(cudaFreeAsync(mGpuData[i], stream));
+        for (int i=0; i<mDeviceCount; ++i) cudaCheck(util::cuda::freeAsync(mGpuData[i], stream));
     }
     delete [] mGpuData;
-    mCpuData = mGpuData = nullptr;
-    mSize = mDeviceCount = mManaged = 0;
+    mCpuData = nullptr;
+    mGpuData = nullptr;
+    mSize = 0;
+    mDeviceCount = 0;
+    mManaged = 0;
 } // DeviceBuffer::clear

 }// namespace cuda

nanovdb/nanovdb/cuda/DeviceResource.h

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+//
+#ifndef NANOVDB_CUDA_DEVICERESOURCE_H_HAS_BEEN_INCLUDED
+#define NANOVDB_CUDA_DEVICERESOURCE_H_HAS_BEEN_INCLUDED
+
+#include <cuda_runtime_api.h>
+#include <nanovdb/util/cuda/Util.h>
+
+namespace nanovdb {
+
+namespace cuda {
+
+class DeviceResource
+{
+public:
+    // cudaMalloc aligns memory to 256 bytes by default
+    static constexpr size_t DEFAULT_ALIGNMENT = 256;
+
+    static void* allocateAsync(size_t bytes, size_t, cudaStream_t stream) {
+        void* p = nullptr;
+        cudaCheck(util::cuda::mallocAsync(&p, bytes, stream));
+        return p;
+    }
+
+    static void deallocateAsync(void *p, size_t, size_t, cudaStream_t stream) {
+        cudaCheck(util::cuda::freeAsync(p, stream));
+    }
+};
+
+}
+
+} // namespace nanovdb::cuda
+
+#endif // end of NANOVDB_CUDA_DEVICERESOURCE_H_HAS_BEEN_INCLUDED
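
DeviceResource is a stateless allocation policy: the TempPool template later in this diff accepts any type exposing the same three static members. A hypothetical pinned-host variant, purely to illustrate the implicit interface (not part of this commit):

    // Sketch: allocate pinned host memory instead of device memory.
    // cudaMallocHost/cudaFreeHost are synchronous, so the stream arguments go unused.
    class PinnedHostResource
    {
    public:
        static constexpr size_t DEFAULT_ALIGNMENT = 256;
        static void* allocateAsync(size_t bytes, size_t, cudaStream_t) {
            void* p = nullptr;
            cudaCheck(cudaMallocHost(&p, bytes));
            return p;
        }
        static void deallocateAsync(void* p, size_t, size_t, cudaStream_t) {
            cudaCheck(cudaFreeHost(p));
        }
    };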

nanovdb/nanovdb/cuda/GridHandle.cuh

Lines changed: 4 additions & 1 deletion
@@ -34,7 +34,8 @@ __global__ void cpyGridHandleMeta(const GridData *d_data, GridHandleMetaData *d_
 __global__ void updateGridCount(GridData *d_data, uint32_t gridIndex, uint32_t gridCount, bool *d_dirty)
 {
     NANOVDB_ASSERT(gridIndex < gridCount);
-    if (*d_dirty = d_data->mGridIndex != gridIndex || d_data->mGridCount != gridCount) {
+    *d_dirty = (d_data->mGridIndex != gridIndex) || (d_data->mGridCount != gridCount);
+    if (*d_dirty) {
         d_data->mGridIndex = gridIndex;
         d_data->mGridCount = gridCount;
         if (d_data->mChecksum.isEmpty()) *d_dirty = false;// no need to update checksum if it didn't already exist
@@ -61,6 +62,7 @@ splitGridHandles(const GridHandle<BufferT> &handle, const BufferT* other = nullp
         updateGridCount<<<1, 1, 0, stream>>>(dst, 0u, 1u, d_dirty);
         cudaCheckError();
         cudaCheck(cudaMemcpyAsync(&dirty, d_dirty, sizeof(bool), cudaMemcpyDeviceToHost, stream));
+        cudaCheck(cudaStreamSynchronize(stream));
         if (dirty) tools::cuda::updateChecksum(dst, CheckMode::Partial, stream);
         handles[n] = nanovdb::GridHandle<BufferT>(std::move(buffer));
         ptr = util::PtrAdd(ptr, handle.gridSize(n));
@@ -93,6 +95,7 @@ mergeGridHandles(const VectorT<GridHandle<BufferT>> &handles, const BufferT* oth
         updateGridCount<<<1, 1, 0, stream>>>(data, counter++, gridCount, d_dirty);
         cudaCheckError();
         cudaCheck(cudaMemcpyAsync(&dirty, d_dirty, sizeof(bool), cudaMemcpyDeviceToHost, stream));
+        cudaCheck(cudaStreamSynchronize(stream));
         if (dirty) tools::cuda::updateChecksum(data, CheckMode::Partial, stream);
         dst = util::PtrAdd(dst, h.gridSize(n));
         src = util::PtrAdd(src, h.gridSize(n));
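
The two added cudaStreamSynchronize calls close a race: cudaMemcpyAsync only enqueues the device-to-host copy, so without a synchronization point the host could test dirty before the flag has actually arrived. Reduced to its essentials, the pattern is:

    bool dirty;    // host-side flag
    bool* d_dirty; // device-side flag, written by the updateGridCount kernel
    cudaCheck(cudaMemcpyAsync(&dirty, d_dirty, sizeof(bool), cudaMemcpyDeviceToHost, stream));
    cudaCheck(cudaStreamSynchronize(stream)); // without this, 'dirty' may still be stale
    if (dirty) { /* now safe to act on the flag */ }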

nanovdb/nanovdb/cuda/TempDevicePool.h

Lines changed: 0 additions & 49 deletions
This file was deleted.

nanovdb/nanovdb/cuda/TempPool.h

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+//
+#ifndef NANOVDB_CUDA_TEMPPOOL_H_HAS_BEEN_INCLUDED
+#define NANOVDB_CUDA_TEMPPOOL_H_HAS_BEEN_INCLUDED
+
+#include <nanovdb/cuda/DeviceResource.h>
+
+namespace nanovdb {
+
+namespace cuda {
+
+template <class Resource>
+class TempPool {
+public:
+    TempPool() : mData(nullptr), mSize(0), mRequestedSize(0) {}
+    ~TempPool() {
+        mRequestedSize = 0;
+        Resource::deallocateAsync(mData, mSize, Resource::DEFAULT_ALIGNMENT, nullptr);
+        mData = nullptr;
+        mSize = 0;
+    }
+
+    void* data() {
+        return mData;
+    }
+
+    size_t& size() {
+        return mSize;
+    }
+
+    size_t& requestedSize() {
+        return mRequestedSize;
+    }
+
+    void reallocate(cudaStream_t stream) {
+        if (!mData || mRequestedSize > mSize) {
+            Resource::deallocateAsync(mData, mSize, Resource::DEFAULT_ALIGNMENT, stream);
+            mData = Resource::allocateAsync(mRequestedSize, Resource::DEFAULT_ALIGNMENT, stream);
+            mSize = mRequestedSize;
+        }
+    }
+private:
+    void* mData;
+    size_t mSize;
+    size_t mRequestedSize;
+};
+
+using TempDevicePool = TempPool<DeviceResource>;
+
+}
+
+} // namespace nanovdb::cuda
+
+#endif // end of NANOVDB_CUDA_TEMPPOOL_H_HAS_BEEN_INCLUDED
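
TempPool generalizes the deleted TempDevicePool to an arbitrary allocation policy while keeping the two-phase idiom familiar from CUB: query the required scratch size with a null buffer, grow the pool only if the request exceeds the current capacity, then run the real pass. A sketch (d_in, d_out, n, and stream are illustrative; size() and requestedSize() return references precisely so a library can write into them):

    #include <cub/cub.cuh>
    #include <nanovdb/cuda/TempPool.h>

    nanovdb::cuda::TempDevicePool pool;
    // Pass 1: with a null temp pointer, CUB only reports the bytes it needs.
    cub::DeviceReduce::Sum(nullptr, pool.requestedSize(), d_in, d_out, n, stream);
    pool.reallocate(stream); // allocates (or grows) the scratch space
    // Pass 2: run the reduction using the pooled scratch space.
    cub::DeviceReduce::Sum(pool.data(), pool.size(), d_in, d_out, n, stream);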
