
Commit cbda14e

Merge pull request #2070 from matthewdcong/master_develop_merge
Update NanoVDB with the latest round of changes
2 parents 62b95b4 + 556a326 commit cbda14e

18 files changed, +974 -364 lines


nanovdb/nanovdb/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -184,9 +184,11 @@ set(NANOVDB_INCLUDE_FILES
 set(NANOVDB_INCLUDE_CUDA_FILES
   cuda/DeviceBuffer.h
   cuda/DeviceMesh.h
+  cuda/DeviceResource.h
   cuda/DeviceStreamMap.h
   cuda/GridHandle.cuh
   cuda/NodeManager.cuh
+  cuda/TempPool.h
   cuda/UnifiedBuffer.h
 )

nanovdb/nanovdb/HostBuffer.h

Lines changed: 7 additions & 0 deletions
@@ -192,6 +192,13 @@ class HostBuffer
     void* data() { return mData; }
     //@}

+    /// @brief Returns an offset pointer of a specific type from the allocated host memory
+    /// @tparam T Type of the pointer returned
+    /// @param count Number of elements of type @c T to skip
+    /// @warning might return NULL
+    template <typename T>
+    T* data(ptrdiff_t count = 0) const {return mData ? reinterpret_cast<T*>(mData) + count : nullptr;}
+
     //@{
     /// @brief Returns the size in bytes associated with this buffer.
     uint64_t bufferSize() const { return mSize; }
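
The new overload saves a reinterpret_cast at every call site. A minimal usage sketch (buffer size and element type are illustrative, assuming the existing HostBuffer::create factory):

    nanovdb::HostBuffer buffer = nanovdb::HostBuffer::create(4096); // 4 KiB on the host
    float* begin = buffer.data<float>();    // pointer to the first float
    float* mid   = buffer.data<float>(512); // skip 512 floats, i.e. 2 KiB
    if (begin) begin[0] = 1.0f;             // data<T>() may return NULL, so check first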

nanovdb/nanovdb/cuda/DeviceBuffer.h

Lines changed: 58 additions & 20 deletions
@@ -41,7 +41,7 @@ class DeviceBuffer

     /// @brief Initialize buffer
     /// @param size byte size of buffer to be initialized
-    /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU
+    /// @param device id of the device on which to initialize the buffer
     /// @note All existing buffers are first cleared
     /// @warning size is expected to be non-zero. Use clear() to clear the buffer!
     void init(uint64_t size, int device, cudaStream_t stream);
@@ -127,6 +127,20 @@ class DeviceBuffer
         other.mSize = other.mDeviceCount = other.mManaged = 0;
     }

+    /// @brief Copy-constructor from a HostBuffer
+    /// @param buffer host buffer from which to copy data
+    /// @param device id of the device on which to initialize the buffer
+    /// @param stream cuda stream
+    DeviceBuffer(const HostBuffer& buffer, int device = cudaCpuDeviceId, cudaStream_t stream = 0)
+        : DeviceBuffer(buffer.size(), device, stream)
+    {
+        if (mCpuData) {
+            cudaCheck(cudaMemcpy(mCpuData, buffer.data(), mSize, cudaMemcpyHostToHost));
+        } else if (mGpuData[device]) {
+            cudaCheck(cudaMemcpyAsync(mGpuData[device], buffer.data(), mSize, cudaMemcpyHostToDevice, stream));
+        }
+    }
+
     /// @brief Destructor frees memory on both the host and device
     ~DeviceBuffer() { this->clear(); };
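
In practice this gives a one-call host-to-device copy. A minimal sketch, assuming a valid device 0 and a HostBuffer filled elsewhere (names are illustrative):

    cudaStream_t stream;
    cudaCheck(cudaStreamCreate(&stream));
    nanovdb::HostBuffer host = nanovdb::HostBuffer::create(1 << 20); // 1 MiB, filled elsewhere
    nanovdb::cuda::DeviceBuffer dev(host, 0, stream); // allocate on device 0 and schedule the copy
    cudaCheck(cudaStreamSynchronize(stream)); // the host-to-device copy is asynchronous
    cudaCheck(cudaStreamDestroy(stream));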

@@ -138,6 +152,11 @@ class DeviceBuffer
     /// @return An instance of this class using move semantics
     static DeviceBuffer create(uint64_t size, const DeviceBuffer* dummy, bool host, void* stream){return DeviceBuffer(size, host, stream);}

+    /// @brief Static factory method that returns an instance of this buffer
+    /// @param size byte size of buffer to be initialized
+    /// @param dummy this argument is currently ignored but required to match the API of the HostBuffer
+    /// @param device id of the device on which to initialize the buffer
+    /// @param stream cuda stream
     static DeviceBuffer create(uint64_t size, const DeviceBuffer* dummy = nullptr, int device = cudaCpuDeviceId, cudaStream_t stream = 0){return DeviceBuffer(size, device, stream);}

     /// @brief Static factory method that returns an instance of this buffer that wraps externally managed memory
@@ -153,13 +172,20 @@ class DeviceBuffer
     /// @param list list of device IDs and device memory pointers
     static DeviceBuffer create(uint64_t size, void* cpuData, std::initializer_list<std::pair<int,void*>> list) {return DeviceBuffer(size, cpuData, list);}

+    /// @brief Static factory method that returns an instance of this buffer constructed from a HostBuffer
+    /// @param buffer host buffer from which to copy data
+    /// @param device id of the device on which to initialize the buffer
+    /// @param stream cuda stream
+    static DeviceBuffer create(const HostBuffer& buffer, int device = cudaCpuDeviceId, cudaStream_t stream = 0) {return DeviceBuffer(buffer, device, stream);}
+
     ///////////////////////////////////////////////////////////////////////

     /// @{
     /// @brief Factory methods that create a shared pointer to a DeviceBuffer instance
     static PtrT createPtr(uint64_t size, const DeviceBuffer* = nullptr, int device = cudaCpuDeviceId, cudaStream_t stream = 0) {return std::make_shared<DeviceBuffer>(size, device, stream);}
     static PtrT createPtr(uint64_t size, void* cpuData, void* gpuData) {return std::make_shared<DeviceBuffer>(size, cpuData, gpuData);}
     static PtrT createPtr(uint64_t size, void* cpuData, std::initializer_list<std::pair<int,void*>> list) {return std::make_shared<DeviceBuffer>(size, cpuData, list);}
+    static PtrT createPtr(const HostBuffer& buffer, int device = cudaCpuDeviceId, cudaStream_t stream = 0) {return std::make_shared<DeviceBuffer>(buffer, device, stream);}
     /// @}

     ///////////////////////////////////////////////////////////////////////
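
The factory overloads mirror the new constructor; a short sketch continuing the example above (host and stream are illustrative):

    // Value-returning factory: a buffer on device 0 initialized from host.
    auto dev = nanovdb::cuda::DeviceBuffer::create(host, 0, stream);
    // Shared-pointer factory, for when several owners must share one buffer.
    auto devPtr = nanovdb::cuda::DeviceBuffer::createPtr(host, 0, stream);
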
@@ -168,18 +194,7 @@ class DeviceBuffer
     DeviceBuffer& operator=(const DeviceBuffer&) = delete;

     /// @brief Move copy assignment operation
-    DeviceBuffer& operator=(DeviceBuffer&& other) noexcept
-    {
-        mSize = other.mSize;
-        mCpuData = other.mCpuData;
-        delete [] mGpuData;
-        mGpuData = other.mGpuData;
-        mDeviceCount = other.mDeviceCount;
-        mManaged = other.mManaged;
-        other.mCpuData = other.mGpuData = nullptr;
-        other.mSize = other.mDeviceCount = other.mManaged = 0;
-        return *this;
-    }
+    DeviceBuffer& operator=(DeviceBuffer&& other) noexcept;

     ///////////////////////////////////////////////////////////////////////

@@ -190,7 +205,7 @@ class DeviceBuffer
     /// @brief Returns an offset pointer of a specific type from the allocated host memory
     /// @tparam T Type of the pointer returned
     /// @param count Number of elements of type @c T to skip
-    /// @warning assumes that this instance is not empty!
+    /// @warning might return NULL
     template <typename T>
     T* data(ptrdiff_t count = 0, int device = cudaCpuDeviceId) const
     {
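
Since data<T>() may now return NULL rather than assuming a non-empty instance, call sites should branch on the result. A sketch, assuming a DeviceBuffer dev with an allocation on device 0 (names are illustrative):

    float* d_ptr = dev.data<float>(0, 0);                // typed pointer on device 0
    float* h_ptr = dev.data<float>(0, cudaCpuDeviceId);  // pinned host mirror, NULL if absent
    if (h_ptr == nullptr) { /* no host copy of this buffer exists */ }
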
@@ -293,6 +308,26 @@ class DeviceBuffer

 // --------------------------> Implementations below <------------------------------------

+inline DeviceBuffer& DeviceBuffer::operator=(DeviceBuffer&& other) noexcept
+{
+    if (mManaged) {// first free all the managed data buffers
+        cudaCheck(cudaFreeHost(mCpuData));
+        for (int i=0; i<mDeviceCount; ++i) cudaCheck(util::cuda::freeAsync(mGpuData[i], 0));
+    }
+    delete [] mGpuData;
+    mSize = other.mSize;
+    mCpuData = other.mCpuData;
+    mGpuData = other.mGpuData;
+    mDeviceCount = other.mDeviceCount;
+    mManaged = other.mManaged;
+    other.mCpuData = nullptr;
+    other.mGpuData = nullptr;
+    other.mSize = 0;
+    other.mDeviceCount = 0;
+    other.mManaged = 0;
+    return *this;
+}
+
 inline void DeviceBuffer::init(uint64_t size, int device, cudaStream_t stream)
 {
     if (size==0) return;
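
Moving the definition out of line also fixes a leak: the old in-class version only deleted the mGpuData pointer array, so a managed destination leaked its pinned host block and its per-device allocations on assignment. The rewritten operator releases those first. The observable behavior, as a sketch (sizes are illustrative):

    nanovdb::cuda::DeviceBuffer a(1024, 0, stream); // owns a managed allocation on device 0
    nanovdb::cuda::DeviceBuffer b(2048, 0, stream);
    a = std::move(b); // a's old 1024-byte allocation is freed before a steals b's pointers
    // b is left empty: null pointers, zero size, zero device count, managed flag cleared
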
@@ -303,7 +338,7 @@ inline void DeviceBuffer::init(uint64_t size, int device, cudaStream_t stream)
         cudaCheck(cudaMallocHost((void**)&mCpuData, size)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned
         checkPtr(mCpuData, "cuda::DeviceBuffer::init: failed to allocate host buffer");
     } else {
-        cudaCheck(cudaMallocAsync(mGpuData+device, size, stream)); // un-managed memory on the device, always 32B aligned!
+        cudaCheck(util::cuda::mallocAsync(mGpuData+device, size, stream)); // un-managed memory on the device, always 32B aligned!
         checkPtr(mGpuData[device], "cuda::DeviceBuffer::init: failed to allocate device buffer");
     }
     mSize = size;
@@ -316,7 +351,7 @@ inline void DeviceBuffer::deviceUpload(int device, cudaStream_t stream, bool syn
     checkPtr(mCpuData, "uninitialized cpu source data");
     if (mGpuData[device] == nullptr) {
         if (mManaged==0) throw std::runtime_error("DeviceBuffer::deviceUpload called on externally managed memory that wasn\'t allocated.");
-        cudaCheck(cudaMallocAsync(mGpuData+device, mSize, stream)); // un-managed memory on the device, always 32B aligned!
+        cudaCheck(util::cuda::mallocAsync(mGpuData+device, mSize, stream)); // un-managed memory on the device, always 32B aligned!
     }
     checkPtr(mGpuData[device], "uninitialized gpu destination data");
     cudaCheck(cudaMemcpyAsync(mGpuData[device], mCpuData, mSize, cudaMemcpyHostToDevice, stream));
@@ -352,13 +387,16 @@ inline void DeviceBuffer::deviceDownload(void* stream, bool sync)

 inline void DeviceBuffer::clear(cudaStream_t stream)
 {
-    if (mManaged!=0) {// free all the managed data buffers
+    if (mManaged) {// free all the managed data buffers
         cudaCheck(cudaFreeHost(mCpuData));
-        for (int i=0; i<mDeviceCount; ++i) cudaCheck(cudaFreeAsync(mGpuData[i], stream));
+        for (int i=0; i<mDeviceCount; ++i) cudaCheck(util::cuda::freeAsync(mGpuData[i], stream));
     }
     delete [] mGpuData;
-    mCpuData = mGpuData = nullptr;
-    mSize = mDeviceCount = mManaged = 0;
+    mCpuData = nullptr;
+    mGpuData = nullptr;
+    mSize = 0;
+    mDeviceCount = 0;
+    mManaged = 0;
 } // DeviceBuffer::clear

 }// namespace cuda

nanovdb/nanovdb/cuda/DeviceResource.h

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+//
+#ifndef NANOVDB_CUDA_DEVICERESOURCE_H_HAS_BEEN_INCLUDED
+#define NANOVDB_CUDA_DEVICERESOURCE_H_HAS_BEEN_INCLUDED
+
+#include <cuda_runtime_api.h>
+#include <nanovdb/util/cuda/Util.h>
+
+namespace nanovdb {
+
+namespace cuda {
+
+class DeviceResource
+{
+public:
+    // cudaMalloc aligns memory to 256 bytes by default
+    static constexpr size_t DEFAULT_ALIGNMENT = 256;
+
+    static void* allocateAsync(size_t bytes, size_t, cudaStream_t stream) {
+        void* p = nullptr;
+        cudaCheck(util::cuda::mallocAsync(&p, bytes, stream));
+        return p;
+    }
+
+    static void deallocateAsync(void *p, size_t, size_t, cudaStream_t stream) {
+        cudaCheck(util::cuda::freeAsync(p, stream));
+    }
+};
+
+}
+
+} // namespace nanovdb::cuda
+
+#endif // end of NANOVDB_CUDA_DEVICERESOURCE_H_HAS_BEEN_INCLUDED
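
DeviceResource is a stateless allocation policy: the TempPool template later in this diff accepts any type exposing the same three static members. A hypothetical pinned-host variant, purely to illustrate the implicit interface (not part of this commit):

    // Sketch: allocate pinned host memory instead of device memory.
    // cudaMallocHost/cudaFreeHost are synchronous, so the stream arguments go unused.
    class PinnedHostResource
    {
    public:
        static constexpr size_t DEFAULT_ALIGNMENT = 256;
        static void* allocateAsync(size_t bytes, size_t, cudaStream_t) {
            void* p = nullptr;
            cudaCheck(cudaMallocHost(&p, bytes));
            return p;
        }
        static void deallocateAsync(void* p, size_t, size_t, cudaStream_t) {
            cudaCheck(cudaFreeHost(p));
        }
    };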

nanovdb/nanovdb/cuda/GridHandle.cuh

Lines changed: 4 additions & 1 deletion
@@ -34,7 +34,8 @@ __global__ void cpyGridHandleMeta(const GridData *d_data, GridHandleMetaData *d_
 __global__ void updateGridCount(GridData *d_data, uint32_t gridIndex, uint32_t gridCount, bool *d_dirty)
 {
     NANOVDB_ASSERT(gridIndex < gridCount);
-    if (*d_dirty = d_data->mGridIndex != gridIndex || d_data->mGridCount != gridCount) {
+    *d_dirty = (d_data->mGridIndex != gridIndex) || (d_data->mGridCount != gridCount);
+    if (*d_dirty) {
         d_data->mGridIndex = gridIndex;
         d_data->mGridCount = gridCount;
         if (d_data->mChecksum.isEmpty()) *d_dirty = false;// no need to update checksum if it didn't already exist
@@ -61,6 +62,7 @@ splitGridHandles(const GridHandle<BufferT> &handle, const BufferT* other = nullp
         updateGridCount<<<1, 1, 0, stream>>>(dst, 0u, 1u, d_dirty);
         cudaCheckError();
         cudaCheck(cudaMemcpyAsync(&dirty, d_dirty, sizeof(bool), cudaMemcpyDeviceToHost, stream));
+        cudaCheck(cudaStreamSynchronize(stream));
         if (dirty) tools::cuda::updateChecksum(dst, CheckMode::Partial, stream);
         handles[n] = nanovdb::GridHandle<BufferT>(std::move(buffer));
         ptr = util::PtrAdd(ptr, handle.gridSize(n));
@@ -93,6 +95,7 @@ mergeGridHandles(const VectorT<GridHandle<BufferT>> &handles, const BufferT* oth
         updateGridCount<<<1, 1, 0, stream>>>(data, counter++, gridCount, d_dirty);
         cudaCheckError();
         cudaCheck(cudaMemcpyAsync(&dirty, d_dirty, sizeof(bool), cudaMemcpyDeviceToHost, stream));
+        cudaCheck(cudaStreamSynchronize(stream));
         if (dirty) tools::cuda::updateChecksum(data, CheckMode::Partial, stream);
         dst = util::PtrAdd(dst, h.gridSize(n));
         src = util::PtrAdd(src, h.gridSize(n));
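
The two added cudaStreamSynchronize calls close a race: cudaMemcpyAsync only enqueues the device-to-host copy, so without a synchronization point the host could test dirty before the flag has actually arrived. Reduced to its essentials, the pattern is:

    bool dirty;    // host-side flag
    bool* d_dirty; // device-side flag, written by the updateGridCount kernel
    cudaCheck(cudaMemcpyAsync(&dirty, d_dirty, sizeof(bool), cudaMemcpyDeviceToHost, stream));
    cudaCheck(cudaStreamSynchronize(stream)); // without this, 'dirty' may still be stale
    if (dirty) { /* now safe to act on the flag */ }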

nanovdb/nanovdb/cuda/TempDevicePool.h

Lines changed: 0 additions & 49 deletions
This file was deleted.

nanovdb/nanovdb/cuda/TempPool.h

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+//
+#ifndef NANOVDB_CUDA_TEMPPOOL_H_HAS_BEEN_INCLUDED
+#define NANOVDB_CUDA_TEMPPOOL_H_HAS_BEEN_INCLUDED
+
+#include <nanovdb/cuda/DeviceResource.h>
+
+namespace nanovdb {
+
+namespace cuda {
+
+template <class Resource>
+class TempPool {
+public:
+    TempPool() : mData(nullptr), mSize(0), mRequestedSize(0) {}
+    ~TempPool() {
+        mRequestedSize = 0;
+        Resource::deallocateAsync(mData, mSize, Resource::DEFAULT_ALIGNMENT, nullptr);
+        mData = nullptr;
+        mSize = 0;
+    }
+
+    void* data() {
+        return mData;
+    }
+
+    size_t& size() {
+        return mSize;
+    }
+
+    size_t& requestedSize() {
+        return mRequestedSize;
+    }
+
+    void reallocate(cudaStream_t stream) {
+        if (!mData || mRequestedSize > mSize) {
+            Resource::deallocateAsync(mData, mSize, Resource::DEFAULT_ALIGNMENT, stream);
+            mData = Resource::allocateAsync(mRequestedSize, Resource::DEFAULT_ALIGNMENT, stream);
+            mSize = mRequestedSize;
+        }
+    }
+private:
+    void* mData;
+    size_t mSize;
+    size_t mRequestedSize;
+};
+
+using TempDevicePool = TempPool<DeviceResource>;
+
+}
+
+} // namespace nanovdb::cuda
+
+#endif // end of NANOVDB_CUDA_TEMPPOOL_H_HAS_BEEN_INCLUDED
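
TempPool generalizes the deleted TempDevicePool to an arbitrary allocation policy while keeping the two-phase idiom familiar from CUB: query the required scratch size with a null buffer, grow the pool only if the request exceeds the current capacity, then run the real pass. A sketch (d_in, d_out, n, and stream are illustrative; size() and requestedSize() return references precisely so a library can write into them):

    #include <cub/cub.cuh>
    #include <nanovdb/cuda/TempPool.h>

    nanovdb::cuda::TempDevicePool pool;
    // Pass 1: with a null temp pointer, CUB only reports the bytes it needs.
    cub::DeviceReduce::Sum(nullptr, pool.requestedSize(), d_in, d_out, n, stream);
    pool.reallocate(stream); // allocates (or grows) the scratch space
    // Pass 2: run the reduction using the pooled scratch space.
    cub::DeviceReduce::Sum(pool.data(), pool.size(), d_in, d_out, n, stream);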
