Skip to content

Commit c4a3cb3

Browse files
kmuseth and matthewdcong
authored and committed
Device buffer from host buffer
* minor cleanup Signed-off-by: Ken <[email protected]>
* construct DeviceBuffer from HostBuffer Signed-off-by: Ken <[email protected]>
* added HostBuffer::data<T>(int i) Signed-off-by: Ken <[email protected]>
* improved unit-test Signed-off-by: Ken <[email protected]>
* snapshot Signed-off-by: Ken <[email protected]>
* removed whitespace Signed-off-by: Ken <[email protected]>
---------
Signed-off-by: Ken <[email protected]>
Signed-off-by: Matthew Cong <[email protected]>
1 parent 3917418 commit c4a3cb3

File tree

3 files changed

+104
-1
lines changed

3 files changed

+104
-1
lines changed

nanovdb/nanovdb/HostBuffer.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,13 @@ class HostBuffer
192192
void* data() { return mData; }
193193
//@}
194194

195+
/// @brief Returns an offset pointer of a specific type from the allocated host memory
196+
/// @tparam T Type of the pointer returned
197+
/// @param count Numbers of elements of @c parameter type T to skip
198+
/// @warning might return NULL
199+
template <typename T>
200+
T* data(ptrdiff_t count = 0) const {return mData ? reinterpret_cast<T*>(mData) + count : nullptr;}
201+
195202
//@{
196203
/// @brief Returns the size in bytes associated with this buffer.
197204
uint64_t bufferSize() const { return mSize; }

nanovdb/nanovdb/cuda/DeviceBuffer.h

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,20 @@ class DeviceBuffer
127127
other.mSize = other.mDeviceCount = other.mManaged = 0;
128128
}
129129

130+
/// @brief Copy-constructor from a HostBuffer
/// @param buffer host buffer from which to copy data
/// @param device id of the device on which to initialize the buffer
///        (cudaCpuDeviceId presumably selects a host-side allocation — confirm against
///        the delegated constructor)
/// @param stream cuda stream used for the asynchronous host-to-device copy
/// @note The device copy uses cudaMemcpyAsync, so it is asynchronous with respect to
///       the host; synchronize @c stream before reading the device data.
DeviceBuffer(const HostBuffer& buffer, int device = cudaCpuDeviceId, cudaStream_t stream = 0)
    : DeviceBuffer(buffer.size(), device, stream)// delegate allocation of buffer.size() bytes
{
    if (mCpuData) {// the delegated constructor produced a host-side allocation
        cudaCheck(cudaMemcpy(mCpuData, buffer.data(), mSize, cudaMemcpyHostToHost));
    } else if (mGpuData[device]) {// allocation resides on the specified device
        cudaCheck(cudaMemcpyAsync(mGpuData[device], buffer.data(), mSize, cudaMemcpyHostToDevice, stream));
    }
}
143+
130144
/// @brief Destructor frees memory on both the host and device
131145
~DeviceBuffer() { this->clear(); };
132146

@@ -153,13 +167,20 @@ class DeviceBuffer
153167
/// @param list list of device IDs and device memory pointers
154168
static DeviceBuffer create(uint64_t size, void* cpuData, std::initializer_list<std::pair<int,void*>> list) {return DeviceBuffer(size, cpuData, list);}
155169

170+
/// @brief Static factory method that returns an instance of this buffer constructed from a HostBuffer
/// @param buffer host buffer from which to copy data
/// @param device id of the device on which to initialize the buffer
/// @param stream cuda stream
/// @return a DeviceBuffer containing a copy of @c buffer's data
static DeviceBuffer create(const HostBuffer& buffer, int device = cudaCpuDeviceId, cudaStream_t stream = 0) {return DeviceBuffer(buffer, device, stream);}
175+
156176
///////////////////////////////////////////////////////////////////////

/// @{
/// @brief Factory methods that create a shared pointer to a DeviceBuffer instance
// allocate a new buffer of @c size bytes (the DeviceBuffer* argument is ignored; it only disambiguates the overload)
static PtrT createPtr(uint64_t size, const DeviceBuffer* = nullptr, int device = cudaCpuDeviceId, cudaStream_t stream = 0) {return std::make_shared<DeviceBuffer>(size, device, stream);}
// adopt externally allocated host and device memory
static PtrT createPtr(uint64_t size, void* cpuData, void* gpuData) {return std::make_shared<DeviceBuffer>(size, cpuData, gpuData);}
// adopt externally allocated host memory and a list of (device id, device pointer) pairs
static PtrT createPtr(uint64_t size, void* cpuData, std::initializer_list<std::pair<int,void*>> list) {return std::make_shared<DeviceBuffer>(size, cpuData, list);}
// copy-construct from a HostBuffer (see the matching DeviceBuffer constructor)
static PtrT createPtr(const HostBuffer& buffer, int device = cudaCpuDeviceId, cudaStream_t stream = 0) {return std::make_shared<DeviceBuffer>(buffer, device, stream);}
/// @}
164185

165186
///////////////////////////////////////////////////////////////////////
@@ -179,7 +200,7 @@ class DeviceBuffer
179200
/// @brief Returns an offset pointer of a specific type from the allocated host memory
180201
/// @tparam T Type of the pointer returned
181202
/// @param count Numbers of elements of @c parameter type T to skip
182-
/// @warning assumes that this instance is not empty!
203+
/// @warning might return NULL
183204
template <typename T>
184205
T* data(ptrdiff_t count = 0, int device = cudaCpuDeviceId) const
185206
{

nanovdb/nanovdb/unittest/TestNanoVDB.cu

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,44 @@ void device2host(size_t count)
5555
float *array = reinterpret_cast<float*>(buffer.data());
5656
for (size_t i=0; i<count; ++i) EXPECT_EQ(array[i], float(i));
5757
}// device2host
58+
void host2device(size_t count)
59+
{
60+
const size_t size = count * sizeof(float);
61+
auto hostBuffer = nanovdb::HostBuffer(size);
62+
for (size_t i=0; i<count; ++i) *hostBuffer.data<float>(i) = float(i);
63+
64+
int dev;
65+
cudaError_t err = cudaGetDevice(&dev);
66+
if (err != cudaSuccess) printf("kernel cuda error: %d\n", (int)err);
67+
68+
auto devBuffer = nanovdb::cuda::DeviceBuffer::create(hostBuffer, dev);// on device only
69+
EXPECT_EQ(size, devBuffer.size());
70+
EXPECT_FALSE(devBuffer.data());
71+
EXPECT_TRUE(devBuffer.deviceData());
72+
float *d_array = reinterpret_cast<float*>(devBuffer.deviceData());
73+
constexpr unsigned int num_threads = 256;
74+
unsigned int num_blocks = num_blocks = (static_cast<unsigned int>(count) + num_threads - 1) / num_threads;
75+
76+
bool *test, *d_test;
77+
cudaCheck(cudaMallocHost((void**)&test, sizeof(bool)));
78+
cudaCheck(cudaMalloc((void**)&d_test, sizeof(bool)));
79+
*test = true;
80+
cudaCheck(cudaMemcpyAsync(d_test, test, sizeof(bool), cudaMemcpyHostToDevice));// on host only
81+
82+
nanovdb::util::cuda::lambdaKernel<<<num_blocks, num_threads>>>(count, [=] __device__ (size_t i) {
83+
if (d_array[i] != float(i)) *d_test = false;
84+
d_array[i] = float(i) + 1.0f;
85+
});
86+
cudaCheck(cudaMemcpy(test, d_test, sizeof(bool), cudaMemcpyDeviceToHost));
87+
EXPECT_TRUE(*test);
88+
cudaCheck(cudaFreeHost(test));
89+
cudaCheck(cudaFree(d_test));
90+
devBuffer.deviceDownload();// copy device -> host
91+
EXPECT_EQ(size, devBuffer.size());
92+
EXPECT_TRUE(devBuffer.data());
93+
EXPECT_TRUE(devBuffer.deviceData());
94+
for (size_t i=0; i<count; ++i) EXPECT_EQ(*hostBuffer.data<float>(i) + 1.0f, *devBuffer.data<float>(i));
95+
}// host2device
5896
// used for testing cuda::DeviceBuffer
5997
void host2device2host(size_t count)
6098
{
@@ -156,6 +194,7 @@ TEST(TestNanoVDBCUDA, CudaDeviceBuffer)
156194
EXPECT_FALSE(buffer.empty());
157195
}
158196
nanovdb::test::device2host(1000);
197+
nanovdb::test::host2device(1000);
159198
nanovdb::test::host2device2host(1000);
160199
}
161200

@@ -3594,4 +3633,40 @@ TEST(TestNanoVDBCUDA, VoxelBlockManager_ValueOnIndex)
35943633
cudaCheck(cudaFree(deviceJumpMap));
35953634
}// VoxelBlockManager_ValueOnIndex
35963635

3636+
TEST(TestNanoVDBCUDA, GridHandle_from_HostBuffer)
{
    using namespace nanovdb;
    using BufferT = nanovdb::cuda::DeviceBuffer;
    auto hostHandle = tools::createLevelSetSphere<float>(100, Vec3d(0),1,3, Vec3d(0), "test");

    int dev;
    cudaError_t err = cudaGetDevice(&dev);
    EXPECT_EQ(err, cudaSuccess);
    cudaStream_t stream;
    cudaCheck(cudaStreamCreate(&stream));

    {// longer version
        auto devBuffer = BufferT::create(hostHandle.buffer(), dev, stream);
        EXPECT_EQ(hostHandle.bufferSize(), devBuffer.size());
        auto devHandle = GridHandle<BufferT>(std::move(devBuffer));

        // testing
        EXPECT_EQ(hostHandle.bufferSize(), devHandle.bufferSize());
        EXPECT_EQ(devBuffer.size(), 0);// devBuffer was moved from, so it should now be empty
        devHandle.deviceDownload(stream);
        cudaCheck(cudaStreamSynchronize(stream));// fixed: make sure the async download finished before reading host data
        for (uint64_t i=0; i<hostHandle.bufferSize(); ++i) {
            EXPECT_EQ(*hostHandle.buffer().data<char>(i), *devHandle.buffer().data<char>(i));
        }
    }
    {// compact version
        auto devHandle = GridHandle<BufferT>(BufferT::create(hostHandle.buffer(), dev, stream));

        // testing
        EXPECT_EQ(hostHandle.bufferSize(), devHandle.bufferSize());
        devHandle.deviceDownload(stream);
        cudaCheck(cudaStreamSynchronize(stream));// fixed: see above
        for (uint64_t i=0; i<hostHandle.bufferSize(); ++i) {
            EXPECT_EQ(*hostHandle.buffer().data<char>(i), *devHandle.buffer().data<char>(i));
        }
    }
    cudaCheck(cudaStreamDestroy(stream));// fixed: the stream was previously leaked
}
35973672

0 commit comments

Comments
 (0)