Skip to content

Commit 93549a7

Browse files
committed
Add support for CUDA 13.0
* Add initial support for CUDA 13 * More fixes for CUDA 13 * Fix comment Signed-off-by: Matthew Cong <[email protected]>
1 parent 8fc5038 commit 93549a7

File tree

3 files changed

+57
-20
lines changed

3 files changed

+57
-20
lines changed

nanovdb/nanovdb/cuda/UnifiedBuffer.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ class UnifiedBuffer
6262
{
6363
assert(mSize <= mCapacity);
6464
cudaCheck(cudaMallocManaged(&mPtr, mCapacity, cudaMemAttachGlobal));
65-
cudaCheck(cudaMemAdvise(mPtr, size, cudaMemAdviseSetPreferredLocation, device));
66-
cudaCheck(cudaMemPrefetchAsync(mPtr, size, device, stream));
65+
cudaCheck(util::cuda::memAdvise(mPtr, size, cudaMemAdviseSetPreferredLocation, device));
66+
cudaCheck(util::cuda::memPrefetchAsync(mPtr, size, device, stream));
6767
}
6868

6969
/// @brief Constructor with a specified device
@@ -113,8 +113,8 @@ class UnifiedBuffer
113113
{
114114
const size_t capacity = (reference && reference->capacity()) ? reference->capacity() : size;
115115
UnifiedBuffer buffer(size, capacity);
116-
cudaCheck(cudaMemAdvise(buffer.mPtr, size, cudaMemAdviseSetPreferredLocation, device));
117-
cudaCheck(cudaMemPrefetchAsync(buffer.mPtr, size, device, stream));
116+
cudaCheck(util::cuda::memAdvise(buffer.mPtr, size, cudaMemAdviseSetPreferredLocation, device));
117+
cudaCheck(util::cuda::memPrefetchAsync(buffer.mPtr, size, device, stream));
118118
return buffer;
119119
}
120120

@@ -184,7 +184,7 @@ class UnifiedBuffer
184184
} else {
185185
void *ptr = 0;
186186
cudaCheck(cudaMallocManaged(&ptr, size, cudaMemAttachGlobal));
187-
if (dev > -2) for (auto a : list) cudaCheck(cudaMemAdvise(ptr, size, a, dev));
187+
if (dev > -2) for (auto a : list) cudaCheck(util::cuda::memAdvise(ptr, size, a, dev));
188188
if (mSize > 0) {// copy over data from the old memory block
189189
cudaCheck(cudaMemcpy(ptr, mPtr, std::min(mSize, size), cudaMemcpyDefault));
190190
cudaCheck(cudaFree(mPtr));
@@ -201,7 +201,7 @@ class UnifiedBuffer
201201
/// @param adv advice to be applied to the resized range
202202
void advise(ptrdiff_t byteOffset, size_t size, int dev, cudaMemoryAdvise adv) const
203203
{
204-
cudaCheck(cudaMemAdvise(util::PtrAdd(mPtr, byteOffset), size, adv, dev));
204+
cudaCheck(util::cuda::memAdvise(util::PtrAdd(mPtr, byteOffset), size, adv, dev));
205205
}
206206

207207
/// @brief Apply a list of advices to a memory block
@@ -212,7 +212,7 @@ class UnifiedBuffer
212212
void advise(ptrdiff_t byteOffset, size_t size, int dev, std::initializer_list<cudaMemoryAdvise> list) const
213213
{
214214
void *ptr = util::PtrAdd(mPtr, byteOffset);
215-
for (auto a : list) cudaCheck(cudaMemAdvise(ptr, size, a, dev));
215+
for (auto a : list) cudaCheck(util::cuda::memAdvise(ptr, size, a, dev));
216216
}
217217

218218
/// @brief Prefetches data to the specified device, i.e. ensure the device has an up-to-date copy of the memory specified
@@ -222,7 +222,7 @@ class UnifiedBuffer
222222
/// @param stream cuda stream
223223
void prefetch(ptrdiff_t byteOffset = 0, size_t size = 0, int dev = cudaCpuDeviceId, cudaStream_t stream = cudaStreamPerThread) const
224224
{
225-
cudaCheck(cudaMemPrefetchAsync(util::PtrAdd(mPtr, byteOffset), size ? size : mSize, dev, stream));
225+
cudaCheck(util::cuda::memPrefetchAsync(util::PtrAdd(mPtr, byteOffset), size ? size : mSize, dev, stream));
226226
}
227227

228228
///////////////////////////////////////////////////////////////////////
@@ -234,7 +234,7 @@ class UnifiedBuffer
234234
/// @note Legacy method included for compatibility with DeviceBuffer
235235
void deviceUpload(int device = 0, cudaStream_t stream = cudaStreamPerThread, bool sync = false) const
236236
{
237-
cudaCheck(cudaMemPrefetchAsync(mPtr, mSize, device, stream));
237+
cudaCheck(util::cuda::memPrefetchAsync(mPtr, mSize, device, stream));
238238
if (sync) cudaCheck(cudaStreamSynchronize(stream));
239239
}
240240
void deviceUpload(int device, void* stream, bool sync) const{this->deviceUpload(device, cudaStream_t(stream));}
@@ -256,7 +256,7 @@ class UnifiedBuffer
256256
/// @param sync if false the memory copy is asynchronous
257257
void deviceDownload(cudaStream_t stream = 0, bool sync = false) const
258258
{
259-
cudaCheck(cudaMemPrefetchAsync(mPtr, mSize, cudaCpuDeviceId, stream));
259+
cudaCheck(util::cuda::memPrefetchAsync(mPtr, mSize, cudaCpuDeviceId, stream));
260260
if (sync) cudaCheck(cudaStreamSynchronize(stream));
261261
}
262262

nanovdb/nanovdb/tools/cuda/DistributedPointsToGrid.cuh

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -447,17 +447,17 @@ void DistributedPointsToGrid<BuildT>::countNodes(const PtrT coords, size_t coord
447447
uint64_t* deviceOutputKeys = mData->d_keys + deviceStripeOffset;
448448
uint32_t* deviceOutputIndices = mData->d_indx + deviceStripeOffset;
449449

450-
cudaMemAdvise(deviceCoords, deviceStripeCount * sizeof(nanovdb::Coord), cudaMemAdviseSetPreferredLocation, deviceId);
451-
cudaMemAdvise(deviceCoords, deviceStripeCount * sizeof(nanovdb::Coord), cudaMemAdviseSetReadMostly, deviceId);
450+
util::cuda::memAdvise(deviceCoords, deviceStripeCount * sizeof(nanovdb::Coord), cudaMemAdviseSetPreferredLocation, deviceId);
451+
util::cuda::memAdvise(deviceCoords, deviceStripeCount * sizeof(nanovdb::Coord), cudaMemAdviseSetReadMostly, deviceId);
452452

453-
cudaMemAdvise(deviceInputKeys, deviceStripeCount * sizeof(uint64_t), cudaMemAdviseSetPreferredLocation, deviceId);
454-
cudaMemAdvise(deviceInputIndices, deviceStripeCount * sizeof(uint32_t), cudaMemAdviseSetPreferredLocation, deviceId);
455-
cudaMemAdvise(deviceOutputKeys, deviceStripeCount * sizeof(uint64_t), cudaMemAdviseSetPreferredLocation, deviceId);
456-
cudaMemAdvise(deviceOutputIndices, deviceStripeCount * sizeof(uint32_t), cudaMemAdviseSetPreferredLocation, deviceId);
453+
util::cuda::memAdvise(deviceInputKeys, deviceStripeCount * sizeof(uint64_t), cudaMemAdviseSetPreferredLocation, deviceId);
454+
util::cuda::memAdvise(deviceInputIndices, deviceStripeCount * sizeof(uint32_t), cudaMemAdviseSetPreferredLocation, deviceId);
455+
util::cuda::memAdvise(deviceOutputKeys, deviceStripeCount * sizeof(uint64_t), cudaMemAdviseSetPreferredLocation, deviceId);
456+
util::cuda::memAdvise(deviceOutputIndices, deviceStripeCount * sizeof(uint32_t), cudaMemAdviseSetPreferredLocation, deviceId);
457457

458458
uint32_t* devicePointsPerTile = mPointsPerTile + deviceStripeOffset;
459-
cudaMemAdvise(devicePointsPerTile, deviceStripeCount * sizeof(uint32_t), cudaMemAdviseSetPreferredLocation, deviceId);
460-
cudaMemAdvise(deviceNodeCount(deviceId), 3 * sizeof(uint32_t), cudaMemAdviseSetPreferredLocation, deviceId);
459+
util::cuda::memAdvise(devicePointsPerTile, deviceStripeCount * sizeof(uint32_t), cudaMemAdviseSetPreferredLocation, deviceId);
460+
util::cuda::memAdvise(deviceNodeCount(deviceId), 3 * sizeof(uint32_t), cudaMemAdviseSetPreferredLocation, deviceId);
461461
}
462462

463463
// Radix sort the subset of keys assigned to each device in parallel
@@ -472,7 +472,7 @@ void DistributedPointsToGrid<BuildT>::countNodes(const PtrT coords, size_t coord
472472
uint64_t* deviceOutputKeys = mData->d_keys + deviceStripeOffset;
473473
uint32_t* deviceOutputIndices = mData->d_indx + deviceStripeOffset;
474474

475-
cudaMemPrefetchAsync(coords, coordCount * sizeof(nanovdb::Coord), deviceId, stream);
475+
util::cuda::memPrefetchAsync(coords, coordCount * sizeof(nanovdb::Coord), deviceId, stream);
476476

477477
nanovdb::util::cuda::offsetLambdaKernel<<<numBlocks(deviceStripeCount), mNumThreads, 0, stream>>>(deviceStripeCount, deviceStripeOffset, TileKeyFunctor<BuildT, PtrT>(), mData, coords, mKeys, mIndices);
478478

@@ -661,7 +661,7 @@ void DistributedPointsToGrid<BuildT>::countNodes(const PtrT coords, size_t coord
661661
uint64_t* deviceOutputKeys = mData->d_keys + deviceStripeOffset;
662662
uint32_t* devicePointsPerTile = mPointsPerTile + deviceStripeOffset;
663663

664-
// cudaMemPrefetchAsync(deviceInputKeys, deviceStripeCount * sizeof(uint64_t), deviceId, stream);
664+
// util::cuda::memPrefetchAsync(deviceInputKeys, deviceStripeCount * sizeof(uint64_t), deviceId, stream);
665665

666666
CUB_LAUNCH(DeviceRunLengthEncode::Encode, mTempDevicePools[deviceId], stream, deviceInputKeys, deviceOutputKeys, devicePointsPerTile, deviceNodeCount(deviceId) + 2, deviceStripeCount);
667667
cudaCheck(cudaEventRecord(runLengthEncodeEvents[deviceId], stream));

nanovdb/nanovdb/util/cuda/Util.h

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,43 @@ inline size_t blocksPerGrid(size_t numItems, size_t threadsPerBlock)
198198
return (numItems + threadsPerBlock - 1) / threadsPerBlock;
199199
}
200200

201+
// CUDA 13.0 changes cudaMemAdvise and cudaMemPrefetchAsync to use a cudaMemLocation as an argument as
202+
// opposed to an integer device id. These wrappers provide compatibility by using the corresponding
203+
// location in CUDA 13.0 and above while passing through the device in earlier versions.
204+
#if (CUDART_VERSION < 13000)
205+
/// @brief Compatibility wrapper for cudaMemAdvise
206+
inline cudaError_t memAdvise(const void* devPtr, size_t count, cudaMemoryAdvise advice, int device) {
207+
return cudaMemAdvise(devPtr, count, advice, device);
208+
}
209+
210+
/// @brief Compatibility wrapper for cudaMemPrefetchAsync
211+
inline cudaError_t memPrefetchAsync(const void* devPtr, size_t count, int dstDevice, cudaStream_t stream) {
212+
return cudaMemPrefetchAsync(devPtr, count, dstDevice, stream);
213+
}
214+
#else
215+
/// @brief Helper function that converts a device id to a cudaMemLocation
216+
/// @param device Integer device id
217+
/// @return cudaMemLocation corresponding to the device id
218+
inline cudaMemLocation deviceToLocation(int device) {
219+
if (device < cudaCpuDeviceId) {
220+
return {cudaMemLocationTypeInvalid, device};
221+
} else if (device == cudaCpuDeviceId) {
222+
return {cudaMemLocationTypeHost, device};
223+
} else {
224+
return {cudaMemLocationTypeDevice, device};
225+
}
226+
}
227+
228+
/// @brief Compatibility wrapper for cudaMemAdvise
229+
inline cudaError_t memAdvise(const void* devPtr, size_t count, cudaMemoryAdvise advice, int device) {
230+
return cudaMemAdvise(devPtr, count, advice, deviceToLocation(device));
231+
}
232+
233+
/// @brief Compatibility wrapper for cudaMemPrefetchAsync
234+
inline cudaError_t memPrefetchAsync(const void* devPtr, size_t count, int dstDevice, cudaStream_t stream) {
235+
return cudaMemPrefetchAsync(devPtr, count, deviceToLocation(dstDevice), 0u, stream);
236+
}
237+
#endif
201238

202239
#if defined(__CUDACC__)// the following functions only run on the GPU!
203240

0 commit comments

Comments
 (0)