Skip to content

Commit 38e2585

Browse files
e-agoeagonvaranadive
authored
nixlbench: fix cudaFree problem (#965)
* nixlbench: fix cudaFree problem Signed-off-by: eagostini <[email protected]> * fix codestyle Signed-off-by: eagostini <[email protected]> --------- Signed-off-by: eagostini <[email protected]> Co-authored-by: eagostini <[email protected]> Co-authored-by: Adit Ranadive <[email protected]>
1 parent 7e4cd8f commit 38e2585

File tree

1 file changed

+11
-1
lines changed

1 file changed

+11
-1
lines changed

benchmark/nixlbench/src/worker/nixl/nixl_worker.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -633,7 +633,17 @@ xferBenchNixlWorker::cleanupBasicDescVram(xferBenchIOV &iov) {
633633
CHECK_CUDA_DRIVER_ERROR(cuMemAddressFree(iov.addr, iov.padded_size),
634634
"Failed to free reserved address");
635635
} else {
636-
CHECK_CUDA_ERROR(cudaFree((void *)iov.addr), "Failed to deallocate CUDA buffer");
636+
/*
637+
* CUDA streams allow for concurrent execution of kernels and memory operations. However,
638+
* memory management functions like cudaFree are implicitly synchronized with all streams to
639+
* guarantee safety. This means cudaFree will wait for all kernels (in any stream) that
640+
* might use the memory to finish before actually freeing it.
641+
* If the application hangs on cudaFree due to kernels running in other streams, switching
642+
* to cudaFreeAsync allows the host to proceed without waiting for a device-wide
643+
* synchronization.
644+
*/
645+
CHECK_CUDA_ERROR(cudaFreeAsync((void *)iov.addr, 0), "Failed to deallocate CUDA buffer");
646+
CHECK_CUDA_ERROR(cudaStreamSynchronize(0), "Failed to synchronize stream 0");
637647
}
638648
}
639649
#endif /* HAVE_CUDA */

0 commit comments

Comments
 (0)