File tree Expand file tree Collapse file tree 1 file changed +11
-1
lines changed
benchmark/nixlbench/src/worker/nixl Expand file tree Collapse file tree 1 file changed +11
-1
lines changed Original file line number Diff line number Diff line change @@ -633,7 +633,17 @@ xferBenchNixlWorker::cleanupBasicDescVram(xferBenchIOV &iov) {
633633 CHECK_CUDA_DRIVER_ERROR (cuMemAddressFree (iov.addr , iov.padded_size ),
634634 " Failed to free reserved address" );
635635 } else {
636- CHECK_CUDA_ERROR (cudaFree ((void *)iov.addr ), " Failed to deallocate CUDA buffer" );
636+ /*
637+ * CUDA streams allow for concurrent execution of kernels and memory operations. However,
638+ * memory management functions like cudaFree are implicitly synchronized with all streams to
639+ * guarantee safety. This means cudaFree will wait for all kernels (in any stream) that
640+ * might use the memory to finish before actually freeing it.
641+ * If the application hangs in cudaFree because kernels are still running in other streams,
642+ * switching to cudaFreeAsync lets the host proceed without waiting for a device-wide
643+ * synchronization.
644+ */
645+ CHECK_CUDA_ERROR (cudaFreeAsync ((void *)iov.addr , 0 ), " Failed to deallocate CUDA buffer" );
646+ CHECK_CUDA_ERROR (cudaStreamSynchronize (0 ), " Failed to synchronize stream 0" );
637647 }
638648}
639649#endif /* HAVE_CUDA */
You can’t perform that action at this time.
0 commit comments