nixlbench: fix cudaFree problem (#965)

e-ago · eagonv · aranadive · web-flow · commit 38e2585358de · 2025-11-04T11:40:58.000-08:00
* nixlbench: fix cudaFree problem

Signed-off-by: eagostini <eagostini@nvidia.com>

* fix codestyle

Signed-off-by: eagostini <eagostini@nvidia.com>

---------

Signed-off-by: eagostini <eagostini@nvidia.com>
Co-authored-by: eagostini <eagostini@nvidia.com>
Co-authored-by: Adit Ranadive <aranadive@nvidia.com>
diff --git a/benchmark/nixlbench/src/worker/nixl/nixl_worker.cpp b/benchmark/nixlbench/src/worker/nixl/nixl_worker.cpp
@@ -633,7 +633,17 @@ xferBenchNixlWorker::cleanupBasicDescVram(xferBenchIOV &iov) {
         CHECK_CUDA_DRIVER_ERROR(cuMemAddressFree(iov.addr, iov.padded_size),
                                 "Failed to free reserved address");
     } else {
-        CHECK_CUDA_ERROR(cudaFree((void *)iov.addr), "Failed to deallocate CUDA buffer");
+        /*
+         * CUDA streams allow for concurrent execution of kernels and memory operations. However,
+         * memory management functions like cudaFree are implicitly synchronized with all streams to
+         * guarantee safety. This means cudaFree will wait for all kernels (in any stream) that
+         * might use the memory to finish before actually freeing it.
+         * If the application hangs on cudaFree due to kernels running in other streams, switching
+         * to cudaFreeAsync can allow the host to proceed without waiting for a full device
+         * synchronization.
+         */
+        CHECK_CUDA_ERROR(cudaFreeAsync((void *)iov.addr, 0), "Failed to deallocate CUDA buffer");
+        CHECK_CUDA_ERROR(cudaStreamSynchronize(0), "Failed to synchronize stream 0");
     }
 }
 #endif /* HAVE_CUDA */