From 52d9e20cbfef937dba8e5c510a38d7356da9027d Mon Sep 17 00:00:00 2001 From: Feng Ren Date: Fri, 8 Aug 2025 02:22:36 +0000 Subject: [PATCH 1/5] Add cudaDeviceSynchronize after first address mapping --- .../src/transport/nvlink_transport/nvlink_transport.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/mooncake-transfer-engine/src/transport/nvlink_transport/nvlink_transport.cpp b/mooncake-transfer-engine/src/transport/nvlink_transport/nvlink_transport.cpp index 282fdf72f..325b4aa93 100644 --- a/mooncake-transfer-engine/src/transport/nvlink_transport/nvlink_transport.cpp +++ b/mooncake-transfer-engine/src/transport/nvlink_transport/nvlink_transport.cpp @@ -515,6 +515,7 @@ int NvlinkTransport::relocateSharedMemoryAddress(uint64_t &dest_addr, LOG(ERROR) << "Mismatched NVLink data transfer method"; return -1; } + cudaDeviceSynchronize(); } auto shm_addr = remap_entries_[std::make_pair(target_id, entry.addr)].shm_addr; From 8d8a02c262b3edc8ff5b5285054bfe937c3c18d4 Mon Sep 17 00:00:00 2001 From: Feng Ren Date: Fri, 8 Aug 2025 08:12:44 +0000 Subject: [PATCH 2/5] Move position --- .../src/transport/nvlink_transport/nvlink_transport.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mooncake-transfer-engine/src/transport/nvlink_transport/nvlink_transport.cpp b/mooncake-transfer-engine/src/transport/nvlink_transport/nvlink_transport.cpp index 325b4aa93..fe4bc461b 100644 --- a/mooncake-transfer-engine/src/transport/nvlink_transport/nvlink_transport.cpp +++ b/mooncake-transfer-engine/src/transport/nvlink_transport/nvlink_transport.cpp @@ -515,11 +515,13 @@ int NvlinkTransport::relocateSharedMemoryAddress(uint64_t &dest_addr, LOG(ERROR) << "Mismatched NVLink data transfer method"; return -1; } - cudaDeviceSynchronize(); } auto shm_addr = remap_entries_[std::make_pair(target_id, entry.addr)].shm_addr; dest_addr = dest_addr - entry.addr + ((uint64_t)shm_addr); + // Device synchronize is required for NVLink IPC memory + // to ensure that the memory is ready for access. + cudaDeviceSynchronize(); return 0; } index++; From f106477959b7979c440ba5dd8d321c9c9d744ca5 Mon Sep 17 00:00:00 2001 From: Feng Ren Date: Tue, 12 Aug 2025 02:29:10 +0000 Subject: [PATCH 3/5] New attempt to fix incorrect transfer --- .../transfer_engine/transfer_engine_py.cpp | 16 ++++++++++++++++ .../nvlink_transport/nvlink_transport.cpp | 3 --- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/mooncake-integration/transfer_engine/transfer_engine_py.cpp b/mooncake-integration/transfer_engine/transfer_engine_py.cpp index b99a24cfd..edb052d92 100644 --- a/mooncake-integration/transfer_engine/transfer_engine_py.cpp +++ b/mooncake-integration/transfer_engine/transfer_engine_py.cpp @@ -21,6 +21,9 @@ #include #ifdef USE_MNNVL +#include // pytorch dependencies +#include + #include "transport/nvlink_transport/nvlink_transport.h" static void *allocateMemory(size_t size) { return mooncake::NvlinkTransport::allocatePinnedLocalMemory(size); @@ -28,9 +31,18 @@ static void *allocateMemory(size_t size) { static void freeMemory(void *ptr) { mooncake::NvlinkTransport::freePinnedLocalMemory(ptr); } +static void synchronizePyTorchEvents() { + auto stream = at::cuda::getCurrentCUDAStream().stream(); + cudaEvent_t ev; + cudaEventCreate(&ev); + cudaEventRecord(ev, stream); + cudaStreamWaitEvent(0, ev, 0); + cudaEventDestroy(ev); +} #else static void *allocateMemory(size_t size) { return malloc(size); } static void freeMemory(void *ptr) { free(ptr); } +static void synchronizePyTorchEvents() {} #endif TransferEnginePy::TransferEnginePy() { @@ -258,6 +270,7 @@ int TransferEnginePy::transferSync(const char *target_hostname, uintptr_t peer_buffer_address, size_t length, TransferOpcode opcode) { pybind11::gil_scoped_release release; + synchronizePyTorchEvents(); Transport::SegmentHandle handle; { std::lock_guard guard(mutex_); @@ -331,6 +344,7 @@ int TransferEnginePy::batchTransferSync( std::vector peer_buffer_addresses, std::vector lengths, TransferOpcode opcode) { pybind11::gil_scoped_release release; + synchronizePyTorchEvents(); Transport::SegmentHandle handle; { std::lock_guard guard(mutex_); @@ -419,6 +433,7 @@ batch_id_t TransferEnginePy::batchTransferAsync( const std::vector &peer_buffer_addresses, const std::vector &lengths, TransferOpcode opcode) { pybind11::gil_scoped_release release; + synchronizePyTorchEvents(); Transport::SegmentHandle handle; { std::lock_guard guard(mutex_); @@ -541,6 +556,7 @@ batch_id_t TransferEnginePy::transferSubmitWrite(const char *target_hostname, uintptr_t peer_buffer_address, size_t length) { pybind11::gil_scoped_release release; + synchronizePyTorchEvents(); Transport::SegmentHandle handle; { std::lock_guard guard(mutex_); diff --git a/mooncake-transfer-engine/src/transport/nvlink_transport/nvlink_transport.cpp b/mooncake-transfer-engine/src/transport/nvlink_transport/nvlink_transport.cpp index fe4bc461b..282fdf72f 100644 --- a/mooncake-transfer-engine/src/transport/nvlink_transport/nvlink_transport.cpp +++ b/mooncake-transfer-engine/src/transport/nvlink_transport/nvlink_transport.cpp @@ -519,9 +519,6 @@ int NvlinkTransport::relocateSharedMemoryAddress(uint64_t &dest_addr, auto shm_addr = remap_entries_[std::make_pair(target_id, entry.addr)].shm_addr; dest_addr = dest_addr - entry.addr + ((uint64_t)shm_addr); - // Device synchronize is required for NVLink IPC memory - // to ensure that the memory is ready for access. - cudaDeviceSynchronize(); return 0; } index++; From b43ca23aedd7342cf9dd537041f4c3e37f788fb5 Mon Sep 17 00:00:00 2001 From: Feng Ren Date: Tue, 12 Aug 2025 02:59:55 +0000 Subject: [PATCH 4/5] More pessimistic --- mooncake-integration/transfer_engine/transfer_engine_py.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mooncake-integration/transfer_engine/transfer_engine_py.cpp b/mooncake-integration/transfer_engine/transfer_engine_py.cpp index edb052d92..62f5d8595 100644 --- a/mooncake-integration/transfer_engine/transfer_engine_py.cpp +++ b/mooncake-integration/transfer_engine/transfer_engine_py.cpp @@ -36,7 +36,7 @@ static void synchronizePyTorchEvents() { cudaEvent_t ev; cudaEventCreate(&ev); cudaEventRecord(ev, stream); - cudaStreamWaitEvent(0, ev, 0); + cudaEventSynchronize(ev); cudaEventDestroy(ev); } #else From cf6cfd97add8309084fdc544ce0e05f89d9046fe Mon Sep 17 00:00:00 2001 From: Feng Ren Date: Tue, 12 Aug 2025 05:55:06 +0000 Subject: [PATCH 5/5] Fix format issue --- mooncake-integration/transfer_engine/transfer_engine_py.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mooncake-integration/transfer_engine/transfer_engine_py.cpp b/mooncake-integration/transfer_engine/transfer_engine_py.cpp index 62f5d8595..9f3f656a6 100644 --- a/mooncake-integration/transfer_engine/transfer_engine_py.cpp +++ b/mooncake-integration/transfer_engine/transfer_engine_py.cpp @@ -21,7 +21,7 @@ #include #ifdef USE_MNNVL -#include // pytorch dependencies +#include // pytorch dependencies #include #include "transport/nvlink_transport/nvlink_transport.h"