Skip to content

Commit ad0e91a

Browse files
authored
[https://nvbugs/5546202][fix] Fix concurrent bug for NIXL cache transceiver (NVIDIA#8147)
Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
1 parent 6545d54 commit ad0e91a

File tree

3 files changed

+17
-23
lines changed

3 files changed

+17
-23
lines changed

cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp

Lines changed: 14 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -291,8 +291,9 @@ class CacheSender::Impl
291291
mSelfState.setCommState(std::move(commState));
292292
}
293293

294-
[[nodiscard]] size_t getCounterpartsCount(LlmRequest::RequestIdType requestId) const
294+
[[nodiscard]] size_t getCounterpartsCount(LlmRequest::RequestIdType requestId)
295295
{
296+
std::unique_lock<std::mutex> lk(mMtxForMap);
296297
auto it = mRequestToSession.find(requestId);
297298
TLLM_CHECK(it != mRequestToSession.end());
298299
return it->second.getConnections().size();
@@ -472,8 +473,18 @@ class CacheSender::Impl
472473
// TODO(zhengd): pass the hashes directly instead of update llmRequest
473474
auto llmRequest = it->second.mRequest;
474475
llmRequest->setRequestedBlockHashes(std::move(blockHashes));
475-
476-
asyncSendAndRemoveResponse(it->first, std::move(it->second));
476+
if (dynamic_cast<executor::kv_cache::AgentConnectionManager*>(mManager) != nullptr)
477+
{
478+
// our NIXL impl seems to only support recv and send in the same thread
479+
// if we use zmq as control path, we may avoid this issue
480+
sendAndRemoveResponse(it->first, std::move(it->second));
481+
}
482+
else
483+
{
484+
// if we send data in another thread, multiple rank may send data for different requests at the same
485+
// time with gen DP case.
486+
asyncSendAndRemoveResponse(it->first, std::move(it->second));
487+
}
477488
removeResponse(it);
478489
}
479490
mCurrentRequest = std::nullopt;

tests/integration/defs/cpp/test_multi_gpu.py

Lines changed: 2 additions & 18 deletions
Original file line number · Diff line number · Diff line change
@@ -108,22 +108,6 @@ def run_cache_transceiver_tests(build_dir: _pl.Path,
108108
env=mgpu_env,
109109
timeout=timeout)
110110

111-
# Nixl transfer agent tests
112-
new_env = get_multi_gpu_env(kv_cache_type=KVCacheType.NIXL)
113-
114-
# Cache transceiver tests
115-
cache_trans_test_8_proc = [
116-
"mpirun",
117-
"-n",
118-
"8",
119-
"--allow-run-as-root",
120-
"cacheTransceiverTest",
121-
]
122-
_cpp.run_command(cache_trans_test_8_proc,
123-
cwd=tests_dir,
124-
env=new_env,
125-
timeout=600)
126-
127111

128112
def run_user_buffer_tests(build_dir: _pl.Path, nprocs=2, timeout=300):
129113
tests_dir = build_dir / "tests" / "unit_tests" / "multi_gpu"
@@ -500,8 +484,8 @@ def test_fused_gemm_allreduce(build_google_tests, nprocs, build_dir):
500484

501485
@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
502486
indirect=True)
503-
@pytest.mark.parametrize("kvcache_type", [KVCacheType.MPI, KVCacheType.UCX],
504-
ids=["mpi_kvcache", "ucx_kvcache"])
487+
@pytest.mark.parametrize("kvcache_type", [KVCacheType.NIXL, KVCacheType.UCX],
488+
ids=["nixl_kvcache", "ucx_kvcache"])
505489
@pytest.mark.parametrize("nprocs", [2, 8], ids=["2proc", "8proc"])
506490
def test_cache_transceiver(build_google_tests, nprocs, kvcache_type, build_dir):
507491

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 1 addition & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -175,9 +175,8 @@ l0_dgx_h100:
175175
# ------------- CPP tests ---------------
176176
- cpp/test_multi_gpu.py::test_mpi_utils[90]
177177
- cpp/test_multi_gpu.py::test_fused_gemm_allreduce[4proc-90]
178-
- cpp/test_multi_gpu.py::test_cache_transceiver[2proc-mpi_kvcache-90]
179178
- cpp/test_multi_gpu.py::test_cache_transceiver[2proc-ucx_kvcache-90]
180-
- cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mpi_kvcache-90]
179+
- cpp/test_multi_gpu.py::test_cache_transceiver[8proc-nixl_kvcache-90]
181180
- cpp/test_multi_gpu.py::test_cache_transceiver[8proc-ucx_kvcache-90]
182181
- cpp/test_multi_gpu.py::test_user_buffer[2proc-90]
183182
- cpp/test_multi_gpu.py::test_enc_dec[t5-90]

0 commit comments

Comments (0)