Skip to content

Commit ad0e91a

Browse files
authored
[https://nvbugs/5546202][fix] Fix concurrent bug for NIXL cache transceiver (NVIDIA#8147)
Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
1 parent 6545d54 commit ad0e91a

File tree

3 files changed

+17
-23
lines changed

3 files changed

+17
-23
lines changed

cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp

Lines changed: 14 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -291,8 +291,9 @@ class CacheSender::Impl
291291
mSelfState.setCommState(std::move(commState));
292292
}
293293

294-
[[nodiscard]] size_t getCounterpartsCount(LlmRequest::RequestIdType requestId) const
294+
[[nodiscard]] size_t getCounterpartsCount(LlmRequest::RequestIdType requestId)
295295
{
296+
std::unique_lock<std::mutex> lk(mMtxForMap);
296297
auto it = mRequestToSession.find(requestId);
297298
TLLM_CHECK(it != mRequestToSession.end());
298299
return it->second.getConnections().size();
@@ -472,8 +473,18 @@ class CacheSender::Impl
472473
// TODO(zhengd): pass the hashes directly instead of update llmRequest
473474
auto llmRequest = it->second.mRequest;
474475
llmRequest->setRequestedBlockHashes(std::move(blockHashes));
475-
476-
asyncSendAndRemoveResponse(it->first, std::move(it->second));
476+
if (dynamic_cast<executor::kv_cache::AgentConnectionManager*>(mManager) != nullptr)
477+
{
478+
// our NIXL impl seems to only support recv and send in the same thread
479+
// if we use zmq as control path, we may avoid this issue
480+
sendAndRemoveResponse(it->first, std::move(it->second));
481+
}
482+
else
483+
{
484+
// if we send data in another thread, multiple rank may send data for different requests at the same
485+
// time with gen DP case.
486+
asyncSendAndRemoveResponse(it->first, std::move(it->second));
487+
}
477488
removeResponse(it);
478489
}
479490
mCurrentRequest = std::nullopt;

tests/integration/defs/cpp/test_multi_gpu.py

Lines changed: 2 additions & 18 deletions
Original file line number · Diff line number · Diff line change
@@ -108,22 +108,6 @@ def run_cache_transceiver_tests(build_dir: _pl.Path,
108108
env=mgpu_env,
109109
timeout=timeout)
110110

111-
# Nixl transfer agent tests
112-
new_env = get_multi_gpu_env(kv_cache_type=KVCacheType.NIXL)
113-
114-
# Cache transceiver tests
115-
cache_trans_test_8_proc = [
116-
"mpirun",
117-
"-n",
118-
"8",
119-
"--allow-run-as-root",
120-
"cacheTransceiverTest",
121-
]
122-
_cpp.run_command(cache_trans_test_8_proc,
123-
cwd=tests_dir,
124-
env=new_env,
125-
timeout=600)
126-
127111

128112
def run_user_buffer_tests(build_dir: _pl.Path, nprocs=2, timeout=300):
129113
tests_dir = build_dir / "tests" / "unit_tests" / "multi_gpu"
@@ -500,8 +484,8 @@ def test_fused_gemm_allreduce(build_google_tests, nprocs, build_dir):
500484

501485
@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
502486
indirect=True)
503-
@pytest.mark.parametrize("kvcache_type", [KVCacheType.MPI, KVCacheType.UCX],
504-
ids=["mpi_kvcache", "ucx_kvcache"])
487+
@pytest.mark.parametrize("kvcache_type", [KVCacheType.NIXL, KVCacheType.UCX],
488+
ids=["nixl_kvcache", "ucx_kvcache"])
505489
@pytest.mark.parametrize("nprocs", [2, 8], ids=["2proc", "8proc"])
506490
def test_cache_transceiver(build_google_tests, nprocs, kvcache_type, build_dir):
507491

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 1 addition & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -175,9 +175,8 @@ l0_dgx_h100:
175175
# ------------- CPP tests ---------------
176176
- cpp/test_multi_gpu.py::test_mpi_utils[90]
177177
- cpp/test_multi_gpu.py::test_fused_gemm_allreduce[4proc-90]
178-
- cpp/test_multi_gpu.py::test_cache_transceiver[2proc-mpi_kvcache-90]
179178
- cpp/test_multi_gpu.py::test_cache_transceiver[2proc-ucx_kvcache-90]
180-
- cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mpi_kvcache-90]
179+
- cpp/test_multi_gpu.py::test_cache_transceiver[8proc-nixl_kvcache-90]
181180
- cpp/test_multi_gpu.py::test_cache_transceiver[8proc-ucx_kvcache-90]
182181
- cpp/test_multi_gpu.py::test_user_buffer[2proc-90]
183182
- cpp/test_multi_gpu.py::test_enc_dec[t5-90]

0 commit comments

Comments (0)