[TRTLLM-6549][fix] add kv cache time output back (#7798)

zhengd-nv · web-flow · commit e3c1a9409f3f · 2025-09-23T14:12:42.000-04:00
Signed-off-by: zhengd-nv <200704041+zhengd-nv@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp
@@ -156,6 +156,21 @@ static int32_t tagFromRequestId(LlmRequest::RequestIdType requestId)
     return ((requestId & 0xFFF) << 8) | (kDATA_TAG & 0xFF);
 }
 
+namespace fs = std::filesystem;
+
+static fs::path getTransferOutputPath(char const* tag) // Returns <dir>/rank_<rank>_<tag>.csv, or an empty path.
+{
+    auto outputPath = common::getEnvKVCacheTransferOutputPath(); // Output directory; empty means no output.
+    if (!outputPath.empty())
+    {
+        auto rank = mpi::MpiComm::world().getRank(); // Rank in the MPI world communicator, used in the file name.
+        auto path = fs::path(outputPath);
+        fs::create_directories(path); // Make sure the directory exists before callers open a file inside it.
+        return path / ("rank_" + std::to_string(rank) + "_" + tag + ".csv");
+    }
+    return {}; // No output directory configured: hand back an empty path.
+}
+
 struct ReceiveCacheResource
 {
     runtime::BufferManager mBufferManager;
@@ -282,6 +297,17 @@ class CacheSender::Impl
         auto it = mRequestToSession.find(requestId);
         TLLM_CHECK(it != mRequestToSession.end());
         std::unique_lock<std::mutex> lk(mMtxForMap);
+        if (!common::getEnvKVCacheTransferOutputPath().empty())
+        {
+            if (!mMeasuresFile.is_open())
+            {
+                auto outputPath = getTransferOutputPath("send");
+                mMeasuresFile.open(outputPath);
+                TLLM_CHECK_WITH_INFO(
+                    mMeasuresFile.is_open(), "Failed to open transfer output file: %s", outputPath.string().c_str());
+            }
+            it->second.exportMeasure(mMeasuresFile, true);
+        }
         mRequestToSession.erase(it);
     }
 
@@ -331,7 +357,8 @@ class CacheSender::Impl
             if (it == mRequestToSession.end())
             {
                 auto session = TransferSession(std::vector<Connection const*>(peerRelativeRanks.size(), nullptr),
-                    DataContext{tagFromRequestId(requestId)}, mSelfState, info.getTransState(), mBufferManager);
+                    DataContext{tagFromRequestId(requestId)}, mSelfState, info.getTransState(), mBufferManager, nullptr,
+                    !common::getEnvKVCacheTransferOutputPath().empty());
                 it = mRequestToSession.emplace(requestId, std::move(session)).first;
             }
             it->second.setConnection(peerIdx, connection);
@@ -527,6 +554,7 @@ class CacheSender::Impl
     std::unique_ptr<BaseCacheFormatter> mFormatter;
     std::mutex mMtxForMap;
     runtime::BufferManager mBufferManager;
+    std::ofstream mMeasuresFile;
 };
 
 class CacheReceiver::Impl
@@ -587,6 +615,18 @@ class CacheReceiver::Impl
     void receiveSync(TransferSession& session)
     {
         mFormatter->unformat(session);
+        if (!common::getEnvKVCacheTransferOutputPath().empty()) // Export measures only when an output path is set.
+        {
+            std::unique_lock<std::mutex> lock(mMeasuresFileMutex); // Held while opening and writing mMeasuresFile.
+            if (!mMeasuresFile.is_open()) // Open the "recv" output file on first use and keep it open.
+            {
+                auto outputPath = getTransferOutputPath("recv");
+                mMeasuresFile.open(outputPath);
+                TLLM_CHECK_WITH_INFO(
+                    mMeasuresFile.is_open(), "Failed to open transfer output file: %s", outputPath.string().c_str());
+            }
+            session.exportMeasure(mMeasuresFile, false); // Sender passes true here; presumably false = receive side (TODO confirm).
+        }
     }
 
     TransferSession sendRequestInfo(LlmRequest const& llmRequest)
@@ -652,7 +692,7 @@ class CacheReceiver::Impl
         }
         auto const& resource = getReceiveCacheResource(llmRequest);
         return TransferSession(std::move(counterPartConnections), DataContext{tagFromRequestId(requestId)}, mSelfState,
-            contextState, resource->mBufferManager, &llmRequest);
+            contextState, resource->mBufferManager, &llmRequest, !common::getEnvKVCacheTransferOutputPath().empty());
     }
 
     std::unique_ptr<ReceiveCacheResource> const& getReceiveCacheResource(LlmRequest const& llmRequest)
@@ -831,6 +871,8 @@ class CacheReceiver::Impl
     std::unordered_map<std::string, std::unique_ptr<ReceiveCacheResource>> mProcessToResources;
     std::mutex mProcessIoResouceMutex;
     runtime::BufferManager mBufferManager;
+    std::ofstream mMeasuresFile;
+    std::mutex mMeasuresFileMutex;
 };
 
 void CacheSender::ImplDeleter::operator()(Impl* ptr)
diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py
@@ -584,6 +584,52 @@ def extra_endpoints_test(server_url: str):
                            extra_endpoints_test=extra_endpoints_test)
 
 
+@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'],
+                         indirect=True)
+def test_disaggregated_kv_cache_time_output(disaggregated_test_root, llm_venv,
+                                            disaggregated_example_root,
+                                            llama_model_root):
+    src_dst_dict = {  # model source dir -> symlink location under the working directory
+        llama_model_root:
+        f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    }
+    for src, dst in src_dst_dict.items():
+        if not os.path.islink(dst):  # create the link only when it is not already present
+            os.makedirs(os.path.dirname(dst), exist_ok=True)
+            os.symlink(src, dst, target_is_directory=True)
+
+    output_path = os.path.join(llm_venv.get_working_directory(), "cache_time")  # passed to the run via TRTLLM_KVCACHE_TIME_OUTPUT_PATH
+    run_disaggregated_test(disaggregated_example_root,
+                           "perf_metrics",
+                           env=llm_venv._new_env
+                           | {"TRTLLM_KVCACHE_TIME_OUTPUT_PATH": output_path},
+                           cwd=llm_venv.get_working_directory())
+    assert os.path.isdir(output_path)  # the run must have created the output directory
+    send_file = os.path.join(output_path, "rank_0_send.csv")  # expects rank 0 to have written the send file
+    recv_file = os.path.join(output_path, "rank_1_recv.csv")  # expects rank 1 to have written the recv file
+    assert os.path.exists(send_file)
+    assert os.path.exists(recv_file)
+    with open(send_file, "r") as f:
+        lines = f.readlines()
+        assert len(lines) > 1  # header row plus at least one data row
+        assert lines[0].startswith(
+            "RequestID,Delay(ms),Duration(ms),Bandwidth(Gbps)")
+        # get a send sample and match the recv
+        sample = lines[1].split(',')  # first data row; column 0 is RequestID per the header above
+        assert len(sample) >= 4
+    with open(recv_file, "r") as f:
+        lines = f.readlines()
+        assert len(lines) > 1
+        matched = False
+        for line in lines:  # find the recv row whose first column equals the send sample's
+            sample_recv = line.split(',')
+            if sample_recv[0] == sample[0]:
+                matched = True
+                assert float(sample_recv[1]) <= float(sample[1])  # column 1 is Delay(ms) in the send header; recv layout assumed the same - TODO confirm
+                break
+        assert matched  # fails if no recv row shares the sampled send row's first column
+
+
 @pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'],
                          indirect=True)
 def test_disaggregated_trtllm_sampler(disaggregated_test_root, llm_venv,
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -75,6 +75,7 @@ l0_h100:
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_two_mtp[DeepSeek-V3-Lite-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx_tp1_single_gpu[DeepSeek-V3-Lite-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
+  - disaggregated/test_disaggregated.py::test_disaggregated_kv_cache_time_output[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_overlap[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_perf_metrics[TinyLlama-1.1B-Chat-v1.0]