Add testing and address review comments

Tabrizian · Tabrizian · commit c11ce69c8506 · 2026-01-04T15:20:57.000-05:00
Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
@@ -1797,10 +1797,8 @@ void WindowBlockManager::unpinBlocksById(std::vector<KVCacheBlock::IdType> const
 
     for (auto const& blockId : blockIds)
     {
-        if (blockId < 0 || static_cast<size_t>(blockId) >= mAllBlocksById.size())
-        {
-            continue;
-        }
+        TLLM_CHECK_WITH_INFO(blockId >= 0 && static_cast<size_t>(blockId) < mAllBlocksById.size(),
+            "Block id %d is out of range", blockId);
         auto block = mAllBlocksById[blockId];
         if (block && block->getBlockId() != KVCacheBlock::kCachedBlocksRootId)
         {
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp
@@ -363,22 +363,7 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(nb::module_& m)
             nb::call_guard<nb::gil_scoped_release>())
         .def("add_token", &BaseKVCacheManager::addToken, nb::call_guard<nb::gil_scoped_release>())
         .def("add_sequence", &BaseKVCacheManager::addSequence, nb::call_guard<nb::gil_scoped_release>())
-        .def(
-            "remove_sequence",
-            [](tbk::BaseKVCacheManager& self, tb::LlmRequest::RequestIdType requestId, tb::LlmRequest const* llmRequest,
-                bool pinOnRelease)
-            {
-                if (llmRequest != nullptr)
-                {
-                    return self.removeSequence(requestId, *llmRequest, pinOnRelease);
-                }
-                else
-                {
-                    return self.removeSequence(requestId, std::nullopt, pinOnRelease);
-                }
-            },
-            nb::arg("request_id"), nb::arg("llm_request") = nullptr, nb::arg("pin_on_release") = false,
-            nb::call_guard<nb::gil_scoped_release>())
+        .def("remove_sequence", &BaseKVCacheManager::removeSequence, nb::call_guard<nb::gil_scoped_release>())
         .def("pin_blocks", &BaseKVCacheManager::pinBlocks, nb::call_guard<nb::gil_scoped_release>())
         .def("scheduling_remove_sequence", &BaseKVCacheManager::schedulingRemoveSequence,
             nb::call_guard<nb::gil_scoped_release>())
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp
@@ -367,22 +367,7 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(py::module_& m)
             py::call_guard<py::gil_scoped_release>())
         .def("add_token", &BaseKVCacheManager::addToken, py::call_guard<py::gil_scoped_release>())
         .def("add_sequence", &BaseKVCacheManager::addSequence, py::call_guard<py::gil_scoped_release>())
-        .def(
-            "remove_sequence",
-            [](tbk::BaseKVCacheManager& self, tb::LlmRequest::RequestIdType requestId, tb::LlmRequest const* llmRequest,
-                bool pinOnRelease)
-            {
-                if (llmRequest != nullptr)
-                {
-                    return self.removeSequence(requestId, *llmRequest, pinOnRelease);
-                }
-                else
-                {
-                    return self.removeSequence(requestId, std::nullopt, pinOnRelease);
-                }
-            },
-            py::arg("request_id"), py::arg("llm_request") = nullptr, py::arg("pin_on_release") = false,
-            py::call_guard<py::gil_scoped_release>())
+        .def("remove_sequence", &BaseKVCacheManager::removeSequence, py::call_guard<py::gil_scoped_release>())
         .def("pin_blocks", &BaseKVCacheManager::pinBlocks, py::call_guard<py::gil_scoped_release>())
         .def("scheduling_remove_sequence", &BaseKVCacheManager::schedulingRemoveSequence,
             py::call_guard<py::gil_scoped_release>())
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -2445,12 +2445,7 @@ def _do_terminate_request(self, request: LlmRequest):
                     self.ctx_in_transmission_requests[request.py_request_id] = (
                         (request, block_id, self.ctx_in_transmission_counter))
 
-        store_blocks_for_reuse = not (self.block_reuse_enabled
-                                      and not self.kv_cache_manager.is_vswa
-                                      and self.kv_cache_transceiver
-                                      and request.is_context_only_request)
-        self.resource_manager.free_resources(
-            request, store_blocks_for_reuse=store_blocks_for_reuse)
+        self.resource_manager.free_resources(request)
 
         if self.gather_all_responses or self.dist.rank == 0:
             self.result_wait_queues.pop(request.py_request_id, None)
diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -685,13 +685,8 @@ def update_kv_cache_draft_token_location(self,
                 None,
             )
 
-    def free_resources(self,
-                       request: LlmRequest,
-                       pin_on_release: bool = False,
-                       store_blocks_for_reuse: bool = True):
-        # When store_blocks_for_reuse is False, pass None to prevent block storage
-        llm_request = request if store_blocks_for_reuse else None
-        return self.impl.remove_sequence(request.py_request_id, llm_request,
+    def free_resources(self, request: LlmRequest, pin_on_release: bool = False):
+        return self.impl.remove_sequence(request.py_request_id, request,
                                          pin_on_release)
 
     def store_blocks_for_reuse(self,
@@ -1435,17 +1430,11 @@ def update_resources(self,
                 else:
                     resource_manager.update_resources(scheduled_batch)
 
-    def free_resources(self,
-                       request: LlmRequest,
-                       store_blocks_for_reuse: bool = True):
+    def free_resources(self, request: LlmRequest):
         for resource_type, resource_manager in reversed(
                 self.resource_managers.items()):
             if hasattr(resource_manager, "free_resources"):
-                if resource_type == ResourceManagerType.KV_CACHE_MANAGER:
-                    resource_manager.free_resources(
-                        request, store_blocks_for_reuse=store_blocks_for_reuse)
-                else:
-                    resource_manager.free_resources(request)
+                resource_manager.free_resources(request)
 
     def reorder_pipeline(self,
                          resource_manager_list: list[ResourceManagerType]):
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cancel_stress_test.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cancel_stress_test.yaml
@@ -0,0 +1,44 @@
+hostname: localhost
+port: 8000
+model: DeepSeek-V3-Lite/bf16
+backend: "pytorch"
+enable_autotuner: False
+context_servers:
+  disable_overlap_scheduler: True
+  num_instances: 1
+  tensor_parallel_size: 1
+  pipeline_parallel_size: 1
+  max_num_tokens: 16384
+  max_seq_len: 32768
+  enable_chunked_prefill: True
+  kv_cache_config:
+    enable_block_reuse: True
+    enable_partial_reuse: True
+    free_gpu_memory_fraction: 0.3
+  cache_transceiver_config:
+    backend: "DEFAULT"
+    max_tokens_in_buffer: 32768
+  cuda_graph_config:
+    enable_padding: True
+    max_batch_size: 1
+  urls:
+      - "localhost:8001"
+generation_servers:
+  num_instances: 1
+  tensor_parallel_size: 1
+  pipeline_parallel_size: 1
+  max_num_tokens: 2048
+  max_seq_len: 32768
+  enable_chunked_prefill: True
+  kv_cache_config:
+    enable_block_reuse: True
+    enable_partial_reuse: True
+    free_gpu_memory_fraction: 0.85
+  cache_transceiver_config:
+    backend: "DEFAULT"
+    max_tokens_in_buffer: 32768
+  cuda_graph_config:
+    enable_padding: True
+    max_batch_size: 64
+  urls:
+      - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py
@@ -200,6 +200,8 @@ def get_test_config(test_desc, example_dir, test_root):
         "gpt_oss_120b_stress":
         (4,
          f"{test_configs_root}/disagg_config_ctxtp2_gentp2_gptoss_tllm.yaml"),
+        "cancel_stress_test":
+        (2, f"{test_configs_root}/disagg_config_cancel_stress_test.yaml"),
     }
 
     if test_desc not in config_map:
@@ -2098,3 +2100,183 @@ def test_disaggregated_stress_test(disaggregated_test_root,
                              threshold=test_config.accuracy_threshold,
                              env=llm_venv._new_env,
                              cwd=llm_venv.get_working_directory())
+
+
+def run_cancel_stress_test(server_url: str,
+                           num_bursts: int = 5,
+                           requests_per_burst: int = 32,
+                           prompt_len_range: tuple = (2000, 8000),
+                           cancel_after_range: tuple = (0.01, 0.1)):
+    """
+    Stress test that sends requests with large contexts and cancels them
+    during prefill to test resource cleanup under cancellation.
+
+    Args:
+        server_url: The server URL (e.g., "http://localhost:8000")
+        num_bursts: Number of request bursts to send
+        requests_per_burst: Number of concurrent requests per burst
+        prompt_len_range: (min, max) prompt length in tokens
+        cancel_after_range: (min, max) seconds to wait before cancelling
+    """
+    import asyncio
+    import random
+    import time
+
+    import aiohttp
+
+    async def spam_and_cancel(session, req_id, url, prompt_len_range,
+                              cancel_after_range):
+        """Send a request and cancel it during prefill."""
+        prompt_len = random.randint(prompt_len_range[0], prompt_len_range[1])
+        prompt = "test " * (prompt_len // 5)
+
+        payload = {
+            "model": "test-model",
+            "prompt": prompt,
+            "max_tokens": 10,
+            "stream": True
+        }
+
+        try:
+            cancel_after = random.uniform(cancel_after_range[0],
+                                          cancel_after_range[1])
+            start = time.time()
+            async with session.post(
+                    f"{url}/v1/completions",
+                    json=payload,
+                    timeout=aiohttp.ClientTimeout(total=60)) as resp:
+                async for line in resp.content:
+                    if time.time() - start > cancel_after:
+                        # Force disconnect during prefill
+                        break
+        except Exception:
+            pass  # Connection abort is expected
+
+    async def run_bursts():
+        async with aiohttp.ClientSession() as session:
+            for burst_idx in range(num_bursts):
+                tasks = [
+                    spam_and_cancel(session, i, server_url, prompt_len_range,
+                                    cancel_after_range)
+                    for i in range(requests_per_burst)
+                ]
+                await asyncio.gather(*tasks)
+                logger.info(
+                    f"Completed burst {burst_idx + 1}/{num_bursts} ({requests_per_burst} requests)"
+                )
+                await asyncio.sleep(0.05)
+
+    asyncio.run(run_bursts())
+
+
+def run_disaggregated_cancel_test(example_dir,
+                                  test_desc,
+                                  env=None,
+                                  cwd=None,
+                                  num_bursts=64,
+                                  requests_per_burst=64):
+    """Run disaggregated test with request cancellation stress test."""
+    cleanup_output_files()
+    run_env = env.copy()
+    run_env["UCX_TLS"] = "^ib"
+
+    num_ranks, config_file = get_test_config(test_desc, example_dir,
+                                             os.path.dirname(__file__))
+
+    workers_cmd = [
+        'mpirun', '--allow-run-as-root', '--oversubscribe', '-n',
+        str(num_ranks), 'trtllm-serve', 'disaggregated_mpi_worker', '-c',
+        config_file
+    ]
+
+    server_start_timeout = 1200
+    server_cmd = [
+        'trtllm-serve', 'disaggregated', '--server_start_timeout',
+        str(server_start_timeout), '-c', config_file
+    ]
+    server_host, server_port = get_disagg_server_url_from_cfg(config_file)
+    server_url = f"http://{server_host}:{server_port}"
+
+    try:
+        with (open('output_workers.log', 'w') as output_workers,
+              popen(workers_cmd,
+                    stdout=output_workers,
+                    stderr=subprocess.STDOUT,
+                    env=run_env,
+                    cwd=cwd) as workers_proc, open('output_disagg.log', 'w') as
+              output_disagg,
+              popen(server_cmd,
+                    stdout=output_disagg,
+                    stderr=subprocess.STDOUT,
+                    env=run_env,
+                    cwd=cwd) as server_proc):
+
+            # Wait for server to be ready
+            if not wait_for_server(server_host,
+                                   server_port,
+                                   timeout_seconds=server_start_timeout):
+                raise RuntimeError(
+                    f"Disaggregated server did not become ready within {server_start_timeout} seconds"
+                )
+
+            # Run the cancel stress test
+            run_cancel_stress_test(server_url,
+                                   num_bursts=num_bursts,
+                                   requests_per_burst=requests_per_burst)
+
+            # Verify server is still healthy after stress test by sending a normal request
+            client_dir = f"{example_dir}/clients"
+            client_cmd = [
+                'python3', f'{client_dir}/disagg_client.py', '-c', config_file,
+                '-p', f'{client_dir}/prompts.json', '--ignore-eos',
+                '--server-start-timeout',
+                str(server_start_timeout)
+            ]
+            check_call(client_cmd,
+                       env=env,
+                       poll_procs=[workers_proc, server_proc])
+
+    except Exception:
+        logger.error("-------- Workers output --------")
+        with open('output_workers.log', 'r') as f:
+            logger.error(f.read())
+
+        logger.error("-------- Disagg server output --------")
+        with open('output_disagg.log', 'r') as f:
+            logger.error(f.read())
+        raise
+    finally:
+        if 'server_proc' in locals() and 'workers_proc' in locals():
+            server_proc.terminate()
+            workers_proc.terminate()
+            server_proc.wait()
+            workers_proc.wait()
+
+
+@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-bf16'],
+                         indirect=True)
+def test_disaggregated_cancel_large_context_requests(disaggregated_test_root,
+                                                     disaggregated_example_root,
+                                                     llm_venv,
+                                                     deepseek_v3_model_root):
+    """
+    Test that the disaggregated server handles request cancellations gracefully.
+
+    This test sends bursts of requests with large contexts and cancels them
+    during prefill to stress test resource cleanup.
+    """
+    src_dst_dict = {
+        deepseek_v3_model_root:
+        f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/bf16",
+    }
+    for src, dst in src_dst_dict.items():
+        if not os.path.islink(dst):
+            os.makedirs(os.path.dirname(dst), exist_ok=True)
+            os.symlink(src, dst, target_is_directory=True)
+
+    run_disaggregated_cancel_test(disaggregated_example_root,
+                                  "cancel_stress_test",
+                                  env=llm_venv._new_env,
+                                  cwd=llm_venv.get_working_directory(),
+                                  num_bursts=5,
+                                  requests_per_burst=32)
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -43,6 +43,7 @@ l0_dgx_h100:
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-False]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
   - unittest/llmapi/apps/test_disagg_serving_perf_metrics.py
+  - disaggregated/test_disaggregated.py::test_disaggregated_cancel_large_context_requests[DeepSeek-V3-Lite-bf16]
   # ------------- AutoDeploy tests ---------------
   - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2]
   # llmapi

Original file line number	Diff line number	Diff line change
`@@ -1797,10 +1797,8 @@ void WindowBlockManager::unpinBlocksById(std::vector<KVCacheBlock::IdType> const`
`1797`	`1797`
`1798`	`1798`	`for (auto const& blockId : blockIds)`
`1799`	`1799`	`{`
`1800`		`- if (blockId < 0 \|\| static_cast<size_t>(blockId) >= mAllBlocksById.size())`
`1801`		`- {`
`1802`		`- continue;`
`1803`		`- }`
	`1800`	`+ TLLM_CHECK_WITH_INFO(blockId >= 0 && static_cast<size_t>(blockId) < mAllBlocksById.size(),`
	`1801`	`+ "Block id %d is out of range", blockId);`
`1804`	`1802`	`auto block = mAllBlocksById[blockId];`
`1805`	`1803`	`if (block && block->getBlockId() != KVCacheBlock::kCachedBlocksRootId)`
`1806`	`1804`	`{`