Skip to content

Commit 934d03c

Browse files
authored
fix: Disable calls to abort() at TRT-LLM backend - temporary (#5827)
Signed-off-by: Jacky <[email protected]>
1 parent f6d6b34 commit 934d03c

File tree

3 files changed

+9
-12
lines changed

3 files changed

+9
-12
lines changed

components/src/dynamo/trtllm/request_handlers/handler_base.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -198,11 +198,12 @@ async def _handle_cancellation(
198198
)
199199

200200
# Abort the generation
201-
# Temporary: Disabled on DECODE workers to prevent engine hangs in
202-
# disaggregated setups where abort() may cause the engine to get stuck
203-
if self.disaggregation_mode != DisaggregationMode.DECODE:
204-
generation_result.abort()
205-
logging.debug(f"Aborted Request ID: {context.id()}")
201+
# Temporary:
202+
# Disable calling abort() on the engine, which may get stuck if a
203+
# sufficiently large number of concurrent requests are cancelled.
204+
# To restore the original behavior:
205+
# call `generation_result.abort()`; and then
206+
# log `logging.debug(f"Aborted Request ID: {context.id()}")`
206207

207208
# Clean up any remaining background task
208209
for task in pending:

docs/reference/feature-matrix.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full
9696
| **KV Block Manager** ||||| | | | | | |
9797
| **Multimodal** | ✅<sup>1</sup> | <sup>2</sup> |||| | | | | |
9898
| **Request Migration** | 🚧<sup>3</sup> |||| 🚧 || | | | |
99-
| **Request Cancellation** |||||||| | | |
99+
| **Request Cancellation** |<sup>5</sup> |<sup>5</sup> |<sup>5</sup> |<sup>5</sup> |<sup>5</sup> |<sup>5</sup> || | | |
100100
| **LoRA** | | | | | | | || | |
101101
| **Tool Calling** |||||||| || |
102102
| **Speculative Decoding** |||||||| |||
@@ -106,6 +106,7 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full
106106
> 2. **Multimodal + KV-Aware Routing**: Not supported. The KV router currently tracks token-based blocks only. ([Source][kv-routing])
107107
> 3. **Request Migration**: Supported on **Decode/Aggregated** workers only. **Prefill** workers do not support migration. ([Source][trtllm-readme])
108108
> 4. **Speculative Decoding**: Llama 4 + Eagle support documented. ([Source][trtllm-eagle])
109+
> 5. **Request Cancellation**: Due to known issues, the TensorRT-LLM engine is temporarily not notified when a request is cancelled, so resources allocated to cancelled requests are not freed.
109110
110111
---
111112

tests/fault_tolerance/cancellation/test_trtllm.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
3939
pytest.mark.post_merge, # post_merge to pinpoint failure commit
4040
pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
41+
pytest.mark.xfail(reason="Cancellation is temporarily disabled", strict=True),
4142
]
4243

4344

@@ -253,9 +254,6 @@ def test_request_cancellation_trtllm_aggregated(
253254
logger.info(f"{description} detected successfully")
254255

255256

256-
@pytest.mark.xfail(
257-
reason="Decode worker cancellation is temporarily disabled", strict=True
258-
)
259257
@pytest.mark.timeout(195) # 3x average
260258
def test_request_cancellation_trtllm_decode_cancel(
261259
request, runtime_services_dynamic_ports, predownload_models
@@ -432,9 +430,6 @@ def test_request_cancellation_trtllm_prefill_cancel(
432430
)
433431

434432

435-
@pytest.mark.xfail(
436-
reason="Decode worker cancellation is temporarily disabled", strict=True
437-
)
438433
@pytest.mark.xfail(reason="Test fails only on CI", strict=False)
439434
@pytest.mark.timeout(195) # 3x average
440435
def test_request_cancellation_trtllm_kv_transfer_cancel(

0 commit comments

Comments
 (0)