fix: Disable calls to abort() at TRT-LLM backend - temporary (#5827)

kthui · web-flow · commit 934d03c54518 · 2026-01-30T15:03:30.000-08:00
Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com>
diff --git a/components/src/dynamo/trtllm/request_handlers/handler_base.py b/components/src/dynamo/trtllm/request_handlers/handler_base.py
@@ -198,11 +198,12 @@ async def _handle_cancellation(
             )
 
             # Abort the generation
-            # Temporary: Disabled on DECODE workers to prevent engine hangs in
-            # disaggregated setups where abort() may cause the engine to get stuck
-            if self.disaggregation_mode != DisaggregationMode.DECODE:
-                generation_result.abort()
-                logging.debug(f"Aborted Request ID: {context.id()}")
+            # Temporary:
+            #   Disable calling abort() on the engine, which may get stuck if a
+            #   sufficiently large number of concurrent requests is cancelled.
+            # Note to restore:
+            #   call `generation_result.abort()`; and then
+            #   log `logging.debug(f"Aborted Request ID: {context.id()}")`
 
             # Clean up any remaining background task
             for task in pending:
diff --git a/docs/reference/feature-matrix.md b/docs/reference/feature-matrix.md
@@ -96,7 +96,7 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full
 | **KV Block Manager** | ✅ | ✅ | ✅ | — | | | | | | |
 | **Multimodal** | ✅<sup>1</sup> | <sup>2</sup> | — | ✅ | — | | | | | |
 | **Request Migration** | 🚧<sup>3</sup> | ✅ | ✅ | ✅ | 🚧 | — | | | | |
-| **Request Cancellation** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | — | | | |
+| **Request Cancellation** | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | — | | | |
 | **LoRA** | | | | | | | | — | | |
 | **Tool Calling** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | — | |
 | **Speculative Decoding** | ✅ | ✅ | — | ✅ | — | ✅ | ✅ | | ✅ | — |
@@ -106,6 +106,7 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full
 > 2. **Multimodal + KV-Aware Routing**: Not supported. The KV router currently tracks token-based blocks only. ([Source][kv-routing])
 > 3. **Request Migration**: Supported on **Decode/Aggregated** workers only. **Prefill** workers do not support migration. ([Source][trtllm-readme])
 > 4. **Speculative Decoding**: Llama 4 + Eagle support documented. ([Source][trtllm-eagle])
+> 5. **Request Cancellation**: Due to known issues, the TensorRT-LLM engine is temporarily not notified of request cancellations, so resources allocated to cancelled requests are not freed.
 
 ---
 
diff --git a/tests/fault_tolerance/cancellation/test_trtllm.py b/tests/fault_tolerance/cancellation/test_trtllm.py
@@ -38,6 +38,7 @@
     pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
     pytest.mark.post_merge,  # post_merge to pinpoint failure commit
     pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
+    pytest.mark.xfail(reason="Cancellation is temporarily disabled", strict=True),
 ]
 
 
@@ -253,9 +254,6 @@ def test_request_cancellation_trtllm_aggregated(
                 logger.info(f"{description} detected successfully")
 
 
-@pytest.mark.xfail(
-    reason="Decode worker cancellation is temporarily disabled", strict=True
-)
 @pytest.mark.timeout(195)  # 3x average
 def test_request_cancellation_trtllm_decode_cancel(
     request, runtime_services_dynamic_ports, predownload_models
@@ -432,9 +430,6 @@ def test_request_cancellation_trtllm_prefill_cancel(
                 )
 
 
-@pytest.mark.xfail(
-    reason="Decode worker cancellation is temporarily disabled", strict=True
-)
 @pytest.mark.xfail(reason="Test fails only on CI", strict=False)
 @pytest.mark.timeout(195)  # 3x average
 def test_request_cancellation_trtllm_kv_transfer_cancel(