We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 02ed233 · commit 65b6d2c · Copy full SHA for 65b6d2c
tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -2642,7 +2642,8 @@ def _handle_responses(self):
2642
if request.is_finished:
2643
# Finalize any remaining logits transfers for the finished request in chunked mode
2644
if request.py_use_chunked_generation_logits and request.py_return_generation_logits:
2645
- request.py_result.transfer_remaining_device_logits()
+ with torch.inference_mode():
2646
+ request.py_result.transfer_remaining_device_logits()
2647
2648
request_done = False
2649
if request.py_decoding_iter == 1 or request.is_finished or \
0 commit comments