We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 04b992b · commit 37504c8 · Copy full SHA for 37504c8
tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -2613,7 +2613,8 @@ def _handle_responses(self):
 if request.is_finished:
     # Finalize any remaining logits transfers for the finished request in chunked mode
     if request.py_use_chunked_generation_logits and request.py_return_generation_logits:
-        request.py_result.transfer_remaining_device_logits()
+        with torch.inference_mode():
+            request.py_result.transfer_remaining_device_logits()

 request_done = False
 if request.py_decoding_iter == 1 or request.is_finished or \
0 commit comments