Add missing stream sync (execution_stream.wait_stream on the current stream) before the warmup and forward calls.

SimengLiu-nv · SimengLiu-nv · commit df67e1111dbc · 2025-12-23T10:43:22.000-08:00
Signed-off-by: SimengLiu-nv <simengl@nvidia.com>
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -261,6 +261,7 @@ def __init__(self,
         # KVCacheTransferManager's onboard/offload operations.
         self.is_warmup = True
 
+        self.execution_stream.wait_stream(torch.cuda.current_stream())
         with torch.cuda.stream(self.execution_stream):
             self.model_engine.warmup(self.resource_manager)
             if self.draft_model_engine is not None:
@@ -2236,6 +2237,7 @@ def forward(scheduled_requests, resource_manager, new_tensors_device,
 
             # Run model forward on the execution stream for proper synchronization
             # with KVCacheTransferManager's onboard/offload operations.
+            self.execution_stream.wait_stream(torch.cuda.current_stream())
             with torch.cuda.stream(self.execution_stream):
                 outputs = forward(scheduled_requests, self.resource_manager,
                                   new_tensors_device, gather_context_logits,