@@ -514,7 +514,12 @@ def unnest_sequences(self, t_nested: torch.Tensor) -> List[torch.Tensor]:
         return list(torch.split(t_squeezed, self.sequence_lengths))

     @nvtx_range("ad_update_pos")
-    def update_pos(self, seq_len: Union[torch.Tensor, List[int], int], reset: bool = False) -> None:
+    def update_pos(
+        self,
+        seq_len: Union[torch.Tensor, List[int], int],
+        reset: bool = False,
+        update_position_ids: bool = True,
+    ) -> None:
         """Update the starting position for each sequence in the cache.

         If ``reset=True``, ``input_pos`` will be reset to zero before updating.
@@ -528,8 +533,9 @@ def update_pos(self, seq_len: Union[torch.Tensor, List[int], int], reset: bool =
         else:
             self.input_pos_host[:bs] += seq_len.to(self.device)
Copilot AI Aug 1, 2025

This operation moves seq_len to the device before adding it to the host tensor, which defeats the purpose of keeping the calculation on the host. Consider converting seq_len to CPU first: self.input_pos_host[:bs] += seq_len.cpu()

Suggested change
-            self.input_pos_host[:bs] += seq_len.to(self.device)
+            self.input_pos_host[:bs] += seq_len.cpu()
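As a standalone illustration of the point above (not part of the PR; the shapes and values are made up, only the names mirror the diff), keeping the accumulation on host and doing a single async copy to device looks roughly like this:

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
bs = 4
input_pos_host = torch.zeros(bs, dtype=torch.int64)            # host-side bookkeeping
input_pos = torch.zeros(bs, dtype=torch.int64, device=device)  # device-side copy

seq_len = torch.tensor([3, 5, 2, 7], device=device)

# .cpu() keeps the addition a host-side op instead of promoting it to a device op
input_pos_host[:bs] += seq_len.cpu()

# one explicit async host->device transfer, as in the surrounding code
input_pos[:bs].copy_(input_pos_host[:bs], non_blocking=True)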

-        # update position_ids
-        self._update_position_ids()
+        # In ad_executor context, this is done later in nest_sequences, so no need to do it here
+        if update_position_ids:
+            self._update_position_ids()
         self.input_pos[:bs].copy_(self.input_pos_host[:bs], non_blocking=True)

     @nvtx_range("ad_assign_cache_loc")
tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py (3 changes: 2 additions & 1 deletion)
@@ -209,7 +209,8 @@ def _prepare_inputs(

         # update the sequence info object now
         si = self.cache_seq_interface.info
-        si.update_pos(input_pos, reset=True)
+        # skip calling _update_position_ids() here, as it will be called in nest_sequences
+        si.update_pos(input_pos, reset=True, update_position_ids=False)
Author

maybe it's better to not call update_pos here at all and introduce a different method that does what update_pos(update_position_ids=False) does? As-is, it is a bit confusing to call update_pos without updating positions.
@galagam

Updating the position ids requires both the input positions and the sequence lengths, so it makes sense to update them whenever either is updated, but doing so is a bit wasteful.
A possible alternative would be to require the user to call it explicitly. That is:

si.update_input_pos()  # renamed update_pos
si.nest_sequences()
si.update_position_ids()

In any case, due to my recent changes, the runtime of update_position_ids decreased by 30x, so it's not as critical to add this specific optimization as I initially believed. I'll run a more exhaustive check and consider keeping this optimization out of this PR for code simplicity.
@suyoggupta
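For context, here is a minimal sketch (a hypothetical helper, not the actual _update_position_ids implementation) of why both the input positions and the sequence lengths are needed to build the flattened position ids:

from typing import List

import torch

def compute_position_ids(input_pos: List[int], sequence_lengths: List[int]) -> torch.Tensor:
    # each sequence contributes positions [start, start + length)
    per_seq = [
        torch.arange(start, start + length)
        for start, length in zip(input_pos, sequence_lengths)
    ]
    return torch.cat(per_seq)

print(compute_position_ids([0, 4], [3, 2]))  # tensor([0, 1, 2, 4, 5])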

         si.assign_cache_loc(page_assignments)
         si.nest_sequences(input_ids)
