Commit f8ff684

attention i/f providing device and host argument
Signed-off-by: Lucas Liebenwein <[email protected]>
1 parent aaa87ab commit f8ff684

23 files changed: +149 −204 lines changed

tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py

Lines changed: 10 additions & 72 deletions
@@ -496,16 +496,17 @@ def __init__(
         # Create the InputBuffer that manages contiguous host and device memory
         # Starts on default device; use to() to move to target device
         self._input_buffer = InputBuffer(tensor_specs)
+        self._available_args = set(self._input_buffer.tensor_names) | {
+            f"{name}_host" for name in self._input_buffer.tensor_names
+        }

         # Initialize args_list from tensor specs
         self._args_list: Dict[str, List[int]] = {
             name: [0] * numel for name, numel, _ in tensor_specs
         }

         self._active_args = ("input_ids", "position_ids")
-        self._shapeable_args = ("input_ids", "position_ids")
-        # Args that should be returned from host (pinned memory) instead of device in _named_args
-        self._host_return_args = ("batch_info", "logits_gather_info")
+        self._shapeable_args = ("input_ids", "position_ids", "input_ids_host", "position_ids_host")
         ############################################################################################

         # EXTRA TENSOR FIELDS ######################################################################
@@ -543,14 +544,13 @@ def _shape_for_forward(self, tnsr: torch.Tensor) -> torch.Tensor:

     def _get_arg(self, name: str) -> torch.Tensor:
         """Get the argument from the input buffer either on device or host."""
-        if name in self._host_return_args:
-            arg = self._input_buffer.get_host_view(name)
+        if name.endswith("_host"):
+            arg = self._input_buffer.get_host_view(name.replace("_host", ""))
         else:
             arg = self._input_buffer.get_view(name)
         return self._shape_for_forward(arg) if name in self._shapeable_args else arg

     def _named_args(self, include_extra_args: bool = True) -> Dict[str, torch.Tensor]:
-        # Build args dict, using host views for _host_return_args, device views otherwise
         args = {k: self._get_arg(k) for k in self._active_args}

         # check other args to include
@@ -562,7 +562,7 @@ def _named_args(self, include_extra_args: bool = True) -> Dict[str, torch.Tensor
     @property
     def available_args(self) -> Set[str]:
         """Return a list of available arguments."""
-        return set(self._input_buffer.tensor_names)
+        return self._available_args

     @property
     def named_args(self) -> Dict[str, torch.Tensor]:
@@ -682,68 +682,6 @@ def _get_cache_locations_and_pages_per_sequence(
         pages_per_seq = [len(p) for p in page_assignments]
         return cache_loc_flat, pages_per_seq

-    # TODO: remove after updating all cached backends
-    @classmethod
-    def _get_sanitized_seq_len(
-        cls, input_or_position_ids: torch.Tensor, seq_len: torch.Tensor
-    ) -> torch.Tensor:
-        """Sanitize sequence lengths.
-
-        We want to cover the following scenarios with this function:
-
-        1. Pre-fill:
-            input_ids: [1, s_total, ...]
-            seq_len: [s_0, s_1, ..., s_{b-1}, 0, 0, ..., 0]
-            ---> returns [s_0, s_1, ..., s_{b-1}]
-        2. Decode:
-            input_ids: [b, 1, ...]
-            seq_len: [1, 1, ..., 1, 0, 0, ..., ..., ..., ..., 0]
-                     |---- b ----|--- (max_batch_size - b) ---|
-            --> returns [1,] * b
-        3. Decode in Cudagraph:
-            input_ids: [b_cudagraph, 1, ...]
-            seq_len: [1, 1, ..., 1, 0, 0, ..., ..., ..., ..., 0]
-                     |---- b ----|--- (max_batch_size - b) ---|
-
-            --> returns [1,] * b_cudagraph
-        Here b <= b_cudagraph. We want to make sure that the seq_len is one-padded to
-        b_cudagraph.
-
-        # TODO: I could see one possible issue with this approach in the future.
-        # If we have b < b_cudagraph we now one-pad. However, we don't pad the cache location
-        # information. What could happen is that for the padded sequences the cache location
-        # tensors point to allocated pages. This could lead to a situation where we write into
-        # allocated cache pages polluting the cache of other sequences. Now this is not an issue
-        # if we write the dummy sequences into unallocated cache pages... One fix could be to
-        # pad not only the seq len but also pad the cache locations by just repeating the last
-        # valid cache location in the batch. This would ensure that the dummy sequences just
-        # repeat valid computation...
-        """
-        _, s = input_or_position_ids.shape[:2]
-        num_seq = cls._get_sanitized_num_sequences(input_or_position_ids, seq_len)
-        if s > 1:
-            return seq_len[:num_seq].clone()
-        else:
-            return torch.ones(num_seq, dtype=seq_len.dtype, device=seq_len.device)
-
-    @staticmethod
-    def _get_sanitized_num_sequences(
-        input_or_position_ids: torch.Tensor, seq_len: torch.Tensor
-    ) -> int:
-        """Get number of sequences.
-
-        We make sure that this function is compatible with both torch graph capture and cudagraph.
-        Both can be a bit temperamental when trying to extract the number of sequences from a tensor
-        with max_batch_size or max_batch_size*max_seq_len.
-        """
-        b, s = input_or_position_ids.shape[:2]
-        if s > 1:
-            num_seq = torch.sum(seq_len > 0)
-            assert seq_len[num_seq:].sum() == 0, "seq_len should be zero-padded"
-        else:
-            num_seq = b
-        return num_seq
-
     def activate_arg(self, arg_name: str) -> bool:
         """Activate a desired argument.

@@ -854,7 +792,7 @@ def _store_arg(
             self._args_list[name] = tnsr_like.copy()

         # Only store to buffer when the argument is active or force_copy is True
-        if not (name in self._active_args or force_copy):
+        if not (name in self._active_args or f"{name}_host" in self._active_args or force_copy):
             return

         # Store to the InputBuffer's pinned host memory
@@ -1075,12 +1013,12 @@ def rescatter_input_ids(self, ungathered_input_ids: torch.Tensor):
     def maybe_gather_and_squeeze_logits(self, logits: torch.Tensor) -> torch.Tensor:
         """Maybe gather the logits if logits have not been gathered yet."""
         num_tokens = logits.shape[0] * logits.shape[1]
-        num_tokens_to_gather, gather_required = self._get_arg("logits_gather_info").tolist()
+        num_tokens_to_gather, gather_required = self._get_arg("logits_gather_info_host").tolist()
        if gather_required and num_tokens_to_gather < num_tokens:
             logits = torch.ops.auto_deploy.gather_logits_before_lm_head(
                 logits,
                 self._get_arg("logits_gather_indices"),
-                self._get_arg("logits_gather_info"),
+                self._get_arg("logits_gather_info_host"),
             )
         return logits.squeeze(int(self.is_generate))

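To make the new convention concrete, here is a minimal, self-contained sketch of the naming scheme introduced above. The _ToyInputBuffer class and the standalone get_arg helper are illustrative stand-ins, not code from this commit; only the suffix rule mirrors the updated _get_arg and available_args: a trailing "_host" selects the pinned host view of a buffer tensor, while the plain name returns the device view, and both names are advertised.

import torch


class _ToyInputBuffer:
    """Hypothetical stand-in for InputBuffer: a pinned host tensor per name plus a device copy."""

    def __init__(self, tensor_specs):
        use_cuda = torch.cuda.is_available()
        self._host = {
            name: torch.zeros(numel, dtype=dtype, pin_memory=use_cuda)
            for name, numel, dtype in tensor_specs
        }
        device = "cuda" if use_cuda else "cpu"
        self._device = {name: t.to(device) for name, t in self._host.items()}

    @property
    def tensor_names(self):
        return list(self._host)

    def get_host_view(self, name):
        return self._host[name]

    def get_view(self, name):
        return self._device[name]


def get_arg(buffer: _ToyInputBuffer, name: str) -> torch.Tensor:
    # Mirrors the updated _get_arg: "_host" suffix -> pinned host view, otherwise device view.
    if name.endswith("_host"):
        return buffer.get_host_view(name.replace("_host", ""))
    return buffer.get_view(name)


buf = _ToyInputBuffer([("batch_info", 3, torch.int32), ("input_ids", 8, torch.long)])
available_args = set(buf.tensor_names) | {f"{n}_host" for n in buf.tensor_names}
print(sorted(available_args))  # both plain and "_host" names are advertised
print(get_arg(buf, "batch_info").device, get_arg(buf, "batch_info_host").device)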

tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_backend_delta.py

Lines changed: 4 additions & 4 deletions
@@ -35,7 +35,7 @@ def fla_cached_delta_rule(
     v: torch.Tensor,
     beta: torch.Tensor,
     # STANDARD METADATA
-    batch_info: torch.Tensor,
+    batch_info_host: torch.Tensor,
     cu_seqlen: torch.Tensor,
     slot_idx: torch.Tensor,
     use_initial_states: torch.Tensor,
@@ -58,7 +58,7 @@ def fla_cached_delta_rule(
     y = torch.empty_like(v, memory_format=torch.contiguous_format)
     y_flat = y.view(b * s, num_heads, -1)

-    num_prefill, num_prefill_tokens, num_decode = batch_info.tolist()
+    num_prefill, num_prefill_tokens, num_decode = batch_info_host.tolist()
     num_seq = num_prefill + num_decode

     # clean up metadata
@@ -120,7 +120,7 @@ def fla_cached_delta_rule_fake(
     v: torch.Tensor,
     beta: torch.Tensor,
     # STANDARD METADATA
-    batch_info: torch.Tensor,
+    batch_info_host: torch.Tensor,
     cu_seqlen: torch.Tensor,
     slot_idx: torch.Tensor,
     use_initial_states: torch.Tensor,
@@ -160,7 +160,7 @@ def get_cached_attention_op(cls) -> MHACallable:

     @classmethod
     def get_standard_metadata_args(cls) -> List[str]:
-        return ["batch_info", "cu_seqlen", "slot_idx", "use_initial_states"]
+        return ["batch_info_host", "cu_seqlen", "slot_idx", "use_initial_states"]

     @classmethod
     def get_cache_initializers(
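The reason backends such as this one now take batch_info_host is that they immediately unpack the values into Python ints via .tolist(). A rough sketch of that cost difference follows, under the assumption that batch_info holds (num_prefill, num_prefill_tokens, num_decode); it is illustrative only and not part of the diff.

import torch

# .tolist() on a pinned CPU tensor is a plain host memory read...
use_cuda = torch.cuda.is_available()
batch_info_host = torch.tensor([2, 10, 3], dtype=torch.int32, pin_memory=use_cuda)
num_prefill, num_prefill_tokens, num_decode = batch_info_host.tolist()  # no GPU sync

# ...whereas the same call on a CUDA tensor forces a device-to-host copy and
# synchronizes the CPU with the GPU stream before the Python ints are available.
if use_cuda:
    batch_info_device = batch_info_host.to("cuda", non_blocking=True)
    num_prefill, num_prefill_tokens, num_decode = batch_info_device.tolist()  # implicit sync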

tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py

Lines changed: 7 additions & 7 deletions
@@ -157,7 +157,7 @@ def _plan_decode(wrapper: flashinfer.BatchDecodeWithPagedKVCacheWrapper):
 @torch.library.custom_op("auto_deploy::flashinfer_attention_prepare_metadata", mutates_args=())
 def prepare_flashinfer_metadata(
     position_ids: torch.Tensor,
-    batch_info: torch.Tensor,
+    batch_info_host: torch.Tensor,
     cu_seqlen: torch.Tensor,
     seq_len_with_cache: torch.Tensor,
 ) -> List[torch.Tensor]:
@@ -171,7 +171,7 @@ def prepare_flashinfer_metadata(
     _GlobalFlashInferPlanner.reset()

     # retrieve host-side metadata
-    num_prefill, num_prefill_tokens, num_decode = batch_info.tolist()
+    num_prefill, num_prefill_tokens, num_decode = batch_info_host.tolist()
     num_seq = num_prefill + num_decode
     num_tokens = num_prefill_tokens + num_decode

@@ -192,7 +192,7 @@
 @prepare_flashinfer_metadata.register_fake
 def prepare_flashinfer_metadata_fake(
     position_ids: torch.Tensor,
-    batch_info: torch.Tensor,
+    batch_info_host: torch.Tensor,
     cu_seqlen: torch.Tensor,
     seq_len_with_cache: torch.Tensor,
 ):
@@ -210,7 +210,7 @@ def flashinfer_mha_with_cache(
     k: torch.Tensor,
     v: torch.Tensor,
     # STANDARD METADATA
-    batch_info: torch.Tensor,
+    batch_info_host: torch.Tensor,
     cu_seqlen: torch.Tensor,
     cu_num_pages: torch.Tensor,
     cache_loc: torch.Tensor,
@@ -238,7 +238,7 @@ def flashinfer_mha_with_cache(
     v = v.reshape(b * s, -1, head_dim)

     # convert to flashinfer-style metadata
-    num_prefill, num_prefill_tokens, num_decode = batch_info.tolist()
+    num_prefill, num_prefill_tokens, num_decode = batch_info_host.tolist()
     num_seq = num_prefill + num_decode

     qo_indptr = cu_seqlen[: num_seq + 1]
@@ -305,7 +305,7 @@ def flashinfer_mha_with_cache_fake(
     k: torch.Tensor,
     v: torch.Tensor,
     # STANDARD METADATA
-    batch_info: torch.Tensor,
+    batch_info_host: torch.Tensor,
     cu_seqlen: torch.Tensor,
     cu_num_pages: torch.Tensor,
     cache_loc: torch.Tensor,
@@ -358,7 +358,7 @@ def get_cached_attention_op(cls) -> MHACallable:

     @classmethod
     def get_standard_metadata_args(cls) -> List[str]:
-        return ["batch_info", "cu_seqlen", "cu_num_pages", "cache_loc", "last_page_len"]
+        return ["batch_info_host", "cu_seqlen", "cu_num_pages", "cache_loc", "last_page_len"]

     @classmethod
     def get_prepare_extra_metadata_info(
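Putting the two sides together: a backend declares which standard metadata names it wants, now including "_host" variants, and the interface can check that request against available_args, which advertises both the device and the host name for every buffer tensor. The validate_standard_metadata helper below is hypothetical glue code for illustration; only the requested names and the available_args behavior come from this commit.

from typing import List, Set


def validate_standard_metadata(requested: List[str], available_args: Set[str]) -> None:
    # Hypothetical check: every requested metadata name must be advertised by the interface.
    missing = [name for name in requested if name not in available_args]
    if missing:
        raise ValueError(f"Backend requested unavailable metadata args: {missing}")


# The FlashInfer backend's declaration from this commit:
requested = ["batch_info_host", "cu_seqlen", "cu_num_pages", "cache_loc", "last_page_len"]

# available_args now contains a "<name>_host" entry for every buffer tensor (subset shown):
available = {
    "batch_info", "batch_info_host",
    "cu_seqlen", "cu_seqlen_host",
    "cu_num_pages", "cu_num_pages_host",
    "cache_loc", "cache_loc_host",
    "last_page_len", "last_page_len_host",
}
validate_standard_metadata(requested, available)  # passes: all names are advertised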

tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/cuda_backend_causal_conv.py

Lines changed: 4 additions & 4 deletions
@@ -53,7 +53,7 @@ def _cuda_cached_causal_conv1d(
     weight: torch.Tensor,  # [c_out, c_in/groups, k] but we expect depthwise use: [c_in, k]
     bias: Optional[torch.Tensor],
     # STANDARD METADATA
-    batch_info: torch.Tensor,
+    batch_info_host: torch.Tensor,
     cu_seqlen: torch.Tensor,
     slot_idx: torch.Tensor,
     use_initial_states: torch.Tensor,
@@ -80,7 +80,7 @@ def _cuda_cached_causal_conv1d(
     """
     b, s = input.shape[:2]

-    num_prefill, num_prefill_tokens, num_decode = batch_info.tolist()
+    num_prefill, num_prefill_tokens, num_decode = batch_info_host.tolist()
     num_seq = num_prefill + num_decode
     num_total_tokens = num_prefill_tokens + num_decode

@@ -138,7 +138,7 @@ def _cuda_cached_causal_conv1d_fake(
     weight: torch.Tensor,  # [c_out, c_in/groups, k] but we expect depthwise use: [c_in, k]
     bias: Optional[torch.Tensor],
     # STANDARD METADATA
-    batch_info: torch.Tensor,
+    batch_info_host: torch.Tensor,
     cu_seqlen: torch.Tensor,
     slot_idx: torch.Tensor,
     use_initial_states: torch.Tensor,
@@ -189,7 +189,7 @@ def get_cached_attention_op(cls) -> MHACallable:

     @classmethod
     def get_standard_metadata_args(cls) -> List[str]:
-        return ["batch_info", "cu_seqlen", "slot_idx", "use_initial_states"]
+        return ["batch_info_host", "cu_seqlen", "slot_idx", "use_initial_states"]

     @classmethod
     def get_cache_initializers(

tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_causal_conv.py

Lines changed: 4 additions & 4 deletions
@@ -147,7 +147,7 @@ def _torch_cached_causal_conv1d(
     weight: torch.Tensor,  # [c_out, c_in/groups, k]
     bias: Optional[torch.Tensor],
     # STANDARD METADATA
-    batch_info: torch.Tensor,
+    batch_info_host: torch.Tensor,
     seq_len: torch.Tensor,
     cu_seqlen: torch.Tensor,
     slot_idx: torch.Tensor,
@@ -174,7 +174,7 @@ def _torch_cached_causal_conv1d(
     num_seq = seq_len.shape[0]

     # get cleaned up metadata
-    num_prefill, num_prefill_tokens, num_decode = batch_info.tolist()
+    num_prefill, num_prefill_tokens, num_decode = batch_info_host.tolist()
     num_seq = num_prefill + num_decode
     seq_len = seq_len[:num_seq]
     seq_start = cu_seqlen[:num_seq]
@@ -247,7 +247,7 @@ def _torch_cached_causal_conv1d_fake(
     weight: torch.Tensor,  # [c_out, c_in/groups, k]
     bias: Optional[torch.Tensor],
     # STANDARD METADATA
-    batch_info: torch.Tensor,
+    batch_info_host: torch.Tensor,
     seq_len: torch.Tensor,
     cu_seqlen: torch.Tensor,
     slot_idx: torch.Tensor,
@@ -296,7 +296,7 @@ def get_cached_attention_op(cls) -> MHACallable:

     @classmethod
     def get_standard_metadata_args(cls) -> List[str]:
-        return ["batch_info", "seq_len", "cu_seqlen", "slot_idx", "use_initial_states"]
+        return ["batch_info_host", "seq_len", "cu_seqlen", "slot_idx", "use_initial_states"]

     @classmethod
     def get_cache_initializers(

tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_mamba.py

Lines changed: 4 additions & 4 deletions
@@ -121,7 +121,7 @@ def _torch_cached_ssm(
     dt: torch.Tensor,  # [b, s, num_heads]
     dt_bias: torch.Tensor,  # [num_heads]
     # STANDARD METADATA
-    batch_info: torch.Tensor,
+    batch_info_host: torch.Tensor,
     seq_len: torch.Tensor,
     cu_seqlen: torch.Tensor,
     slot_idx: torch.Tensor,
@@ -145,7 +145,7 @@ def _torch_cached_ssm(
     num_seq = seq_len.shape[0]

     # get cleaned up metadata
-    num_prefill, num_prefill_tokens, num_decode = batch_info.tolist()
+    num_prefill, num_prefill_tokens, num_decode = batch_info_host.tolist()
     num_seq = num_prefill + num_decode
     seq_len = seq_len[:num_seq]
     seq_start = cu_seqlen[:num_seq]
@@ -246,7 +246,7 @@ def _torch_cached_ssm_fake(
     dt: torch.Tensor,  # [b, s, num_heads]
     dt_bias: torch.Tensor,  # [num_heads]
     # STANDARD METADATA
-    batch_info: torch.Tensor,
+    batch_info_host: torch.Tensor,
     seq_len: torch.Tensor,
     cu_seqlen: torch.Tensor,
     slot_idx: torch.Tensor,
@@ -293,7 +293,7 @@ def get_cached_attention_op(cls) -> MHACallable:

     @classmethod
     def get_standard_metadata_args(cls) -> List[str]:
-        return ["batch_info", "seq_len", "cu_seqlen", "slot_idx", "use_initial_states"]
+        return ["batch_info_host", "seq_len", "cu_seqlen", "slot_idx", "use_initial_states"]

     @classmethod
     def get_cache_initializers(
