Commit dcfd3ef

Authored by: lucaslie, 2ez4bz, nvchenghaoz, Fridah-nv, suyoggupta
[#4593][feat] AutoDeploy: Linear Attention Support (SSM + causal_conv + Bamba + Nemotron-H) (#8068)
Signed-off-by: William Zhang <[email protected]>
Signed-off-by: Lucas Liebenwein <[email protected]>
Signed-off-by: Chenghao Zhang <[email protected]>
Signed-off-by: Frida Hou <[email protected]>
Signed-off-by: Suyog Gupta <[email protected]>
Co-authored-by: William Zhang <[email protected]>
Co-authored-by: Chenghao Zhang <[email protected]>
Co-authored-by: Frida Hou <[email protected]>
Co-authored-by: Suyog Gupta <[email protected]>
1 parent 62010c0 commit dcfd3ef

34 files changed: +3094, -39 lines

tensorrt_llm/_torch/auto_deploy/config/default.yaml

Lines changed: 6 additions & 0 deletions
```diff
@@ -127,6 +127,12 @@ transforms:
   insert_cached_mla_attention:
     stage: cache_init
     attn_backend: MultiHeadLatentAttention
+  insert_cached_ssm_attention:
+    stage: cache_init
+    attn_backend: triton_ssm
+  insert_cached_causal_conv:
+    stage: cache_init
+    attn_backend: cuda_causal_conv
   initialize_cache:
     stage: cache_init
   resize_kv_cache:
```
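
The two new entries wire the SSM and causal-conv cache transforms to their respective backends at the `cache_init` stage. A minimal sketch (assuming PyYAML is available; the fragment mirrors only the hunk above, not the full default.yaml) of what the parsed entries look like:

```python
# Minimal sketch: parse a config fragment shaped like the hunk above and list
# which cache_init transforms request which attention backend.
import yaml

fragment = """
transforms:
  insert_cached_ssm_attention:
    stage: cache_init
    attn_backend: triton_ssm
  insert_cached_causal_conv:
    stage: cache_init
    attn_backend: cuda_causal_conv
"""

cfg = yaml.safe_load(fragment)
for name, opts in cfg["transforms"].items():
    if opts.get("stage") == "cache_init":
        print(f"{name}: backend={opts.get('attn_backend')}")
```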
Lines changed: 8 additions & 18 deletions

```diff
@@ -1,20 +1,10 @@
 """Custom ops and make sure they are all registered."""
 
-from ._triton_attention_internal import *
-from .dist import *
-from .flashinfer_attention import *
-from .flashinfer_rope import *
-from .linear import *
-from .mla import *
-from .mxfp4_moe import *
-from .quant import *
-from .rms_norm import *
-from .torch_attention import *
-from .torch_backend_attention import *
-from .torch_moe import *
-from .torch_quant import *
-from .torch_rope import *
-from .torch_router import *
-from .triton_attention import *
-from .triton_rope import *
-from .trtllm_moe import *
+import importlib
+import pkgutil
+
+__all__ = []
+
+for _, module_name, is_pkg in pkgutil.iter_modules(__path__):
+    __all__.append(module_name)
+    importlib.import_module(f"{__name__}.{module_name}")
```
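
The rewritten package init replaces the hand-maintained import list with auto-discovery, so any op module dropped into the package (such as the new SSM and causal-conv ops) is imported and registered automatically. A self-contained sketch of the same stdlib idiom, run against the `json` package purely as a stand-in:

```python
# Standalone sketch of the auto-discovery idiom above: import every top-level module
# in a package for its side effects (here, registration of custom ops).
import importlib
import pkgutil

import json  # any package works; json is only a stand-in for the custom_ops package

discovered = []
for _, module_name, _is_pkg in pkgutil.iter_modules(json.__path__):
    discovered.append(module_name)
    importlib.import_module(f"{json.__name__}.{module_name}")

print(discovered)  # e.g. ['decoder', 'encoder', 'scanner', 'tool']
```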

tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py

Lines changed: 23 additions & 3 deletions
```diff
@@ -72,6 +72,8 @@ class SequenceInfo:
     - pages_per_seq: [ps_0, ps_1, ..., ps_{b-1}] where ps_i is the number of pages allocated for
       sequence i. Note that, for example, cache_loc[p_0:p_1] will correspond to the pages associated
       with sequence 1 in the batch.
+    - slot_idx: [s_0, s_1, ..., s_{b-1}]
+      Corresponds to the slot index of each sequence in the batch.
 
     ################################################################################################
```
```diff
@@ -134,7 +136,8 @@ def __init__(
         self._num_pages = max(
             self.max_batch_size,
             (self.max_num_tokens) // self.page_size  # floored number of pages
-            + (self.max_num_tokens % self.page_size > 0) * self.max_batch_size,  # +1 per sequence
+            + (self.max_num_tokens / self.max_batch_size % self.page_size > 0)  # check for overflow
+            * self.max_batch_size,  # +1 page per sequence if overflow is required
         )
         # sanity check
         assert self.num_pages >= self.max_batch_size, "num_pages can't be less than max_batch_size"
```
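
The revised formula now checks for per-sequence overflow against tokens-per-sequence rather than against the total token count. A rough sanity check of the arithmetic (the numbers are illustrative, not taken from the PR):

```python
# Rough sanity check of the revised page-count formula above (values are made up).
def num_pages(max_num_tokens: int, max_batch_size: int, page_size: int) -> int:
    return max(
        max_batch_size,
        max_num_tokens // page_size  # floored number of pages
        + (max_num_tokens / max_batch_size % page_size > 0)  # per-sequence overflow?
        * max_batch_size,  # +1 page per sequence if overflow is required
    )

print(num_pages(8192, 16, 64))  # 128: 512 tokens/seq fills pages exactly, no extra pages
print(num_pages(1000, 8, 64))   # 23: 125 tokens/seq leaves a partial page, +1 page per sequence
```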
```diff
@@ -164,6 +167,7 @@ def __init__(
             "input_pos": torch.empty(self.max_batch_size, dtype=torch.int),
             "cache_loc": torch.empty(self.num_pages, dtype=torch.int),
             "pages_per_seq": torch.empty(self.max_batch_size, dtype=torch.int),
+            "slot_idx": torch.empty(self.max_batch_size, dtype=torch.int),
             # OTHER FIELDS WHERE WE NEED EFFICIENT HOST<>DEVICE TRANSFER
             "_gather_idx": torch.empty(self.max_num_tokens, dtype=torch.int),
         }
```
```diff
@@ -172,7 +176,8 @@ def __init__(
         }
         # NOTE: order of keys is relevant here!
         self._uncached_arg_names = ("input_ids", "position_ids")
-        self._cached_arg_names = ("seq_len", "input_pos", "cache_loc", "pages_per_seq")
+        self._cached_arg_names = ("seq_len", "input_pos", "cache_loc", "pages_per_seq", "slot_idx")
+        self._cached_constants = ("page_size",)
         ############################################################################################
 
         # EXTRA TENSOR FIELDS ######################################################################
```
```diff
@@ -296,7 +301,7 @@ def const_args_for_prepare_metadata(self) -> Tuple:
         ``insert_cached_attention`` to extract the constant arguments and add them to the
         ``prepare_metadata`` node/op.
         """
-        return (self.page_size,)
+        return tuple(getattr(self, k) for k in self._cached_constants)
 
     @property
     def named_dynamic_shapes(self) -> Dict[str, Dict[str, Dim]]:
```
```diff
@@ -311,6 +316,7 @@ def named_dynamic_shapes(self) -> Dict[str, Dict[str, Dim]]:
         if self.max_batch_size > 1:
             bs_seq_len_shape[0] = Dim("batch_size", max=self.max_batch_size)
         bs_seq_len_shape[1] = Dim("seq_len", max=self.max_seq_len)
+        # bs_seq_len_shape[1] = Dim.AUTO
         self._dynamic_shapes = {k: bs_seq_len_shape for k in self._uncached_arg_names}
         # cached args are static
         self._dynamic_shapes.update({k: {} for k in self._cached_arg_names})
```
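
These `Dim` objects feed the `dynamic_shapes` mapping consumed by `torch.export`: dynamic batch/seq dims for the uncached args, empty (static) specs for the cached args. A toy, self-contained sketch of the same pattern (module, names, and sizes are illustrative, not AutoDeploy code):

```python
# Illustrative toy export: uncached args get dynamic batch/seq dims, cached args stay static.
import torch
from torch.export import Dim, export


class Toy(torch.nn.Module):
    def forward(self, input_ids: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor:
        # seq_len is only reduced, so its static shape places no constraint on input_ids
        return input_ids * 2 + seq_len.sum()


bs_seq_len_shape = {0: Dim("batch_size", max=8), 1: Dim("seq_len", max=128)}
dynamic_shapes = {"input_ids": bs_seq_len_shape, "seq_len": {}}  # cached args are static

ep = export(
    Toy(),
    (torch.zeros(2, 16, dtype=torch.long), torch.zeros(4, dtype=torch.long)),
    dynamic_shapes=dynamic_shapes,
)
print(ep)
```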
```diff
@@ -522,11 +528,15 @@ def set_example_sequence(
         cache_loc = list(range(sum(pages_per_seq)))
         page_assignments = self._get_page_assignments(cache_loc, pages_per_seq)
 
+        # vanilla slot indices
+        slot_idx = list(range(len(input_ids)))
+
         self.nest_sequences(
             input_ids,
             position_ids,  # will be auto-inferred if None
             input_pos=0,  # no cache history
             page_assignments=page_assignments,  # vanilla page assignments
+            slot_idx=slot_idx,  # vanilla slot indices
             **extra_args,
         )
 
```
```diff
@@ -613,6 +623,7 @@ def nest_sequences(
         position_ids: Optional[Sequence[Sequence[int]]] = None,
         input_pos: Optional[Union[Sequence[int], int]] = None,
         page_assignments: Optional[Sequence[Sequence[int]]] = None,
+        slot_idx: Optional[Sequence[int]] = None,
         **extra_args: Dict[str, Union[torch.Tensor, Sequence[torch.Tensor]]],
     ) -> None:
         """Create and store sequence information for the next forward pass.
```
```diff
@@ -622,6 +633,7 @@ def nest_sequences(
             position_ids: List of sequences of position_ids for each token.
             input_pos: Absolute starting position in the cache for each sequence.
             page_assignments: List of sequences of page assignments for each sequence.
+            slot_idx: List of slot indices for each sequence.
             extra_args: Extra arguments to be stored in the interface.
 
         This i/f will ensure that all sequence info args are updated accordingly.
```
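
For orientation, the "vanilla" defaults from `set_example_sequence` line up with the new argument as follows; the values are placeholders and the commented call only sketches intended usage:

```python
# Placeholder values mirroring the "vanilla" defaults built in set_example_sequence above.
input_ids = [[1, 2, 3], [4, 5]]                # two example sequences (token ids are made up)
pages_per_seq = [1, 1]                         # one page per sequence in this toy case
cache_loc = list(range(sum(pages_per_seq)))    # [0, 1]
slot_idx = list(range(len(input_ids)))         # [0, 1]: sequence i occupies state slot i

# With a SequenceInfo instance `seq_info`, the call would thread slot_idx through, e.g.:
# seq_info.nest_sequences(input_ids, input_pos=0, page_assignments=..., slot_idx=slot_idx)
print(cache_loc, slot_idx)
```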
```diff
@@ -648,6 +660,10 @@ def nest_sequences(
         self._store_arg("cache_loc", cache_loc, reset=True)
         self._store_arg("pages_per_seq", pages_per_seq, reset=True)
 
+        # check for updated slot_idx
+        if slot_idx is not None:
+            self._store_arg("slot_idx", slot_idx)
+
         ### UPDATE MAIN INPUTS #####################################################################
         # set new input_ids and make sure to flatten it
         self._store_arg("input_ids", self._flatten(input_ids))
```
```diff
@@ -749,6 +765,7 @@ def __call__(
         input_pos: torch.Tensor,
         cache_loc: torch.Tensor,
         pages_per_seq: torch.Tensor,
+        slot_idx: torch.Tensor,
         page_size: int,
     ) -> List[torch.Tensor]: ...
 
```
````diff
@@ -834,6 +851,9 @@ def prepare_metadata(
             seq_len: torch.Tensor,
             input_pos: torch.Tensor,
             cache_loc: torch.Tensor,
+            pages_per_seq: torch.Tensor,
+            slot_idx: torch.Tensor,
+            page_size: int,
         ) -> List[torch.Tensor]: ...
         ```
         The metadata should contain all necessary global information required for the underlying
````
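
To make the documented interface concrete, here is a hedged stub matching the argument names visible in this excerpt (the real op may take further arguments not shown here, and a real backend would compute backend-specific metadata rather than plain slices):

```python
# Hedged sketch only; not the library implementation. Assumes seq_len is zero-padded
# beyond the active batch and simply slices the host-side metadata down to that batch.
from typing import List

import torch


def prepare_metadata_sketch(
    seq_len: torch.Tensor,
    input_pos: torch.Tensor,
    cache_loc: torch.Tensor,
    pages_per_seq: torch.Tensor,
    slot_idx: torch.Tensor,
    page_size: int,
) -> List[torch.Tensor]:
    num_seq = int((seq_len > 0).sum())  # assumption: unused entries are zero-padded
    return [seq_len[:num_seq], input_pos[:num_seq], slot_idx[:num_seq]]


metadata = prepare_metadata_sketch(
    seq_len=torch.tensor([3, 2, 0, 0]),
    input_pos=torch.tensor([0, 0, 0, 0]),
    cache_loc=torch.arange(4),
    pages_per_seq=torch.tensor([1, 1, 0, 0]),
    slot_idx=torch.tensor([0, 1, 0, 0]),
    page_size=64,
)
print(metadata)  # [tensor([3, 2]), tensor([0, 0]), tensor([0, 1])]
```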
