forked from NVIDIA/TensorRT-LLM
Sg/bamba bench #139
Draft

suyoggupta wants to merge 8 commits into feat/ad_linear_attention from sg/bamba-bench.
Commits (8):

- 22ade41 Fix the bamba unit test (nvchenghaoz)
- 2344404 none: Add triton backend for ssm_transform and cuda backend for conv (nvchenghaoz)
- 1bbcf19 Fully Use the TRT LLM kernels (nvchenghaoz)
- 65083c2 Add fake version for ssm transform op (nvchenghaoz)
- 8cfb07b Fix the datatype error in fake op (nvchenghaoz)
- f6c7aec Fix the conv test error (nvchenghaoz)
- 08aada6 Fix the triton ssm error (nvchenghaoz)
- 2f7a17b WARs to get bamba + bench working (suyoggupta)
Files changed (changes from all commits):

```diff
@@ -28,6 +28,7 @@
 class CacheConfig:
     """A dataclass to hold information how to configure the cache."""

     # dtype of the cache
     dtype: Optional[torch.dtype] = None
```
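Since `CacheConfig` is a plain dataclass, configuring the cache dtype is just keyword construction. A minimal sketch, assuming `CacheConfig` has been imported from the module in this diff and using only the field visible in the hunk:

```python
import torch

# assumes: CacheConfig imported from the module shown in this diff
cache_config = CacheConfig(dtype=torch.float16)  # keep cache tensors in fp16

default_config = CacheConfig()  # the default leaves the cache dtype unset
assert default_config.dtype is None
```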
```diff
@@ -522,6 +523,7 @@ def set_example_sequence(
     # vanilla slot indices
     slot_idx = list(range(len(input_ids)))
     # breakpoint()

     self.nest_sequences(
         input_ids,
```
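The "vanilla" slot assignment above simply maps sequence `i` to cache slot `i`; a tiny illustration with made-up inputs:

```python
# three example sequences of token ids (values are illustrative)
input_ids = [[101, 102], [201], [301, 302, 303]]
slot_idx = list(range(len(input_ids)))
assert slot_idx == [0, 1, 2]  # sequence i occupies cache slot i
```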
```diff
@@ -537,6 +539,9 @@ def set_max_num_tokens_sample(self) -> None:
     # TODO (lucaslie): understand what this implies for extra arguments
     seq_len = self.max_num_tokens // self.max_batch_size
     input_ids = torch.ones(self.max_batch_size, seq_len, dtype=torch.int).tolist()
+    print(
+        f"setting max_num_tokens_sample: {self.max_num_tokens=}, {self.max_batch_size=}, {seq_len=}"
+    )
     self.set_example_sequence(input_ids)

 def set_generate_only_batch(self) -> None:
```
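To make the new debug print concrete, here is the arithmetic it reports, with assumed values standing in for the instance attributes:

```python
import torch

max_num_tokens = 8192  # assumed value of self.max_num_tokens
max_batch_size = 16    # assumed value of self.max_batch_size
seq_len = max_num_tokens // max_batch_size  # -> 512

# same construction as in the diff: a full batch of dummy token ids
input_ids = torch.ones(max_batch_size, seq_len, dtype=torch.int).tolist()
assert len(input_ids) == 16 and all(len(seq) == 512 for seq in input_ids)
```

Note that the integer division silently drops the remainder when `max_num_tokens` is not a multiple of `max_batch_size`, so the sample batch can cover slightly fewer than `max_num_tokens` tokens.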
```diff
@@ -581,6 +586,10 @@ def _store_arg(
     # pin the memory on the host
     tnsr_host = torch.tensor(tnsr_like, dtype=tnsr_device.dtype, pin_memory=True)

+    if tnsr_device.numel() < tnsr_host.numel():
+        print("WARNING: tnsr_device.numel() < tnsr_host.numel()")
+        print(f"{name=}, {tnsr_device.numel()=}, {tnsr_host.numel()=}")
+        tnsr_device.resize_(tnsr_host.numel())
     # reset/copy to the device in a non-blocking fashion
     if reset:
         tnsr_device.zero_()
```

Review comment on lines +589 to +592:

> @lucaslie: FYI, this is the WAR I had to add to get the resize functionality working again on the feature branch. Without it, llama3.1 + cache_resize is broken.
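For readers skimming the PR, here is a self-contained sketch of the pattern behind this WAR: stage the value in a pinned host tensor, grow the device buffer in place when the staged payload no longer fits (e.g. after a cache resize), then issue a non-blocking host-to-device copy. The function name and simplified signature are illustrative, not the real `_store_arg` from this file, and the sketch assumes a flat list of values plus a CUDA build (pinned memory requires one):

```python
import torch

def store_arg_sketch(
    name: str,
    tnsr_like: list,
    tnsr_device: torch.Tensor,
    reset: bool = False,
) -> None:
    """Illustration of the resize WAR; not the actual _store_arg."""
    # pin the host copy so the H2D copy below can be truly non-blocking
    tnsr_host = torch.tensor(tnsr_like, dtype=tnsr_device.dtype, pin_memory=True)

    # the WAR: grow the preallocated device buffer in place instead of
    # failing when it has become smaller than the staged payload
    if tnsr_device.numel() < tnsr_host.numel():
        print(f"WARNING: resizing device buffer, {name=}")
        tnsr_device.resize_(tnsr_host.numel())

    if reset:
        tnsr_device.zero_()  # clear stale contents before refilling
    # safe to use non_blocking=True because tnsr_host is pinned
    tnsr_device[: tnsr_host.numel()].copy_(tnsr_host, non_blocking=True)
```

One caveat: `resize_` may reallocate the underlying storage when it grows, so it is a heavyweight operation on the hot path, which is presumably why the diff logs a warning whenever this branch is hit.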
Review comment:

> curious where this is necessary?