Skip to content

Commit 01f0836

Browse files
committed
update device passing and tests
Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
1 parent f0944df commit 01f0836

File tree

16 files changed

+139
-61
lines changed

16 files changed

+139
-61
lines changed

tensorrt_llm/_torch/auto_deploy/config/transformers.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ transforms:
2020
transformers_replace_cached_attn:
2121
stage: cache_init
2222
attn_backend: flashinfer
23-
expected_layout: bsnd
2423
initialize_cache:
2524
stage: cache_init
2625
resize_kv_cache:

tensorrt_llm/_torch/auto_deploy/llm_args.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -75,12 +75,6 @@ class AutoDeployConfig(DynamicYamlMixInForSettings, BaseSettings):
7575
"If True, only the model architecture is loaded.",
7676
)
7777

78-
# checkpoint_device: Optional[str] = Field(
79-
# default=None,
80-
# description="Device on which to load the model checkpoint. "
81-
# "Defaults to the same device as the rest of the pipeline.",
82-
# )
83-
8478
tokenizer: Optional[PathLike] = Field(
8579
description="The tokenizer",
8680
default=None,
@@ -169,6 +163,12 @@ def update_attn_page_size(self):
169163
"torch",
170164
]:
171165
self.attn_page_size = self.max_seq_len
166+
# NOTE: (hg) For transformers mode. This is ugly.
167+
if self.transforms.get("transformers_replace_cached_attn", {}).get("attn_backend") in [
168+
"triton",
169+
"torch",
170+
]:
171+
self.attn_page_size = self.max_seq_len
172172
return self
173173

174174
@field_validator("model_factory", mode="after")

tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,9 +114,7 @@ def build_from_config(cls, ad_config: AutoDeployConfig):
114114
# ADEngine.__init__, and ADEngine.build_from_config. Seems a bit unnatural atm.
115115

116116
# construct inference optimizer
117-
build_and_optimize = InferenceOptimizer(
118-
factory=factory, config=ad_config.transforms, local_device=device
119-
)
117+
build_and_optimize = InferenceOptimizer(factory=factory, config=ad_config.transforms)
120118

121119
# construct engine
122120
return cls(build_and_optimize, seq_info, device, max_beam_width)

tensorrt_llm/_torch/auto_deploy/transform/interface.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ class SharedConfig(BaseModel):
5555
sharding_config: ShardingConfig = Field(default_factory=ShardingConfig)
5656
local_rank: int = Field(default=0)
5757
world_size: int = Field(default=1)
58-
local_device: str = Field(description="Current rank device.")
58+
# local_device: str = Field(description="Current rank device.")
5959

6060

6161
class TransformConfig(BaseModel):

tensorrt_llm/_torch/auto_deploy/transform/library/attention.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""Pattern matching for detecting repeat_kv, eager, grouped attention patterns from Huggingface models."""
22

3-
from typing import Any, Callable, Dict, List, Literal, Tuple, Type
3+
from typing import Any, Callable, Dict, List, Tuple, Type
44

55
import torch
66
import torch.nn.functional as F
@@ -496,9 +496,7 @@ def register_grouped_attention(patterns: ADPatternMatcherPass):
496496
class MatchAttentionLayoutConfig(TransformConfig):
497497
"""Configuration for the match attention layout transform."""
498498

499-
attn_backend: Literal["flashinfer", "triton", "torch"] = Field(
500-
description="Attention backend to use."
501-
)
499+
attn_backend: str = Field(description="Attention backend to use.")
502500

503501

504502
@TransformRegistry.register("match_attention_layout")

tensorrt_llm/_torch/auto_deploy/transform/library/build_model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def _apply(
8585
assert isinstance(factory, hf.AutoModelFactory), "Only HF models are supported."
8686

8787
# build and load the model
88-
model = factory.build_and_load_model(shared_config.local_device)
88+
model = factory.build_and_load_model(cm.device)
8989

9090
assert not self.config.use_strict_forward, "Only regular forward is supported."
9191

tensorrt_llm/_torch/auto_deploy/transform/library/kvcache.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ class ResizeKVCacheConfig(TransformConfig):
224224
"""Configuration for the resize kv cache transform."""
225225

226226
free_mem_ratio: float = Field(
227-
description="The fraction of available memory to occupy.", default=0.8
227+
default=0.8, ge=0.0, le=1.0, description="The fraction of available memory to occupy."
228228
)
229229
args_only: bool = Field(
230230
description="Use ``*cm.args`` (default) or use ``**cm.named_args`` for the forward pass.",

tensorrt_llm/_torch/auto_deploy/transform/library/load_weights.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@ def _apply(
4545
) -> Tuple[GraphModule, TransformInfo]:
4646
factory.load_or_random_init(
4747
gm,
48-
device=self.config.checkpoint_device or shared_config.local_device,
48+
device=self.config.checkpoint_device or cm.device,
4949
)
50-
move_to_device(gm, shared_config.local_device)
50+
move_to_device(gm, cm.device)
5151

5252
info = TransformInfo(skipped=False, num_matches=0, is_clean=True, has_valid_shapes=True)
5353

@@ -65,7 +65,9 @@ def _apply(
6565
factory: ModelFactory,
6666
shared_config: SharedConfig,
6767
) -> Tuple[GraphModule, TransformInfo]:
68-
cm.to(shared_config.local_device)
68+
# TODO (hg) This is weird but equivalent to the previous code.
69+
# We do not seem to need this transform.
70+
cm.to(cm.device)
6971

7072
info = TransformInfo(skipped=False, num_matches=0, is_clean=True, has_valid_shapes=True)
7173

tensorrt_llm/_torch/auto_deploy/transform/optimizer.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,17 @@
2222

2323

2424
class InferenceOptimizer:
25-
def __init__(self, factory: ModelFactory, config: InferenceOptimizerConfig, local_device: str):
25+
def __init__(self, factory: ModelFactory, config: InferenceOptimizerConfig):
2626
self.factory = factory
2727
self.config = self._clean_config(config)
2828
if not dist.is_initialized():
2929
local_rank, world_size = 0, 1
3030
else:
3131
local_rank, world_size = dist_ad.get_rank_world_size()
3232
self.shared_config = SharedConfig(
33-
local_rank=local_rank, world_size=world_size, local_device=local_device
33+
local_rank=local_rank,
34+
world_size=world_size,
35+
# local_device=local_device
3436
)
3537

3638
def _clean_config(self, config: InferenceOptimizerConfig) -> StrictInferenceOptimizerConfig:

tests/unittest/_torch/auto_deploy/_utils_test/_model_test_utils.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -440,7 +440,7 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1):
440440
"mistralai/Mistral-Small-3.1-24B-Instruct-2503": {
441441
"llm_models_subdir": "Mistral-Small-3.1-24B-Instruct-2503",
442442
"model_factory": "Mistral3VLM",
443-
"compile_backend": "torch-simple",
443+
# "compile_backend": "torch-simple",
444444
"model_kwargs": {
445445
"text_config": {"num_hidden_layers": 2},
446446
"vision_config": {"num_hidden_layers": 2},
@@ -473,10 +473,8 @@ def get_small_model_config(model_hub_id: str, **llm_args_kwargs) -> Dict[str, An
473473

474474
# add some defaults to llm_args
475475
llm_args["skip_loading_weights"] = True # No weight loading to speed up things
476-
llm_args["free_mem_ratio"] = 0.00 # we don't need the cache and it may cause OOM issues
477476
llm_args["attn_page_size"] = 4 # Make sure paging is activated despite small max_tokens
478477
llm_args["max_batch_size"] = 2 # Minimum batching to speed up things
479-
480478
# update with custom llm_args kwargs
481479
llm_args.update(llm_args_kwargs)
482480

@@ -494,10 +492,16 @@ def get_small_model_config(model_hub_id: str, **llm_args_kwargs) -> Dict[str, An
494492

495493

496494
def get_small_model_config_pytest_param(
497-
model_hub_id: str, pytest_param_kwargs=None, **llm_args_kwargs
495+
model_hub_id: str,
496+
attn_backend: str,
497+
compile_backend: str,
498+
pytest_param_kwargs=None,
499+
**llm_args_kwargs,
498500
):
499501
return pytest.param(
500502
get_small_model_config(model_hub_id, **llm_args_kwargs),
503+
attn_backend,
504+
compile_backend,
501505
id=model_hub_id,
502506
**(pytest_param_kwargs or {}),
503507
)

0 commit comments

Comments
 (0)