
Commit 743fb0a

[AutoDeploy] _AutoDeployLlmArgs as primary config object (NVIDIA#4891)
Signed-off-by: Lucas Liebenwein <[email protected]>
1 parent 91e8d43 commit 743fb0a

22 files changed (+529 −205 lines)
examples/auto_deploy/.vscode/launch.json

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
     "program": "build_and_run_ad.py",
     "args": [
       "--config",
-      "{\"batch_size\": 2, \"page_size\": 16, \"world_size\": 2, \"compile_backend\": \"torch-simple\", \"attn_backend\": \"FlashInfer\",\"model_factory\": \"AutoModelForCausalLM\", \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"benchmark\": false}",
+      "{\"batch_size\": 2, \"attn_page_size\": 16, \"world_size\": 2, \"compile_backend\": \"torch-simple\", \"attn_backend\": \"FlashInfer\",\"model_factory\": \"AutoModelForCausalLM\", \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"benchmark\": false}",
       "--model-kwargs",
       "{}",
       // "{\"num_hidden_layers\": 3}",

examples/auto_deploy/README.md

Lines changed: 14 additions & 26 deletions
@@ -151,7 +151,7 @@ In the below example:
 | `"mla_backend"` | Specifies implementation for multi-head latent attention |
 | `"max_seq_len"` | Maximum sequence length for inference/cache |
 | `"max_batch_size"` | Maximum dimension for statically allocated KV cache |
-| `"page_size"` | Page size for attention |
+| `"attn_page_size"` | Page size for attention |
 | `"benchmark"` | Indicates whether to run the built-in benchmark for token generation |

 For default values and additional configuration options, refer to the [simple_config.py](./simple_config.py) file.
@@ -236,37 +236,25 @@ Here is an example of how you can build an LLM object with AutoDeploy integration

 ```
 from tensorrt_llm import LLM
-from tensorrt_llm.builder import BuildConfig
-from tensorrt_llm._torch.auto_deploy.shim import AutoDeployConfig

-# 1. Set up the build configuration
-build_config = BuildConfig(
-    max_seq_len=<MAX_SEQ_LEN>,
-    max_batch_size=<MAX_BS>,
-)
-build_config.plugin_config.tokens_per_block = <PAGE_SIZE>
-# if using "TritonWithFlattenedInputs" as backend, <PAGE_SIZE> should equal to <MAX_SEQ_LEN>
-# Refer to examples/auto_deploy/simple_config.py (line 109) for details.
-
-# 2. Set up AutoDeploy configuration
-# AutoDeploy will use its own cache implementation
-model_kwargs = {"use_cache":False}

-ad_config = AutoDeployConfig(
+# Construct the LLM high-level interface object with autodeploy as backend
+llm = LLM(
+    model=<HF_MODEL_CARD_OR_DIR>,
+    backend="_autodeploy",
+    tensor_parallel_size=<NUM_WORLD_RANK>,
     use_cuda_graph=True, # set True if using "torch-opt" as compile backend
     torch_compile_enabled=True, # set True if using "torch-opt" as compile backend
-    model_kwargs=model_kwargs,
+    model_kwargs={"use_cache": False}, # AutoDeploy uses its own cache implementation
     attn_backend="TritonWithFlattenedInputs", # choose between "TritonWithFlattenedInputs" and "FlashInfer"
+    attn_page_size=64, # page size for attention (tokens_per_block, should be == max_seq_len for triton)
     skip_loading_weights=False,
-)
-
-# 3. Construct the LLM high-level interface object with autodeploy as backend
-llm = LLM(
-    model=<HF_MODEL_CARD_OR_DIR>,
-    backend="autodeploy",
-    build_config=build_config,
-    auto_deploy_config=ad_config,
-    tensor_parallel_size=<NUM_WORLD_RANK>,
+    model_factory="AutoModelForCausalLM", # choose appropriate model factory
+    mla_backend="MultiHeadLatentAttention", # for models that support MLA
+    free_mem_ratio=0.8, # fraction of available memory for cache
+    simple_shard_only=False, # tensor parallelism sharding strategy
+    max_seq_len=<MAX_SEQ_LEN>,
+    max_batch_size=<MAX_BATCH_SIZE>,
 )

 ```
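Putting the updated README snippet together, a complete call with concrete values might look like the sketch below. The keyword arguments mirror the diff above; the specific model card, backend choices, prompt, and the `generate`/`SamplingParams` usage at the end are illustrative assumptions rather than part of this commit.

```python
from tensorrt_llm import LLM
from tensorrt_llm.sampling_params import SamplingParams

llm = LLM(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",  # placeholder model card
    backend="_autodeploy",
    tensor_parallel_size=1,
    use_cuda_graph=False,               # enable with the "torch-opt" compile backend
    torch_compile_enabled=False,        # enable with the "torch-opt" compile backend
    model_kwargs={"use_cache": False},  # AutoDeploy uses its own cache implementation
    attn_backend="FlashInfer",
    attn_page_size=64,                  # tokens per attention page
    skip_loading_weights=False,
    model_factory="AutoModelForCausalLM",
    max_seq_len=512,
    max_batch_size=8,
)

# Illustrative generation call using the standard LLM API.
outputs = llm.generate(["What is AutoDeploy?"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```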

examples/auto_deploy/build_and_run_ad.py

Lines changed: 16 additions & 26 deletions
@@ -8,10 +8,9 @@
 from simple_config import SimpleConfig

 from tensorrt_llm._torch.auto_deploy.models import ModelFactoryRegistry
-from tensorrt_llm._torch.auto_deploy.shim import AutoDeployConfig, DemoLLM
+from tensorrt_llm._torch.auto_deploy.shim import DemoLLM
 from tensorrt_llm._torch.auto_deploy.utils.benchmark import benchmark, store_benchmark_results
 from tensorrt_llm._torch.auto_deploy.utils.logger import ad_logger
-from tensorrt_llm.builder import BuildConfig
 from tensorrt_llm.llmapi.llm import LLM, RequestOutput
 from tensorrt_llm.sampling_params import SamplingParams

@@ -33,27 +32,6 @@ def get_config_and_check_args() -> SimpleConfig:

 def build_llm_from_config(config: SimpleConfig) -> LLM:
     """Builds a LLM object from our config."""
-    # set up builder config
-    build_config = BuildConfig(max_seq_len=config.max_seq_len, max_batch_size=config.max_batch_size)
-    build_config.plugin_config.tokens_per_block = config.page_size
-
-    # setup AD config
-    ad_config = AutoDeployConfig(
-        # Both torch-opt and torch-cudagraph invoke cudagraphs
-        use_cuda_graph=config.compile_backend in ["torch-opt", "torch-cudagraph"],
-        # Both torch-opt and torch-compile invoke torch.compile
-        torch_compile_enabled=config.compile_backend in ["torch-opt", "torch-compile"],
-        model_factory=config.model_factory,
-        model_kwargs=config.model_kwargs,
-        attn_backend=config.attn_backend,
-        mla_backend=config.mla_backend,
-        skip_loading_weights=config.skip_loading_weights,
-        cuda_graph_max_batch_size=config.max_batch_size,
-        free_mem_ratio=config.free_mem_ratio,
-        simple_shard_only=config.simple_shard_only,
-    )
-    ad_logger.info(f"AutoDeploy Config: {ad_config}")
-
     # TODO: let's see if prefetching can't be done through the LLM api?
     # I believe the "classic workflow" invoked via the LLM api can do that.
     # put everything into the HF model Factory and try pre-fetching the checkpoint
@@ -73,9 +51,21 @@ def build_llm_from_config(config: SimpleConfig) -> LLM:
     }
     llm = llm_lookup[config.runtime](
         model=factory.model,
-        backend="autodeploy",
-        build_config=build_config,
-        auto_deploy_config=ad_config,
+        backend="_autodeploy",
+        max_seq_len=config.max_seq_len,
+        max_batch_size=config.max_batch_size,
+        # AutoDeploy-specific parameters
+        use_cuda_graph=config.compile_backend in ["torch-opt", "torch-cudagraph"],
+        torch_compile_enabled=config.compile_backend in ["torch-opt", "torch-compile"],
+        model_factory=config.model_factory,
+        model_kwargs=config.model_kwargs,
+        attn_backend=config.attn_backend,
+        mla_backend=config.mla_backend,
+        skip_loading_weights=config.skip_loading_weights,
+        cuda_graph_max_batch_size=config.max_batch_size,
+        free_mem_ratio=config.free_mem_ratio,
+        simple_shard_only=config.simple_shard_only,
+        attn_page_size=config.attn_page_size,  # Now passed directly as AutoDeploy parameter
         tensor_parallel_size=config.world_size,
         tokenizer=factory.init_tokenizer() if config.customize_tokenizer else None,
     )
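The mapping from `compile_backend` to the two boolean flags is compact but easy to misread, so here is a standalone sketch of the same membership checks taken from the diff above (the helper function name is made up for illustration):

```python
from typing import Dict


def compile_backend_flags(compile_backend: str) -> Dict[str, bool]:
    """Translate a compile backend choice into the two AutoDeploy flags.

    "torch-opt" turns on both CUDA graphs and torch.compile, "torch-cudagraph"
    only CUDA graphs, "torch-compile" only torch.compile, "torch-simple" neither.
    """
    return {
        "use_cuda_graph": compile_backend in ["torch-opt", "torch-cudagraph"],
        "torch_compile_enabled": compile_backend in ["torch-opt", "torch-compile"],
    }


assert compile_backend_flags("torch-opt") == {"use_cuda_graph": True, "torch_compile_enabled": True}
assert compile_backend_flags("torch-simple") == {"use_cuda_graph": False, "torch_compile_enabled": False}
```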

examples/auto_deploy/simple_config.py

Lines changed: 5 additions & 7 deletions
@@ -7,6 +7,7 @@
 from typing import Dict, List, Literal, Optional, Union


+# TODO: remove and unify with _AutoDeployLlmArgs
 @dataclass
 class SimpleConfig:
     """Experiment Configuration."""
@@ -55,7 +56,7 @@ class SimpleConfig:
     mla_backend: Literal["MultiHeadLatentAttention"] = "MultiHeadLatentAttention"
     max_seq_len: int = 512  # max sequence length for inference/cache
     max_batch_size: int = 8  # max dimension for statically allocated kv cache
-    page_size: int = 64  # page size for attention
+    attn_page_size: int = 64  # page size for attention
     simple_shard_only: bool = False  # if True, force simple sharding(all_gather) in TP;
     # otherwise auto-detect and use column+row (all_reduce) sharding

@@ -94,11 +95,8 @@ def __post_init__(self):
         # check if model was supplied
         assert self.model, "model must be supplied!"

-        # we don't want to loose the default values for model_kwargs unless explicitly set by the
-        # user. They are not preserved by the standard initialization process since they whole dict
-        # gets replaced by the user provided one. We don't want that though.
-        f_default = self.__dataclass_fields__["model_kwargs"].default_factory()
-        setattr(self, "model_kwargs", {**f_default, **getattr(self, "model_kwargs")})
+        # NEVER use cache
+        self.model_kwargs["use_cache"] = False

         # special handling for torch_dtype in model_kwargs since HF does not correctly update
         # torch_dtype string to an actual torch.dtype object (only with default)
@@ -120,7 +118,7 @@ def __post_init__(self):

         # No paging allowed in TritonWithFlattenedInputs
         if self.attn_backend in ["TritonWithFlattenedInputs"]:
-            self.page_size = self.max_seq_len
+            self.attn_page_size = self.max_seq_len

         # use min instead of max to avoid OOM for large batch size
         self.model_kwargs["max_position_embeddings"] = min(

tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py

Lines changed: 2 additions & 2 deletions
@@ -86,7 +86,7 @@ class SequenceInfo:
     # then the maximum number of sequences possible in the batch is min (max_batch_size, max_num_tokens // ISL).
     # Similarly, if a batch is composed of generate-only requests,
     # then the maximum number of sequences possible in the batch is min (max_batch_size, max_num_tokens).
-    max_num_tokens: int = 0
+    max_num_tokens: Optional[int] = None

     ## [UPDATE WITH CARE] TENSOR FIELDS THAT WILL BE PASSED TO PREPARE_METADATA OP #################
     # input_ids MUST ALWAYS BE THE FIRST FIELD
@@ -112,7 +112,7 @@ def __post_init__(self):
         # see https://github.com/NVIDIA/TensorRT-LLM/issues/4504
         max_seq_len_adjusted = self.max_seq_len + 1

-        if self.max_num_tokens < 1:
+        if self.max_num_tokens is None or self.max_num_tokens < 1:
             self.max_num_tokens = self.max_batch_size * max_seq_len_adjusted
         # if the provided max_num_tokens is less than the max_batch_size * max_seq_len,
         # we use the provided max_num_tokens to calculate the number of pages
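The net effect of switching `max_num_tokens` to `Optional[int]` is that both `None` and non-positive values now fall back to the `max_batch_size * (max_seq_len + 1)` default. A minimal sketch of just that resolution step (the helper name is invented for illustration; the logic mirrors the hunk above):

```python
from typing import Optional


def resolve_max_num_tokens(
    max_num_tokens: Optional[int], max_batch_size: int, max_seq_len: int
) -> int:
    """Mirror of the fallback in SequenceInfo.__post_init__ after this change."""
    # max_seq_len is padded by one token,
    # see https://github.com/NVIDIA/TensorRT-LLM/issues/4504.
    max_seq_len_adjusted = max_seq_len + 1
    if max_num_tokens is None or max_num_tokens < 1:
        return max_batch_size * max_seq_len_adjusted
    return max_num_tokens


assert resolve_max_num_tokens(None, max_batch_size=8, max_seq_len=512) == 8 * 513
assert resolve_max_num_tokens(2048, max_batch_size=8, max_seq_len=512) == 2048
```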

tensorrt_llm/_torch/auto_deploy/models/hf.py

Lines changed: 21 additions & 13 deletions
@@ -75,9 +75,18 @@ def __init__(
         self.tokenizer_kwargs.setdefault("trust_remote_code", True)
         self._quant_config = None

-        # heuristic to disable use_cache
+        # NEVER use cache
         self.model_kwargs["use_cache"] = False

+        # special handling for torch_dtype in model_kwargs since HF does not correctly update
+        # torch_dtype string to an actual torch.dtype object (only with default)
+        if "torch_dtype" in self.model_kwargs:
+            dtype = self.model_kwargs["torch_dtype"]
+            if isinstance(dtype, str):
+                dtype = getattr(torch, self.model_kwargs["torch_dtype"])
+            assert isinstance(dtype, torch.dtype), f"Invalid dtype: {dtype}"
+            self.model_kwargs["torch_dtype"] = dtype
+
         # prefetch the model+checkpoint
         self.prefetch_checkpoint()
         # load the quantization config
@@ -322,19 +331,18 @@ class AutoModelForImageTextToTextFactory(AutoModelForCausalLMFactory):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-        # additional heuristic to disable use_cache
-        self.model_kwargs["text_config"] = self.model_kwargs.get("text_config", {})
-        self.model_kwargs["text_config"]["use_cache"] = False
-
-        self.model_kwargs["text_config"]["max_position_embeddings"] = self.model_kwargs[
-            "max_position_embeddings"
-        ]
-
-        # additional heuristic to propagate use of num_hidden_layers
+        # additional heuristic to propagate "important keys"
         # TODO (lucaslie): WAR until we have better support on dashboard to control model_kwargs
-        nhl_key = "num_hidden_layers"
-        if nhl_key in self.model_kwargs:
-            self.model_kwargs["text_config"][nhl_key] = self.model_kwargs[nhl_key]
+        keys_to_propagate = [
+            "num_hidden_layers",
+            "max_position_embeddings",
+            "use_cache",
+            "torch_dtype",
+        ]
+        self.model_kwargs["text_config"] = self.model_kwargs.get("text_config", {})
+        for key in keys_to_propagate:
+            if key in self.model_kwargs:
+                self.model_kwargs["text_config"][key] = self.model_kwargs[key]

     @property
     def automodel_from_config(self):
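Both hunks above manipulate `model_kwargs` in place. A condensed sketch of the two pieces of logic, the `torch_dtype` normalization and the propagation of "important keys" into the nested `text_config` (the standalone function names are illustrative, not part of the commit):

```python
from typing import Any, Dict

import torch


def normalize_torch_dtype(model_kwargs: Dict[str, Any]) -> None:
    """Convert a string torch_dtype (e.g. "bfloat16") into a torch.dtype object."""
    if "torch_dtype" in model_kwargs:
        dtype = model_kwargs["torch_dtype"]
        if isinstance(dtype, str):
            dtype = getattr(torch, dtype)
        assert isinstance(dtype, torch.dtype), f"Invalid dtype: {dtype}"
        model_kwargs["torch_dtype"] = dtype


def propagate_to_text_config(model_kwargs: Dict[str, Any]) -> None:
    """Copy the "important keys" from the top level into the nested text_config."""
    keys_to_propagate = ["num_hidden_layers", "max_position_embeddings", "use_cache", "torch_dtype"]
    model_kwargs["text_config"] = model_kwargs.get("text_config", {})
    for key in keys_to_propagate:
        if key in model_kwargs:
            model_kwargs["text_config"][key] = model_kwargs[key]


kwargs = {"torch_dtype": "bfloat16", "use_cache": False, "num_hidden_layers": 3}
normalize_torch_dtype(kwargs)
propagate_to_text_config(kwargs)
assert kwargs["torch_dtype"] is torch.bfloat16
assert kwargs["text_config"]["num_hidden_layers"] == 3
```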

tensorrt_llm/_torch/auto_deploy/shim/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -2,4 +2,4 @@

 from .ad_executor import create_autodeploy_executor
 from .demollm import DemoLLM
-from .interface import AutoDeployConfig, CachedSequenceInterface, GetInferenceModel
+from .interface import CachedSequenceInterface, GetInferenceModel

tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py

Lines changed: 22 additions & 21 deletions
@@ -9,6 +9,7 @@
 from ...._utils import mpi_rank, mpi_world_size
 from ....bindings.executor import ExecutorConfig
 from ....bindings.internal.batch_manager import CacheType
+from ....llmapi.llm_args import _AutoDeployLlmArgs
 from ....mapping import Mapping
 from ...distributed import MPIDist
 from ...pyexecutor.config import PyTorchConfig
@@ -27,7 +28,7 @@
 from ..models import ModelFactoryRegistry
 from ..transformations.transform import InferenceOptimizer
 from ..utils.logger import ad_logger
-from .interface import AutoDeployConfig, CachedSequenceInterface, GetInferenceModel
+from .interface import CachedSequenceInterface, GetInferenceModel


 class _CacheManagerWithFakePool(KVCacheManager):
@@ -84,11 +85,11 @@ def _device(self) -> DeviceLikeType:
     def build_from_config(
         cls,
         model: str,
-        ad_config: AutoDeployConfig,
+        ad_config: _AutoDeployLlmArgs,
         seq_info: SequenceInfo,
         device: DeviceLikeType,
     ):
-        """Build the ADEngine using the AutoDeployConfig that gets passed through from the LLM."""
+        """Build the ADEngine using the _AutoDeployLlmArgs that gets passed through from the LLM."""

         # update device to contain the current default device if it's in cuda
         device = torch.device(device)
@@ -245,7 +246,7 @@ def create_autodeploy_executor(
 ):
     """Create an AutoDeploy executor from the given configuration and checkpoint directory.

-    This is the entrypoint API to the autodeploy backend.
+    This is the entrypoint API to the _autodeploy backend.
     """
     # initialize process groups
     world_size = mpi_world_size()
@@ -258,33 +259,34 @@
     dist.initialize_or_skip(rank, world_size, port)

     # some config
-    if executor_config.pytorch_backend_config is None:
-        executor_config.pytorch_backend_config = AutoDeployConfig(attn_backend="FlashInfer")
+    msg = "pytorch_backend_config must be an _AutoDeployLlmArgs object"
+    assert isinstance(executor_config.pytorch_backend_config, _AutoDeployLlmArgs), msg
+    ad_config: _AutoDeployLlmArgs = executor_config.pytorch_backend_config

-    max_batch_size = executor_config.max_batch_size
-    max_seq_len = executor_config.max_seq_len
-    tokens_per_block = executor_config.tokens_per_block
-    max_num_tokens = executor_config.max_num_tokens
-    ad_logger.info(f"{max_seq_len=}, {max_batch_size=}, {tokens_per_block=}, {max_num_tokens=}")
+    max_batch_size = ad_config.max_batch_size
+    max_seq_len = ad_config.max_seq_len
+    attn_page_size = ad_config.attn_page_size
+    max_num_tokens = ad_config.max_num_tokens
+    ad_logger.info(f"{max_seq_len=}, {max_batch_size=}, {attn_page_size=}, {max_num_tokens=}")

     # initialize model engine
     engine = ADEngine.build_from_config(
         model=checkpoint_dir,
-        ad_config=executor_config.pytorch_backend_config,
+        ad_config=ad_config,
         seq_info=SequenceInfo(
             max_seq_len=max_seq_len,
             max_batch_size=max_batch_size,
-            page_size=tokens_per_block,
+            page_size=attn_page_size,
             max_num_tokens=max_num_tokens,
         ),
         device="cuda",
     )

     # resource managers
     kv_cache_manager = _CacheManagerWithFakePool(
-        executor_config.kv_cache_config,
+        ad_config.kv_cache_config,
         num_blocks=engine.cache_seq_interface.info.num_pages,
-        tokens_per_block=tokens_per_block,
+        tokens_per_block=attn_page_size,
         max_seq_len=max_seq_len,
         max_batch_size=max_batch_size,
     )
@@ -302,18 +304,17 @@
     sampler = TorchSampler(max_seq_len=max_seq_len)

     # creating the executor object
-    py_config: PyTorchConfig = executor_config.pytorch_backend_config
     py_executor = PyExecutor(
         resource_manager,
         scheduler,
         model_engine=engine,
         sampler=sampler,
         dist=mpi_dist,
-        disable_overlap_scheduler=py_config.disable_overlap_scheduler,
-        max_input_len=executor_config.max_input_len,
-        max_batch_size=executor_config.max_batch_size,
-        max_draft_tokens=executor_config.speculative_config.max_draft_tokens
-        if executor_config.speculative_config is not None
+        disable_overlap_scheduler=ad_config.disable_overlap_scheduler,
+        max_input_len=ad_config.max_input_len,
+        max_batch_size=ad_config.max_batch_size,
+        max_draft_tokens=ad_config.speculative_config.max_draft_tokens
+        if ad_config.speculative_config is not None
         else 0,
     )
     return py_executor
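After this change the executor no longer falls back to a default config or reads sizes from `executor_config`; everything comes from the single `_AutoDeployLlmArgs` object carried in `pytorch_backend_config`. A schematic of that retrieval pattern, where both dataclasses and the field subset are illustrative stand-ins rather than the real `_AutoDeployLlmArgs` or `ExecutorConfig`:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class _FakeAutoDeployLlmArgs:
    """Illustrative stand-in exposing the fields read by create_autodeploy_executor."""

    max_batch_size: int = 8
    max_seq_len: int = 512
    attn_page_size: int = 64
    max_num_tokens: Optional[int] = None


@dataclass
class _FakeExecutorConfig:
    pytorch_backend_config: object = None


def get_ad_config(executor_config: _FakeExecutorConfig) -> _FakeAutoDeployLlmArgs:
    # The old code silently substituted a default AutoDeployConfig; the new code
    # insists on receiving the fully populated args object.
    msg = "pytorch_backend_config must be an _AutoDeployLlmArgs object"
    assert isinstance(executor_config.pytorch_backend_config, _FakeAutoDeployLlmArgs), msg
    return executor_config.pytorch_backend_config


ad_config = get_ad_config(_FakeExecutorConfig(_FakeAutoDeployLlmArgs()))
print(ad_config.max_seq_len, ad_config.attn_page_size)
```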
