
Commit 8602c64

h-guo18 and lucaslie authored and committed:

[None][chore] AutoDeploy: cleanup old inference optimizer configs (NVIDIA#8039)

Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
Signed-off-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
Co-authored-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>

1 parent: 3aa44f7


46 files changed: +559 −576 lines

docs/source/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md

Lines changed: 17 additions & 15 deletions

````diff
@@ -40,29 +40,31 @@ trtllm-bench \
 #### Basic Performance Configuration (`autodeploy_config.yaml`)
 
 ```yaml
-# Compilation backend
-compile_backend: torch-opt
-
-# Runtime engine
+# runtime engine
 runtime: trtllm
 
-# Model loading
+# model loading
 skip_loading_weights: false
 
-# Fraction of free memory to use for kv-caches
-free_mem_ratio: 0.8
-
-# CUDA Graph optimization
-cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256]
-
-# Attention backend
-attn_backend: flashinfer
-
 # Sequence configuration
 max_batch_size: 256
+
+# transform options
+transforms:
+  insert_cached_attention:
+    # attention backend
+    backend: flashinfer
+  resize_kv_cache:
+    # fraction of free memory to use for kv-caches
+    free_mem_ratio: 0.8
+  compile_model:
+    # compilation backend
+    backend: torch-opt
+    # CUDA Graph optimization
+    cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256]
 ```
 
-Enable multi-GPU execution by specifying `--tp n`, where `n` is the number of GPUs
+Enable multi-GPU execution by specifying `--tp n`, where `n` is the number of GPUs.
 
 ## Configuration Options Reference
 
````
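For readers migrating their own configs, the relocation of the old flat keys can be seen by treating the new YAML as a plain Python dict (an illustrative stand-in for the parsed file, not TensorRT-LLM code):

```python
# Parsed equivalent of the new autodeploy_config.yaml above (plain dict
# standing in for YAML); the old flat keys such as `compile_backend`,
# `attn_backend`, and `free_mem_ratio` now live under `transforms.<name>`.
config = {
    "runtime": "trtllm",
    "skip_loading_weights": False,
    "max_batch_size": 256,
    "transforms": {
        "insert_cached_attention": {"backend": "flashinfer"},
        "resize_kv_cache": {"free_mem_ratio": 0.8},
        "compile_model": {
            "backend": "torch-opt",
            "cuda_graph_batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256],
        },
    },
}

# old: config["compile_backend"]; new:
assert config["transforms"]["compile_model"]["backend"] == "torch-opt"
# old: config["attn_backend"]; new:
assert config["transforms"]["insert_cached_attention"]["backend"] == "flashinfer"
# old: config["free_mem_ratio"]; new:
assert config["transforms"]["resize_kv_cache"]["free_mem_ratio"] == 0.8
```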

docs/source/features/auto_deploy/advanced/expert_configurations.md

Lines changed: 18 additions & 26 deletions

````diff
@@ -63,29 +63,22 @@ args:
   num_hidden_layers: 12
   hidden_size: 1024
   world_size: 4
-  compile_backend: torch-compile
-  attn_backend: triton
   max_seq_len: 2048
   max_batch_size: 16
   transforms:
-    sharding:
-      strategy: auto
-    quantization:
-      enabled: false
+    detect_sharding:
+      support_partial_config: true
+    insert_cached_attention:
+      backend: triton
+    compile_model:
+      backend: torch-compile
 
 prompt:
   batch_size: 8
   sp_kwargs:
     max_tokens: 150
     temperature: 0.8
     top_k: 50
-
-benchmark:
-  enabled: true
-  num: 20
-  bs: 4
-  isl: 1024
-  osl: 256
 ```
 
 Create an additional override file (e.g., `production.yaml`):
@@ -94,11 +87,10 @@ Create an additional override file (e.g., `production.yaml`):
 # production.yaml
 args:
   world_size: 8
-  compile_backend: torch-opt
   max_batch_size: 32
-
-benchmark:
-  enabled: false
+  transforms:
+    compile_model:
+      backend: torch-opt
 ```
 
 Then use these configurations:
@@ -107,26 +99,26 @@ Then use these configurations:
 # Using single YAML config
 python build_and_run_ad.py \
   --model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
-  --yaml-configs my_config.yaml
+  --yaml-extra my_config.yaml
 
 # Using multiple YAML configs (deep merged in order, later files have higher priority)
 python build_and_run_ad.py \
   --model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
-  --yaml-configs my_config.yaml production.yaml
+  --yaml-extra my_config.yaml production.yaml
 
 # Targeting nested AutoDeployConfig with separate YAML
 python build_and_run_ad.py \
   --model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
-  --yaml-configs my_config.yaml \
-  --args.yaml-configs autodeploy_overrides.yaml
+  --yaml-extra my_config.yaml \
+  --args.yaml-extra autodeploy_overrides.yaml
 ```
 
 ## Configuration Precedence and Deep Merging
 
 The configuration system follows a precedence order in which higher priority sources override lower priority ones:
 
 1. **CLI Arguments** (highest priority) - Direct command line arguments
-1. **YAML Configs** - Files specified via `--yaml-configs` and `--args.yaml-configs`
+1. **YAML Configs** - Files specified via `--yaml-extra` and `--args.yaml-extra`
 1. **Default Settings** (lowest priority) - Built-in defaults from the config classes
 
 **Deep Merging**: Unlike simple overwriting, deep merging recursively combines nested dictionaries. For example:
@@ -152,12 +144,12 @@ args:
 **Nested Config Behavior**: When using nested configurations, outer YAML configuration files become initialization settings for inner objects, giving them higher precedence:
 
 ```bash
-# The outer yaml-configs affects the entire ExperimentConfig
-# The inner args.yaml-configs affects only the AutoDeployConfig
+# The outer yaml-extra affects the entire ExperimentConfig
+# The inner args.yaml-extra affects only the AutoDeployConfig
 python build_and_run_ad.py \
   --model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
-  --yaml-configs experiment_config.yaml \
-  --args.yaml-configs autodeploy_config.yaml \
+  --yaml-extra experiment_config.yaml \
+  --args.yaml-extra autodeploy_config.yaml \
   --args.world-size=8 # CLI override beats both YAML configs
 ```
 
````
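The deep-merge behavior this doc describes (later configs win on conflicts, but nested dictionaries combine recursively rather than being replaced wholesale) can be sketched in a few lines. This is an illustrative stand-in, not the TensorRT-LLM implementation:

```python
# Illustrative deep-merge: `override` wins on conflicts, but nested dicts
# are merged recursively instead of replaced wholesale.
def deep_merge(base: dict, override: dict) -> dict:
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

# Shapes mirror my_config.yaml and production.yaml from the diff above.
my_config = {
    "args": {
        "world_size": 4,
        "max_batch_size": 16,
        "transforms": {"insert_cached_attention": {"backend": "triton"}},
    }
}
production = {
    "args": {
        "world_size": 8,
        "max_batch_size": 32,
        "transforms": {"compile_model": {"backend": "torch-opt"}},
    }
}

merged = deep_merge(my_config, production)
assert merged["args"]["world_size"] == 8  # later file wins
# both transform entries survive the merge:
assert merged["args"]["transforms"] == {
    "insert_cached_attention": {"backend": "triton"},
    "compile_model": {"backend": "torch-opt"},
}
```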

docs/source/features/auto_deploy/advanced/workflow.md

Lines changed: 0 additions & 2 deletions

```diff
@@ -18,9 +18,7 @@ llm = LLM(
     attn_page_size=64,  # page size for attention (tokens_per_block, should be == max_seq_len for triton)
     skip_loading_weights=False,
     model_factory="AutoModelForCausalLM",  # choose appropriate model factory
-    mla_backend="MultiHeadLatentAttention",  # for models that support MLA
     free_mem_ratio=0.8,  # fraction of available memory for cache
-    simple_shard_only=False,  # tensor parallelism sharding strategy
     max_seq_len=<MAX_SEQ_LEN>,
     max_batch_size=<MAX_BATCH_SIZE>,
 )
```

docs/source/features/auto_deploy/support_matrix.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -113,6 +113,7 @@ Optimize attention operations with different attention kernel implementations:
 
 | `"attn_backend"` | Description |
 |----------------------|-------------|
+| `torch` | Custom fused multi-head attention (MHA) with KV Cache reference implementation in pure PyTorch (slow!) |
 | `triton` | Custom fused multi-head attention (MHA) with KV Cache kernels for efficient attention processing. |
 | `flashinfer` | Uses optimized attention kernels with KV Cache from the [`flashinfer`](https://github.com/flashinfer-ai/flashinfer.git) library. |
 
```

docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md

Lines changed: 17 additions & 15 deletions

(Identical diff to docs/source/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md above.)

docs/source/torch/auto_deploy/advanced/expert_configurations.md

Lines changed: 18 additions & 26 deletions

(Identical diff to docs/source/features/auto_deploy/advanced/expert_configurations.md above.)

docs/source/torch/auto_deploy/advanced/serving_with_trtllm_serve.md

Lines changed: 25 additions & 17 deletions

````diff
@@ -42,23 +42,31 @@ trtllm-serve \
 Example `autodeploy_config.yaml`:
 
 ```yaml
-# Compilation backend for AutoDeploy
-compile_backend: torch-opt # options: torch-simple, torch-compile, torch-cudagraph, torch-opt
-
-# Runtime engine
-runtime: trtllm # options: trtllm, demollm
-
-# Model loading
-skip_loading_weights: false # set true for architecture-only perf runs
-
-# KV cache memory
-free_mem_ratio: 0.8 # fraction of free GPU mem for KV cache
-
-# CUDA graph optimization
-cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64]
-
-# Attention backend
-attn_backend: flashinfer # recommended for best performance
+# runtime engine
+runtime: trtllm
+
+# model loading
+skip_loading_weights: false
+
+# Sequence configuration
+max_batch_size: 256
+
+# multi-gpu execution
+world_size: 1
+
+# transform options
+transforms:
+  insert_cached_attention:
+    # attention backend
+    backend: flashinfer
+  resize_kv_cache:
+    # fraction of free memory to use for kv-caches
+    free_mem_ratio: 0.8
+  compile_model:
+    # compilation backend
+    backend: torch-opt
+    # CUDA Graph optimization
+    cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256]
 ```
 
 ## Limitations and tips
````

docs/source/torch/auto_deploy/advanced/workflow.md

Lines changed: 10 additions & 7 deletions

```diff
@@ -12,15 +12,18 @@ from tensorrt_llm._torch.auto_deploy import LLM
 llm = LLM(
     model=<HF_MODEL_CARD_OR_DIR>,
     world_size=<DESIRED_WORLD_SIZE>,
-    compile_backend="torch-compile",
+    model_factory="AutoModelForCausalLM",  # choose appropriate model factory
     model_kwargs={"num_hidden_layers": 2},  # test with smaller model configuration
-    attn_backend="flashinfer",  # choose between "triton" and "flashinfer"
-    attn_page_size=64,  # page size for attention (tokens_per_block, should be == max_seq_len for triton)
+    transforms={
+        "insert_cached_attention": {"backend": "flashinfer"},  # or "triton"
+        "insert_cached_mla_attention": {"backend": "MultiHeadLatentAttention"},
+        "resize_kv_cache": {"free_mem_ratio": 0.8},
+        "compile_model": {"backend": "torch-compile"},
+        "detect_sharding": {"simple_shard_only": False},
+
+    },
+    attn_page_size=64,  # page size for attention
     skip_loading_weights=False,
-    model_factory="AutoModelForCausalLM",  # choose appropriate model factory
-    mla_backend="MultiHeadLatentAttention",  # for models that support MLA
-    free_mem_ratio=0.8,  # fraction of available memory for cache
-    simple_shard_only=False,  # tensor parallelism sharding strategy
     max_seq_len=<MAX_SEQ_LEN>,
     max_batch_size=<MAX_BATCH_SIZE>,
 )
```
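The workflow diff above folds several flat `LLM(...)` kwargs into the nested `transforms` dict. The mapping can be sketched with a hypothetical helper (`migrate_legacy_kwargs` is illustrative only, not a tensorrt_llm API; the key names are taken directly from the diff):

```python
# Hypothetical helper: fold the removed flat kwargs from the old LLM(...)
# signature into the new nested `transforms` layout. Illustrative only;
# not part of tensorrt_llm. Key names come from the diff above.
_LEGACY_TO_TRANSFORM = {
    "attn_backend": ("insert_cached_attention", "backend"),
    "mla_backend": ("insert_cached_mla_attention", "backend"),
    "free_mem_ratio": ("resize_kv_cache", "free_mem_ratio"),
    "compile_backend": ("compile_model", "backend"),
    "simple_shard_only": ("detect_sharding", "simple_shard_only"),
}

def migrate_legacy_kwargs(kwargs: dict) -> dict:
    """Return kwargs with legacy flat keys folded into `transforms`."""
    out = dict(kwargs)
    transforms = dict(out.pop("transforms", {}))
    for legacy_key, (transform, option) in _LEGACY_TO_TRANSFORM.items():
        if legacy_key in out:
            transforms.setdefault(transform, {})[option] = out.pop(legacy_key)
    if transforms:
        out["transforms"] = transforms
    return out

new_kwargs = migrate_legacy_kwargs(
    {"compile_backend": "torch-compile", "free_mem_ratio": 0.8, "attn_page_size": 64}
)
# new_kwargs == {"attn_page_size": 64,
#                "transforms": {"resize_kv_cache": {"free_mem_ratio": 0.8},
#                               "compile_model": {"backend": "torch-compile"}}}
```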

examples/auto_deploy/.vscode/launch.json

Lines changed: 2 additions & 2 deletions

```diff
@@ -10,9 +10,9 @@
       "--model=meta-llama/Meta-Llama-3.1-8B-Instruct",
       "--args.world-size=2",
       "--args.runtime=demollm",
-      "--args.compile-backend=torch-simple",
+      "--args.transforms.compile-model.backend=torch-simple",
       "--args.attn-page-size=16",
-      "--args.attn-backend=flashinfer",
+      "--args.transforms.insert-cached-attention.backend=flashinfer",
       "--args.model-factory=AutoModelForCausalLM",
       "--benchmark.enabled=false",
       "--prompt.batch-size=2",
```
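The updated launch arguments address nested config fields through dotted, kebab-case keys such as `--args.transforms.compile-model.backend=torch-simple`. A minimal sketch of how one such key could expand into a nested dict (illustrative only; the actual CLI parsing in build_and_run_ad.py may differ, and values are kept as strings here):

```python
# Illustrative expansion of a dotted CLI override into a nested dict.
# Sketch only; not the real parser used by build_and_run_ad.py.
def parse_override(arg: str) -> dict:
    key, _, value = arg.lstrip("-").partition("=")
    # CLI keys use kebab-case; config field names use snake_case
    parts = [p.replace("-", "_") for p in key.split(".")]
    nested: dict = {parts[-1]: value}
    for part in reversed(parts[:-1]):
        nested = {part: nested}
    return nested

print(parse_override("--args.transforms.compile-model.backend=torch-simple"))
# {'args': {'transforms': {'compile_model': {'backend': 'torch-simple'}}}}
```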
