Commit 302fa1d

lucaslie authored and videodanchik committed

[None][doc] promote AutoDeploy to beta feature in docs (NVIDIA#10372)

Signed-off-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
Signed-off-by: Daniil Kulko <kulkodaniil@gmail.com>

1 parent 143205a commit 302fa1d

File tree

4 files changed: +15 −5 lines changed

.gitignore
Lines changed: 1 addition & 0 deletions

@@ -63,6 +63,7 @@ docs/source/**/*.rst
 .coverage.*
 results_trt/
 llm-test-workspace/
+ad-test-workspace/

 # build/debug
 *.safetensors

docs/source/features/auto_deploy/auto-deploy.md
Lines changed: 4 additions & 3 deletions

@@ -1,12 +1,13 @@
-# AutoDeploy (Prototype)
+# AutoDeploy (Beta)

 ```{note}
-This project is under active development and is currently in a prototype stage. The code is a prototype, subject to change, and may include backward-incompatible updates. While we strive for correctness, there are no guarantees regarding functionality, stability, or reliability.
+This project is under active development and is currently released as a beta feature. The code is
+subject to change and may include backward-incompatible updates.
 ```

 ## Seamless Model Deployment from PyTorch to TensorRT LLM

-AutoDeploy is a prototype designed to simplify and accelerate the deployment of PyTorch models, including off-the-shelf models such as those from the Hugging Face Transformers library, to TensorRT LLM.
+AutoDeploy is designed to simplify and accelerate the deployment of PyTorch models, including off-the-shelf models such as those from the Hugging Face Transformers library, to TensorRT LLM.

 ![AutoDeploy overview](../../media/ad_overview.png)
 <sub><em>AutoDeploy overview and relation with TensorRT LLM's LLM API</em></sub>

examples/auto_deploy/README.md
Lines changed: 2 additions & 1 deletion

@@ -334,4 +334,5 @@ the current progress in AutoDeploy and where you can help.

 ## Disclaimer

-This project is under active development and is currently in a prototype stage. The code is experimental, subject to change, and may include backward-incompatible updates. While we strive for correctness, there are no guarantees regarding functionality, stability, or reliability.
+This project is under active development and is currently released as a beta feature. The code is
+subject to change and may include backward-incompatible updates.

examples/auto_deploy/nemotron_flash.yaml
Lines changed: 8 additions & 1 deletion

@@ -5,7 +5,14 @@ max_num_tokens: 8192
 enable_chunked_prefill: true
 model_factory: NemotronFlashForCausalLM
 free_mem_ratio: 0.9
-cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64,96, 128, 256, 320, 384]
+cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 96, 128, 256, 320, 384]
 kv_cache_config:
   # disable kv_cache reuse since not supported for hybrid/ssm models
   enable_block_reuse: false
+transforms:
+  gather_logits_before_lm_head:
+    # TODO: fix https://github.com/NVIDIA/TensorRT-LLM/issues/9878 to enable by default
+    enabled: true
+  fuse_mamba_a_log:
+    stage: post_load_fusion
+    enabled: true
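To sanity-check what this hunk leaves behind, the config excerpt can be reconstructed and parsed. The sketch below is not part of the commit; it embeds the post-commit YAML (keys above line 5 of the file are not visible in the hunk and are omitted) and assumes PyYAML is available:

```python
import yaml  # PyYAML; assumed available for this sketch

# Excerpt of examples/auto_deploy/nemotron_flash.yaml as it reads
# after this commit, reconstructed from the hunk above.
CONFIG = """
max_num_tokens: 8192
enable_chunked_prefill: true
model_factory: NemotronFlashForCausalLM
free_mem_ratio: 0.9
cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 96, 128, 256, 320, 384]
kv_cache_config:
  # disable kv_cache reuse since not supported for hybrid/ssm models
  enable_block_reuse: false
transforms:
  gather_logits_before_lm_head:
    enabled: true
  fuse_mamba_a_log:
    stage: post_load_fusion
    enabled: true
"""

cfg = yaml.safe_load(CONFIG)

# The "64,96" -> "64, 96" change is cosmetic: YAML flow sequences accept
# both spellings, so the list parses to 13 integers either way.
sizes = cfg["cuda_graph_batch_sizes"]
assert sizes == sorted(sizes) and all(isinstance(s, int) for s in sizes)
assert cfg["transforms"]["fuse_mamba_a_log"]["stage"] == "post_load_fusion"
print(len(sizes))  # 13
```

Note that the commit adds the `transforms` section on top of the existing runtime settings rather than replacing them, so the earlier keys (`kv_cache_config` and friends) remain in effect.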
