From c2ae43710d35bfc9f3f0eae4026d691002ea928a Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Wed, 17 Dec 2025 17:40:35 -0800 Subject: [PATCH 01/10] [None][feat] Auto download speculative models from HF for pytorch backend, add speculative_model field alias Signed-off-by: Anish Shanbhag --- docs/source/features/speculative-decoding.md | 36 +++++- .../_tensorrt_engine/llm_eagle2_decoding.py | 4 +- .../_tensorrt_engine/llm_eagle_decoding.py | 4 +- .../_tensorrt_engine/llm_medusa_decoding.py | 4 +- examples/llm-api/llm_speculative_decoding.py | 2 +- examples/llm-api/quickstart_advanced.py | 6 +- examples/models/core/qwen/README.md | 6 +- .../_torch/auto_deploy/shim/ad_executor.py | 2 +- .../_torch/models/modeling_speculative.py | 4 +- .../_torch/pyexecutor/model_loader.py | 2 +- .../_torch/pyexecutor/py_executor_creator.py | 2 +- tensorrt_llm/llmapi/llm_args.py | 21 ++-- tensorrt_llm/llmapi/llm_utils.py | 29 +++-- .../accuracy/test_disaggregated_serving.py | 4 +- .../integration/defs/accuracy/test_llm_api.py | 4 +- .../defs/accuracy/test_llm_api_pytorch.py | 29 +++-- .../test_disaggregated_single_gpu.py | 2 +- .../serve/test_spec_decoding_metrics.py | 4 +- .../examples/test_ad_speculative_decoding.py | 6 +- .../defs/perf/pytorch_model_config.py | 2 +- .../integration/defs/perf/test_perf_sanity.py | 16 +-- tests/integration/defs/test_e2e.py | 2 +- .../gpt_oss_120b_fp4_grace_blackwell.yaml | 2 +- .../perf-sanity/run_benchmark_serve.py | 14 +-- .../singlegpu/test_ad_speculative_decoding.py | 17 +-- .../speculative/test_draft_len_schedule.py | 8 +- .../_torch/speculative/test_draft_target.py | 2 +- ...test_draft_token_prepare_for_generation.py | 2 +- .../test_draft_token_tree_sampling.py | 2 +- .../test_draft_token_tree_verification.py | 2 +- .../speculative/test_dynamic_spec_decode.py | 2 +- .../_torch/speculative/test_eagle3.py | 104 ++++++++++++------ .../_torch/speculative/test_kv_cache_reuse.py | 2 +- .../_torch/speculative/test_spec_gate.py | 2 +- tests/unittest/llmapi/test_llm.py | 8 +- tests/unittest/llmapi/test_llm_args.py | 13 +++ 36 files changed, 231 insertions(+), 140 deletions(-) diff --git a/docs/source/features/speculative-decoding.md b/docs/source/features/speculative-decoding.md index 089d7ecf3a7..64e52558b20 100644 --- a/docs/source/features/speculative-decoding.md +++ b/docs/source/features/speculative-decoding.md @@ -37,8 +37,13 @@ Draft/target is the simplest form of speculative decoding. In this approach, an ```python from tensorrt_llm.llmapi import DraftTargetDecodingConfig +# Option 1: Use a HuggingFace Hub model ID (auto-downloaded) speculative_config = DraftTargetDecodingConfig( - max_draft_len=3, speculative_model_dir="/path/to/draft_model") + max_draft_len=3, speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B") + +# Option 2: Use a local path +# speculative_config = DraftTargetDecodingConfig( +# max_draft_len=3, speculative_model="/path/to/draft_model") llm = LLM("/path/to/target_model", speculative_config=speculative_config, disable_overlap_scheduler=True) ``` @@ -51,18 +56,23 @@ TRT-LLM supports a modified version of the algorithm presented in the paper: tre The following draft model checkpoints can be used for EAGLE 3: * Llama 3 variants: [use the checkpoints from the authors of the original EAGLE 3 paper](https://huggingface.co/yuhuili). * Llama 4 Maverick: [use the checkpoint from the NVIDIA HuggingFace repository](https://huggingface.co/nvidia/Llama-4-Maverick-17B-128E-Eagle3). 
+* Other models, including `gpt-oss-120b` and `Qwen3`: check out the [Speculative Decoding Modules](https://huggingface.co/collections/nvidia/speculative-decoding-modules) collection from NVIDIA. ```python from tensorrt_llm.llmapi import EagleDecodingConfig # Enable to use the faster one-model implementation for Llama 4. eagle3_one_model = False +model = "meta-llama/Llama-3.1-8B-Instruct" +speculative_model = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" speculative_config = EagleDecodingConfig( - max_draft_len=3, speculative_model_dir="/path/to/draft_model", eagle3_one_model=eagle3_one_model) + max_draft_len=3, + speculative_model=speculative_model, + eagle3_one_model=eagle3_one_model) # Only need to disable overlap scheduler if eagle3_one_model is False. -llm = LLM("/path/to/target_model", speculative_config=speculative_config, disable_overlap_scheduler=True) +llm = LLM(model, speculative_config=speculative_config, disable_overlap_scheduler=True) ``` ### NGram @@ -137,7 +147,17 @@ Speculative decoding options must be specified via `--config config.yaml` for bo The rest of the argument names/valid values are the same as in their corresponding configuration class described in the Quick Start section. For example, a YAML configuration could look like this: +```yaml +# Using a HuggingFace Hub model ID (auto-downloaded) +disable_overlap_scheduler: true +speculative_config: + decoding_type: Eagle + max_draft_len: 4 + speculative_model: yuhuili/EAGLE3-LLaMA3.1-Instruct-8B ``` + +```yaml +# Or using a local path disable_overlap_scheduler: true speculative_config: decoding_type: Eagle @@ -145,6 +165,16 @@ speculative_config: speculative_model: /path/to/draft/model ``` +```{note} +The field name `speculative_model_dir` can also be used as an alias for `speculative_config.speculative_model`. For example: + + speculative_config: + decoding_type: Eagle + max_draft_len: 4 + speculative_model_dir: /path/to/draft/model +``` + + ## Developer Guide This section describes the components of a speculative decoding algorithm. All of the interfaces are defined in [`_torch/speculative/interface.py`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/speculative/interface.py). 
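
The alias behavior documented above can also be exercised directly from Python. The following is a minimal sketch, not part of the patch itself: it assumes this change is applied and that the config class accepts both the new field name and its legacy alias at construction time, as the examples and the `test_speculative_model_alias` unit test in this series imply. The EAGLE3 checkpoint ID is simply the one used elsewhere in these docs.

```python
from tensorrt_llm.llmapi import EagleDecodingConfig

# New spelling introduced by this change: accepts either a local path or a
# HuggingFace Hub model ID (auto-downloaded for the PyTorch backend).
cfg_new = EagleDecodingConfig(
    max_draft_len=4,
    speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
    eagle3_one_model=False,
)

# Legacy spelling still works through the Pydantic field alias, so existing
# Python code and YAML configs that use `speculative_model_dir` keep working.
cfg_legacy = EagleDecodingConfig(
    max_draft_len=4,
    speculative_model_dir="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
    eagle3_one_model=False,
)

# Both spellings populate the same underlying field.
assert cfg_new.speculative_model == cfg_legacy.speculative_model
```
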
diff --git a/examples/llm-api/_tensorrt_engine/llm_eagle2_decoding.py b/examples/llm-api/_tensorrt_engine/llm_eagle2_decoding.py index a1343cc5757..86b5ca28af4 100755 --- a/examples/llm-api/_tensorrt_engine/llm_eagle2_decoding.py +++ b/examples/llm-api/_tensorrt_engine/llm_eagle2_decoding.py @@ -23,12 +23,12 @@ def main(): model = "lmsys/vicuna-7b-v1.3" # The end user can customize the eagle decoding configuration by specifying the - # speculative_model_dir, max_draft_len, num_eagle_layers, max_non_leaves_per_layer, eagle_choices + # speculative_model, max_draft_len, num_eagle_layers, max_non_leaves_per_layer, eagle_choices # greedy_sampling,posterior_threshold, use_dynamic_tree and dynamic_tree_max_topK # with the EagleDecodingConfig class speculative_config = EagleDecodingConfig( - speculative_model_dir="yuhuili/EAGLE-Vicuna-7B-v1.3", + speculative_model="yuhuili/EAGLE-Vicuna-7B-v1.3", max_draft_len=63, num_eagle_layers=4, max_non_leaves_per_layer=10, diff --git a/examples/llm-api/_tensorrt_engine/llm_eagle_decoding.py b/examples/llm-api/_tensorrt_engine/llm_eagle_decoding.py index c66e15f6646..e6e89a622ee 100644 --- a/examples/llm-api/_tensorrt_engine/llm_eagle_decoding.py +++ b/examples/llm-api/_tensorrt_engine/llm_eagle_decoding.py @@ -23,12 +23,12 @@ def main(): model = "lmsys/vicuna-7b-v1.3" # The end user can customize the eagle decoding configuration by specifying the - # speculative_model_dir, max_draft_len, num_eagle_layers, max_non_leaves_per_layer, eagle_choices + # speculative_model, max_draft_len, num_eagle_layers, max_non_leaves_per_layer, eagle_choices # greedy_sampling,posterior_threshold, use_dynamic_tree and dynamic_tree_max_topK # with the EagleDecodingConfig class speculative_config = EagleDecodingConfig( - speculative_model_dir="yuhuili/EAGLE-Vicuna-7B-v1.3", + speculative_model="yuhuili/EAGLE-Vicuna-7B-v1.3", max_draft_len=63, num_eagle_layers=4, max_non_leaves_per_layer=10, diff --git a/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py b/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py index f45411b2336..d371600d00f 100644 --- a/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py +++ b/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py @@ -48,10 +48,10 @@ def run_medusa_decoding(use_modelopt_ckpt=False, model_dir=None): model = "lmsys/vicuna-7b-v1.3" # The end user can customize the medusa decoding configuration by specifying the - # speculative_model_dir, max_draft_len, medusa heads num and medusa choices + # speculative_model, max_draft_len, medusa heads num and medusa choices # with the MedusaDecodingConfig class speculative_config = MedusaDecodingConfig( - speculative_model_dir="FasterDecoding/medusa-vicuna-7b-v1.3", + speculative_model="FasterDecoding/medusa-vicuna-7b-v1.3", max_draft_len=63, num_medusa_heads=4, medusa_choices=[[0], [0, 0], [1], [0, 1], [2], [0, 0, 0], [1, 0], [0, 2], [3], [0, 3], [4], [0, 4], [2, 0], \ diff --git a/examples/llm-api/llm_speculative_decoding.py b/examples/llm-api/llm_speculative_decoding.py index 6d6e812db34..de33278a096 100644 --- a/examples/llm-api/llm_speculative_decoding.py +++ b/examples/llm-api/llm_speculative_decoding.py @@ -35,7 +35,7 @@ def run_MTP(model: Optional[str] = None): def run_Eagle3(): spec_config = EagleDecodingConfig( max_draft_len=3, - speculative_model_dir="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", + speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", eagle3_one_model=True) kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8) diff --git 
a/examples/llm-api/quickstart_advanced.py b/examples/llm-api/quickstart_advanced.py index abc8e48f61a..c0bb4e31bec 100644 --- a/examples/llm-api/quickstart_advanced.py +++ b/examples/llm-api/quickstart_advanced.py @@ -220,11 +220,11 @@ def setup_llm(args, **kwargs): relaxed_topk=args.relaxed_topk, relaxed_delta=args.relaxed_delta, mtp_eagle_one_model=args.use_one_model, - speculative_model_dir=args.model_dir) + speculative_model=args.model_dir) elif spec_decode_algo == "EAGLE3": spec_config = EagleDecodingConfig( max_draft_len=args.spec_decode_max_draft_len, - speculative_model_dir=args.draft_model_dir, + speculative_model=args.draft_model_dir, eagle3_one_model=args.use_one_model, eagle_choices=args.eagle_choices, use_dynamic_tree=args.use_dynamic_tree, @@ -234,7 +234,7 @@ def setup_llm(args, **kwargs): elif spec_decode_algo == "DRAFT_TARGET": spec_config = DraftTargetDecodingConfig( max_draft_len=args.spec_decode_max_draft_len, - speculative_model_dir=args.draft_model_dir) + speculative_model=args.draft_model_dir) elif spec_decode_algo == "NGRAM": spec_config = NGramDecodingConfig( max_draft_len=args.spec_decode_max_draft_len, diff --git a/examples/models/core/qwen/README.md b/examples/models/core/qwen/README.md index 5474e259690..566d4eab1ba 100644 --- a/examples/models/core/qwen/README.md +++ b/examples/models/core/qwen/README.md @@ -841,8 +841,8 @@ Qwen3 now supports Eagle3 (Speculative Decoding with Eagle3). To enable Eagle3 o Set the decoding type to "Eagle" to enable Eagle3 speculative decoding. - `speculative_config.max_draft_len: 3` Set the maximum number of draft tokens generated per step (this value can be adjusted as needed). -- `speculative_config.speculative_model_dir: ` - Specify the path to the Eagle3 draft model (ensure the corresponding draft model weights are prepared). +- `speculative_config.speculative_model: ` + Specify the Eagle3 draft model either as a Huggingface model ID or a local path. You can find ready-to-use Eagle3 draft models at https://huggingface.co/collections/nvidia/speculative-decoding-modules. 
Currently, there are some limitations when enabling Eagle3: @@ -857,7 +857,7 @@ enable_attention_dp: false speculative_config: decoding_type: Eagle max_draft_len: 3 - speculative_model_dir: + speculative_model: kv_cache_config: enable_block_reuse: false " >> ${path_config} diff --git a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py index 9f6b885d3b4..5055514014e 100644 --- a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py +++ b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py @@ -921,7 +921,7 @@ def create_draft_model_engine_maybe( drafting_loop_wrapper = None draft_model_engine = PyTorchModelEngine( - model_path=draft_spec_config.speculative_model_dir, + model_path=draft_spec_config.speculative_model, llm_args=draft_llm_args, mapping=dist_mapping, attn_runtime_features=attn_runtime_features, diff --git a/tensorrt_llm/_torch/models/modeling_speculative.py b/tensorrt_llm/_torch/models/modeling_speculative.py index dc4b3b1d545..7c3fb20a5d1 100755 --- a/tensorrt_llm/_torch/models/modeling_speculative.py +++ b/tensorrt_llm/_torch/models/modeling_speculative.py @@ -887,7 +887,7 @@ def __init__(self, model: TModel, model_config: ModelConfig[TConfig]): from tensorrt_llm._torch.models.checkpoints.mistral.config_loader import \ MistralConfigLoader self.draft_config = MistralConfigLoader().load( - spec_config.speculative_model_dir, + spec_config.speculative_model, mapping=model_config.mapping, moe_backend=model_config.moe_backend, moe_max_num_tokens=model_config.moe_max_num_tokens, @@ -898,7 +898,7 @@ def __init__(self, model: TModel, model_config: ModelConfig[TConfig]): self.draft_config.extra_attrs = model_config.extra_attrs elif spec_config.eagle3_model_arch == "llama3": self.draft_config = ModelConfig.from_pretrained( - model_config.spec_config.speculative_model_dir, + model_config.spec_config.speculative_model, trust_remote_code=True, attn_backend=model_config.attn_backend, moe_backend=model_config.moe_backend, diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py index 4756e24d082..cc44248ebcc 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_loader.py +++ b/tensorrt_llm/_torch/pyexecutor/model_loader.py @@ -278,7 +278,7 @@ def init_meta_tensor(t: torch.Tensor): if self.spec_config is not None and self.spec_config.spec_dec_mode.need_load_draft_weights( ): weights = checkpoint_loader.load_weights( - self.spec_config.speculative_model_dir, + self.spec_config.speculative_model, mapping=self.mapping) draft_model_arch = model.draft_config.pretrained_config.architectures[ diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py index bd1857dda27..493f542f4fd 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py @@ -398,7 +398,7 @@ def drafting_loop_wrapper(model): draft_llm_args.load_format = LoadFormat.DUMMY draft_model_engine = PyTorchModelEngine( - model_path=spec_config.speculative_model_dir, + model_path=spec_config.speculative_model, llm_args=draft_llm_args, mapping=mapping, attn_runtime_features=attn_runtime_features, diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 3f15252b84f..6f65d77d92c 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -651,7 +651,12 @@ class DecodingBaseConfig(StrictBaseModel): # If it's a static or dynamic tree, each draft layer may generate 
more than one draft token. # In this case, max_total_draft_tokens >= max_draft_len. max_total_draft_tokens: Optional[int] = None - speculative_model_dir: Optional[Union[str, Path]] = None + # The speculative (draft) model. Accepts either: + # - A HuggingFace Hub model ID (str), e.g., "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" + # which will be automatically downloaded. + # - A local filesystem path to a downloaded model directory. + speculative_model: Optional[Union[str, Path]] = Field( + default=None, alias="speculative_model_dir") # PyTorch only. # When specified, speculation will be disabled at batch sizes above @@ -918,7 +923,7 @@ def from_dict(cls, data: dict): decoding_type: ClassVar[str] = "Eagle" def validate(self) -> None: - if self.speculative_model_dir is None: + if self.speculative_model is None: raise ValueError("Draft model must be provided for EAGLE") def check_eagle_choices(self): @@ -2132,7 +2137,7 @@ def model_format(self) -> _ModelFormatKind: return self._model_format @property - def speculative_model_dir(self) -> Optional[_ModelFormatKind]: + def speculative_model(self) -> Optional[str]: return self._speculative_model @property @@ -2508,7 +2513,7 @@ def validate_speculative_config(self): elif isinstance(self.speculative_config, EagleDecodingConfig): assert self.speculative_config.max_draft_len > 0 - assert self.speculative_config.speculative_model_dir is not None, "Path to EAGLE3 weights must be specified." + assert self.speculative_config.speculative_model is not None, "EAGLE3 draft model must be specified." self.build_config.max_draft_len = self.speculative_config.max_draft_len self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.EAGLE eagle_config = _EagleConfig( @@ -2529,7 +2534,7 @@ def validate_speculative_config(self): self.decoding_config = None self._speculative_model = getattr(self.speculative_config, - "speculative_model_dir", None) + "speculative_model", None) speculative_model_obj = _ModelWrapper( self._speculative_model ) if self._speculative_model is not None else None @@ -3025,12 +3030,12 @@ def validate_speculative_config(self): if isinstance(self.speculative_config, EagleDecodingConfig): assert self.speculative_config.max_draft_len > 0 - assert self.speculative_config.speculative_model_dir is not None, "Path to EAGLE3 weights must be specified." + assert self.speculative_config.speculative_model is not None, "EAGLE3 draft model must be specified." elif isinstance(self.speculative_config, NGramDecodingConfig): assert self.speculative_config.max_draft_len > 0 and self.speculative_config.max_matching_ngram_size > 0 elif isinstance(self.speculative_config, DraftTargetDecodingConfig): assert self.speculative_config.max_draft_len > 0 - assert self.speculative_config.speculative_model_dir is not None, "Path to draft model must be specified." + assert self.speculative_config.speculative_model is not None, "Draft model must be specified." 
elif isinstance(self.speculative_config, MTPDecodingConfig): assert self.speculative_config.num_nextn_predict_layers > 0 self.speculative_config.max_draft_len = self.speculative_config.num_nextn_predict_layers @@ -3058,7 +3063,7 @@ def validate_speculative_config(self): self.decoding_config = None self._speculative_model = getattr(self.speculative_config, - "speculative_model_dir", None) + "speculative_model", None) speculative_model_obj = _ModelWrapper( self._speculative_model ) if self._speculative_model is not None else None diff --git a/tensorrt_llm/llmapi/llm_utils.py b/tensorrt_llm/llmapi/llm_utils.py index fc1647a8070..4efb06dd56f 100644 --- a/tensorrt_llm/llmapi/llm_utils.py +++ b/tensorrt_llm/llmapi/llm_utils.py @@ -109,8 +109,8 @@ def __init__(self, self.model_obj = _ModelWrapper(self.llm_args.model) self.speculative_model_obj = _ModelWrapper( - self.llm_args.speculative_model_dir - ) if self.llm_args.speculative_model_dir is not None else None + self.llm_args.speculative_model + ) if self.llm_args.speculative_model is not None else None if isinstance(self.llm_args, TrtLlmArgs): self.convert_checkpoint_options = self.llm_args._convert_checkpoint_options @@ -440,8 +440,8 @@ def _load_model_from_hf(self): model_cls = AutoModelForCausalLM.get_trtllm_model_class( self._model_dir, self.llm_args.trust_remote_code, self.llm_args.decoding_config.decoding_mode - if hasattr(self.llm_args, "speculative_model_dir") - and self.llm_args.speculative_model_dir else None) + if hasattr(self.llm_args, "speculative_model") + and self.llm_args.speculative_model else None) prequantized = self._update_from_hf_quant_config() @@ -643,15 +643,26 @@ def __call__(self) -> Tuple[Path, Union[Path, None]]: if self.llm_args.model_format is _ModelFormatKind.TLLM_ENGINE: return Path(self.llm_args.model), None - if self.llm_args.backend == "_autodeploy": - return None, "" - self.engine_cache_stage: Optional[CachedStage] = None - self._hf_model_dir = None - self.model_loader = ModelLoader(self.llm_args) + # Download speculative model from HuggingFace if needed + if (self.model_loader.speculative_model_obj is not None + and self.model_loader.speculative_model_obj.is_hub_model): + spec_model_dirs = self._submit_to_all_workers( + CachedModelLoader._node_download_hf_model, + model=self.model_loader.speculative_model_obj.model_name, + revision=None) + spec_model_dir = spec_model_dirs[0] + self.model_loader.speculative_model_obj.model_dir = spec_model_dir + # Update llm_args so PyTorch/AutoDeploy executor gets the local path + if self.llm_args.speculative_config is not None: + self.llm_args.speculative_config.speculative_model = spec_model_dir + + if self.llm_args.backend == "_autodeploy": + return None, "" + if self.llm_args.backend is not None: if self.llm_args.backend not in ["pytorch", "_autodeploy"]: raise ValueError( diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 2ba2ee1bfee..1350ecb774c 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -576,7 +576,7 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model): speculative_decoding_config = { "decoding_type": "Eagle", "max_draft_len": 4, - "speculative_model_dir": + "speculative_model": f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B", "eagle3_one_model": eagle3_one_model } @@ -675,7 +675,7 @@ def test_guided_decoding_with_eagle3(self, backend: str, 
speculative_decoding_config = { "decoding_type": "Eagle", "max_draft_len": 3, - "speculative_model_dir": + "speculative_model": f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B", "eagle3_one_model": eagle3_one_model } diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py index e019572ada0..a304a47edb2 100644 --- a/tests/integration/defs/accuracy/test_llm_api.py +++ b/tests/integration/defs/accuracy/test_llm_api.py @@ -471,7 +471,7 @@ class TestEagleVicuna_7B_v1_3(LlmapiAccuracyTestHarness): speculative_config = EagleDecodingConfig( max_draft_len=63, - speculative_model_dir=f"{llm_models_root()}/EAGLE-Vicuna-7B-v1.3", + speculative_model=f"{llm_models_root()}/EAGLE-Vicuna-7B-v1.3", num_eagle_layers=4, max_non_leaves_per_layer=10, eagle_choices=[[0], [0, 0], [1], [0, 1], [2], [0, 0, 0], [1, 0], [0, 2], [3], [0, 3], [4], [0, 4], [2, 0], \ @@ -497,7 +497,7 @@ class TestEagle2Vicuna_7B_v1_3(LlmapiAccuracyTestHarness): speculative_config = EagleDecodingConfig( max_draft_len=63, - speculative_model_dir=f"{llm_models_root()}/EAGLE-Vicuna-7B-v1.3", + speculative_model=f"{llm_models_root()}/EAGLE-Vicuna-7B-v1.3", num_eagle_layers=4, max_non_leaves_per_layer=10, use_dynamic_tree=True, diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 1a32e333b5a..6ef4915c2cf 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -276,7 +276,7 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model, draft_len = 4 spec_config = EagleDecodingConfig(max_draft_len=draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=eagle3_one_model) with LLM(model=target_model_dir, @@ -369,8 +369,7 @@ def test_guided_decoding_with_eagle3(self, backend: str, cuda_graph_config = CudaGraphConfig(enable_padding=True) spec_config = EagleDecodingConfig( max_draft_len=3, - speculative_model_dir= - f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B", + speculative_model=f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B", eagle3_one_model=eagle3_one_model) llm = LLM( self.MODEL_PATH, @@ -621,7 +620,7 @@ def test_fp8_eagle3_tp8(self, eagle3_one_model, torch_compile): eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.3-Instruct-70B" kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6) spec_config = EagleDecodingConfig(max_draft_len=3, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=eagle3_one_model) torch_compile_config = _get_default_torch_compile_config(torch_compile) pytorch_config = dict( @@ -1383,7 +1382,7 @@ def test_bfloat16_2_model_mtp(self): ) mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3, mtp_eagle_one_model=False, - speculative_model_dir=self.MODEL_PATH) + speculative_model=self.MODEL_PATH) with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config, enable_chunked_prefill=False, @@ -2935,7 +2934,7 @@ def test_nvfp4_2_model_mtp(self, tp_size, cuda_graph, overlap_scheduler, mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3, mtp_eagle_one_model=False, - speculative_model_dir=model_path) + speculative_model=model_path) with LLM(model_path, max_batch_size=max_batch_size, @@ -3441,7 +3440,7 @@ def test_eagle3(self, enable_chunked_prefill, eagle3_one_model): draft_len = 4 spec_config = EagleDecodingConfig(max_draft_len=draft_len, - speculative_model_dir=eagle_model_dir, + 
speculative_model=eagle_model_dir, eagle3_one_model=eagle3_one_model) llm = LLM(model=target_model_dir, @@ -3812,7 +3811,7 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, if eagle3: spec_config = EagleDecodingConfig( max_draft_len=2, - speculative_model_dir= + speculative_model= f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/", eagle3_one_model=True) with LLM( @@ -3860,7 +3859,7 @@ def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp, if eagle3: spec_config = EagleDecodingConfig( max_draft_len=2, - speculative_model_dir= + speculative_model= f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/", eagle3_one_model=True) with LLM( @@ -4479,7 +4478,7 @@ def test_eagle3_4gpus(self, moe_backend, one_model, overlap_scheduler, eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3" draft_len = 3 spec_config = EagleDecodingConfig(max_draft_len=draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=one_model, allow_advanced_sampling=True) @@ -4545,7 +4544,7 @@ def test_eagle3_vswa_reuse_4gpus(self, one_model, mocker): eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3" draft_len = 3 spec_config = EagleDecodingConfig(max_draft_len=draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=one_model, allow_advanced_sampling=True) @@ -4609,7 +4608,7 @@ def test_eagle3_guided_decoding_4gpus(self, one_model, mocker): eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3" draft_len = 3 spec_config = EagleDecodingConfig(max_draft_len=draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=one_model, allow_advanced_sampling=True) @@ -4668,7 +4667,7 @@ def test_eagle3_2gpus(self, moe_backend, one_model, overlap_scheduler, eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3" draft_len = 3 spec_config = EagleDecodingConfig(max_draft_len=draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=one_model) max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN @@ -5147,7 +5146,7 @@ def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp, if eagle3: spec_config = EagleDecodingConfig( max_draft_len=2, - speculative_model_dir= + speculative_model= f"{llm_models_root()}/Mistral-Large-3-675B/Mistral-Large-3-675B-Instruct-2512-Eagle/", eagle3_one_model=True, eagle3_model_arch="mistral_large3") @@ -5198,7 +5197,7 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, if eagle3: spec_config = EagleDecodingConfig( max_draft_len=2, - speculative_model_dir= + speculative_model= f"{llm_models_root()}/Mistral-Large-3-675B/Mistral-Large-3-675B-Instruct-2512-Eagle/", eagle3_one_model=True, eagle3_model_arch="mistral_large3") diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py index d6b63d3ab3c..4e146f3df0e 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py +++ b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py @@ -400,7 +400,7 @@ def test_disaggregated_spec_dec_batch_slot_limit(model, spec_dec_model_path, # Test whether the batch slots are properly released when using speculative decoding # with disaggregated serving. 
spec_dec_config = EagleDecodingConfig( - speculative_model_dir=model_path(spec_dec_model_path), + speculative_model=model_path(spec_dec_model_path), eagle3_one_model=eagle3_one_model, max_draft_len=3) diff --git a/tests/integration/defs/examples/serve/test_spec_decoding_metrics.py b/tests/integration/defs/examples/serve/test_spec_decoding_metrics.py index a888f50e333..69bebfcfb0d 100644 --- a/tests/integration/defs/examples/serve/test_spec_decoding_metrics.py +++ b/tests/integration/defs/examples/serve/test_spec_decoding_metrics.py @@ -93,7 +93,7 @@ def test_spec_decoding_metrics_eagle3_one_model(): "speculative_config": { "decoding_type": "Eagle", "max_draft_len": 4, - "speculative_model_dir": eagle3_path, + "speculative_model": eagle3_path, "eagle3_one_model": True, }, } @@ -174,7 +174,7 @@ def test_spec_decoding_metrics_eagle3_two_model(): "speculative_config": { "decoding_type": "Eagle", "max_draft_len": 4, - "speculative_model_dir": eagle3_path, + "speculative_model": eagle3_path, "eagle3_one_model": False, # Two-model mode }, } diff --git a/tests/integration/defs/examples/test_ad_speculative_decoding.py b/tests/integration/defs/examples/test_ad_speculative_decoding.py index ddc785841e6..e1492a01535 100644 --- a/tests/integration/defs/examples/test_ad_speculative_decoding.py +++ b/tests/integration/defs/examples/test_ad_speculative_decoding.py @@ -52,14 +52,14 @@ def get_model_paths(): def make_draft_target_config(spec_model_path: str): return DraftTargetDecodingConfig( - max_draft_len=DRAFT_TARGET_MAX_DRAFT_LEN, speculative_model_dir=spec_model_path + max_draft_len=DRAFT_TARGET_MAX_DRAFT_LEN, speculative_model=spec_model_path ) def make_eagle3_config(spec_model_path: str): return EagleDecodingConfig( max_draft_len=EAGLE_MAX_DRAFT_LEN, - speculative_model_dir=spec_model_path, + speculative_model=spec_model_path, eagle3_one_model=False, eagle3_layers_to_capture=None, ) @@ -216,7 +216,7 @@ def test_autodeploy_eagle3_acceptance_rate(): # Configure Eagle3 speculative decoding speculative_config = EagleDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=eagle_model, + speculative_model=eagle_model, eagle3_one_model=False, eagle3_layers_to_capture=None, ) diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py index 2e8ae2bb0db..ca2f90fe8e5 100644 --- a/tests/integration/defs/perf/pytorch_model_config.py +++ b/tests/integration/defs/perf/pytorch_model_config.py @@ -223,7 +223,7 @@ def get_model_yaml_config(model_label: str, 'speculative_config': { 'decoding_type': 'Eagle', 'eagle3_one_model': True, - 'speculative_model_dir': 'Qwen3-4B_eagle3', + 'speculative_model': 'Qwen3-4B_eagle3', 'max_draft_len': 3, }, 'kv_cache_config': { diff --git a/tests/integration/defs/perf/test_perf_sanity.py b/tests/integration/defs/perf/test_perf_sanity.py index 7bad9cf7f40..5d3d38d68d7 100644 --- a/tests/integration/defs/perf/test_perf_sanity.py +++ b/tests/integration/defs/perf/test_perf_sanity.py @@ -209,7 +209,7 @@ def __init__(self, server_config_data: dict, env_vars: str = ""): else: self.eagle3_layers_to_capture = [] self.max_draft_len = speculative_config.get("max_draft_len", 0) - self.speculative_model_dir = speculative_config.get("speculative_model_dir", "") + self.speculative_model = speculative_config.get("speculative_model", "") # match_mode: "config" (default) or "scenario" self.match_mode = server_config_data.get("match_mode", "config") @@ -333,7 +333,7 @@ def to_db_data(self) -> dict: "l_num_nextn_predict_layers": 
self.num_nextn_predict_layers, "s_eagle3_layers_to_capture": ",".join(map(str, self.eagle3_layers_to_capture)), "l_max_draft_len": self.max_draft_len, - "s_speculative_model_dir": self.speculative_model_dir, + "s_speculative_model_dir": self.speculative_model, "s_server_log_link": "", "s_server_env_var": self.env_vars, } @@ -343,15 +343,15 @@ def generate_extra_llm_api_config(self) -> str: """Generate extra-llm-api-config.yml content.""" config_data = dict(self.extra_llm_api_config_data) - # Handle speculative_model_dir path conversion + # Handle speculative_model path conversion if ( "speculative_config" in config_data - and "speculative_model_dir" in config_data["speculative_config"] + and "speculative_model" in config_data["speculative_config"] ): - spec_model_dir = config_data["speculative_config"]["speculative_model_dir"] - if spec_model_dir: - config_data["speculative_config"]["speculative_model_dir"] = os.path.join( - llm_models_root(), spec_model_dir + spec_model = config_data["speculative_config"]["speculative_model"] + if spec_model: + config_data["speculative_config"]["speculative_model"] = os.path.join( + llm_models_root(), spec_model ) return yaml.dump(config_data, default_flow_style=False, sort_keys=False) diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index b55eeb8359d..dc3a385d3df 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -3378,7 +3378,7 @@ def test_eagle3_output_consistency_4gpus(model_dir: str, draft_model_dir: str): # Run with Eagle3 spec_config = EagleDecodingConfig( max_draft_len=3, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=True, ) with LLM(**llm_common_config, speculative_config=spec_config) as llm_spec: diff --git a/tests/scripts/perf-sanity/gpt_oss_120b_fp4_grace_blackwell.yaml b/tests/scripts/perf-sanity/gpt_oss_120b_fp4_grace_blackwell.yaml index cd346ac25f6..d5993c46deb 100644 --- a/tests/scripts/perf-sanity/gpt_oss_120b_fp4_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/gpt_oss_120b_fp4_grace_blackwell.yaml @@ -146,7 +146,7 @@ server_configs: decoding_type: 'Eagle' eagle3_layers_to_capture: [-1] max_draft_len: 3 - speculative_model_dir: "gpt_oss/gpt-oss-120b-Eagle3" + speculative_model: "gpt_oss/gpt-oss-120b-Eagle3" stream_interval: 20 num_postprocess_workers: 4 client_configs: diff --git a/tests/scripts/perf-sanity/run_benchmark_serve.py b/tests/scripts/perf-sanity/run_benchmark_serve.py index 3f16f7273cd..627b5d980dd 100644 --- a/tests/scripts/perf-sanity/run_benchmark_serve.py +++ b/tests/scripts/perf-sanity/run_benchmark_serve.py @@ -218,7 +218,7 @@ def str_to_bool(value: str) -> bool: SPECULATIVE_CONFIG_METRICS = { "decoding_type": (True, str), "max_draft_len": (True, int), - "speculative_model_dir": (True, str), + "speculative_model": (True, str), "eagle3_one_model": (True, str_to_bool), } @@ -259,7 +259,7 @@ def __init__( enable_padding: bool = True, decoding_type: str = "", max_draft_len: int = 0, - speculative_model_dir: str = "", + speculative_model: str = "", eagle3_one_model: bool = False, ): self.name = name @@ -285,7 +285,7 @@ def __init__( self.enable_padding = enable_padding self.decoding_type = decoding_type self.max_draft_len = max_draft_len - self.speculative_model_dir = speculative_model_dir + self.speculative_model = speculative_model self.eagle3_one_model = eagle3_one_model model_dir = get_model_dir(self.model_name) @@ -345,9 +345,9 @@ def generate_extra_llm_api_config(self) -> str: 
config_lines.append(f" decoding_type: {self.decoding_type}") if self.max_draft_len > 0: config_lines.append(f" max_draft_len: {self.max_draft_len}") - if self.speculative_model_dir: + if self.speculative_model: config_lines.append( - f" speculative_model_dir: {self.speculative_model_dir}") + f" speculative_model: {self.speculative_model}") if self.eagle3_one_model: config_lines.append( f" eagle3_one_model: {str(self.eagle3_one_model).lower()}") @@ -500,8 +500,8 @@ def parse_config_file(config_file_path: str, select_pattern: str = None): {}).get('decoding_type', ''), max_draft_len=server_config_data.get('speculative_config', {}).get('max_draft_len', 0), - speculative_model_dir=server_config_data.get( - 'speculative_config', {}).get('speculative_model_dir', ''), + speculative_model=server_config_data.get( + 'speculative_config', {}).get('speculative_model', ''), eagle3_one_model=server_config_data.get( 'speculative_config', {}).get('eagle3_one_model', False)) diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py index 76825732029..e40b25984c4 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py @@ -13,13 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pytest from _model_test_utils import get_small_model_config from build_and_run_ad import ExperimentConfig, main from tensorrt_llm.llmapi import DraftTargetDecodingConfig, KvCacheConfig -def test_ad_speculative_decoding_smoke(): +@pytest.mark.parametrize("use_hf_speculative_model", [False, True]) +def test_ad_speculative_decoding_smoke(use_hf_speculative_model: bool): """Test speculative decoding with AutoDeploy using the build_and_run_ad main().""" # Use a simple test prompt @@ -27,15 +29,14 @@ def test_ad_speculative_decoding_smoke(): # Get base model config experiment_config = get_small_model_config("meta-llama/Meta-Llama-3.1-8B-Instruct") - speculative_model_dir = get_small_model_config("TinyLlama/TinyLlama-1.1B-Chat-v1.0")["args"][ - "model" - ] + speculative_model_hf_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + if use_hf_speculative_model: + speculative_model = speculative_model_hf_id + else: + speculative_model = get_small_model_config(speculative_model_hf_id)["args"]["model"] - print(f"Speculative model path: {speculative_model_dir}") # Configure speculative decoding with a draft model - spec_config = DraftTargetDecodingConfig( - max_draft_len=3, speculative_model_dir=speculative_model_dir - ) + spec_config = DraftTargetDecodingConfig(max_draft_len=3, speculative_model=speculative_model) # Configure KV cache kv_cache_config = KvCacheConfig( diff --git a/tests/unittest/_torch/speculative/test_draft_len_schedule.py b/tests/unittest/_torch/speculative/test_draft_len_schedule.py index dc4aa577646..e64ca7fa538 100644 --- a/tests/unittest/_torch/speculative/test_draft_len_schedule.py +++ b/tests/unittest/_torch/speculative/test_draft_len_schedule.py @@ -77,7 +77,7 @@ def test_correctness_across_batch_sizes(drafter_type: str, schedule: dict): else: spec_config = DraftTargetDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=str(draft_model), + speculative_model=str(draft_model), draft_len_schedule=schedule, ) @@ -123,7 +123,7 @@ def test_correctness_across_batch_sizes(drafter_type: str, schedule: dict): else: 
spec_config_fixed = DraftTargetDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=str(draft_model), + speculative_model=str(draft_model), draft_len_schedule=None, # No schedule - fixed draft length ) llm_fixed = LLM(**llm_common_config, speculative_config=spec_config_fixed) @@ -186,9 +186,7 @@ def test_draft_len_schedule_functionality(drafter_type: str, draft_schedule: dic else: spec_config = DraftTargetDecodingConfig( max_draft_len=5, - speculative_model_dir=str( - llm_models_root() / "llama-3.2-models" / "Llama-3.2-3B-Instruct" - ), + speculative_model=str(llm_models_root() / "llama-3.2-models" / "Llama-3.2-3B-Instruct"), draft_len_schedule=draft_schedule, ) prompts = ["The capital of France is" for i in range(7)] diff --git a/tests/unittest/_torch/speculative/test_draft_target.py b/tests/unittest/_torch/speculative/test_draft_target.py index 9aaa81e8375..6ba477051fd 100644 --- a/tests/unittest/_torch/speculative/test_draft_target.py +++ b/tests/unittest/_torch/speculative/test_draft_target.py @@ -45,7 +45,7 @@ def test_llama_draft_target(use_cuda_graph: bool, attn_backend: str): spec_config = DraftTargetDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=draft_model_dir, + speculative_model=draft_model_dir, ) prompts = [ diff --git a/tests/unittest/_torch/speculative/test_draft_token_prepare_for_generation.py b/tests/unittest/_torch/speculative/test_draft_token_prepare_for_generation.py index 4a75e1b6f4a..352d6b743f7 100644 --- a/tests/unittest/_torch/speculative/test_draft_token_prepare_for_generation.py +++ b/tests/unittest/_torch/speculative/test_draft_token_prepare_for_generation.py @@ -87,7 +87,7 @@ def run_test( spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, max_total_draft_tokens=max_total_draft_tokens, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=False, eagle_choices=eagle_choices, use_dynamic_tree=use_dynamic_tree, diff --git a/tests/unittest/_torch/speculative/test_draft_token_tree_sampling.py b/tests/unittest/_torch/speculative/test_draft_token_tree_sampling.py index 6002d9d6856..689da99cf2b 100644 --- a/tests/unittest/_torch/speculative/test_draft_token_tree_sampling.py +++ b/tests/unittest/_torch/speculative/test_draft_token_tree_sampling.py @@ -38,7 +38,7 @@ def run_test(max_batch_size, draft_layer_id, max_total_draft_tokens, spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, max_total_draft_tokens=max_total_draft_tokens, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=False, eagle_choices=eagle_choices, use_dynamic_tree=use_dynamic_tree, diff --git a/tests/unittest/_torch/speculative/test_draft_token_tree_verification.py b/tests/unittest/_torch/speculative/test_draft_token_tree_verification.py index 8994d90ed61..29a19a04ccc 100644 --- a/tests/unittest/_torch/speculative/test_draft_token_tree_verification.py +++ b/tests/unittest/_torch/speculative/test_draft_token_tree_verification.py @@ -23,7 +23,7 @@ def run_test(eagle_model_dir, max_seq_len, beam_width, use_dynamic_tree, spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, max_total_draft_tokens=max_total_draft_tokens, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=False, eagle_choices=eagle_choices, use_dynamic_tree=use_dynamic_tree, diff --git a/tests/unittest/_torch/speculative/test_dynamic_spec_decode.py b/tests/unittest/_torch/speculative/test_dynamic_spec_decode.py index eeb975bd800..eaa215c81e0 
100644 --- a/tests/unittest/_torch/speculative/test_dynamic_spec_decode.py +++ b/tests/unittest/_torch/speculative/test_dynamic_spec_decode.py @@ -56,7 +56,7 @@ def test_dynamic_spec_decode(enforce_single_worker, spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, # Llama 3 does not support one model eagle. eagle3_one_model=False, ) diff --git a/tests/unittest/_torch/speculative/test_eagle3.py b/tests/unittest/_torch/speculative/test_eagle3.py index a459ae718ff..8502c4d5760 100644 --- a/tests/unittest/_torch/speculative/test_eagle3.py +++ b/tests/unittest/_torch/speculative/test_eagle3.py @@ -92,48 +92,82 @@ def test_kv_lens_runtime_with_eagle3_one_model(): @pytest.mark.parametrize( - "use_cuda_graph,attn_backend,disable_overlap_scheduler,enable_block_reuse,use_one_model,enable_chunked_prefill,use_chain_drafter,multi_batch,attention_dp", + "use_cuda_graph,attn_backend,disable_overlap_scheduler,enable_block_reuse,use_one_model,enable_chunked_prefill,use_chain_drafter,multi_batch,attention_dp,use_hf_speculative_model", [ - [True, "TRTLLM", True, False, False, False, True, False, False], - [True, "TRTLLM", True, False, False, False, False, False, False], - [False, "TRTLLM", True, False, False, False, True, False, False], - [False, "TRTLLM", True, False, False, False, False, False, False], - [True, "FLASHINFER", True, False, False, False, True, False, False], - [False, "FLASHINFER", True, False, False, False, True, False, False], - [False, "TRTLLM", False, True, True, False, True, False, False], - [True, "TRTLLM", False, True, True, False, True, False, False], - [True, "TRTLLM", True, False, True, True, True, False, False], - [True, "TRTLLM", True, False, True, False, True, False, False], - [True, "TRTLLM", True, False, False, True, True, False, False], - [True, "TRTLLM", False, False, False, False, True, False, False], - [False, "TRTLLM", False, False, False, False, True, False, False], - [True, "TRTLLM", False, False, False, False, False, True, False], - [True, "TRTLLM", False, False, False, False, False, True, True], - [False, "TRTLLM", False, False, False, False, False, True, False], - [True, "TRTLLM", False, False, False, False, True, True, False], - [False, "TRTLLM", False, False, False, False, True, True, False], - [True, "TRTLLM", False, False, False, False, False, False, False], - [False, "TRTLLM", False, False, False, False, False, False, False], - [True, "TRTLLM", False, False, False, True, True, False, False], - [True, "TRTLLM", False, False, False, True, False, False, False], - [True, "FLASHINFER", False, False, False, False, True, False, False], - [False, "FLASHINFER", False, False, False, False, True, False, False], + [True, "TRTLLM", True, False, False, False, True, False, False, False], + [True, "TRTLLM", True, False, False, False, False, False, False, False], + [False, "TRTLLM", True, False, False, False, True, False, False, False], + [ + False, "TRTLLM", True, False, False, False, False, False, False, + False + ], + [ + True, "FLASHINFER", True, False, False, False, True, False, False, + False + ], + [ + False, "FLASHINFER", True, False, False, False, True, False, False, + False + ], + [False, "TRTLLM", False, True, True, False, True, False, False, False], + [True, "TRTLLM", False, True, True, False, True, False, False, False], + [True, "TRTLLM", True, False, True, True, True, False, False, False], + [True, "TRTLLM", True, False, True, False, True, False, False, False], + [True, 
"TRTLLM", True, False, False, True, True, False, False, False], + [True, "TRTLLM", False, False, False, False, True, False, False, False], + [ + False, "TRTLLM", False, False, False, False, True, False, False, + False + ], + [True, "TRTLLM", False, False, False, False, False, True, False, False], + [True, "TRTLLM", False, False, False, False, False, True, True, False], + [ + False, "TRTLLM", False, False, False, False, False, True, False, + False + ], + [True, "TRTLLM", False, False, False, False, True, True, False, False], + [False, "TRTLLM", False, False, False, False, True, True, False, False], + [ + True, "TRTLLM", False, False, False, False, False, False, False, + False + ], + [ + False, "TRTLLM", False, False, False, False, False, False, False, + False + ], + [True, "TRTLLM", False, False, False, True, True, False, False, False], + [True, "TRTLLM", False, False, False, True, False, False, False, False], + [ + True, "FLASHINFER", False, False, False, False, True, False, False, + False + ], + [ + False, "FLASHINFER", False, False, False, False, True, False, False, + False + ], + # HF download variant - tests speculative model auto-download from HuggingFace Hub + [False, "TRTLLM", True, False, False, False, True, False, False, True], ]) @pytest.mark.high_cuda_memory def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str, disable_overlap_scheduler: bool, enable_block_reuse: bool, use_one_model: bool, enable_chunked_prefill: bool, use_chain_drafter: bool, multi_batch: bool, - attention_dp: bool, request): + attention_dp: bool, use_hf_speculative_model: bool, + request): # Eagle3 one model works with overlap scheduler and block reuse. total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9 if total_mem_gb < 35: pytest.skip("Not enough memory to load target + draft model") models_path = llm_models_root() - eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B" target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct" + if use_hf_speculative_model: + eagle_model = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" + else: + eagle_model = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B" + # bs > 1 gives non-deterministic when doing IFB. There are slight chances # that ref and spec does not match 100% max_batch_size = 4 if multi_batch else 1 @@ -165,7 +199,7 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str, spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model, # Llama 3 does not support one model eagle. eagle3_one_model=use_one_model, ) @@ -241,7 +275,7 @@ def test_eagle3_spec_decoding_stats(eagle3_one_model): free_gpu_memory_fraction=0.6) spec_config = EagleDecodingConfig( max_draft_len=3, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=eagle3_one_model, ) @@ -321,7 +355,7 @@ def test_llama_eagle3_long_prompt(use_cuda_graph): spec_config = EagleDecodingConfig( max_draft_len=3, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=False, ) @@ -445,7 +479,7 @@ def test_deepseek_eagle3(): spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, # Llama 3 does not support one model eagle. 
eagle3_one_model=use_one_model, eagle3_layers_to_capture={29}, @@ -555,7 +589,7 @@ def test_deepseek_mla_eagle3(): ) spec_config = EagleDecodingConfig(max_draft_len=max_draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=use_one_model, load_format="dummy") @@ -654,7 +688,7 @@ def test_multi_eagle3(use_one_model: bool): spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, # Llama 3 does not support one model eagle. eagle3_one_model=use_one_model, num_eagle_layers=2, @@ -713,7 +747,7 @@ def test_eagle3_cuda_graph_padding(disable_overlap_scheduler: bool): spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=use_one_model, ) @@ -766,7 +800,7 @@ def test_eagle3_cdl_sampling(disable_overlap_scheduler: bool): spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=use_one_model, ) diff --git a/tests/unittest/_torch/speculative/test_kv_cache_reuse.py b/tests/unittest/_torch/speculative/test_kv_cache_reuse.py index 95ed232b969..eb5a720db1a 100644 --- a/tests/unittest/_torch/speculative/test_kv_cache_reuse.py +++ b/tests/unittest/_torch/speculative/test_kv_cache_reuse.py @@ -52,7 +52,7 @@ def test_kv_cache_reuse(use_cuda_graph: bool, attn_backend: str): spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=False, ) diff --git a/tests/unittest/_torch/speculative/test_spec_gate.py b/tests/unittest/_torch/speculative/test_spec_gate.py index b1720f59233..a99654a9c63 100644 --- a/tests/unittest/_torch/speculative/test_spec_gate.py +++ b/tests/unittest/_torch/speculative/test_spec_gate.py @@ -47,7 +47,7 @@ def test_spec_gate_e2e(): spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, # Llama 3 does not support one model eagle. 
eagle3_one_model=False, max_concurrency=10000, diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index f8ffe8fc7bd..357d204eabb 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -1218,7 +1218,7 @@ def test_llm_api_medusa(): speculative_config = MedusaDecodingConfig(num_medusa_heads=4, max_draft_len=63, - speculative_model_dir=get_model_path("medusa-vicuna-7b-v1.3"), + speculative_model=get_model_path("medusa-vicuna-7b-v1.3"), medusa_choices=[[0], [0, 0], [1], [0, 1], [2], [0, 0, 0], [1, 0], [0, 2], [3], [0, 3], [4], [0, 4], [2, 0], \ [0, 5], [0, 0, 1], [5], [0, 6], [6], [0, 7], [0, 1, 0], [1, 1], [7], [0, 8], [0, 0, 2], [3, 0], \ [0, 9], [8], [9], [1, 0, 0], [0, 2, 0], [1, 2], [0, 0, 3], [4, 0], [2, 1], [0, 0, 4], [0, 0, 5], \ @@ -1257,7 +1257,7 @@ def test_llm_api_medusa_tp2(): speculative_config = MedusaDecodingConfig(num_medusa_heads=4, max_draft_len=63, - speculative_model_dir=get_model_path("medusa-vicuna-7b-v1.3"), + speculative_model=get_model_path("medusa-vicuna-7b-v1.3"), medusa_choices=[[0], [0, 0], [1], [0, 1], [2], [0, 0, 0], [1, 0], [0, 2], [3], [0, 3], [4], [0, 4], [2, 0], \ [0, 5], [0, 0, 1], [5], [0, 6], [6], [0, 7], [0, 1, 0], [1, 1], [7], [0, 8], [0, 0, 2], [3, 0], \ [0, 9], [8], [9], [1, 0, 0], [0, 2, 0], [1, 2], [0, 0, 3], [4, 0], [2, 1], [0, 0, 4], [0, 0, 5], \ @@ -1295,7 +1295,7 @@ def test_llm_api_eagle(**llm_kwargs): speculative_config = EagleDecodingConfig( max_draft_len=63, - speculative_model_dir=get_model_path("EAGLE-Vicuna-7B-v1.3"), + speculative_model=get_model_path("EAGLE-Vicuna-7B-v1.3"), num_eagle_layers=4, max_non_leaves_per_layer=10, eagle_choices=[[0], [0, 0], [1], [0, 1], [2], [0, 0, 0], [1, 0], [0, 2], [3], [0, 3], [4], [0, 4], [2, 0], \ @@ -1342,7 +1342,7 @@ def test_llm_api_eagle2(**llm_kwargs): speculative_config = EagleDecodingConfig( max_draft_len=63, - speculative_model_dir=get_model_path("EAGLE-Vicuna-7B-v1.3"), + speculative_model=get_model_path("EAGLE-Vicuna-7B-v1.3"), num_eagle_layers=4, max_non_leaves_per_layer=10, use_dynamic_tree=True, diff --git a/tests/unittest/llmapi/test_llm_args.py b/tests/unittest/llmapi/test_llm_args.py index 55c6c7b055b..f3562ae6048 100644 --- a/tests/unittest/llmapi/test_llm_args.py +++ b/tests/unittest/llmapi/test_llm_args.py @@ -445,6 +445,19 @@ def test_dynamic_setattr(self): args = TorchLlmArgs(model=llama_model_path) args.invalid_arg = 1 + def test_speculative_model_alias(self): + """Test that speculative_model_dir is accepted as an alias for speculative_model.""" + + spec_config = EagleDecodingConfig( + max_draft_len=3, + speculative_model_dir="/path/to/model", + eagle3_one_model=False, + ) + + args = TorchLlmArgs(model=llama_model_path, + speculative_config=spec_config) + assert args.speculative_model == "/path/to/model" + class TestTrtLlmArgs: From 5c11e5cd5f7ec9372b1811357881a91ea3ce5d5c Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Wed, 17 Dec 2025 21:28:58 -0800 Subject: [PATCH 02/10] Add logs Signed-off-by: Anish Shanbhag --- tensorrt_llm/llmapi/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorrt_llm/llmapi/utils.py b/tensorrt_llm/llmapi/utils.py index bfc81f7cfdd..88e22fc639c 100644 --- a/tensorrt_llm/llmapi/utils.py +++ b/tensorrt_llm/llmapi/utils.py @@ -224,6 +224,7 @@ def __init__(self, *args, **kwargs): def download_hf_model(model: str, revision: Optional[str] = None) -> Path: ignore_patterns = ["original/**/*"] + logger.info(f"Downloading model {model} from HuggingFace") with get_file_lock(model): 
hf_folder = snapshot_download( model, @@ -231,6 +232,7 @@ def download_hf_model(model: str, revision: Optional[str] = None) -> Path: ignore_patterns=ignore_patterns, revision=revision, tqdm_class=DisabledTqdm) + logger.info(f"Finished downloading model {model} from HuggingFace") return Path(hf_folder) From 64a42b87ed801c2d9e8e1e0f1c7fc0612e890d04 Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Thu, 18 Dec 2025 17:48:45 -0800 Subject: [PATCH 03/10] Mock snapshot_download to avoid download from HF Signed-off-by: Anish Shanbhag --- .../defs/accuracy/test_llm_api_autodeploy.py | 19 +-- tests/test_common/llm_data.py | 115 ++++++++++++++++++ .../_utils_test/_model_test_utils.py | 28 +---- .../singlegpu/models/test_deepseek_patches.py | 6 +- .../singlegpu/test_ad_speculative_decoding.py | 3 + .../_torch/speculative/test_eagle3.py | 4 +- tests/unittest/utils/llm_data.py | 27 +--- 7 files changed, 133 insertions(+), 69 deletions(-) create mode 100644 tests/test_common/llm_data.py diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py index 760164b3ca3..1ee0061cbf0 100644 --- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py +++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py @@ -13,34 +13,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - import pytest +from test_common.llm_data import hf_model_dir_or_hub_id from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM from tensorrt_llm.quantization import QuantAlgo from tensorrt_llm.sampling_params import SamplingParams -from ..conftest import llm_models_root from .accuracy_core import GSM8K, MMLU, CnnDailymail, LlmapiAccuracyTestHarness -def _hf_model_dir_or_hub_id( - hf_model_subdir: str, - hf_hub_id: str, -) -> str: - llm_models_path = llm_models_root() - if llm_models_path and os.path.isdir( - (model_fullpath := os.path.join(llm_models_path, hf_model_subdir))): - return str(model_fullpath) - else: - return hf_hub_id - - class TestLlama3_1_8B(LlmapiAccuracyTestHarness): MODEL_NAME = "meta-llama/Llama-3.1-8B" - MODEL_PATH = _hf_model_dir_or_hub_id("llama-3.1-model/Meta-Llama-3.1-8B", - MODEL_NAME) + MODEL_PATH = hf_model_dir_or_hub_id(MODEL_NAME) def get_default_kwargs(self, enable_chunked_prefill=False): config = { diff --git a/tests/test_common/llm_data.py b/tests/test_common/llm_data.py new file mode 100644 index 00000000000..6792af64144 --- /dev/null +++ b/tests/test_common/llm_data.py @@ -0,0 +1,115 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Shared utilities for local LLM model paths and HuggingFace download mocking.""" + +import os +from functools import wraps +from pathlib import Path +from typing import Optional +from unittest.mock import patch + +# Mapping from HuggingFace Hub ID to local subdirectory under LLM_MODELS_ROOT. +# NOTE: hf_id_to_llm_models_subdir below will fall back to checking if the model name exists +# in LLM_MODELS_ROOT if not present here, so it's not required to exhaustively list all +# models here. +HF_ID_TO_LLM_MODELS_SUBDIR = { + "meta-llama/Meta-Llama-3.1-8B-Instruct": "llama-3.1-model/Llama-3.1-8B-Instruct", + "meta-llama/Llama-3.1-8B-Instruct": "llama-3.1-model/Llama-3.1-8B-Instruct", + "meta-llama/Llama-3.1-8B": "llama-3.1-model/Meta-Llama-3.1-8B", + "TinyLlama/TinyLlama-1.1B-Chat-v1.0": "llama-models-v2/TinyLlama-1.1B-Chat-v1.0", + "meta-llama/Llama-4-Scout-17B-16E-Instruct": "llama4-models/Llama-4-Scout-17B-16E-Instruct", + "mistralai/Mixtral-8x7B-Instruct-v0.1": "Mixtral-8x7B-Instruct-v0.1", + "mistralai/Mistral-Small-3.1-24B-Instruct-2503": "Mistral-Small-3.1-24B-Instruct-2503", + "Qwen/Qwen3-30B-A3B": "Qwen3/Qwen3-30B-A3B", + "Qwen/Qwen2.5-3B-Instruct": "Qwen2.5-3B-Instruct", + "microsoft/Phi-3-mini-4k-instruct": "Phi-3/Phi-3-mini-4k-instruct", + "deepseek-ai/DeepSeek-V3": "DeepSeek-V3", + "deepseek-ai/DeepSeek-R1": "DeepSeek-R1/DeepSeek-R1", + "ibm-ai-platform/Bamba-9B-v2": "Bamba-9B-v2", + "nvidia/NVIDIA-Nemotron-Nano-12B-v2": "NVIDIA-Nemotron-Nano-12B-v2", + "nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3": "NVIDIA-Nemotron-Nano-31B-A3-v3", + "nvidia/Nemotron-Nano-3-30B-A3.5B-dev-1024": "Nemotron-Nano-3-30B-A3.5B-dev-1024", + "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B": "EAGLE3-LLaMA3.1-Instruct-8B", +} + + +def llm_models_root(check: bool = False) -> Optional[Path]: + root = Path("/home/scratch.trt_llm_data/llm-models/") + + if "LLM_MODELS_ROOT" in os.environ: + root = Path(os.environ.get("LLM_MODELS_ROOT")) + + if not root.exists(): + root = Path("/scratch.trt_llm_data/llm-models/") + + if check: + assert root.exists(), ( + "You must set LLM_MODELS_ROOT env or be able to access /home/scratch.trt_llm_data to run this test" + ) + + return root if root.exists() else None + + +def llm_datasets_root() -> str: + return os.path.join(llm_models_root(check=True), "datasets") + + +def hf_id_to_local_model_dir(hf_hub_id: str) -> str | None: + """Return the local model directory under LLM_MODELS_ROOT for a given HuggingFace Hub ID, or None if not found.""" + root = llm_models_root() + if root is None: + return None + + if hf_hub_id in HF_ID_TO_LLM_MODELS_SUBDIR: + return str(root / HF_ID_TO_LLM_MODELS_SUBDIR[hf_hub_id]) + + # Fall back to checking if the model name exists as a top-level directory in LLM_MODELS_ROOT + model_name = hf_hub_id.split("/")[-1] + if os.path.isdir(root / model_name): + return str(root / model_name) + + return None + + +def hf_model_dir_or_hub_id(hf_hub_id: str) -> str: + """Resolve a HuggingFace Hub ID to local path if available, otherwise return the Hub ID.""" + return hf_id_to_local_model_dir(hf_hub_id) or hf_hub_id + + +def mock_snapshot_download(repo_id: str, **kwargs) -> str: + """Mock huggingface_hub.snapshot_download that returns an existing local model directory. + + NOTE: This function does not currently handle the revision / allow_patterns / ignore_patterns parameters. 
+ """ + local_path = hf_id_to_local_model_dir(repo_id) + if local_path is None: + raise ValueError(f"Model '{repo_id}' not found in LLM_MODELS_ROOT") + return local_path + + +def with_mocked_hf_download(func): + """Decorator to mock huggingface_hub.snapshot_download for tests. + + When applied, any calls to snapshot_download will be redirected to use + local model paths from LLM_MODELS_ROOT instead of downloading from HuggingFace. + """ + + @wraps(func) + def wrapper(*args, **kwargs): + with patch("huggingface_hub.snapshot_download", side_effect=mock_snapshot_download): + return func(*args, **kwargs) + + return wrapper diff --git a/tests/unittest/_torch/auto_deploy/_utils_test/_model_test_utils.py b/tests/unittest/_torch/auto_deploy/_utils_test/_model_test_utils.py index a71a09b4652..04adb076dd4 100644 --- a/tests/unittest/_torch/auto_deploy/_utils_test/_model_test_utils.py +++ b/tests/unittest/_torch/auto_deploy/_utils_test/_model_test_utils.py @@ -1,12 +1,11 @@ import copy -import os from typing import Any, Dict, Optional import torch import torch.nn.functional as F +from test_common.llm_data import hf_model_dir_or_hub_id from torch import nn from torch.export import Dim -from utils.llm_data import llm_models_root def apply_rotary_emb(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor: @@ -285,17 +284,6 @@ def generate_dynamic_shapes(max_batch_size, max_seq_len): return dynamic_shapes -def _hf_model_dir_or_hub_id( - hf_model_subdir: str, - hf_hub_id: str, -) -> str: - llm_models_path = llm_models_root() - if llm_models_path and os.path.isdir((model_fullpath := llm_models_path / hf_model_subdir)): - return str(model_fullpath) - else: - return hf_hub_id - - def rotate_half(x: torch.Tensor) -> torch.Tensor: x1 = x[..., : x.shape[-1] // 2] x2 = x[..., x.shape[-1] // 2 :] @@ -351,7 +339,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): _SMALL_MODEL_CONFIGS = { "meta-llama/Meta-Llama-3.1-8B-Instruct": { - "llm_models_subdir": "llama-3.1-model/Llama-3.1-8B-Instruct", "model_kwargs": { "num_hidden_layers": 1, "hidden_size": 64, @@ -361,7 +348,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "mistralai/Mixtral-8x7B-Instruct-v0.1": { - "llm_models_subdir": "Mixtral-8x7B-Instruct-v0.1", "model_kwargs": { "num_hidden_layers": 2, "intermediate_size": 256, @@ -372,7 +358,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "Qwen/Qwen3-30B-A3B": { - "llm_models_subdir": "Qwen3/Qwen3-30B-A3B", "model_kwargs": { "num_hidden_layers": 2, "intermediate_size": 256, @@ -383,7 +368,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "microsoft/Phi-3-mini-4k-instruct": { - "llm_models_subdir": "Phi-3/Phi-3-mini-4k-instruct", "model_kwargs": { "num_hidden_layers": 2, "hidden_size": 128, @@ -393,7 +377,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "meta-llama/Llama-4-Scout-17B-16E-Instruct": { - "llm_models_subdir": "llama4-models/Llama-4-Scout-17B-16E-Instruct", "model_factory": "AutoModelForImageTextToText", "model_kwargs": { "text_config": { @@ -412,7 +395,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "deepseek-ai/DeepSeek-V3": { - "llm_models_subdir": "DeepSeek-V3", "model_kwargs": { "first_k_dense_replace": 1, "num_hidden_layers": 2, @@ -431,7 +413,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "Qwen/Qwen2.5-3B-Instruct": { - "llm_models_subdir": 
"Qwen2.5-3B-Instruct", "model_kwargs": { "num_hidden_layers": 2, "hidden_size": 64, @@ -441,7 +422,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "mistralai/Mistral-Small-3.1-24B-Instruct-2503": { - "llm_models_subdir": "Mistral-Small-3.1-24B-Instruct-2503", "model_factory": "AutoModelForImageTextToText", "model_kwargs": { "text_config": { @@ -463,7 +443,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "ibm-ai-platform/Bamba-9B-v2": { - "llm_models_subdir": "Bamba-9B-v2", "model_kwargs": { "dtype": "bfloat16", "hidden_size": 64, @@ -482,7 +461,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "nvidia/NVIDIA-Nemotron-Nano-12B-v2": { - "llm_models_subdir": "NVIDIA-Nemotron-Nano-12B-v2", "model_kwargs": { "dtype": "bfloat16", "hidden_size": 32, @@ -497,13 +475,11 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "TinyLlama/TinyLlama-1.1B-Chat-v1.0": { - "llm_models_subdir": "llama-models-v2/TinyLlama-1.1B-Chat-v1.0", "model_kwargs": { "num_hidden_layers": 2, }, }, "nvidia/Nemotron-Nano-3-30B-A3.5B-dev-1024": { - "llm_models_subdir": "Nemotron-Nano-3-30B-A3.5B-dev-1024", "model_kwargs": { "num_hidden_layers": 8, }, @@ -531,7 +507,7 @@ def get_small_model_config(model_hub_id: str, **llm_args_kwargs) -> Dict[str, An llm_args = copy.deepcopy(_SMALL_MODEL_CONFIGS[model_hub_id]) # check if should use llm_models_root or hf_hub_id - llm_args["model"] = _hf_model_dir_or_hub_id(llm_args.pop("llm_models_subdir"), model_hub_id) + llm_args["model"] = hf_model_dir_or_hub_id(model_hub_id) # add some defaults to llm_args llm_args["skip_loading_weights"] = True # No weight loading to speed up things diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_deepseek_patches.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_deepseek_patches.py index bbfd0c95f50..cf852a71111 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_deepseek_patches.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_deepseek_patches.py @@ -4,7 +4,7 @@ import pytest import torch -from _model_test_utils import _hf_model_dir_or_hub_id +from test_common.llm_data import hf_model_dir_or_hub_id from transformers import AutoConfig, AutoModelForCausalLM from tensorrt_llm._torch.auto_deploy.models.patches.deepseek import ( @@ -77,7 +77,7 @@ def _generate_ds_attention_mask(b, s): "model_name, module_name, patch, inputs", [ pytest.param( - _hf_model_dir_or_hub_id("DeepSeek-R1/DeepSeek-R1", "deepseek-ai/DeepSeek-R1"), + hf_model_dir_or_hub_id("deepseek-ai/DeepSeek-R1"), "model.layers.0.self_attn", deepseek_v3_attention, [ @@ -87,7 +87,7 @@ def _generate_ds_attention_mask(b, s): ], ), # attention requires inputs [hidden_states, attention_mask, position_ids] pytest.param( - _hf_model_dir_or_hub_id("DeepSeek-R1/DeepSeek-R1", "deepseek-ai/DeepSeek-R1"), + hf_model_dir_or_hub_id("deepseek-ai/DeepSeek-R1"), "model.layers.0.mlp", deepseek_v3_moe_exact, [torch.randn(2, 6, 8, dtype=torch.bfloat16)], diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py index e40b25984c4..81481e8f51d 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py @@ -16,11 +16,13 @@ import pytest from _model_test_utils 
import get_small_model_config from build_and_run_ad import ExperimentConfig, main +from test_common.llm_data import with_mocked_hf_download from tensorrt_llm.llmapi import DraftTargetDecodingConfig, KvCacheConfig @pytest.mark.parametrize("use_hf_speculative_model", [False, True]) +@with_mocked_hf_download def test_ad_speculative_decoding_smoke(use_hf_speculative_model: bool): """Test speculative decoding with AutoDeploy using the build_and_run_ad main().""" @@ -31,6 +33,7 @@ def test_ad_speculative_decoding_smoke(use_hf_speculative_model: bool): experiment_config = get_small_model_config("meta-llama/Meta-Llama-3.1-8B-Instruct") speculative_model_hf_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" if use_hf_speculative_model: + # NOTE: this will still mock out the actual HuggingFace download speculative_model = speculative_model_hf_id else: speculative_model = get_small_model_config(speculative_model_hf_id)["args"]["model"] diff --git a/tests/unittest/_torch/speculative/test_eagle3.py b/tests/unittest/_torch/speculative/test_eagle3.py index 8502c4d5760..c2d4cf50f4c 100644 --- a/tests/unittest/_torch/speculative/test_eagle3.py +++ b/tests/unittest/_torch/speculative/test_eagle3.py @@ -8,6 +8,7 @@ import pytest import torch +from test_common.llm_data import with_mocked_hf_download from utils.llm_data import llm_models_root from tensorrt_llm import LLM, SamplingParams @@ -145,10 +146,11 @@ def test_kv_lens_runtime_with_eagle3_one_model(): False, "FLASHINFER", False, False, False, False, True, False, False, False ], - # HF download variant - tests speculative model auto-download from HuggingFace Hub + # Tests (mocked) speculative model auto-download from HuggingFace [False, "TRTLLM", True, False, False, False, True, False, False, True], ]) @pytest.mark.high_cuda_memory +@with_mocked_hf_download def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str, disable_overlap_scheduler: bool, enable_block_reuse: bool, use_one_model: bool, enable_chunked_prefill: bool, diff --git a/tests/unittest/utils/llm_data.py b/tests/unittest/utils/llm_data.py index c5953c2a768..118ba2fe05e 100644 --- a/tests/unittest/utils/llm_data.py +++ b/tests/unittest/utils/llm_data.py @@ -1,23 +1,6 @@ -import os -from pathlib import Path -from typing import Optional +from test_common.llm_data import llm_datasets_root, llm_models_root - -def llm_models_root(check=False) -> Optional[Path]: - root = Path("/home/scratch.trt_llm_data/llm-models/") - - if "LLM_MODELS_ROOT" in os.environ: - root = Path(os.environ.get("LLM_MODELS_ROOT")) - - if not root.exists(): - root = Path("/scratch.trt_llm_data/llm-models/") - - if check: - assert root.exists(), \ - "You shall set LLM_MODELS_ROOT env or be able to access /home/scratch.trt_llm_data to run this test" - - return root if root.exists() else None - - -def llm_datasets_root() -> str: - return os.path.join(llm_models_root(check=True), "datasets") +__all__ = [ + "llm_datasets_root", + "llm_models_root", +] From 213d74acc118735b89ac0ed71ad3149be889658c Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Thu, 18 Dec 2025 18:14:36 -0800 Subject: [PATCH 04/10] Move download to shared helper Signed-off-by: Anish Shanbhag --- tensorrt_llm/llmapi/llm_utils.py | 38 ++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/tensorrt_llm/llmapi/llm_utils.py b/tensorrt_llm/llmapi/llm_utils.py index 4efb06dd56f..3d1a33755bd 100644 --- a/tensorrt_llm/llmapi/llm_utils.py +++ b/tensorrt_llm/llmapi/llm_utils.py @@ -638,6 +638,23 @@ def _submit_to_all_workers( else: 
return [task(*args, **kwargs)] + def _download_hf_model_if_needed(self, + model_obj: _ModelWrapper, + revision: Optional[str] = None) -> Path: + """Download a model from HF hub if needed. + + Also updates the model_obj.model_dir with the local model dir on rank 0. + """ + if model_obj.is_hub_model: + model_dirs = self._submit_to_all_workers( + CachedModelLoader._node_download_hf_model, + model=model_obj.model_name, + revision=revision) + model_dir = model_dirs[0] + model_obj.model_dir = model_dir + return model_dir + return model_obj.model_dir + def __call__(self) -> Tuple[Path, Union[Path, None]]: if self.llm_args.model_format is _ModelFormatKind.TLLM_ENGINE: @@ -648,14 +665,9 @@ def __call__(self) -> Tuple[Path, Union[Path, None]]: self.model_loader = ModelLoader(self.llm_args) # Download speculative model from HuggingFace if needed - if (self.model_loader.speculative_model_obj is not None - and self.model_loader.speculative_model_obj.is_hub_model): - spec_model_dirs = self._submit_to_all_workers( - CachedModelLoader._node_download_hf_model, - model=self.model_loader.speculative_model_obj.model_name, - revision=None) - spec_model_dir = spec_model_dirs[0] - self.model_loader.speculative_model_obj.model_dir = spec_model_dir + if self.model_loader.speculative_model_obj is not None: + spec_model_dir = self._download_hf_model_if_needed( + self.model_loader.speculative_model_obj) # Update llm_args so PyTorch/AutoDeploy executor gets the local path if self.llm_args.speculative_config is not None: self.llm_args.speculative_config.speculative_model = spec_model_dir @@ -668,14 +680,8 @@ def __call__(self) -> Tuple[Path, Union[Path, None]]: raise ValueError( f'backend {self.llm_args.backend} is not supported.') - if self.model_loader.model_obj.is_hub_model: - hf_model_dirs = self._submit_to_all_workers( - CachedModelLoader._node_download_hf_model, - model=self.model_loader.model_obj.model_name, - revision=self.llm_args.revision) - self._hf_model_dir = hf_model_dirs[0] - else: - self._hf_model_dir = self.model_loader.model_obj.model_dir + self._hf_model_dir = self._download_hf_model_if_needed( + self.model_loader.model_obj, revision=self.llm_args.revision) if self.llm_args.quant_config.quant_algo is not None: logger.warning( From 50abc5dac3003d0d558013cdaea46d1a4cac85b2 Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Mon, 22 Dec 2025 11:41:35 -0800 Subject: [PATCH 05/10] Add missing import Signed-off-by: Anish Shanbhag --- tests/integration/defs/accuracy/test_llm_api_autodeploy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py index 1ee0061cbf0..508a26cb3dc 100644 --- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py +++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py @@ -14,7 +14,7 @@ # limitations under the License. 
import pytest -from test_common.llm_data import hf_model_dir_or_hub_id +from test_common.llm_data import hf_model_dir_or_hub_id, llm_models_root from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM from tensorrt_llm.quantization import QuantAlgo From 26c4dd9526767c54aaf2effd91323ee213531378 Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Mon, 22 Dec 2025 18:08:22 -0800 Subject: [PATCH 06/10] Fixes Signed-off-by: Anish Shanbhag --- tensorrt_llm/llmapi/llm_args.py | 2 +- tensorrt_llm/llmapi/llm_utils.py | 25 +++++++++++++------------ tests/unittest/utils/llm_data.py | 9 +++++++++ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 6f65d77d92c..dee8938a984 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -656,7 +656,7 @@ class DecodingBaseConfig(StrictBaseModel): # which will be automatically downloaded. # - A local filesystem path to a downloaded model directory. speculative_model: Optional[Union[str, Path]] = Field( - default=None, alias="speculative_model_dir") + default=None, validation_alias="speculative_model_dir") # PyTorch only. # When specified, speculation will be disabled at batch sizes above diff --git a/tensorrt_llm/llmapi/llm_utils.py b/tensorrt_llm/llmapi/llm_utils.py index 3d1a33755bd..32cc8b92e94 100644 --- a/tensorrt_llm/llmapi/llm_utils.py +++ b/tensorrt_llm/llmapi/llm_utils.py @@ -125,7 +125,7 @@ def __init__(self, Path] = self.model_obj.model_dir if self.model_obj.is_local_model else None self._speculative_model_dir: Optional[ - Path] = self.speculative_model_obj.model_dir if self.speculative_model_obj is not None and self.model_obj.is_local_model else None + Path] = self.speculative_model_obj.model_dir if self.speculative_model_obj is not None and self.speculative_model_obj.is_local_model else None self._model_info: Optional[_ModelInfo] = None self._model_format = self.llm_args.model_format @@ -660,21 +660,22 @@ def __call__(self) -> Tuple[Path, Union[Path, None]]: if self.llm_args.model_format is _ModelFormatKind.TLLM_ENGINE: return Path(self.llm_args.model), None - self.engine_cache_stage: Optional[CachedStage] = None - self._hf_model_dir = None - self.model_loader = ModelLoader(self.llm_args) - - # Download speculative model from HuggingFace if needed - if self.model_loader.speculative_model_obj is not None: - spec_model_dir = self._download_hf_model_if_needed( - self.model_loader.speculative_model_obj) - # Update llm_args so PyTorch/AutoDeploy executor gets the local path - if self.llm_args.speculative_config is not None: - self.llm_args.speculative_config.speculative_model = spec_model_dir + # Download speculative model from HuggingFace if needed (all backends) + if (self.llm_args.speculative_config is not None and + self.llm_args.speculative_config.speculative_model is not None): + spec_model_obj = _ModelWrapper( + self.llm_args.speculative_config.speculative_model) + spec_model_dir = self._download_hf_model_if_needed(spec_model_obj) + self.llm_args.speculative_config.speculative_model = spec_model_dir + # AutoDeploy doesn't use ModelLoader if self.llm_args.backend == "_autodeploy": return None, "" + self.engine_cache_stage: Optional[CachedStage] = None + self._hf_model_dir = None + self.model_loader = ModelLoader(self.llm_args) + if self.llm_args.backend is not None: if self.llm_args.backend not in ["pytorch", "_autodeploy"]: raise ValueError( diff --git a/tests/unittest/utils/llm_data.py b/tests/unittest/utils/llm_data.py index 
118ba2fe05e..fd1bd15ca11 100644 --- a/tests/unittest/utils/llm_data.py +++ b/tests/unittest/utils/llm_data.py @@ -1,3 +1,12 @@ +import os +import sys + +# Ensure tests/ directory is in path for test_common imports +sys.path.insert( + 0, + os.path.dirname(os.path.dirname(os.path.dirname( + os.path.abspath(__file__))))) + from test_common.llm_data import llm_datasets_root, llm_models_root __all__ = [ From e9cee86bcd62777afc4e4dbdbe5b11d04f41f36c Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Mon, 22 Dec 2025 22:37:01 -0800 Subject: [PATCH 07/10] Use AliasChoices Signed-off-by: Anish Shanbhag --- tensorrt_llm/llmapi/llm_args.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index dee8938a984..c92c429e624 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -13,7 +13,7 @@ import torch import yaml -from pydantic import BaseModel +from pydantic import AliasChoices, BaseModel from pydantic import Field as PydanticField from pydantic import PrivateAttr, field_validator, model_validator from strenum import StrEnum @@ -656,7 +656,9 @@ class DecodingBaseConfig(StrictBaseModel): # which will be automatically downloaded. # - A local filesystem path to a downloaded model directory. speculative_model: Optional[Union[str, Path]] = Field( - default=None, validation_alias="speculative_model_dir") + default=None, + validation_alias=AliasChoices("speculative_model", + "speculative_model_dir")) # PyTorch only. # When specified, speculation will be disabled at batch sizes above From 6ebb1b903713eaf9ce23b993bed636cf99594d09 Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Tue, 30 Dec 2025 15:26:40 -0800 Subject: [PATCH 08/10] Use model validators for eagle config Signed-off-by: Anish Shanbhag --- tensorrt_llm/llmapi/llm_args.py | 69 ++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index c92c429e624..7f47fd6a077 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -865,28 +865,29 @@ class EagleDecodingConfig(DecodingBaseConfig): # choices: llama3, mistral_large3 eagle3_model_arch: str = "llama3" - def __init__(self, **kwargs): - super().__init__() - for attr_name, attr_value in kwargs.items(): - if attr_name == 'max_draft_len': - self.num_eagle_layers = attr_value - self.max_total_draft_tokens = attr_value # If using linear-tree, the max_total_draft_tokens is the same as max_draft_len - # Convert the data type of Eagle choice from str to List[List[int]] - if attr_name == 'eagle_choices' and attr_value is not None: - logger.warning( - "NOTE: The Draft token tree is still under development, PLEASE DO NOT USE IT !!!" - ) - if not isinstance(attr_value, list): - if isinstance(attr_value, str): - attr_value = ast.literal_eval( - attr_value.replace(" ", "")) - else: - raise ValueError( - "Wrong eagle choices type. Eagle choices should be a List[List[int]] or a string like [[0], [1], [2], [0, 0], [0, 1]]." - ) - setattr(self, attr_name, attr_value) + @field_validator('eagle_choices', mode='before') + @classmethod + def validate_eagle_choices(cls, v): + if v is not None: + logger.warning( + "NOTE: The Draft token tree is still under development, PLEASE DO NOT USE IT !!!" + ) + if not isinstance(v, list): + if isinstance(v, str): + v = ast.literal_eval(v.replace(" ", "")) + else: + raise ValueError( + "Wrong eagle choices type. 
Eagle choices should be a List[List[int]] or a string like [[0], [1], [2], [0, 0], [0, 1]]." + ) + return v + + @model_validator(mode='after') + def validate_eagle_config(self) -> 'EagleDecodingConfig': + if self.max_draft_len is None: + raise ValueError("max_draft_len is required for Eagle") + self.num_eagle_layers = self.max_draft_len + self.max_total_draft_tokens = self.max_draft_len # If using linear-tree, the max_total_draft_tokens is the same as max_draft_len - assert self.max_draft_len is not None, "max_draft_len is required for Eagle" if self.eagle3_model_arch == "mistral_large3" and self.eagle3_layers_to_capture is None: # FIXME find a better way to setup it. self.eagle3_layers_to_capture = {-1} @@ -896,7 +897,10 @@ def __init__(self, **kwargs): # and reset the max_draft_len and num_eagle_layers if necessary if self.eagle_choices is not None: # If eagle_choices is provided, use_dynamic_tree should not be used - assert not self.use_dynamic_tree, "If eagle_choices is provided, use_dynamic_tree need to be False" + if self.use_dynamic_tree: + raise ValueError( + "If eagle_choices is provided, use_dynamic_tree need to be False" + ) # Get num_eagle_layers from eagle_choices num_eagle_layers_from_choices = self.check_eagle_choices() @@ -913,10 +917,23 @@ def __init__(self, **kwargs): # Dynamic tree logic if self.use_dynamic_tree: - assert self.eagle_choices is None, "If use_dynamic_tree is True, eagle_choices should be None" - assert self.max_draft_len is not None and self.max_draft_len > 0, "max_draft_len should be provided, which indicates the number of drafter layers" - assert self.dynamic_tree_max_topK is not None and self.dynamic_tree_max_topK > 0, "dynamic_tree_max_topK should be provided, which indicates the number of nodes to expand each time" - assert self.max_total_draft_tokens is not None and self.max_total_draft_tokens > 0, "max_total_draft_tokens should be provided, which indicates the total nodes of the final draft tree. (exclude the root node)" + if self.eagle_choices is not None: + raise ValueError( + "If use_dynamic_tree is True, eagle_choices should be None") + if self.max_draft_len is None or self.max_draft_len <= 0: + raise ValueError( + "max_draft_len should be provided, which indicates the number of drafter layers" + ) + if self.dynamic_tree_max_topK is None or self.dynamic_tree_max_topK <= 0: + raise ValueError( + "dynamic_tree_max_topK should be provided, which indicates the number of nodes to expand each time" + ) + if self.max_total_draft_tokens is None or self.max_total_draft_tokens <= 0: + raise ValueError( + "max_total_draft_tokens should be provided, which indicates the total nodes of the final draft tree. 
(exclude the root node)" + ) + + return self @classmethod def from_dict(cls, data: dict): From e0b1bfa9b0c802c8964fe2cb08153807ec187eef Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Mon, 5 Jan 2026 11:01:01 -0800 Subject: [PATCH 09/10] Remove redundant speculative model download for TRTLLM backend Signed-off-by: Anish Shanbhag --- tensorrt_llm/llmapi/llm_args.py | 28 +++------------------------- tensorrt_llm/llmapi/llm_utils.py | 20 +++----------------- 2 files changed, 6 insertions(+), 42 deletions(-) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 7f47fd6a077..7a2ba8b2ed2 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -17,6 +17,7 @@ from pydantic import Field as PydanticField from pydantic import PrivateAttr, field_validator, model_validator from strenum import StrEnum + from transformers import PreTrainedTokenizerBase try: @@ -2143,9 +2144,6 @@ def coerce_env_overrides_to_str(cls, v): _parallel_config: Optional[_ParallelConfig] = PrivateAttr(default=None) _model_format: Optional[_ModelFormatKind] = PrivateAttr(default=None) - _speculative_model: Optional[str] = PrivateAttr(default=None) - _speculative_model_format: Optional[_ModelFormatKind] = PrivateAttr( - default=None) @property def parallel_config(self) -> _ParallelConfig: @@ -2156,12 +2154,8 @@ def model_format(self) -> _ModelFormatKind: return self._model_format @property - def speculative_model(self) -> Optional[str]: - return self._speculative_model - - @property - def speculative_model_format(self) -> _ModelFormatKind: - return self._speculative_model_format + def speculative_model(self) -> Optional[Union[str, Path]]: + return self.speculative_config.speculative_model if self.speculative_config is not None else None @classmethod def from_kwargs(cls, **kwargs: Any) -> "BaseLlmArgs": @@ -2552,14 +2546,6 @@ def validate_speculative_config(self): else: self.decoding_config = None - self._speculative_model = getattr(self.speculative_config, - "speculative_model", None) - speculative_model_obj = _ModelWrapper( - self._speculative_model - ) if self._speculative_model is not None else None - if self._speculative_model and speculative_model_obj.is_local_model: - self._speculative_model_format = _ModelFormatKind.HF - return self def _load_config_from_engine(self, engine_dir: Path): @@ -3081,14 +3067,6 @@ def validate_speculative_config(self): else: self.decoding_config = None - self._speculative_model = getattr(self.speculative_config, - "speculative_model", None) - speculative_model_obj = _ModelWrapper( - self._speculative_model - ) if self._speculative_model is not None else None - if self._speculative_model and speculative_model_obj.is_local_model: - self._speculative_model_format = _ModelFormatKind.HF - return self @model_validator(mode="after") diff --git a/tensorrt_llm/llmapi/llm_utils.py b/tensorrt_llm/llmapi/llm_utils.py index 32cc8b92e94..f6f3ebb05f2 100644 --- a/tensorrt_llm/llmapi/llm_utils.py +++ b/tensorrt_llm/llmapi/llm_utils.py @@ -9,10 +9,11 @@ from typing import Any, Callable, List, Optional, Tuple, Union import torch -import transformers from pydantic import BaseModel from tqdm import tqdm +import transformers + from .._utils import (global_mpi_rank, local_mpi_rank, mpi_barrier, mpi_broadcast, mpi_rank, release_gc) # yapf: disable @@ -145,9 +146,7 @@ def _gather_build_steps(self): return if (self.model_obj.is_hub_model - and self._model_format is not _ModelFormatKind.TLLM_ENGINE) or ( - self.speculative_model_obj - and 
self.speculative_model_obj.is_hub_model): + and self._model_format is not _ModelFormatKind.TLLM_ENGINE): # Download HF model if necessary if self.model_obj.model_name is None: raise ValueError( @@ -305,31 +304,18 @@ def save( def _download_hf_model(self): ''' Download HF model from third-party model hub like www.modelscope.cn or huggingface. ''' model_dir = None - speculative_model_dir = None # Only the rank0 are allowed to download model if mpi_rank() == 0: assert self._workspace is not None assert isinstance(self.model_obj.model_name, str) # this will download only once when multiple MPI processes are running - model_dir = download_hf_model(self.model_obj.model_name, revision=self.llm_args.revision) print_colored(f"Downloaded model to {model_dir}\n", 'grey') - if self.speculative_model_obj: - speculative_model_dir = download_hf_model( - self.speculative_model_obj.model_name) - print_colored(f"Downloaded model to {speculative_model_dir}\n", - 'grey') # Make all the processes got the same model_dir self._model_dir = mpi_broadcast(model_dir, root=0) self.model_obj.model_dir = self._model_dir # mark as a local model assert self.model_obj.is_local_model - if self.speculative_model_obj: - self._speculative_model_dir = mpi_broadcast(speculative_model_dir, - root=0) - self.speculative_model_obj.model_dir = self._speculative_model_dir - - assert self.speculative_model_obj.is_local_model def _update_from_hf_quant_config(self) -> bool: """Update quant_config from the config file of pre-quantized HF checkpoint. From 2594dc2b24ac983b0e365ca711bfa90f692ecf09 Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Mon, 5 Jan 2026 11:31:33 -0800 Subject: [PATCH 10/10] Fix import sorting Signed-off-by: Anish Shanbhag --- tensorrt_llm/llmapi/llm_args.py | 1 - tensorrt_llm/llmapi/llm_utils.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 7a2ba8b2ed2..c6ca63a57fc 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -17,7 +17,6 @@ from pydantic import Field as PydanticField from pydantic import PrivateAttr, field_validator, model_validator from strenum import StrEnum - from transformers import PreTrainedTokenizerBase try: diff --git a/tensorrt_llm/llmapi/llm_utils.py b/tensorrt_llm/llmapi/llm_utils.py index f6f3ebb05f2..f687c03ddd6 100644 --- a/tensorrt_llm/llmapi/llm_utils.py +++ b/tensorrt_llm/llmapi/llm_utils.py @@ -9,11 +9,10 @@ from typing import Any, Callable, List, Optional, Tuple, Union import torch +import transformers from pydantic import BaseModel from tqdm import tqdm -import transformers - from .._utils import (global_mpi_rank, local_mpi_rank, mpi_barrier, mpi_broadcast, mpi_rank, release_gc) # yapf: disable
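
---

A minimal sketch of how the helpers added in `tests/test_common/llm_data.py` (PATCH 03/10) compose in a test; the test name and assertion below are illustrative only and are not taken from the series:

```python
# Hypothetical test sketch, not part of the patch series.
from test_common.llm_data import hf_model_dir_or_hub_id, with_mocked_hf_download


@with_mocked_hf_download  # patches huggingface_hub.snapshot_download to resolve locally
def test_resolves_models_without_network():
    # Returns the LLM_MODELS_ROOT checkout when it exists, otherwise the Hub ID;
    # with the decorator active, any downstream snapshot_download() call is also
    # redirected to the local directory instead of downloading from HuggingFace.
    model = hf_model_dir_or_hub_id("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    assert model  # either a local path under LLM_MODELS_ROOT or the Hub ID string
```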