From c2ae43710d35bfc9f3f0eae4026d691002ea928a Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Wed, 17 Dec 2025 17:40:35 -0800 Subject: [PATCH 01/10] [None][feat] Auto download speculative models from HF for pytorch backend, add speculative_model field alias Signed-off-by: Anish Shanbhag --- docs/source/features/speculative-decoding.md | 36 +++++- .../_tensorrt_engine/llm_eagle2_decoding.py | 4 +- .../_tensorrt_engine/llm_eagle_decoding.py | 4 +- .../_tensorrt_engine/llm_medusa_decoding.py | 4 +- examples/llm-api/llm_speculative_decoding.py | 2 +- examples/llm-api/quickstart_advanced.py | 6 +- examples/models/core/qwen/README.md | 6 +- .../_torch/auto_deploy/shim/ad_executor.py | 2 +- .../_torch/models/modeling_speculative.py | 4 +- .../_torch/pyexecutor/model_loader.py | 2 +- .../_torch/pyexecutor/py_executor_creator.py | 2 +- tensorrt_llm/llmapi/llm_args.py | 21 ++-- tensorrt_llm/llmapi/llm_utils.py | 29 +++-- .../accuracy/test_disaggregated_serving.py | 4 +- .../integration/defs/accuracy/test_llm_api.py | 4 +- .../defs/accuracy/test_llm_api_pytorch.py | 29 +++-- .../test_disaggregated_single_gpu.py | 2 +- .../serve/test_spec_decoding_metrics.py | 4 +- .../examples/test_ad_speculative_decoding.py | 6 +- .../defs/perf/pytorch_model_config.py | 2 +- .../integration/defs/perf/test_perf_sanity.py | 16 +-- tests/integration/defs/test_e2e.py | 2 +- .../gpt_oss_120b_fp4_grace_blackwell.yaml | 2 +- .../perf-sanity/run_benchmark_serve.py | 14 +-- .../singlegpu/test_ad_speculative_decoding.py | 17 +-- .../speculative/test_draft_len_schedule.py | 8 +- .../_torch/speculative/test_draft_target.py | 2 +- ...test_draft_token_prepare_for_generation.py | 2 +- .../test_draft_token_tree_sampling.py | 2 +- .../test_draft_token_tree_verification.py | 2 +- .../speculative/test_dynamic_spec_decode.py | 2 +- .../_torch/speculative/test_eagle3.py | 104 ++++++++++++------ .../_torch/speculative/test_kv_cache_reuse.py | 2 +- .../_torch/speculative/test_spec_gate.py | 2 +- tests/unittest/llmapi/test_llm.py | 8 +- tests/unittest/llmapi/test_llm_args.py | 13 +++ 36 files changed, 231 insertions(+), 140 deletions(-) diff --git a/docs/source/features/speculative-decoding.md b/docs/source/features/speculative-decoding.md index 089d7ecf3a7..64e52558b20 100644 --- a/docs/source/features/speculative-decoding.md +++ b/docs/source/features/speculative-decoding.md @@ -37,8 +37,13 @@ Draft/target is the simplest form of speculative decoding. In this approach, an ```python from tensorrt_llm.llmapi import DraftTargetDecodingConfig +# Option 1: Use a HuggingFace Hub model ID (auto-downloaded) speculative_config = DraftTargetDecodingConfig( - max_draft_len=3, speculative_model_dir="/path/to/draft_model") + max_draft_len=3, speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B") + +# Option 2: Use a local path +# speculative_config = DraftTargetDecodingConfig( +# max_draft_len=3, speculative_model="/path/to/draft_model") llm = LLM("/path/to/target_model", speculative_config=speculative_config, disable_overlap_scheduler=True) ``` @@ -51,18 +56,23 @@ TRT-LLM supports a modified version of the algorithm presented in the paper: tre The following draft model checkpoints can be used for EAGLE 3: * Llama 3 variants: [use the checkpoints from the authors of the original EAGLE 3 paper](https://huggingface.co/yuhuili). * Llama 4 Maverick: [use the checkpoint from the NVIDIA HuggingFace repository](https://huggingface.co/nvidia/Llama-4-Maverick-17B-128E-Eagle3). 
+* Other models, including `gpt-oss-120b` and `Qwen3`: check out the [Speculative Decoding Modules](https://huggingface.co/collections/nvidia/speculative-decoding-modules) collection from NVIDIA. ```python from tensorrt_llm.llmapi import EagleDecodingConfig # Enable to use the faster one-model implementation for Llama 4. eagle3_one_model = False +model = "meta-llama/Llama-3.1-8B-Instruct" +speculative_model = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" speculative_config = EagleDecodingConfig( - max_draft_len=3, speculative_model_dir="/path/to/draft_model", eagle3_one_model=eagle3_one_model) + max_draft_len=3, + speculative_model=speculative_model, + eagle3_one_model=eagle3_one_model) # Only need to disable overlap scheduler if eagle3_one_model is False. -llm = LLM("/path/to/target_model", speculative_config=speculative_config, disable_overlap_scheduler=True) +llm = LLM(model, speculative_config=speculative_config, disable_overlap_scheduler=True) ``` ### NGram @@ -137,7 +147,17 @@ Speculative decoding options must be specified via `--config config.yaml` for bo The rest of the argument names/valid values are the same as in their corresponding configuration class described in the Quick Start section. For example, a YAML configuration could look like this: +```yaml +# Using a HuggingFace Hub model ID (auto-downloaded) +disable_overlap_scheduler: true +speculative_config: + decoding_type: Eagle + max_draft_len: 4 + speculative_model: yuhuili/EAGLE3-LLaMA3.1-Instruct-8B ``` + +```yaml +# Or using a local path disable_overlap_scheduler: true speculative_config: decoding_type: Eagle @@ -145,6 +165,16 @@ speculative_config: speculative_model: /path/to/draft/model ``` +```{note} +The field name `speculative_model_dir` can also be used as an alias for `speculative_config.speculative_model`. For example: + + speculative_config: + decoding_type: Eagle + max_draft_len: 4 + speculative_model_dir: /path/to/draft/model +``` + + ## Developer Guide This section describes the components of a speculative decoding algorithm. All of the interfaces are defined in [`_torch/speculative/interface.py`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/speculative/interface.py). 
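
The alias behavior documented above can also be exercised directly from Python. The following is a minimal sketch, not part of the patch itself: it assumes this change is applied and that the config class accepts both the new field name and its legacy alias at construction time, as the examples and the `test_speculative_model_alias` unit test in this series imply. The EAGLE3 checkpoint ID is simply the one used elsewhere in these docs.

```python
from tensorrt_llm.llmapi import EagleDecodingConfig

# New spelling introduced by this change: accepts either a local path or a
# HuggingFace Hub model ID (auto-downloaded for the PyTorch backend).
cfg_new = EagleDecodingConfig(
    max_draft_len=4,
    speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
    eagle3_one_model=False,
)

# Legacy spelling still works through the Pydantic field alias, so existing
# Python code and YAML configs that use `speculative_model_dir` keep working.
cfg_legacy = EagleDecodingConfig(
    max_draft_len=4,
    speculative_model_dir="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
    eagle3_one_model=False,
)

# Both spellings populate the same underlying field.
assert cfg_new.speculative_model == cfg_legacy.speculative_model
```
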
diff --git a/examples/llm-api/_tensorrt_engine/llm_eagle2_decoding.py b/examples/llm-api/_tensorrt_engine/llm_eagle2_decoding.py index a1343cc5757..86b5ca28af4 100755 --- a/examples/llm-api/_tensorrt_engine/llm_eagle2_decoding.py +++ b/examples/llm-api/_tensorrt_engine/llm_eagle2_decoding.py @@ -23,12 +23,12 @@ def main(): model = "lmsys/vicuna-7b-v1.3" # The end user can customize the eagle decoding configuration by specifying the - # speculative_model_dir, max_draft_len, num_eagle_layers, max_non_leaves_per_layer, eagle_choices + # speculative_model, max_draft_len, num_eagle_layers, max_non_leaves_per_layer, eagle_choices # greedy_sampling,posterior_threshold, use_dynamic_tree and dynamic_tree_max_topK # with the EagleDecodingConfig class speculative_config = EagleDecodingConfig( - speculative_model_dir="yuhuili/EAGLE-Vicuna-7B-v1.3", + speculative_model="yuhuili/EAGLE-Vicuna-7B-v1.3", max_draft_len=63, num_eagle_layers=4, max_non_leaves_per_layer=10, diff --git a/examples/llm-api/_tensorrt_engine/llm_eagle_decoding.py b/examples/llm-api/_tensorrt_engine/llm_eagle_decoding.py index c66e15f6646..e6e89a622ee 100644 --- a/examples/llm-api/_tensorrt_engine/llm_eagle_decoding.py +++ b/examples/llm-api/_tensorrt_engine/llm_eagle_decoding.py @@ -23,12 +23,12 @@ def main(): model = "lmsys/vicuna-7b-v1.3" # The end user can customize the eagle decoding configuration by specifying the - # speculative_model_dir, max_draft_len, num_eagle_layers, max_non_leaves_per_layer, eagle_choices + # speculative_model, max_draft_len, num_eagle_layers, max_non_leaves_per_layer, eagle_choices # greedy_sampling,posterior_threshold, use_dynamic_tree and dynamic_tree_max_topK # with the EagleDecodingConfig class speculative_config = EagleDecodingConfig( - speculative_model_dir="yuhuili/EAGLE-Vicuna-7B-v1.3", + speculative_model="yuhuili/EAGLE-Vicuna-7B-v1.3", max_draft_len=63, num_eagle_layers=4, max_non_leaves_per_layer=10, diff --git a/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py b/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py index f45411b2336..d371600d00f 100644 --- a/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py +++ b/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py @@ -48,10 +48,10 @@ def run_medusa_decoding(use_modelopt_ckpt=False, model_dir=None): model = "lmsys/vicuna-7b-v1.3" # The end user can customize the medusa decoding configuration by specifying the - # speculative_model_dir, max_draft_len, medusa heads num and medusa choices + # speculative_model, max_draft_len, medusa heads num and medusa choices # with the MedusaDecodingConfig class speculative_config = MedusaDecodingConfig( - speculative_model_dir="FasterDecoding/medusa-vicuna-7b-v1.3", + speculative_model="FasterDecoding/medusa-vicuna-7b-v1.3", max_draft_len=63, num_medusa_heads=4, medusa_choices=[[0], [0, 0], [1], [0, 1], [2], [0, 0, 0], [1, 0], [0, 2], [3], [0, 3], [4], [0, 4], [2, 0], \ diff --git a/examples/llm-api/llm_speculative_decoding.py b/examples/llm-api/llm_speculative_decoding.py index 6d6e812db34..de33278a096 100644 --- a/examples/llm-api/llm_speculative_decoding.py +++ b/examples/llm-api/llm_speculative_decoding.py @@ -35,7 +35,7 @@ def run_MTP(model: Optional[str] = None): def run_Eagle3(): spec_config = EagleDecodingConfig( max_draft_len=3, - speculative_model_dir="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", + speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", eagle3_one_model=True) kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8) diff --git 
a/examples/llm-api/quickstart_advanced.py b/examples/llm-api/quickstart_advanced.py index abc8e48f61a..c0bb4e31bec 100644 --- a/examples/llm-api/quickstart_advanced.py +++ b/examples/llm-api/quickstart_advanced.py @@ -220,11 +220,11 @@ def setup_llm(args, **kwargs): relaxed_topk=args.relaxed_topk, relaxed_delta=args.relaxed_delta, mtp_eagle_one_model=args.use_one_model, - speculative_model_dir=args.model_dir) + speculative_model=args.model_dir) elif spec_decode_algo == "EAGLE3": spec_config = EagleDecodingConfig( max_draft_len=args.spec_decode_max_draft_len, - speculative_model_dir=args.draft_model_dir, + speculative_model=args.draft_model_dir, eagle3_one_model=args.use_one_model, eagle_choices=args.eagle_choices, use_dynamic_tree=args.use_dynamic_tree, @@ -234,7 +234,7 @@ def setup_llm(args, **kwargs): elif spec_decode_algo == "DRAFT_TARGET": spec_config = DraftTargetDecodingConfig( max_draft_len=args.spec_decode_max_draft_len, - speculative_model_dir=args.draft_model_dir) + speculative_model=args.draft_model_dir) elif spec_decode_algo == "NGRAM": spec_config = NGramDecodingConfig( max_draft_len=args.spec_decode_max_draft_len, diff --git a/examples/models/core/qwen/README.md b/examples/models/core/qwen/README.md index 5474e259690..566d4eab1ba 100644 --- a/examples/models/core/qwen/README.md +++ b/examples/models/core/qwen/README.md @@ -841,8 +841,8 @@ Qwen3 now supports Eagle3 (Speculative Decoding with Eagle3). To enable Eagle3 o Set the decoding type to "Eagle" to enable Eagle3 speculative decoding. - `speculative_config.max_draft_len: 3` Set the maximum number of draft tokens generated per step (this value can be adjusted as needed). -- `speculative_config.speculative_model_dir: ` - Specify the path to the Eagle3 draft model (ensure the corresponding draft model weights are prepared). +- `speculative_config.speculative_model: ` + Specify the Eagle3 draft model either as a Huggingface model ID or a local path. You can find ready-to-use Eagle3 draft models at https://huggingface.co/collections/nvidia/speculative-decoding-modules. 
Currently, there are some limitations when enabling Eagle3: @@ -857,7 +857,7 @@ enable_attention_dp: false speculative_config: decoding_type: Eagle max_draft_len: 3 - speculative_model_dir: + speculative_model: kv_cache_config: enable_block_reuse: false " >> ${path_config} diff --git a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py index 9f6b885d3b4..5055514014e 100644 --- a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py +++ b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py @@ -921,7 +921,7 @@ def create_draft_model_engine_maybe( drafting_loop_wrapper = None draft_model_engine = PyTorchModelEngine( - model_path=draft_spec_config.speculative_model_dir, + model_path=draft_spec_config.speculative_model, llm_args=draft_llm_args, mapping=dist_mapping, attn_runtime_features=attn_runtime_features, diff --git a/tensorrt_llm/_torch/models/modeling_speculative.py b/tensorrt_llm/_torch/models/modeling_speculative.py index dc4b3b1d545..7c3fb20a5d1 100755 --- a/tensorrt_llm/_torch/models/modeling_speculative.py +++ b/tensorrt_llm/_torch/models/modeling_speculative.py @@ -887,7 +887,7 @@ def __init__(self, model: TModel, model_config: ModelConfig[TConfig]): from tensorrt_llm._torch.models.checkpoints.mistral.config_loader import \ MistralConfigLoader self.draft_config = MistralConfigLoader().load( - spec_config.speculative_model_dir, + spec_config.speculative_model, mapping=model_config.mapping, moe_backend=model_config.moe_backend, moe_max_num_tokens=model_config.moe_max_num_tokens, @@ -898,7 +898,7 @@ def __init__(self, model: TModel, model_config: ModelConfig[TConfig]): self.draft_config.extra_attrs = model_config.extra_attrs elif spec_config.eagle3_model_arch == "llama3": self.draft_config = ModelConfig.from_pretrained( - model_config.spec_config.speculative_model_dir, + model_config.spec_config.speculative_model, trust_remote_code=True, attn_backend=model_config.attn_backend, moe_backend=model_config.moe_backend, diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py index 4756e24d082..cc44248ebcc 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_loader.py +++ b/tensorrt_llm/_torch/pyexecutor/model_loader.py @@ -278,7 +278,7 @@ def init_meta_tensor(t: torch.Tensor): if self.spec_config is not None and self.spec_config.spec_dec_mode.need_load_draft_weights( ): weights = checkpoint_loader.load_weights( - self.spec_config.speculative_model_dir, + self.spec_config.speculative_model, mapping=self.mapping) draft_model_arch = model.draft_config.pretrained_config.architectures[ diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py index bd1857dda27..493f542f4fd 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py @@ -398,7 +398,7 @@ def drafting_loop_wrapper(model): draft_llm_args.load_format = LoadFormat.DUMMY draft_model_engine = PyTorchModelEngine( - model_path=spec_config.speculative_model_dir, + model_path=spec_config.speculative_model, llm_args=draft_llm_args, mapping=mapping, attn_runtime_features=attn_runtime_features, diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 3f15252b84f..6f65d77d92c 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -651,7 +651,12 @@ class DecodingBaseConfig(StrictBaseModel): # If it's a static or dynamic tree, each draft layer may generate 
more than one draft token. # In this case, max_total_draft_tokens >= max_draft_len. max_total_draft_tokens: Optional[int] = None - speculative_model_dir: Optional[Union[str, Path]] = None + # The speculative (draft) model. Accepts either: + # - A HuggingFace Hub model ID (str), e.g., "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" + # which will be automatically downloaded. + # - A local filesystem path to a downloaded model directory. + speculative_model: Optional[Union[str, Path]] = Field( + default=None, alias="speculative_model_dir") # PyTorch only. # When specified, speculation will be disabled at batch sizes above @@ -918,7 +923,7 @@ def from_dict(cls, data: dict): decoding_type: ClassVar[str] = "Eagle" def validate(self) -> None: - if self.speculative_model_dir is None: + if self.speculative_model is None: raise ValueError("Draft model must be provided for EAGLE") def check_eagle_choices(self): @@ -2132,7 +2137,7 @@ def model_format(self) -> _ModelFormatKind: return self._model_format @property - def speculative_model_dir(self) -> Optional[_ModelFormatKind]: + def speculative_model(self) -> Optional[str]: return self._speculative_model @property @@ -2508,7 +2513,7 @@ def validate_speculative_config(self): elif isinstance(self.speculative_config, EagleDecodingConfig): assert self.speculative_config.max_draft_len > 0 - assert self.speculative_config.speculative_model_dir is not None, "Path to EAGLE3 weights must be specified." + assert self.speculative_config.speculative_model is not None, "EAGLE3 draft model must be specified." self.build_config.max_draft_len = self.speculative_config.max_draft_len self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.EAGLE eagle_config = _EagleConfig( @@ -2529,7 +2534,7 @@ def validate_speculative_config(self): self.decoding_config = None self._speculative_model = getattr(self.speculative_config, - "speculative_model_dir", None) + "speculative_model", None) speculative_model_obj = _ModelWrapper( self._speculative_model ) if self._speculative_model is not None else None @@ -3025,12 +3030,12 @@ def validate_speculative_config(self): if isinstance(self.speculative_config, EagleDecodingConfig): assert self.speculative_config.max_draft_len > 0 - assert self.speculative_config.speculative_model_dir is not None, "Path to EAGLE3 weights must be specified." + assert self.speculative_config.speculative_model is not None, "EAGLE3 draft model must be specified." elif isinstance(self.speculative_config, NGramDecodingConfig): assert self.speculative_config.max_draft_len > 0 and self.speculative_config.max_matching_ngram_size > 0 elif isinstance(self.speculative_config, DraftTargetDecodingConfig): assert self.speculative_config.max_draft_len > 0 - assert self.speculative_config.speculative_model_dir is not None, "Path to draft model must be specified." + assert self.speculative_config.speculative_model is not None, "Draft model must be specified." 
elif isinstance(self.speculative_config, MTPDecodingConfig): assert self.speculative_config.num_nextn_predict_layers > 0 self.speculative_config.max_draft_len = self.speculative_config.num_nextn_predict_layers @@ -3058,7 +3063,7 @@ def validate_speculative_config(self): self.decoding_config = None self._speculative_model = getattr(self.speculative_config, - "speculative_model_dir", None) + "speculative_model", None) speculative_model_obj = _ModelWrapper( self._speculative_model ) if self._speculative_model is not None else None diff --git a/tensorrt_llm/llmapi/llm_utils.py b/tensorrt_llm/llmapi/llm_utils.py index fc1647a8070..4efb06dd56f 100644 --- a/tensorrt_llm/llmapi/llm_utils.py +++ b/tensorrt_llm/llmapi/llm_utils.py @@ -109,8 +109,8 @@ def __init__(self, self.model_obj = _ModelWrapper(self.llm_args.model) self.speculative_model_obj = _ModelWrapper( - self.llm_args.speculative_model_dir - ) if self.llm_args.speculative_model_dir is not None else None + self.llm_args.speculative_model + ) if self.llm_args.speculative_model is not None else None if isinstance(self.llm_args, TrtLlmArgs): self.convert_checkpoint_options = self.llm_args._convert_checkpoint_options @@ -440,8 +440,8 @@ def _load_model_from_hf(self): model_cls = AutoModelForCausalLM.get_trtllm_model_class( self._model_dir, self.llm_args.trust_remote_code, self.llm_args.decoding_config.decoding_mode - if hasattr(self.llm_args, "speculative_model_dir") - and self.llm_args.speculative_model_dir else None) + if hasattr(self.llm_args, "speculative_model") + and self.llm_args.speculative_model else None) prequantized = self._update_from_hf_quant_config() @@ -643,15 +643,26 @@ def __call__(self) -> Tuple[Path, Union[Path, None]]: if self.llm_args.model_format is _ModelFormatKind.TLLM_ENGINE: return Path(self.llm_args.model), None - if self.llm_args.backend == "_autodeploy": - return None, "" - self.engine_cache_stage: Optional[CachedStage] = None - self._hf_model_dir = None - self.model_loader = ModelLoader(self.llm_args) + # Download speculative model from HuggingFace if needed + if (self.model_loader.speculative_model_obj is not None + and self.model_loader.speculative_model_obj.is_hub_model): + spec_model_dirs = self._submit_to_all_workers( + CachedModelLoader._node_download_hf_model, + model=self.model_loader.speculative_model_obj.model_name, + revision=None) + spec_model_dir = spec_model_dirs[0] + self.model_loader.speculative_model_obj.model_dir = spec_model_dir + # Update llm_args so PyTorch/AutoDeploy executor gets the local path + if self.llm_args.speculative_config is not None: + self.llm_args.speculative_config.speculative_model = spec_model_dir + + if self.llm_args.backend == "_autodeploy": + return None, "" + if self.llm_args.backend is not None: if self.llm_args.backend not in ["pytorch", "_autodeploy"]: raise ValueError( diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 2ba2ee1bfee..1350ecb774c 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -576,7 +576,7 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model): speculative_decoding_config = { "decoding_type": "Eagle", "max_draft_len": 4, - "speculative_model_dir": + "speculative_model": f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B", "eagle3_one_model": eagle3_one_model } @@ -675,7 +675,7 @@ def test_guided_decoding_with_eagle3(self, backend: str, 
speculative_decoding_config = { "decoding_type": "Eagle", "max_draft_len": 3, - "speculative_model_dir": + "speculative_model": f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B", "eagle3_one_model": eagle3_one_model } diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py index e019572ada0..a304a47edb2 100644 --- a/tests/integration/defs/accuracy/test_llm_api.py +++ b/tests/integration/defs/accuracy/test_llm_api.py @@ -471,7 +471,7 @@ class TestEagleVicuna_7B_v1_3(LlmapiAccuracyTestHarness): speculative_config = EagleDecodingConfig( max_draft_len=63, - speculative_model_dir=f"{llm_models_root()}/EAGLE-Vicuna-7B-v1.3", + speculative_model=f"{llm_models_root()}/EAGLE-Vicuna-7B-v1.3", num_eagle_layers=4, max_non_leaves_per_layer=10, eagle_choices=[[0], [0, 0], [1], [0, 1], [2], [0, 0, 0], [1, 0], [0, 2], [3], [0, 3], [4], [0, 4], [2, 0], \ @@ -497,7 +497,7 @@ class TestEagle2Vicuna_7B_v1_3(LlmapiAccuracyTestHarness): speculative_config = EagleDecodingConfig( max_draft_len=63, - speculative_model_dir=f"{llm_models_root()}/EAGLE-Vicuna-7B-v1.3", + speculative_model=f"{llm_models_root()}/EAGLE-Vicuna-7B-v1.3", num_eagle_layers=4, max_non_leaves_per_layer=10, use_dynamic_tree=True, diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 1a32e333b5a..6ef4915c2cf 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -276,7 +276,7 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model, draft_len = 4 spec_config = EagleDecodingConfig(max_draft_len=draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=eagle3_one_model) with LLM(model=target_model_dir, @@ -369,8 +369,7 @@ def test_guided_decoding_with_eagle3(self, backend: str, cuda_graph_config = CudaGraphConfig(enable_padding=True) spec_config = EagleDecodingConfig( max_draft_len=3, - speculative_model_dir= - f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B", + speculative_model=f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B", eagle3_one_model=eagle3_one_model) llm = LLM( self.MODEL_PATH, @@ -621,7 +620,7 @@ def test_fp8_eagle3_tp8(self, eagle3_one_model, torch_compile): eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.3-Instruct-70B" kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6) spec_config = EagleDecodingConfig(max_draft_len=3, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=eagle3_one_model) torch_compile_config = _get_default_torch_compile_config(torch_compile) pytorch_config = dict( @@ -1383,7 +1382,7 @@ def test_bfloat16_2_model_mtp(self): ) mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3, mtp_eagle_one_model=False, - speculative_model_dir=self.MODEL_PATH) + speculative_model=self.MODEL_PATH) with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config, enable_chunked_prefill=False, @@ -2935,7 +2934,7 @@ def test_nvfp4_2_model_mtp(self, tp_size, cuda_graph, overlap_scheduler, mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3, mtp_eagle_one_model=False, - speculative_model_dir=model_path) + speculative_model=model_path) with LLM(model_path, max_batch_size=max_batch_size, @@ -3441,7 +3440,7 @@ def test_eagle3(self, enable_chunked_prefill, eagle3_one_model): draft_len = 4 spec_config = EagleDecodingConfig(max_draft_len=draft_len, - speculative_model_dir=eagle_model_dir, + 
speculative_model=eagle_model_dir, eagle3_one_model=eagle3_one_model) llm = LLM(model=target_model_dir, @@ -3812,7 +3811,7 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, if eagle3: spec_config = EagleDecodingConfig( max_draft_len=2, - speculative_model_dir= + speculative_model= f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/", eagle3_one_model=True) with LLM( @@ -3860,7 +3859,7 @@ def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp, if eagle3: spec_config = EagleDecodingConfig( max_draft_len=2, - speculative_model_dir= + speculative_model= f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/", eagle3_one_model=True) with LLM( @@ -4479,7 +4478,7 @@ def test_eagle3_4gpus(self, moe_backend, one_model, overlap_scheduler, eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3" draft_len = 3 spec_config = EagleDecodingConfig(max_draft_len=draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=one_model, allow_advanced_sampling=True) @@ -4545,7 +4544,7 @@ def test_eagle3_vswa_reuse_4gpus(self, one_model, mocker): eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3" draft_len = 3 spec_config = EagleDecodingConfig(max_draft_len=draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=one_model, allow_advanced_sampling=True) @@ -4609,7 +4608,7 @@ def test_eagle3_guided_decoding_4gpus(self, one_model, mocker): eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3" draft_len = 3 spec_config = EagleDecodingConfig(max_draft_len=draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=one_model, allow_advanced_sampling=True) @@ -4668,7 +4667,7 @@ def test_eagle3_2gpus(self, moe_backend, one_model, overlap_scheduler, eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3" draft_len = 3 spec_config = EagleDecodingConfig(max_draft_len=draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=one_model) max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN @@ -5147,7 +5146,7 @@ def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp, if eagle3: spec_config = EagleDecodingConfig( max_draft_len=2, - speculative_model_dir= + speculative_model= f"{llm_models_root()}/Mistral-Large-3-675B/Mistral-Large-3-675B-Instruct-2512-Eagle/", eagle3_one_model=True, eagle3_model_arch="mistral_large3") @@ -5198,7 +5197,7 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, if eagle3: spec_config = EagleDecodingConfig( max_draft_len=2, - speculative_model_dir= + speculative_model= f"{llm_models_root()}/Mistral-Large-3-675B/Mistral-Large-3-675B-Instruct-2512-Eagle/", eagle3_one_model=True, eagle3_model_arch="mistral_large3") diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py index d6b63d3ab3c..4e146f3df0e 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py +++ b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py @@ -400,7 +400,7 @@ def test_disaggregated_spec_dec_batch_slot_limit(model, spec_dec_model_path, # Test whether the batch slots are properly released when using speculative decoding # with disaggregated serving. 
spec_dec_config = EagleDecodingConfig( - speculative_model_dir=model_path(spec_dec_model_path), + speculative_model=model_path(spec_dec_model_path), eagle3_one_model=eagle3_one_model, max_draft_len=3) diff --git a/tests/integration/defs/examples/serve/test_spec_decoding_metrics.py b/tests/integration/defs/examples/serve/test_spec_decoding_metrics.py index a888f50e333..69bebfcfb0d 100644 --- a/tests/integration/defs/examples/serve/test_spec_decoding_metrics.py +++ b/tests/integration/defs/examples/serve/test_spec_decoding_metrics.py @@ -93,7 +93,7 @@ def test_spec_decoding_metrics_eagle3_one_model(): "speculative_config": { "decoding_type": "Eagle", "max_draft_len": 4, - "speculative_model_dir": eagle3_path, + "speculative_model": eagle3_path, "eagle3_one_model": True, }, } @@ -174,7 +174,7 @@ def test_spec_decoding_metrics_eagle3_two_model(): "speculative_config": { "decoding_type": "Eagle", "max_draft_len": 4, - "speculative_model_dir": eagle3_path, + "speculative_model": eagle3_path, "eagle3_one_model": False, # Two-model mode }, } diff --git a/tests/integration/defs/examples/test_ad_speculative_decoding.py b/tests/integration/defs/examples/test_ad_speculative_decoding.py index ddc785841e6..e1492a01535 100644 --- a/tests/integration/defs/examples/test_ad_speculative_decoding.py +++ b/tests/integration/defs/examples/test_ad_speculative_decoding.py @@ -52,14 +52,14 @@ def get_model_paths(): def make_draft_target_config(spec_model_path: str): return DraftTargetDecodingConfig( - max_draft_len=DRAFT_TARGET_MAX_DRAFT_LEN, speculative_model_dir=spec_model_path + max_draft_len=DRAFT_TARGET_MAX_DRAFT_LEN, speculative_model=spec_model_path ) def make_eagle3_config(spec_model_path: str): return EagleDecodingConfig( max_draft_len=EAGLE_MAX_DRAFT_LEN, - speculative_model_dir=spec_model_path, + speculative_model=spec_model_path, eagle3_one_model=False, eagle3_layers_to_capture=None, ) @@ -216,7 +216,7 @@ def test_autodeploy_eagle3_acceptance_rate(): # Configure Eagle3 speculative decoding speculative_config = EagleDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=eagle_model, + speculative_model=eagle_model, eagle3_one_model=False, eagle3_layers_to_capture=None, ) diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py index 2e8ae2bb0db..ca2f90fe8e5 100644 --- a/tests/integration/defs/perf/pytorch_model_config.py +++ b/tests/integration/defs/perf/pytorch_model_config.py @@ -223,7 +223,7 @@ def get_model_yaml_config(model_label: str, 'speculative_config': { 'decoding_type': 'Eagle', 'eagle3_one_model': True, - 'speculative_model_dir': 'Qwen3-4B_eagle3', + 'speculative_model': 'Qwen3-4B_eagle3', 'max_draft_len': 3, }, 'kv_cache_config': { diff --git a/tests/integration/defs/perf/test_perf_sanity.py b/tests/integration/defs/perf/test_perf_sanity.py index 7bad9cf7f40..5d3d38d68d7 100644 --- a/tests/integration/defs/perf/test_perf_sanity.py +++ b/tests/integration/defs/perf/test_perf_sanity.py @@ -209,7 +209,7 @@ def __init__(self, server_config_data: dict, env_vars: str = ""): else: self.eagle3_layers_to_capture = [] self.max_draft_len = speculative_config.get("max_draft_len", 0) - self.speculative_model_dir = speculative_config.get("speculative_model_dir", "") + self.speculative_model = speculative_config.get("speculative_model", "") # match_mode: "config" (default) or "scenario" self.match_mode = server_config_data.get("match_mode", "config") @@ -333,7 +333,7 @@ def to_db_data(self) -> dict: "l_num_nextn_predict_layers": 
self.num_nextn_predict_layers, "s_eagle3_layers_to_capture": ",".join(map(str, self.eagle3_layers_to_capture)), "l_max_draft_len": self.max_draft_len, - "s_speculative_model_dir": self.speculative_model_dir, + "s_speculative_model_dir": self.speculative_model, "s_server_log_link": "", "s_server_env_var": self.env_vars, } @@ -343,15 +343,15 @@ def generate_extra_llm_api_config(self) -> str: """Generate extra-llm-api-config.yml content.""" config_data = dict(self.extra_llm_api_config_data) - # Handle speculative_model_dir path conversion + # Handle speculative_model path conversion if ( "speculative_config" in config_data - and "speculative_model_dir" in config_data["speculative_config"] + and "speculative_model" in config_data["speculative_config"] ): - spec_model_dir = config_data["speculative_config"]["speculative_model_dir"] - if spec_model_dir: - config_data["speculative_config"]["speculative_model_dir"] = os.path.join( - llm_models_root(), spec_model_dir + spec_model = config_data["speculative_config"]["speculative_model"] + if spec_model: + config_data["speculative_config"]["speculative_model"] = os.path.join( + llm_models_root(), spec_model ) return yaml.dump(config_data, default_flow_style=False, sort_keys=False) diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index b55eeb8359d..dc3a385d3df 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -3378,7 +3378,7 @@ def test_eagle3_output_consistency_4gpus(model_dir: str, draft_model_dir: str): # Run with Eagle3 spec_config = EagleDecodingConfig( max_draft_len=3, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=True, ) with LLM(**llm_common_config, speculative_config=spec_config) as llm_spec: diff --git a/tests/scripts/perf-sanity/gpt_oss_120b_fp4_grace_blackwell.yaml b/tests/scripts/perf-sanity/gpt_oss_120b_fp4_grace_blackwell.yaml index cd346ac25f6..d5993c46deb 100644 --- a/tests/scripts/perf-sanity/gpt_oss_120b_fp4_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/gpt_oss_120b_fp4_grace_blackwell.yaml @@ -146,7 +146,7 @@ server_configs: decoding_type: 'Eagle' eagle3_layers_to_capture: [-1] max_draft_len: 3 - speculative_model_dir: "gpt_oss/gpt-oss-120b-Eagle3" + speculative_model: "gpt_oss/gpt-oss-120b-Eagle3" stream_interval: 20 num_postprocess_workers: 4 client_configs: diff --git a/tests/scripts/perf-sanity/run_benchmark_serve.py b/tests/scripts/perf-sanity/run_benchmark_serve.py index 3f16f7273cd..627b5d980dd 100644 --- a/tests/scripts/perf-sanity/run_benchmark_serve.py +++ b/tests/scripts/perf-sanity/run_benchmark_serve.py @@ -218,7 +218,7 @@ def str_to_bool(value: str) -> bool: SPECULATIVE_CONFIG_METRICS = { "decoding_type": (True, str), "max_draft_len": (True, int), - "speculative_model_dir": (True, str), + "speculative_model": (True, str), "eagle3_one_model": (True, str_to_bool), } @@ -259,7 +259,7 @@ def __init__( enable_padding: bool = True, decoding_type: str = "", max_draft_len: int = 0, - speculative_model_dir: str = "", + speculative_model: str = "", eagle3_one_model: bool = False, ): self.name = name @@ -285,7 +285,7 @@ def __init__( self.enable_padding = enable_padding self.decoding_type = decoding_type self.max_draft_len = max_draft_len - self.speculative_model_dir = speculative_model_dir + self.speculative_model = speculative_model self.eagle3_one_model = eagle3_one_model model_dir = get_model_dir(self.model_name) @@ -345,9 +345,9 @@ def generate_extra_llm_api_config(self) -> str: 
config_lines.append(f" decoding_type: {self.decoding_type}") if self.max_draft_len > 0: config_lines.append(f" max_draft_len: {self.max_draft_len}") - if self.speculative_model_dir: + if self.speculative_model: config_lines.append( - f" speculative_model_dir: {self.speculative_model_dir}") + f" speculative_model: {self.speculative_model}") if self.eagle3_one_model: config_lines.append( f" eagle3_one_model: {str(self.eagle3_one_model).lower()}") @@ -500,8 +500,8 @@ def parse_config_file(config_file_path: str, select_pattern: str = None): {}).get('decoding_type', ''), max_draft_len=server_config_data.get('speculative_config', {}).get('max_draft_len', 0), - speculative_model_dir=server_config_data.get( - 'speculative_config', {}).get('speculative_model_dir', ''), + speculative_model=server_config_data.get( + 'speculative_config', {}).get('speculative_model', ''), eagle3_one_model=server_config_data.get( 'speculative_config', {}).get('eagle3_one_model', False)) diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py index 76825732029..e40b25984c4 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py @@ -13,13 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pytest from _model_test_utils import get_small_model_config from build_and_run_ad import ExperimentConfig, main from tensorrt_llm.llmapi import DraftTargetDecodingConfig, KvCacheConfig -def test_ad_speculative_decoding_smoke(): +@pytest.mark.parametrize("use_hf_speculative_model", [False, True]) +def test_ad_speculative_decoding_smoke(use_hf_speculative_model: bool): """Test speculative decoding with AutoDeploy using the build_and_run_ad main().""" # Use a simple test prompt @@ -27,15 +29,14 @@ def test_ad_speculative_decoding_smoke(): # Get base model config experiment_config = get_small_model_config("meta-llama/Meta-Llama-3.1-8B-Instruct") - speculative_model_dir = get_small_model_config("TinyLlama/TinyLlama-1.1B-Chat-v1.0")["args"][ - "model" - ] + speculative_model_hf_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + if use_hf_speculative_model: + speculative_model = speculative_model_hf_id + else: + speculative_model = get_small_model_config(speculative_model_hf_id)["args"]["model"] - print(f"Speculative model path: {speculative_model_dir}") # Configure speculative decoding with a draft model - spec_config = DraftTargetDecodingConfig( - max_draft_len=3, speculative_model_dir=speculative_model_dir - ) + spec_config = DraftTargetDecodingConfig(max_draft_len=3, speculative_model=speculative_model) # Configure KV cache kv_cache_config = KvCacheConfig( diff --git a/tests/unittest/_torch/speculative/test_draft_len_schedule.py b/tests/unittest/_torch/speculative/test_draft_len_schedule.py index dc4aa577646..e64ca7fa538 100644 --- a/tests/unittest/_torch/speculative/test_draft_len_schedule.py +++ b/tests/unittest/_torch/speculative/test_draft_len_schedule.py @@ -77,7 +77,7 @@ def test_correctness_across_batch_sizes(drafter_type: str, schedule: dict): else: spec_config = DraftTargetDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=str(draft_model), + speculative_model=str(draft_model), draft_len_schedule=schedule, ) @@ -123,7 +123,7 @@ def test_correctness_across_batch_sizes(drafter_type: str, schedule: dict): else: 
spec_config_fixed = DraftTargetDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=str(draft_model), + speculative_model=str(draft_model), draft_len_schedule=None, # No schedule - fixed draft length ) llm_fixed = LLM(**llm_common_config, speculative_config=spec_config_fixed) @@ -186,9 +186,7 @@ def test_draft_len_schedule_functionality(drafter_type: str, draft_schedule: dic else: spec_config = DraftTargetDecodingConfig( max_draft_len=5, - speculative_model_dir=str( - llm_models_root() / "llama-3.2-models" / "Llama-3.2-3B-Instruct" - ), + speculative_model=str(llm_models_root() / "llama-3.2-models" / "Llama-3.2-3B-Instruct"), draft_len_schedule=draft_schedule, ) prompts = ["The capital of France is" for i in range(7)] diff --git a/tests/unittest/_torch/speculative/test_draft_target.py b/tests/unittest/_torch/speculative/test_draft_target.py index 9aaa81e8375..6ba477051fd 100644 --- a/tests/unittest/_torch/speculative/test_draft_target.py +++ b/tests/unittest/_torch/speculative/test_draft_target.py @@ -45,7 +45,7 @@ def test_llama_draft_target(use_cuda_graph: bool, attn_backend: str): spec_config = DraftTargetDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=draft_model_dir, + speculative_model=draft_model_dir, ) prompts = [ diff --git a/tests/unittest/_torch/speculative/test_draft_token_prepare_for_generation.py b/tests/unittest/_torch/speculative/test_draft_token_prepare_for_generation.py index 4a75e1b6f4a..352d6b743f7 100644 --- a/tests/unittest/_torch/speculative/test_draft_token_prepare_for_generation.py +++ b/tests/unittest/_torch/speculative/test_draft_token_prepare_for_generation.py @@ -87,7 +87,7 @@ def run_test( spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, max_total_draft_tokens=max_total_draft_tokens, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=False, eagle_choices=eagle_choices, use_dynamic_tree=use_dynamic_tree, diff --git a/tests/unittest/_torch/speculative/test_draft_token_tree_sampling.py b/tests/unittest/_torch/speculative/test_draft_token_tree_sampling.py index 6002d9d6856..689da99cf2b 100644 --- a/tests/unittest/_torch/speculative/test_draft_token_tree_sampling.py +++ b/tests/unittest/_torch/speculative/test_draft_token_tree_sampling.py @@ -38,7 +38,7 @@ def run_test(max_batch_size, draft_layer_id, max_total_draft_tokens, spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, max_total_draft_tokens=max_total_draft_tokens, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=False, eagle_choices=eagle_choices, use_dynamic_tree=use_dynamic_tree, diff --git a/tests/unittest/_torch/speculative/test_draft_token_tree_verification.py b/tests/unittest/_torch/speculative/test_draft_token_tree_verification.py index 8994d90ed61..29a19a04ccc 100644 --- a/tests/unittest/_torch/speculative/test_draft_token_tree_verification.py +++ b/tests/unittest/_torch/speculative/test_draft_token_tree_verification.py @@ -23,7 +23,7 @@ def run_test(eagle_model_dir, max_seq_len, beam_width, use_dynamic_tree, spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, max_total_draft_tokens=max_total_draft_tokens, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=False, eagle_choices=eagle_choices, use_dynamic_tree=use_dynamic_tree, diff --git a/tests/unittest/_torch/speculative/test_dynamic_spec_decode.py b/tests/unittest/_torch/speculative/test_dynamic_spec_decode.py index eeb975bd800..eaa215c81e0 
100644 --- a/tests/unittest/_torch/speculative/test_dynamic_spec_decode.py +++ b/tests/unittest/_torch/speculative/test_dynamic_spec_decode.py @@ -56,7 +56,7 @@ def test_dynamic_spec_decode(enforce_single_worker, spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, # Llama 3 does not support one model eagle. eagle3_one_model=False, ) diff --git a/tests/unittest/_torch/speculative/test_eagle3.py b/tests/unittest/_torch/speculative/test_eagle3.py index a459ae718ff..8502c4d5760 100644 --- a/tests/unittest/_torch/speculative/test_eagle3.py +++ b/tests/unittest/_torch/speculative/test_eagle3.py @@ -92,48 +92,82 @@ def test_kv_lens_runtime_with_eagle3_one_model(): @pytest.mark.parametrize( - "use_cuda_graph,attn_backend,disable_overlap_scheduler,enable_block_reuse,use_one_model,enable_chunked_prefill,use_chain_drafter,multi_batch,attention_dp", + "use_cuda_graph,attn_backend,disable_overlap_scheduler,enable_block_reuse,use_one_model,enable_chunked_prefill,use_chain_drafter,multi_batch,attention_dp,use_hf_speculative_model", [ - [True, "TRTLLM", True, False, False, False, True, False, False], - [True, "TRTLLM", True, False, False, False, False, False, False], - [False, "TRTLLM", True, False, False, False, True, False, False], - [False, "TRTLLM", True, False, False, False, False, False, False], - [True, "FLASHINFER", True, False, False, False, True, False, False], - [False, "FLASHINFER", True, False, False, False, True, False, False], - [False, "TRTLLM", False, True, True, False, True, False, False], - [True, "TRTLLM", False, True, True, False, True, False, False], - [True, "TRTLLM", True, False, True, True, True, False, False], - [True, "TRTLLM", True, False, True, False, True, False, False], - [True, "TRTLLM", True, False, False, True, True, False, False], - [True, "TRTLLM", False, False, False, False, True, False, False], - [False, "TRTLLM", False, False, False, False, True, False, False], - [True, "TRTLLM", False, False, False, False, False, True, False], - [True, "TRTLLM", False, False, False, False, False, True, True], - [False, "TRTLLM", False, False, False, False, False, True, False], - [True, "TRTLLM", False, False, False, False, True, True, False], - [False, "TRTLLM", False, False, False, False, True, True, False], - [True, "TRTLLM", False, False, False, False, False, False, False], - [False, "TRTLLM", False, False, False, False, False, False, False], - [True, "TRTLLM", False, False, False, True, True, False, False], - [True, "TRTLLM", False, False, False, True, False, False, False], - [True, "FLASHINFER", False, False, False, False, True, False, False], - [False, "FLASHINFER", False, False, False, False, True, False, False], + [True, "TRTLLM", True, False, False, False, True, False, False, False], + [True, "TRTLLM", True, False, False, False, False, False, False, False], + [False, "TRTLLM", True, False, False, False, True, False, False, False], + [ + False, "TRTLLM", True, False, False, False, False, False, False, + False + ], + [ + True, "FLASHINFER", True, False, False, False, True, False, False, + False + ], + [ + False, "FLASHINFER", True, False, False, False, True, False, False, + False + ], + [False, "TRTLLM", False, True, True, False, True, False, False, False], + [True, "TRTLLM", False, True, True, False, True, False, False, False], + [True, "TRTLLM", True, False, True, True, True, False, False, False], + [True, "TRTLLM", True, False, True, False, True, False, False, False], + [True, 
"TRTLLM", True, False, False, True, True, False, False, False], + [True, "TRTLLM", False, False, False, False, True, False, False, False], + [ + False, "TRTLLM", False, False, False, False, True, False, False, + False + ], + [True, "TRTLLM", False, False, False, False, False, True, False, False], + [True, "TRTLLM", False, False, False, False, False, True, True, False], + [ + False, "TRTLLM", False, False, False, False, False, True, False, + False + ], + [True, "TRTLLM", False, False, False, False, True, True, False, False], + [False, "TRTLLM", False, False, False, False, True, True, False, False], + [ + True, "TRTLLM", False, False, False, False, False, False, False, + False + ], + [ + False, "TRTLLM", False, False, False, False, False, False, False, + False + ], + [True, "TRTLLM", False, False, False, True, True, False, False, False], + [True, "TRTLLM", False, False, False, True, False, False, False, False], + [ + True, "FLASHINFER", False, False, False, False, True, False, False, + False + ], + [ + False, "FLASHINFER", False, False, False, False, True, False, False, + False + ], + # HF download variant - tests speculative model auto-download from HuggingFace Hub + [False, "TRTLLM", True, False, False, False, True, False, False, True], ]) @pytest.mark.high_cuda_memory def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str, disable_overlap_scheduler: bool, enable_block_reuse: bool, use_one_model: bool, enable_chunked_prefill: bool, use_chain_drafter: bool, multi_batch: bool, - attention_dp: bool, request): + attention_dp: bool, use_hf_speculative_model: bool, + request): # Eagle3 one model works with overlap scheduler and block reuse. total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9 if total_mem_gb < 35: pytest.skip("Not enough memory to load target + draft model") models_path = llm_models_root() - eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B" target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct" + if use_hf_speculative_model: + eagle_model = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" + else: + eagle_model = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B" + # bs > 1 gives non-deterministic when doing IFB. There are slight chances # that ref and spec does not match 100% max_batch_size = 4 if multi_batch else 1 @@ -165,7 +199,7 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str, spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model, # Llama 3 does not support one model eagle. eagle3_one_model=use_one_model, ) @@ -241,7 +275,7 @@ def test_eagle3_spec_decoding_stats(eagle3_one_model): free_gpu_memory_fraction=0.6) spec_config = EagleDecodingConfig( max_draft_len=3, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=eagle3_one_model, ) @@ -321,7 +355,7 @@ def test_llama_eagle3_long_prompt(use_cuda_graph): spec_config = EagleDecodingConfig( max_draft_len=3, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=False, ) @@ -445,7 +479,7 @@ def test_deepseek_eagle3(): spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, # Llama 3 does not support one model eagle. 
eagle3_one_model=use_one_model, eagle3_layers_to_capture={29}, @@ -555,7 +589,7 @@ def test_deepseek_mla_eagle3(): ) spec_config = EagleDecodingConfig(max_draft_len=max_draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=use_one_model, load_format="dummy") @@ -654,7 +688,7 @@ def test_multi_eagle3(use_one_model: bool): spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, # Llama 3 does not support one model eagle. eagle3_one_model=use_one_model, num_eagle_layers=2, @@ -713,7 +747,7 @@ def test_eagle3_cuda_graph_padding(disable_overlap_scheduler: bool): spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=use_one_model, ) @@ -766,7 +800,7 @@ def test_eagle3_cdl_sampling(disable_overlap_scheduler: bool): spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=use_one_model, ) diff --git a/tests/unittest/_torch/speculative/test_kv_cache_reuse.py b/tests/unittest/_torch/speculative/test_kv_cache_reuse.py index 95ed232b969..eb5a720db1a 100644 --- a/tests/unittest/_torch/speculative/test_kv_cache_reuse.py +++ b/tests/unittest/_torch/speculative/test_kv_cache_reuse.py @@ -52,7 +52,7 @@ def test_kv_cache_reuse(use_cuda_graph: bool, attn_backend: str): spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, eagle3_one_model=False, ) diff --git a/tests/unittest/_torch/speculative/test_spec_gate.py b/tests/unittest/_torch/speculative/test_spec_gate.py index b1720f59233..a99654a9c63 100644 --- a/tests/unittest/_torch/speculative/test_spec_gate.py +++ b/tests/unittest/_torch/speculative/test_spec_gate.py @@ -47,7 +47,7 @@ def test_spec_gate_e2e(): spec_config = EagleDecodingConfig( max_draft_len=max_draft_len, - speculative_model_dir=eagle_model_dir, + speculative_model=eagle_model_dir, # Llama 3 does not support one model eagle. 
eagle3_one_model=False, max_concurrency=10000, diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index f8ffe8fc7bd..357d204eabb 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -1218,7 +1218,7 @@ def test_llm_api_medusa(): speculative_config = MedusaDecodingConfig(num_medusa_heads=4, max_draft_len=63, - speculative_model_dir=get_model_path("medusa-vicuna-7b-v1.3"), + speculative_model=get_model_path("medusa-vicuna-7b-v1.3"), medusa_choices=[[0], [0, 0], [1], [0, 1], [2], [0, 0, 0], [1, 0], [0, 2], [3], [0, 3], [4], [0, 4], [2, 0], \ [0, 5], [0, 0, 1], [5], [0, 6], [6], [0, 7], [0, 1, 0], [1, 1], [7], [0, 8], [0, 0, 2], [3, 0], \ [0, 9], [8], [9], [1, 0, 0], [0, 2, 0], [1, 2], [0, 0, 3], [4, 0], [2, 1], [0, 0, 4], [0, 0, 5], \ @@ -1257,7 +1257,7 @@ def test_llm_api_medusa_tp2(): speculative_config = MedusaDecodingConfig(num_medusa_heads=4, max_draft_len=63, - speculative_model_dir=get_model_path("medusa-vicuna-7b-v1.3"), + speculative_model=get_model_path("medusa-vicuna-7b-v1.3"), medusa_choices=[[0], [0, 0], [1], [0, 1], [2], [0, 0, 0], [1, 0], [0, 2], [3], [0, 3], [4], [0, 4], [2, 0], \ [0, 5], [0, 0, 1], [5], [0, 6], [6], [0, 7], [0, 1, 0], [1, 1], [7], [0, 8], [0, 0, 2], [3, 0], \ [0, 9], [8], [9], [1, 0, 0], [0, 2, 0], [1, 2], [0, 0, 3], [4, 0], [2, 1], [0, 0, 4], [0, 0, 5], \ @@ -1295,7 +1295,7 @@ def test_llm_api_eagle(**llm_kwargs): speculative_config = EagleDecodingConfig( max_draft_len=63, - speculative_model_dir=get_model_path("EAGLE-Vicuna-7B-v1.3"), + speculative_model=get_model_path("EAGLE-Vicuna-7B-v1.3"), num_eagle_layers=4, max_non_leaves_per_layer=10, eagle_choices=[[0], [0, 0], [1], [0, 1], [2], [0, 0, 0], [1, 0], [0, 2], [3], [0, 3], [4], [0, 4], [2, 0], \ @@ -1342,7 +1342,7 @@ def test_llm_api_eagle2(**llm_kwargs): speculative_config = EagleDecodingConfig( max_draft_len=63, - speculative_model_dir=get_model_path("EAGLE-Vicuna-7B-v1.3"), + speculative_model=get_model_path("EAGLE-Vicuna-7B-v1.3"), num_eagle_layers=4, max_non_leaves_per_layer=10, use_dynamic_tree=True, diff --git a/tests/unittest/llmapi/test_llm_args.py b/tests/unittest/llmapi/test_llm_args.py index 55c6c7b055b..f3562ae6048 100644 --- a/tests/unittest/llmapi/test_llm_args.py +++ b/tests/unittest/llmapi/test_llm_args.py @@ -445,6 +445,19 @@ def test_dynamic_setattr(self): args = TorchLlmArgs(model=llama_model_path) args.invalid_arg = 1 + def test_speculative_model_alias(self): + """Test that speculative_model_dir is accepted as an alias for speculative_model.""" + + spec_config = EagleDecodingConfig( + max_draft_len=3, + speculative_model_dir="/path/to/model", + eagle3_one_model=False, + ) + + args = TorchLlmArgs(model=llama_model_path, + speculative_config=spec_config) + assert args.speculative_model == "/path/to/model" + class TestTrtLlmArgs: From 5c11e5cd5f7ec9372b1811357881a91ea3ce5d5c Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Wed, 17 Dec 2025 21:28:58 -0800 Subject: [PATCH 02/10] Add logs Signed-off-by: Anish Shanbhag --- tensorrt_llm/llmapi/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorrt_llm/llmapi/utils.py b/tensorrt_llm/llmapi/utils.py index bfc81f7cfdd..88e22fc639c 100644 --- a/tensorrt_llm/llmapi/utils.py +++ b/tensorrt_llm/llmapi/utils.py @@ -224,6 +224,7 @@ def __init__(self, *args, **kwargs): def download_hf_model(model: str, revision: Optional[str] = None) -> Path: ignore_patterns = ["original/**/*"] + logger.info(f"Downloading model {model} from HuggingFace") with get_file_lock(model): 
hf_folder = snapshot_download( model, @@ -231,6 +232,7 @@ def download_hf_model(model: str, revision: Optional[str] = None) -> Path: ignore_patterns=ignore_patterns, revision=revision, tqdm_class=DisabledTqdm) + logger.info(f"Finished downloading model {model} from HuggingFace") return Path(hf_folder) From 64a42b87ed801c2d9e8e1e0f1c7fc0612e890d04 Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Thu, 18 Dec 2025 17:48:45 -0800 Subject: [PATCH 03/10] Mock snapshot_download to avoid download from HF Signed-off-by: Anish Shanbhag --- .../defs/accuracy/test_llm_api_autodeploy.py | 19 +-- tests/test_common/llm_data.py | 115 ++++++++++++++++++ .../_utils_test/_model_test_utils.py | 28 +---- .../singlegpu/models/test_deepseek_patches.py | 6 +- .../singlegpu/test_ad_speculative_decoding.py | 3 + .../_torch/speculative/test_eagle3.py | 4 +- tests/unittest/utils/llm_data.py | 27 +--- 7 files changed, 133 insertions(+), 69 deletions(-) create mode 100644 tests/test_common/llm_data.py diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py index 760164b3ca3..1ee0061cbf0 100644 --- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py +++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py @@ -13,34 +13,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - import pytest +from test_common.llm_data import hf_model_dir_or_hub_id from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM from tensorrt_llm.quantization import QuantAlgo from tensorrt_llm.sampling_params import SamplingParams -from ..conftest import llm_models_root from .accuracy_core import GSM8K, MMLU, CnnDailymail, LlmapiAccuracyTestHarness -def _hf_model_dir_or_hub_id( - hf_model_subdir: str, - hf_hub_id: str, -) -> str: - llm_models_path = llm_models_root() - if llm_models_path and os.path.isdir( - (model_fullpath := os.path.join(llm_models_path, hf_model_subdir))): - return str(model_fullpath) - else: - return hf_hub_id - - class TestLlama3_1_8B(LlmapiAccuracyTestHarness): MODEL_NAME = "meta-llama/Llama-3.1-8B" - MODEL_PATH = _hf_model_dir_or_hub_id("llama-3.1-model/Meta-Llama-3.1-8B", - MODEL_NAME) + MODEL_PATH = hf_model_dir_or_hub_id(MODEL_NAME) def get_default_kwargs(self, enable_chunked_prefill=False): config = { diff --git a/tests/test_common/llm_data.py b/tests/test_common/llm_data.py new file mode 100644 index 00000000000..6792af64144 --- /dev/null +++ b/tests/test_common/llm_data.py @@ -0,0 +1,115 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Shared utilities for local LLM model paths and HuggingFace download mocking.""" + +import os +from functools import wraps +from pathlib import Path +from typing import Optional +from unittest.mock import patch + +# Mapping from HuggingFace Hub ID to local subdirectory under LLM_MODELS_ROOT. +# NOTE: hf_id_to_llm_models_subdir below will fall back to checking if the model name exists +# in LLM_MODELS_ROOT if not present here, so it's not required to exhaustively list all +# models here. +HF_ID_TO_LLM_MODELS_SUBDIR = { + "meta-llama/Meta-Llama-3.1-8B-Instruct": "llama-3.1-model/Llama-3.1-8B-Instruct", + "meta-llama/Llama-3.1-8B-Instruct": "llama-3.1-model/Llama-3.1-8B-Instruct", + "meta-llama/Llama-3.1-8B": "llama-3.1-model/Meta-Llama-3.1-8B", + "TinyLlama/TinyLlama-1.1B-Chat-v1.0": "llama-models-v2/TinyLlama-1.1B-Chat-v1.0", + "meta-llama/Llama-4-Scout-17B-16E-Instruct": "llama4-models/Llama-4-Scout-17B-16E-Instruct", + "mistralai/Mixtral-8x7B-Instruct-v0.1": "Mixtral-8x7B-Instruct-v0.1", + "mistralai/Mistral-Small-3.1-24B-Instruct-2503": "Mistral-Small-3.1-24B-Instruct-2503", + "Qwen/Qwen3-30B-A3B": "Qwen3/Qwen3-30B-A3B", + "Qwen/Qwen2.5-3B-Instruct": "Qwen2.5-3B-Instruct", + "microsoft/Phi-3-mini-4k-instruct": "Phi-3/Phi-3-mini-4k-instruct", + "deepseek-ai/DeepSeek-V3": "DeepSeek-V3", + "deepseek-ai/DeepSeek-R1": "DeepSeek-R1/DeepSeek-R1", + "ibm-ai-platform/Bamba-9B-v2": "Bamba-9B-v2", + "nvidia/NVIDIA-Nemotron-Nano-12B-v2": "NVIDIA-Nemotron-Nano-12B-v2", + "nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3": "NVIDIA-Nemotron-Nano-31B-A3-v3", + "nvidia/Nemotron-Nano-3-30B-A3.5B-dev-1024": "Nemotron-Nano-3-30B-A3.5B-dev-1024", + "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B": "EAGLE3-LLaMA3.1-Instruct-8B", +} + + +def llm_models_root(check: bool = False) -> Optional[Path]: + root = Path("/home/scratch.trt_llm_data/llm-models/") + + if "LLM_MODELS_ROOT" in os.environ: + root = Path(os.environ.get("LLM_MODELS_ROOT")) + + if not root.exists(): + root = Path("/scratch.trt_llm_data/llm-models/") + + if check: + assert root.exists(), ( + "You must set LLM_MODELS_ROOT env or be able to access /home/scratch.trt_llm_data to run this test" + ) + + return root if root.exists() else None + + +def llm_datasets_root() -> str: + return os.path.join(llm_models_root(check=True), "datasets") + + +def hf_id_to_local_model_dir(hf_hub_id: str) -> str | None: + """Return the local model directory under LLM_MODELS_ROOT for a given HuggingFace Hub ID, or None if not found.""" + root = llm_models_root() + if root is None: + return None + + if hf_hub_id in HF_ID_TO_LLM_MODELS_SUBDIR: + return str(root / HF_ID_TO_LLM_MODELS_SUBDIR[hf_hub_id]) + + # Fall back to checking if the model name exists as a top-level directory in LLM_MODELS_ROOT + model_name = hf_hub_id.split("/")[-1] + if os.path.isdir(root / model_name): + return str(root / model_name) + + return None + + +def hf_model_dir_or_hub_id(hf_hub_id: str) -> str: + """Resolve a HuggingFace Hub ID to local path if available, otherwise return the Hub ID.""" + return hf_id_to_local_model_dir(hf_hub_id) or hf_hub_id + + +def mock_snapshot_download(repo_id: str, **kwargs) -> str: + """Mock huggingface_hub.snapshot_download that returns an existing local model directory. + + NOTE: This function does not currently handle the revision / allow_patterns / ignore_patterns parameters. 
+ """ + local_path = hf_id_to_local_model_dir(repo_id) + if local_path is None: + raise ValueError(f"Model '{repo_id}' not found in LLM_MODELS_ROOT") + return local_path + + +def with_mocked_hf_download(func): + """Decorator to mock huggingface_hub.snapshot_download for tests. + + When applied, any calls to snapshot_download will be redirected to use + local model paths from LLM_MODELS_ROOT instead of downloading from HuggingFace. + """ + + @wraps(func) + def wrapper(*args, **kwargs): + with patch("huggingface_hub.snapshot_download", side_effect=mock_snapshot_download): + return func(*args, **kwargs) + + return wrapper diff --git a/tests/unittest/_torch/auto_deploy/_utils_test/_model_test_utils.py b/tests/unittest/_torch/auto_deploy/_utils_test/_model_test_utils.py index a71a09b4652..04adb076dd4 100644 --- a/tests/unittest/_torch/auto_deploy/_utils_test/_model_test_utils.py +++ b/tests/unittest/_torch/auto_deploy/_utils_test/_model_test_utils.py @@ -1,12 +1,11 @@ import copy -import os from typing import Any, Dict, Optional import torch import torch.nn.functional as F +from test_common.llm_data import hf_model_dir_or_hub_id from torch import nn from torch.export import Dim -from utils.llm_data import llm_models_root def apply_rotary_emb(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor: @@ -285,17 +284,6 @@ def generate_dynamic_shapes(max_batch_size, max_seq_len): return dynamic_shapes -def _hf_model_dir_or_hub_id( - hf_model_subdir: str, - hf_hub_id: str, -) -> str: - llm_models_path = llm_models_root() - if llm_models_path and os.path.isdir((model_fullpath := llm_models_path / hf_model_subdir)): - return str(model_fullpath) - else: - return hf_hub_id - - def rotate_half(x: torch.Tensor) -> torch.Tensor: x1 = x[..., : x.shape[-1] // 2] x2 = x[..., x.shape[-1] // 2 :] @@ -351,7 +339,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): _SMALL_MODEL_CONFIGS = { "meta-llama/Meta-Llama-3.1-8B-Instruct": { - "llm_models_subdir": "llama-3.1-model/Llama-3.1-8B-Instruct", "model_kwargs": { "num_hidden_layers": 1, "hidden_size": 64, @@ -361,7 +348,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "mistralai/Mixtral-8x7B-Instruct-v0.1": { - "llm_models_subdir": "Mixtral-8x7B-Instruct-v0.1", "model_kwargs": { "num_hidden_layers": 2, "intermediate_size": 256, @@ -372,7 +358,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "Qwen/Qwen3-30B-A3B": { - "llm_models_subdir": "Qwen3/Qwen3-30B-A3B", "model_kwargs": { "num_hidden_layers": 2, "intermediate_size": 256, @@ -383,7 +368,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "microsoft/Phi-3-mini-4k-instruct": { - "llm_models_subdir": "Phi-3/Phi-3-mini-4k-instruct", "model_kwargs": { "num_hidden_layers": 2, "hidden_size": 128, @@ -393,7 +377,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "meta-llama/Llama-4-Scout-17B-16E-Instruct": { - "llm_models_subdir": "llama4-models/Llama-4-Scout-17B-16E-Instruct", "model_factory": "AutoModelForImageTextToText", "model_kwargs": { "text_config": { @@ -412,7 +395,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "deepseek-ai/DeepSeek-V3": { - "llm_models_subdir": "DeepSeek-V3", "model_kwargs": { "first_k_dense_replace": 1, "num_hidden_layers": 2, @@ -431,7 +413,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "Qwen/Qwen2.5-3B-Instruct": { - "llm_models_subdir": 
"Qwen2.5-3B-Instruct", "model_kwargs": { "num_hidden_layers": 2, "hidden_size": 64, @@ -441,7 +422,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "mistralai/Mistral-Small-3.1-24B-Instruct-2503": { - "llm_models_subdir": "Mistral-Small-3.1-24B-Instruct-2503", "model_factory": "AutoModelForImageTextToText", "model_kwargs": { "text_config": { @@ -463,7 +443,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "ibm-ai-platform/Bamba-9B-v2": { - "llm_models_subdir": "Bamba-9B-v2", "model_kwargs": { "dtype": "bfloat16", "hidden_size": 64, @@ -482,7 +461,6 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "nvidia/NVIDIA-Nemotron-Nano-12B-v2": { - "llm_models_subdir": "NVIDIA-Nemotron-Nano-12B-v2", "model_kwargs": { "dtype": "bfloat16", "hidden_size": 32, @@ -497,13 +475,11 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1): }, }, "TinyLlama/TinyLlama-1.1B-Chat-v1.0": { - "llm_models_subdir": "llama-models-v2/TinyLlama-1.1B-Chat-v1.0", "model_kwargs": { "num_hidden_layers": 2, }, }, "nvidia/Nemotron-Nano-3-30B-A3.5B-dev-1024": { - "llm_models_subdir": "Nemotron-Nano-3-30B-A3.5B-dev-1024", "model_kwargs": { "num_hidden_layers": 8, }, @@ -531,7 +507,7 @@ def get_small_model_config(model_hub_id: str, **llm_args_kwargs) -> Dict[str, An llm_args = copy.deepcopy(_SMALL_MODEL_CONFIGS[model_hub_id]) # check if should use llm_models_root or hf_hub_id - llm_args["model"] = _hf_model_dir_or_hub_id(llm_args.pop("llm_models_subdir"), model_hub_id) + llm_args["model"] = hf_model_dir_or_hub_id(model_hub_id) # add some defaults to llm_args llm_args["skip_loading_weights"] = True # No weight loading to speed up things diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_deepseek_patches.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_deepseek_patches.py index bbfd0c95f50..cf852a71111 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_deepseek_patches.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_deepseek_patches.py @@ -4,7 +4,7 @@ import pytest import torch -from _model_test_utils import _hf_model_dir_or_hub_id +from test_common.llm_data import hf_model_dir_or_hub_id from transformers import AutoConfig, AutoModelForCausalLM from tensorrt_llm._torch.auto_deploy.models.patches.deepseek import ( @@ -77,7 +77,7 @@ def _generate_ds_attention_mask(b, s): "model_name, module_name, patch, inputs", [ pytest.param( - _hf_model_dir_or_hub_id("DeepSeek-R1/DeepSeek-R1", "deepseek-ai/DeepSeek-R1"), + hf_model_dir_or_hub_id("deepseek-ai/DeepSeek-R1"), "model.layers.0.self_attn", deepseek_v3_attention, [ @@ -87,7 +87,7 @@ def _generate_ds_attention_mask(b, s): ], ), # attention requires inputs [hidden_states, attention_mask, position_ids] pytest.param( - _hf_model_dir_or_hub_id("DeepSeek-R1/DeepSeek-R1", "deepseek-ai/DeepSeek-R1"), + hf_model_dir_or_hub_id("deepseek-ai/DeepSeek-R1"), "model.layers.0.mlp", deepseek_v3_moe_exact, [torch.randn(2, 6, 8, dtype=torch.bfloat16)], diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py index e40b25984c4..81481e8f51d 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py @@ -16,11 +16,13 @@ import pytest from _model_test_utils 
import get_small_model_config from build_and_run_ad import ExperimentConfig, main +from test_common.llm_data import with_mocked_hf_download from tensorrt_llm.llmapi import DraftTargetDecodingConfig, KvCacheConfig @pytest.mark.parametrize("use_hf_speculative_model", [False, True]) +@with_mocked_hf_download def test_ad_speculative_decoding_smoke(use_hf_speculative_model: bool): """Test speculative decoding with AutoDeploy using the build_and_run_ad main().""" @@ -31,6 +33,7 @@ def test_ad_speculative_decoding_smoke(use_hf_speculative_model: bool): experiment_config = get_small_model_config("meta-llama/Meta-Llama-3.1-8B-Instruct") speculative_model_hf_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" if use_hf_speculative_model: + # NOTE: this will still mock out the actual HuggingFace download speculative_model = speculative_model_hf_id else: speculative_model = get_small_model_config(speculative_model_hf_id)["args"]["model"] diff --git a/tests/unittest/_torch/speculative/test_eagle3.py b/tests/unittest/_torch/speculative/test_eagle3.py index 8502c4d5760..c2d4cf50f4c 100644 --- a/tests/unittest/_torch/speculative/test_eagle3.py +++ b/tests/unittest/_torch/speculative/test_eagle3.py @@ -8,6 +8,7 @@ import pytest import torch +from test_common.llm_data import with_mocked_hf_download from utils.llm_data import llm_models_root from tensorrt_llm import LLM, SamplingParams @@ -145,10 +146,11 @@ def test_kv_lens_runtime_with_eagle3_one_model(): False, "FLASHINFER", False, False, False, False, True, False, False, False ], - # HF download variant - tests speculative model auto-download from HuggingFace Hub + # Tests (mocked) speculative model auto-download from HuggingFace [False, "TRTLLM", True, False, False, False, True, False, False, True], ]) @pytest.mark.high_cuda_memory +@with_mocked_hf_download def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str, disable_overlap_scheduler: bool, enable_block_reuse: bool, use_one_model: bool, enable_chunked_prefill: bool, diff --git a/tests/unittest/utils/llm_data.py b/tests/unittest/utils/llm_data.py index c5953c2a768..118ba2fe05e 100644 --- a/tests/unittest/utils/llm_data.py +++ b/tests/unittest/utils/llm_data.py @@ -1,23 +1,6 @@ -import os -from pathlib import Path -from typing import Optional +from test_common.llm_data import llm_datasets_root, llm_models_root - -def llm_models_root(check=False) -> Optional[Path]: - root = Path("/home/scratch.trt_llm_data/llm-models/") - - if "LLM_MODELS_ROOT" in os.environ: - root = Path(os.environ.get("LLM_MODELS_ROOT")) - - if not root.exists(): - root = Path("/scratch.trt_llm_data/llm-models/") - - if check: - assert root.exists(), \ - "You shall set LLM_MODELS_ROOT env or be able to access /home/scratch.trt_llm_data to run this test" - - return root if root.exists() else None - - -def llm_datasets_root() -> str: - return os.path.join(llm_models_root(check=True), "datasets") +__all__ = [ + "llm_datasets_root", + "llm_models_root", +] From 213d74acc118735b89ac0ed71ad3149be889658c Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Thu, 18 Dec 2025 18:14:36 -0800 Subject: [PATCH 04/10] Move download to shared helper Signed-off-by: Anish Shanbhag --- tensorrt_llm/llmapi/llm_utils.py | 38 ++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/tensorrt_llm/llmapi/llm_utils.py b/tensorrt_llm/llmapi/llm_utils.py index 4efb06dd56f..3d1a33755bd 100644 --- a/tensorrt_llm/llmapi/llm_utils.py +++ b/tensorrt_llm/llmapi/llm_utils.py @@ -638,6 +638,23 @@ def _submit_to_all_workers( else: 
return [task(*args, **kwargs)] + def _download_hf_model_if_needed(self, + model_obj: _ModelWrapper, + revision: Optional[str] = None) -> Path: + """Download a model from HF hub if needed. + + Also updates the model_obj.model_dir with the local model dir on rank 0. + """ + if model_obj.is_hub_model: + model_dirs = self._submit_to_all_workers( + CachedModelLoader._node_download_hf_model, + model=model_obj.model_name, + revision=revision) + model_dir = model_dirs[0] + model_obj.model_dir = model_dir + return model_dir + return model_obj.model_dir + def __call__(self) -> Tuple[Path, Union[Path, None]]: if self.llm_args.model_format is _ModelFormatKind.TLLM_ENGINE: @@ -648,14 +665,9 @@ def __call__(self) -> Tuple[Path, Union[Path, None]]: self.model_loader = ModelLoader(self.llm_args) # Download speculative model from HuggingFace if needed - if (self.model_loader.speculative_model_obj is not None - and self.model_loader.speculative_model_obj.is_hub_model): - spec_model_dirs = self._submit_to_all_workers( - CachedModelLoader._node_download_hf_model, - model=self.model_loader.speculative_model_obj.model_name, - revision=None) - spec_model_dir = spec_model_dirs[0] - self.model_loader.speculative_model_obj.model_dir = spec_model_dir + if self.model_loader.speculative_model_obj is not None: + spec_model_dir = self._download_hf_model_if_needed( + self.model_loader.speculative_model_obj) # Update llm_args so PyTorch/AutoDeploy executor gets the local path if self.llm_args.speculative_config is not None: self.llm_args.speculative_config.speculative_model = spec_model_dir @@ -668,14 +680,8 @@ def __call__(self) -> Tuple[Path, Union[Path, None]]: raise ValueError( f'backend {self.llm_args.backend} is not supported.') - if self.model_loader.model_obj.is_hub_model: - hf_model_dirs = self._submit_to_all_workers( - CachedModelLoader._node_download_hf_model, - model=self.model_loader.model_obj.model_name, - revision=self.llm_args.revision) - self._hf_model_dir = hf_model_dirs[0] - else: - self._hf_model_dir = self.model_loader.model_obj.model_dir + self._hf_model_dir = self._download_hf_model_if_needed( + self.model_loader.model_obj, revision=self.llm_args.revision) if self.llm_args.quant_config.quant_algo is not None: logger.warning( From 50abc5dac3003d0d558013cdaea46d1a4cac85b2 Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Mon, 22 Dec 2025 11:41:35 -0800 Subject: [PATCH 05/10] Add missing import Signed-off-by: Anish Shanbhag --- tests/integration/defs/accuracy/test_llm_api_autodeploy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py index 1ee0061cbf0..508a26cb3dc 100644 --- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py +++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py @@ -14,7 +14,7 @@ # limitations under the License. 
import pytest -from test_common.llm_data import hf_model_dir_or_hub_id +from test_common.llm_data import hf_model_dir_or_hub_id, llm_models_root from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM from tensorrt_llm.quantization import QuantAlgo From 26c4dd9526767c54aaf2effd91323ee213531378 Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Mon, 22 Dec 2025 18:08:22 -0800 Subject: [PATCH 06/10] Fixes Signed-off-by: Anish Shanbhag --- tensorrt_llm/llmapi/llm_args.py | 2 +- tensorrt_llm/llmapi/llm_utils.py | 25 +++++++++++++------------ tests/unittest/utils/llm_data.py | 9 +++++++++ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 6f65d77d92c..dee8938a984 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -656,7 +656,7 @@ class DecodingBaseConfig(StrictBaseModel): # which will be automatically downloaded. # - A local filesystem path to a downloaded model directory. speculative_model: Optional[Union[str, Path]] = Field( - default=None, alias="speculative_model_dir") + default=None, validation_alias="speculative_model_dir") # PyTorch only. # When specified, speculation will be disabled at batch sizes above diff --git a/tensorrt_llm/llmapi/llm_utils.py b/tensorrt_llm/llmapi/llm_utils.py index 3d1a33755bd..32cc8b92e94 100644 --- a/tensorrt_llm/llmapi/llm_utils.py +++ b/tensorrt_llm/llmapi/llm_utils.py @@ -125,7 +125,7 @@ def __init__(self, Path] = self.model_obj.model_dir if self.model_obj.is_local_model else None self._speculative_model_dir: Optional[ - Path] = self.speculative_model_obj.model_dir if self.speculative_model_obj is not None and self.model_obj.is_local_model else None + Path] = self.speculative_model_obj.model_dir if self.speculative_model_obj is not None and self.speculative_model_obj.is_local_model else None self._model_info: Optional[_ModelInfo] = None self._model_format = self.llm_args.model_format @@ -660,21 +660,22 @@ def __call__(self) -> Tuple[Path, Union[Path, None]]: if self.llm_args.model_format is _ModelFormatKind.TLLM_ENGINE: return Path(self.llm_args.model), None - self.engine_cache_stage: Optional[CachedStage] = None - self._hf_model_dir = None - self.model_loader = ModelLoader(self.llm_args) - - # Download speculative model from HuggingFace if needed - if self.model_loader.speculative_model_obj is not None: - spec_model_dir = self._download_hf_model_if_needed( - self.model_loader.speculative_model_obj) - # Update llm_args so PyTorch/AutoDeploy executor gets the local path - if self.llm_args.speculative_config is not None: - self.llm_args.speculative_config.speculative_model = spec_model_dir + # Download speculative model from HuggingFace if needed (all backends) + if (self.llm_args.speculative_config is not None and + self.llm_args.speculative_config.speculative_model is not None): + spec_model_obj = _ModelWrapper( + self.llm_args.speculative_config.speculative_model) + spec_model_dir = self._download_hf_model_if_needed(spec_model_obj) + self.llm_args.speculative_config.speculative_model = spec_model_dir + # AutoDeploy doesn't use ModelLoader if self.llm_args.backend == "_autodeploy": return None, "" + self.engine_cache_stage: Optional[CachedStage] = None + self._hf_model_dir = None + self.model_loader = ModelLoader(self.llm_args) + if self.llm_args.backend is not None: if self.llm_args.backend not in ["pytorch", "_autodeploy"]: raise ValueError( diff --git a/tests/unittest/utils/llm_data.py b/tests/unittest/utils/llm_data.py index 
118ba2fe05e..fd1bd15ca11 100644 --- a/tests/unittest/utils/llm_data.py +++ b/tests/unittest/utils/llm_data.py @@ -1,3 +1,12 @@ +import os +import sys + +# Ensure tests/ directory is in path for test_common imports +sys.path.insert( + 0, + os.path.dirname(os.path.dirname(os.path.dirname( + os.path.abspath(__file__))))) + from test_common.llm_data import llm_datasets_root, llm_models_root __all__ = [ From e9cee86bcd62777afc4e4dbdbe5b11d04f41f36c Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Mon, 22 Dec 2025 22:37:01 -0800 Subject: [PATCH 07/10] Use AliasChoices Signed-off-by: Anish Shanbhag --- tensorrt_llm/llmapi/llm_args.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index dee8938a984..c92c429e624 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -13,7 +13,7 @@ import torch import yaml -from pydantic import BaseModel +from pydantic import AliasChoices, BaseModel from pydantic import Field as PydanticField from pydantic import PrivateAttr, field_validator, model_validator from strenum import StrEnum @@ -656,7 +656,9 @@ class DecodingBaseConfig(StrictBaseModel): # which will be automatically downloaded. # - A local filesystem path to a downloaded model directory. speculative_model: Optional[Union[str, Path]] = Field( - default=None, validation_alias="speculative_model_dir") + default=None, + validation_alias=AliasChoices("speculative_model", + "speculative_model_dir")) # PyTorch only. # When specified, speculation will be disabled at batch sizes above From 6ebb1b903713eaf9ce23b993bed636cf99594d09 Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Tue, 30 Dec 2025 15:26:40 -0800 Subject: [PATCH 08/10] Use model validators for eagle config Signed-off-by: Anish Shanbhag --- tensorrt_llm/llmapi/llm_args.py | 69 ++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index c92c429e624..7f47fd6a077 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -865,28 +865,29 @@ class EagleDecodingConfig(DecodingBaseConfig): # choices: llama3, mistral_large3 eagle3_model_arch: str = "llama3" - def __init__(self, **kwargs): - super().__init__() - for attr_name, attr_value in kwargs.items(): - if attr_name == 'max_draft_len': - self.num_eagle_layers = attr_value - self.max_total_draft_tokens = attr_value # If using linear-tree, the max_total_draft_tokens is the same as max_draft_len - # Convert the data type of Eagle choice from str to List[List[int]] - if attr_name == 'eagle_choices' and attr_value is not None: - logger.warning( - "NOTE: The Draft token tree is still under development, PLEASE DO NOT USE IT !!!" - ) - if not isinstance(attr_value, list): - if isinstance(attr_value, str): - attr_value = ast.literal_eval( - attr_value.replace(" ", "")) - else: - raise ValueError( - "Wrong eagle choices type. Eagle choices should be a List[List[int]] or a string like [[0], [1], [2], [0, 0], [0, 1]]." - ) - setattr(self, attr_name, attr_value) + @field_validator('eagle_choices', mode='before') + @classmethod + def validate_eagle_choices(cls, v): + if v is not None: + logger.warning( + "NOTE: The Draft token tree is still under development, PLEASE DO NOT USE IT !!!" + ) + if not isinstance(v, list): + if isinstance(v, str): + v = ast.literal_eval(v.replace(" ", "")) + else: + raise ValueError( + "Wrong eagle choices type. 
Eagle choices should be a List[List[int]] or a string like [[0], [1], [2], [0, 0], [0, 1]]." + ) + return v + + @model_validator(mode='after') + def validate_eagle_config(self) -> 'EagleDecodingConfig': + if self.max_draft_len is None: + raise ValueError("max_draft_len is required for Eagle") + self.num_eagle_layers = self.max_draft_len + self.max_total_draft_tokens = self.max_draft_len # If using linear-tree, the max_total_draft_tokens is the same as max_draft_len - assert self.max_draft_len is not None, "max_draft_len is required for Eagle" if self.eagle3_model_arch == "mistral_large3" and self.eagle3_layers_to_capture is None: # FIXME find a better way to setup it. self.eagle3_layers_to_capture = {-1} @@ -896,7 +897,10 @@ def __init__(self, **kwargs): # and reset the max_draft_len and num_eagle_layers if necessary if self.eagle_choices is not None: # If eagle_choices is provided, use_dynamic_tree should not be used - assert not self.use_dynamic_tree, "If eagle_choices is provided, use_dynamic_tree need to be False" + if self.use_dynamic_tree: + raise ValueError( + "If eagle_choices is provided, use_dynamic_tree need to be False" + ) # Get num_eagle_layers from eagle_choices num_eagle_layers_from_choices = self.check_eagle_choices() @@ -913,10 +917,23 @@ def __init__(self, **kwargs): # Dynamic tree logic if self.use_dynamic_tree: - assert self.eagle_choices is None, "If use_dynamic_tree is True, eagle_choices should be None" - assert self.max_draft_len is not None and self.max_draft_len > 0, "max_draft_len should be provided, which indicates the number of drafter layers" - assert self.dynamic_tree_max_topK is not None and self.dynamic_tree_max_topK > 0, "dynamic_tree_max_topK should be provided, which indicates the number of nodes to expand each time" - assert self.max_total_draft_tokens is not None and self.max_total_draft_tokens > 0, "max_total_draft_tokens should be provided, which indicates the total nodes of the final draft tree. (exclude the root node)" + if self.eagle_choices is not None: + raise ValueError( + "If use_dynamic_tree is True, eagle_choices should be None") + if self.max_draft_len is None or self.max_draft_len <= 0: + raise ValueError( + "max_draft_len should be provided, which indicates the number of drafter layers" + ) + if self.dynamic_tree_max_topK is None or self.dynamic_tree_max_topK <= 0: + raise ValueError( + "dynamic_tree_max_topK should be provided, which indicates the number of nodes to expand each time" + ) + if self.max_total_draft_tokens is None or self.max_total_draft_tokens <= 0: + raise ValueError( + "max_total_draft_tokens should be provided, which indicates the total nodes of the final draft tree. 
(exclude the root node)" + ) + + return self @classmethod def from_dict(cls, data: dict): From e0b1bfa9b0c802c8964fe2cb08153807ec187eef Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Mon, 5 Jan 2026 11:01:01 -0800 Subject: [PATCH 09/10] Remove redundant speculative model download for TRTLLM backend Signed-off-by: Anish Shanbhag --- tensorrt_llm/llmapi/llm_args.py | 28 +++------------------------- tensorrt_llm/llmapi/llm_utils.py | 20 +++----------------- 2 files changed, 6 insertions(+), 42 deletions(-) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 7f47fd6a077..7a2ba8b2ed2 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -17,6 +17,7 @@ from pydantic import Field as PydanticField from pydantic import PrivateAttr, field_validator, model_validator from strenum import StrEnum + from transformers import PreTrainedTokenizerBase try: @@ -2143,9 +2144,6 @@ def coerce_env_overrides_to_str(cls, v): _parallel_config: Optional[_ParallelConfig] = PrivateAttr(default=None) _model_format: Optional[_ModelFormatKind] = PrivateAttr(default=None) - _speculative_model: Optional[str] = PrivateAttr(default=None) - _speculative_model_format: Optional[_ModelFormatKind] = PrivateAttr( - default=None) @property def parallel_config(self) -> _ParallelConfig: @@ -2156,12 +2154,8 @@ def model_format(self) -> _ModelFormatKind: return self._model_format @property - def speculative_model(self) -> Optional[str]: - return self._speculative_model - - @property - def speculative_model_format(self) -> _ModelFormatKind: - return self._speculative_model_format + def speculative_model(self) -> Optional[Union[str, Path]]: + return self.speculative_config.speculative_model if self.speculative_config is not None else None @classmethod def from_kwargs(cls, **kwargs: Any) -> "BaseLlmArgs": @@ -2552,14 +2546,6 @@ def validate_speculative_config(self): else: self.decoding_config = None - self._speculative_model = getattr(self.speculative_config, - "speculative_model", None) - speculative_model_obj = _ModelWrapper( - self._speculative_model - ) if self._speculative_model is not None else None - if self._speculative_model and speculative_model_obj.is_local_model: - self._speculative_model_format = _ModelFormatKind.HF - return self def _load_config_from_engine(self, engine_dir: Path): @@ -3081,14 +3067,6 @@ def validate_speculative_config(self): else: self.decoding_config = None - self._speculative_model = getattr(self.speculative_config, - "speculative_model", None) - speculative_model_obj = _ModelWrapper( - self._speculative_model - ) if self._speculative_model is not None else None - if self._speculative_model and speculative_model_obj.is_local_model: - self._speculative_model_format = _ModelFormatKind.HF - return self @model_validator(mode="after") diff --git a/tensorrt_llm/llmapi/llm_utils.py b/tensorrt_llm/llmapi/llm_utils.py index 32cc8b92e94..f6f3ebb05f2 100644 --- a/tensorrt_llm/llmapi/llm_utils.py +++ b/tensorrt_llm/llmapi/llm_utils.py @@ -9,10 +9,11 @@ from typing import Any, Callable, List, Optional, Tuple, Union import torch -import transformers from pydantic import BaseModel from tqdm import tqdm +import transformers + from .._utils import (global_mpi_rank, local_mpi_rank, mpi_barrier, mpi_broadcast, mpi_rank, release_gc) # yapf: disable @@ -145,9 +146,7 @@ def _gather_build_steps(self): return if (self.model_obj.is_hub_model - and self._model_format is not _ModelFormatKind.TLLM_ENGINE) or ( - self.speculative_model_obj - and 
self.speculative_model_obj.is_hub_model): + and self._model_format is not _ModelFormatKind.TLLM_ENGINE): # Download HF model if necessary if self.model_obj.model_name is None: raise ValueError( @@ -305,31 +304,18 @@ def save( def _download_hf_model(self): ''' Download HF model from third-party model hub like www.modelscope.cn or huggingface. ''' model_dir = None - speculative_model_dir = None # Only the rank0 are allowed to download model if mpi_rank() == 0: assert self._workspace is not None assert isinstance(self.model_obj.model_name, str) # this will download only once when multiple MPI processes are running - model_dir = download_hf_model(self.model_obj.model_name, revision=self.llm_args.revision) print_colored(f"Downloaded model to {model_dir}\n", 'grey') - if self.speculative_model_obj: - speculative_model_dir = download_hf_model( - self.speculative_model_obj.model_name) - print_colored(f"Downloaded model to {speculative_model_dir}\n", - 'grey') # Make all the processes got the same model_dir self._model_dir = mpi_broadcast(model_dir, root=0) self.model_obj.model_dir = self._model_dir # mark as a local model assert self.model_obj.is_local_model - if self.speculative_model_obj: - self._speculative_model_dir = mpi_broadcast(speculative_model_dir, - root=0) - self.speculative_model_obj.model_dir = self._speculative_model_dir - - assert self.speculative_model_obj.is_local_model def _update_from_hf_quant_config(self) -> bool: """Update quant_config from the config file of pre-quantized HF checkpoint. From 2594dc2b24ac983b0e365ca711bfa90f692ecf09 Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Mon, 5 Jan 2026 11:31:33 -0800 Subject: [PATCH 10/10] Fix import sorting Signed-off-by: Anish Shanbhag --- tensorrt_llm/llmapi/llm_args.py | 1 - tensorrt_llm/llmapi/llm_utils.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 7a2ba8b2ed2..c6ca63a57fc 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -17,7 +17,6 @@ from pydantic import Field as PydanticField from pydantic import PrivateAttr, field_validator, model_validator from strenum import StrEnum - from transformers import PreTrainedTokenizerBase try: diff --git a/tensorrt_llm/llmapi/llm_utils.py b/tensorrt_llm/llmapi/llm_utils.py index f6f3ebb05f2..f687c03ddd6 100644 --- a/tensorrt_llm/llmapi/llm_utils.py +++ b/tensorrt_llm/llmapi/llm_utils.py @@ -9,11 +9,10 @@ from typing import Any, Callable, List, Optional, Tuple, Union import torch +import transformers from pydantic import BaseModel from tqdm import tqdm -import transformers - from .._utils import (global_mpi_rank, local_mpi_rank, mpi_barrier, mpi_broadcast, mpi_rank, release_gc) # yapf: disable
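
---

A minimal sketch of how the helpers added in `tests/test_common/llm_data.py` (PATCH 03/10) compose in a test; the test name and assertion below are illustrative only and are not taken from the series:

```python
# Hypothetical test sketch, not part of the patch series.
from test_common.llm_data import hf_model_dir_or_hub_id, with_mocked_hf_download


@with_mocked_hf_download  # patches huggingface_hub.snapshot_download to resolve locally
def test_resolves_models_without_network():
    # Returns the LLM_MODELS_ROOT checkout when it exists, otherwise the Hub ID;
    # with the decorator active, any downstream snapshot_download() call is also
    # redirected to the local directory instead of downloading from HuggingFace.
    model = hf_model_dir_or_hub_id("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    assert model  # either a local path under LLM_MODELS_ROOT or the Hub ID string
```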