Skip to content

Commit c3a3a01

Browse files
govind-ramnarayan, 2ez4bz, and lucaslie
authored and committed
[NVIDIA#8245][feat] Autodeploy: Guided Decoding Support (NVIDIA#8551)
Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
Signed-off-by: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com>
Signed-off-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
Co-authored-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
Co-authored-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
1 parent ae4a862 commit c3a3a01

File tree

10 files changed

+412
-16
lines changed

10 files changed

+412
-16
lines changed

examples/auto_deploy/build_and_run_ad.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
)
1616

1717
from tensorrt_llm._torch.auto_deploy import LLM, AutoDeployConfig, DemoLLM
18+
from tensorrt_llm._torch.auto_deploy.llm_args import LlmArgs
1819
from tensorrt_llm._torch.auto_deploy.utils._config import (
1920
DynamicYamlMixInForSettings,
2021
deep_merge_dicts,
@@ -139,9 +140,10 @@ class ExperimentConfig(DynamicYamlMixInForSettings, BaseSettings):
139140

140141
### CORE ARGS ##################################################################################
141142
# The main AutoDeploy arguments - contains model, tokenizer, backend configs, etc.
142-
args: AutoDeployConfig = Field(
143+
args: LlmArgs = Field(
143144
description="The main AutoDeploy arguments containing model, tokenizer, backend configs, etc. "
144-
"Please check `tensorrt_llm._torch.auto_deploy.llm_args.AutoDeployConfig` for more details."
145+
"Contains all the fields from `AutoDeployConfig` and `BaseLlmArgs`. "
146+
"Please check `tensorrt_llm._torch.auto_deploy.llm_args.LlmArgs` for more details."
145147
)
146148

147149
# Optional model field for convenience - if provided, will be used to initialize args.model
@@ -304,6 +306,7 @@ def main(config: Optional[ExperimentConfig] = None):
304306
store_benchmark_results(results, config.benchmark.results_path)
305307

306308
llm.shutdown()
309+
return results
307310

308311

309312
if __name__ == "__main__":

tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ def __init__(
8787
max_batch_size: int = 1,
8888
page_size: int = 0,
8989
max_num_tokens: Optional[int] = None,
90+
vocab_size_padded: Optional[int] = None,
9091
):
9192
"""Initialize the SequenceInfo object.
9293
@@ -104,14 +105,15 @@ def __init__(
104105
batch is min (max_batch_size, max_num_tokens // ISL). Similarly, if a batch is
105106
composed of generate-only requests, then the maximum number of sequences possible in
106107
the batch is min (max_batch_size, max_num_tokens).
107-
108+
vocab_size_padded: corresponds to the padded vocabulary size of the model.
108109
Returns:
109110
None
110111
"""
111112
# set up basic attributes
112113
self.max_seq_len = max_seq_len
113114
self.max_batch_size = max_batch_size
114115
self.page_size = page_size if page_size > 0 else max_seq_len
116+
self.vocab_size_padded = vocab_size_padded
115117

116118
# NOTE (lucaslie): WAR to address issue when using flashinfer attention with
117119
# (max_batch_size, max_seq_len) input in trtllm runtime.

tensorrt_llm/_torch/auto_deploy/models/factory.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,18 @@ def tokenizer(self) -> Optional[str]:
131131
"""The tokenizer path."""
132132
return self._prefetched_tokenizer_path or self._tokenizer or self.model
133133

134+
@property
135+
def vocab_size_padded(self) -> Optional[int]:
136+
"""Return the padded vocabulary size of the model.
137+
138+
This is needed for guided decoding in the pyexecutor. If the factory does not support this,
139+
then this method should return None.
140+
141+
Returns:
142+
The padded vocabulary size of the model.
143+
"""
144+
return None
145+
134146
def build_model(self, device: str) -> nn.Module:
135147
"""Build the model on the desired device.
136148
@@ -164,10 +176,7 @@ def forward(
164176
the factory.
165177
"""
166178
# make sure model architecture is pre-fetched (no weights needed at this point)
167-
skip_loading_weights = self.skip_loading_weights
168-
self.skip_loading_weights = True
169-
self.prefetch_checkpoint()
170-
self.skip_loading_weights = skip_loading_weights
179+
self.prefetch_checkpoint(skip_loading_weights=True)
171180

172181
# build the model
173182
return self._build_model(device)
@@ -211,15 +220,18 @@ def init_processor(self) -> Optional[Any]:
211220
"""
212221
return None
213222

214-
def prefetch_checkpoint(self, force: bool = False):
223+
def prefetch_checkpoint(self, force: bool = False, skip_loading_weights: Optional[bool] = None):
215224
"""Try or skip prefetching the checkpoint for the model and tokenizer.
216225
217226
Args:
218227
force: Whether to force prefetching the checkpoint.
228+
skip_loading_weights: Whether to skip loading weights. If not provided, it will use
229+
the factory's skip_loading_weights value.
219230
"""
220231
if not self._prefetched_model_path or force:
221232
self._prefetched_model_path = self._prefetch_checkpoint(
222-
self._model, self.skip_loading_weights
233+
self._model,
234+
self.skip_loading_weights if skip_loading_weights is None else skip_loading_weights,
223235
)
224236
if self._tokenizer and (not self._prefetched_tokenizer_path or force):
225237
self._prefetched_tokenizer_path = self._prefetch_checkpoint(self._tokenizer, True)

tensorrt_llm/_torch/auto_deploy/models/hf.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,11 @@ def __init__(self, *args, **kwargs):
119119
def automodel_cls(self) -> Type[_BaseAutoModelClass]:
120120
return AutoModelForCausalLM
121121

122+
@property
123+
def vocab_size_padded(self) -> Optional[int]:
124+
model_config, _ = self._get_model_config()
125+
return getattr(model_config, "vocab_size", None)
126+
122127
def _recursive_update_config(
123128
self, config: PretrainedConfig, update_dict: Dict[str, Any]
124129
) -> Tuple[PretrainedConfig, Dict[str, Any]]:
@@ -167,6 +172,9 @@ def _recursive_update_config(
167172
return config, nested_unused_kwargs
168173

169174
def _get_model_config(self) -> Tuple[PretrainedConfig, Dict[str, Any]]:
175+
# prefetch the model once without weights
176+
self.prefetch_checkpoint(skip_loading_weights=True)
177+
170178
# NOTE (lucaslie): HF doesn't recursively update nested PreTrainedConfig objects. Instead,
171179
# the entire subconfig will be overwritten.
172180
# we want to recursively update model_config from model_kwargs here.

tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py

Lines changed: 43 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,14 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
# http://www.apache.org/licenses/LICENSE-2.0
6+
# Unless required by applicable law or agreed to in writing, software
7+
# distributed under the License is distributed on an "AS IS" BASIS,
8+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9+
# See the License for the specific language governing permissions and
10+
# limitations under the License.
11+
112
from collections import defaultdict
213
from types import SimpleNamespace
314
from typing import Dict, List, Optional, Tuple
@@ -6,9 +17,12 @@
617
from strenum import StrEnum
718
from torch._prims_common import DeviceLikeType
819

20+
from tensorrt_llm._torch.pyexecutor.guided_decoder import GuidedDecoder
21+
from tensorrt_llm._torch.pyexecutor.py_executor_creator import get_guided_decoding_config
922
from tensorrt_llm._torch.pyexecutor.seq_slot_manager import SeqSlotManager
1023
from tensorrt_llm._utils import nvtx_range
1124
from tensorrt_llm.llmapi.llm_args import ContextChunkingPolicy
25+
from tensorrt_llm.llmapi.tokenizer import TokenizerBase
1226

1327
from ...._utils import mpi_rank, mpi_world_size
1428
from ....bindings.internal.batch_manager import CacheType
@@ -26,7 +40,7 @@
2640
)
2741
from ..custom_ops.attention_interface import SequenceInfo
2842
from ..distributed import common as dist
29-
from ..llm_args import AutoDeployConfig, LlmArgs
43+
from ..llm_args import LlmArgs
3044
from ..transform.optimizer import InferenceOptimizer
3145
from ..utils.logger import ad_logger
3246
from .interface import CachedSequenceInterface, GetInferenceModel
@@ -83,8 +97,8 @@ def _device(self) -> DeviceLikeType:
8397
return self.cache_seq_interface.device
8498

8599
@classmethod
86-
def build_from_config(cls, ad_config: AutoDeployConfig):
87-
"""Build the ADEngine using the AutoDeployConfig that gets passed through from the LLM."""
100+
def build_from_config(cls, ad_config: LlmArgs):
101+
"""Build the ADEngine using the LlmArgs that gets passed through from the LLM."""
88102

89103
max_batch_size = ad_config.max_batch_size
90104
max_seq_len = ad_config.max_seq_len
@@ -98,16 +112,17 @@ def build_from_config(cls, ad_config: AutoDeployConfig):
98112
device = torch.device(f"cuda:{torch.cuda.current_device()}")
99113
device = str(device)
100114

115+
factory = ad_config.create_factory()
116+
101117
# initialize seq info object
102118
seq_info = SequenceInfo(
103119
max_seq_len=max_seq_len,
104120
max_batch_size=max_batch_size,
105121
page_size=attn_page_size,
106122
max_num_tokens=max_num_tokens,
123+
vocab_size_padded=factory.vocab_size_padded,
107124
)
108125

109-
factory = ad_config.create_factory()
110-
111126
# TODO (lucaslie): consider how we move args around InferenceOptimizer.__init__,
112127
# ADEngine.__init__, and ADEngine.build_from_config. Seems a bit unnatural atm.
113128

@@ -296,8 +311,9 @@ def forward(
296311
return {"logits": logits_flat}
297312

298313

299-
def create_autodeploy_executor(ad_config: LlmArgs):
300-
"""Create an AutoDeploy executor from the given configuration and checkpoint directory.
314+
def create_autodeploy_executor(ad_config: LlmArgs, tokenizer: Optional[TokenizerBase] = None):
315+
"""Create an AutoDeploy executor from the given configuration and tokenizer.
316+
The tokenizer is required for guided decoding.
301317
302318
This is the entrypoint API to the _autodeploy backend.
303319
"""
@@ -404,6 +420,25 @@ def create_autodeploy_executor(ad_config: LlmArgs):
404420
)
405421
sampler = TorchSampler(sampler_args)
406422

423+
# Guided (i.e., structured) decoding.
424+
guided_decoder = None
425+
if (
426+
(guided_decoding_backend := ad_config.guided_decoding_backend) is not None
427+
) and dist_mapping.is_last_pp_rank():
428+
vocab_size_padded = engine.cache_seq_interface.info.vocab_size_padded
429+
if vocab_size_padded is None:
430+
raise RuntimeError(
431+
"Could not determine the vocabulary size. Required for guided decoding."
432+
)
433+
guided_decoding_config = get_guided_decoding_config(
434+
guided_decoding_backend=guided_decoding_backend, tokenizer=tokenizer
435+
)
436+
guided_decoder = GuidedDecoder(
437+
guided_decoding_config=guided_decoding_config,
438+
max_num_sequences=ad_config.max_batch_size,
439+
vocab_size_padded=vocab_size_padded,
440+
)
441+
407442
# creating the executor object
408443
py_executor = PyExecutor(
409444
resource_manager,
@@ -418,5 +453,6 @@ def create_autodeploy_executor(ad_config: LlmArgs):
418453
max_draft_len=max_draft_len,
419454
max_total_draft_tokens=max_total_draft_tokens,
420455
max_beam_width=ad_config.max_beam_width,
456+
guided_decoder=guided_decoder,
421457
)
422458
return py_executor

tensorrt_llm/executor/base_worker.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ def _create_py_executor():
130130
create_executor = create_autodeploy_executor
131131
assert isinstance(self.llm_args, ADLlmArgs)
132132
args["ad_config"] = self.llm_args.get_pytorch_backend_config()
133+
args["tokenizer"] = self._tokenizer
133134
else:
134135
raise ValueError(
135136
f"Unsupported backend config: {self.llm_args.backend}")
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
import json
17+
import os
18+
19+
from build_and_run_ad import ExperimentConfig, main
20+
from defs.conftest import llm_models_root
21+
22+
from tensorrt_llm.sampling_params import GuidedDecodingParams
23+
24+
25+
def test_autodeploy_guided_decoding_main_json():
26+
schema = (
27+
"{"
28+
'"title": "WirelessAccessPoint", "type": "object", "properties": {'
29+
'"ssid": {"title": "SSID", "type": "string"}, '
30+
'"securityProtocol": {"title": "SecurityProtocol", "type": "string"}, '
31+
'"bandwidth": {"title": "Bandwidth", "type": "string"}}, '
32+
'"required": ["ssid", "securityProtocol", "bandwidth"]}')
33+
34+
model_path = os.path.join(llm_models_root(),
35+
"llama-models-v2/TinyLlama-1.1B-Chat-v1.0")
36+
37+
print(f"model_path: {model_path}")
38+
llm_args = {
39+
"model": model_path,
40+
"guided_decoding_backend": "xgrammar",
41+
"skip_loading_weights": False,
42+
}
43+
44+
experiment_config = {
45+
"args": llm_args,
46+
"benchmark": {
47+
"enabled": False
48+
},
49+
"prompt": {
50+
"batch_size":
51+
1,
52+
"queries":
53+
("Please provide a JSON object representing a wireless access point. "
54+
"Follow this exact schema: " + schema),
55+
},
56+
}
57+
58+
# DemoLLM runtime does not support guided decoding. Need to set runtime to trtllm.
59+
experiment_config["args"]["runtime"] = "trtllm"
60+
experiment_config["args"]["world_size"] = 1
61+
62+
cfg = ExperimentConfig(**experiment_config)
63+
64+
# Need to introduce the guided decoding params after ExperimentConfig construction
65+
# because otherwise they get unpacked as a dict.
66+
cfg.prompt.sp_kwargs = {
67+
"max_tokens": 100,
68+
"top_k": None,
69+
"temperature": 0.1,
70+
"guided_decoding": GuidedDecodingParams(json=schema),
71+
}
72+
73+
result = main(cfg)
74+
print(f"guided_text: {result}")
75+
76+
# Extract the generated text from the nested structure
77+
# Format: {'prompts_and_outputs': [[prompt, output]]}
78+
assert "prompts_and_outputs" in result, "Result should contain 'prompts_and_outputs'"
79+
assert len(result["prompts_and_outputs"]
80+
) > 0, "Should have at least one prompt/output pair"
81+
82+
_prompt, generated_text = result["prompts_and_outputs"][0]
83+
print(f"Generated text: {generated_text}")
84+
85+
# Parse and validate the JSON
86+
try:
87+
guided_json = json.loads(generated_text)
88+
except Exception as e:
89+
print(
90+
f"Failed to parse generated text as JSON. Raw text: {generated_text!r}"
91+
)
92+
raise AssertionError(f"Generated text is not valid JSON: {e}") from e
93+
94+
# Assert the JSON conforms to the schema
95+
assert "ssid" in guided_json, "JSON must contain 'ssid' field"
96+
assert "securityProtocol" in guided_json, "JSON must contain 'securityProtocol' field"
97+
assert "bandwidth" in guided_json, "JSON must contain 'bandwidth' field"
98+
99+
# Validate field types
100+
assert isinstance(guided_json["ssid"], str), "'ssid' must be a string"
101+
assert isinstance(guided_json["securityProtocol"],
102+
str), "'securityProtocol' must be a string"
103+
assert isinstance(guided_json["bandwidth"],
104+
str), "'bandwidth' must be a string"
105+
106+
# Validate non-empty values
107+
assert len(guided_json["ssid"]) > 0, "'ssid' must not be empty"
108+
assert len(guided_json["securityProtocol"]
109+
) > 0, "'securityProtocol' must not be empty"
110+
assert len(guided_json["bandwidth"]) > 0, "'bandwidth' must not be empty"
111+
112+
print(f"Validation passed! Generated JSON: {guided_json}")

tests/integration/defs/pytest.ini

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ threadleak = True
44
threadleak_exclude = asyncio_\d+
55
junit_family=legacy
66
addopts = --ignore-glob="*perf/test_perf.py" --ignore-glob="*test_list_validation.py" --ignore-glob="*llm-test-workspace*" --durations=0 -W ignore::DeprecationWarning
7+
pythonpath =
8+
../../../examples/auto_deploy
79
norecursedirs = ./triton/perf
810
markers =
911
skip_less_device: skip when less device detected than the declared

0 commit comments

Comments (0)