
Commit a42d2df

[Frontend] Cache chat template kwargs resolution (#26227)
Signed-off-by: Isotr0py <[email protected]>
1 parent 5c057e0 commit a42d2df

File tree: 7 files changed, +81 −18 lines changed

vllm/entrypoints/chat_utils.py

Lines changed: 17 additions & 7 deletions
```diff
@@ -1572,6 +1572,22 @@ def parse(self, parser: jinja2.parser.Parser) -> jinja2.nodes.CallBlock:
         return call_block.set_lineno(lineno)
 
 
+def _resolve_chat_template_kwargs(
+    chat_template: str,
+):
+    env = jinja2.sandbox.ImmutableSandboxedEnvironment(
+        trim_blocks=True,
+        lstrip_blocks=True,
+        extensions=[AssistantTracker, jinja2.ext.loopcontrols],
+    )
+    parsed_content = env.parse(chat_template)
+    template_vars = jinja2.meta.find_undeclared_variables(parsed_content)
+    return template_vars
+
+
+_cached_resolve_chat_template_kwargs = lru_cache(_resolve_chat_template_kwargs)
+
+
 def resolve_chat_template_kwargs(
     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
     chat_template: str,
@@ -1582,13 +1598,7 @@ def resolve_chat_template_kwargs(
         if supports_kw(tokenizer.apply_chat_template, k, allow_var_kwargs=False)
     }
 
-    env = jinja2.sandbox.ImmutableSandboxedEnvironment(
-        trim_blocks=True,
-        lstrip_blocks=True,
-        extensions=[AssistantTracker, jinja2.ext.loopcontrols],
-    )
-    parsed_content = env.parse(chat_template)
-    template_vars = jinja2.meta.find_undeclared_variables(parsed_content)
+    template_vars = _cached_resolve_chat_template_kwargs(chat_template)
 
     # We exclude chat_template from kwargs here, because
     # chat template has been already resolved at this stage
```
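The extraction works because parsing the template is pure in the template string, so the result can be memoized safely; the tokenizer-dependent filtering stays outside the cached helper. A minimal standalone sketch of the same pattern (assuming only `jinja2` is installed; the `AssistantTracker` extension from the real module is omitted here):

```python
from functools import lru_cache

import jinja2
import jinja2.ext
import jinja2.meta
import jinja2.sandbox


def _find_template_vars(chat_template: str) -> set[str]:
    # Parsing is the expensive step; its output depends only on the
    # template string, which makes it safe to cache.
    env = jinja2.sandbox.ImmutableSandboxedEnvironment(
        trim_blocks=True,
        lstrip_blocks=True,
        extensions=[jinja2.ext.loopcontrols],
    )
    parsed = env.parse(chat_template)
    return jinja2.meta.find_undeclared_variables(parsed)


# Same call shape as the commit: lru_cache applied to the bare function,
# keyed on the template string.
_cached_find_template_vars = lru_cache(_find_template_vars)

template = "{% for m in messages %}{{ m['content'] }}{% endfor %}"
print(_cached_find_template_vars(template))          # {'messages'}
_cached_find_template_vars(template)                 # served from cache
print(_cached_find_template_vars.cache_info().hits)  # 1
```

Note that `lru_cache` requires hashable arguments, which is another reason the cached helper takes only the template string rather than the tokenizer.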

vllm/entrypoints/openai/api_server.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -1745,6 +1745,7 @@ async def init_app_state(
             request_logger=request_logger,
             chat_template=resolved_chat_template,
             chat_template_content_format=args.chat_template_content_format,
+            trust_request_chat_template=args.trust_request_chat_template,
             log_error_stack=args.log_error_stack,
         ) if "encode" in supported_tasks else None
         state.openai_serving_embedding = OpenAIServingEmbedding(
@@ -1754,6 +1755,7 @@ async def init_app_state(
             request_logger=request_logger,
             chat_template=resolved_chat_template,
             chat_template_content_format=args.chat_template_content_format,
+            trust_request_chat_template=args.trust_request_chat_template,
             log_error_stack=args.log_error_stack,
         ) if "embed" in supported_tasks else None
         state.openai_serving_classification = ServingClassification(
@@ -1777,6 +1779,7 @@ async def init_app_state(
             request_logger=request_logger,
             chat_template=resolved_chat_template,
             chat_template_content_format=args.chat_template_content_format,
+            trust_request_chat_template=args.trust_request_chat_template,
             log_error_stack=args.log_error_stack,
         )
         state.openai_serving_transcription = OpenAIServingTranscription(
```

vllm/entrypoints/openai/serving_chat.py

Lines changed: 9 additions & 11 deletions
```diff
@@ -222,16 +222,14 @@ async def create_chat_completion(
 
         if not self.use_harmony:
             # Common case.
-            request_chat_template = request.chat_template
-            chat_template_kwargs = request.chat_template_kwargs
-            if not self.trust_request_chat_template and (
-                    request_chat_template is not None or
-                    (chat_template_kwargs and
-                     chat_template_kwargs.get("chat_template") is not None)):
-                return self.create_error_response(
-                    "Chat template is passed with request, but "
-                    "--trust-request-chat-template is not set. "
-                    "Refused request with untrusted chat template.")
+            error_check_ret = self._validate_chat_template(
+                request_chat_template=request.chat_template,
+                chat_template_kwargs=request.chat_template_kwargs,
+                trust_request_chat_template=self.
+                trust_request_chat_template,
+            )
+            if error_check_ret is not None:
+                return error_check_ret
             (
                 conversation,
                 request_prompts,
@@ -240,7 +238,7 @@ async def create_chat_completion(
                 request,
                 tokenizer,
                 request.messages,
-                chat_template=request_chat_template or self.chat_template,
+                chat_template=request.chat_template or self.chat_template,
                 chat_template_content_format=self.
                 chat_template_content_format,
                 add_generation_prompt=request.add_generation_prompt,
```
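One detail worth noting in the second hunk: once validation passes, the template used for rendering falls back from the request to the server-side default via a plain `or`. A tiny illustration with hypothetical stand-in values:

```python
class FakeRequest:
    # Hypothetical stand-in for the real request object.
    chat_template = None  # client sent no template

server_default = "{%- for m in messages -%}{{ m.content }}{%- endfor -%}"

# `or` picks the server default whenever the request field is None (or empty):
chat_template = FakeRequest.chat_template or server_default
print(chat_template is server_default)  # True
```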

vllm/entrypoints/openai/serving_embedding.py

Lines changed: 16 additions & 0 deletions
```diff
@@ -576,6 +576,7 @@ def __init__(
         request_logger: Optional[RequestLogger],
         chat_template: Optional[str],
         chat_template_content_format: ChatTemplateContentFormatOption,
+        trust_request_chat_template: bool = False,
         log_error_stack: bool = False,
     ) -> None:
         super().__init__(engine_client=engine_client,
@@ -586,6 +587,7 @@ def __init__(
 
         self.chat_template = chat_template
         self.chat_template_content_format: Final = chat_template_content_format
+        self.trust_request_chat_template = trust_request_chat_template
 
     async def create_embedding(
         self,
@@ -629,3 +631,17 @@ def _create_pooling_params(
             return self.create_error_response(str(e))
 
         return pooling_params
+
+    async def _preprocess(
+        self,
+        ctx: ServeContext,
+    ) -> Optional[ErrorResponse]:
+        if isinstance(ctx.request, EmbeddingChatRequest):
+            error_check_ret = self._validate_chat_template(
+                request_chat_template=ctx.request.chat_template,
+                chat_template_kwargs=ctx.request.chat_template_kwargs,
+                trust_request_chat_template=self.trust_request_chat_template,
+            )
+            if error_check_ret is not None:
+                return error_check_ret
+        return await super()._preprocess(ctx)
```
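The embedding path hooks validation into `_preprocess` rather than inlining it in the request handler. A minimal sketch of that override pattern (simplified, hypothetical class and field names; the real classes live under `vllm/entrypoints/openai`):

```python
import asyncio
from typing import Any, Optional


class BaseServing:
    async def _preprocess(self, ctx: dict[str, Any]) -> Optional[str]:
        # Base preprocessing: returns an error message on failure, None on success.
        return None


class ServingEmbeddingSketch(BaseServing):
    def __init__(self, trust_request_chat_template: bool = False) -> None:
        self.trust_request_chat_template = trust_request_chat_template

    async def _preprocess(self, ctx: dict[str, Any]) -> Optional[str]:
        # Validate first and short-circuit with the error before any work
        # is done; otherwise delegate to the base implementation.
        if not self.trust_request_chat_template and ctx.get("chat_template"):
            return "Refused request with untrusted chat template."
        return await super()._preprocess(ctx)


serving = ServingEmbeddingSketch()
print(asyncio.run(serving._preprocess({"chat_template": "{{ x }}"})))
# -> Refused request with untrusted chat template.
print(asyncio.run(serving._preprocess({})))  # -> None
```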

vllm/entrypoints/openai/serving_engine.py

Lines changed: 16 additions & 0 deletions
```diff
@@ -751,6 +751,22 @@ async def _tokenize_prompt_inputs_async(
             tokenizer=tokenizer,
         )
 
+    def _validate_chat_template(
+        self,
+        request_chat_template: Optional[str],
+        chat_template_kwargs: Optional[dict[str, Any]],
+        trust_request_chat_template: bool,
+    ) -> Optional[ErrorResponse]:
+        if not trust_request_chat_template and (
+                request_chat_template is not None or
+                (chat_template_kwargs
+                 and chat_template_kwargs.get("chat_template") is not None)):
+            return self.create_error_response(
+                "Chat template is passed with request, but "
+                "--trust-request-chat-template is not set. "
+                "Refused request with untrusted chat template.")
+        return None
+
     async def _preprocess_chat(
         self,
         request: Union[ChatLikeRequest, ResponsesRequest],
```
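Since every endpoint now funnels through `_validate_chat_template`, the gating logic is worth spelling out. A standalone sketch of the same check as a free function (the real method wraps the message in `create_error_response`):

```python
from typing import Any, Optional


def validate_chat_template(
    request_chat_template: Optional[str],
    chat_template_kwargs: Optional[dict[str, Any]],
    trust_request_chat_template: bool,
) -> Optional[str]:
    """Return an error message if an untrusted template rides on the request."""
    template_in_kwargs = bool(chat_template_kwargs) and (
        chat_template_kwargs.get("chat_template") is not None)
    if not trust_request_chat_template and (
            request_chat_template is not None or template_in_kwargs):
        return ("Chat template is passed with request, but "
                "--trust-request-chat-template is not set. "
                "Refused request with untrusted chat template.")
    return None


# A template can arrive as a top-level request field ...
print(validate_chat_template("{{ messages }}", None, False))        # refused
# ... or tucked inside chat_template_kwargs; both are caught:
print(validate_chat_template(None, {"chat_template": "x"}, False))  # refused
# With --trust-request-chat-template set, both are allowed:
print(validate_chat_template("{{ messages }}", None, True))         # None
```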

vllm/entrypoints/openai/serving_pooling.py

Lines changed: 10 additions & 0 deletions
```diff
@@ -65,6 +65,7 @@ def __init__(
         request_logger: Optional[RequestLogger],
         chat_template: Optional[str],
         chat_template_content_format: ChatTemplateContentFormatOption,
+        trust_request_chat_template: bool = False,
         log_error_stack: bool = False,
     ) -> None:
         super().__init__(engine_client=engine_client,
@@ -75,6 +76,7 @@ def __init__(
 
         self.chat_template = chat_template
         self.chat_template_content_format: Final = chat_template_content_format
+        self.trust_request_chat_template = trust_request_chat_template
         io_processor_plugin = self.model_config.io_processor_plugin
         self.io_processor = get_io_processor(vllm_config, io_processor_plugin)
 
@@ -129,6 +131,14 @@ async def create_pooling(
                 prompt=validated_prompt, request_id=request_id)
 
         elif isinstance(request, PoolingChatRequest):
+            error_check_ret = self._validate_chat_template(
+                request_chat_template=request.chat_template,
+                chat_template_kwargs=request.chat_template_kwargs,
+                trust_request_chat_template=self.
+                trust_request_chat_template,
+            )
+            if error_check_ret is not None:
+                return error_check_ret
             (
                 _,
                 _,
```

vllm/entrypoints/openai/serving_tokenization.py

Lines changed: 10 additions & 0 deletions
```diff
@@ -40,6 +40,7 @@ def __init__(
         request_logger: Optional[RequestLogger],
         chat_template: Optional[str],
         chat_template_content_format: ChatTemplateContentFormatOption,
+        trust_request_chat_template: bool = False,
         log_error_stack: bool = False,
     ) -> None:
         super().__init__(engine_client=engine_client,
@@ -50,6 +51,7 @@ def __init__(
 
         self.chat_template = chat_template
         self.chat_template_content_format: Final = chat_template_content_format
+        self.trust_request_chat_template = trust_request_chat_template
 
     async def create_tokenize(
         self,
@@ -71,6 +73,14 @@ async def create_tokenize(
         if isinstance(request, TokenizeChatRequest):
             tool_dicts = (None if request.tools is None else
                           [tool.model_dump() for tool in request.tools])
+            error_check_ret = self._validate_chat_template(
+                request_chat_template=request.chat_template,
+                chat_template_kwargs=request.chat_template_kwargs,
+                trust_request_chat_template=self.
+                trust_request_chat_template,
+            )
+            if error_check_ret is not None:
+                return error_check_ret
             (
                 _,
                 _,
```
