
Commit ea25a76
[BugFix] Use async Mistral Tokenizer in Chat Completions (vllm-project#26134)
Signed-off-by: Ben Browning <[email protected]>
Co-authored-by: Cyrus Leung <[email protected]>
1 parent 67bc0c0 commit ea25a76
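
The change wraps the blocking `apply_mistral_chat_template` call with vLLM's `make_async` helper so that Mistral chat-template rendering runs on the serving engine's single-worker tokenizer executor instead of on the asyncio event loop. The sketch below illustrates that pattern in isolation; this simplified `make_async` and the `render_blocking` stand-in are illustrative assumptions for this note, not the code added by the commit.

import asyncio
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial


def make_async(func, executor=None):
    """Simplified run-in-executor wrapper (vLLM's real helper lives in
    vllm.utils and may differ in details)."""

    def _async_wrapper(*args, **kwargs):
        loop = asyncio.get_running_loop()
        # Schedule the blocking call on the executor and return an awaitable
        # future; the event loop itself is never blocked.
        return loop.run_in_executor(executor, partial(func, *args, **kwargs))

    return _async_wrapper


def render_blocking(prompt: str) -> list[int]:
    # Hypothetical stand-in for the CPU-bound Mistral chat templating.
    time.sleep(1)
    return [1, 2, 3]


async def main() -> None:
    executor = ThreadPoolExecutor(max_workers=1)
    render_async = make_async(render_blocking, executor=executor)

    # Other coroutines keep running while the render happens off-loop.
    heartbeat = asyncio.create_task(asyncio.sleep(0.1))
    tokens = await render_async("hello")
    await heartbeat
    print(tokens)


if __name__ == "__main__":
    asyncio.run(main())

With the blocking work off-loaded this way, concurrent chat-completion requests stay responsive while a long Mistral template render is in flight, which is exactly what the new test below verifies.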

File tree: 2 files changed (+73, −2 lines)
Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import asyncio
import time
from unittest.mock import Mock

import pytest

from vllm.config import ModelConfig
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer


@pytest.fixture()
def serving() -> OpenAIServing:
    """Create a minimal OpenAIServing instance for testing."""

    # Create minimal mocks
    engine_client = Mock()
    model_config = Mock(spec=ModelConfig)
    model_config.max_model_len = 32768
    models = Mock(spec=OpenAIServingModels)

    serving = OpenAIServing(
        engine_client=engine_client,
        model_config=model_config,
        models=models,
        request_logger=None,
    )
    return serving


@pytest.mark.asyncio
async def test_async_mistral_tokenizer_does_not_block_event_loop(
        serving: OpenAIServing):
    expected_tokens = [1, 2, 3]

    # Mock the blocking version to sleep
    def mocked_apply_chat_template(*_args, **_kwargs):
        time.sleep(2)
        return expected_tokens

    mock_tokenizer = Mock(spec=MistralTokenizer)
    mock_tokenizer.apply_chat_template.side_effect = mocked_apply_chat_template

    task = serving._apply_mistral_chat_template_async(tokenizer=mock_tokenizer,
                                                      messages=[],
                                                      chat_template=None,
                                                      tools=[])

    # Ensure the event loop is not blocked
    blocked_count = 0
    for _i in range(20):  # Check over ~2 seconds
        start = time.perf_counter()
        await asyncio.sleep(0)
        elapsed = time.perf_counter() - start

        # an overly generous elapsed time for slow machines
        if elapsed >= 0.5:
            blocked_count += 1

        await asyncio.sleep(0.1)

    # Ensure task completes
    tokens = await task
    assert tokens == expected_tokens, "Mocked blocking tokenizer was not called"
    assert blocked_count == 0, "Event loop blocked during tokenization"

vllm/entrypoints/openai/serving_engine.py

Lines changed: 4 additions & 2 deletions
@@ -80,7 +80,7 @@
 from vllm.tracing import (contains_trace_headers, extract_trace_headers,
                           log_tracing_disabled_warning)
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import (AsyncMicrobatchTokenizer, is_list_of,
+from vllm.utils import (AsyncMicrobatchTokenizer, is_list_of, make_async,
                         merge_async_iterators, random_uuid)

 logger = init_logger(__name__)
@@ -240,6 +240,8 @@ def __init__(
         self.enable_force_include_usage = enable_force_include_usage

         self._tokenizer_executor = ThreadPoolExecutor(max_workers=1)
+        self._apply_mistral_chat_template_async = make_async(
+            apply_mistral_chat_template, executor=self._tokenizer_executor)

         self._async_tokenizer_pool: dict[AnyTokenizer,
                                          AsyncMicrobatchTokenizer] = {}
@@ -798,7 +800,7 @@ async def _preprocess_chat(
         if tokenizer is None:
             request_prompt = "placeholder"
         elif isinstance(tokenizer, MistralTokenizer):
-            request_prompt = apply_mistral_chat_template(
+            request_prompt = await self._apply_mistral_chat_template_async(
                 tokenizer,
                 messages=messages,
                 **_chat_template_kwargs,