Commit 8fc5750

[Bugfix]: Fix the streaming incompatibility when thinking is disabled (vllm-project#19135)
Signed-off-by: chaunceyjiang <[email protected]>
1 parent af7fc84 commit 8fc5750
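Background: with a reasoning model, passing {"enable_thinking": False} via chat_template_kwargs closes the thinking block inside the prompt itself. Before this fix, the streaming path only looked for a think-end token in the generated deltas, which never arrives in that case, so streamed tool calls could be mishandled. Below is a minimal client-side sketch of the scenario, mirroring the test added in this commit; the base URL, API key, model name, and tool schema are illustrative placeholders rather than values from the commit.

# Minimal client-side sketch of the scenario this commit addresses:
# streaming tool calls while thinking is disabled via chat_template_kwargs.
# Endpoint, model name, and tool definition are illustrative placeholders.
import asyncio

import openai

tools = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given city",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {
                    "type": "string"
                }
            },
            "required": ["city"],
        },
    },
}]


async def main() -> None:
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")
    stream = await client.chat.completions.create(
        model="Qwen/Qwen3-0.6B",  # placeholder; any reasoning-capable model
        messages=[{
            "role": "user",
            "content": "What is the current weather in Dallas?"
        }],
        tools=tools,
        tool_choice="auto",
        stream=True,
        # Disabling thinking previously broke streamed tool calls.
        extra_body={"chat_template_kwargs": {
            "enable_thinking": False
        }},
    )
    async for chunk in stream:
        if chunk.choices and chunk.choices[0].delta.tool_calls:
            print(chunk.choices[0].delta.tool_calls)


asyncio.run(main())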

File tree

2 files changed (+101, -31 lines)


tests/entrypoints/openai/test_completion_with_function_calling.py

Lines changed: 86 additions & 30 deletions
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from typing import NamedTuple
+
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
@@ -22,7 +24,9 @@ def server():  # noqa: F811
         "--guided-decoding-backend",
         "xgrammar",
         "--tool-call-parser",
-        "hermes"
+        "hermes",
+        "--reasoning-parser",
+        "qwen3",
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -35,9 +39,53 @@ async def client(server):
     yield async_client
 
 
+class TestCase(NamedTuple):
+    model_name: str
+    stream: bool
+    tool_choice: str
+    enable_thinking: bool
+
+
 @pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_required_tool_use(client: openai.AsyncOpenAI, model_name: str):
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="auto",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="auto",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="required",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="required",
+                 enable_thinking=False),
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="auto",
+                 enable_thinking=True),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="auto",
+                 enable_thinking=True),
+        TestCase(model_name=MODEL_NAME,
+                 stream=True,
+                 tool_choice="required",
+                 enable_thinking=True),
+        TestCase(model_name=MODEL_NAME,
+                 stream=False,
+                 tool_choice="required",
+                 enable_thinking=True),
+    ],
+)
+async def test_function_tool_use(client: openai.AsyncOpenAI,
+                                 test_case: TestCase):
     tools = [
         {
             "type": "function",
@@ -126,30 +174,38 @@ async def test_required_tool_use(client: openai.AsyncOpenAI, model_name: str):
             "forecast for the next 5 days, in fahrenheit?",
         },
     ]
-
-    # Non-streaming test
-    chat_completion = await client.chat.completions.create(
-        messages=messages,
-        model=model_name,
-        tools=tools,
-        tool_choice="required",
-    )
-
-    assert chat_completion.choices[0].message.tool_calls is not None
-    assert len(chat_completion.choices[0].message.tool_calls) > 0
-
-    # Streaming test
-    stream = await client.chat.completions.create(
-        messages=messages,
-        model=model_name,
-        tools=tools,
-        tool_choice="required",
-        stream=True,
-    )
-
-    output = []
-    async for chunk in stream:
-        if chunk.choices and chunk.choices[0].delta.tool_calls:
-            output.extend(chunk.choices[0].delta.tool_calls)
-
-    assert len(output) > 0
+    if not test_case.stream:
+        # Non-streaming test
+        chat_completion = await client.chat.completions.create(
+            messages=messages,
+            model=test_case.model_name,
+            tools=tools,
+            tool_choice=test_case.tool_choice,
+            extra_body={
+                "chat_template_kwargs": {
+                    "enable_thinking": test_case.enable_thinking
+                }
+            })
+
+        assert chat_completion.choices[0].message.tool_calls is not None
+        assert len(chat_completion.choices[0].message.tool_calls) > 0
+    else:
+        # Streaming test
+        stream = await client.chat.completions.create(
+            messages=messages,
+            model=test_case.model_name,
+            tools=tools,
+            tool_choice=test_case.tool_choice,
+            stream=True,
+            extra_body={
+                "chat_template_kwargs": {
+                    "enable_thinking": test_case.enable_thinking
+                }
+            })
+
+        output = []
+        async for chunk in stream:
+            if chunk.choices and chunk.choices[0].delta.tool_calls:
+                output.extend(chunk.choices[0].delta.tool_calls)
+
+        assert len(output) > 0
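Note that the streaming branch above only collects raw tool-call deltas and asserts the list is non-empty. A real consumer has to stitch those deltas back together per tool-call index, since the function name usually arrives once while the JSON arguments arrive as string fragments. The helper below is an illustrative sketch of that aggregation, not part of this commit; it assumes only the standard OpenAI streaming delta shape.

# Illustrative sketch (not part of this commit): reassemble streamed
# tool-call deltas into complete (name, arguments) pairs keyed by index.
async def collect_tool_calls(stream) -> dict[int, dict[str, str]]:
    calls: dict[int, dict[str, str]] = {}
    async for chunk in stream:
        if not chunk.choices:
            continue
        for delta_call in chunk.choices[0].delta.tool_calls or []:
            entry = calls.setdefault(delta_call.index, {
                "name": "",
                "arguments": ""
            })
            if delta_call.function and delta_call.function.name:
                entry["name"] = delta_call.function.name
            if delta_call.function and delta_call.function.arguments:
                # Argument JSON is streamed in fragments; concatenate them.
                entry["arguments"] += delta_call.function.arguments
    return calls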

vllm/entrypoints/openai/serving_chat.py

Lines changed: 15 additions & 1 deletion
@@ -689,7 +689,21 @@ async def chat_completion_stream_generator(
                                             current_token_ids,
                                             output.token_ids,
                                         ))
-
+                        # When encountering think end id in prompt_token_ids
+                        # i.e {"enable_thinking": False},
+                        # set reasoning status to end.
+                        # Remove the text and token ids related
+                        # to 'reasoning_content'.
+                        if res.prompt_token_ids and \
+                            reasoning_parser.is_reasoning_end(
+                                list(res.prompt_token_ids)):
+                            reasoning_end_arr[i] = True
+                            current_token_ids = list(output.token_ids)
+                            if delta_message and delta_message.content:
+                                current_text = delta_message.content
+                                delta_message.content = None
+                            else:
+                                current_text = ""
                         # When encountering think end id in delta_token_ids,
                         # set reasoning status to end.
                         # Remove the text and token ids related
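The core of the fix: when thinking is disabled through the chat template, the think-end marker already sits in the prompt token ids rather than in any generated delta, so the stream generator now checks the prompt once and marks reasoning as ended for that choice instead of waiting for an end token that never arrives. A simplified, stand-alone sketch of such a prompt-side check is shown below; it is not vLLM's actual ReasoningParser code, and the token id is a placeholder.

# Simplified sketch, not vLLM's actual ReasoningParser: reasoning counts as
# ended as soon as the think-end token has been seen in a token id sequence.
from collections.abc import Sequence

THINK_END_TOKEN_ID = 151668  # placeholder id for a "</think>"-style token


def is_reasoning_end(token_ids: Sequence[int]) -> bool:
    return THINK_END_TOKEN_ID in token_ids


# With {"enable_thinking": False}, chat templates such as Qwen3's put an
# empty think block into the prompt, so the end token already appears there.
prompt_token_ids = [101, 102, 151667, 151668, 103]  # toy prompt
assert is_reasoning_end(prompt_token_ids)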
