Add EXAONE 4.0 reasoning parser #22617

Status: Open. Wants to merge 1 commit into base: main.
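For context: this PR registers an "exaone4" reasoning parser that splits EXAONE 4.0 output into a reasoning segment, delimited by <think>/</think>, and the final answer. The sketch below is a rough plain-text model of the non-streaming split with thinking enabled; it is illustrative only, not the PR's implementation, which operates on token IDs, supports streaming deltas, and honors the `enable_thinking` chat-template flag:

# Minimal sketch (assumption: thinking enabled, plain-text output).
def split_reasoning(output: str) -> tuple[str | None, str | None]:
    start, end = "<think>", "</think>"
    if end not in output:
        # No closing tag yet: everything after an optional opening tag
        # is treated as still-open reasoning content.
        return output.removeprefix(start) or None, None
    reasoning, _, content = output.partition(end)
    return reasoning.removeprefix(start), content or None

# split_reasoning("<think>plan</think>The answer is 42.")
# -> ("plan", "The answer is 42.")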
340 changes: 340 additions & 0 deletions tests/reasoning/test_exaone4_reasoning_parser.py
@@ -0,0 +1,340 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
from transformers import AutoTokenizer

from tests.reasoning.utils import run_reasoning_extraction
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.reasoning import ReasoningParser, ReasoningParserManager

parser_name = "exaone4"
start_token = "<think>"
end_token = "</think>"

REASONING_MODEL_NAME = "LGAI-EXAONE/EXAONE-4.0-1.2B"


@pytest.fixture(scope="module")
def exaone4_tokenizer():
    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)


SIMPLE_REASONING = {
    "output": "This is a reasoning section</think>This is the rest",
    "reasoning_content": "This is a reasoning section",
    "content": "This is the rest",
    "is_reasoning_end": True,
}
COMPLETE_REASONING = {
    "output": "This is a reasoning section</think>",
    "reasoning_content": "This is a reasoning section",
    "content": None,
    "is_reasoning_end": True,
}
NO_REASONING = {
    "output": "This is content",
    "reasoning_content": None,
    "content": "This is content",
    "is_reasoning_end": False,
    "skip_extract_content": True,
}
NO_REASONING_STREAMING = {
    "output": "This is a normal section",
    "reasoning_content": None,
    "content": "This is a normal section",
    "is_reasoning_end": False,
    "skip_extract_content": True,
}
NO_REASONING_STREAMING_WITH_THINK = {
    "output": "This is a normal section",
    "reasoning_content": "This is a normal section",
    "content": None,
    "is_reasoning_end": False,
}
MULTIPLE_LINES = {
    "output": "This\nThat</think>This is the rest\nThat",
    "reasoning_content": "This\nThat",
    "content": "This is the rest\nThat",
    "is_reasoning_end": True,
}
SHORTEST_REASONING_NO_STREAMING = {
    "output": "</think>This is the rest",
    "reasoning_content": "",
    "content": "This is the rest",
    "is_reasoning_end": True,
}
SHORTEST_REASONING = {
    "output": "</think>This is the rest",
    "reasoning_content": None,
    "content": "This is the rest",
    "is_reasoning_end": True,
}
REASONING_WITH_THINK = {
    "output": "<think>This is a reasoning section</think>This is the rest",
    "reasoning_content": "This is a reasoning section",
    "content": "This is the rest",
    "is_reasoning_end": True,
}
COMPLETE_REASONING_WITH_THINK = {
    "output": "<think>This is a reasoning section</think>",
    "reasoning_content": "This is a reasoning section",
    "content": None,
    "is_reasoning_end": True,
}
MULTIPLE_LINES_WITH_THINK = {
    "output": "<think>This\nThat</think>This is the rest\nThat",
    "reasoning_content": "This\nThat",
    "content": "This is the rest\nThat",
    "is_reasoning_end": True,
}
SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
    "output": "</think>This is the rest",
    "reasoning_content": "",
    "content": "This is the rest",
    "is_reasoning_end": True,
}
SHORTEST_REASONING_WITH_THINK = {
    "output": "</think>This is the rest",
    "reasoning_content": None,
    "content": "This is the rest",
    "is_reasoning_end": True,
}
THINK_NO_END = {
    "output": "<think>This is a reasoning section",
    "reasoning_content": "This is a reasoning section",
    "content": None,
    "is_reasoning_end": False,
}
EMPTY = {
    "output": "",
    "reasoning_content": None,
    "content": "",
    "is_reasoning_end": False,
}
EMPTY_STREAMING = {
    "output": "",
    "reasoning_content": None,
    "content": None,
    "is_reasoning_end": False,
}
NEW_LINE = {
    "output": "\n<think>This is a reasoning section</think>\nThis is the rest",
    "reasoning_content": "This is a reasoning section",
    "content": "\nThis is the rest",
    "is_reasoning_end": True,
}
NEW_LINE_STREAMING = {
    "output": "\n<think>This is a reasoning section</think>\nThis is the rest",
    "reasoning_content": "\nThis is a reasoning section",
    "content": "\nThis is the rest",
    "is_reasoning_end": True,
}

TEST_CASES = [
    pytest.param(
        False,
        SIMPLE_REASONING,
        False,
        id="simple_reasoning",
    ),
    pytest.param(
        True,
        SIMPLE_REASONING,
        True,
        id="simple_reasoning_streaming",
    ),
    pytest.param(
        False,
        COMPLETE_REASONING,
        False,
        id="complete_reasoning",
    ),
    pytest.param(
        True,
        COMPLETE_REASONING,
        True,
        id="complete_reasoning_streaming",
    ),
    pytest.param(
        False,
        NO_REASONING,
        False,
        id="no_reasoning_token",
    ),
    pytest.param(
        True,
        NO_REASONING_STREAMING,
        False,
        id="no_reasoning_token_streaming",
    ),
    pytest.param(
        True,
        NO_REASONING_STREAMING_WITH_THINK,
        True,
        id="no_reasoning_token_streaming_with_think",
    ),
    pytest.param(
        False,
        MULTIPLE_LINES,
        False,
        id="multiple_lines",
    ),
    pytest.param(
        True,
        MULTIPLE_LINES,
        True,
        id="multiple_lines_streaming",
    ),
    pytest.param(
        True,
        SHORTEST_REASONING,
        False,
        id="shortest_streaming",
    ),
    pytest.param(
        False,
        SHORTEST_REASONING_NO_STREAMING,
        True,
        id="shortest",
    ),
    pytest.param(
        False,
        REASONING_WITH_THINK,
        False,
        id="reasoning_with_think",
    ),
    pytest.param(
        True,
        REASONING_WITH_THINK,
        True,
        id="reasoning_with_think_streaming",
    ),
    pytest.param(
        False,
        COMPLETE_REASONING_WITH_THINK,
        False,
        id="complete_reasoning_with_think",
    ),
    pytest.param(
        True,
        COMPLETE_REASONING_WITH_THINK,
        True,
        id="complete_reasoning_with_think_streaming",
    ),
    pytest.param(
        False,
        MULTIPLE_LINES_WITH_THINK,
        False,
        id="multiple_lines_with_think",
    ),
    pytest.param(
        True,
        MULTIPLE_LINES_WITH_THINK,
        True,
        id="multiple_lines_with_think_streaming",
    ),
    pytest.param(
        False,
        SHORTEST_REASONING_NO_STREAMING_WITH_THINK,
        False,
        id="shortest_with_think",
    ),
    pytest.param(
        True,
        SHORTEST_REASONING_WITH_THINK,
        True,
        id="shortest_with_think_streaming",
    ),
    pytest.param(
        False,
        THINK_NO_END,
        False,
        id="think_no_end",
    ),
    pytest.param(
        True,
        THINK_NO_END,
        True,
        id="think_no_end_streaming",
    ),
    pytest.param(
        False,
        EMPTY,
        False,
        id="empty",
    ),
    pytest.param(
        True,
        EMPTY_STREAMING,
        True,
        id="empty_streaming",
    ),
    pytest.param(
        False,
        NEW_LINE,
        False,
        id="new_line",
    ),
    pytest.param(
        True,
        NEW_LINE_STREAMING,
        True,
        id="new_line_streaming",
    ),
]


@pytest.mark.parametrize("streaming, param_dict, enable_thinking", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    enable_thinking: bool,
    exaone4_tokenizer,
):
    output = exaone4_tokenizer.tokenize(param_dict["output"])
    # decode everything to tokens
    output_tokens: list[str] = [
        exaone4_tokenizer.convert_tokens_to_string([token]) for token in output
    ]
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
        parser_name)(exaone4_tokenizer)

    dummy_request = ChatCompletionRequest(
        messages=[],
        chat_template_kwargs={"enable_thinking": enable_thinking},
    )
    reasoning, content = run_reasoning_extraction(
        parser,
        output_tokens,
        request=dummy_request if enable_thinking else None,
        streaming=streaming,
    )

    assert reasoning == param_dict["reasoning_content"]
    assert content == param_dict["content"]

    # Test is_reasoning_end
    output_ids = exaone4_tokenizer.convert_tokens_to_ids(output)
    is_reasoning_end = parser.is_reasoning_end(output_ids)
    assert is_reasoning_end == param_dict["is_reasoning_end"]

    # Test extract_content

    # NOTE: For the `no_reasoning_token` cases we skip the extract_content
    # test. By default, the EXAONE 4.0 parser assumes the whole output is
    # content if there is no '<think>' or '</think>' and
    # `enable_thinking=False`. `extract_content_ids()` cannot get
    # `enable_thinking` from the request, and it is only used to remove the
    # reasoning content from the output in
    # vllm/entrypoints/openai/serving_chat.py. So we leave
    # `extract_content_ids()` as is (it assumes the output is reasoning
    # content when there is no '<think>' or '</think>' and
    # `enable_thinking=False`).
    if param_dict.get("skip_extract_content", False):
        return

    if param_dict["content"] is not None:
        content = parser.extract_content_ids(output_ids)
        assert content == exaone4_tokenizer.convert_tokens_to_ids(
            exaone4_tokenizer.tokenize(param_dict["content"]))
    else:
        content = parser.extract_content_ids(output_ids)
        assert content == []
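
Outside pytest, the same lookup path can be exercised directly. A usage sketch (assumptions: network access to pull the tokenizer, and that the parser keeps the base-class extract_reasoning_content(model_output, request) signature):

from transformers import AutoTokenizer

from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.reasoning import ReasoningParserManager

tokenizer = AutoTokenizer.from_pretrained("LGAI-EXAONE/EXAONE-4.0-1.2B")
parser = ReasoningParserManager.get_reasoning_parser("exaone4")(tokenizer)
request = ChatCompletionRequest(
    messages=[], chat_template_kwargs={"enable_thinking": True})
reasoning, content = parser.extract_reasoning_content(
    "Let me think.</think>The answer is 4.", request=request)
# Expected: reasoning == "Let me think.", content == "The answer is 4."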
1 change: 1 addition & 0 deletions tests/reasoning/test_granite_reasoning_parser.py
@@ -336,6 +336,7 @@ def test_streaming_subcases(param_dict):
        previous_token_ids=previous_token_ids,
        current_token_ids=current_token_ids,
        delta_token_ids=delta_token_ids,
        request=None,
    )
    # Streaming currently expects at least one of reasoning content / content,
    # so the response should return None in that case.
2 changes: 2 additions & 0 deletions tests/reasoning/utils.py
@@ -115,6 +115,7 @@ def run_reasoning_extraction_streaming(
            previous_tokens,
            current_tokens,
            token_delta,
            request,
        )
        if delta_message is not None:
            reconstructor.append_delta(delta_message)
@@ -147,6 +148,7 @@ def run_reasoning_extraction_streaming_mistral(
            previous_tokens,
            current_tokens,
            token_delta,
            request,
        )
        if delta_message is not None:
            reconstructor.append_delta(delta_message)
3 changes: 3 additions & 0 deletions vllm/entrypoints/openai/serving_chat.py
@@ -696,6 +696,7 @@ async def chat_completion_stream_generator(
                            previous_token_ids,
                            current_token_ids,
                            output.token_ids,
                            request,
                        ))
                    # When encountering think end id in delta_token_ids
                    # or think end id in prompt_token_ids
@@ -781,6 +782,7 @@ async def chat_completion_stream_generator(
                            previous_token_ids,
                            current_token_ids,
                            output.token_ids,
                            request,
                        ))
                    # When encountering think end id in prompt_token_ids
                    # i.e {"enable_thinking": False},
@@ -858,6 +860,7 @@ async def chat_completion_stream_generator(
                            previous_token_ids,
                            current_token_ids,
                            output.token_ids,
                            request,
                        ))
                    # handle streaming just a content delta
                    else:
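
The common thread across these call sites is that extract_reasoning_content_streaming now receives the request object, so a parser such as exaone4 can consult per-request chat-template kwargs (e.g. enable_thinking) while streaming. A sketch of the resulting hook, with the parameter list inferred from the call sites above rather than copied from the PR:

from collections.abc import Sequence
from typing import Optional

from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaMessage)


class ExampleReasoningParser:  # stand-in for a ReasoningParser subclass
    def extract_reasoning_content_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: Optional[ChatCompletionRequest],  # newly threaded through
    ) -> Optional[DeltaMessage]:
        ...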