Skip to content

Commit a4227cf

Browse files
authored
[None][feat] Support Qwen3 reasoning parser (#8000)
Signed-off-by: Pengyun Lin <[email protected]>
1 parent 0acd10e commit a4227cf

File tree

4 files changed

+164
-110
lines changed

4 files changed

+164
-110
lines changed
Lines changed: 73 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,12 @@
11
from abc import ABC, abstractmethod
22
from dataclasses import dataclass
3-
from typing import Dict, Optional
3+
from typing import Type
44

55

66
@dataclass
77
class ReasoningParserResult:
8-
9-
def __init__(self,
10-
in_reasoning: bool,
11-
content: Optional[str] = None,
12-
reasoning_content: Optional[str] = None):
13-
self.in_reasoning = in_reasoning
14-
self.content = content
15-
self.reasoning_content = reasoning_content
8+
content: str = ""
9+
reasoning_content: str = ""
1610

1711

1812
class BaseReasoningParser(ABC):
@@ -34,62 +28,99 @@ class DeepSeekR1Parser(BaseReasoningParser):
3428
treat all the text before the </think> tag as `reasoning_content` and the text after as `content`.
3529
"""
3630

37-
def __init__(self):
31+
def __init__(self, reasoning_at_start: bool = False) -> None:
32+
self.reasoning_start = "<think>"
3833
self.reasoning_end = "</think>"
39-
self.in_reasoning = True
34+
self.reasoning_at_start = reasoning_at_start
35+
self.in_reasoning = self.reasoning_at_start
36+
self._buffer = ""
4037

4138
def _create_reasoning_end_result(self, content: str,
4239
reasoning_content: str):
4340
if len(content) == 0:
4441
reasoning_parser_result = ReasoningParserResult(
45-
True, reasoning_content=reasoning_content)
42+
reasoning_content=reasoning_content)
4643
elif len(reasoning_content) == 0:
47-
reasoning_parser_result = ReasoningParserResult(False,
48-
content=content)
44+
reasoning_parser_result = ReasoningParserResult(content=content)
4945
else:
5046
reasoning_parser_result = ReasoningParserResult(
51-
False, content=content, reasoning_content=reasoning_content)
47+
content=content, reasoning_content=reasoning_content)
5248
return reasoning_parser_result
5349

5450
def parse(self, text: str) -> ReasoningParserResult:
55-
if self.reasoning_end not in text:
56-
return ReasoningParserResult(True, reasoning_content=text)
57-
58-
splits = text.split(self.reasoning_end, maxsplit=1)
59-
reasoning_content = splits[0]
60-
content = splits[1]
61-
62-
reasoning_parser_result = self._create_reasoning_end_result(
63-
content, reasoning_content)
64-
return reasoning_parser_result
51+
if not self.reasoning_at_start:
52+
splits = text.partition(self.reasoning_start)
53+
if splits[1] == "":
54+
# no reasoning start tag found
55+
return ReasoningParserResult(content=text)
56+
# reasoning start tag found
57+
# text before reasoning start tag is dropped
58+
text = splits[2]
59+
splits = text.partition(self.reasoning_end)
60+
reasoning_content, content = splits[0], splits[2]
61+
return ReasoningParserResult(content=content,
62+
reasoning_content=reasoning_content)
6563

6664
def parse_delta(self, delta_text: str) -> ReasoningParserResult:
67-
if self.in_reasoning and self.reasoning_end in delta_text:
65+
self._buffer += delta_text
66+
delta_text = self._buffer
67+
reasoning_content = None
68+
content = None
69+
if (self.reasoning_start.startswith(delta_text)
70+
or self.reasoning_end.startswith(delta_text)):
71+
# waiting for more text to determine if it's a reasoning start or end tag
72+
return ReasoningParserResult()
73+
74+
if not self.in_reasoning:
75+
begin_idx = delta_text.find(self.reasoning_start)
76+
if begin_idx == -1:
77+
self._buffer = ""
78+
return ReasoningParserResult(content=delta_text)
79+
self.in_reasoning = True
80+
# set reasoning_content, will be processed by the next block
81+
reasoning_content = delta_text[begin_idx +
82+
len(self.reasoning_start):]
83+
84+
if self.in_reasoning:
85+
delta_text = reasoning_content if reasoning_content is not None else delta_text
6886
end_idx = delta_text.find(self.reasoning_end)
87+
if end_idx == -1:
88+
last_idx = delta_text.rfind(self.reasoning_end[0])
89+
if last_idx != -1 and self.reasoning_end.startswith(
90+
delta_text[last_idx:]):
91+
self._buffer = delta_text[last_idx:]
92+
reasoning_content = delta_text[:last_idx]
93+
else:
94+
self._buffer = ""
95+
reasoning_content = delta_text
96+
return ReasoningParserResult(
97+
reasoning_content=reasoning_content)
6998
reasoning_content = delta_text[:end_idx]
7099
content = delta_text[end_idx + len(self.reasoning_end):]
71-
reasoning_parser_result = self._create_reasoning_end_result(
72-
content, reasoning_content)
73100
self.in_reasoning = False
74-
return reasoning_parser_result
75-
76-
if self.in_reasoning:
77-
return ReasoningParserResult(self.in_reasoning,
78-
reasoning_content=delta_text)
79-
80-
# not self.in_reasoning:
81-
return ReasoningParserResult(self.in_reasoning, content=delta_text)
101+
self._buffer = ""
102+
return ReasoningParserResult(content=content,
103+
reasoning_content=reasoning_content)
104+
raise RuntimeError(
105+
"Unreachable code reached in `DeepSeekR1Parser.parse_delta`")
82106

83107

84108
class ReasoningParserFactory:
85-
parsers: Dict[str, BaseReasoningParser] = {
109+
parsers: dict[str, Type[BaseReasoningParser]] = {
86110
"deepseek-r1": DeepSeekR1Parser,
111+
"qwen3": DeepSeekR1Parser,
87112
}
88113

89114
@staticmethod
90115
def create_reasoning_parser(reasoning_parser: str) -> BaseReasoningParser:
91-
if reasoning_parser not in ReasoningParserFactory.parsers:
92-
raise ValueError(f"Invalid reasoning_parser: {reasoning_parser}")
93-
reasoning_parser_class = ReasoningParserFactory.parsers.get(
94-
reasoning_parser.lower())
95-
return reasoning_parser_class()
116+
try:
117+
reasoning_parser_class = ReasoningParserFactory.parsers[
118+
reasoning_parser.lower()]
119+
if reasoning_parser == "deepseek-r1":
120+
return reasoning_parser_class(reasoning_at_start=True)
121+
return reasoning_parser_class()
122+
except KeyError as e:
123+
raise ValueError(
124+
f"Invalid reasoning parser: {reasoning_parser}\n"
125+
f"Supported parsers: {list(ReasoningParserFactory.parsers.keys())}"
126+
) from e

tensorrt_llm/serve/postprocess_handlers.py

Lines changed: 11 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def create_logprobs(token_ids: List[int], tokenizer: TransformersTokenizer,
9595

9696

9797
def apply_reasoning_parser(args: ChatPostprocArgs, output_index: int, text: str,
98-
streaming: bool) -> Tuple[bool, str, str]:
98+
streaming: bool) -> Tuple[str, str]:
9999
reasoning_parser = None
100100
if args.reasoning_parser is not None:
101101
if output_index not in args.reasoning_parser_dict:
@@ -104,17 +104,16 @@ def apply_reasoning_parser(args: ChatPostprocArgs, output_index: int, text: str,
104104
args.reasoning_parser)
105105
reasoning_parser = args.reasoning_parser_dict[output_index]
106106

107-
in_reasoning = False
108107
if reasoning_parser is not None:
109108
if not streaming:
110109
result = reasoning_parser.parse(text)
111110
else:
112111
result = reasoning_parser.parse_delta(text)
113-
in_reasoning, content, reasoning_content = result.in_reasoning, result.content, result.reasoning_content
112+
content, reasoning_content = result.content, result.reasoning_content
114113
else:
115-
in_reasoning, content, reasoning_content = False, text, None
114+
content, reasoning_content = text, ""
116115

117-
return in_reasoning, content, reasoning_content
116+
return content, reasoning_content
118117

119118

120119
@nvtx_range_debug("chat_stream_post_processor")
@@ -123,8 +122,8 @@ def chat_stream_post_processor(rsp: GenerationResultBase,
123122

124123
def yield_first_chat(num_tokens: int,
125124
idx: int,
126-
role: str = None,
127-
content: str = None):
125+
role: str | None = None,
126+
content: str | None = None):
128127
choice_data = ChatCompletionResponseStreamChoice(index=idx,
129128
delta=DeltaMessage(
130129
role=role,
@@ -171,7 +170,7 @@ def yield_first_chat(num_tokens: int,
171170

172171
delta_text = output.text_diff
173172

174-
in_reasoning, delta_text, reasoning_delta_text = apply_reasoning_parser(
173+
delta_text, reasoning_delta_text = apply_reasoning_parser(
175174
args, i, delta_text, True)
176175

177176
if args.tool_choice and type(
@@ -181,12 +180,8 @@ def yield_first_chat(num_tokens: int,
181180
name=args.tool_choice.function.name, arguments=delta_text))
182181
])
183182
else:
184-
if in_reasoning:
185-
delta_message = DeltaMessage(
186-
reasoning_content=reasoning_delta_text)
187-
else:
188-
delta_message = DeltaMessage(
189-
content=delta_text, reasoning_content=reasoning_delta_text)
183+
delta_message = DeltaMessage(content=delta_text,
184+
reasoning_content=reasoning_delta_text)
190185

191186
choice = ChatCompletionResponseStreamChoice(
192187
index=i,
@@ -239,8 +234,8 @@ def chat_response_post_processor(
239234
choices: List[ChatCompletionResponseChoice] = []
240235
role = args.role
241236
for output in rsp.outputs:
242-
_, text, reasoning_text = apply_reasoning_parser(
243-
args, output.index, output.text, False)
237+
text, reasoning_text = apply_reasoning_parser(args, output.index,
238+
output.text, False)
244239

245240
if args.tool_choice and isinstance(args.tool_choice,
246241
ChatCompletionNamedToolChoiceParam):
@@ -252,8 +247,6 @@ def chat_response_post_processor(
252247
name=args.tool_choice.function.name, arguments=text))
253248
])
254249
else:
255-
if text is None:
256-
text = ""
257250
message = ChatMessage(role=role,
258251
content=text,
259252
reasoning_content=reasoning_text)

tests/unittest/llmapi/apps/_test_openai_reasoning.py

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,13 @@
99
pytestmark = pytest.mark.threadleak(enabled=False)
1010

1111

12-
@pytest.fixture(scope="module", ids=["DeepSeek-R1-Distill-Qwen-1.5B"])
13-
def model_name() -> str:
14-
return "DeepSeek-R1-Distill-Qwen-1.5B"
12+
# yapf: disable
13+
@pytest.fixture(scope="module",
14+
params=["DeepSeek-R1-Distill-Qwen-1.5B",
15+
"Qwen3/Qwen3-0.6B"])
16+
def model_name(request) -> str:
17+
return request.param
18+
# yapf: enable
1519

1620

1721
@pytest.fixture(scope="module", params=["trt", "pytorch"])
@@ -21,12 +25,19 @@ def backend(request):
2125

2226
@pytest.fixture(scope="module")
2327
def server(model_name: str, backend: str):
28+
# Skip specific model/backend combinations
29+
if model_name == "Qwen3/Qwen3-0.6B" and backend == "trt":
30+
pytest.skip("Qwen3 model not supported with trt backend")
31+
2432
model_path = get_model_path(model_name)
2533
args = ["--backend", f"{backend}"]
2634
max_beam_width = 1 if backend == "pytorch" else 2
2735
args.extend(["--max_beam_width", str(max_beam_width)])
2836
args.extend(["--max_batch_size", "2", "--max_seq_len", "1024"])
29-
args.extend(["--reasoning_parser", "deepseek-r1"])
37+
if model_name.startswith("Qwen3"):
38+
args.extend(["--reasoning_parser", "qwen3"])
39+
else:
40+
args.extend(["--reasoning_parser", "deepseek-r1"])
3041
with RemoteOpenAIServer(model_path, args) as remote_server:
3142
yield remote_server
3243

@@ -51,16 +62,10 @@ def test_reasoning_parser(client: openai.OpenAI, model_name: str, backend: str):
5162
extra_body=extra_body,
5263
)
5364

54-
if backend == "pytorch":
55-
assert len(resp.choices) == n
56-
for resp_choice in resp.choices:
57-
assert len(resp_choice.message.content) > 0
58-
assert len(resp_choice.message.reasoning_content) > 0
59-
else:
60-
assert len(resp.choices) == n
61-
for resp_choice in resp.choices:
62-
assert len(resp_choice.message.content) > 0
63-
assert len(resp_choice.message.reasoning_content) > 0
65+
assert len(resp.choices) == n
66+
for resp_choice in resp.choices:
67+
assert len(resp_choice.message.content) > 0
68+
assert len(resp_choice.message.reasoning_content) > 0
6469

6570

6671
@pytest.fixture(scope="module")
@@ -78,9 +83,9 @@ async def process_stream(
7883
delta = choice.delta.dict()
7984
content = delta.get("content", None)
8085
reasoning_content = delta.get("reasoning_content", None)
81-
if content is not None:
86+
if content:
8287
content_chunks.append(content)
83-
if reasoning_content is not None:
88+
if reasoning_content:
8489
reasoning_content_chunks.append(reasoning_content)
8590
return (content_chunks, reasoning_content_chunks)
8691

@@ -105,12 +110,17 @@ async def test_reasoning_parser_streaming(async_client: openai.AsyncOpenAI,
105110
stream = await async_client.chat.completions.create(
106111
model=model_name,
107112
messages=messages,
108-
max_completion_tokens=1,
113+
max_completion_tokens=2,
109114
temperature=0.0,
110115
stream=True,
111116
)
112117

113118
content_chunks, reasoning_content_chunks = await process_stream(
114119
stream=stream)
115120
assert len(content_chunks) == 0
116-
assert len(reasoning_content_chunks) == 1
121+
if model_name.startswith("Qwen3"):
122+
# First token would be <think>
123+
assert len(reasoning_content_chunks) == 1
124+
else:
125+
# <think> is in chat template
126+
assert len(reasoning_content_chunks) == 2

0 commit comments

Comments (0)