Commit 16b943b

committed
feat: support ds v3.2 encoding
1 parent 882b4d7 commit 16b943b

File tree

4 files changed: +694 -0 lines changed


rtp_llm/models/deepseek_v2.py

Lines changed: 1 addition & 0 deletions
@@ -714,3 +714,4 @@ def get_weight_cls():
 register_model("deepseek-v3-mtp", DeepSeekV3Mtp, ["DeepseekV3ForCausalLMNextN"])
 register_model("kimi_k2", DeepSeekV2, [])
 register_model("deepseek_v31", DeepSeekV2, [])
+register_model("deepseek_v32", DeepSeekV2, [])  # DeepSeek-V3.2 uses same architecture as V3.1, with encoding script update
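For context: register_model appears to map a model-type string (and, per the third argument, any Hugging Face architecture aliases) to an implementation class. A minimal sketch of that registry pattern, with hypothetical internals that are not RTP-LLM's actual code:

_MODEL_REGISTRY = {}

def register_model(name, cls, hf_architectures):
    # Route both the short name and any HF architecture strings to the class
    _MODEL_REGISTRY[name] = cls
    for arch in hf_architectures:
        _MODEL_REGISTRY[arch] = cls

# After this commit, a config declaring model type "deepseek_v32" resolves
# to the existing DeepSeekV2 implementation:
model_cls = _MODEL_REGISTRY["deepseek_v32"]  # -> DeepSeekV2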

rtp_llm/openai/renderers/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@
 from .chatglm4_renderer import ChatGlm4Renderer
 from .chatglm45_renderer import ChatGlm45Renderer
 from .deepseekv31_renderer import DeepseekV31Renderer
+from .deepseekv32_renderer import DeepseekV32Renderer
 from .internvl_renderer import InternVLRenderer
 from .kimik2_renderer import KimiK2Renderer
 from .llava_renderer import LlavaRenderer
rtp_llm/openai/renderers/deepseekv32_renderer.py

Lines changed: 301 additions & 0 deletions
@@ -0,0 +1,301 @@
import importlib.util
import logging
import os
import sys
from typing import Optional

from typing_extensions import override

from rtp_llm.frontend.tokenizer_factory.tokenizers import BaseTokenizer
from rtp_llm.openai.api_datatype import ChatCompletionRequest
from rtp_llm.openai.renderer_factory_register import register_renderer
from rtp_llm.openai.renderers.custom_renderer import RenderedInputs, RendererParams
from rtp_llm.openai.renderers.reasoning_tool_base_renderer import (
    ReasoningToolBaseRenderer,
)
from rtp_llm.openai.renderers.sglang_helpers.function_call.base_format_detector import (
    BaseFormatDetector,
)
from rtp_llm.openai.renderers.sglang_helpers.function_call.deepseekv32_detector import (
    DeepSeekV32Detector,
)
from rtp_llm.openai.renderers.sglang_helpers.reasoning_parser import ReasoningParser


class DeepseekV32Renderer(ReasoningToolBaseRenderer):
    """DeepSeek V3.2 renderer.

    This renderer uses a dedicated Python encoding script instead of Jinja templates.
    The encoding script is loaded from the checkpoint's "encoding" folder.

    Key features:
    1. Loads encoding_dsv32.py from the checkpoint's encoding folder
    2. Uses its encode_messages function for rendering
    3. Supports thinking mode and tool calls
    """

    def __init__(
        self,
        tokenizer: BaseTokenizer,
        renderer_params: RendererParams,
    ):
        # Load the encoding module before calling super().__init__()
        self.encoding_module = self._load_encoding_module(renderer_params.ckpt_path)
        super().__init__(tokenizer, renderer_params)

    def _load_encoding_module(self, ckpt_path: str):
        """
        Load the encoding_dsv32.py module from the checkpoint's encoding folder.

        Args:
            ckpt_path: Path to the checkpoint directory

        Returns:
            The loaded encoding module

        Raises:
            FileNotFoundError: If the encoding script is not found
            ImportError: If the encoding script cannot be loaded
        """
        encoding_folder = os.path.join(ckpt_path, "encoding")
        encoding_script_path = os.path.join(encoding_folder, "encoding_dsv32.py")

        if not os.path.exists(encoding_script_path):
            raise FileNotFoundError(
                f"DeepSeek V3.2 encoding script not found at {encoding_script_path}. "
                f"Please ensure the checkpoint includes the 'encoding' folder with encoding_dsv32.py"
            )

        try:
            spec = importlib.util.spec_from_file_location(
                "encoding_dsv32", encoding_script_path
            )
            if spec is None or spec.loader is None:
                raise ImportError(f"Failed to load spec from {encoding_script_path}")

            module = importlib.util.module_from_spec(spec)
            sys.modules["encoding_dsv32"] = module
            spec.loader.exec_module(module)

            logging.info(
                f"Successfully loaded DeepSeek V3.2 encoding module from {encoding_script_path}"
            )
            return module
        except Exception as e:
            # Chain the original exception so the root cause stays in the traceback
            raise ImportError(
                f"Failed to load DeepSeek V3.2 encoding module from {encoding_script_path}: {str(e)}"
            ) from e
    @override
    def _setup_chat_template(self):
        """
        DeepSeek V3.2 doesn't use Jinja templates.
        The chat_template attribute is set to None to indicate custom rendering.
        """
        self.chat_template = None

    @override
    def in_think_mode(self, request: ChatCompletionRequest) -> bool:
        """
        Check if thinking mode is enabled.

        Supports both parent class logic and the per-request enable_thinking parameter.

        Args:
            request: Chat completion request

        Returns:
            True if thinking mode is enabled, False otherwise
        """
        # Check parent class logic first
        thinking_enabled = super().in_think_mode(request)

        # Check if enable_thinking is explicitly set in request kwargs
        if request.chat_template_kwargs and request.chat_template_kwargs.get(
            "enable_thinking"
        ):
            thinking_enabled = True
        if (
            request.extra_configs
            and request.extra_configs.chat_template_kwargs
            and isinstance(request.extra_configs.chat_template_kwargs, dict)
            and request.extra_configs.chat_template_kwargs.get("enable_thinking")
        ):
            thinking_enabled = True

        return thinking_enabled

    def _build_prompt(self, request: ChatCompletionRequest) -> str:
        """
        Build the prompt string using the DeepSeek V3.2 encoding script.

        Args:
            request: Chat completion request

        Returns:
            str: Rendered prompt string
        """
        # Convert request messages to the format expected by encoding_dsv32
        messages = []
        for msg in request.messages:
            message_dict = {"role": msg.role.value, "content": msg.content}

            # Add tool_calls if present (on assistant messages)
            if hasattr(msg, "tool_calls") and msg.tool_calls:
                message_dict["tool_calls"] = [
                    {
                        "type": "function",
                        "id": tc.id,
                        "function": {
                            "name": tc.function.name,
                            "arguments": tc.function.arguments,
                        },
                    }
                    for tc in msg.tool_calls
                ]

            # Add reasoning_content if present
            if hasattr(msg, "reasoning_content") and msg.reasoning_content:
                message_dict["reasoning_content"] = msg.reasoning_content

            messages.append(message_dict)

        # Add request-level tools to the first system message:
        # per the encoding_dsv32 format, tools must be attached to a system message
        if request.tools:
            tools_data = [
                {
                    "type": "function",
                    "function": {
                        "name": tool.function.name,
                        "description": tool.function.description,
                        "parameters": tool.function.parameters,
                    },
                }
                for tool in request.tools
            ]

            # Find the first system message and add tools to it
            has_system = False
            for msg in messages:
                if msg["role"] == "system":
                    msg["tools"] = tools_data
                    has_system = True
                    break

            # If no system message exists, create one with tools
            if not has_system:
                messages.insert(
                    0, {"role": "system", "content": "", "tools": tools_data}
                )

        # Determine thinking mode
        thinking_mode = "thinking" if self.in_think_mode(request) else "chat"

        # Configure encoding:
        # drop_thinking=True: remove reasoning_content from historical assistant messages
        # add_default_bos_token=True: always add the BOS token since we encode full messages
        encode_config = {
            "thinking_mode": thinking_mode,
            "drop_thinking": True,
            "add_default_bos_token": True,
        }

        # Override with custom configs if provided.
        # Note: the context parameter is not used since RTP-LLM always provides
        # the full message history.
        if request.chat_template_kwargs:
            encode_config.update(request.chat_template_kwargs)

        if (
            request.extra_configs
            and request.extra_configs.chat_template_kwargs
            and isinstance(request.extra_configs.chat_template_kwargs, dict)
        ):
            encode_config.update(request.extra_configs.chat_template_kwargs)

        # Filter encode_config to only the parameters accepted by encode_messages():
        # thinking_mode, context, drop_thinking, add_default_bos_token
        valid_params = {"thinking_mode", "context", "drop_thinking", "add_default_bos_token"}
        filtered_config = {k: v for k, v in encode_config.items() if k in valid_params}

        try:
            # Use the encoding module to encode messages
            rendered_prompt = self.encoding_module.encode_messages(
                messages, **filtered_config
            )

            logging.debug(
                f"DeepSeek V3.2 rendered prompt (thinking_mode={thinking_mode}): "
                f"{rendered_prompt[:200]}..."
            )

            return rendered_prompt
        except Exception as e:
            logging.error(f"Failed to render DeepSeek V3.2 prompt: {str(e)}")
            raise ValueError(f"Error rendering DeepSeek V3.2 prompt: {str(e)}") from e

    @override
    def render_chat(self, request: ChatCompletionRequest) -> RenderedInputs:
        """
        Render chat messages using the DeepSeek V3.2 encoding script.

        Args:
            request: Chat completion request

        Returns:
            RenderedInputs with encoded token IDs and the rendered prompt
        """
        prompt = self._build_prompt(request)
        input_ids = self.tokenizer.encode(prompt)
        return RenderedInputs(input_ids=input_ids, rendered_prompt=prompt)

    @override
    def _create_detector(
        self, request: ChatCompletionRequest
    ) -> Optional[BaseFormatDetector]:
        """
        Create the DSML format detector for tool calls.

        Args:
            request: Chat completion request

        Returns:
            DeepSeekV32Detector if tools are present, None otherwise
        """
        if request.tools:
            # Determine thinking_mode based on whether the request is in thinking mode
            thinking_mode = "thinking" if self.in_think_mode(request) else "chat"

            # Pass the encoding module and thinking_mode to the detector.
            # A detector is created fresh for each request (not a singleton).
            return DeepSeekV32Detector(
                encoding_module=self.encoding_module,
                thinking_mode=thinking_mode,
            )
        return None

    @override
    def _create_reasoning_parser(
        self, request: ChatCompletionRequest
    ) -> Optional[ReasoningParser]:
        """
        Create a reasoning parser if in thinking mode.

        Args:
            request: Chat completion request

        Returns:
            ReasoningParser if thinking mode is enabled, None otherwise
        """
        if not self.in_think_mode(request):
            return None

        try:
            # Check whether the rendered prompt actually opens a thinking block
            rendered_result = self.render_chat(request)
            if "<think>" in rendered_result.rendered_prompt:
                return ReasoningParser(model_type="deepseek-v3", force_reasoning=True)
        except Exception:
            return None

        return None


register_renderer("deepseek_v32", DeepseekV32Renderer)
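
For reference, a minimal sketch of the contract this renderer assumes of the checkpoint's encoding/encoding_dsv32.py script. Only the function name and the four keyword parameters are implied by the renderer code above (see valid_params); the types, defaults, and example payload below are illustrative assumptions, and the real script ships with the DeepSeek-V3.2 checkpoint:

from typing import Any, Dict, List, Optional

def encode_messages(
    messages: List[Dict[str, Any]],
    thinking_mode: str = "chat",          # "thinking" or "chat"
    context: Optional[Any] = None,        # unused here: RTP-LLM passes the full history
    drop_thinking: bool = True,           # strip reasoning_content from earlier turns
    add_default_bos_token: bool = True,   # prepend BOS since the full prompt is encoded
) -> str:
    """Render a message list into the DeepSeek-V3.2 prompt string."""
    ...

# Example of the message structure _build_prompt() assembles before the call:
# request-level tools land on the first system message, and a per-request
# chat_template_kwargs={"enable_thinking": True} switches thinking_mode.
messages = [
    {
        "role": "system",
        "content": "",
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "get_weather",  # hypothetical tool
                    "description": "Look up current weather for a city",
                    "parameters": {"type": "object", "properties": {}},
                },
            }
        ],
    },
    {"role": "user", "content": "What's the weather in Paris?"},
]
prompt = encode_messages(messages, thinking_mode="thinking")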
