Skip to content

Commit 5334b84

Browse files
authored
support deepseek-V3.1 & add no_think_prefix for hybrid thinking models (#5463)
* support model * update template * fix * fix template * fix template * no_think_prefix * fix glm prefix
1 parent dd838e7 commit 5334b84

File tree

10 files changed

+83
-5
lines changed

10 files changed

+83
-5
lines changed

docs/source/Instruction/支持的模型和数据集.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,8 @@
447447
|[deepseek-ai/DeepSeek-R1-Distill-Llama-8B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Llama-8B)|deepseek_r1_distill|deepseek_r1|-|✔|-|[deepseek-ai/DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B)|
448448
|[deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Llama-70B)|deepseek_r1_distill|deepseek_r1|-|✔|-|[deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B)|
449449
|[deepseek-ai/DeepSeek-R1-0528-Qwen3-8B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-0528-Qwen3-8B)|deepseek_r1_distill|deepseek_r1|-|✔|-|[deepseek-ai/DeepSeek-R1-0528-Qwen3-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528-Qwen3-8B)|
450+
|[deepseek-ai/DeepSeek-V3.1-Base](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3.1-Base)|deepseek_v3_1|deepseek_v3_1|transformers>=4.39.3|✘|-|[deepseek-ai/DeepSeek-V3.1-Base](https://huggingface.co/deepseek-ai/DeepSeek-V3.1-Base)|
451+
|[deepseek-ai/DeepSeek-V3.1](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3.1)|deepseek_v3_1|deepseek_v3_1|transformers>=4.39.3|✘|-|[deepseek-ai/DeepSeek-V3.1](https://huggingface.co/deepseek-ai/DeepSeek-V3.1)|
450452
|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16)|openbuddy_llama|openbuddy|-|✔|-|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://huggingface.co/OpenBuddy/openbuddy-llama-65b-v8-bf16)|
451453
|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16)|openbuddy_llama|openbuddy|-|✔|-|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://huggingface.co/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16)|
452454
|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16)|openbuddy_llama|openbuddy|-|✔|-|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://huggingface.co/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16)|

docs/source_en/Instruction/Supported-models-and-datasets.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,8 @@ The table below introduces the models integrated with ms-swift:
447447
|[deepseek-ai/DeepSeek-R1-Distill-Llama-8B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Llama-8B)|deepseek_r1_distill|deepseek_r1|-|✔|-|[deepseek-ai/DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B)|
448448
|[deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Llama-70B)|deepseek_r1_distill|deepseek_r1|-|✔|-|[deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B)|
449449
|[deepseek-ai/DeepSeek-R1-0528-Qwen3-8B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-0528-Qwen3-8B)|deepseek_r1_distill|deepseek_r1|-|✔|-|[deepseek-ai/DeepSeek-R1-0528-Qwen3-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528-Qwen3-8B)|
450+
|[deepseek-ai/DeepSeek-V3.1-Base](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3.1-Base)|deepseek_v3_1|deepseek_v3_1|transformers>=4.39.3|✘|-|[deepseek-ai/DeepSeek-V3.1-Base](https://huggingface.co/deepseek-ai/DeepSeek-V3.1-Base)|
451+
|[deepseek-ai/DeepSeek-V3.1](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3.1)|deepseek_v3_1|deepseek_v3_1|transformers>=4.39.3|✘|-|[deepseek-ai/DeepSeek-V3.1](https://huggingface.co/deepseek-ai/DeepSeek-V3.1)|
450452
|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16)|openbuddy_llama|openbuddy|-|✔|-|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://huggingface.co/OpenBuddy/openbuddy-llama-65b-v8-bf16)|
451453
|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16)|openbuddy_llama|openbuddy|-|✔|-|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://huggingface.co/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16)|
452454
|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16)|openbuddy_llama|openbuddy|-|✔|-|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://huggingface.co/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16)|

swift/llm/model/constant.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ class LLMModelType:
6464
deepseek_v2_5 = 'deepseek_v2_5'
6565
deepseek_r1 = 'deepseek_r1'
6666
deepseek_r1_distill = 'deepseek_r1_distill'
67+
deepseek_v3_1 = 'deepseek_v3_1'
6768

6869
openbuddy_llama = 'openbuddy_llama'
6970
openbuddy_llama3 = 'openbuddy_llama3'

swift/llm/model/model/deepseek.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,22 @@ def get_model_tokenizer_deepseek_moe(model_dir: str,
128128
requires=['transformers>=4.39.3'],
129129
))
130130

131+
# Register DeepSeek-V3.1 (base and chat variants) under the deepseek_v3_1
# model type. Both repos use the DeepseekV3ForCausalLM architecture and the
# same MoE tokenizer loader as earlier DeepSeek MoE checkpoints.
register_model(
    ModelMeta(
        LLMModelType.deepseek_v3_1,
        [
            ModelGroup([
                Model('deepseek-ai/DeepSeek-V3.1-Base', 'deepseek-ai/DeepSeek-V3.1-Base'),
                Model('deepseek-ai/DeepSeek-V3.1', 'deepseek-ai/DeepSeek-V3.1'),
            ]),
        ],
        TemplateType.deepseek_v3_1,
        get_model_tokenizer_deepseek_moe,
        architectures=['DeepseekV3ForCausalLM'],
        model_arch=ModelArch.deepseek_v2,
        requires=['transformers>=4.39.3'],
    ))
146+
131147

132148
def _get_deepseek_vl(processor, llm_prefix, model_dir, *args, **kwargs):
133149
kwargs['tokenizer'] = processor.tokenizer

swift/llm/template/constant.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ class LLMTemplateType:
5555
deepseek_coder = 'deepseek_coder'
5656
deepseek_v2_5 = 'deepseek_v2_5'
5757
deepseek_r1 = 'deepseek_r1'
58+
deepseek_v3_1 = 'deepseek_v3_1'
5859

5960
openbuddy = 'openbuddy'
6061
openbuddy2 = 'openbuddy2'

swift/llm/template/template/deepseek.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,9 +244,20 @@ class DeepseekV2_5TemplateMeta(TemplateMeta):
244244

245245
register_template(DeepseekV2_5TemplateMeta(LLMTemplateType.deepseek_v2_5))
246246

247+
248+
class DeepseekV3_1Template(ThinkingTemplate):
    """Hybrid-thinking template for DeepSeek-V3.1.

    In non-thinking mode the chat template closes the reasoning block
    immediately, so assistant content is prefixed with '</think>'.
    """
    # Prepended to assistant messages that do not start with '<think>'.
    no_think_prefix = '</think>'
    # Restored on historical assistant turns after their reasoning is stripped.
    history_think_prefix = '</think>'
251+
252+
247253
# DeepSeek-R1 always reasons: generation begins inside an opened <think> block.
register_template(
    DeepseekV2_5TemplateMeta(LLMTemplateType.deepseek_r1, template_cls=ThinkingTemplate, response_prefix='<think>\n'))
249255

256+
# DeepSeek-V3.1 is registered in non-thinking mode (response_prefix='</think>');
# to enable thinking, use response_prefix='<think>' instead.
register_template(
    DeepseekV2_5TemplateMeta(
        LLMTemplateType.deepseek_v3_1, template_cls=DeepseekV3_1Template, response_prefix='</think>'))
260+
250261

251262
class DeepseekVL2Template(DeepseekVLTemplate):
252263
image_placeholder = ['<image>\n']

swift/llm/template/template/glm.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,13 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
229229

230230
register_template(GLM4_0414TemplateMeta(LLMTemplateType.glm4_0414, template_cls=GLM4_0414Template))
231231

232-
register_template(GLM4_0414TemplateMeta(LLMTemplateType.glm4_5, template_cls=ThinkingTemplate))
232+
233+
class GLM4_5Template(ThinkingTemplate):
    """GLM-4.5 hybrid-thinking template.

    Non-thinking assistant turns carry an empty '<think></think>' block so
    swift-encoded prompts match the model's jinja chat template.
    """
    # Prepended to assistant messages that do not start with '<think>'.
    no_think_prefix = '<think></think>\n'
    # Restored on historical assistant turns after their reasoning is stripped.
    history_think_prefix = '<think></think>\n'
236+
237+
238+
# GLM-4.5 reuses the GLM4-0414 meta but with the hybrid-thinking GLM4_5Template.
register_template(
    GLM4_0414TemplateMeta(LLMTemplateType.glm4_5, template_cls=GLM4_5Template))
233239

234240
register_template(GLM4_1VTemplateMeta(MLLMTemplateType.glm4_1v, template_cls=GLM4_1VTemplate))
235241

swift/llm/template/template/qwen.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,12 @@ class Qwen2_5MathTemplateMeta(QwenTemplateMeta):
5252
QwenTemplateMeta(
5353
LLMTemplateType.qwq, default_system=None, response_prefix='<think>\n', template_cls=ThinkingTemplate))
5454

55-
# '<think>\n\n</think>\n\n'
56-
register_template(QwenTemplateMeta(LLMTemplateType.qwen3, default_system=None, template_cls=ThinkingTemplate))
55+
56+
class Qwen3Template(ThinkingTemplate):
    """Qwen3 hybrid-thinking template.

    Assistant turns without an explicit reasoning section get an empty think
    block ('<think>\\n\\n</think>\\n\\n') so they match the jinja chat template.
    """
    # Prepended to assistant messages that do not start with '<think>'.
    no_think_prefix = '<think>\n\n</think>\n\n'
58+
59+
60+
# Qwen3: hybrid thinking, no default system prompt.
register_template(
    QwenTemplateMeta(LLMTemplateType.qwen3, default_system=None, template_cls=Qwen3Template))
5761

5862
register_template(
5963
QwenTemplateMeta(

swift/llm/template/template/utils.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,19 @@ class EmptyTemplateMeta(TemplateMeta):
3434

3535
class ThinkingTemplate(Template):
3636
with_answer = False
37+
no_think_prefix = '' # for hybrid thinking model
38+
history_think_prefix = ''
3739

3840
def _swift_prepare_inputs(self, inputs):
3941
super()._swift_prepare_inputs(inputs)
4042
messages = inputs.messages
43+
44+
if self.no_think_prefix:
45+
for i, message in enumerate(messages):
46+
if message['role'] == 'assistant' and isinstance(message['content'], str):
47+
if not message['content'].startswith('<think>'):
48+
message['content'] = self.no_think_prefix + message['content']
49+
4150
# Only during inference or training, and only if the loss_scale is set to 'last_round',
4251
# will the previous 'think' entries be deleted.
4352
if not self.is_training or self.loss_scale.name in {'last_round', 'last_round_with_ignore_empty_think'}:
@@ -48,7 +57,8 @@ def _swift_prepare_inputs(self, inputs):
4857
message['content'] = message['content'].split('<answer>')[-1].rstrip().rstrip(
4958
'</answer>').strip()
5059
else:
51-
message['content'] = message['content'].split('</think>')[-1].strip()
60+
message['content'] = self.history_think_prefix + message['content'].split(
61+
'</think>')[-1].strip()
5262

5363

5464
class ThinkingWithAnswerTemplate(ThinkingTemplate):

tests/test_align/test_template/test_template.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,33 @@ def test_minimax_vl():
9595
assert len(res['input_ids']) == 5877
9696

9797

98+
def test_deepseek_v3_1():
    """Swift's DeepSeek-V3.1 template must tokenize identically to the
    model's jinja chat template for a multi-turn conversation."""
    tokenizer = get_model_tokenizer('deepseek-ai/DeepSeek-V3.1', load_model=False)[1]
    template = get_template(tokenizer.model_meta.template, tokenizer)
    conversation = [('system', '000'), ('user', 'aaa'), ('assistant', 'bbb'), ('user', 'ccc')]
    inputs = TemplateInputs(messages=[{'role': role, 'content': content} for role, content in conversation])
    res_swift = template.encode(inputs)
    template.print_inputs(res_swift)
    # Re-encode with the jinja backend and compare token ids.
    template.template_backend = 'jinja'
    res_jinja = template.encode(inputs)
    template.print_inputs(res_jinja)
    assert res_swift['input_ids'] == res_jinja['input_ids']
120+
121+
98122
if __name__ == '__main__':
    # Run a single template test; toggle others by (un)commenting.
    # test_deepseek_v2_5()
    # test_qwen2_5_math_reward()
    # test_minimax()
    # test_minimax_vl()
    test_deepseek_v3_1()

0 commit comments

Comments
 (0)