[train] Support new special tokens (#4945)

Jintao-Huang · web-flow · commit 2ea885ebac81 · 2025-07-14T16:03:54.000+08:00
diff --git a/docs/source/Instruction/命令行参数.md b/docs/source/Instruction/命令行参数.md
@@ -32,6 +32,7 @@
 - 🔥torch_dtype: 模型权重的数据类型，支持`float16`,`bfloat16`,`float32`。默认为None，从config.json文件中读取。
 - attn_impl: attention类型，可选项为`flash_attn`, `sdpa`, `eager`。默认使用None，读取config.json。
   - 注意：这三种实现并不一定都支持，这取决于对应模型的支持情况。
+- new_special_tokens: 需要新增的特殊tokens。每一项可以是token本身，也可以是以`.txt`结尾的文件路径；文件内容会按空白字符切分（例如每行一个token），切分出的每一项都会作为新增token。默认为`[]`。例子参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/new_special_tokens)。
 - num_labels: 分类模型（即`--task_type seq_cls`）需要指定该参数。代表标签数量，默认为None。
 - problem_type: 分类模型（即`--task_type seq_cls`）需要指定该参数。可选为'regression', 'single_label_classification', 'multi_label_classification'。默认为None，根据num_labels和数据集类型进行自动设置。
 - rope_scaling: rope类型，支持`linear`和`dynamic`和`yarn`，请配合`max_length`共同使用。默认为None。
@@ -639,7 +640,7 @@ App参数继承于[部署参数](#部署参数), [Web-UI参数](#Web-UI参数)
 ## 特定模型参数
 特定模型参数可以通过`--model_kwargs`或者环境变量进行设置，例如: `--model_kwargs '{"fps_max_frames": 12}'`或者`FPS_MAX_FRAMES=12`。
 
-### qwen2_vl, qvq, qwen2_5_vl
+### qwen2_vl, qvq, qwen2_5_vl, mimo_vl
 参数含义同`qwen_vl_utils`或者`qwen_omni_utils`库，可以查看[这里](https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L24)。
 
 - IMAGE_FACTOR: 默认为28。
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
@@ -33,6 +33,7 @@ Hints:
 - 🔥torch_dtype: Data type of model weights, supports `float16`, `bfloat16`, `float32`. The default is None, and it is read from the 'config.json' file.
 - attn_impl: The type of attention, with options including `flash_attn`, `sdpa`, and `eager`. The default is None, which reads from `config.json`.
   - Note: These three implementations may not all be supported, depending on the support of the corresponding model.
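+- new_special_tokens: The special tokens to be added. Each entry is either a literal token or a path to a file ending in `.txt`; the file's contents are split on whitespace (e.g., one token per line) and every resulting item is added as a token. Default is `[]`. See the example [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/new_special_tokens).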
 - num_labels: This parameter is required for classification models (i.e., `--task_type seq_cls`). It represents the number of labels, with a default value of None.
 - problem_type: This parameter is required for classification models (i.e., `--task_type seq_cls`). The options are 'regression', 'single_label_classification', and 'multi_label_classification'. The default value is None, and it will be automatically set based on the number of labels and the dataset type.
 - rope_scaling: Type of rope, supports `linear` and `dynamic` and `yarn`, should be used in conjunction with `max_length`. Default is None.
@@ -658,7 +659,7 @@ Export Arguments include the [basic arguments](#base-arguments) and [merge argum
 
 Specific model arguments can be set using `--model_kwargs` or environment variables, for example: `--model_kwargs '{"fps_max_frames": 12}'` or `FPS_MAX_FRAMES=12`.
 
-### qwen2_vl, qvq, qwen2_5_vl
+### qwen2_vl, qvq, qwen2_5_vl, mimo_vl
 The parameter meanings are the same as in the `qwen_vl_utils` or `qwen_omni_utils` library. You can refer to [here](https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L24)
 
 - IMAGE_FACTOR: Default is 28
diff --git a/examples/train/new_special_tokens/infer.sh b/examples/train/new_special_tokens/infer.sh
@@ -0,0 +1,6 @@
+CUDA_VISIBLE_DEVICES=0 \
+swift infer \
+    --adapters output/vx-xxx/checkpoint-xxx \
+    --max_batch_size 16 \
+    --load_data_args true \
+    --temperature 0
diff --git a/examples/train/new_special_tokens/merge_lora.sh b/examples/train/new_special_tokens/merge_lora.sh
@@ -0,0 +1,13 @@
+CUDA_VISIBLE_DEVICES=0 \
+swift export \
+    --adapters output/vx-xxx/checkpoint-xxx \
+    --merge_lora true
+
+
+# infer
+CUDA_VISIBLE_DEVICES=0 \
+swift infer \
+    --adapters output/vx-xxx/checkpoint-xxx-merged \
+    --max_batch_size 16 \
+    --load_data_args true \
+    --temperature 0
diff --git a/examples/train/new_special_tokens/tokens.txt b/examples/train/new_special_tokens/tokens.txt
@@ -0,0 +1,100 @@
+<|0|>
+<|1|>
+<|2|>
+<|3|>
+<|4|>
+<|5|>
+<|6|>
+<|7|>
+<|8|>
+<|9|>
+<|10|>
+<|11|>
+<|12|>
+<|13|>
+<|14|>
+<|15|>
+<|16|>
+<|17|>
+<|18|>
+<|19|>
+<|20|>
+<|21|>
+<|22|>
+<|23|>
+<|24|>
+<|25|>
+<|26|>
+<|27|>
+<|28|>
+<|29|>
+<|30|>
+<|31|>
+<|32|>
+<|33|>
+<|34|>
+<|35|>
+<|36|>
+<|37|>
+<|38|>
+<|39|>
+<|40|>
+<|41|>
+<|42|>
+<|43|>
+<|44|>
+<|45|>
+<|46|>
+<|47|>
+<|48|>
+<|49|>
+<|50|>
+<|51|>
+<|52|>
+<|53|>
+<|54|>
+<|55|>
+<|56|>
+<|57|>
+<|58|>
+<|59|>
+<|60|>
+<|61|>
+<|62|>
+<|63|>
+<|64|>
+<|65|>
+<|66|>
+<|67|>
+<|68|>
+<|69|>
+<|70|>
+<|71|>
+<|72|>
+<|73|>
+<|74|>
+<|75|>
+<|76|>
+<|77|>
+<|78|>
+<|79|>
+<|80|>
+<|81|>
+<|82|>
+<|83|>
+<|84|>
+<|85|>
+<|86|>
+<|87|>
+<|88|>
+<|89|>
+<|90|>
+<|91|>
+<|92|>
+<|93|>
+<|94|>
+<|95|>
+<|96|>
+<|97|>
+<|98|>
+<|99|>
diff --git a/examples/train/new_special_tokens/train.sh b/examples/train/new_special_tokens/train.sh
@@ -0,0 +1,31 @@
+# 4 * 26GB
+# This example is just a demo showing how to add new_special_tokens.
+NPROC_PER_NODE=4 \
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+swift sft \
+    --model Qwen/Qwen2.5-7B-Instruct \
+    --train_type lora \
+    --dataset 'swift/new_special_tokens' \
+    --split_dataset_ratio 0.01 \
+    --new_special_tokens examples/train/new_special_tokens/tokens.txt \
+    --torch_dtype bfloat16 \
+    --num_train_epochs 5 \
+    --per_device_train_batch_size 16 \
+    --per_device_eval_batch_size 16 \
+    --padding_free true \
+    --attn_impl flash_attn \
+    --learning_rate 1e-4 \
+    --lora_rank 16 \
+    --lora_alpha 32 \
+    --target_modules all-linear \
+    --modules_to_save embed_tokens lm_head \
+    --gradient_accumulation_steps 1 \
+    --eval_steps 500 \
+    --save_steps 500 \
+    --save_total_limit 2 \
+    --logging_steps 5 \
+    --max_length 2048 \
+    --output_dir output \
+    --warmup_ratio 0.05 \
+    --dataloader_num_workers 4 \
+    --deepspeed zero2
diff --git a/swift/llm/argument/base_args/base_args.py b/swift/llm/argument/base_args/base_args.py
@@ -237,6 +237,7 @@ def load_args_from_ckpt(self) -> None:
             'model_revision',
             'torch_dtype',
             'attn_impl',
+            'new_special_tokens',
             'num_labels',
             'problem_type',
             # quant_args
diff --git a/swift/llm/argument/base_args/model_args.py b/swift/llm/argument/base_args/model_args.py
@@ -3,7 +3,7 @@
 import math
 import os
 from dataclasses import dataclass, field
-from typing import Any, Dict, Literal, Optional, Union
+from typing import Any, Dict, List, Literal, Optional, Union
 
 import json
 import torch
@@ -42,6 +42,7 @@ class ModelArguments:
     # flash_attn: It will automatically convert names based on the model.
     # None: It will be automatically selected between sdpa and eager.
     attn_impl: Literal['flash_attn', 'sdpa', 'eager', 'flex_attention', None] = None
+    new_special_tokens: List[str] = field(default_factory=list)
 
     num_labels: Optional[int] = None
     problem_type: Literal['regression', 'single_label_classification', 'multi_label_classification'] = None
@@ -149,9 +150,24 @@ def _init_model_info(self) -> torch.dtype:
             self._init_rope_scaling()
         return self.model_info.torch_dtype
 
+    def _init_new_special_tokens(self):
+        if isinstance(self.new_special_tokens, str):
+            self.new_special_tokens = [self.new_special_tokens]
+        new_special_tokens = []
+        for token in self.new_special_tokens:
+            if token.endswith('.txt'):
+                assert os.path.isfile(token), f'special_tokens_path: {token}'
+                with open(token, 'r') as f:
+                    text = f.read()
+                new_special_tokens += text.split()
+            else:
+                new_special_tokens.append(token)
+        self.new_special_tokens = new_special_tokens
+
     def __post_init__(self):
         if self.model is None:
             raise ValueError(f'Please set --model <model_id_or_path>`, model: {self.model}')
+        self._init_new_special_tokens()
         self.model_suffix = get_model_name(self.model)
         self._init_device_map()
         self._init_max_memory()
@@ -170,6 +186,7 @@ def get_model_kwargs(self):
             'max_memory': self.max_memory,
             'quantization_config': self.get_quantization_config(),
             'attn_impl': self.attn_impl,
+            'new_special_tokens': self.new_special_tokens,
             'rope_scaling': self.rope_scaling,
             'task_type': self.task_type,
             'num_labels': self.num_labels,
diff --git a/swift/llm/model/register.py b/swift/llm/model/register.py
@@ -559,6 +559,7 @@ def get_model_tokenizer(
         quantization_config=None,
         max_memory: Union[str, Dict[str, Any]] = None,
         attn_impl: Literal['flash_attn', 'sdpa', 'eager', None] = None,
+        new_special_tokens: Optional[List[str]] = None,
         rope_scaling: Optional[Dict[str, Any]] = None,
         automodel_class=None,
         task_type: Literal['causal_lm', 'seq_cls', 'reranker', 'generative_reranker'] = None,
@@ -617,6 +618,13 @@ def get_model_tokenizer(
         patch_getattr(processor.__class__, 'tokenizer')
     else:
         tokenizer = processor
+    if new_special_tokens:
+        num_new_tokens = tokenizer.add_special_tokens({'additional_special_tokens': new_special_tokens})
+        if num_new_tokens > 0:
+            logger.info(f'Added {num_new_tokens} new special tokens.')
+            if model.config.vocab_size < len(tokenizer):
+                model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)
+
     problem_type = kwargs.get('problem_type')
     if problem_type is None and model_info.num_labels == 1:
         problem_type = 'regression'

-Original file line number
+Diff line change
@@ @@ -0,0 +1,100 @@ @@
 +<|0|>
 +<|1|>
 +<|2|>
 +<|3|>
 +<|4|>
 +<|5|>
 +<|6|>
 +<|7|>
 +<|8|>
 +<|9|>
 +<|10|>
 +<|11|>
 +<|12|>
 +<|13|>
 +<|14|>
 +<|15|>
 +<|16|>
 +<|17|>
 +<|18|>
 +<|19|>
 +<|20|>
 +<|21|>
 +<|22|>
 +<|23|>
 +<|24|>
 +<|25|>
 +<|26|>
 +<|27|>
 +<|28|>
 +<|29|>
 +<|30|>
 +<|31|>
 +<|32|>
 +<|33|>
 +<|34|>
 +<|35|>
 +<|36|>
 +<|37|>
 +<|38|>
 +<|39|>
 +<|40|>
 +<|41|>
 +<|42|>
 +<|43|>
 +<|44|>
 +<|45|>
 +<|46|>
 +<|47|>
 +<|48|>
 +<|49|>
 +<|50|>
 +<|51|>
 +<|52|>
 +<|53|>
 +<|54|>
 +<|55|>
 +<|56|>
 +<|57|>
 +<|58|>
 +<|59|>
 +<|60|>
 +<|61|>
 +<|62|>
 +<|63|>
 +<|64|>
 +<|65|>
 +<|66|>
 +<|67|>
 +<|68|>
 +<|69|>
 +<|70|>
 +<|71|>
 +<|72|>
 +<|73|>
 +<|74|>
 +<|75|>
 +<|76|>
 +<|77|>
 +<|78|>
 +<|79|>
 +<|80|>
 +<|81|>
 +<|82|>
 +<|83|>
 +<|84|>
 +<|85|>
 +<|86|>
 +<|87|>
 +<|88|>
 +<|89|>
 +<|90|>
 +<|91|>
 +<|92|>
 +<|93|>
 +<|94|>
 +<|95|>
 +<|96|>
 +<|97|>
 +<|98|>
 +<|99|>