Merge branch 'main' into release/3.0

Jintao-Huang · Jintao-Huang · commit e1c69dc30157 · 2024-12-25T14:53:23.000+08:00
diff --git a/README.md b/README.md
@@ -147,6 +147,7 @@ CUDA_VISIBLE_DEVICES=0 \
 swift infer \
     --adapters output/vx-xxx/checkpoint-xxx \
     --stream true \
+    --temperature 0 \
     --max_new_tokens 2048
 
 # merge-lora and use vLLM for inference acceleration
@@ -157,6 +158,7 @@ swift infer \
     --merge_lora true \
     --infer_backend vllm \
     --max_model_len 8192 \
+    --temperature 0 \
     --max_new_tokens 2048
 ```
 
diff --git a/README_CN.md b/README_CN.md
@@ -140,6 +140,7 @@ CUDA_VISIBLE_DEVICES=0 \
 swift infer \
     --adapters output/vx-xxx/checkpoint-xxx \
     --stream true \
+    --temperature 0 \
     --max_new_tokens 2048
 
 # merge-lora并使用vLLM进行推理加速
@@ -150,6 +151,7 @@ swift infer \
     --merge_lora true \
     --infer_backend vllm \
     --max_model_len 8192 \
+    --temperature 0 \
     --max_new_tokens 2048
 ```
 
diff --git a/docs/source/GetStarted/快速开始.md b/docs/source/GetStarted/快速开始.md
@@ -64,6 +64,7 @@ CUDA_VISIBLE_DEVICES=0 \
 swift infer \
     --adapters output/vx-xxx/checkpoint-xxx \
     --stream true \
+    --temperature 0 \
     --max_new_tokens 2048
 
 # merge-lora并使用vLLM进行推理加速
@@ -74,6 +75,7 @@ swift infer \
     --merge_lora true \
     --infer_backend vllm \
     --max_model_len 8192 \
+    --temperature 0 \
     --max_new_tokens 2048
 ```
 
diff --git a/docs/source/Instruction/命令行参数.md b/docs/source/Instruction/命令行参数.md
@@ -16,10 +16,10 @@
 - custom_register_path: 自定义模型、对话模板和数据集注册的`.py`文件路径
 
 ### 模型参数
-- task_type: 默认为'causal_lm'. 可选为'causal_lm', 'seq_cls'. 例子可以查看[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/seq_cls).
 - 🔥model: 模型id或模型本地路径。如果是自定义模型请配合`model_type`和`template`使用，具体可以参考[自定义模型](../Customization/自定义模型.md)
 - model_type: 模型类型。相同的模型架构、template、模型加载过程被定义为一个model_type
 - model_revision: 模型版本
+- task_type: 默认为'causal_lm'。可选为'causal_lm', 'seq_cls'。例子可以查看[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/seq_cls)。
 - 🔥torch_dtype: 模型权重的数据类型，支持`float16`,`bfloat16`,`float32`，默认从config文件中读取
 - attn_impl: attention类型，支持`flash_attn`, `sdpa`, `eager`，默认使用sdpa
 - num_labels: 分类模型需要指定。代表标签数量，默认为None
diff --git a/docs/source_en/GetStarted/Quick-start.md b/docs/source_en/GetStarted/Quick-start.md
@@ -64,6 +64,7 @@ CUDA_VISIBLE_DEVICES=0 \
 swift infer \
     --adapters output/vx-xxx/checkpoint-xxx \
     --stream true \
+    --temperature 0 \
     --max_new_tokens 2048
 
 # merge-lora and use vLLM for inference acceleration
@@ -74,6 +75,7 @@ swift infer \
     --merge_lora true \
     --infer_backend vllm \
     --max_model_len 8192 \
+    --temperature 0 \
     --max_new_tokens 2048
 ```
 
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
@@ -16,11 +16,11 @@ The introduction to command line parameters will cover base arguments, atomic ar
 - custom_register_path: The file path for the custom model, chat template, and dataset registration `.py` files.
 
 ### Model Arguments
-- task_type: Defaults to 'causal_lm'. Options include 'causal_lm' and 'seq_cls'. You can view examples [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/seq_cls).
 - 🔥model: Model ID or local path to the model. If it's a custom model, please use it with `model_type` and `template`. The specific details can be referred to in the [Custom Model](../Customization/Custom-model.md).
 - model_type: Model type. The same model architecture, template, and loading process define a model_type.
 - model_revision: Model version.
 - 🔥torch_dtype: Data type for model weights, supports `float16`, `bfloat16`, `float32`, default is read from the config file.
+- task_type: Defaults to 'causal_lm'. Options include 'causal_lm' and 'seq_cls'. You can view examples [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/seq_cls).
 - attn_impl: Attention type, supports `flash_attn`, `sdpa`, `eager`, default is sdpa.
 - num_labels: To be specified for classification models, representing the number of labels, default is None.
 - rope_scaling: Rope type, supports `linear` and `dynamic`, to be used with `max_length`.
diff --git a/examples/notebook/qwen2.5-self-cognition/infer.sh b/examples/notebook/qwen2.5-self-cognition/infer.sh
@@ -0,0 +1,7 @@
+# Here is the command-line style inference code.
+CUDA_VISIBLE_DEVICES=0 \
+swift infer \
+    --adapters output/vx-xxx/checkpoint-xxx \
+    --stream true \
+    --temperature 0 \
+    --max_new_tokens 2048
diff --git a/examples/notebook/qwen2.5-self-cognition/self-cognition-sft.ipynb b/examples/notebook/qwen2.5-self-cognition/self-cognition-sft.ipynb
@@ -22,7 +22,7 @@
    "outputs": [],
    "source": [
     "# # install ms-swift\n",
-    "# pip install git+https://github.com/modelscope/ms-swift.git"
+    "# pip install ms-swift -U"
    ]
   },
   {
diff --git a/examples/notebook/qwen2.5-self-cognition/sft.sh b/examples/notebook/qwen2.5-self-cognition/sft.sh
@@ -0,0 +1,30 @@
+# Here is the command-line style training code.
+# 22GB
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+    --model Qwen/Qwen2.5-3B-Instruct \
+    --train_type lora \
+    --dataset AI-ModelScope/alpaca-gpt4-data-zh#500 \
+              AI-ModelScope/alpaca-gpt4-data-en#500 \
+              swift/self-cognition#500 \
+    --torch_dtype bfloat16 \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --learning_rate 1e-4 \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --target_modules all-linear \
+    --gradient_accumulation_steps 16 \
+    --eval_steps 50 \
+    --save_steps 50 \
+    --save_total_limit 2 \
+    --logging_steps 5 \
+    --max_length 2048 \
+    --output_dir output \
+    --system 'You are a helpful assistant.' \
+    --warmup_ratio 0.05 \
+    --dataloader_num_workers 4 \
+    --dataset_num_proc 4 \
+    --model_author 小黄 'Xiao Huang' \
+    --model_name '魔搭' 'ModelScope'
diff --git a/examples/notebook/qwen2vl-ocr/ocr-sft.ipynb b/examples/notebook/qwen2vl-ocr/ocr-sft.ipynb
@@ -22,7 +22,7 @@
    "outputs": [],
    "source": [
     "# # install ms-swift\n",
-    "# pip install git+https://github.com/modelscope/ms-swift.git"
+    "# pip install ms-swift -U"
    ]
   },
   {
diff --git a/examples/train/demo.sh b/examples/train/demo.sh
@@ -34,4 +34,5 @@ swift sft \
 # swift infer \
 #     --adapters output/vx-xxx/checkpoint-xxx \
 #     --stream true \
+#     --temperature 0 \
 #     --max_new_tokens 2048
diff --git a/swift/llm/argument/base_args/base_args.py b/swift/llm/argument/base_args/base_args.py
@@ -170,8 +170,8 @@ def from_pretrained(cls, checkpoint_dir: str):
         self.load_args_from_ckpt()
         return self
 
-    def _init_ckpt_dir(self, adapters=None):
-        model_dirs = (adapters or self.adapters).copy()
+    def _init_ckpt_dir(self):
+        model_dirs = self.adapters.copy()
         if self.model:
             model_dirs.append(self.model)
         self.ckpt_dir = None
diff --git a/swift/llm/argument/base_args/model_args.py b/swift/llm/argument/base_args/model_args.py
@@ -29,11 +29,11 @@ class ModelArguments:
         device_map (Optional[str]): Configuration for device mapping. Default is None.
         local_repo_path (Optional[str]): Path to the local github repository for model. Default is None.
     """
-    task_type: Literal['causal_lm', 'seq_cls'] = None
     model: Optional[str] = None  # model id or model path
     model_type: Optional[str] = field(
         default=None, metadata={'help': f'model_type choices: {list(MODEL_MAPPING.keys())}'})
     model_revision: Optional[str] = None
+    task_type: Literal['causal_lm', 'seq_cls'] = None
 
     torch_dtype: Literal['bfloat16', 'float16', 'float32', None] = None
     # flash_attn: It will automatically convert names based on the model.
diff --git a/swift/llm/template/base.py b/swift/llm/template/base.py
@@ -462,6 +462,8 @@ def _pre_tokenize(self, context_list: List[Context], loss_scale_list: List[float
     @staticmethod
     def _add_default_tags(inputs: StdTemplateInputs):
         total_content = '\n'.join([message['content'] or '' for message in inputs.messages])
+        if inputs.system:
+            total_content = f'{inputs.system}\n{total_content}'
         for media_type in ['image', 'audio', 'video']:
             media_key, media_tag = f'{media_type}s', f'<{media_type}>'
             medias = getattr(inputs, media_key)
diff --git a/swift/trainers/mixin.py b/swift/trainers/mixin.py
@@ -4,9 +4,7 @@
 import os
 import shutil
 import time
-from contextlib import contextmanager
 from copy import copy
-from functools import wraps
 from types import MethodType
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
@@ -249,29 +247,6 @@ def _save_checkpoint(self, *args, **kwargs):
         logger.info(f'Saving model checkpoint to {self.state.last_model_checkpoint}')
         return result
 
-    @contextmanager
-    def _patch_loss_function(self):
-        model = self.model
-        if isinstance(model, PeftModel):
-            model = model.model
-        model_cls = model.__class__
-        if not hasattr(model_cls, 'loss_function'):
-            yield
-            return
-
-        loss_function = model.loss_function
-        _old_loss_function = model_cls.loss_function
-
-        @staticmethod
-        @wraps(loss_function)
-        def new_loss_function(logits, labels, **kwargs):
-            labels = labels.to(logits.device)  # fix device_map
-            return loss_function(logits=logits, labels=labels, **kwargs)
-
-        model_cls.loss_function = new_loss_function
-        yield
-        model_cls.loss_function = _old_loss_function
-
     def train(self, *args, **kwargs):
         if self.model.model_meta.is_multimodal:
             models = list(
@@ -282,7 +257,7 @@ def train(self, *args, **kwargs):
             self.template.register_post_encode_hook(models)
             logger.info(f'Successfully registered post_encode hook: {[model.__class__.__name__ for model in models]}')
         self._save_initial_model(self.args.output_dir)
-        with self.hub.patch_hub(), self._patch_loss_function():
+        with self.hub.patch_hub():
             res = super().train(*args, **kwargs)
         self.template.remove_post_encode_hook()
         return res
diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py
@@ -2,6 +2,7 @@
 # Part of the implementation is borrowed from huggingface/transformers.
 import os
 from contextlib import contextmanager, nullcontext
+from functools import wraps
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
@@ -22,6 +23,33 @@
 class Trainer(SwiftMixin, HfTrainer):
     args: TrainingArguments
 
+    @contextmanager
+    def _patch_loss_function(self):
+        model = self.model
+        if isinstance(model, PeftModel):
+            model = model.model
+        model_cls = model.__class__
+        if not hasattr(model_cls, 'loss_function'):
+            yield
+            return
+
+        loss_function = model.loss_function
+        _old_loss_function = model_cls.loss_function
+
+        @staticmethod
+        @wraps(loss_function)
+        def new_loss_function(logits, labels, **kwargs):
+            labels = labels.to(logits.device)  # fix device_map
+            return loss_function(logits=logits, labels=labels, **kwargs)
+
+        model_cls.loss_function = new_loss_function
+        yield
+        model_cls.loss_function = _old_loss_function
+
+    def train(self, *args, **kwargs):
+        with self._patch_loss_function():
+            return super().train(*args, **kwargs)
+
     def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
         loss, outputs = super().compute_loss(model, inputs, return_outputs=True)
         if inputs.get('labels') is not None:

Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@`
`22`	`22`	`"outputs": [],`
`23`	`23`	`"source": [`
`24`	`24`	`"# # install ms-swift\n",`
`25`		`- "# pip install git+https://github.com/modelscope/ms-swift.git"`
	`25`	`+ "# pip install ms-swift -U"`
`26`	`26`	`]`
`27`	`27`	`},`
`28`	`28`	`{`