
Commit 20aef70

add feat: only save model (#49)
1 parent b2064ea commit 20aef70

File tree

12 files changed (+106 −26 lines)


README.md

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ Key features:
 [code link](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm)
 
 1. supported SFT methods: [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), full(full parameter fine-tuning)
-2. supported models: qwen-7b, [qwen-7b-chat](https://github.com/QwenLM/Qwen-7B), qwen-vl, [qwen-vl-chat](https://github.com/QwenLM/Qwen-VL), baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, polylm-13b
+2. supported models: qwen-7b, [qwen-7b-chat](https://github.com/QwenLM/Qwen-7B), qwen-vl, [qwen-vl-chat](https://github.com/QwenLM/Qwen-VL), baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, openbuddy-llama2-70b, polylm-13b
 3. supported features: quantization, ddp, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, mutli-round chat, ...
 4. supported datasets:
    1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en

README_CN.md

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ SWIFT(Scalable lightWeight Infrastructure for Fine-Tuning)是一个可扩展
 [code link](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm)
 
 1. 支持的SFT方法: [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), 全参数微调
-2. 支持的模型: qwen-7b, [qwen-7b-chat](https://github.com/QwenLM/Qwen-7B), qwen-vl, [qwen-vl-chat](https://github.com/QwenLM/Qwen-VL), baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, polylm-13b
+2. 支持的模型: qwen-7b, [qwen-7b-chat](https://github.com/QwenLM/Qwen-7B), qwen-vl, [qwen-vl-chat](https://github.com/QwenLM/Qwen-VL), baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, openbuddy-llama2-70b, polylm-13b
 3. 支持的特性: 模型量化, DDP, 模型并行(device_map), gradient checkpointing, 梯度累加, 支持推送ModelScope Hub, 自定义数据集, 多模态和Agent SFT, 多轮对话, ...
 4. 支持的数据集:
    1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en

examples/pytorch/llm/scripts/qwen_7b_chat/full/sft.sh

Lines changed: 2 additions & 1 deletion
@@ -18,8 +18,9 @@ python src/llm_sft.py \
     --gradient_accumulation_steps 16 \
     --max_grad_norm 1 \
     --warmup_ratio 0.03 \
-    --eval_steps 50 \
+    --eval_steps 100 \
     --save_steps 100 \
+    --only_save_model true \
     --save_total_limit 2 \
     --logging_steps 10 \
     --use_flash_attn false \

examples/pytorch/llm/src/llm_infer.py

Lines changed: 2 additions & 2 deletions
@@ -109,8 +109,7 @@ def llm_infer(args: InferArguments) -> None:
         args.system,
         args.max_length,
         batched=False)
-    streamer = TextStreamer(
-        tokenizer, skip_prompt=True, skip_special_tokens=True)
+    streamer = TextStreamer(tokenizer, skip_prompt=True)
     generation_config = GenerationConfig(
         max_new_tokens=args.max_new_tokens,
         temperature=args.temperature,
@@ -126,6 +125,7 @@ def llm_infer(args: InferArguments) -> None:
             query = input('<<< ')
             data = {'query': query}
             input_ids = preprocess_func(data)['input_ids']
+            streamer.decode_kwargs['skip_special_tokens'] = True
             inference(input_ids, model, tokenizer, streamer, generation_config,
                       args.skip_prompt)
     else:
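For context: transformers' TextStreamer keeps any extra keyword arguments in its decode_kwargs dict and forwards them to tokenizer.decode, which is what lets skip_special_tokens be set per query instead of in the constructor. A minimal sketch of the pattern (the gpt2 tokenizer is only a stand-in, not something this repo uses):

```python
from transformers import AutoTokenizer, TextStreamer

tokenizer = AutoTokenizer.from_pretrained('gpt2')  # any tokenizer works for the sketch
streamer = TextStreamer(tokenizer, skip_prompt=True)

# Decode options can be toggled later, e.g. right before a generation call:
streamer.decode_kwargs['skip_special_tokens'] = True
# model.generate(input_ids, streamer=streamer, ...) would then stream text with
# special tokens stripped.
```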

examples/pytorch/llm/src/llm_sft.py

Lines changed: 10 additions & 7 deletions
@@ -85,6 +85,7 @@ class SftArguments:
 
     eval_steps: int = 50
     save_steps: Optional[int] = None
+    only_save_model: Optional[bool] = None
     save_total_limit: int = 2
     logging_steps: int = 5
     dataloader_num_workers: int = 1
@@ -126,23 +127,24 @@ def __post_init__(self):
         if self.sft_type == 'lora':
             if self.learning_rate is None:
                 self.learning_rate = 1e-4
-            if self.save_steps is None:
-                self.save_steps = self.eval_steps
+            if self.only_save_model is None:
+                self.only_save_model = False
         elif self.sft_type == 'full':
             assert self.quantization_bit is None, 'not supported'
             assert self.dtype != 'fp16', 'please use bf16 or fp32'
             if self.learning_rate is None:
                 self.learning_rate = 1e-5
-            if self.save_steps is None:
-                # Saving the model takes a long time
-                self.save_steps = self.eval_steps * 4
+            if self.only_save_model is None:
+                self.only_save_model = True
         else:
             raise ValueError(f'sft_type: {self.sft_type}')
+
         if self.template_type is None:
             self.template_type = MODEL_MAPPING[self.model_type].get(
                 'template', 'default')
             logger.info(f'Setting template_type: {self.template_type}')
-
+        if self.save_steps is None:
+            self.save_steps = self.eval_steps
         self.output_dir = os.path.join(self.output_dir, self.model_type)
 
         if self.lora_target_modules is None:
@@ -288,7 +290,8 @@ def llm_sft(args: SftArguments) -> None:
         resume_from_checkpoint=args.resume_from_ckpt,
         ddp_backend=args.ddp_backend,
         gradient_checkpointing=args.gradient_checkpointing,
-        local_rank=local_rank)
+        local_rank=local_rank,
+        only_save_model=args.only_save_model)
 
     if args.gradient_checkpointing:
         # fix: gradients will be None
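Taken together, the defaulting logic now reads roughly as sketched below: only_save_model defaults to False for LoRA and True for full-parameter fine-tuning, while save_steps simply falls back to eval_steps in both cases. The Args class is a condensed, hypothetical stand-in for SftArguments, not the repo's class:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Args:
    # Stand-ins for the relevant SftArguments fields.
    sft_type: str = 'lora'
    eval_steps: int = 50
    save_steps: Optional[int] = None
    only_save_model: Optional[bool] = None

    def __post_init__(self):
        if self.only_save_model is None:
            # LoRA keeps the full trainer state (adapter checkpoints are small);
            # full fine-tuning defaults to model-only checkpoints, since the old
            # code already noted that saving the whole model takes a long time.
            self.only_save_model = self.sft_type == 'full'
        if self.save_steps is None:
            self.save_steps = self.eval_steps


print(Args(sft_type='lora').only_save_model)  # False
print(Args(sft_type='full').only_save_model)  # True; save_steps == eval_steps either way
```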

examples/pytorch/llm/src/utils/dataset.py

Lines changed: 8 additions & 7 deletions
@@ -137,13 +137,14 @@ def _process_mutimodal_dataset(dataset: HfDataset, prompt: str, image_key: str,
                                response_key: str) -> HfDataset:
     dataset._info.features._column_requires_decoding['image'] = False
     query_format = f'<img>{{image_path}}</img>{prompt}'
-    query = [
-        query_format.format(image_path=d[image_key]['path']) for d in dataset
-    ]
-    dataset = HfDataset.from_dict({
-        'query': query,
-        'response': dataset[response_key]
-    })
+    query = []
+    response = []
+    for d in tqdm(dataset):
+        query.append(query_format.format(image_path=d[image_key]['path']))
+        if '&&' in d[response_key]:
+            d[response_key] = d[response_key].split('&&')[0]
+        response.append(d[response_key])
+    dataset = HfDataset.from_dict({'query': query, 'response': response})
     return dataset
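The rewritten loop also normalizes responses: when a multimodal sample packs several alternative answers separated by '&&', only the first one is kept. A tiny illustration with made-up data:

```python
# Hypothetical response value; real ones come from the dataset's response column.
response = 'a cat sitting on a sofa&&a kitten resting&&tabby cat indoors'
if '&&' in response:
    response = response.split('&&')[0]
print(response)  # -> 'a cat sitting on a sofa'
```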
examples/pytorch/llm/src/utils/model.py

Lines changed: 2 additions & 2 deletions
@@ -222,14 +222,14 @@ class LoRATM(NamedTuple):
     },
     'chatglm2-6b': {
         'model_id': 'ZhipuAI/chatglm2-6b',
-        'revision': 'v1.0.8',
+        'revision': 'v1.0.9',
         'get_function': get_model_tokenizer_chatglm2,
         'template': 'chatglm2',
         'lora_TM': LoRATM.chatglm2,
     },
     'chatglm2-6b-32k': {
         'model_id': 'ZhipuAI/chatglm2-6b-32k',
-        'revision': 'v1.0.0',
+        'revision': 'v1.0.1',
         'template': 'chatglm2',
         'lora_TM': LoRATM.chatglm2,
     },

swift/trainers/__init__.py

Lines changed: 1 addition & 2 deletions
@@ -4,7 +4,6 @@
                                     HPSearchBackend, HubStrategy,
                                     IntervalStrategy, SchedulerType,
                                     ShardedDDPOption)
-from transformers.training_args import TrainingArguments
-from transformers.training_args_seq2seq import Seq2SeqTrainingArguments
 
+from .arguments import Seq2SeqTrainingArguments, TrainingArguments
 from .trainers import Seq2SeqTrainer, Trainer

swift/trainers/arguments.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from dataclasses import dataclass
+
+from transformers.training_args import TrainingArguments as HfTrainingArguments
+from transformers.training_args_seq2seq import \
+    Seq2SeqTrainingArguments as HfSeq2SeqTrainingArguments
+
+
+@dataclass
+class SwiftArgumentsMixin:
+    # ckpt only save model
+    only_save_model: bool = False
+
+
+@dataclass
+class TrainingArguments(SwiftArgumentsMixin, HfTrainingArguments):
+    pass
+
+
+@dataclass
+class Seq2SeqTrainingArguments(SwiftArgumentsMixin,
+                               HfSeq2SeqTrainingArguments):
+    pass
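With the mixin in front of the Hugging Face classes, the swift argument types stay drop-in replacements that simply grow one extra field. A short usage sketch, assuming the swift package from this commit is importable:

```python
from swift.trainers import Seq2SeqTrainingArguments

# All standard transformers fields are still accepted; only_save_model is the
# extra field contributed by SwiftArgumentsMixin (default: False).
args = Seq2SeqTrainingArguments(output_dir='output', only_save_model=True)
print(args.only_save_model)  # True
```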

swift/trainers/mixin.py

Lines changed: 52 additions & 1 deletion
@@ -1,11 +1,12 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
+# Part of the implementation is borrowed from huggingface/transformers.
 import os
 import shutil
 from types import MethodType
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import json
+import numpy as np
 import safetensors
 import torch
 from datasets import Dataset as HfDataset
@@ -15,6 +16,7 @@
 from transformers import PreTrainedModel, PreTrainedTokenizerBase
 from transformers.data.data_collator import DataCollator
 from transformers.modeling_utils import unwrap_model
+from transformers.trainer import PREFIX_CHECKPOINT_DIR, TRAINER_STATE_NAME
 from transformers.trainer_callback import TrainerCallback
 from transformers.trainer_utils import EvalPrediction, HubStrategy
 from transformers.training_args import TrainingArguments
@@ -278,3 +280,52 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None):
         if self.tokenizer is not None:
             self.tokenizer.save_pretrained(output_dir)
         torch.save(self.args, os.path.join(output_dir, 'training_args.bin'))
+
+    def _save_checkpoint(self, model, trial, metrics=None):
+        only_save_model = getattr(self.args, 'only_save_model', False)
+        if only_save_model:
+            return self._only_save_model(model, trial, metrics)
+        else:
+            return super()._save_checkpoint(model, trial, metrics)
+
+    def _only_save_model(self, model, trial, metrics=None):
+        # Save model checkpoint
+        checkpoint_folder = f'{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}'
+
+        if self.hp_search_backend is None and trial is None:
+            self.store_flos()
+
+        run_dir = self._get_output_dir(trial=trial)
+        output_dir = os.path.join(run_dir, checkpoint_folder)
+        self.save_model(output_dir, _internal_call=True)
+        if self.is_deepspeed_enabled:
+            # under zero3 model file itself doesn't get saved since it's bogus! Unless deepspeed
+            # config `stage3_gather_16bit_weights_on_model_save` is True
+            self.model_wrapped.save_checkpoint(output_dir)
+
+        # Determine the new best metric / best model checkpoint
+        if metrics is not None and self.args.metric_for_best_model is not None:
+            metric_to_check = self.args.metric_for_best_model
+            if not metric_to_check.startswith('eval_'):
+                metric_to_check = f'eval_{metric_to_check}'
+            metric_value = metrics[metric_to_check]
+
+            operator = np.greater if self.args.greater_is_better else np.less
+            if (self.state.best_metric is None
+                    or self.state.best_model_checkpoint is None
+                    or operator(metric_value, self.state.best_metric)):
+                self.state.best_metric = metric_value
+                self.state.best_model_checkpoint = output_dir
+
+        # Save the Trainer state
+        if self.args.should_save:
+            self.state.save_to_json(
+                os.path.join(output_dir, TRAINER_STATE_NAME))
+
+        # push to hub
+        if self.args.push_to_hub:
+            self._push_from_checkpoint(output_dir)
+
+        # Maybe delete some older checkpoints.
+        if self.args.should_save:
+            self._rotate_checkpoints(use_mtime=True, output_dir=run_dir)
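The net effect of _only_save_model is that a checkpoint directory contains the model weights, tokenizer files and trainer state, but not the optimizer, scheduler or RNG state that the stock _save_checkpoint also writes, so checkpoints are much smaller at the cost of not being resumable with optimizer state. A rough way to tell the two layouts apart on disk (filenames follow the usual transformers conventions; the example path is hypothetical):

```python
import os


def describe_checkpoint(ckpt_dir: str) -> None:
    """Report whether a transformers checkpoint keeps optimizer/scheduler state."""
    files = set(os.listdir(ckpt_dir))
    if {'optimizer.pt', 'scheduler.pt'} & files:
        print('full checkpoint: resumable with optimizer and scheduler state')
    else:
        print('model-only checkpoint: weights, tokenizer and trainer state only')


# describe_checkpoint('output/qwen-7b-chat/checkpoint-100')  # hypothetical path
```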
