
Commit b142266

support additional_trainable_parameters (#295)
1 parent 98033fa commit b142266


7 files changed (+42 / -12 lines)


docs/source/LLM/命令行参数.md

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@
 - `--model_cache_dir`: Default is `None`. If the model is already cached locally and the cache path is not the default ModelScope cache path, you can use this argument to load the model and tokenizer from that cache_dir.
 - `--sft_type`: The fine-tuning method. Default is `'lora'`. Available values: 'lora', 'full', 'longlora', 'qalora'. To use qlora, set `--sft_type lora --quantization_bit 4`.
 - `--freeze_parameters`: When sft_type is set to 'full', freezes the bottommost portion of the model's parameters. The range is 0. ~ 1., default `0.`. This argument offers a compromise between lora and full-parameter fine-tuning.
+- `--additional_trainable_parameters`: A complement to freeze_parameters, allowed only when sft_type is 'full'. Default is `[]`. For example, if you train 50% of the parameters but also want to train the embedding layer, you can set `--freeze_parameters 0.5 --additional_trainable_parameters transformer.wte`; all parameters whose names start with `transformer.wte` will be activated.
 - `--tuner_backend`: The backend for lora and qlora. Default is `'swift'`. Available values: 'swift', 'peft'.
 - `--template_type`: The type of chat template to use. Default is `'AUTO'`, i.e. look up the `template` in `MODEL_MAPPING` based on `model_type`. The available `template_type` values can be found in `TEMPLATE_MAPPING.keys()`.
 - `--output_dir`: The directory where checkpoints are stored. Default is `'output'`. We append `model_type` and a fine-tuning version number to this directory, so users can run multiple comparison experiments on different models without changing the `output_dir` argument. If you do not want these suffixes appended, specify `--add_output_dir_suffix false`.
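
To make the combined effect of `--freeze_parameters` and `--additional_trainable_parameters` concrete, the following is a minimal PyTorch sketch of what `--freeze_parameters 0.5 --additional_trainable_parameters transformer.wte` amounts to. The helper name and the tensor-count freezing rule are illustrative assumptions; swift's actual logic lives in freeze_model_parameters and activate_model_parameters (see the torch_utils.py hunk further down).

# Illustrative sketch only; swift implements this in swift/utils/torch_utils.py.
import torch.nn as nn

def freeze_then_activate(model: nn.Module, freeze_ratio: float,
                         prefixes: list) -> None:
    params = list(model.named_parameters())
    # Freeze the bottom `freeze_ratio` fraction (counted here by tensors;
    # swift's freeze_model_parameters may weight this differently).
    for _, p in params[:int(len(params) * freeze_ratio)]:
        p.requires_grad = False
    # Re-activate every parameter whose name starts with one of the prefixes,
    # e.g. 'transformer.wte' to keep the embedding trainable.
    for name, p in params:
        if any(name.startswith(prefix) for prefix in prefixes):
            p.requires_grad = True

# Equivalent in spirit to:
#   --freeze_parameters 0.5 --additional_trainable_parameters transformer.wte
# freeze_then_activate(model, 0.5, ['transformer.wte'])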

examples/pytorch/llm/scripts/qwen_7b_chat/full_freeze_ddp/sft.sh

Lines changed: 3 additions & 2 deletions
@@ -1,5 +1,5 @@
 # Experimental environment: 2 * A100
-# 2 * 78GB GPU memory
+# 2 * 80GB GPU memory
 NPROC_PER_NODE=2 \
 CUDA_VISIBLE_DEVICES=0,1 \
 swift sft \
@@ -14,5 +14,6 @@ swift sft \
     --use_flash_attn true \
     --only_save_model true \
     --dataset codefuse-evol-instruction-zh \
-    --freeze_parameters 0.2 \
+    --freeze_parameters 0.25 \
+    --additional_trainable_parameters transformer.wte \
     --preprocess_num_proc 4 \

swift/llm/infer.py

Lines changed: 2 additions & 3 deletions
@@ -15,7 +15,7 @@
                          read_multi_line, seed_everything, show_layers)
 from .utils import (InferArguments, Template, get_additional_saved_files,
                     get_dataset, get_model_tokenizer, get_template, inference,
-                    inference_stream, set_generation_config)
+                    inference_stream, is_lora, set_generation_config)

 logger = get_logger()

@@ -138,8 +138,7 @@ def prepare_model_template(
     logger.info(f'generation_config: {generation_config}')
     set_generation_config(model, generation_config)
     # Preparing LoRA
-    if args.sft_type in ('lora', 'qalora',
-                         'longlora') and args.ckpt_dir is not None:
+    if is_lora(args.sft_type) and args.ckpt_dir is not None:
         model = Swift.from_pretrained(
             model, args.ckpt_dir, inference_mode=True)
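
The explicit tuple check is replaced by the `is_lora` helper imported from `.utils`. Judging only from the condition it replaces, such a helper presumably reduces to the sketch below; the actual definition lives in swift/llm/utils and may differ in detail.

# Sketch inferred from the replaced condition, not the verbatim swift helper.
def is_lora(sft_type: str) -> bool:
    return sft_type in ('lora', 'longlora', 'qalora')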

swift/llm/tuner.py

Lines changed: 5 additions & 1 deletion
@@ -5,7 +5,8 @@
 from swift.trainers import TrainerCallback
 from swift.tuners import (LongLoRAConfig, LongLoRAModelType, LoraConfig,
                           LoRAConfig, NEFTuneConfig, Swift)
-from swift.utils import freeze_model_parameters, get_logger
+from swift.utils import (activate_model_parameters, freeze_model_parameters,
+                         get_logger)
 from .utils import SftArguments, find_all_linear_for_lora, is_lora

 logger = get_logger()
@@ -76,6 +77,9 @@ def prepare_model(model, args: SftArguments):
     elif args.sft_type == 'full':
         if args.freeze_parameters > 0:
             freeze_model_parameters(model, args.freeze_parameters)
+        if len(args.additional_trainable_parameters) > 0:
+            activate_model_parameters(model,
+                                      args.additional_trainable_parameters)
     else:
         raise ValueError(f'args.sft_type: {args.sft_type}')

swift/llm/utils/argument.py

Lines changed: 9 additions & 1 deletion
@@ -39,6 +39,7 @@ class SftArguments:

     sft_type: Literal['lora', 'full', 'longlora', 'qalora'] = 'lora'
     freeze_parameters: float = 0.  # 0 ~ 1
+    additional_trainable_parameters: List[str] = field(default_factory=list)
     tuner_backend: Literal['swift', 'peft'] = 'swift'
     template_type: str = field(
         default='AUTO',
@@ -211,6 +212,9 @@ def __post_init__(self) -> None:
             assert self.freeze_parameters == 0., (
                 'lora does not support `freeze_parameters`, please set `--sft_type full`'
             )
+            assert len(self.additional_trainable_parameters) == 0, (
+                'lora does not support `additional_trainable_parameters`, please set `--sft_type full`'
+            )
             if 'int4' in self.model_type or 'int8' in self.model_type:
                 assert self.quantization_bit == 0, 'int4 and int8 models do not need to be quantized again.'
             if self.learning_rate is None:
@@ -221,12 +225,16 @@ def __post_init__(self) -> None:
             else:
                 self.only_save_model = True
         elif self.sft_type == 'full':
-            assert 0 <= self.freeze_parameters < 1
+            assert 0 <= self.freeze_parameters <= 1
             assert self.quantization_bit == 0, 'Full parameter fine-tuning does not support quantization.'
             assert self.dtype != 'fp16', (
                 "Fine-tuning with dtype=='fp16' can lead to NaN issues. "
                 'Please use fp32+AMP or bf16 to perform full parameter fine-tuning.'
             )
+            if isinstance(self.additional_trainable_parameters, str):
+                self.additional_trainable_parameters = [
+                    self.additional_trainable_parameters
+                ]
             if self.learning_rate is None:
                 self.learning_rate = 2e-5
             if self.only_save_model is None:
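
A single prefix can reach `SftArguments` as a plain string rather than a list (hence the isinstance check), so `__post_init__` normalizes it before the tuner code iterates over it. A standalone illustration of that normalization, mirroring the lines added above rather than the full `SftArguments` class:

# Mirrors the normalization added to SftArguments.__post_init__ for sft_type == 'full'.
additional_trainable_parameters = 'transformer.wte'  # e.g. a single value from the CLI
if isinstance(additional_trainable_parameters, str):
    additional_trainable_parameters = [additional_trainable_parameters]
assert additional_trainable_parameters == ['transformer.wte']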

swift/utils/__init__.py

Lines changed: 5 additions & 5 deletions
@@ -8,10 +8,10 @@
 from .run_utils import get_main
 from .tb_utils import (TB_COLOR, TB_COLOR_SMOOTH, plot_images,
                        read_tensorboard_file, tensorboard_smoothing)
-from .torch_utils import (broadcast_string, freeze_model_parameters,
-                          get_dist_setting, get_model_info, is_ddp_plus_mp,
-                          is_dist, is_local_master, is_master,
-                          is_on_same_device, seed_everything, show_layers,
-                          time_synchronize)
+from .torch_utils import (activate_model_parameters, broadcast_string,
+                          freeze_model_parameters, get_dist_setting,
+                          get_model_info, is_ddp_plus_mp, is_dist,
+                          is_local_master, is_master, is_on_same_device,
+                          seed_everything, show_layers, time_synchronize)
 from .utils import (add_version_to_work_dir, check_json_format, lower_bound,
                     parse_args, read_multi_line, test_time, upper_bound)

swift/utils/torch_utils.py

Lines changed: 17 additions & 0 deletions
@@ -131,6 +131,23 @@ def freeze_model_parameters(model: Module, freeze_parameters: float) -> None:
         p.requires_grad = False


+def activate_model_parameters(
+        model: Module, additional_trainable_parameters: List[str]) -> None:
+    if len(additional_trainable_parameters) == 0:
+        return
+    has_activate = False
+    for n, p in model.named_parameters():
+        for additional_tp in additional_trainable_parameters:
+            if n.startswith(additional_tp):
+                p.requires_grad = True
+                has_activate = True
+    if not has_activate:
+        logger.warning(
+            'len(additional_trainable_parameters) > 0 but no parameters are activated. '
+            f'additional_trainable_parameters: {additional_trainable_parameters}'
+        )
+
+
 def broadcast_string(string: Optional[str], buffer_size: int = 1024) -> str:
     """String broadcasting in case of DDP
     string: main rank: str
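
A short usage sketch of the new helper on a toy module. The toy model is hypothetical, and the example assumes a swift installation that includes this commit, so that `activate_model_parameters` is importable from `swift.utils` as shown in the __init__.py hunk above.

import torch.nn as nn
from swift.utils import activate_model_parameters  # re-exported by this commit

class ToyLM(nn.Module):  # hypothetical stand-in for a real transformer
    def __init__(self):
        super().__init__()
        self.wte = nn.Embedding(100, 16)
        self.lm_head = nn.Linear(16, 100)

model = ToyLM()
for p in model.parameters():  # pretend freeze_model_parameters froze everything
    p.requires_grad = False

activate_model_parameters(model, ['wte'])  # prefix match against named_parameters()
print([n for n, p in model.named_parameters() if p.requires_grad])
# -> ['wte.weight']

# A prefix that matches nothing leaves gradients unchanged and only logs the warning above.
activate_model_parameters(model, ['transformer.wte'])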
