
Commit e298ba4

Liebelebruicecode and moge authored

LoriKiT (#9776)

* add linchain
* resolve conflict
* add linchain test
* resolve pre-commit
* resolve ci problem and add description

Co-authored-by: moge <[email protected]>
1 parent 19a585b commit e298ba4

File tree

10 files changed: +567 -19 lines

llm/docs/finetune.md (1 addition, 1 deletion)

@@ -94,7 +94,7 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_finetune.
 3. The backbone model can be quantized to low bit width by setting `weight_quantize_algo`, e.g. 'weight_only_int4', 'weight_only_int8', 'nf4' or 'fp4'. See the fine-tuning parameter reference for details.
 4. Set `use_flash_attention` to True to use FlashAttention. With FlashAttention enabled, set `flash_mask` to True to use FlashMask.
 5. The LoRA API supports 4D parallelism; the parallel training strategy can be adjusted via `tensor_parallel_degree`, `pipeline_parallel_degree`, `sharding` and `sharding_parallel_degree`, scaling up to **LoRA fine-tuning of hundred-billion-parameter models on a single machine**.
-6. Parameters such as `rslora`, `lora_plus_scale`, `pissa`, `lora_use_mixer` and `use_mora` can be configured to use the rsLoRA, LoRA+, PiSSA, MosLoRA (tensor model parallelism not yet supported) and MoRA (tensor model parallelism not yet supported) algorithms.
+6. Parameters such as `rslora`, `lora_plus_scale`, `pissa`, `lora_use_mixer`, `mixer_num` and `use_mora` can be configured to use the rsLoRA, LoRA+, PiSSA, MosLoRA (tensor model parallelism not yet supported), LinChain (tensor model parallelism not yet supported) and MoRA (tensor model parallelism not yet supported) algorithms.
 
 For convenient downstream **compression** and **static-graph inference**, we provide a LoRA parameter merge script that merges the LoRA parameters into the backbone model and saves the corresponding weights.
 ```
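
For reference, the LinChain update enabled by `mixer_num` replaces MosLoRA's single r x r mixer with a chain of `mixer_num` trainable r x r matrices between `lora_A` and `lora_B`. A minimal numpy sketch of the resulting delta weight (hypothetical shapes for illustration; not the PaddleNLP API):

```python
import numpy as np

# Hypothetical sizes, chosen only for this sketch.
in_features, out_features, r, mixer_num = 64, 32, 8, 3
scaling = 2.0  # stands in for lora_alpha / r

lora_A = np.random.randn(in_features, r)
mixers = [np.random.randn(r, r) for _ in range(mixer_num)]  # the LinChain mixers
lora_B = np.random.randn(r, out_features)

# Chain the mixers left to right: M_0 @ M_1 @ ... @ M_{k-1}.
chain = mixers[0]
for m in mixers[1:]:
    chain = chain @ m

# delta_weight has the same shape as the frozen base weight.
delta_weight = lora_A @ chain @ lora_B * scaling
print(delta_weight.shape)  # (64, 32)
```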

llm/run_finetune.py (1 addition)

@@ -580,6 +580,7 @@ def create_peft_model(model_args, reft_args, training_args, dtype, model_config,
             use_quick_lora=model_args.use_quick_lora,
             lora_use_mixer=model_args.lora_use_mixer,
             use_mora=model_args.use_mora,
+            mixer_num=model_args.mixer_num,
             lorapro=model_args.lorapro,
         )
         if model_args.lorapro:
llm/tools/merge_lora_params.py (23 additions, 6 deletions)

@@ -78,51 +78,68 @@ def weight_process(name, quant_config, lora_config, state_dict, device):
         raise ValueError(f"quant_config.weight_quantize_algo {quant_config.weight_quantize_algo} is not supported.")
 
 
+def get_mixer(mixer, mixer_num, index=0):
+    if index == mixer_num - 1:
+        return mixer[index]
+    else:
+        return mixer[index] @ get_mixer(mixer, mixer_num, index + 1)
+
+
 def lora_process(name, layer, lora_config, state_dict, device, lora_state_dict=None):
     target_device = device if device == "cpu" else device + ":0"
 
     if (name + ".weight") not in state_dict.keys():
         return
 
     weight = state_dict.pop(name + ".weight")
     lora_use_mixer = lora_config.lora_use_mixer
+    mixer_num = lora_config.mixer_num
+    mixer = {}
     use_mora = lora_config.use_mora
     if lora_state_dict is None:
         lora_A = state_dict.pop(name + ".lora_A")
         if not use_mora:
             lora_B = state_dict.pop(name + ".lora_B")
         if lora_use_mixer:
-            lora_AB = state_dict.pop(name + ".lora_AB")
+            for i in range(mixer_num):
+                mixer[i] = state_dict.pop(name + ".lora_mixer_" + str(i))
     else:
         lora_A = lora_state_dict.pop(name + ".lora_A")
         if not use_mora:
             lora_B = lora_state_dict.pop(name + ".lora_B")
         if lora_use_mixer:
-            lora_AB = lora_state_dict.pop(name + ".lora_AB")
+            for i in range(mixer_num):
+                mixer[i] = state_dict.pop(name + ".lora_mixer_" + str(i))
     if device != "cpu":
         weight = weight.to(target_device)
         lora_A = lora_A.to(target_device)
         if not use_mora:
             lora_B = lora_B.to(target_device)
         if lora_use_mixer:
-            lora_AB = lora_AB.to(target_device)
+            for key in mixer.keys():
+                mixer[key] = mixer[key].to(target_device)
 
     if device == "cpu" and weight.dtype.name == "BF16":
         weight = weight.astype("float32")
         lora_A = lora_A.astype("float32")
         if not use_mora:
             lora_B = lora_B.astype("float32")
+
         if lora_use_mixer:
-            lora_AB = lora_AB.astype(lora_config.dtype)
-            delta_weight = layer.get_delta_weight(lora_A, lora_B, lora_AB)
+            for key in mixer.keys():
+                mixer[key] = mixer[key].astype(lora_config.dtype)
+            delta_weight = layer.get_delta_weight(lora_A, lora_B, get_mixer(mixer, mixer_num))
         elif use_mora:
             delta_weight = layer.get_delta_weight(lora_A)
         else:
             delta_weight = layer.get_delta_weight(lora_A, lora_B)
         out = (weight + delta_weight).astype(lora_config.dtype)
     else:
         if lora_use_mixer:
-            delta_weight = layer.get_delta_weight(lora_A, lora_B, lora_AB)
+            delta_weight = layer.get_delta_weight(lora_A, lora_B, get_mixer(mixer, mixer_num))
         elif use_mora:
             delta_weight = layer.get_delta_weight(lora_A)
         else:
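
The recursive `get_mixer` helper above collapses the dict `{0: M_0, ..., k-1: M_{k-1}}` into the ordered product `M_0 @ M_1 @ ... @ M_{k-1}` before it is handed to `layer.get_delta_weight`. A standalone numpy check of that identity (a sketch, not the merge script itself):

```python
import numpy as np

def get_mixer(mixer, mixer_num, index=0):
    # Same recursion as in merge_lora_params.py: multiply from the left.
    if index == mixer_num - 1:
        return mixer[index]
    return mixer[index] @ get_mixer(mixer, mixer_num, index + 1)

r, k = 4, 3
mixer = {i: np.random.randn(r, r) for i in range(k)}

# The recursion equals the explicit left-to-right product.
assert np.allclose(get_mixer(mixer, k), mixer[0] @ mixer[1] @ mixer[2])
```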

paddlenlp/peft/lora/lora_config.py (6 additions)

@@ -94,6 +94,12 @@ class LoRAConfig:
         default=False,
         metadata={"help": "Whether to use mos lora."},
     )
+    mixer_num: int = field(
+        default=1,
+        metadata={
+            "help": "Num of mixer matrices. Mixer matrices will be added between the LoRA_A and LoRA_B matrices, as referenced in the paper https://arxiv.org/abs/2411.00039."
+        },
+    )
     lorapro: bool = field(default=False, metadata={"help": "Whether to use LoRA-PRO"})
 
     def __post_init__(self):
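
A hedged usage sketch of the new field: enabling LinChain through `LoRAConfig` (the `target_modules` patterns, rank, and alpha below are illustrative choices, not values from this commit):

```python
from paddlenlp.peft import LoRAConfig

lora_config = LoRAConfig(
    target_modules=[".*q_proj.*", ".*v_proj.*"],  # illustrative patterns
    r=8,
    lora_alpha=16,
    lora_use_mixer=True,  # insert mixer matrices between lora_A and lora_B
    mixer_num=3,          # chain three r x r mixers (LinChain)
)
```

Since `mixer_num` defaults to 1, existing `lora_use_mixer` (MosLoRA) configs keep their single-mixer behavior unchanged.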

paddlenlp/peft/lora/lora_layers.py (29 additions, 12 deletions)

@@ -64,6 +64,7 @@ def __init__(
         lora_plus_scale: float = 1.0,
         pissa: bool = False,
         lora_use_mixer: bool = False,
+        mixer_num: int = 1,
         use_mora: bool = False,
         lorapro: bool = False,
         mp_moe: bool = False,
@@ -85,6 +86,7 @@ def __init__(
         self.merged = False
         self.pissa = pissa
         self.lora_use_mixer = lora_use_mixer
+        self.mixer_num = mixer_num
         self.lorapro = lorapro
 
         # Actual trainable parameters
@@ -118,14 +120,20 @@ def __init__(
                 ),
             )
         if self.lora_use_mixer:
-            self.lora_AB = self.create_parameter(
-                shape=[r, r],
-                dtype=self._dtype,
-                is_bias=False,
-                default_initializer=nn.initializer.KaimingUniform(
-                    negative_slope=math.sqrt(5), nonlinearity="leaky_relu"
-                ),
-            )
+            for i in range(self.mixer_num):
+                key = "lora_mixer_" + str(i)
+                setattr(
+                    self,
+                    key,
+                    self.create_parameter(
+                        shape=[r, r],
+                        dtype=self._dtype,
+                        is_bias=False,
+                        default_initializer=nn.initializer.KaimingUniform(
+                            negative_slope=math.sqrt(5), nonlinearity="leaky_relu"
+                        ),
+                    ),
+                )
         self.lora_B = self.create_parameter(
             shape=[r, out_features],
             dtype=self._dtype,
@@ -221,7 +229,7 @@ def get_delta_weight(self, lora_A=None, lora_B=None, lora_AB=None):
         if self.lora_use_mixer:
             lora_A = lora_A if lora_A is not None else self.lora_A
             lora_B = lora_B if lora_B is not None else self.lora_B
-            lora_AB = lora_AB if lora_AB is not None else self.lora_AB
+            lora_AB = lora_AB if lora_AB is not None else self.get_mixer_params(0)
             delta_weight = lora_A @ lora_AB @ lora_B * self.scaling
         elif self.use_mora:
             lora_A = lora_A if lora_A is not None else self.lora_A
@@ -256,18 +264,25 @@ def get_delta_weight(self, lora_A=None, lora_B=None, lora_AB=None):
 
         return delta_weight
 
+    def get_mixer_params(self, index):
+        key = "lora_mixer_" + str(index)
+        if index == self.mixer_num - 1:
+            return getattr(self, key)
+        else:
+            return getattr(self, key) @ self.get_mixer_params(index + 1)
+
     def merge(self):
         if not self.merged:
             delta_weight = self.get_delta_weight()
             new_weight = self.weight + delta_weight
-            self.weight.set_value(new_weight)
+            self.weight.set_value(new_weight.astype(self.weight.dtype))
             self.merged = True
 
     def unmerge(self):
         if self.merged:
             delta_weight = self.get_delta_weight()
             new_weight = self.weight - delta_weight
-            self.weight.set_value(new_weight)
+            self.weight.set_value(new_weight.astype(self.weight.dtype))
             self.merged = False
 
     def forward(self, input: paddle.Tensor, *args, **kwargs):
@@ -287,7 +302,9 @@ def forward(self, input: paddle.Tensor, *args, **kwargs):
         else:
             result = F.linear(x=input, weight=self.weight, bias=self.bias, name=self.name)
             if self.lora_use_mixer:
-                result += (self.lora_dropout(input) @ self.lora_A @ self.lora_AB @ self.lora_B) * self.scaling
+                result += (
+                    self.lora_dropout(input) @ self.lora_A @ self.get_mixer_params(0) @ self.lora_B
+                ) * self.scaling
             else:
                 result += (self.lora_dropout(input) @ self.lora_A @ self.lora_B) * self.scaling
         return result
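
`get_mixer_params(0)` returns the chained product of the per-layer mixers, so merging the adapter into the base weight must agree with the on-the-fly branch in `forward`. A small numpy consistency check of that invariant (dropout and bias omitted; a sketch, not the Paddle layer):

```python
import numpy as np

rng = np.random.default_rng(0)
in_f, out_f, r, scaling = 16, 12, 4, 2.0

W = rng.standard_normal((in_f, out_f))               # frozen base weight
A = rng.standard_normal((in_f, r))                   # lora_A
B = rng.standard_normal((r, out_f))                  # lora_B
M = [rng.standard_normal((r, r)) for _ in range(3)]  # lora_mixer_0..2

chain = M[0] @ M[1] @ M[2]  # what get_mixer_params(0) computes
x = rng.standard_normal((2, in_f))

unmerged = x @ W + (x @ A @ chain @ B) * scaling  # forward() branch
merged = x @ (W + A @ chain @ B * scaling)        # after merge()
assert np.allclose(unmerged, merged)
```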

paddlenlp/peft/lora/lora_model.py (1 addition)

@@ -485,6 +485,7 @@ def _find_and_replace_module(self, model, module_name, lora_config):
             bias_attr=False if module.bias is None else None,
             use_quick_lora=lora_config.use_quick_lora,
             lora_use_mixer=lora_config.lora_use_mixer,
+            mixer_num=lora_config.mixer_num,
             use_mora=lora_config.use_mora,
             mp_moe=getattr(module.weight, "mp_moe", False),
             is_distributed=getattr(module.weight, "is_distributed", False),

paddlenlp/trl/model_config.py (1 addition)

@@ -64,6 +64,7 @@ class ModelConfig:
     lora_use_mixer: bool = field(
         default=False, metadata={"help": "Whether to use MosLoRA: https://arxiv.org/pdf/2406.11909"}
     )
+    mixer_num: int = field(default=1, metadata={"help": "Num of mixer matrices."})
     use_mora: bool = field(
         default=False, metadata={"help": "Whether to use MoRA: https://arxiv.org/pdf/2405.12130.pdf"}
     )

tests/fixtures/llm/linchain.yaml (new file, 114 additions)

@@ -0,0 +1,114 @@
+lora:
+  base:
+    dataset_name_or_path: "./data"
+    per_device_train_batch_size: 4
+    gradient_accumulation_steps: 4
+    per_device_eval_batch_size: 8
+    eval_accumulation_steps: 16
+    num_train_epochs: 3
+    learning_rate: 3e-04
+    warmup_steps: 30
+    logging_steps: 1
+    evaluation_strategy: "epoch"
+    save_strategy: "epoch"
+    src_length: 1024
+    max_length: 2048
+    fp16: true
+    fp16_opt_level: "O2"
+    do_train: true
+    do_eval: true
+    disable_tqdm: true
+    load_best_model_at_end: true
+    eval_with_do_generation: false
+    metric_for_best_model: "accuracy"
+    recompute: true
+    save_total_limit: 1
+    tensor_parallel_degree: 1
+    pipeline_parallel_degree: 1
+    lora: true
+    lora_use_mixer: true
+    mixer_num: 3
+
+  default:
+    llama:
+      model_name_or_path: __internal_testing__/tiny-random-llama
+    chatglm:
+      model_name_or_path: __internal_testing__/tiny-fused-chatglm
+    chatglm2:
+      model_name_or_path: __internal_testing__/tiny-fused-chatglm2
+    bloom:
+      model_name_or_path: __internal_testing__/tiny-fused-bloom
+    qwen:
+      model_name_or_path: __internal_testing__/tiny-fused-qwen
+    qwen2:
+      model_name_or_path: __internal_testing__/tiny-random-qwen2
+    qwen2moe:
+      model_name_or_path: __internal_testing__/tiny-random-qwen2moe
+    baichuan:
+      model_name_or_path: __internal_testing__/tiny-fused-baichuan
+
+rslora_plus:
+  base:
+    dataset_name_or_path: "./data"
+    per_device_train_batch_size: 4
+    gradient_accumulation_steps: 4
+    per_device_eval_batch_size: 8
+    eval_accumulation_steps: 16
+    num_train_epochs: 3
+    learning_rate: 3e-04
+    warmup_steps: 30
+    logging_steps: 1
+    evaluation_strategy: "epoch"
+    save_strategy: "epoch"
+    src_length: 1024
+    max_length: 2048
+    fp16: true
+    fp16_opt_level: "O2"
+    do_train: true
+    do_eval: true
+    disable_tqdm: true
+    load_best_model_at_end: true
+    eval_with_do_generation: false
+    metric_for_best_model: "accuracy"
+    recompute: true
+    save_total_limit: 1
+    tensor_parallel_degree: 1
+    pipeline_parallel_degree: 1
+    lora: true
+    lora_plus_scale: 4
+    rslora: true
+
+  default:
+    llama:
+      model_name_or_path: __internal_testing__/tiny-random-llama
+    chatglm:
+      model_name_or_path: __internal_testing__/tiny-fused-chatglm
+    chatglm2:
+      model_name_or_path: __internal_testing__/tiny-fused-chatglm2
+    bloom:
+      model_name_or_path: __internal_testing__/tiny-fused-bloom
+    qwen:
+      model_name_or_path: __internal_testing__/tiny-fused-qwen
+    baichuan:
+      model_name_or_path: __internal_testing__/tiny-fused-baichuan
+
+inference-predict:
+  default:
+    mode: dynamic
+    max_length: 20
+    batch_size: 2
+    decode_strategy: greedy_search
+    dtype: float16
+
+inference-to-static:
+  default:
+    dtype: float16
+    max_length: 20
+
+inference-infer:
+  default:
+    mode: static
+    dtype: float16
+    batch_size: 2
+    decode_strategy: greedy_search
+    max_length: 20
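
For scale, the fixture's `mixer_num: 3` adds three r x r mixers per adapted linear layer on top of `lora_A` and `lora_B`. A back-of-envelope count, assuming the default rank r = 8 (the YAML does not set `r`) and a hypothetical 1024 x 1024 layer:

```python
def lora_param_count(in_f, out_f, r=8, mixer_num=0):
    # lora_A (in_f x r) + lora_B (r x out_f) + mixer_num mixers (r x r each)
    return in_f * r + r * out_f + mixer_num * r * r

plain = lora_param_count(1024, 1024)                  # 16384
linchain = lora_param_count(1024, 1024, mixer_num=3)  # 16576
print(linchain - plain)  # 192 extra trainable parameters per layer
```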
