
Commit 44b0386

[train] support target_parameters (#5340)
Parent: e85fd56


11 files changed: +16 / -7 lines


README.md

Lines changed: 1 addition & 1 deletion
@@ -125,7 +125,7 @@ Running Environment:
 | torch | >=2.0 | 2.7.1 | |
 | transformers | >=4.33 | 4.54.1 | |
 | modelscope | >=1.23 | | |
-| peft | >=0.11,<0.17 | | |
+| peft | >=0.11,<0.18 | | |
 | flash_attn | | 2.7.4.post1/3.0.0b1 | |
 | trl | >=0.15,<0.21 | 0.20.0 | RLHF |
 | deepspeed | >=0.14 | 0.16.9 | Training |

README_CN.md

Lines changed: 1 addition & 1 deletion
@@ -121,7 +121,7 @@ pip install -e .
 | torch | >=2.0 | 2.7.1 | |
 | transformers | >=4.33 | 4.54.1 | |
 | modelscope | >=1.23 | | |
-| peft | >=0.11,<0.17 | | |
+| peft | >=0.11,<0.18 | | |
 | flash_attn | | 2.7.4.post1/3.0.0b1 | |
 | trl | >=0.15,<0.21 | 0.20.0 | RLHF |
 | deepspeed | >=0.14 | 0.16.9 | Training |

docs/source/GetStarted/SWIFT安装.md

Lines changed: 1 addition & 1 deletion
@@ -95,7 +95,7 @@ modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu2
 | torch | >=2.0 | 2.7.1 | |
 | transformers | >=4.33 | 4.54.1 | |
 | modelscope | >=1.23 | | |
-| peft | >=0.11,<0.17 | | |
+| peft | >=0.11,<0.18 | | |
 | flash_attn | | 2.7.4.post1/3.0.0b1 | |
 | trl | >=0.15,<0.21 | 0.20.0 | RLHF |
 | deepspeed | >=0.14 | 0.16.9 | Training |

docs/source/Instruction/Megatron-SWIFT训练.md

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu2
 | flash_attn | | 2.7.4.post1/3.0.0b1 | |
 | transformers | >=4.33 | 4.51.3 | |
 | modelscope | >=1.23 | | |
-| peft | >=0.11,<0.17 | | LoRA |
+| peft | >=0.11,<0.18 | | LoRA |
 | trl | >=0.15,<0.21 | | RLHF |
 | deepspeed | >=0.14 | 0.16.9 | |

docs/source/Instruction/命令行参数.md

Lines changed: 1 addition & 0 deletions
@@ -212,6 +212,7 @@
 - 🔥target_modules: Specifies the LoRA modules; defaults to `['all-linear']`. You can also pass module-name suffixes, e.g. `--target_modules q_proj k_proj v_proj`. This argument is not limited to LoRA and can be used with other tuners.
   - Note: 'all-linear' behaves differently for LLMs and multimodal LLMs. For an LLM, it automatically finds all linear layers except lm_head and attaches the tuner; for a multimodal LLM, it attaches the tuner only to the LLM component by default, and this behavior can be controlled with `freeze_llm`, `freeze_vit` and `freeze_aligner`.
 - 🔥target_regex: Specifies a regular expression for the LoRA modules; defaults to `None`. If a value is passed, the target_modules argument is ignored. This argument is not limited to LoRA and can be used with other tuners.
+- target_parameters: List of parameter names to be replaced with LoRA. This argument behaves like `target_modules`, but parameter names should be passed instead. It requires `peft>=0.17.0`. For example, many Mixture-of-Experts (MoE) layers in Hugging Face Transformers use `nn.Parameter` rather than `nn.Linear`; in that case target_parameters can be used (see the sketch after this diff).
 - init_weights: Method for initializing the weights. For LoRA this can be `true`, `false`, `gaussian`, `pissa`, or `pissa_niter_[number of iters]`; for Bone it can be `true`, `false`, or `bat`. Default is `true`.
 - 🔥modules_to_save: After a tuner has been attached, additionally specifies original model modules to participate in training and be saved. Defaults to `[]`. This argument is not limited to LoRA and can be used with other tuners.
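To make the MoE case concrete, below is a minimal sketch of such a layer. The class and the parameter names `gate_up_proj`/`down_proj` are hypothetical, loosely modeled on stacked-expert MoE blocks; the point is that there is no `nn.Linear` submodule for `target_modules` to match, only `nn.Parameter` tensors that `target_parameters` can name.

```python
import torch
from torch import nn


class ToyMoEExperts(nn.Module):
    """Hypothetical MoE expert block: expert weights live in stacked
    nn.Parameter tensors rather than per-expert nn.Linear layers."""

    def __init__(self, num_experts: int = 4, d_model: int = 8, d_ff: int = 16):
        super().__init__()
        self.gate_up_proj = nn.Parameter(torch.empty(num_experts, d_model, d_ff))
        self.down_proj = nn.Parameter(torch.empty(num_experts, d_ff, d_model))

    def forward(self, x: torch.Tensor, expert_idx: int) -> torch.Tensor:
        h = x @ self.gate_up_proj[expert_idx]                # (batch, d_ff)
        return torch.relu(h) @ self.down_proj[expert_idx]    # (batch, d_model)


# `--target_modules gate_up_proj down_proj` matches nothing here, because no
# submodules with those names exist; `--target_parameters gate_up_proj down_proj`
# matches the nn.Parameter names directly.
```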

docs/source_en/GetStarted/SWIFT-installation.md

Lines changed: 1 addition & 1 deletion
@@ -96,7 +96,7 @@ More images can be found [here](https://modelscope.cn/docs/intro/environment-set
 | torch | >=2.0 | 2.7.1 | |
 | transformers | >=4.33 | 4.54.1 | |
 | modelscope | >=1.23 | | |
-| peft | >=0.11,<0.17 | | |
+| peft | >=0.11,<0.18 | | |
 | flash_attn | | 2.7.4.post1/3.0.0b1 | |
 | trl | >=0.15,<0.21 | 0.20.0 | RLHF |
 | deepspeed | >=0.14 | 0.16.9 | Training |

docs/source_en/Instruction/Command-line-parameters.md

Lines changed: 1 addition & 0 deletions
@@ -216,6 +216,7 @@ Other important parameters:
 - 🔥target_modules: Specifies the LoRA modules. The default is `['all-linear']`, but you can also pass layer-name suffixes, e.g. `--target_modules q_proj k_proj v_proj`. This argument is not restricted to LoRA and can be used with other tuners as well.
   - Note: The behavior of the special value `'all-linear'` differs between plain LLMs and multimodal LLMs. For a standard LLM, it automatically locates every linear layer except `lm_head` and attaches a tuner. For a multimodal LLM, it attaches the tuner only to the LLM component by default. This default can be changed with the `freeze_llm`, `freeze_vit`, and `freeze_aligner` options.
 - 🔥target_regex: Specifies a regular expression for selecting LoRA modules; the default is `None`. If a value is provided, the target_modules parameter is ignored. This parameter is not limited to LoRA and can be used with other tuners.
+- target_parameters: List of parameter names to be replaced with LoRA. This argument behaves similarly to target_modules, but parameter names should be passed instead. It requires `peft>=0.17.0`. For example, many Mixture-of-Experts (MoE) layers in Hugging Face Transformers do not use `nn.Linear`; they use `nn.Parameter` instead. In such cases, the `target_parameters` argument can be used to apply LoRA (a sketch follows this diff).
 - init_weights: Specifies the method for initializing weights. LoRA accepts `true`, `false`, `gaussian`, `pissa`, `pissa_niter_[number of iters]`; Bone accepts `true`, `false`, `bat`. The default is `true`.
 - 🔥modules_to_save: After attaching a tuner, explicitly specifies additional original-model modules to participate in training and be saved. The default is `[]`. This parameter is not limited to LoRA and can be used with other tuners.
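Below is a minimal sketch of the same idea using peft directly, assuming `peft>=0.17.0`; the model id and the parameter-name suffixes `experts.gate_up_proj`/`experts.down_proj` are placeholders rather than values taken from this commit. With the swift CLI, the analogous invocation would presumably be `--target_parameters experts.gate_up_proj experts.down_proj`.

```python
# Sketch: apply LoRA to nn.Parameter weights via peft's target_parameters.
# Requires peft>=0.17.0; the model id and parameter names are placeholders.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("org/some-moe-model")

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    # Stacked expert weights stored as nn.Parameter are selected by parameter-name
    # suffix, much like target_modules selects modules by name suffix.
    target_parameters=["experts.gate_up_proj", "experts.down_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```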

docs/source_en/Instruction/Megatron-SWIFT-Training.md

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ Recommended Operating Environment:
 | flash_attn | | 2.7.4.post1/3.0.0b1 | |
 | transformers | >=4.33 | 4.51.3 | |
 | modelscope | >=1.23 | | |
-| peft | >=0.11,<0.17 | | LoRA |
+| peft | >=0.11,<0.18 | | LoRA |
 | trl | >=0.15,<0.21 | | RLHF |
 | deepspeed | >=0.14 | 0.16.9 | |

swift/llm/argument/tuner_args.py

Lines changed: 1 addition & 0 deletions
@@ -108,6 +108,7 @@ class TunerArguments:
     # tuners
     target_modules: List[str] = field(default_factory=lambda: ['all-linear'])
     target_regex: Optional[str] = None
+    target_parameters: Optional[List[str]] = None
     # e.g. ['wte', 'ln_1', 'ln_2', 'ln_f', 'lm_head']
     modules_to_save: List[str] = field(default_factory=list)
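For illustration, here is a toy sketch of how an `Optional[List[str]]` dataclass field of this shape maps onto a space-separated CLI flag when parsed with transformers' `HfArgumentParser`. The dataclass below is a stand-in, not swift's real `TunerArguments`, and swift's actual argument-parsing path may differ.

```python
# Toy stand-in for the dataclass above; not swift's real TunerArguments.
from dataclasses import dataclass, field
from typing import List, Optional

from transformers import HfArgumentParser


@dataclass
class ToyTunerArguments:
    target_modules: List[str] = field(default_factory=lambda: ['all-linear'])
    target_regex: Optional[str] = None
    target_parameters: Optional[List[str]] = None


parser = HfArgumentParser(ToyTunerArguments)
(args,) = parser.parse_args_into_dataclasses(
    ['--target_parameters', 'experts.gate_up_proj', 'experts.down_proj'])
print(args.target_parameters)  # ['experts.gate_up_proj', 'experts.down_proj']
```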

swift/llm/train/tuner.py

Lines changed: 2 additions & 0 deletions
@@ -173,6 +173,8 @@ def prepare_adapter(args: TrainArguments, model, *, template=None, train_dataset
         task_type = 'SEQ_CLS'
     elif task_type == 'GENERATIVE_RERANKER':
         task_type = 'CAUSAL_LM'
+    if args.target_parameters is not None:
+        lora_kwargs['target_parameters'] = args.target_parameters
     lora_config = LoraConfig(task_type=task_type, lora_dtype=args.lora_dtype, **lora_kwargs)
     if args.init_weights == 'lora-ga':
         try:
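The guard means the new keyword reaches `LoraConfig` only when `target_parameters` is actually set, so existing setups on `peft<0.17` (which does not accept the keyword) keep working unchanged. Below is a standalone sketch of the same forwarding pattern, using peft's plain `LoraConfig` rather than swift's wrapper and a hypothetical helper name.

```python
from typing import List, Optional

from peft import LoraConfig


def build_lora_config(task_type: str,
                      lora_kwargs: dict,
                      target_parameters: Optional[List[str]] = None) -> LoraConfig:
    # Hypothetical helper mirroring prepare_adapter: forward target_parameters
    # only when it is set, so peft<0.17 never sees the unknown keyword unless
    # the feature is explicitly requested.
    if target_parameters is not None:
        lora_kwargs['target_parameters'] = target_parameters
    return LoraConfig(task_type=task_type, **lora_kwargs)


# e.g. build_lora_config('CAUSAL_LM', {'r': 8, 'lora_alpha': 32},
#                        target_parameters=['experts.gate_up_proj'])
```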
