
Commit 69281aa

Merge branch 'PaddlePaddle:develop' into dev_20250124_fix_taskflow_infer

2 parents: fc00ddd + 9ded9bf

31 files changed: +2192 -684 lines

llm/run_finetune.py

Lines changed: 29 additions & 3 deletions

```diff
@@ -52,12 +52,18 @@
     AutoModelForCausalLM,
     AutoModelForCausalLMPipe,
     AutoTokenizer,
+    DeepseekV2ForCausalLM,
+    DeepseekV2ForCausalLMPipe,
+    DeepseekV3ForCausalLM,
+    DeepseekV3ForCausalLMPipe,
     Llama3Tokenizer,
     LlamaForCausalLM,
     LlamaForCausalLMPipe,
     LlamaTokenizer,
     Qwen2ForCausalLM,
     Qwen2ForCausalLMPipe,
+    Qwen2MoeForCausalLM,
+    Qwen2MoeForCausalLMPipe,
 )
 from paddlenlp.transformers.configuration_utils import LlmMetaConfig
 from paddlenlp.trl import DataConfig, ModelConfig, SFTConfig, SFTTrainer
@@ -74,7 +80,18 @@
 # Fine-tune Environment Variables to support sharding stage1 overlap optimization.
 os.environ["USE_CASUAL_MASK"] = "False"

-flash_mask_support_list = [LlamaForCausalLM, LlamaForCausalLMPipe, Qwen2ForCausalLM, Qwen2ForCausalLMPipe]
+flash_mask_support_list = [
+    DeepseekV2ForCausalLM,
+    DeepseekV2ForCausalLMPipe,
+    DeepseekV3ForCausalLM,
+    DeepseekV3ForCausalLMPipe,
+    LlamaForCausalLM,
+    LlamaForCausalLMPipe,
+    Qwen2ForCausalLM,
+    Qwen2ForCausalLMPipe,
+    Qwen2MoeForCausalLM,
+    Qwen2MoeForCausalLMPipe,
+]


 def paddlenlp_verison_check():
@@ -151,7 +168,11 @@ def main():
         quantization_config=quantization_config,
     )

-    if "Qwen2Moe" in str(model_config.architectures) and training_args.data_parallel_degree > 1:
+    architectures_to_check = {"Qwen2Moe", "DeepseekV2", "DeepseekV3"}
+    if (
+        any(architecture in str(model_config.architectures) for architecture in architectures_to_check)
+        and training_args.data_parallel_degree > 1
+    ):
         training_args.use_expert_parallel = True

     LlmMetaConfig.set_llm_config(model_config, training_args)
@@ -585,7 +606,12 @@ def create_peft_model(model_args, reft_args, training_args, dtype, model_config,
 def trans_dataset_to_ids(train_ds, dev_ds, test_ds, model_args, data_args, trans_func, eval_zero_padding):
     if train_ds is not None:
         train_ds = train_ds.map(
-            partial(trans_func, is_test=False, zero_padding=data_args.zero_padding, flash_mask=model_args.flash_mask)
+            partial(
+                trans_func,
+                is_test=False,
+                zero_padding=data_args.zero_padding,
+                flash_mask=model_args.flash_mask,
+            )
         )
     if dev_ds is not None:
         dev_ds = dev_ds.map(
```
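The rewritten guard generalizes the old Qwen2Moe-only check: any MoE-style family in `architectures_to_check` now auto-enables expert parallelism once data parallelism is active. The same guard lands in llm/run_pretrain.py below. A minimal standalone sketch of the check's behavior, using toy stand-ins (`ToyConfig` and `ToyTrainingArgs` are illustrative, not PaddleNLP classes):

```python
# Toy stand-ins for model_config / training_args; illustrative only.
class ToyConfig:
    architectures = ["DeepseekV3ForCausalLM"]

class ToyTrainingArgs:
    data_parallel_degree = 4
    use_expert_parallel = False

model_config, training_args = ToyConfig(), ToyTrainingArgs()

# A substring match on str(architectures) catches every variant of a family,
# e.g. "DeepseekV3" matches "DeepseekV3ForCausalLM".
architectures_to_check = {"Qwen2Moe", "DeepseekV2", "DeepseekV3"}
if (
    any(arch in str(model_config.architectures) for arch in architectures_to_check)
    and training_args.data_parallel_degree > 1
):
    training_args.use_expert_parallel = True

assert training_args.use_expert_parallel  # DeepseekV3 + dp_degree 4 -> enabled
```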

llm/run_pretrain.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -478,7 +478,11 @@ def main():
     except:
         print("Not register llama pp reshard information.")

-    if "Qwen2Moe" in str(config.architectures) and training_args.data_parallel_degree > 1:
+    architectures_to_check = {"Qwen2Moe", "DeepseekV2", "DeepseekV3"}
+    if (
+        any(architecture in str(config.architectures) for architecture in architectures_to_check)
+        and training_args.data_parallel_degree > 1
+    ):
         training_args.use_expert_parallel = True

     if model_args.continue_training:
```

llm/utils/data.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -59,11 +59,13 @@ def get_convert_example(model):
         "gpt",
         "yuan",
         "jamba",
+        "deepseek_v2",
+        "deepseek_v3",
     ]:
         return convert_example_common
     else:
         raise ValueError(
-            f"Unknown base_model_prefix: {model.base_model_prefix}. Supported base_model_prefix list: chatglm, bloom, llama, qwen, mixtral, gemma, qwen2, qwen2_moe, yuan, jamba",
+            f"Unknown base_model_prefix: {model.base_model_prefix}. Supported base_model_prefix list: chatglm, bloom, llama, qwen, mixtral, gemma, qwen2, qwen2_moe, yuan, jamba, deepseek_v2, deepseek_v3",
         )
```
paddlenlp/mergekit/merge_config.py

Lines changed: 42 additions & 29 deletions

```diff
@@ -17,10 +17,7 @@
 from dataclasses import asdict, dataclass, field
 from typing import List, Optional

-import paddle
-
 from paddlenlp.utils.env import MERGE_CONFIG_NAME
-from paddlenlp.utils.log import logger


 @dataclass
@@ -30,7 +27,6 @@ class MergeConfig:
     """

     # Common parameters
-    device: str = field(default="cpu", metadata={"help": "Device to use for the merge.ex cpu、 gpu、low_gpu_mem"})
     tensor_type: str = field(
         default="np", metadata={"help": "Tensor type to use for the merge. Choose np(CPU Only) or pd (CPU/GPU)"}
     )
@@ -39,14 +35,20 @@ class MergeConfig:
     merge_method: str = field(default="linear", metadata={"help": "The merge strategy."})
     merge_type: str = field(default="linear", metadata={"help": "The type of merge process."})
     sparsify_type: str = field(default=None, metadata={"help": "The type of sparsify process."})
+    split_pieces: int = field(default=8, metadata={"help": "Split large tensor to multi-piece"})
+    max_tensor_mem: float = field(default=0.5, metadata={"help": "Split tensor if exceed setting max_tensor_mem."})

     # Model parameters
     model_path_list: Optional[List[str]] = field(default=None, metadata={"help": "Merge model name or path list"})
     model_path_str: Optional[str] = field(
         default=None, metadata={"help": "Merge model name or path string.(split by ',')"}
     )
     base_model_path: str = field(default=None, metadata={"help": "Base model name or path."})
-    output_path: str = field(default=None, metadata={"help": "Base model name or path."})
+    output_path: str = field(default=None, metadata={"help": "Output model name or path."})
+    lora_model_path: str = field(default=None, metadata={"help": "LoRA model name or path."})
+    copy_file_list: Optional[List[str]] = field(
+        default=None, metadata={"help": "Copy file list from base model path or first model path."}
+    )
     # merge parameters
     weight_list: Optional[List[float]] = field(
         default=None, metadata={"help": "Relative (or absolute if normalize=False) weighting of a given tensor"}
@@ -75,32 +77,43 @@ def config_check(self):
         os.makedirs(self.output_path, exist_ok=True)
         if self.tensor_type not in ["np", "pd"]:
             raise ValueError(f"Unsupported tensor type: {self.tensor_type}. Support 'np' and 'pd' only.")
-        if self.device == "gpu" and self.tensor_type == "np":
-            logger.warning("np only support cpu device, but got gpu. Setting `device` to `cpu`.")
-            self.device = "cpu"
-
-        elif self.merge_method not in ["linear", "ties", "slerp", "della_linear", "della", "dare_linear", "dare_ties"]:
-            raise ValueError(
-                f"Unsupported merge strategy: {self.merge_method}. Please choose one from ['linear', 'slerp']."
-            )
-        if self.model_path_str is not None:
-            self.model_path_list = self.model_path_str.split(",")
-        if self.model_path_list is not None:
-            if not isinstance(self.model_path_list, list) or len(self.model_path_list) < 2:
-                raise ValueError(f"Please specify the model_path_list at least two. But got {self.model_path_list}")
-            if self.weight_list is None:
-                self.weight_list = [1.0] * len(self.model_path_list)
-                self.normalize = True
-            if len(self.model_path_list) != len(self.weight_list):
-                raise ValueError("The length of model_path_list and weight_list must be the same.")
-        if self.reserve_p < 0 or self.reserve_p > 1:
-            raise ValueError("reserve_p must be between 0 and 1.")
-        if "della" in self.merge_method or self.sparsify_type == "magprune":
-            if self.reserve_p <= self.epsilon / 2 or self.reserve_p >= (1 - self.epsilon):
+        if self.lora_model_path is not None:
+            if self.base_model_path is None:
+                raise ValueError("Please specify the base_model_path when using LoRA merge.")
+            self.tensor_type = "pd"
+
+        if self.lora_model_path is None:
+            if self.merge_method not in [
+                "linear",
+                "ties",
+                "slerp",
+                "della_linear",
+                "della",
+                "dare_linear",
+                "dare_ties",
+            ]:
                 raise ValueError(
-                    f"Error: reserve_p +- epsilon/2 must be in the range (0, 1). reserve_p + epsilon/2 = {self.reserve_p + self.epsilon / 2 }, reserve_p - epsilon/2 = {self.reserve_p - self.epsilon / 2 }"
+                    f"Unsupported merge strategy: {self.merge_method}. Please choose one from ['linear', 'ties', 'slerp', 'della_linear', 'della', 'dare_linear', 'dare_ties']."
                 )
-        paddle.set_device(self.device)
+            if self.model_path_str is not None:
+                self.model_path_list = self.model_path_str.split(",")
+            if self.model_path_list is not None:
+                if not isinstance(self.model_path_list, list) or len(self.model_path_list) < 2:
+                    raise ValueError(
+                        f"Please specify the model_path_list at least two. But got {self.model_path_list}"
+                    )
+                if self.weight_list is None:
+                    self.weight_list = [1.0] * len(self.model_path_list)
+                    self.normalize = True
+                if len(self.model_path_list) != len(self.weight_list):
+                    raise ValueError("The length of model_path_list and weight_list must be the same.")
+            if self.reserve_p < 0 or self.reserve_p > 1:
+                raise ValueError("reserve_p must be between 0 and 1.")
+            if "della" in self.merge_method or self.sparsify_type == "magprune":
+                if self.reserve_p <= self.epsilon / 2 or self.reserve_p >= (1 - self.epsilon):
+                    raise ValueError(
+                        f"Error: reserve_p +- epsilon/2 must be in the range (0, 1). reserve_p + epsilon/2 = {self.reserve_p + self.epsilon / 2}, reserve_p - epsilon/2 = {self.reserve_p - self.epsilon / 2}"
+                    )

     @property
     def __dict__(self):
```
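To make the `config_check` flow concrete, here is a hypothetical usage sketch; the import path is inferred from the file location, and the explicit `config_check()` call plus all paths are assumptions for illustration:

```python
# Hypothetical usage; import path inferred from paddlenlp/mergekit/merge_config.py.
from paddlenlp.mergekit.merge_config import MergeConfig

config = MergeConfig(
    merge_method="linear",
    model_path_str="./model_a,./model_b",  # split(",") populates model_path_list
    output_path="./merged_model",
)
config.config_check()  # assumed to be called explicitly here

# With no weight_list given, the check fills in equal weights and switches
# normalization on: weight_list == [1.0, 1.0], normalize == True.
```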

paddlenlp/mergekit/merge_method.py

Lines changed: 27 additions & 22 deletions

```diff
@@ -48,11 +48,10 @@ def linear(self, tensor_list):
             tensor_output = sum(weight * tensor for weight, tensor in zip(weight_list, tensor_list))
             return tensor_output
         elif self.merge_config.tensor_type == "pd":
-            stacked_tensors = paddle.stack(tensor_list, axis=0)
-            weights = paddle.to_tensor(weight_list, dtype=stacked_tensors.dtype)
-            weights = weights.reshape([-1] + [1] * (len(stacked_tensors.shape) - 1))
-            weighted_sum = paddle.sum(stacked_tensors * weights, axis=0)
-            return weighted_sum
+            tensor_output = paddle.zeros_like(tensor_list[0])
+            for i, tensor in enumerate(tensor_list):
+                tensor_output += tensor * weight_list[i]
+            return tensor_output
         else:
             raise ValueError(f"Unknown tensor type {self.merge_config.tensor_type}")

```
```diff
@@ -155,28 +154,34 @@ def ties(self, tensor_list):

         elif self.merge_config.tensor_type == "pd":
             mask_dtype = tensor_list[0].dtype
-            weight_list = self.merge_config.weight_list
-            stacked_tensors = paddle.stack(tensor_list, axis=0)
-            weights = paddle.to_tensor(weight_list, dtype=stacked_tensors.dtype)
-            weights = weights.reshape([-1] + [1] * (len(stacked_tensors.shape) - 1))
-            weighted_tensors = stacked_tensors * weights
+
             # Elect majority sign
-            if self.merge_config.ties_elect_type == "sum":
-                majority_sign = (paddle.sum(weighted_tensors, axis=0) >= 0).astype(mask_dtype) * 2 - 1
-            elif self.merge_config.ties_elect_type == "count":
-                stacked_signs = paddle.sign(stacked_tensors).astype(mask_dtype)
-                majority_sign = (paddle.sum(stacked_signs, axis=0) >= 0).astype(mask_dtype) * 2 - 1
-            else:
-                raise NotImplementedError(f"ties_elect_type: {self.merge_config.ties_elect_type} is unknown.")
+            majority_sign = paddle.zeros_like(tensor_list[0])
+            for i, tensor in enumerate(tensor_list):
+                if self.merge_config.ties_elect_type == "sum":
+                    majority_sign += tensor * self.merge_config.weight_list[i]
+                elif self.merge_config.ties_elect_type == "count":
+                    majority_sign += tensor.sign()
+                else:
+                    raise NotImplementedError(f"ties_elect_type: {self.merge_config.ties_elect_type} is unknown.")
+            majority_sign = (majority_sign >= 0).astype(mask_dtype) * 2 - 1

             # Merge
-            stacked_masks = (paddle.sign(weighted_tensors) == majority_sign).astype(mask_dtype)
-            masked_tensors = stacked_masks * weighted_tensors
-            merge_tensor = paddle.sum(masked_tensors, axis=0)
+            merge_tensor = paddle.zeros_like(tensor_list[0])
+            if self.merge_config.normalize:
+                divisor = paddle.zeros_like(tensor_list[0])
+            for i, tensor in enumerate(tensor_list):
+                if self.merge_config.normalize:
+                    mask = (tensor.sign() == majority_sign).astype(mask_dtype) * self.merge_config.weight_list[i]
+                    divisor += mask
+                    merge_tensor += mask * tensor
+                else:
+                    merge_tensor += (
+                        (tensor.sign() == majority_sign).astype(mask_dtype) * tensor * self.merge_config.weight_list[i]
+                    )
+
             # Normalize
             if self.merge_config.normalize:
-                weight_masks = stacked_masks * weights
-                divisor = paddle.sum(weight_masks, axis=0)
                 divisor = paddle.where(paddle.abs(divisor) < 1e-8, paddle.ones_like(divisor), divisor)
                 merge_tensor /= divisor

```
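For intuition, here is a toy walk-through of the elected-sign logic above in "count" mode, with made-up values, unit weights, and normalize off:

```python
import paddle

tensor_list = [
    paddle.to_tensor([0.4, -0.2, 0.1]),
    paddle.to_tensor([0.3, 0.5, -0.2]),
    paddle.to_tensor([-0.1, 0.2, -0.3]),
]
mask_dtype = tensor_list[0].dtype

# Each tensor votes with the sign of its entries.
majority_sign = paddle.zeros_like(tensor_list[0])
for tensor in tensor_list:
    majority_sign += tensor.sign()
majority_sign = (majority_sign >= 0).astype(mask_dtype) * 2 - 1  # -> [1., 1., -1.]

# Only entries agreeing with the elected sign survive the merge.
merged = paddle.zeros_like(tensor_list[0])
for tensor in tensor_list:
    merged += (tensor.sign() == majority_sign).astype(mask_dtype) * tensor
print(merged)  # [0.7, 0.7, -0.5]
```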