Commit c2a93f6

Refactoring Qwen2/3 with general design (#2480)

Authored by Ace-To-HYB
Co-authored-by: Ace-To-HYB <[email protected]>
1 parent 2868bae · commit c2a93f6

37 files changed: +2562 additions, −5177 deletions

examples/config/qwen/lora_argument_qwen2_0p5b.json

Lines changed: 9 additions & 5 deletions
@@ -15,14 +15,15 @@
     "per_device_eval_batch_size": 8,
     "eval_accumulation_steps":16,
     "num_train_epochs": 1,
-    "learning_rate": 3e-05,
-    "warmup_steps": 10,
+    "learning_rate": 3e-04,
+    "warmup_steps": 30,
     "logging_steps": 1,
+    "max_steps": 100,
     "evaluation_strategy": "epoch",
     "save_strategy": "epoch",
     "src_length": 1024,
     "max_length": 2048,
-    "fp16": true,
+    "bf16": true,
     "fp16_opt_level": "O2",
     "do_train": true,
     "do_eval": true,
@@ -36,9 +37,12 @@
     "pipeline_parallel_degree": 1,
     "sharding": "stage2",
     "lora": true,
-    "zero_padding": false,
+    "zero_padding": true,
+    "flash_mask": true,
     "unified_checkpoint": true,
-    "use_flash_attention": false,
+    "use_flash_attention": true,
+    "convert_from_hf": false,
+    "save_to_hf": false,
     "pissa": false,
     "use_mora": false,
     "encode_one_turn": true

examples/config/qwen/sft_argument_qwen2_0p5b.json

Lines changed: 3 additions & 1 deletion
@@ -23,7 +23,7 @@
     "save_strategy": "epoch",
     "src_length": 1024,
     "max_length": 2048,
-    "fp16": true,
+    "bf16": true,
     "fp16_opt_level": "O2",
     "do_train": true,
     "do_eval": true,
@@ -40,5 +40,7 @@
     "flash_mask": true,
     "unified_checkpoint": true,
     "use_flash_attention": true,
+    "convert_from_hf": false,
+    "save_to_hf": false,
     "encode_one_turn": true
 }
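
The SFT recipe gets the same fp16-to-bf16 switch plus the new convert_from_hf / save_to_hf flags, which presumably toggle conversion between the Hugging Face and Paddle checkpoint layouts (the diff itself does not spell this out). A short comparison sketch of the two updated recipes, assuming both files are read from the repo root:

    import json

    with open("examples/config/qwen/lora_argument_qwen2_0p5b.json") as f:
        lora = json.load(f)
    with open("examples/config/qwen/sft_argument_qwen2_0p5b.json") as f:
        sft = json.load(f)

    # Keys present only in the LoRA recipe (expected to include "lora", "pissa", "use_mora").
    print(sorted(set(lora) - set(sft)))
    # Both recipes now agree on precision and attention settings.
    print(lora["bf16"], lora["use_flash_attention"], sft["bf16"], sft["use_flash_attention"])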

examples/run_finetune.py

Lines changed: 8 additions & 0 deletions
@@ -43,6 +43,10 @@
     Qwen2ForCausalLMPipe,
     Qwen2MoeForCausalLM,
     Qwen2MoeForCausalLMPipe,
+    Qwen3ForCausalLM,
+    Qwen3ForCausalLMPipe,
+    Qwen3MoeForCausalLM,
+    Qwen3MoeForCausalLMPipe,
 )
 from paddleformers.transformers.configuration_utils import LlmMetaConfig
 from paddleformers.trl import DataConfig, ModelConfig, SFTConfig, SFTTrainer
@@ -67,6 +71,10 @@
     Qwen2ForCausalLMPipe,
     Qwen2MoeForCausalLM,
     Qwen2MoeForCausalLMPipe,
+    Qwen3ForCausalLM,
+    Qwen3ForCausalLMPipe,
+    Qwen3MoeForCausalLM,
+    Qwen3MoeForCausalLMPipe,
 ]
 
 
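
run_finetune.py now imports the dense and MoE Qwen3 classes and appends them to the same support list that already holds the Qwen2 families (the list's name falls outside this excerpt), so Qwen3 checkpoints go through the same entry point. A hedged sketch of using the new classes directly; the checkpoint id, the dtype keyword, and the Pipe-selection logic are illustrative rather than quoted from the script:

    from paddleformers.transformers import Qwen3ForCausalLM, Qwen3ForCausalLMPipe

    pipeline_parallel_degree = 1  # normally read from the JSON config
    # The *Pipe variant is only needed when pipeline parallelism is enabled.
    model_cls = Qwen3ForCausalLMPipe if pipeline_parallel_degree > 1 else Qwen3ForCausalLM
    model = model_cls.from_pretrained("Qwen/Qwen3-0.6B", dtype="bfloat16")  # illustrative checkpoint id
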
paddleformers/nn/activation.py

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ def __getitem__(self, key):
 
 
 ACT2CLS = {
+    "gelu": nn.GELU,
     "relu": nn.ReLU,
     "relu6": nn.ReLU6,
     "sigmoid": nn.Sigmoid,

paddleformers/nn/pp_model.py

Lines changed: 1 addition & 1 deletion
@@ -181,7 +181,7 @@ class RotaryEmbedding(nn.Layer):
     def __init__(self, config):
         super().__init__()
         self.config = config
-        self.head_dim = config.head_dim
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
         self.base = config.rope_theta
 
     def forward(self, x, position_ids):
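
The RotaryEmbedding change matters because some newer configs (Qwen3, for instance) carry an explicit head_dim that need not equal hidden_size // num_attention_heads, while older configs omit the field entirely. A tiny illustration of the fallback, using stand-in objects rather than real model configs:

    from types import SimpleNamespace

    def resolve_head_dim(config):
        # Mirrors the new line in RotaryEmbedding.__init__: prefer an explicit
        # head_dim, otherwise derive it from hidden_size and num_attention_heads.
        return getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)

    print(resolve_head_dim(SimpleNamespace(hidden_size=1024, num_attention_heads=16)))                # 64
    print(resolve_head_dim(SimpleNamespace(hidden_size=1024, num_attention_heads=16, head_dim=128)))  # 128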

paddleformers/transformers/__init__.py

Lines changed: 4 additions & 4 deletions
@@ -291,41 +291,41 @@
         "Qwen2Model",
         "Qwen2PretrainedModel",
         "Qwen2ForCausalLM",
+        "Qwen2ForCausalLMPipe",
         "Qwen2PretrainingCriterion",
         "Qwen2ForSequenceClassification",
         "Qwen2ForTokenClassification",
         "Qwen2SentenceEmbedding",
     ],
-    "qwen2.modeling_pp": ["Qwen2ForCausalLMPipe"],
     "qwen2.tokenizer": ["Qwen2Tokenizer"],
     "qwen2.tokenizer_fast": ["Qwen2TokenizerFast"],
     "qwen2_moe.configuration": ["Qwen2MoeConfig"],
     "qwen2_moe.modeling": [
         "Qwen2MoeModel",
         "Qwen2MoePretrainedModel",
         "Qwen2MoeForCausalLM",
+        "Qwen2MoeForCausalLMPipe",
         "Qwen2MoePretrainingCriterion",
     ],
-    "qwen2_moe.modeling_pp": ["Qwen2MoeForCausalLMPipe"],
     "qwen3.configuration": ["Qwen3Config"],
     "qwen3.modeling": [
         "Qwen3Model",
         "Qwen3PretrainedModel",
         "Qwen3ForCausalLM",
+        "Qwen3ForCausalLMPipe",
         "Qwen3PretrainingCriterion",
         "Qwen3ForSequenceClassification",
         "Qwen3ForTokenClassification",
         "Qwen3SentenceEmbedding",
     ],
-    "qwen3.modeling_pp": ["Qwen3ForCausalLMPipe"],
     "qwen3_moe.configuration": ["Qwen3MoeConfig"],
     "qwen3_moe.modeling": [
         "Qwen3MoeModel",
         "Qwen3MoePretrainedModel",
         "Qwen3MoeForCausalLM",
+        "Qwen3MoeForCausalLMPipe",
         "Qwen3MoePretrainingCriterion",
     ],
-    "qwen3_moe.modeling_pp": ["Qwen3MoeForCausalLMPipe"],
     "ernie4_5vl.tokenizer": ["Ernie4_5_VLTokenizer"],
     "ernie4_5vl": [],
     "bert": [],

paddleformers/transformers/configuration_utils.py

Lines changed: 9 additions & 0 deletions
@@ -507,6 +507,10 @@ class PretrainedConfig:
             If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token.
         sep_token_id (`int`, *optional*): The id of the _separation_ token.
 
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
+            model has a output word embedding layer.
+
         dtype (`str`, *optional*):
             The `dtype` of the weights. This attribute can be used to initialize the model to a non-default `dtype`
             (which is normally `float32`) and thus allow for optimal storage allocation. For example, if the saved
@@ -569,8 +573,10 @@ def __init__(self, **kwargs):
         self.output_hidden_states = kwargs.pop("output_hidden_states", False)
         self.output_attentions = kwargs.pop("output_attentions", False)
         self.use_cache = kwargs.pop("use_cache", False)
+        self.tie_word_embeddings = kwargs.pop("tie_word_embeddings", True)
 
         # for transformers fuse
+        self.fuse_linear = kwargs.pop("fuse_linear", False)
         self.fuse_attention_qkv = kwargs.pop("fuse_attention_qkv", False)
         self.fuse_attention_ffn = kwargs.pop("fuse_attention_ffn", False)
 
@@ -623,6 +629,9 @@ def __init__(self, **kwargs):
 
         self.classifier_dropout = kwargs.pop("classifier_dropout", None)
 
+        self.dpo_config = kwargs.pop("dpo_config", None)
+        self.kto_config = kwargs.pop("kto_config", None)
+
         # Tokenizer arguments TODO: eventually tokenizer and models should share the same config
         self.tokenizer_class = kwargs.pop("tokenizer_class", None)
         self.prefix = kwargs.pop("prefix", None)
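
PretrainedConfig now accepts tie_word_embeddings (defaulting to True), fuse_linear, dpo_config, and kto_config as ordinary keyword arguments, each popped with a default so existing configs keep working. A short sketch with illustrative values:

    from paddleformers.transformers.configuration_utils import PretrainedConfig

    cfg = PretrainedConfig(tie_word_embeddings=False, fuse_linear=True)
    print(cfg.tie_word_embeddings, cfg.fuse_linear)  # False True
    print(cfg.dpo_config, cfg.kto_config)            # None None (defaults)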
