Commit c2a93f6

Refactoring Qwen2/3 with general design (#2480)

Authored by Ace-To-HYB
Co-authored-by: Ace-To-HYB <[email protected]>
1 parent 2868bae · commit c2a93f6

37 files changed: +2562 additions, −5177 deletions

examples/config/qwen/lora_argument_qwen2_0p5b.json

Lines changed: 9 additions & 5 deletions
@@ -15,14 +15,15 @@
     "per_device_eval_batch_size": 8,
     "eval_accumulation_steps":16,
     "num_train_epochs": 1,
-    "learning_rate": 3e-05,
-    "warmup_steps": 10,
+    "learning_rate": 3e-04,
+    "warmup_steps": 30,
     "logging_steps": 1,
+    "max_steps": 100,
     "evaluation_strategy": "epoch",
     "save_strategy": "epoch",
     "src_length": 1024,
     "max_length": 2048,
-    "fp16": true,
+    "bf16": true,
     "fp16_opt_level": "O2",
     "do_train": true,
     "do_eval": true,
@@ -36,9 +37,12 @@
     "pipeline_parallel_degree": 1,
     "sharding": "stage2",
     "lora": true,
-    "zero_padding": false,
+    "zero_padding": true,
+    "flash_mask": true,
     "unified_checkpoint": true,
-    "use_flash_attention": false,
+    "use_flash_attention": true,
+    "convert_from_hf": false,
+    "save_to_hf": false,
     "pissa": false,
     "use_mora": false,
     "encode_one_turn": true

examples/config/qwen/sft_argument_qwen2_0p5b.json

Lines changed: 3 additions & 1 deletion
@@ -23,7 +23,7 @@
     "save_strategy": "epoch",
     "src_length": 1024,
     "max_length": 2048,
-    "fp16": true,
+    "bf16": true,
     "fp16_opt_level": "O2",
     "do_train": true,
     "do_eval": true,
@@ -40,5 +40,7 @@
     "flash_mask": true,
     "unified_checkpoint": true,
     "use_flash_attention": true,
+    "convert_from_hf": false,
+    "save_to_hf": false,
     "encode_one_turn": true
 }
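
The SFT recipe gets the same fp16-to-bf16 switch plus the new convert_from_hf / save_to_hf flags, which presumably toggle conversion between the Hugging Face and Paddle checkpoint layouts (the diff itself does not spell this out). A short comparison sketch of the two updated recipes, assuming both files are read from the repo root:

    import json

    with open("examples/config/qwen/lora_argument_qwen2_0p5b.json") as f:
        lora = json.load(f)
    with open("examples/config/qwen/sft_argument_qwen2_0p5b.json") as f:
        sft = json.load(f)

    # Keys present only in the LoRA recipe (expected to include "lora", "pissa", "use_mora").
    print(sorted(set(lora) - set(sft)))
    # Both recipes now agree on precision and attention settings.
    print(lora["bf16"], lora["use_flash_attention"], sft["bf16"], sft["use_flash_attention"])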

examples/run_finetune.py

Lines changed: 8 additions & 0 deletions
@@ -43,6 +43,10 @@
     Qwen2ForCausalLMPipe,
     Qwen2MoeForCausalLM,
     Qwen2MoeForCausalLMPipe,
+    Qwen3ForCausalLM,
+    Qwen3ForCausalLMPipe,
+    Qwen3MoeForCausalLM,
+    Qwen3MoeForCausalLMPipe,
 )
 from paddleformers.transformers.configuration_utils import LlmMetaConfig
 from paddleformers.trl import DataConfig, ModelConfig, SFTConfig, SFTTrainer
@@ -67,6 +71,10 @@
     Qwen2ForCausalLMPipe,
     Qwen2MoeForCausalLM,
     Qwen2MoeForCausalLMPipe,
+    Qwen3ForCausalLM,
+    Qwen3ForCausalLMPipe,
+    Qwen3MoeForCausalLM,
+    Qwen3MoeForCausalLMPipe,
 ]
 
 
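
run_finetune.py now imports the dense and MoE Qwen3 classes and appends them to the same support list that already holds the Qwen2 families (the list's name falls outside this excerpt), so Qwen3 checkpoints go through the same entry point. A hedged sketch of using the new classes directly; the checkpoint id, the dtype keyword, and the Pipe-selection logic are illustrative rather than quoted from the script:

    from paddleformers.transformers import Qwen3ForCausalLM, Qwen3ForCausalLMPipe

    pipeline_parallel_degree = 1  # normally read from the JSON config
    # The *Pipe variant is only needed when pipeline parallelism is enabled.
    model_cls = Qwen3ForCausalLMPipe if pipeline_parallel_degree > 1 else Qwen3ForCausalLM
    model = model_cls.from_pretrained("Qwen/Qwen3-0.6B", dtype="bfloat16")  # illustrative checkpoint id
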
paddleformers/nn/activation.py

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ def __getitem__(self, key):
 
 
 ACT2CLS = {
+    "gelu": nn.GELU,
     "relu": nn.ReLU,
     "relu6": nn.ReLU6,
     "sigmoid": nn.Sigmoid,

paddleformers/nn/pp_model.py

Lines changed: 1 addition & 1 deletion
@@ -181,7 +181,7 @@ class RotaryEmbedding(nn.Layer):
     def __init__(self, config):
         super().__init__()
         self.config = config
-        self.head_dim = config.head_dim
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
         self.base = config.rope_theta
 
     def forward(self, x, position_ids):
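
The RotaryEmbedding change matters because some newer configs (Qwen3, for instance) carry an explicit head_dim that need not equal hidden_size // num_attention_heads, while older configs omit the field entirely. A tiny illustration of the fallback, using stand-in objects rather than real model configs:

    from types import SimpleNamespace

    def resolve_head_dim(config):
        # Mirrors the new line in RotaryEmbedding.__init__: prefer an explicit
        # head_dim, otherwise derive it from hidden_size and num_attention_heads.
        return getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)

    print(resolve_head_dim(SimpleNamespace(hidden_size=1024, num_attention_heads=16)))                # 64
    print(resolve_head_dim(SimpleNamespace(hidden_size=1024, num_attention_heads=16, head_dim=128)))  # 128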

paddleformers/transformers/__init__.py

Lines changed: 4 additions & 4 deletions
@@ -291,41 +291,41 @@
         "Qwen2Model",
         "Qwen2PretrainedModel",
         "Qwen2ForCausalLM",
+        "Qwen2ForCausalLMPipe",
         "Qwen2PretrainingCriterion",
         "Qwen2ForSequenceClassification",
         "Qwen2ForTokenClassification",
         "Qwen2SentenceEmbedding",
     ],
-    "qwen2.modeling_pp": ["Qwen2ForCausalLMPipe"],
     "qwen2.tokenizer": ["Qwen2Tokenizer"],
     "qwen2.tokenizer_fast": ["Qwen2TokenizerFast"],
     "qwen2_moe.configuration": ["Qwen2MoeConfig"],
     "qwen2_moe.modeling": [
         "Qwen2MoeModel",
         "Qwen2MoePretrainedModel",
         "Qwen2MoeForCausalLM",
+        "Qwen2MoeForCausalLMPipe",
         "Qwen2MoePretrainingCriterion",
     ],
-    "qwen2_moe.modeling_pp": ["Qwen2MoeForCausalLMPipe"],
     "qwen3.configuration": ["Qwen3Config"],
     "qwen3.modeling": [
         "Qwen3Model",
         "Qwen3PretrainedModel",
         "Qwen3ForCausalLM",
+        "Qwen3ForCausalLMPipe",
         "Qwen3PretrainingCriterion",
         "Qwen3ForSequenceClassification",
         "Qwen3ForTokenClassification",
         "Qwen3SentenceEmbedding",
     ],
-    "qwen3.modeling_pp": ["Qwen3ForCausalLMPipe"],
     "qwen3_moe.configuration": ["Qwen3MoeConfig"],
     "qwen3_moe.modeling": [
         "Qwen3MoeModel",
         "Qwen3MoePretrainedModel",
         "Qwen3MoeForCausalLM",
+        "Qwen3MoeForCausalLMPipe",
         "Qwen3MoePretrainingCriterion",
     ],
-    "qwen3_moe.modeling_pp": ["Qwen3MoeForCausalLMPipe"],
     "ernie4_5vl.tokenizer": ["Ernie4_5_VLTokenizer"],
     "ernie4_5vl": [],
     "bert": [],

paddleformers/transformers/configuration_utils.py

Lines changed: 9 additions & 0 deletions
@@ -507,6 +507,10 @@ class PretrainedConfig:
             If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token.
         sep_token_id (`int`, *optional*): The id of the _separation_ token.
 
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
+            model has a output word embedding layer.
+
         dtype (`str`, *optional*):
             The `dtype` of the weights. This attribute can be used to initialize the model to a non-default `dtype`
             (which is normally `float32`) and thus allow for optimal storage allocation. For example, if the saved
@@ -569,8 +573,10 @@ def __init__(self, **kwargs):
         self.output_hidden_states = kwargs.pop("output_hidden_states", False)
         self.output_attentions = kwargs.pop("output_attentions", False)
         self.use_cache = kwargs.pop("use_cache", False)
+        self.tie_word_embeddings = kwargs.pop("tie_word_embeddings", True)
 
         # for transformers fuse
+        self.fuse_linear = kwargs.pop("fuse_linear", False)
         self.fuse_attention_qkv = kwargs.pop("fuse_attention_qkv", False)
         self.fuse_attention_ffn = kwargs.pop("fuse_attention_ffn", False)
 
@@ -623,6 +629,9 @@ def __init__(self, **kwargs):
 
         self.classifier_dropout = kwargs.pop("classifier_dropout", None)
 
+        self.dpo_config = kwargs.pop("dpo_config", None)
+        self.kto_config = kwargs.pop("kto_config", None)
+
         # Tokenizer arguments TODO: eventually tokenizer and models should share the same config
         self.tokenizer_class = kwargs.pop("tokenizer_class", None)
         self.prefix = kwargs.pop("prefix", None)
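
PretrainedConfig now accepts tie_word_embeddings (defaulting to True), fuse_linear, dpo_config, and kto_config as ordinary keyword arguments, each popped with a default so existing configs keep working. A short sketch with illustrative values:

    from paddleformers.transformers.configuration_utils import PretrainedConfig

    cfg = PretrainedConfig(tie_word_embeddings=False, fuse_linear=True)
    print(cfg.tie_word_embeddings, cfg.fuse_linear)  # False True
    print(cfg.dpo_config, cfg.kto_config)            # None None (defaults)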
