@@ -443,7 +443,17 @@ class Ernie4_5_MoePretrainedModel(PretrainedModel):
     config_class = Ernie4_5_MoeConfig
     base_model_prefix = "model"
     _keep_in_fp32_modules = ["mlp.gate.weight", "e_score_correction_bias"]
-    transpose_weight_keys = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "gate"]
+    transpose_weight_keys = [
+        "q_proj",
+        "k_proj",
+        "v_proj",
+        "o_proj",
+        "gate_proj",
+        "up_proj",
+        "down_proj",
+        "gate",
+        "mtp_linear_proj.0",
+    ]

     @classmethod
     def _get_tensor_parallel_mappings(cls, config, is_split=True):
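The new `"mtp_linear_proj.0"` entry makes the first MTP projection weight participate in the same transpose handling as the attention and MLP projections. As a rough, hypothetical sketch of how such a key list is typically consumed during checkpoint conversion (the real logic lives elsewhere in the codebase; `maybe_transpose` is an illustrative name, not the repo's API):

```python
import numpy as np

# Hypothetical helper: transpose any 2-D weight whose parameter name matches
# one of the keys in transpose_weight_keys (names mirror the class attribute).
transpose_weight_keys = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj", "gate",
    "mtp_linear_proj.0",
]

def maybe_transpose(name, tensor):
    if tensor.ndim == 2 and any(key in name for key in transpose_weight_keys):
        return tensor.T
    return tensor

w = np.zeros((128, 64))
print(maybe_transpose("model.mtp_linear_proj.0.weight", w).shape)  # (64, 128)
```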
@@ -659,16 +669,18 @@ def __init__(self, config: Ernie4_5_MoeConfig):
         self.mtp_linear_proj = paddle.nn.LayerList(
             [
                 GeneralLinear.create(
-                    self.config.hidden_size * 2,
-                    self.config.hidden_size,
+                    config.hidden_size * 2,
+                    config.hidden_size,
                     has_bias=config.use_bias,
                     config=config,
                     fuse_matmul_bias=config.fuse_linear,
+                    linear_type="default",
                 )
-                for _ in range(self.config.num_nextn_predict_layers)
+                for _ in range(config.num_nextn_predict_layers)
             ]
         )
         if config.sequence_parallel:
+            logger.info("enable sequence parallel for mtp_linear")
             for mtp_linear in self.mtp_linear_proj:
                 mark_as_sequence_parallel_parameter(mtp_linear.weight)
                 if config.use_bias:
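Switching from `self.config` to the local `config` argument matches the rest of this block, which already reads `config.use_bias` and `config.fuse_linear`. Structurally, the construction amounts to `num_nextn_predict_layers` linear layers, each folding a concatenated pair of hidden states (`2 * hidden_size`) back to `hidden_size`. A minimal stand-in using plain `paddle.nn.Linear` in place of the repo's `GeneralLinear.create` factory (sizes are illustrative):

```python
import paddle

hidden_size = 64              # stand-in for config.hidden_size
num_nextn_predict_layers = 2  # stand-in for config.num_nextn_predict_layers

# One projection per future-token depth, each mapping 2 * hidden_size -> hidden_size.
mtp_linear_proj = paddle.nn.LayerList(
    [paddle.nn.Linear(hidden_size * 2, hidden_size) for _ in range(num_nextn_predict_layers)]
)

x = paddle.randn([4, hidden_size * 2])
print(mtp_linear_proj[0](x).shape)  # [4, 64]
```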
@@ -795,7 +807,7 @@ def forward(
                 attention_mask, inputs_embeds.shape[:2], kv_seq_len, inputs_embeds.dtype
             )

-        if self.config.num_nextn_predict_layers > 0:
+        if self.training and self.config.num_nextn_predict_layers > 0:
             inputs_embeds_extra = inputs_embeds[:, -self.config.num_nextn_predict_layers :, :]
             inputs_embeds = inputs_embeds[:, : -self.config.num_nextn_predict_layers, :]
             inputs_embeds_ori = inputs_embeds
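The guarded slicing splits the trailing `num_nextn_predict_layers` positions off for the MTP heads while the main trunk consumes the prefix. A minimal sketch with made-up shapes:

```python
import paddle

num_nextn_predict_layers = 2              # illustrative value
inputs_embeds = paddle.randn([1, 8, 16])  # [batch, seq_len, hidden]

# Last k positions feed the MTP heads; the trunk sees the remaining prefix.
inputs_embeds_extra = inputs_embeds[:, -num_nextn_predict_layers:, :]
inputs_embeds = inputs_embeds[:, :-num_nextn_predict_layers, :]
print(inputs_embeds_extra.shape)  # [1, 2, 16]
print(inputs_embeds.shape)        # [1, 6, 16]
```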
@@ -896,7 +908,7 @@ def forward(
             all_gate_logits = all_gate_logits + (gate_logits,)

         # Multi Token Prediction
-        if self.config.num_nextn_predict_layers > 0:
+        if self.training and self.config.num_nextn_predict_layers > 0:
             mtp_outputs.append(hidden_states)

             for depth in range(self.config.num_nextn_predict_layers):
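Both forward-pass hunks add the same `self.training` guard, so the MTP branch now runs only in train mode and is skipped at inference. `paddle.nn.Layer` carries that flag, toggled by `.train()` / `.eval()`; a tiny demonstration of the gating pattern (the `Toy` layer is hypothetical):

```python
import paddle

class Toy(paddle.nn.Layer):
    def forward(self, x):
        # Mirrors the PR's guard: the extra branch runs only while training.
        if self.training:
            return x * 2
        return x

m = Toy()
x = paddle.ones([1])
print(m(x))  # training mode by default -> branch taken
m.eval()     # sets self.training = False
print(m(x))  # branch skipped at inference
```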
@@ -1088,6 +1100,9 @@ def forward(
         Returns:
             Union[tuple, MoECausalLMOutputWithPast]: Model outputs.
         """
+        if kwargs.get("attn_mask_start_row_indices", None) is not None and attn_mask_startend_row_indices is None:
+            attn_mask_startend_row_indices = kwargs["attn_mask_start_row_indices"]
+
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
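The last hunk adds a backward-compatibility shim: callers still passing the older `attn_mask_start_row_indices` kwarg are mapped onto the renamed `attn_mask_startend_row_indices` parameter, which only takes effect when the new name was not supplied. A self-contained sketch of the same aliasing pattern (the bare `forward` function here is illustrative, not the model's actual signature):

```python
def forward(attn_mask_startend_row_indices=None, **kwargs):
    # Accept the legacy kwarg name and forward it to the new parameter.
    if kwargs.get("attn_mask_start_row_indices", None) is not None and attn_mask_startend_row_indices is None:
        attn_mask_startend_row_indices = kwargs["attn_mask_start_row_indices"]
    return attn_mask_startend_row_indices

print(forward(attn_mask_start_row_indices=[0, 3]))  # [0, 3], via the old name
```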