
Commit c861b78

cheng221 and llbdyiu66 authored
Add ernie4 5 moe (#2520)
Co-authored-by: llbdyiu66 <[email protected]>
1 parent c239ca8 commit c861b78

30 files changed: 6,800 additions, 454 deletions

paddleformers/generation/utils.py

Lines changed: 77 additions & 50 deletions
@@ -20,13 +20,11 @@
 
 import paddle
 import paddle.distributed as dist
-import paddle.nn as nn
 import paddle.nn.functional as F
 from paddle import Tensor
-from paddle.common_ops_import import convert_dtype
 from paddle.utils import map_structure
 
-from ..transformers.model_outputs import ModelOutput
+from ..transformers.model_outputs import CausalLMOutputWithPast, ModelOutput
 from ..transformers.utils import get_scale_by_dtype
 from ..utils.log import logger
 from ..utils.masking_utils import _expand_2d_mask, _make_causal_mask
@@ -493,61 +491,38 @@ def expand_inputs_for_generation(input_ids, expand_size, attention_mask=None, **
 
     @staticmethod
     def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False):
-        # Update the model inputs during generation.
-        # Note that If `token_type_ids` and `attention_mask` in `model_kwargs`
-        # and they contain pad value, the result vectors updated by this method
-        # may be different from expected. In this case, you need to rewrite the
-        # method.
+        """
+        Updates model kwargs for generation.
+
+        Args:
+            outputs (Any): Model outputs.
+            model_kwargs (dict): Current model kwargs.
+            is_encoder_decoder (bool): Whether using encoder-decoder architecture.
 
+        Returns:
+            dict: Updated model kwargs.
+        """
         # update cache
         if isinstance(outputs, tuple) and len(outputs) > 1 and not isinstance(outputs[1], paddle.Tensor):
-            model_kwargs["cache"] = outputs[1]
             model_kwargs["past_key_values"] = outputs[1]
 
-        if isinstance(outputs, ModelOutput) and "past_key_values" in outputs:
-            model_kwargs["cache"] = outputs.past_key_values
+        if isinstance(outputs, CausalLMOutputWithPast) and "past_key_values" in outputs:
             model_kwargs["past_key_values"] = outputs.past_key_values
 
         # update token_type_ids with last value
         if "token_type_ids" in model_kwargs and model_kwargs["token_type_ids"] is not None:
             token_type_ids = model_kwargs["token_type_ids"]
             model_kwargs["token_type_ids"] = paddle.concat([token_type_ids, token_type_ids[:, -1:]], axis=-1)
-
-        # update position_ids
-        if "position_ids" in model_kwargs and model_kwargs["position_ids"] is not None:
-            position_ids = model_kwargs["position_ids"]
-            model_kwargs["position_ids"] = paddle.concat([position_ids, position_ids[..., -1:] + 1], axis=-1)
-
-        # update attention_mask
-        if not is_encoder_decoder and "attention_mask" in model_kwargs:
+        if not is_encoder_decoder and model_kwargs.get("attention_mask", None) is not None:
+            # update attention mask
             attention_mask = model_kwargs["attention_mask"]
-            # nn.Pad2D don't support the data type `bool`
-            if convert_dtype(attention_mask.dtype) == "bool":
-                attention_mask = paddle.cast(attention_mask, "int64")
-            if len(attention_mask.shape) == 4:
-                cur_device = paddle.get_device()
-                if cur_device.split(":")[0] == "npu":
-                    attention_mask = nn.Pad2D([0, 0, 0, 1], mode="constant")(attention_mask)
-                    attention_mask = nn.Pad2D([0, 1, 0, 0], value=0)(attention_mask)
-                else:
-                    attention_mask = nn.Pad2D([0, 0, 0, 1], mode="replicate")(attention_mask)
-                    attention_mask = nn.Pad2D([0, 1, 0, 0], value=get_scale_by_dtype(return_positive=False))(
-                        attention_mask
-                    )
-
-                dtype = convert_dtype(attention_mask.dtype)
-                if "int" in dtype:
-                    attention_mask[:, :, -1, -1] = 1
-                elif "float" in dtype:
-                    attention_mask[:, :, -1, -1] = 0.0
-                else:
-                    raise ValueError("The data type of input `attention_mask` must be bool, int or float")
-            else:
-                attention_mask = paddle.concat(
-                    [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype="int64")], axis=-1
-                )
-            model_kwargs["attention_mask"] = attention_mask
-
+            model_kwargs["attention_mask"] = paddle.concat(
+                [
+                    attention_mask,
+                    paddle.ones([attention_mask.shape[0], 1], dtype=attention_mask.dtype),
+                ],
+                axis=-1,
+            )
         # update role_ids
         if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None:
             role_ids = model_kwargs["role_ids"]
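Note: the rewrite above drops the position_ids update and the 4-D Pad2D/NPU mask handling, so a 2-D attention mask now simply grows by one column per decode step, in its own dtype. A minimal standalone sketch of that behavior (not part of the commit; shapes are illustrative):

import paddle

# [batch_size, seq_len] mask after the prompt; dtype is preserved as-is
attention_mask = paddle.ones([2, 5], dtype="int64")

# one decode step: append a column of ones, as the new code does
attention_mask = paddle.concat(
    [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype=attention_mask.dtype)],
    axis=-1,
)
print(attention_mask.shape)  # [2, 6] -- one new column per generated token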
@@ -611,11 +586,63 @@ def get_decoder_start_token_id(self, decoder_start_token_id=None, bos_token_id=N
                 "`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation."
             )
 
-    def prepare_inputs_for_generation(self, input_ids, **kwargs):
-        # Implement in subclasses for custom behavior to prepare inputs in the
-        # generate method.
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        use_cache=True,
+        past_key_values=None,
+        inputs_embeds=None,
+        **kwargs,
+    ):
+        """Prepares model inputs for generation in PaddlePaddle models.
+
+        Args:
+            input_ids (paddle.Tensor):
+                The input token IDs with shape [batch_size, sequence_length].
+            use_cache (bool, optional):
+                Whether to use cached key-value states for faster generation.
+                Defaults to True.
+            past_key_values (Optional[Tuple[paddle.Tensor]]):
+                Cached past key-value states from previous generation steps.
+                If provided, the input_ids will be truncated to only keep the last token.
+            inputs_embeds (Optional[paddle.Tensor]):
+                Precomputed embeddings instead of token IDs.
+                Only used in the first generation step when past_key_values is None.
+            **kwargs:
+                Additional keyword arguments including:
+                - attention_mask (paddle.Tensor): Attention mask tensor
+
+        Returns:
+            Dict[str, Union[paddle.Tensor, bool, Dict]]:
+                A dictionary containing:
+                - "input_ids" or "inputs_embeds": The main input tensors
+                - "past_key_values": The cached key-value states
+                - "use_cache": Flag indicating whether to use caching
+                - "attention_mask": The attention mask tensor (if provided)
+                - "return_dict": Always set to True for consistent output format
+
+        """
+        if past_key_values:
+            input_ids = input_ids[:, -1:]
+
+        attention_mask = kwargs.get("attention_mask", None)
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update(
+            {
+                "past_key_values": past_key_values,
+                "use_cache": use_cache,
+                "attention_mask": attention_mask,
+                "return_dict": True,
+            }
+        )
 
-        return {"input_ids": input_ids}
+        return model_inputs
 
     def adjust_logits_during_generation(self, logits):
         # Implement in subclasses for custom behavior to adjust the logits in
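Note: the two hooks above form the per-step contract of the decode loop. A sketch of how a caller would wire them up (assumptions: `model` is a paddleformers causal LM exposing these hooks and returning a CausalLMOutputWithPast when return_dict=True; `greedy_decode_step` is an illustrative name, not from the commit):

import paddle

def greedy_decode_step(model, input_ids, model_kwargs):
    # build this step's inputs; past_key_values/attention_mask ride in model_kwargs
    inputs = model.prepare_inputs_for_generation(input_ids, **model_kwargs)
    outputs = model(**inputs)  # return_dict=True, so outputs.logits is available
    next_token = paddle.argmax(outputs.logits[:, -1, :], axis=-1, keepdim=True)
    input_ids = paddle.concat([input_ids, next_token], axis=-1)
    # roll the cache and masks forward for the next step
    model_kwargs = model.update_model_kwargs_for_generation(outputs, model_kwargs)
    return input_ids, model_kwargs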

paddleformers/nn/__init__.py

Lines changed: 29 additions & 0 deletions
@@ -35,13 +35,40 @@
         "sft_postprocess_loss",
         "sft_loss_forward",
     ],
+    "moe.abstract": ["MOELayerBase"],
+    "moe.all_gather": ["allgather_async", "reduce_scatter_async", "AlltoAllSmart", "AllGatherAsync"],
+    "moe.all_to_all": ["AlltoAll", "AlltoAllAsync"],
+    "moe.moe_allgather_layer": ["ReshardCombineWeight", "MOEAllGatherLayerV2"],
+    "moe.moe_alltoall_layer": ["GateCombine", "combining"],
+    "moe.moe_block": ["create_moe_block", "MoEStatics"],
+    "moe.top_gate": [
+        "masked_fill",
+        "compute_optimal_transport",
+        "cast_if_needed",
+        "FusedGateDetachMatmul",
+        "gate_detach_matmul",
+        "TopKGate",
+    ],
+    "moe.utils": [
+        "ReduceScatterGroupOp",
+        "AllGatherGroupOp",
+        "get_async_loader",
+        "hack_offload_wait",
+        "all_gather_group",
+        "reduce_scatter_group",
+        "detach_and_requires_grad_",
+        "FakeClone",
+        "manual_backward",
+        "_parse_moe_group",
+    ],
     "activation": ["ACT2FN", "ClassInstantier", "ACT2CLS"],
     "embedding": ["Embedding"],
    "general": ["GeneralInterface"],
     "linear": ["Linear"],
     "lm_head": ["LMHead"],
     "mlp": ["MLP"],
     "norm": ["Norm", "LayerNorm", "RMSNorm"],
+    "pp_model": ["GeneralModelForCausalLMPipe"],
 }
 
 if TYPE_CHECKING:
@@ -53,7 +80,9 @@
     from .linear import *
     from .lm_head import *
     from .mlp import *
+    from .moe import *
     from .norm import *
+    from .pp_model import *
 else:
     sys.modules[__name__] = _LazyModule(
         __name__,
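Note: the new `import_structure` entries feed `_LazyModule`, so `import paddleformers.nn` stays cheap and the MoE submodules are only imported when one of their names is first accessed. A minimal sketch of that pattern (not the actual `_LazyModule` implementation) using a PEP 562 module-level `__getattr__`:

import importlib

# illustrative subset of the table above
_import_structure = {"moe.top_gate": ["TopKGate"], "pp_model": ["GeneralModelForCausalLMPipe"]}
_name_to_module = {name: mod for mod, names in _import_structure.items() for name in names}

def __getattr__(name):
    # resolve e.g. TopKGate -> .moe.top_gate on first access, then delegate
    module = importlib.import_module("." + _name_to_module[name], __package__)
    return getattr(module, name)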

paddleformers/nn/criterion/interface.py

Lines changed: 12 additions & 8 deletions
@@ -21,12 +21,17 @@
 from ..general import GeneralInterface
 from .dpo_loss import dpo_loss_forward
 from .kto_loss import kto_loss_forward
-from .sft_loss import sft_loss_forward
+from .sft_loss import mtp_sft_loss_forward, sft_loss_forward
 
 
 class LossInterface(GeneralInterface):
 
-    _global_mapping = {"sft": sft_loss_forward, "dpo": dpo_loss_forward, "kto": kto_loss_forward}
+    _global_mapping = {
+        "sft": sft_loss_forward,
+        "dpo": dpo_loss_forward,
+        "kto": kto_loss_forward,
+        "mtp_sft": mtp_sft_loss_forward,
+    }
 
 
 ALL_LOSS_FUNCTIONS = LossInterface()
@@ -40,16 +45,12 @@ def __init__(self, config, return_tuple=True, ignore_eos_token=False, use_infohu
         self.kto_config = copy.deepcopy(config.get("kto_config", None))
         self.ignored_index = getattr(config, "ignored_index", -100)
         self.use_filtered_label_loss = config.get("use_filtered_label_loss", False)
-        self.loss_subbatch_seqlen = config.get(
-            "loss_subbatch_seqlen", -1
-        )  # splitting is controlled by loss_subbatch_seqlen; enabled only when loss_subbatch_seqlen > 0
+        self.loss_subbatch_seqlen = config.get("loss_subbatch_seqlen", -1)
         self.use_subbatch = self.loss_subbatch_seqlen > 0
         self.sequence_parallel = config.get("sequence_parallel", False)
         self.tensor_parallel = config.tensor_parallel_degree > 1
         self.use_fused_head_and_loss_fn = config.get("use_fused_head_and_loss_fn", False)
-        self.enable_parallel_cross_entropy = (
-            config.tensor_parallel_degree > 1 and config.tensor_parallel_output
-        )  # when loss is computed in parallel, use_fused_head_and_loss_fn = False
+        self.enable_parallel_cross_entropy = config.tensor_parallel_degree > 1 and config.tensor_parallel_output
         logger.info(
             f"loss_subbatch_seqlen: {self.loss_subbatch_seqlen} , use_fused_head_and_loss_fn: {self.use_fused_head_and_loss_fn}, use_filtered_label_loss: {self.use_filtered_label_loss}"
         )
@@ -80,6 +81,9 @@ def __init__(self, config, return_tuple=True, ignore_eos_token=False, use_infohu
         else:
             loss_type = "sft"
 
+        if config.get("num_nextn_predict_layers", 0) > 0:
+            loss_type = "mtp_sft"
+
         self.loss_foward_fn = ALL_LOSS_FUNCTIONS.get(loss_type)
         self.loss_type = loss_type
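Note: with the registry change above, loss selection stays a plain name-to-callable lookup; the only new wrinkle is that `num_nextn_predict_layers > 0` reroutes "sft" to "mtp_sft" before the lookup. A sketch (the dict-style `config` literal is an illustrative stand-in for the model config):

from paddleformers.nn.criterion.interface import ALL_LOSS_FUNCTIONS

config = {"num_nextn_predict_layers": 1}  # e.g. ERNIE 4.5 MoE with one MTP head
loss_type = "mtp_sft" if config.get("num_nextn_predict_layers", 0) > 0 else "sft"
loss_forward = ALL_LOSS_FUNCTIONS.get(loss_type)  # -> mtp_sft_loss_forward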

paddleformers/nn/criterion/sft_loss.py

Lines changed: 57 additions & 4 deletions
@@ -61,7 +61,6 @@ def sft_loss_forward(
     logits, labels, hidden_states, lm_head_weight, lm_head_bias, transpose_y = sft_preprocess_inputs(
         self, logits, labels
     )
-
     if self.use_filtered_label_loss:
         if self.tensor_parallel and self.sequence_parallel and logits is None:
             masked_lm_labels, sparse_label_idx = sequence_parallel_sparse_mask_labels(labels, self.ignored_index)
@@ -70,21 +69,21 @@
             hidden_states = paddle.gather(hidden_states, sparse_label_idx, axis=0)
             hidden_states = AllGatherVarlenOp.apply(hidden_states)
         else:
-            masked_lm_labels = masked_lm_labels.flatten()
+            masked_lm_labels = labels.flatten()
             sparse_label_idx = paddle.nonzero(masked_lm_labels != self.ignored_index).flatten()
             masked_lm_labels = paddle.take_along_axis(masked_lm_labels, sparse_label_idx, axis=0)
             if hidden_states is not None:
                 hidden_states = hidden_states.reshape([-1, hidden_states.shape[-1]])
                 hidden_states = paddle.take_along_axis(hidden_states, sparse_label_idx.reshape([-1, 1]), axis=0)
             if logits is not None:
                 logits = paddle.gather(logits, sparse_label_idx, axis=1)
+        labels = masked_lm_labels
     else:
         if self.sequence_parallel:
             if hidden_states is not None:
                 hidden_states = AllGatherOp.apply(hidden_states)
 
-        masked_lm_labels = labels
-
+        masked_lm_labels = labels
     # bsz,seq_len,hidden_size or seq_len,hidden_size
     seq_len = masked_lm_labels.shape[1] if masked_lm_labels.ndim == 2 else masked_lm_labels.shape[0]
     if self.use_fused_head_and_loss_fn and self.use_subbatch and seq_len > self.loss_subbatch_seqlen:
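Note: the filtered-label path above keeps only the positions whose label is not `ignored_index`, shrinking both the labels and the hidden states before the head/loss computation. A standalone sketch of that gather (shapes are illustrative):

import paddle

ignored_index = -100
labels = paddle.to_tensor([-100, 5, -100, 7])              # flattened labels, shape [4]
idx = paddle.nonzero(labels != ignored_index).flatten()    # kept positions: [1, 3]
kept_labels = paddle.take_along_axis(labels, idx, axis=0)  # [5, 7]

hidden_states = paddle.rand([4, 8])                        # [seq_len, hidden_size]
kept_hidden = paddle.take_along_axis(hidden_states, idx.reshape([-1, 1]), axis=0)  # [2, 8]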
@@ -145,3 +144,57 @@ def sft_loss_forward(
         masked_lm_loss = self.loss_func(logits, labels.unsqueeze(-1))
     loss = sft_postprocess_loss(self, masked_lm_loss, labels, loss_mask, **kwargs)
     return loss
+
+
+def mtp_sft_loss_forward(
+    self: nn.Layer,
+    logits: Union[paddle.Tensor, Tuple[paddle.Tensor]],
+    labels: Union[paddle.Tensor, Tuple[paddle.Tensor]],
+    loss_mask: paddle.Tensor = None,
+    router_loss: paddle.Tensor = None,
+    mtp_logits: paddle.Tensor = None,
+    **kwargs
+):
+    num_nextn_predict_layers = self.config.get("num_nextn_predict_layers", 0)
+    multi_token_pred_lambda = self.config.get("multi_token_pred_lambda", 0.3)
+    if num_nextn_predict_layers > 0:
+        labels_ori = labels
+        labels = labels[:, :-num_nextn_predict_layers]
+        if loss_mask is not None:
+            loss_mask = loss_mask[:, :-num_nextn_predict_layers]
+        seq_length = labels.shape[1]
+
+    sft_loss = sft_loss_forward(self, logits, labels, loss_mask, **kwargs)
+
+    if num_nextn_predict_layers > 0:
+        mtp_loss_res = []
+        for depth in range(num_nextn_predict_layers):
+            logits_cur_depth = mtp_logits[depth]
+            labels_cur_depth = labels_ori[:, (depth + 1) : (depth + 1 + seq_length)]
+            res_cur_depth = sft_loss_forward(self, logits_cur_depth, labels_cur_depth, loss_mask)
+            mtp_loss_res.append(res_cur_depth)
+
+    def add_loss(main_loss, loss):
+        return main_loss + loss - loss.detach()
+
+    if self.return_tuple:
+        loss, loss_sum = sft_loss
+    else:
+        loss, loss_sum = sft_loss, None
+
+    if num_nextn_predict_layers > 0:
+        loss = add_loss(
+            loss,
+            multi_token_pred_lambda * sum([x[0] for x in mtp_loss_res]) / len(mtp_loss_res),
+        )
+
+        if loss_sum is not None:
+            loss_sum = loss_sum + multi_token_pred_lambda * sum([x[1].detach() for x in mtp_loss_res]) / len(mtp_loss_res)
+
+    if router_loss is not None and isinstance(router_loss, paddle.Tensor):
+        loss = loss + router_loss - router_loss.detach()
+
+    if self.return_tuple:
+        return loss, loss_sum
+    else:
+        return loss
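Note: `add_loss` and the router-loss line both use the same `x + aux - aux.detach()` trick: the scalar value of the loss is unchanged (the aux terms cancel numerically), but the auxiliary term still contributes gradients, since detach() only blocks backprop. A small self-contained check of that property:

import paddle

w = paddle.to_tensor([2.0], stop_gradient=False)
main = (3.0 * w).sum()  # value 6.0, d/dw = 3
aux = (w * w).sum()     # value 4.0, d/dw = 2w = 4
loss = main + aux - aux.detach()
print(float(loss))      # 6.0 -- reported value is main alone
loss.backward()
print(float(w.grad))    # 7.0 -- gradients flow from both main and aux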
