 from ..transformers.model_outputs import ModelOutput
 from ..transformers.utils import get_scale_by_dtype
 from ..utils.log import logger
+from ..utils.masking_utils import _expand_2d_mask, _make_causal_mask
+from ..utils.tools import get_env_device
 from .configuration_utils import DEFAULT_MAX_NEW_TOKENS, GenerationConfig
 from .logits_process import (
     ForcedBOSTokenLogitsProcessor,
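
For reference, the two masking helpers imported here are what the new `_prepare_decoder_attention_mask` below relies on. A minimal sketch of the shapes they are assumed to produce (an illustration only, not the actual `masking_utils` implementation):

```python
import paddle

def _make_causal_mask(input_shape, past_key_values_length):
    """Assumed: boolean causal mask of shape [bsz, 1, tgt_len, past_len + tgt_len]."""
    batch_size, target_length = input_shape
    mask = paddle.tril(paddle.ones((target_length, target_length), dtype="bool"))
    if past_key_values_length > 0:
        # positions already in the KV cache are always visible
        past = paddle.ones((target_length, past_key_values_length), dtype="bool")
        mask = paddle.concat([past, mask], axis=-1)
    return mask.unsqueeze([0, 1]).expand(
        [batch_size, 1, target_length, target_length + past_key_values_length]
    )

def _expand_2d_mask(mask, dtype, tgt_length):
    """Assumed: expand a [bsz, src_len] padding mask to a boolean [bsz, 1, tgt_len, src_len] mask."""
    batch_size, src_length = mask.shape[0], mask.shape[-1]
    mask = mask.unsqueeze([1, 2]).astype("bool")  # `dtype` kept for API parity, unused in this sketch
    return mask.expand([batch_size, 1, tgt_length, src_length])
```
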
@@ -339,13 +341,61 @@ def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id)
         is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or (
             (eos_token_id is not None) and (pad_token_id != eos_token_id)
         )
-        if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id:
-            attention_mask = (input_ids == pad_token_id).astype(paddle.get_default_dtype()) * get_scale_by_dtype(
-                return_positive=False
-            )
+        inputs_tensor = input_ids
+
+        # No information for attention mask inference -> return default attention mask
+        default_attention_mask = paddle.ones(input_ids.shape[:2], dtype=paddle.get_default_dtype())
+        if pad_token_id is None:
+            return default_attention_mask
+        can_infer_attention_mask = is_pad_token_in_inputs_ids * is_pad_token_not_equal_to_eos_token_id
+        attention_mask_from_padding = (inputs_tensor != pad_token_id).astype(paddle.get_default_dtype())
+
+        attention_mask = (
+            attention_mask_from_padding * can_infer_attention_mask + default_attention_mask * ~can_infer_attention_mask
+        )
+        return attention_mask
+
+    @staticmethod
+    def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length, dtype):
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            if len(attention_mask.shape) == 2:
+                expanded_attn_mask = _expand_2d_mask(attention_mask, dtype, tgt_length=input_shape[-1])
+                # For decoding phase in generation, seq_length = 1, we don't need to add causal mask
+                if input_shape[-1] > 1:
+                    combined_attention_mask = _make_causal_mask(
+                        input_shape, past_key_values_length=past_key_values_length
+                    )
+                    if get_env_device() in ["npu", "mlu", "intel_hpu"]:
+                        expanded_attn_mask = expanded_attn_mask.astype("bool") & combined_attention_mask.astype("bool")
+                    else:
+                        expanded_attn_mask = expanded_attn_mask & combined_attention_mask
+            # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len]
+            elif len(attention_mask.shape) == 3:
+                expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool")
+            # if attention_mask is already 4-D, do nothing
+            else:
+                expanded_attn_mask = attention_mask
+        else:
+            expanded_attn_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length)
+        # Convert bool attention_mask to float attention mask, which will be added to attention_scores later
+        if get_env_device() in ["npu", "mlu", "intel_hpu"]:
+            x = paddle.to_tensor(0.0, dtype="float32")
+            y = paddle.to_tensor(paddle.finfo(dtype).min, dtype="float32")
+            expanded_attn_mask = paddle.where(expanded_attn_mask.cast("bool"), x, y).astype(dtype)
+        elif get_env_device() == "xpu":
+            x = paddle.to_tensor(0.0, dtype="float32")
+            y = paddle.to_tensor(-1.7005809656952787e38, dtype="float32")
+            expanded_attn_mask = paddle.where(expanded_attn_mask.cast("bool"), x, y)
+        elif get_env_device() == "gcu":
+            min_val = paddle.finfo(dtype).min
+            x = paddle.to_tensor(0.0, dtype=dtype)
+            y = paddle.to_tensor(min_val, dtype=dtype)
+            expanded_attn_mask = paddle.where(expanded_attn_mask.cast("bool"), x, y).astype(dtype)
         else:
-            attention_mask = paddle.zeros_like(input_ids, dtype=paddle.get_default_dtype())
-        return paddle.unsqueeze(attention_mask, axis=[1, 2])
+            expanded_attn_mask = paddle.where(expanded_attn_mask.cast("bool"), 0.0, paddle.finfo(dtype).min)
+            expanded_attn_mask = expanded_attn_mask.astype(dtype)
+        return expanded_attn_mask
 
     @staticmethod
     def prepare_seq_len_for_generation(input_ids, pad_token_id, eos_token_id):
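
To make the new behavior concrete, here is a self-contained sketch of what the rewritten mask preparation computes for a left-padded batch. It reproduces the logic inline rather than importing PaddleNLP, and the tensor values are illustrative:

```python
import paddle

pad_token_id, eos_token_id = 0, 2
input_ids = paddle.to_tensor([[0, 0, 5, 6, 7],
                              [3, 4, 5, 6, 7]])

# prepare_attention_mask_for_generation now returns a 2-D "keep" mask
# (1.0 = real token, 0.0 = padding) instead of the old 4-D additive mask.
is_pad_in_inputs = paddle.any(input_ids == pad_token_id).item()
pad_is_not_eos = (eos_token_id is None) or (pad_token_id != eos_token_id)
padding_mask = (input_ids != pad_token_id).astype(paddle.get_default_dtype())
default_mask = paddle.ones(input_ids.shape[:2], dtype=paddle.get_default_dtype())
attention_mask = padding_mask if (is_pad_in_inputs and pad_is_not_eos) else default_mask
# [[0., 0., 1., 1., 1.],
#  [1., 1., 1., 1., 1.]]

# _prepare_decoder_attention_mask then combines this 2-D mask with a causal
# mask and turns it into an additive float mask; on the default device path
# the conversion is essentially:
seq_len = input_ids.shape[-1]
expanded = attention_mask.unsqueeze([1, 2]).astype("bool")           # [bsz, 1, 1, src_len]
causal = paddle.tril(paddle.ones((seq_len, seq_len), dtype="bool"))  # [tgt_len, src_len]
combined = paddle.logical_and(expanded, causal)                      # [bsz, 1, tgt_len, src_len]
additive = paddle.where(combined, 0.0, paddle.finfo(paddle.float32).min).astype("float32")
# 0.0 where attention is allowed, dtype-min where masked, ready to be added to attention scores
```
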
@@ -853,12 +903,8 @@ def generate(
                 bos_token_id, encoder_output=model_kwargs["inputs_embeds"]
             )
 
-        if model_kwargs.get("attention_mask", None) is None:
-            # TODO
-            # Init `attention_mask` depending on `pad_token_id`
-            model_kwargs["attention_mask"] = self.prepare_attention_mask_for_generation(
-                input_ids, pad_token_id, eos_token_id
-            )
+        kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None
+        accepts_attention_mask = "attention_mask" in set(inspect.signature(self.forward).parameters.keys())
         self.is_encoder_decoder = self.config.is_encoder_decoder
 
         if self.is_encoder_decoder:
@@ -880,6 +926,11 @@ def generate(
 
         pad_token_id = self.set_pad_token_id(pad_token_id, eos_token_id)
 
+        if not kwargs_has_attention_mask and accepts_attention_mask:
+            model_kwargs["attention_mask"] = self.prepare_attention_mask_for_generation(
+                input_ids, pad_token_id, eos_token_id
+            )
+
         if generation_config.max_length != 0 and generation_config.max_new_tokens == DEFAULT_MAX_NEW_TOKENS:
             logger.warning("`max_length` will be deprecated in future releases, use `max_new_tokens` instead.")
             generation_config.max_new_tokens = generation_config.max_length
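
The `inspect.signature` gate added above means a default mask is only built when it is missing from `model_kwargs` and the model's `forward` can actually accept one. A toy illustration of that check (hypothetical classes, not PaddleNLP models):

```python
import inspect

class WithMask:
    def forward(self, input_ids, attention_mask=None, **kwargs):
        ...

class WithoutMask:
    def forward(self, input_ids, **kwargs):
        ...

def accepts_attention_mask(model):
    return "attention_mask" in set(inspect.signature(model.forward).parameters.keys())

print(accepts_attention_mask(WithMask()))     # True  -> generate() may infer a mask
print(accepts_attention_mask(WithoutMask()))  # False -> generate() skips mask creation
```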