Commit ed81369

[GPT-OSS] Update the sliding_attention layer to use flashmask (#2606)
1 parent 7ca795a commit ed81369

9 files changed: 203 additions, 196 deletions


paddleformers/generation/utils.py

Lines changed: 61 additions & 8 deletions
@@ -63,6 +63,35 @@
 ]


+def _make_sliding_window_mask(input_shape, past_key_values_length=0, window_size=5):
+    """
+    Generate a sliding window mask that restricts each position to attend only to historical positions within the window.
+    Format: [bsz, 1, tgt_seq_len, src_seq_len], where True indicates allowed attention and False indicates masking.
+    """
+    batch_size, seq_length = input_shape
+    # Total sequence length = past sequence length + current sequence length (to build the complete mask)
+    total_length = past_key_values_length + seq_length
+
+    # Initialize the mask with all False values
+    mask = paddle.zeros((seq_length, total_length), dtype=paddle.bool)
+
+    for i in range(seq_length):
+        # Absolute position of the current token in the total sequence (including the past)
+        current_pos = past_key_values_length + i
+        # Window start position: max(0, current position - window size + 1)
+        start = max(0, current_pos - window_size + 1)
+        # Window end position: the current position (causal restriction, cannot attend beyond itself)
+        end = current_pos + 1  # the slice is half-open, so +1
+        # Mark the window range as True (attention allowed)
+        mask[i, start:end] = True
+
+    # Expand dimensions to [bsz, 1, tgt_seq_len, src_seq_len]
+    mask = mask.unsqueeze(0).unsqueeze(0)
+    # Tile across the batch dimension
+    mask = paddle.tile(mask, repeat_times=[batch_size, 1, 1, 1])
+    return mask
+
+
 def get_unfinished_flag(
     input_ids: Tensor, unfinished_flag: Tensor, eos_token_id: Union[int, list[int], list[list[int]]]
 ) -> Tensor:
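
For intuition, here is a small self-contained sketch that reimplements the same windowing logic and prints the pattern for a toy case (window_size=2, empty cache). It is illustrative only; the helper above in paddleformers/generation/utils.py is the authoritative version.

    # Toy check of the sliding-window pattern: each query row may attend to
    # itself and at most (window_size - 1) earlier positions.
    import paddle

    def make_window_mask(input_shape, past_key_values_length=0, window_size=5):
        batch_size, seq_length = input_shape
        total_length = past_key_values_length + seq_length
        mask = paddle.zeros((seq_length, total_length), dtype=paddle.bool)
        for i in range(seq_length):
            pos = past_key_values_length + i
            mask[i, max(0, pos - window_size + 1) : pos + 1] = True
        return paddle.tile(mask.unsqueeze(0).unsqueeze(0), repeat_times=[batch_size, 1, 1, 1])

    print(make_window_mask((1, 4), window_size=2)[0, 0].numpy())
    # [[ True False False False]
    #  [ True  True False False]
    #  [False  True  True False]
    #  [False False  True  True]]
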
@@ -354,29 +383,53 @@ def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id)
         return attention_mask

     @staticmethod
-    def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length, dtype):
+    def _prepare_decoder_attention_mask(
+        attention_mask, input_shape, past_key_values_length, dtype, sliding_window_size=None
+    ):
+        # Step 1: Process the input mask into the basic expanded mask
         if attention_mask is not None:
             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
             if len(attention_mask.shape) == 2:
                 expanded_attn_mask = _expand_2d_mask(attention_mask, dtype, tgt_length=input_shape[-1])
-                # For decoding phase in generation, seq_length = 1, we don't need to add causal mask
+                # Outside single-step decoding, combine the causal mask with the sliding window mask
                 if input_shape[-1] > 1:
-                    combined_attention_mask = _make_causal_mask(
-                        input_shape, past_key_values_length=past_key_values_length
-                    )
+                    # Basic causal mask (prevents attending to future positions)
+                    causal_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length)
+                    # Sliding window mask (limits the historical attention range)
+                    if sliding_window_size is not None and sliding_window_size > 0:
+                        window_mask = _make_sliding_window_mask(
+                            input_shape, past_key_values_length=past_key_values_length, window_size=sliding_window_size
+                        )
+                        # Intersect the sliding window mask with the causal mask (both restrictions must hold)
+                        combined_attention_mask = causal_mask & window_mask
+                    else:
+                        combined_attention_mask = (
+                            causal_mask  # Use the causal mask directly when the sliding window is disabled
+                        )
+
+                    # Combine with the user-provided mask (e.g., padding mask)
                     if get_env_device() in ["npu", "mlu", "intel_hpu"]:
                         expanded_attn_mask = expanded_attn_mask.astype("bool") & combined_attention_mask.astype("bool")
                     else:
                         expanded_attn_mask = expanded_attn_mask & combined_attention_mask
             # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len]
             elif len(attention_mask.shape) == 3:
                 expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool")
-            # if attention_mask is already 4-D, do nothing
+            # A 4-D mask is used directly
             else:
                 expanded_attn_mask = attention_mask
         else:
-            expanded_attn_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length)
-        # Convert bool attention_mask to float attention mask, which will be added to attention_scores later
+            # With no input mask, generate the causal mask plus the sliding window mask (if enabled)
+            causal_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length)
+            if sliding_window_size is not None and sliding_window_size > 0:
+                window_mask = _make_sliding_window_mask(
+                    input_shape, past_key_values_length=past_key_values_length, window_size=sliding_window_size
+                )
+                expanded_attn_mask = causal_mask & window_mask
+            else:
+                expanded_attn_mask = causal_mask  # Use the causal mask directly when the sliding window is disabled
+
+        # Step 2: Convert the boolean mask to a numerical mask (adapted per device)
         if get_env_device() in ["npu", "mlu", "intel_hpu"]:
             x = paddle.to_tensor(0.0, dtype="float32")
             y = paddle.to_tensor(paddle.finfo(dtype).min, dtype="float32")
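
Step 2 turns the combined boolean mask into an additive float mask, so masked positions contribute a large negative bias to the attention scores. A rough, hedged sketch of that conversion (the real method branches per device and uses the model dtype; paddle.where here is just one way to express the select):

    # Hedged sketch: True (keep) becomes 0, False (masked) becomes the dtype's
    # most negative value, which is later added to the attention scores.
    import paddle

    bool_mask = paddle.to_tensor([[True, False], [True, True]])
    neg_inf = paddle.finfo(paddle.float32).min
    additive_mask = paddle.where(
        bool_mask,
        paddle.zeros(bool_mask.shape, dtype="float32"),
        paddle.full(bool_mask.shape, neg_inf, dtype="float32"),
    )
    print(additive_mask.numpy())  # ~[[0.0, -3.4e38], [0.0, 0.0]]
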

paddleformers/nn/attention/eager_attention.py

Lines changed: 11 additions & 2 deletions
@@ -27,6 +27,7 @@ def eager_attention_forward(
     value: paddle.Tensor,
     attention_mask: Optional[paddle.Tensor] = None,
     dropout: float = 0.0,
+    sink: Optional[paddle.Tensor] = None,
     scaling: Optional[float] = None,
     is_causal: Optional[bool] = None,
     **kwargs,

@@ -45,8 +46,16 @@ def eager_attention_forward(
     if attention_mask is not None:
         causal_mask = attention_mask[:, :, :, : key.shape[-2]]
         attn_weights = attn_weights + causal_mask
-    attn_weights = nn.functional.softmax(attn_weights, axis=-1, dtype=paddle.float32).astype(query.dtype)
-    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+
+    if sink is not None:
+        sink = sink.reshape([1, -1, 1, 1]).expand([query.shape[0], -1, query.shape[-2], -1])
+        combined_logits = paddle.concat([attn_weights, sink], axis=-1)
+        probs = nn.functional.softmax(combined_logits, axis=-1, dtype=combined_logits.dtype)
+        scores = probs[..., :-1]  # we drop the sink here
+        attn_weights = nn.functional.dropout(scores, p=dropout, training=module.training)
+    else:
+        attn_weights = nn.functional.softmax(attn_weights, axis=-1, dtype=paddle.float32).astype(query.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

     attn_output = paddle.matmul(attn_weights, value)  # b h l l @ b h l d -> b h l d
     attn_output = attn_output.transpose([0, 2, 1, 3])  # b h l d -> b l h d
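
The sink branch above appends one learnable logit per head before the softmax and then discards that column, so each row of attention probabilities sums to less than one and the remainder is absorbed by the sink. A small numeric sketch of the effect (shapes and values are illustrative):

    # Toy demonstration of the sink-softmax: rows of `scores` sum to < 1
    # because the dropped sink column took part of the probability mass.
    import paddle
    import paddle.nn.functional as F

    attn_weights = paddle.randn([1, 2, 3, 3])  # [batch, heads, q_len, k_len]
    sink = paddle.zeros([2])                   # one logit per head
    sink = sink.reshape([1, -1, 1, 1]).expand([attn_weights.shape[0], -1, attn_weights.shape[-2], -1])

    combined_logits = paddle.concat([attn_weights, sink], axis=-1)
    probs = F.softmax(combined_logits, axis=-1)
    scores = probs[..., :-1]                   # drop the sink column, as above
    print(scores.sum(axis=-1))                 # every entry is strictly below 1.0
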

paddleformers/nn/attention/flashmask_attention.py

Lines changed: 3 additions & 2 deletions
@@ -36,7 +36,8 @@ def flashmask_attention_forward(
     # b,l,h,d
     if attn_mask_startend_row_indices is not None and attn_mask_startend_row_indices.ndim == 3:
         attn_mask_startend_row_indices = attn_mask_startend_row_indices.unsqueeze(-1)
-
+    if attn_mask_startend_row_indices is not None and attn_mask_startend_row_indices.shape[-1] == 1:
+        is_causal = True
     if sink is None:
         out = flashmask_attention(
             query,

@@ -54,7 +55,7 @@
             startend_row_indices=attn_mask_startend_row_indices,
             dropout_p=dropout,
             softmax_scale=scaling,
-            causal=is_causal,
+            causal=is_causal if is_causal is not None else False,
         )
         out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

paddleformers/nn/attention/sink_impl.py

Lines changed: 30 additions & 10 deletions
@@ -417,10 +417,10 @@ def backward(ctx, grad_output):
                 value_states,
                 raw_output,
                 lse_original,
-                dropout,
-                attention_mask,
-                causal,
-                scale,
+                dropout=dropout,
+                attention_mask=attention_mask,
+                causal=causal,
+                softmax_scale=scale,
             )
         else:
             grad_q_main, grad_k_repeated, grad_v_repeated = _flashmask_attention_backward_dispatch(

@@ -477,7 +477,16 @@ def backward(ctx, grad_output):
             )
             x = (g_ell.unsqueeze(-1) * query).to(query.dtype)
             _, grad_k_extra_repeated, _ = _flash_attention_backward_dispatch(
-                x, query, key_states, key_states, mu_k, lse_k, dropout, causal, scale
+                x,
+                query,
+                key_states,
+                key_states,
+                mu_k,
+                lse_k,
+                dropout=dropout,
+                attention_mask=attention_mask,
+                causal=causal,
+                softmax_scale=scale,
             )
         else:
             # Use FlashMask for computing mu_k

@@ -511,12 +520,23 @@ def backward(ctx, grad_output):
         # Combine main and extra gradients
         grad_q = grad_q_main + grad_q_extra
         grad_k = grad_k_main + grad_k_extra
-
-        # Return gradients (number of return values must match forward inputs)
-        if startend_row_indices is None:
-            return grad_q, grad_k, grad_v, grad_sink
+        if query.dtype != grad_q.dtype:
+            grad_q = grad_q.cast(query.dtype)
+        if key.dtype != grad_k.dtype:
+            grad_k = grad_k.cast(key.dtype)
+        if value.dtype != grad_v.dtype:
+            grad_v = grad_v.cast(value.dtype)
+        if sink.stop_gradient:
+            # Return gradients (number of return values must match forward inputs)
+            if startend_row_indices is None:
+                return grad_q, grad_k, grad_v, None  # grad_sink
+            else:
+                return grad_q, grad_k, grad_v, None, None
         else:
-            return grad_q, grad_k, grad_v, grad_sink, None
+            if startend_row_indices is None:
+                return grad_q, grad_k, grad_v, grad_sink
+            else:
+                return grad_q, grad_k, grad_v, grad_sink, None


 def sink_attention_forward(
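
The backward changes cast gradients back to the input dtypes and skip the sink gradient when the sink tensor is frozen (stop_gradient=True). Below is a hedged, self-contained sketch of the same return pattern on a toy paddle.autograd.PyLayer; the class and the math are illustrative, not the actual sink implementation:

    import paddle
    from paddle.autograd import PyLayer

    class ScaleBySink(PyLayer):
        @staticmethod
        def forward(ctx, x, sink):
            ctx.save_for_backward(x, sink)
            return x * sink

        @staticmethod
        def backward(ctx, grad_out):
            x, sink = ctx.saved_tensor()
            grad_x = grad_out * sink
            if sink.stop_gradient:
                # Mirror the commit: return None in place of the sink gradient
                # when the sink parameter is frozen.
                return grad_x, None
            return grad_x, grad_out * x

    x = paddle.randn([3])
    x.stop_gradient = False
    sink = paddle.ones([3])           # frozen: stop_gradient stays True by default
    out = ScaleBySink.apply(x, sink)
    out.sum().backward()              # only x receives a gradient
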

paddleformers/nn/pp_model.py

Lines changed: 2 additions & 0 deletions
@@ -551,6 +551,8 @@ def __init__(self, config: PretrainedConfig, **kwargs):
                 EmbeddingPipe,
                 shared_weight_attr="embedding_weight",
                 config=config,
+                embed_cls=self._embed_cls,
+                rotary_emb_cls=self._rotary_emb_cls,
             ),
             "model",
         )

paddleformers/transformers/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -175,7 +175,7 @@
     "ernie4_5_moe.modeling": ["Ernie4_5_MoeModel", "Ernie4_5_MoeForCausalLM", "Ernie4_5_MoeForCausalLMPipe"],
     "export": ["export_model"],
     "gpt_oss.configuration": ["GptOssConfig"],
-    "gpt_oss.modeling": ["GptOssModel", "GptOssForCausalLM"],
+    "gpt_oss.modeling": ["GptOssModel", "GptOssForCausalLM", "GptOssForCausalLMPipe"],
     "llama.configuration": [
         "LLAMA_PRETRAINED_INIT_CONFIGURATION",
         "LlamaConfig",

paddleformers/transformers/gpt_oss/__init__.py

Lines changed: 23 additions & 2 deletions
@@ -11,6 +11,27 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import sys
+from typing import TYPE_CHECKING

-from .configuration import *
-from .modeling import *
+from ...utils.lazy_import import _LazyModule
+
+import_structure = {
+    "configuration": ["GptOssConfig"],
+    "modeling": [
+        "GptOssModel",
+        "GptOssPretrainedModel",
+        "GptOssForCausalLM",
+        "GptOssForCausalLMPipe",
+    ],
+}
+if TYPE_CHECKING:
+    from .configuration import *
+    from .modeling import *
+else:
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        import_structure,
+        module_spec=__spec__,
+    )

paddleformers/transformers/gpt_oss/configuration.py

Lines changed: 4 additions & 0 deletions
@@ -42,6 +42,7 @@ def __init__(
         hidden_act: str = "silu",
         initializer_range: float = 0.02,
         max_position_embeddings=131072,
+        use_rmsnorm=True,
         rms_norm_eps: float = 1e-5,
         rope_scaling={"rope_type": "yarn", "factor": 32.0, "beta_fast": 32.0, "beta_slow": 1.0, "truncate": False},
         attention_dropout: float = 0.0,

@@ -50,6 +51,7 @@ def __init__(
         output_router_logits=False,
         use_cache=True,
         layer_types=None,
+        pp_seg_method="layer:GptOssDecoderLayer",
         **kwargs,
     ):
         self.vocab_size = vocab_size

@@ -67,6 +69,7 @@ def __init__(
         self.num_key_value_heads = num_key_value_heads
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
+        self.use_rmsnorm = use_rmsnorm
         self.rms_norm_eps = rms_norm_eps
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling

@@ -91,6 +94,7 @@ def __init__(
         self.output_router_logits = output_router_logits
         self.use_cache = use_cache
         self.use_bias = False
+        self.pp_seg_method = pp_seg_method

         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
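
A brief usage sketch of the two new configuration knobs; the other arguments keep their defaults, and the values shown simply restate the defaults from the diff above:

    from paddleformers.transformers import GptOssConfig

    # use_rmsnorm toggles RMSNorm; pp_seg_method controls how decoder layers are
    # segmented across pipeline-parallel stages.
    cfg = GptOssConfig(use_rmsnorm=True, pp_seg_method="layer:GptOssDecoderLayer")
    print(cfg.use_rmsnorm, cfg.pp_seg_method)
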
