@@ -6,8 +6,12 @@
 from keras_hub.src.models.smollm3.smollm3_utils import apply_rotary_pos_emb
 from keras_hub.src.models.smollm3.smollm3_utils import eager_attention_forward
 from keras_hub.src.models.smollm3.smollm3_utils import rope_init
-
-
+from keras_hub.src.layers.modeling.transformer_layer_utils import (
+    merge_padding_and_attention_mask,
+)
+from keras_hub.src.layers.modeling.transformer_layer_utils import (
+    compute_causal_mask,
+)
 
 class SmolLM3Attention(layers.Layer):
     """
@@ -368,6 +372,46 @@ def __init__(
 
         self.attention_type = layer_types[layer_idx]
 
+
+    def _compute_self_attention_mask(
+        self,
+        decoder_sequence,
+        decoder_padding_mask,
+        decoder_attention_mask,
+        use_causal_mask=True,
+        self_attention_cache=None,
+        self_attention_cache_update_index=None,
+    ):
+        decoder_mask = merge_padding_and_attention_mask(
+            decoder_sequence, decoder_padding_mask, decoder_attention_mask
+        )
+        if use_causal_mask:
+            batch_size = ops.shape(decoder_sequence)[0]
+            input_length = output_length = ops.shape(decoder_sequence)[1]
+            # We need to handle a rectangular causal mask when doing cached
+            # decoding. For generative inference, `decoder_sequence` will
+            # generally be length 1, and `cache` will be the full generation
+            # length.
+            if self_attention_cache is not None:
+                input_length = ops.shape(self_attention_cache)[2]
+
+            causal_mask = compute_causal_mask(
+                batch_size,
+                input_length,
+                output_length,
+                (
+                    0
+                    if self_attention_cache_update_index is None
+                    else self_attention_cache_update_index
+                ),
+            )
+            return (
+                ops.minimum(decoder_mask, causal_mask)
+                if decoder_mask is not None
+                else causal_mask
+            )
+        return decoder_mask
+
     def build(self, input_shape):
         """
         Builds the sub-layers based on the input shape.
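Note on the rectangular causal mask handled in the new `_compute_self_attention_mask` above: during cached decoding the query block is usually a single new token while the keys span the whole cache, so the causal mask becomes output_length x input_length rather than square. The snippet below is a minimal standalone sketch of that idea; it only mirrors the intent of `compute_causal_mask`, not its actual implementation, and `toy_causal_mask` is a hypothetical helper used purely for illustration.

import numpy as np

def toy_causal_mask(batch_size, input_length, output_length, cache_index=0):
    # Query row i sits at absolute position cache_index + i and may attend to
    # every key position j with j <= cache_index + i.
    q = np.arange(output_length)[:, None] + cache_index  # (output_length, 1)
    k = np.arange(input_length)[None, :]                 # (1, input_length)
    mask = (k <= q).astype("int32")
    return np.broadcast_to(mask, (batch_size, output_length, input_length))

# Prompt processing: a square lower-triangular mask.
print(toy_causal_mask(1, 4, 4)[0])
# Cached decoding of one new token at step 4 with a length-8 cache:
print(toy_causal_mask(1, 8, 1, cache_index=4)[0])  # [[1 1 1 1 1 0 0 0]]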
@@ -403,6 +447,8 @@ def call(
         hidden_states,
         position_embeddings=None,
         training=False,
+        decoder_padding_mask=None,
+        decoder_attention_mask=None,
         **kwargs,
     ):
         """
@@ -414,6 +460,15 @@ def call(
             training: Whether the layer is in training mode.
         """
         self_attention_cache = kwargs.get("self_attention_cache", None)
+        self_attention_cache_update_index = kwargs.get("self_attention_cache_update_index", None)
+
+        self_attention_mask = self._compute_self_attention_mask(
+            decoder_sequence=hidden_states,
+            decoder_padding_mask=decoder_padding_mask,
+            decoder_attention_mask=decoder_attention_mask,
+            self_attention_cache=self_attention_cache,
+            self_attention_cache_update_index=self_attention_cache_update_index,
+        )
 
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
@@ -423,6 +478,7 @@ def call(
             hidden_states=hidden_states,
             position_embeddings=position_embeddings,
             training=training,
+            attention_mask=self_attention_mask,
             **kwargs,
         )
 
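A brief note on the `ops.minimum(decoder_mask, causal_mask)` combination inside `_compute_self_attention_mask`: since both masks are 0/1 tensors, the element-wise minimum acts as a logical AND, so a key position is kept only if both the padding mask and the causal mask allow it. The sketch below uses plain NumPy with assumed shapes rather than the KerasHub utilities themselves.

import numpy as np

# One sequence of length 4 whose last position is padding.
padding_mask = np.array([[1, 1, 1, 0]])                   # (batch, key_len)
decoder_mask = np.broadcast_to(padding_mask[:, None, :], (1, 4, 4))
causal_mask = np.tril(np.ones((1, 4, 4), dtype="int32"))  # lower triangular

combined = np.minimum(decoder_mask, causal_mask)
print(combined[0])
# [[1 0 0 0]
#  [1 1 0 0]
#  [1 1 1 0]
#  [1 1 1 0]]  <- no query may attend to the padded key at index 3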