@@ -3,15 +3,16 @@
 from keras import layers
 from keras import ops
 
-from keras_hub.src.models.smollm3.smollm3_utils import apply_rotary_pos_emb
-from keras_hub.src.models.smollm3.smollm3_utils import eager_attention_forward
-from keras_hub.src.models.smollm3.smollm3_utils import rope_init
 from keras_hub.src.layers.modeling.transformer_layer_utils import (
-    merge_padding_and_attention_mask,
+    compute_causal_mask,
 )
 from keras_hub.src.layers.modeling.transformer_layer_utils import (
-    compute_causal_mask,
+    merge_padding_and_attention_mask,
 )
+from keras_hub.src.models.smollm3.smollm3_utils import apply_rotary_pos_emb
+from keras_hub.src.models.smollm3.smollm3_utils import eager_attention_forward
+from keras_hub.src.models.smollm3.smollm3_utils import rope_init
+
 
 class SmolLM3Attention(layers.Layer):
     """
@@ -372,7 +373,6 @@ def __init__(
 
         self.attention_type = layer_types[layer_idx]
 
-
     def _compute_self_attention_mask(
         self,
         decoder_sequence,
@@ -460,7 +460,9 @@ def call(
             training: Whether the layer is in training mode.
         """
         self_attention_cache = kwargs.get("self_attention_cache", None)
-        self_attention_cache_update_index = kwargs.get("self_attention_cache_update_index", None)
+        self_attention_cache_update_index = kwargs.get(
+            "self_attention_cache_update_index", None
+        )
 
         self_attention_mask = self._compute_self_attention_mask(
             decoder_sequence=hidden_states,