Skip to content

Commit 7481c8e

Browse files
committed
fix self attn mask comp
1 parent f0c412f commit 7481c8e

File tree

1 file changed

+23
-27
lines changed

1 file changed

+23
-27
lines changed

keras_hub/src/models/smollm3/smollm3_layers.py

Lines changed: 23 additions & 27 deletions
Original file line numberDiff line numberDiff line change
def _compute_self_attention_mask(
    self,  # NOTE(review): `def` line sits in the diff hunk header; `self` assumed from the enclosing layer class — confirm against the full file.
    decoder_sequence,
    decoder_padding_mask,
    decoder_attention_mask,
    self_attention_cache,
    self_attention_cache_update_index,
):
    """Build the combined padding + causal mask for self-attention.

    Merges any user-supplied padding/attention masks with a causal mask
    sized for the current decoding step, then returns their element-wise
    minimum (intersection). Always applies the causal mask.
    """
    merged_mask = merge_padding_and_attention_mask(
        decoder_sequence, decoder_padding_mask, decoder_attention_mask
    )

    batch_size = ops.shape(decoder_sequence)[0]
    output_length = ops.shape(decoder_sequence)[1]
    input_length = output_length
    # Cached decoding needs a rectangular causal mask: during generative
    # inference `decoder_sequence` is typically length 1 while the cache
    # spans the full generation length, so key length comes from the cache.
    if self_attention_cache is not None:
        input_length = ops.shape(self_attention_cache)[2]

    # A missing update index means we are at the start of the sequence.
    if self_attention_cache_update_index is None:
        cache_update_index = 0
    else:
        cache_update_index = self_attention_cache_update_index

    causal_mask = compute_causal_mask(
        batch_size, input_length, output_length, cache_update_index
    )

    # Intersect with the padding/attention mask when one was provided.
    if merged_mask is None:
        return causal_mask
    return ops.minimum(merged_mask, causal_mask)
414410

415411
def build(self, input_shape):
416412
"""

0 commit comments

Comments (0)