Commit e2f194a

committed
make training false
1 parent 3f4eb53 commit e2f194a

File tree

keras_hub/src/models/smollm3/smollm3_backbone.py
keras_hub/src/models/smollm3/smollm3_layers.py
keras_hub/src/models/smollm3/smollm3_utils.py
keras_hub/src/utils/transformers/convert_smollm3.py

4 files changed: +12 −13 lines changed

keras_hub/src/models/smollm3/smollm3_backbone.py

Lines changed: 1 addition & 1 deletion
@@ -176,4 +176,4 @@ def get_config(self):
                 "partial_rotary_factor": self.partial_rotary_factor,
             }
         )
-        return config
+        return config
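For reference, the hunk above touches the standard Keras serialization pattern: extend the parent config, then return it. A minimal runnable sketch of that pattern (ToyLayer and its single field are hypothetical, not from this commit):

from keras import layers


class ToyLayer(layers.Layer):
    def __init__(self, partial_rotary_factor=0.5, **kwargs):
        super().__init__(**kwargs)
        self.partial_rotary_factor = partial_rotary_factor

    def get_config(self):
        # Extend the parent config instead of replacing it, so the
        # layer round-trips through Keras saving/loading.
        config = super().get_config()
        config.update(
            {"partial_rotary_factor": self.partial_rotary_factor}
        )
        return config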

keras_hub/src/models/smollm3/smollm3_layers.py

Lines changed: 8 additions & 10 deletions
@@ -121,9 +121,9 @@ def call(
         input_shape = ops.shape(hidden_states)[:-1]
         hidden_shape = (*input_shape, self.num_attention_heads, self.head_dim)

-        query_states = ops.reshape(self.q_proj(hidden_states),hidden_shape)
+        query_states = ops.reshape(self.q_proj(hidden_states), hidden_shape)
         # (batch, num_heads, seq_len, head_dim)
-        query_states = ops.transpose(query_states, axes=(0, 2, 1, 3))
+        query_states = ops.transpose(query_states, axes=(0, 2, 1, 3))

         def _compute_kv_values(x_input):
             kv_hidden_shape = (
@@ -132,13 +132,9 @@ def _compute_kv_values(x_input):
                 self.head_dim,
             )

-            key_states_raw = ops.reshape(
-                self.k_proj(x_input),
-                kv_hidden_shape
-            )
+            key_states_raw = ops.reshape(self.k_proj(x_input), kv_hidden_shape)
             value_states_raw = ops.reshape(
-                self.v_proj(x_input),
-                kv_hidden_shape
+                self.v_proj(x_input), kv_hidden_shape
             )

             key_states = ops.transpose(key_states_raw, axes=(0, 2, 1, 3))
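The two hunks above only reflow the projection code; the underlying shape dance is unchanged. A standalone sketch of that dance, with toy sizes standing in for the model's real num_attention_heads and head_dim, and the identity standing in for the q/k/v projections:

import numpy as np
from keras import ops

batch, seq_len, num_heads, head_dim = 2, 6, 4, 8
# Stand-in for self.q_proj(hidden_states): (batch, seq, heads * head_dim).
projected = ops.convert_to_tensor(
    np.random.rand(batch, seq_len, num_heads * head_dim).astype("float32")
)

# Split the last axis into heads, then move heads ahead of the
# sequence axis: (batch, num_heads, seq_len, head_dim).
states = ops.reshape(projected, (batch, seq_len, num_heads, head_dim))
states = ops.transpose(states, axes=(0, 2, 1, 3))
print(ops.shape(states))  # (2, 4, 6, 8)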
@@ -155,7 +151,9 @@ def _compute_kv_values(x_input):
             key_states = key_cache
             value_states = value_cache
         else:
-            print("self_attention_cache_update_index is not None, computing kv values")
+            print(
+                "self_attention_cache_update_index is not None, computing kv values"
+            )
             key_update, value_update = _compute_kv_values(hidden_states)
             update_idx_tensor = ops.convert_to_tensor(
                 self_attention_cache_update_index, dtype="int32"
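The else branch above computes fresh key/value tensors and writes them into the cache at self_attention_cache_update_index. A rough sketch of that kind of write, assuming a preallocated (batch, num_heads, max_seq_len, head_dim) cache and keras.ops.slice_update (the hunk does not show which update op the file actually uses):

from keras import ops

batch, num_heads, max_len, head_dim = 1, 2, 16, 4
key_cache = ops.zeros((batch, num_heads, max_len, head_dim))

# One new decode step worth of keys, written at sequence position 5.
key_update = ops.ones((batch, num_heads, 1, head_dim))
update_idx_tensor = ops.convert_to_tensor(5, dtype="int32")

# slice_update takes one start index per axis; only the sequence
# axis is offset here.
key_cache = ops.slice_update(
    key_cache, [0, 0, update_idx_tensor, 0], key_update
)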
@@ -417,7 +415,7 @@ def call(
             position_embeddings: Optional tuple of (cos, sin) tensors for RoPE.
             training: Whether the layer is in training mode.
         """
-        self_attention_cache = kwargs.get('self_attention_cache', None)
+        self_attention_cache = kwargs.get("self_attention_cache", None)

         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)

keras_hub/src/models/smollm3/smollm3_utils.py

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ def eager_attention_forward(
        attn_weights = ops.add(attn_weights, causal_mask)

    attn_weights = ops.softmax(attn_weights, axis=-1)
-
+
    if training:
        attn_weights = random.dropout(attn_weights, rate=dropout)
    attn_output = ops.matmul(attn_weights, value_states)
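Around the whitespace fix, the context lines show the textbook eager-attention pattern: additive mask, softmax, dropout only when training, then the value matmul. A self-contained sketch under those assumptions (the scaling detail is a guess, not shown in this hunk):

import numpy as np
from keras import ops, random


def toy_attention(query, key, value, causal_mask, dropout=0.0, training=False):
    # (batch, heads, q_len, k_len) scores, scaled by sqrt(head_dim).
    scale = 1.0 / np.sqrt(query.shape[-1])
    attn_weights = ops.matmul(
        query, ops.transpose(key, axes=(0, 1, 3, 2))
    ) * scale
    attn_weights = ops.add(attn_weights, causal_mask)
    attn_weights = ops.softmax(attn_weights, axis=-1)
    # Dropout hits the attention probabilities, and only in training,
    # which is exactly what the `if training:` gate protects.
    if training:
        attn_weights = random.dropout(attn_weights, rate=dropout)
    return ops.matmul(attn_weights, value)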

keras_hub/src/utils/transformers/convert_smollm3.py

Lines changed: 2 additions & 1 deletion
@@ -52,7 +52,6 @@ def transpose_and_reshape(x, shape):
     )

     # Attention layers
-
     ## Query
     loader.port_weight(
         keras_variable=decoder_layer.self_attn.q_proj.kernel,
@@ -110,6 +109,8 @@ def transpose_and_reshape(x, shape):
         hf_weight_key="model.norm.weight",
     )

+    backbone.training = False
+
     return backbone
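The new backbone.training = False line flags the converted model for inference before the converter returns it, so training-only branches (like the dropout gate in smollm3_utils.py) stay off by default. A hypothetical usage sketch, assuming keras-hub's hf:// preset loading routes through this converter and that the checkpoint id below exists (neither is stated in the diff):

import keras_hub

# Checkpoint id is an assumption, for illustration only.
backbone = keras_hub.models.Backbone.from_preset(
    "hf://HuggingFaceTB/SmolLM3-3B"
)

# Set by the converter after porting weights (this commit).
print(backbone.training)  # False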

0 commit comments