12 changes: 7 additions & 5 deletions keras_hub/src/models/mistral/mistral_attention.py
@@ -45,6 +45,7 @@ def __init__(
self._rope_scaling_factor = rope_scaling_factor

def build(self, inputs_shape):
print("inputs_shape",inputs_shape)
Contributor (medium): This appears to be a debugging print statement. Please remove it before this change is merged.

# Einsum variables:
# b = batch size
# q = query length
@@ -54,17 +55,18 @@ def build(self, inputs_shape):
# v = num key/value heads
# h = head dim
self._hidden_dim = inputs_shape[-1]
print("self._hidden_dim // self._num_query_heads",self._hidden_dim , self._num_query_heads)
Contributor (medium): This appears to be a debugging print statement. Please remove it before this change is merged.

self._head_dim = self._hidden_dim // self._num_query_heads
self._inv_norm_factor = 1.0 / math.sqrt(self._head_dim)

print("(None, self._num_query_heads, self._head_dim)",(None, self._num_query_heads, self._head_dim))
Contributor (medium): This appears to be a debugging print statement. Please remove it before this change is merged.

self._query_dense = keras.layers.EinsumDense(
equation="bqm,muh->bquh",
output_shape=(None, self._num_query_heads, self._head_dim),
kernel_initializer=self._kernel_initializer,
dtype=self.dtype_policy,
name="query",
)
self._query_dense.build(inputs_shape)
self._query_dense.build((None,None,4096))#inputs_shape
Contributor (high): The input shape for _query_dense.build is hardcoded. This makes the layer less flexible and seems to be a temporary change for debugging, as indicated by the commented-out code. Please consider restoring the original dynamic inputs_shape.
Suggested change:
-        self._query_dense.build((None,None,4096))#inputs_shape
+        self._query_dense.build(inputs_shape)
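For reference, a minimal standalone sketch (not the KerasHub layer itself; shapes assume a hypothetical Mistral-7B-like config with hidden_dim=4096, 32 query heads, head_dim=128) of how EinsumDense derives its kernel shape from whatever shape is passed to build(), which is why the hardcoded (None, None, 4096) only works for that one configuration:

import keras

# Hypothetical config: hidden_dim=4096, num_query_heads=32, head_dim=128.
query_dense = keras.layers.EinsumDense(
    equation="bqm,muh->bquh",  # b=batch, q=query length, m=model dim, u=query heads, h=head dim
    output_shape=(None, 32, 128),
    name="query",
)
query_dense.build((None, None, 4096))  # kernel's first axis comes from inputs_shape[-1]
print(query_dense.kernel.shape)  # (4096, 32, 128)
# A backbone with a different hidden_dim needs build(inputs_shape), not a fixed 4096.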


self._key_dense = keras.layers.EinsumDense(
equation="bkm,mvh->bkvh",
@@ -77,7 +79,7 @@ def build(self, inputs_shape):
dtype=self.dtype_policy,
name="key",
)
self._key_dense.build(inputs_shape)
self._key_dense.build((None,None,4096))#input_shape
Contributor (high): Similar to the query layer, the input shape for _key_dense.build is hardcoded. This should be reverted to use inputs_shape to maintain the model's flexibility.
Suggested change:
-        self._key_dense.build((None,None,4096))#input_shape
+        self._key_dense.build(inputs_shape)


self._value_dense = keras.layers.EinsumDense(
equation="bkm,mvh->bkvh",
@@ -90,7 +92,7 @@ def build(self, inputs_shape):
dtype=self.dtype_policy,
name="value",
)
self._value_dense.build(inputs_shape)
self._value_dense.build((None,None,4096))
Contributor (high): The input shape for _value_dense.build is also hardcoded. This should be reverted to use inputs_shape to maintain the model's flexibility.
Suggested change:
-        self._value_dense.build((None,None,4096))
+        self._value_dense.build(inputs_shape)


self._softmax = keras.layers.Softmax(
axis=-1,
@@ -111,7 +113,7 @@ def build(self, inputs_shape):
name="attention_output",
)
self._output_dense.build(
(None, None, self._num_query_heads, self._head_dim)
(None, None, self._num_query_heads, 128)#self._head_dim)
Contributor (high): The head dimension for _output_dense.build is hardcoded to 128. This should use the calculated self._head_dim to ensure the model works correctly with different configurations.
Suggested change:
-            (None, None, self._num_query_heads, 128)#self._head_dim)
+            (None, None, self._num_query_heads, self._head_dim)
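As a quick illustration (hypothetical configurations, not taken from this patch) of why the literal 128 only happens to match some setups:

# head_dim is derived as hidden_dim // num_query_heads, so 128 is config-specific.
for hidden_dim, num_query_heads in [(4096, 32), (2048, 32)]:
    head_dim = hidden_dim // num_query_heads
    print(f"hidden_dim={hidden_dim}, heads={num_query_heads} -> head_dim={head_dim}")
# hidden_dim=4096, heads=32 -> head_dim=128  (the hardcoded value happens to match)
# hidden_dim=2048, heads=32 -> head_dim=64   (the hardcoded 128 would mis-size the layer)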

)

self.rotary_embedding_layer = RotaryEmbedding(
6 changes: 4 additions & 2 deletions keras_hub/src/utils/transformers/convert_mistral.py
@@ -50,7 +50,7 @@ def convert_weights(backbone, loader, transformers_config):
hf_weight_key=f"model.layers.{index}.post_attention_layernorm.weight",
hook_fn=lambda hf_tensor, _: hf_tensor.astype(np.float16),
)

print("decoder_layer._self_attention_layer._query_dense.kernel",decoder_layer._self_attention_layer._query_dense.kernel,index)
Contributor (medium): This appears to be a debugging print statement. Please remove it before this change is merged.

# Attention layers
loader.port_weight(
keras_variable=decoder_layer._self_attention_layer._query_dense.kernel,
@@ -59,6 +59,8 @@ def convert_weights(backbone, loader, transformers_config):
np.transpose(hf_tensor.astype(np.float16)), keras_shape
),
)
print("decoder_layer._self_attention_layer._key_dense.kernel",decoder_layer._self_attention_layer._key_dense.kernel,index)

Contributor (medium): This appears to be a debugging print statement and an extra newline. Please remove them before this change is merged.
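For context on the surrounding port_weight calls, a minimal numpy sketch of what the hook_fn appears to do; the shapes are assumed (Mistral-7B-like), and the reshape step is an assumption since the start of the hook body is outside the visible diff:

import numpy as np

# Assumed layout: HF stores q_proj.weight as (out_features, in_features) = (4096, 4096),
# while the EinsumDense "bqm,muh->bquh" kernel expects (hidden_dim, heads, head_dim) = (4096, 32, 128).
hf_tensor = np.zeros((4096, 4096), dtype=np.float32)  # stand-in for the HF checkpoint weight
keras_shape = (4096, 32, 128)                          # stand-in for query_dense.kernel shape
kernel = np.reshape(np.transpose(hf_tensor.astype(np.float16)), keras_shape)
print(kernel.shape, kernel.dtype)  # (4096, 32, 128) float16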

loader.port_weight(
keras_variable=decoder_layer._self_attention_layer._key_dense.kernel,
hf_weight_key=f"model.layers.{index}.self_attn.k_proj.weight",
@@ -113,4 +115,4 @@ def convert_weights(backbone, loader, transformers_config):


def convert_tokenizer(cls, preset, **kwargs):
return cls(get_file(preset, "tokenizer.model"), **kwargs)
return cls(get_file(preset, "tekken.json"),**kwargs)#)"tokenizer.model"), **kwargs)
Contributor (high): The tokenizer file is hardcoded to "tekken.json", which seems like a temporary value for testing. This should be reverted to the original "tokenizer.model" to ensure presets can be loaded correctly.
Suggested change:
-    return cls(get_file(preset, "tekken.json"),**kwargs)#)"tokenizer.model"), **kwargs)
+    return cls(get_file(preset, "tokenizer.model"), **kwargs)
