[keras-hub-2333] Adjusted the backbone for Mistral and switched the tokenizer #2368
base: master
Changes from 1 commit
```diff
@@ -45,6 +45,7 @@ def __init__(
         self._rope_scaling_factor = rope_scaling_factor

     def build(self, inputs_shape):
+        print("inputs_shape",inputs_shape)
         # Einsum variables:
         # b = batch size
         # q = query length
@@ -54,17 +55,18 @@ def build(self, inputs_shape):
         # v = num key/value heads
         # h = head dim
         self._hidden_dim = inputs_shape[-1]
+        print("self._hidden_dim // self._num_query_heads",self._hidden_dim , self._num_query_heads)

         self._head_dim = self._hidden_dim // self._num_query_heads
         self._inv_norm_factor = 1.0 / math.sqrt(self._head_dim)

+        print("(None, self._num_query_heads, self._head_dim)",(None, self._num_query_heads, self._head_dim))
+
         self._query_dense = keras.layers.EinsumDense(
             equation="bqm,muh->bquh",
             output_shape=(None, self._num_query_heads, self._head_dim),
             kernel_initializer=self._kernel_initializer,
             dtype=self.dtype_policy,
             name="query",
         )
-        self._query_dense.build(inputs_shape)
+        self._query_dense.build((None,None,4096))#inputs_shape
```
Suggested change:
```diff
-        self._query_dense.build((None,None,4096))#inputs_shape
+        self._query_dense.build(inputs_shape)
```
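For context, a minimal sketch of why building from `inputs_shape` is preferable to the hardcoded `(None, None, 4096)`. The configuration values below are illustrative assumptions, not taken from the PR:

```python
import keras

# Illustrative values only (assumptions); a real backbone passes its own
# hidden size through `inputs_shape`.
num_query_heads = 32
inputs_shape = (None, None, 2048)  # a smaller config than Mistral-7B's 4096

hidden_dim = inputs_shape[-1]
head_dim = hidden_dim // num_query_heads  # 64 here, not 128

query_dense = keras.layers.EinsumDense(
    equation="bqm,muh->bquh",
    output_shape=(None, num_query_heads, head_dim),
    name="query",
)
# Building from `inputs_shape` ties the kernel to the actual hidden
# dimension; a hardcoded (None, None, 4096) would break this config.
query_dense.build(inputs_shape)
print(query_dense.kernel.shape)  # (2048, 32, 64)
```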
The head dimension for `_output_dense.build` is hardcoded to 128. This should use the calculated `self._head_dim` to ensure the model works correctly with different configurations.
Suggested change:
```diff
-            (None, None, self._num_query_heads, 128)#self._head_dim)
+            (None, None, self._num_query_heads, self._head_dim)
```
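A small sketch of the intent behind that suggestion. The sizes and the output-projection equation string are assumptions for illustration, not copied from the PR:

```python
import keras

# Illustrative config (assumed values); in the layer these come from
# __init__ and the computed self._head_dim in build().
hidden_dim = 2048
num_query_heads = 32
head_dim = hidden_dim // num_query_heads  # 64 here; 128 only for 4096 / 32

output_dense = keras.layers.EinsumDense(
    equation="bquh,uhm->bqm",  # assumed output-projection equation
    output_shape=(None, hidden_dim),
    name="attention_output",
)
# Build with the computed head_dim so non-default configs get a correctly
# shaped kernel instead of one fixed to 128.
output_dense.build((None, None, num_query_heads, head_dim))
print(output_dense.kernel.shape)  # (32, 64, 2048)
```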
```diff
@@ -50,7 +50,7 @@ def convert_weights(backbone, loader, transformers_config):
             hf_weight_key=f"model.layers.{index}.post_attention_layernorm.weight",
             hook_fn=lambda hf_tensor, _: hf_tensor.astype(np.float16),
         )
-
+        print("decoder_layer._self_attention_layer._query_dense.kernel",decoder_layer._self_attention_layer._query_dense.kernel,index)
         # Attention layers
         loader.port_weight(
             keras_variable=decoder_layer._self_attention_layer._query_dense.kernel,
@@ -59,6 +59,8 @@ def convert_weights(backbone, loader, transformers_config):
                 np.transpose(hf_tensor.astype(np.float16)), keras_shape
             ),
         )
+        print("decoder_layer._self_attention_layer._key_dense.kernel",decoder_layer._self_attention_layer._key_dense.kernel,index)
+
         loader.port_weight(
             keras_variable=decoder_layer._self_attention_layer._key_dense.kernel,
             hf_weight_key=f"model.layers.{index}.self_attn.k_proj.weight",
```
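For reference, a plain-NumPy sketch of what the `hook_fn` in the hunk above does when porting a Hugging Face `q_proj` weight into the Keras `EinsumDense` kernel. The concrete sizes are illustrative assumptions (Mistral-7B-like), not read from the PR:

```python
import numpy as np

# Assumed example configuration.
hidden_dim, num_query_heads = 4096, 32
head_dim = hidden_dim // num_query_heads

# HF stores q_proj as (out_features, in_features) = (hidden_dim, hidden_dim).
hf_tensor = np.zeros((hidden_dim, hidden_dim), dtype=np.float32)

# The Keras kernel for equation "bqm,muh->bquh" has shape
# (hidden_dim, num_heads, head_dim), so transpose then reshape.
keras_shape = (hidden_dim, num_query_heads, head_dim)
kernel = np.reshape(np.transpose(hf_tensor.astype(np.float16)), keras_shape)
print(kernel.shape)  # (4096, 32, 128)
```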
```diff
@@ -113,4 +115,4 @@ def convert_weights(backbone, loader, transformers_config):


 def convert_tokenizer(cls, preset, **kwargs):
-    return cls(get_file(preset, "tokenizer.model"), **kwargs)
+    return cls(get_file(preset, "tekken.json"),**kwargs)#)"tokenizer.model"), **kwargs)
```
return cls(get_file(preset, "tekken.json"),**kwargs)#)"tokenizer.model"), **kwargs) | |
return cls(get_file(preset, "tokenizer.model"), **kwargs) |
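If the goal is to support both tokenizer assets rather than hard-swap one filename for the other, a hedged sketch of the file-selection logic is below. It is not what this PR implements; `pick_tokenizer_asset` and `preset_dir` are hypothetical names introduced only for illustration:

```python
import os

def pick_tokenizer_asset(preset_dir):
    # `preset_dir` is a hypothetical local directory with the downloaded
    # preset files. Older Mistral presets ship a SentencePiece
    # "tokenizer.model"; newer Mistral checkpoints ship a "tekken.json"
    # (Tekken tokenizer) instead.
    for name in ("tokenizer.model", "tekken.json"):
        path = os.path.join(preset_dir, name)
        if os.path.exists(path):
            return path
    raise FileNotFoundError("No known tokenizer asset found in the preset.")
```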
This appears to be a debugging print statement. Please remove it before this change is merged.