@@ -3324,6 +3324,145 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]


+@Model.register("T5EncoderModel")
+class T5EncoderModel(Model):
+    model_arch = gguf.MODEL_ARCH.T5ENCODER
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.shared_token_embeddings_found = False
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        # many older models use the spiece.model filename for the tokenizer model
+        if not tokenizer_path.is_file():
+            tokenizer_path = self.dir_model / 'spiece.model'
+
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
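+        # parse the raw model proto to read the trainer/normalizer settings
+        # (model type, dummy prefix, whitespace handling, precompiled charsmap)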
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+
+        # some models, like the Pile-T5 family, use a BPE tokenizer instead of Unigram
+        if sentencepiece_model.trainer_spec.model_type == 2:  # BPE
+            # ensure the tokenizer model file name is correct
+            assert tokenizer_path.name == 'tokenizer.model'
+            return self._set_vocab_sentencepiece()
+        else:
+            assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
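+        # hparams may declare a larger vocab than the tokenizer actually contains;
+        # pre-fill every slot with a [PAD{n}] placeholder and overwrite the real entries below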
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
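+        # entries from added_tokens.json are user-defined additions that override
+        # the base sentencepiece tokens at their declared ids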
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if token_id >= vocab_size:
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
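+        # T5 appends EOS (</s>) to its inputs but does not use a BOS token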
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def set_gguf_parameters(self):
+        if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
+            logger.warning("Couldn't find context length in config.json, assuming default value of 512")
+            n_ctx = 512
+        self.gguf_writer.add_context_length(n_ctx)
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(self.hparams["num_heads"])
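+        # d_kv is the per-head key/value projection width; T5 stores it explicitly,
+        # and it does not have to equal d_model // num_heads (e.g. t5-11b)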
+        self.gguf_writer.add_key_length(self.hparams["d_kv"])
+        self.gguf_writer.add_value_length(self.hparams["d_kv"])
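+        # T5's layer norm is RMS-style (no mean subtraction), so the same epsilon
+        # is written under both the standard and RMS layer-norm keys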
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # T5-based models store their shared token embeddings under varying names:
+        # "encoder.embed_tokens.weight", "decoder.embed_tokens.weight" or "shared.weight",
+        # and some checkpoints even contain more than one copy in the safetensors files.
+        # We take the first of these tensors as the token embeddings for both the
+        # encoder and decoder, and ignore the remaining ones.
+        if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
+            if not self.shared_token_embeddings_found:
+                name = "shared.weight"
+                self.shared_token_embeddings_found = True
+            else:
+                logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @Model.register("JAISLMHeadModel")
 class JaisModel(Model):
     model_arch = gguf.MODEL_ARCH.JAIS