@@ -80,7 +80,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         if not self.is_safetensors:
             self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
         if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -2768,6 +2768,124 @@ def write_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("T5ForConditionalGeneration")
+@Model.register("T5WithLMHeadModel")
+class T5Model(Model):
+    model_arch = gguf.MODEL_ARCH.T5
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'spiece.model'
+
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if (token_id >= vocab_size):
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name("T5")
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(self.hparams["num_heads"])
+        self.gguf_writer.add_key_length(self.hparams["d_kv"])
+        self.gguf_writer.add_value_length(self.hparams["d_kv"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # Sometimes T5 and Flan-T5 based models contain an "encoder.embed_tokens.weight" or
+        # "decoder.embed_tokens.weight" tensor that is a duplicate of the "shared.weight" tensor.
+        # To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight".
+        if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight":
+            logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.")
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 ###### CONVERSION LOGIC ######


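For anyone checking a checkpoint before converting it, here is a minimal standalone sketch (not part of this patch) that reads the same fields the new `T5Model.set_vocab()` records in the GGUF header: the UNIGRAM model type, the normalizer flags, and the per-token pieces, scores, and types from `spiece.model`. The checkpoint path below is a placeholder; only the `sentencepiece` package is required.

```python
import os

# same workaround as in set_vocab(): force the pure-Python protobuf backend
# so that sentencepiece_model_pb2 can be imported without descriptor errors
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

from sentencepiece import SentencePieceProcessor
from sentencepiece import sentencepiece_model_pb2 as model

tokenizer_path = "t5-small/spiece.model"  # placeholder path to a local checkpoint

# read the normalizer settings that the converter stores in the GGUF header
proto = model.ModelProto()
with open(tokenizer_path, "rb") as f:
    proto.ParseFromString(f.read())
print("model_type:", proto.trainer_spec.model_type)  # 1 == UNIGRAM
print("add_dummy_prefix:", proto.normalizer_spec.add_dummy_prefix)
print("remove_extra_whitespaces:", proto.normalizer_spec.remove_extra_whitespaces)
print("precompiled_charsmap bytes:", len(proto.normalizer_spec.precompiled_charsmap))

# classify a few pieces the same way set_vocab() does
sp = SentencePieceProcessor()
sp.LoadFromFile(tokenizer_path)
for token_id in range(5):
    piece = sp.IdToPiece(token_id)
    kind = ("UNKNOWN" if sp.IsUnknown(token_id)
            else "CONTROL" if sp.IsControl(token_id)
            else "UNUSED" if sp.IsUnused(token_id)
            else "BYTE" if sp.IsByte(token_id)
            else "NORMAL")
    print(token_id, repr(piece), sp.GetScore(token_id), kind)
```

If `model_type` is not 1, the converter's `assert` will fire, since only UNIGRAM SentencePiece models are handled by this code path.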