@@ -1551,7 +1551,7 @@ def set_vocab(self):
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)

-        special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens))
+        special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens), load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)

     def set_gguf_parameters(self):
@@ -2200,41 +2200,68 @@ def set_vocab(self):

         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = len(tokenizer.vocab)
-        # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
-        # because vocab_size is the count of items, and indexes start at 0.
+
+        # PLaMo2 has padded vocabulary - get actual size from embedding weight
+        # Load the embedding tensor to get the real vocab size
+        import torch
+        from safetensors import safe_open
+        actual_vocab_size = None
+
+        # Check the model weight files to get actual vocab size
+        weight_map_file = dir_model / "model.safetensors.index.json"
+        if weight_map_file.exists():
+            import json
+            with open(weight_map_file, 'r') as f:
+                weight_map = json.load(f)
+            embed_file = weight_map['weight_map']['model.embed_tokens.weight']
+            embed_path = dir_model / embed_file
+
+            with safe_open(str(embed_path), framework='pt', device='cpu') as f:
+                embed_weight = f.get_tensor('model.embed_tokens.weight')
+                actual_vocab_size = embed_weight.shape[0]
+
+        vocab_size = actual_vocab_size if actual_vocab_size else len(tokenizer.vocab)
+
+        # Since we are checking the maximum index, we need to ensure it's strictly less than tokenizer vocab size,
+        # because PLaMo2 has padded vocabulary
         max_vocab_index = max(tokenizer.get_vocab().values())
-        if max_vocab_index >= vocab_size:
+        if max_vocab_index >= len(tokenizer.vocab):
             raise ValueError("Vocabulary size exceeds expected maximum size.")

         reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()

         for token_id in range(vocab_size):
-            token_text = reverse_vocab[token_id].encode('utf-8')
-            # replace "\x00" to string with length > 0
-            if token_text == b"\x00":
-                toktype = gguf.TokenType.BYTE  # special
-                token_text = f"<{token_text}>".encode('utf-8')
-            elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
-                toktype = gguf.TokenType.BYTE  # special
-            elif reverse_vocab[token_id] in added_vocab:
-                if tokenizer.added_tokens_decoder[token_id].special:
-                    toktype = gguf.TokenType.CONTROL
-                else:
-                    toktype = gguf.TokenType.USER_DEFINED
+            # Handle padding tokens for vocab entries beyond tokenizer vocabulary
+            if token_id >= len(tokenizer.vocab):
+                # Create padding tokens for the extra vocabulary entries
+                token_text = f"<pad_{token_id}>".encode('utf-8')
+                toktype = gguf.TokenType.UNUSED
             else:
-                toktype = gguf.TokenType.NORMAL
+                token_text = reverse_vocab[token_id].encode('utf-8')
+                # replace "\x00" to string with length > 0
+                if token_text == b"\x00":
+                    toktype = gguf.TokenType.BYTE  # special
+                    token_text = f"<{token_text}>".encode('utf-8')
+                elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+                    toktype = gguf.TokenType.BYTE  # special
+                elif reverse_vocab[token_id] in added_vocab:
+                    if tokenizer.added_tokens_decoder[token_id].special:
+                        toktype = gguf.TokenType.CONTROL
+                    else:
+                        toktype = gguf.TokenType.USER_DEFINED
+                else:
+                    toktype = gguf.TokenType.NORMAL

             tokens.append(token_text)
             toktypes.append(toktype)

-        # self.gguf_writer.add_tokenizer_model("llama")
-        # self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)

-        special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens))
+        special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens), load_merges=False)
         special_vocab.add_to_gguf(self.gguf_writer)

     def set_gguf_parameters(self):
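
The padded-vocabulary handling in the hunk above is easiest to see with toy numbers. Below is a minimal sketch of the token loop's padding branch; the sizes are hypothetical (the converter reads the real values from the tokenizer and from model.embed_tokens.weight), and plain strings stand in for the gguf token types:

tokenizer_vocab_size = 100000   # hypothetical len(tokenizer.vocab)
embedding_rows = 100032         # hypothetical embed_weight.shape[0] (padded)

tokens = []
for token_id in range(embedding_rows):
    if token_id >= tokenizer_vocab_size:
        # entries beyond the tokenizer's vocab become inert placeholders
        tokens.append((f"<pad_{token_id}>", "UNUSED"))
    else:
        tokens.append((f"tok_{token_id}", "NORMAL"))

assert len(tokens) == embedding_rows
assert tokens[-1] == (f"<pad_{embedding_rows - 1}>", "UNUSED")
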
@@ -2245,9 +2272,16 @@ def set_gguf_parameters(self):
         # Mamba parameters
         if hparams.get("mamba_enabled", False):
             self.gguf_writer.add_ssm_conv_kernel(hparams.get("mamba_d_conv", 4))
-            self.gguf_writer.add_ssm_inner_size(hparams.get("mamba_d_state", 64) * hparams.get("intermediate_size", 13312) // hparams.get("hidden_size", 4096))
+            # PLaMo2 SSM inner size = mamba_num_heads * hidden_size_per_head
+            mamba_num_heads = hparams.get("mamba_num_heads", 64)
+            hidden_size_per_head = hparams.get("hidden_size_per_head", 128)
+            ssm_inner_size = mamba_num_heads * hidden_size_per_head
+            self.gguf_writer.add_ssm_inner_size(ssm_inner_size)
             self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
-            self.gguf_writer.add_ssm_time_step_rank(hparams.get("mamba_d_state", 64) // 16)  # Commonly d_state/16
+            # PLaMo2 dt_dim = max(64, hidden_size // 16)
+            hidden_size = hparams.get("hidden_size", 4096)
+            dt_dim = max(64, hidden_size // 16)
+            self.gguf_writer.add_ssm_time_step_rank(dt_dim)

         # Attention window parameters
         if "attention_window_size" in hparams:
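
Using the fallback values that appear in the hunk above, the two derived SSM parameters work out as follows; this is a worked example only, real checkpoints supply their own hparams:

# Worked example with the fallback values from the hunk above.
mamba_num_heads = 64        # hparams.get("mamba_num_heads", 64)
hidden_size_per_head = 128  # hparams.get("hidden_size_per_head", 128)
hidden_size = 4096          # hparams.get("hidden_size", 4096)

ssm_inner_size = mamba_num_heads * hidden_size_per_head  # 64 * 128 = 8192
dt_dim = max(64, hidden_size // 16)                      # max(64, 256) = 256

assert ssm_inner_size == 8192
assert dt_dim == 256
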
@@ -2273,6 +2307,24 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             # Reconstruct the name without the duplicate "layers"
             name = f"model.layers.{layer_num}.{rest}"

+        # Handle combined gate_up_proj tensor split
+        if name.endswith(".mlp.gate_up_proj.weight"):
+            # Split the combined gate_up tensor into separate gate and up tensors
+            # The tensor shape is (2 * intermediate_size, hidden_size)
+            # Split along dim 0 to get gate (first half) and up (second half)
+            intermediate_size = data_torch.shape[0] // 2
+            gate_weight = data_torch[:intermediate_size, :]
+            up_weight = data_torch[intermediate_size:, :]
+
+            # Map to the correct names
+            gate_name = self.map_tensor_name(name.replace("gate_up_proj", "gate_proj"))
+            up_name = self.map_tensor_name(name.replace("gate_up_proj", "up_proj"))
+
+            return [
+                (gate_name, gate_weight),
+                (up_name, up_weight)
+            ]
+
         # Handle Mamba-specific A_log tensor transformation
         if name.endswith(".A_log"):
             # Map the tensor name first
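
The gate_up_proj split added in the last hunk can be sanity-checked on a toy tensor. This sketch only mirrors the slicing pattern; the shapes are arbitrary and the tensor is random rather than a real checkpoint weight:

import torch

# Toy check of the gate_up split: a (2 * intermediate_size, hidden_size)
# tensor is cut along dim 0 into gate (first half) and up (second half).
hidden_size, intermediate_size = 8, 16
gate_up = torch.randn(2 * intermediate_size, hidden_size)

gate_weight = gate_up[:intermediate_size, :]
up_weight = gate_up[intermediate_size:, :]

assert gate_weight.shape == (intermediate_size, hidden_size)
assert up_weight.shape == (intermediate_size, hidden_size)
assert torch.equal(torch.cat([gate_weight, up_weight], dim=0), gate_up)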