@@ -6399,25 +6399,25 @@ class HunYuanMoEModel(TextModel):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        # FIX for tied embeddings: capture the token embeddings.
+        self._tok_embd = None
 
     def set_vocab(self):
         self._set_vocab_gpt2(load_merges=False)
+        # FIX for BOS token: manually set the correct BOS token ID.
+        # The SpecialVocab helper picks up an incorrect `bos_token_id: 1` from config.json.
+        self.gguf_writer.add_bos_token_id(127959)  # <|bos|>
 
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
-        tokens: list[str] = []
-        toktypes: list[int] = []
-
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
 
+        # Fake merges: reconstruct the BPE merge list from the tokenizer's mergeable ranks.
         merges = []
         mergeable_ranks = tokenizer.mergeable_ranks
         for token, rank in mergeable_ranks.items():
             if len(token) == 1:
                 continue
-            # bpe() will decompose the token into its smallest parts and then
-            # re-merge them. If the token is a valid merge, bpe() will return
-            # the two pieces that were merged to create it.
             merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
             if len(merged) == 2:
                 merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
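
For context on the merge reconstruction above: tiktoken-style tokenizers ship only a `mergeable_ranks` table (token bytes -> rank) and no explicit merge list, while GGUF's GPT-2-style vocab needs merge rules. `QwenModel.bpe` re-runs BPE on each token's bytes while allowing only merges of strictly lower rank; since the token's own merge is excluded, BPE stops one step short and leaves exactly the two pieces whose merge produces the token. A minimal standalone sketch of that idea (not the exact `QwenModel.bpe` implementation):

def recover_merge(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int) -> list[bytes]:
    # Split the token into single bytes, then greedily apply the
    # lowest-ranked adjacent merge, ignoring merges at or above max_rank
    # (i.e. the token's own merge and anything learned after it).
    parts = [bytes([b]) for b in token]
    while True:
        best: tuple[int, int] | None = None  # (rank, position) of the best pair
        for i in range(len(parts) - 1):
            rank = mergeable_ranks.get(parts[i] + parts[i + 1])
            if rank is not None and rank < max_rank and (best is None or rank < best[0]):
                best = (rank, i)
        if best is None:
            break
        _, i = best
        parts = parts[:i] + [parts[i] + parts[i + 1]] + parts[i + 2:]
    return parts  # exactly two parts -> the pair that merges into `token`

For example, with ranks {b'a': 0, b'b': 1, b'c': 2, b'ab': 3, b'abc': 4}, recover_merge(ranks, b'abc', 4) first merges to [b'ab', b'c'] and then stops, recovering the merge "ab c".
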
@@ -6472,16 +6472,22 @@ def set_gguf_parameters(self):
         rope_scaling = self.hparams.get("rope_scaling", {})
         if rope_scaling.get("type") == "dynamic":
             logger.warning("Model uses 'dynamic' rope scaling, which is not yet supported in GGUF. "
-                           "The resulting model may not work correctly with contexts longer than the training length.")
+                           "Long-context extrapolation will not work correctly. Setting rope scaling type to NONE.")
             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
-        else:
-            # Fallback for other potential scaling types.
-            # This part is inherited from TextModel and will handle standard rope_theta.
-            pass
 
     _experts: list[dict[str, Tensor]] | None = None
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # FIX for tied embeddings: capture the token embeddings.
+        if name == "model.embed_tokens.weight":
+            self._tok_embd = data_torch.clone()
+
+        # FIX for tied embeddings: skip the lm_head if it's tied.
+        if name == "lm_head.weight":
+            if self.hparams.get("tie_word_embeddings", False):
+                logger.info("Skipping tied output layer 'lm_head.weight'")
+                return []
+
         # process the experts separately
         if name.find("mlp.experts") != -1:
             n_experts = self.hparams["num_experts"]
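
The two tied-embeddings hunks cooperate: `modify_tensors` stashes `model.embed_tokens.weight` in `self._tok_embd` and drops `lm_head.weight` when `tie_word_embeddings` is set, so the runtime can reuse the token embeddings as the output layer (the `.clone()` presumably guards against the loader reusing the underlying buffer). A self-contained sketch of the same pattern outside the converter class (the `filter_tied_head` name and dict-based interface are illustrative, not llama.cpp API):

import torch

def filter_tied_head(tensors: dict[str, torch.Tensor], tie_word_embeddings: bool) -> dict[str, torch.Tensor]:
    # Keep a handle on the token embeddings; they double as the output
    # projection when the checkpoint ties the two matrices.
    tok_embd = tensors.get("model.embed_tokens.weight")
    out: dict[str, torch.Tensor] = {}
    for name, tensor in tensors.items():
        if name == "lm_head.weight":
            # Tied either by config flag or by being bit-identical to the embeddings.
            tied = tie_word_embeddings or (tok_embd is not None and torch.equal(tensor, tok_embd))
            if tied:
                continue  # skip the head; token_embd serves as the output layer
        out[name] = tensor
    return out
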