@@ -180,7 +180,8 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
             extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
             missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
             if len(extra) == 0 and len(missing_files) > 0:
-                raise ValueError(f"Missing or incomplete model files: {missing_files}")
+                raise ValueError(f"Missing or incomplete model files: {missing_files}\n"
+                                 f"Missing tensors: {missing}")
             else:
                 raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
                                  f"Missing tensors: {missing}\n"
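Note: the old message only named the shard files; the new one also lists the tensor names the weight map promised but that never appeared in any part. A small self-contained sketch of that bookkeeping, with hypothetical shard and tensor names rather than the converter's actual state:

```python
# Toy illustration: a safetensors-style weight_map lets the converter name the
# files that should have held the tensors it could not find.
weight_map = {
    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
    "lm_head.weight": "model-00002-of-00002.safetensors",
}
expected = set(weight_map)                 # tensors the index promises
found = {"model.embed_tokens.weight"}      # tensors actually read from the parts

missing = sorted(expected - found)
extra = sorted(found - expected)
missing_files = sorted({weight_map[n] for n in missing if n in weight_map})

if len(extra) == 0 and len(missing_files) > 0:
    print(f"Missing or incomplete model files: {missing_files}\n"
          f"Missing tensors: {missing}")
```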
@@ -528,6 +529,8 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()

+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
@@ -537,13 +540,13 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
                 if token in added_vocab:
                     # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
                     # To avoid unexpected issues - we make sure to normalize non-normalized tokens
-                    if not tokenizer.added_tokens_decoder[i].normalized:
+                    if not added_tokens_decoder[i].normalized:
                         previous_token = token
                         token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
                         if previous_token != token:
                             logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")

-                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
                         toktypes.append(gguf.TokenType.CONTROL)
                     else:
                         # NOTE: this was added for Gemma.
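Note: both `get_vocab_base` hunks read `tokenizer.added_tokens_decoder` once into a local before the vocab loop instead of touching the attribute for every token id; on Hugging Face tokenizers it is a property, so repeated access can rebuild the mapping each time. A hedged sketch of the caching pattern; `AddedTokenStub` and `TokenizerStub` are stand-ins, not transformers classes:

```python
from dataclasses import dataclass

@dataclass
class AddedTokenStub:
    content: str
    normalized: bool = False
    special: bool = False

class TokenizerStub:
    @property
    def added_tokens_decoder(self) -> dict[int, AddedTokenStub]:
        # Rebuilt on every access, which is why the converter caches it.
        return {
            32000: AddedTokenStub("<pad>", special=True),
            32001: AddedTokenStub("<custom>", normalized=True),
        }

tokenizer = TokenizerStub()
added_tokens_decoder = tokenizer.added_tokens_decoder  # read the property once

for i in (32000, 32001):
    tok = added_tokens_decoder[i]            # cheap dict lookups inside the loop
    toktype = "CONTROL" if tok.special else "USER_DEFINED"
    print(i, tok.content, toktype)
```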
@@ -1099,13 +1102,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

         tensors.append((self.map_tensor_name(name), data_torch))

-        if name == "word_embeddings.weight":
-            assert self.tensor_names is not None
-
-            # TODO: tie them at runtime, don't duplicate in the model file
-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
         return tensors


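Note: the removed block copied `word_embeddings.weight` into the output tensor whenever the checkpoint had no separate `lm_head.weight`/`output.weight`; the TODO it carried is now honored by tying the tensors at load time instead of duplicating data in the converted file. A rough sketch of that runtime fallback; the loader function and tensor dict are hypothetical, only the GGUF tensor names are taken from the diff:

```python
# Hypothetical loader-side fallback: if no explicit output matrix was written,
# reuse the token embedding matrix, i.e. treat the weights as tied.
def resolve_output_weight(tensors: dict[str, list[list[float]]]) -> list[list[float]]:
    return tensors.get("output.weight", tensors["token_embd.weight"])

model = {"token_embd.weight": [[0.1, 0.2], [0.3, 0.4]]}
print(resolve_output_weight(model))  # the embeddings are reused as the output head
```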
@@ -1747,6 +1743,25 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("Mistral3ForConditionalGeneration")
+class Mistral3Model(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA
+
+    # we need to merge the text_config into the root level of hparams
+    def __init__(self, *args, **kwargs):
+        hparams = Model.load_hparams(kwargs["dir_model"])
+        if "text_config" in hparams:
+            hparams = {**hparams, **hparams["text_config"]}
+        kwargs["hparams"] = hparams
+        super().__init__(*args, **kwargs)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        name = name.replace("language_model.", "")
+        if "multi_modal_projector" in name or "vision_tower" in name:
+            return []
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @Model.register("DeciLMForCausalLM")
 class DeciModel(Model):
     model_arch = gguf.MODEL_ARCH.DECI
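Note: `Mistral3ForConditionalGeneration` is a multimodal wrapper, so the language-model hyperparameters live under `text_config` and the projector/vision tensors are not wanted in a LLAMA-arch GGUF. The `__init__` merge can be pictured with a toy config; the values below are invented for illustration:

```python
# Toy config.json contents; the numbers are made up for illustration.
hparams = {
    "model_type": "mistral3",
    "text_config": {"hidden_size": 5120, "num_hidden_layers": 40},
    "vision_config": {"hidden_size": 1024},
}

# Same merge as Mistral3Model.__init__ above: hoist text_config to the root
# level so the rest of the converter can read hidden_size etc. directly.
if "text_config" in hparams:
    hparams = {**hparams, **hparams["text_config"]}

print(hparams["hidden_size"])         # 5120
print(hparams["num_hidden_layers"])   # 40
```

`modify_tensors` then strips the `language_model.` prefix and drops anything under `multi_modal_projector`/`vision_tower`, so only the text model reaches LlamaModel's tensor mapping.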
@@ -2404,10 +2419,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

         tensors.append((new_name, data_torch))

-        # note: GPT2 output is tied to (same as) wte in original model
-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
         return tensors


@@ -2737,21 +2748,26 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)

+    _has_tok_embd = False
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused

-        new_name = self.map_tensor_name(name)
-
-        tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)]
+        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
+        tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)

-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
-            assert self.tensor_names is not None
+        new_name = self.map_tensor_name(name)

-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
-                # copy tok_embd.weight to output.weight
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
+        # assuming token_embd.weight is seen before output.weight
+        if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
+            # even though the tensor file(s) does not contain the word embeddings they are still in the weight map
+            if self.tensor_names and "transformer.wte.weight" in self.tensor_names:
+                logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied")
+                self.tensor_names.remove("transformer.wte.weight")
+        elif new_name == tok_embd_name:
+            self._has_tok_embd = True

-        return tensors
+        return [(new_name, data_torch)]


 @Model.register("InternLM2ForCausalLM")
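Note: the CodeShell rewrite stops duplicating the embedding matrix and instead fixes the bookkeeping. Tied checkpoints list `transformer.wte.weight` in the weight map even though the shards never store it, so when the output tensor arrives before any token embedding the name is dropped from `self.tensor_names` to keep the completeness check in `get_tensors` from flagging it as missing. A small sketch of that accounting; the tensor names are illustrative and the ordering is assumed as in the comment in the hunk:

```python
# Names promised by the weight map vs. tensors actually streamed from disk.
tensor_names = {"transformer.wte.weight", "lm_head.weight"}
has_tok_embd = False

# Mapped tensor names in the order they arrive from the shard files.
for new_name in ("output.weight",):
    if not has_tok_embd and new_name == "output.weight":
        # Embeddings are tied: present in the weight map, absent on disk,
        # so forget the entry before the final completeness check runs.
        tensor_names.discard("transformer.wte.weight")
    elif new_name == "token_embd.weight":
        has_tok_embd = True

print(sorted(tensor_names))  # ['lm_head.weight'] -> no spurious "missing tensor" error
```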