2929 sys .path .insert (1 , str (Path (__file__ ).parent / 'gguf-py' ))
3030import gguf
3131from gguf .vocab import MistralTokenizerType , MistralVocab
32+ from mistral_common .tokens .tokenizers .base import TokenizerVersion
3233from mistral_common .tokens .tokenizers .multimodal import DATASET_MEAN , DATASET_STD
34+ from mistral_common .tokens .tokenizers .tekken import Tekkenizer
35+ from mistral_common .tokens .tokenizers .sentencepiece import (
36+ SentencePieceTokenizer ,
37+ )
3338
3439
3540logger = logging .getLogger ("hf-to-gguf" )
@@ -110,13 +115,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
def get_remote_tensors() -> Iterator[tuple[str, Tensor]]:
    """Lazily yield (name, tensor) pairs fetched from the remote HF model."""
    logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}")

    # Enumerate the remote safetensors once, record the full name set,
    # then hand back lazy wrappers so nothing is downloaded eagerly.
    remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
    self.tensor_names = set(remote_tensors)
    for tensor_name, remote_tensor in remote_tensors.items():
        yield tensor_name, LazyTorchTensor.from_remote_tensor(remote_tensor)
@@ -1993,6 +1992,11 @@ def _set_vocab_mistral(self):
19931992 self .gguf_writer .add_add_bos_token (True )
19941993 self .gguf_writer .add_add_eos_token (False )
19951994
1995+ template_dir = Path (__file__ ).parent / "models/templates/"
1996+
1997+ template = MistralModel .get_community_chat_template (vocab , template_dir )
1998+ self .gguf_writer .add_chat_template (template )
1999+
19962000 def set_vocab (self ):
19972001 if self .is_mistral_format :
19982002 return self ._set_vocab_mistral ()
@@ -2002,12 +2006,6 @@ def set_vocab(self):
20022006 if path_tekken_json .is_file () and not path_tokenizer_json .is_file ():
20032007 self ._set_vocab_mistral ()
20042008
2005- script_dir = Path (__file__ ).parent
2006- template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja"
2007- with open (template_path , "r" , encoding = "utf-8" ) as f :
2008- template = f .read ()
2009- self .gguf_writer .add_chat_template (template )
2010-
20112009 try :
20122010 self ._set_vocab_sentencepiece ()
20132011 except FileNotFoundError :
@@ -2038,7 +2036,7 @@ def set_vocab(self):
20382036
20392037 # Apply to granite small models only
20402038 if self .hparams .get ("vocab_size" , 32000 ) == 49152 :
2041- self .gguf_writer .add_add_bos_token (False )
2039+ self .gguf_writer .add_add_bos_token (False )
20422040
20432041 def set_gguf_parameters (self ):
20442042 super ().set_gguf_parameters ()
@@ -7820,6 +7818,39 @@ class MistralModel(LlamaModel):
78207818 is_mistral_format = True
78217819 undo_permute = False
78227820
7821+ @staticmethod
7822+ def get_community_chat_template (vocab : MistralVocab , templates_dir : Path ):
7823+ assert TokenizerVersion is not None , "mistral_common is not installed"
7824+ assert isinstance (vocab .tokenizer , (Tekkenizer , SentencePieceTokenizer )), (
7825+ f"Expected Tekkenizer or SentencePieceTokenizer, got { type (vocab .tokenizer )} "
7826+ )
7827+
7828+ if vocab .tokenizer .version == TokenizerVersion .v1 :
7829+ return "mistral-v1"
7830+ elif vocab .tokenizer .version == TokenizerVersion .v3 and vocab .tokenizer_type == MistralTokenizerType .spm :
7831+ return "mistral-v3"
7832+ elif vocab .tokenizer .version == TokenizerVersion .v3 and vocab .tokenizer_type == MistralTokenizerType .tekken :
7833+ return "mistral-v3-tekken"
7834+ elif vocab .tokenizer .version == TokenizerVersion .v7 and vocab .tokenizer_type == MistralTokenizerType .spm :
7835+ return "mistral-v7"
7836+ elif vocab .tokenizer .version == TokenizerVersion .v7 and vocab .tokenizer_type == MistralTokenizerType .tekken :
7837+ return "mistral-v7-tekken"
7838+ elif vocab .tokenizer .version == TokenizerVersion .v11 :
7839+ template_file = "Mistral-Small-3.2-24B-Instruct-2506.jinja"
7840+ elif vocab .tokenizer .version == TokenizerVersion .v13 :
7841+ template_file = "unsloth-mistral-Devstral-Small-2507.jinja"
7842+ else :
7843+ raise ValueError (f"Unknown tokenizer type: { vocab .tokenizer_type } " )
7844+
7845+ template_path = templates_dir / template_file
7846+ if not template_path .exists ():
7847+ raise FileNotFoundError (f"Template file not found: { template_path } " )
7848+
7849+ with open (template_path , "r" , encoding = "utf-8" ) as f :
7850+ template = f .read ()
7851+
7852+ return template
7853+
78237854
78247855class PixtralModel (LlavaVisionModel ):
78257856 model_name = "Pixtral"
0 commit comments