Skip to content

Commit 42489f5

Browse files
committed
add default chat template
1 parent 63002a0 commit 42489f5

File tree

2 files changed

+46
-15
lines changed

2 files changed

+46
-15
lines changed

convert_hf_to_gguf.py

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,12 @@
2929
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
3030
import gguf
3131
from gguf.vocab import MistralTokenizerType, MistralVocab
32+
from mistral_common.tokens.tokenizers.base import TokenizerVersion
3233
from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN, DATASET_STD
34+
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
35+
from mistral_common.tokens.tokenizers.sentencepiece import (
36+
SentencePieceTokenizer,
37+
)
3338

3439

3540
logger = logging.getLogger("hf-to-gguf")
@@ -110,13 +115,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
110115
def get_remote_tensors() -> Iterator[tuple[str, Tensor]]:
111116
logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}")
112117

113-
if not self.is_mistral_format:
114-
remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
115-
116-
else:
117-
url = f"{gguf.utility.SafetensorRemote.BASE_DOMAIN}/{remote_hf_model_id}/resolve/main/consolidated.safetensors"
118-
remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors(url)
119-
118+
remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
120119
self.tensor_names = set(name for name in remote_tensors.keys())
121120
for name, remote_tensor in remote_tensors.items():
122121
yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor))
@@ -1993,6 +1992,11 @@ def _set_vocab_mistral(self):
19931992
self.gguf_writer.add_add_bos_token(True)
19941993
self.gguf_writer.add_add_eos_token(False)
19951994

1995+
template_dir = Path(__file__).parent / "models/templates/"
1996+
1997+
template = MistralModel.get_community_chat_template(vocab, template_dir)
1998+
self.gguf_writer.add_chat_template(template)
1999+
19962000
def set_vocab(self):
19972001
if self.is_mistral_format:
19982002
return self._set_vocab_mistral()
@@ -2002,12 +2006,6 @@ def set_vocab(self):
20022006
if path_tekken_json.is_file() and not path_tokenizer_json.is_file():
20032007
self._set_vocab_mistral()
20042008

2005-
script_dir = Path(__file__).parent
2006-
template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja"
2007-
with open(template_path, "r", encoding="utf-8") as f:
2008-
template = f.read()
2009-
self.gguf_writer.add_chat_template(template)
2010-
20112009
try:
20122010
self._set_vocab_sentencepiece()
20132011
except FileNotFoundError:
@@ -2038,7 +2036,7 @@ def set_vocab(self):
20382036

20392037
# Apply to granite small models only
20402038
if self.hparams.get("vocab_size", 32000) == 49152:
2041-
self.gguf_writer.add_add_bos_token(False)
2039+
self.gguf_writer.add_add_bos_token(False)
20422040

20432041
def set_gguf_parameters(self):
20442042
super().set_gguf_parameters()
@@ -7820,6 +7818,39 @@ class MistralModel(LlamaModel):
78207818
is_mistral_format = True
78217819
undo_permute = False
78227820

7821+
@staticmethod
def get_community_chat_template(vocab: MistralVocab, templates_dir: Path) -> str:
    """Return the chat template for a Mistral tokenizer *vocab*.

    Tokenizer versions v1/v3/v7 map to short community template aliases
    (e.g. ``"mistral-v7-tekken"``); v11 and v13 load a full Jinja template
    file from *templates_dir*.

    Args:
        vocab: The Mistral vocabulary wrapper; its ``tokenizer`` must be a
            ``Tekkenizer`` or ``SentencePieceTokenizer``.
        templates_dir: Directory containing the ``.jinja`` template files.

    Returns:
        Either a community template alias string or the full template text.

    Raises:
        ValueError: If the tokenizer version/type combination is unknown.
        FileNotFoundError: If the expected ``.jinja`` file is missing.
    """
    assert TokenizerVersion is not None, "mistral_common is not installed"
    assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), (
        f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}"
    )

    version = vocab.tokenizer.version
    tok_type = vocab.tokenizer_type

    if version == TokenizerVersion.v1:
        return "mistral-v1"
    elif version == TokenizerVersion.v3 and tok_type == MistralTokenizerType.spm:
        return "mistral-v3"
    elif version == TokenizerVersion.v3 and tok_type == MistralTokenizerType.tekken:
        return "mistral-v3-tekken"
    elif version == TokenizerVersion.v7 and tok_type == MistralTokenizerType.spm:
        return "mistral-v7"
    elif version == TokenizerVersion.v7 and tok_type == MistralTokenizerType.tekken:
        return "mistral-v7-tekken"
    elif version == TokenizerVersion.v11:
        template_file = "Mistral-Small-3.2-24B-Instruct-2506.jinja"
    elif version == TokenizerVersion.v13:
        template_file = "unsloth-mistral-Devstral-Small-2507.jinja"
    else:
        # Fix: this branch is reached on an unrecognised *version* (possibly
        # combined with an unexpected type), but the original message only
        # reported the tokenizer type — report both for a useful diagnostic.
        raise ValueError(
            f"Unknown tokenizer version/type: {version}/{tok_type}"
        )

    template_path = templates_dir / template_file
    if not template_path.exists():
        raise FileNotFoundError(f"Template file not found: {template_path}")

    # read_text handles open/close; templates are UTF-8 Jinja files.
    return template_path.read_text(encoding="utf-8")
7853+
78237854

78247855
class PixtralModel(LlavaVisionModel):
78257856
model_name = "Pixtral"

gguf-py/gguf/vocab.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@
2525
except ImportError:
2626
_mistral_common_installed = False
2727
MistralTokenizer = None
28-
Tekkenizer = None
2928
SentencePieceTokenizer = None
29+
Tekkenizer = None
3030
_filter_valid_tokenizer_files = None
3131
else:
3232
_mistral_common_installed = True

0 commit comments

Comments
 (0)