Commit 424c579

convert : support latest mistral-common (fix conversion with --mistral-format) (#17712)
* fix convert_hf_to_gguf.py failing with --mistral-format when newer mistral-common versions are installed.
* use get_one_valid_tokenizer_file from mistral-common when available and fall back to the old selection logic otherwise.
* pass the file name instead of the file path to get_one_valid_tokenizer_file.
* fix --mistral-format tokenizer file selection failing for tokenizers in subdirectories.
* move the get_one_valid_tokenizer_file import to avoid a nested try-except.
1 parent e9f9483 commit 424c579
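
The core of the change is a compatibility shim: the converter uses mistral-common's own get_one_valid_tokenizer_file when the installed version provides it, and only otherwise falls back to the old file-filtering logic. A quick way to check which path a local install will take is a probe like the one below (the printed messages are illustrative, not part of the commit):

# Probe whether the installed mistral-common exposes get_one_valid_tokenizer_file.
# Import path taken from the diff below; the messages are illustrative only.
try:
    from mistral_common.tokens.tokenizers.utils import get_one_valid_tokenizer_file
except ImportError:
    print("older mistral-common: the converter falls back to _filter_valid_tokenizer_files")
else:
    print("newer mistral-common: the converter uses get_one_valid_tokenizer_file")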

File tree: 1 file changed, +30 -16 lines

gguf-py/gguf/vocab.py (30 additions, 16 deletions)

@@ -31,6 +31,14 @@
 else:
     _mistral_common_installed = True
 
+try:
+    from mistral_common.tokens.tokenizers.utils import (  # pyright: ignore[reportMissingImports]
+        get_one_valid_tokenizer_file,
+    )
+except ImportError:
+    # We still want the conversion to work with older mistral-common versions.
+    get_one_valid_tokenizer_file = None
+
 
 import gguf
 
@@ -673,32 +681,38 @@ def __init__(self, base_path: Path):
 
         # Find the tokenizer files
         all_files = [f.as_posix() for f in base_path.glob("**/*") if f.is_file()]
-        valid_tokenizer_files = _filter_valid_tokenizer_files(all_files)
-
-        if len(valid_tokenizer_files) == 0:
-            raise ValueError(f"No tokenizer file found in the directory: {base_path}")
-        # If there are multiple tokenizer files, we use tekken.json if it exists, otherwise the versioned one.
-        if len(valid_tokenizer_files) > 1:
-            if "tekken.json" in valid_tokenizer_files:
-                tokenizer_file = "tekken.json"
-            else:
-                tokenizer_file = sorted(valid_tokenizer_files)[-1]
-            logger.warning(
-                f"Multiple tokenizer files found in {base_path}. Using {tokenizer_file}"
-            )
+
+        if get_one_valid_tokenizer_file is not None:
+            tokenizer_file_path = get_one_valid_tokenizer_file(all_files)
         else:
-            tokenizer_file = valid_tokenizer_files[0]
+            valid_tokenizer_files = _filter_valid_tokenizer_files(all_files)
+
+            if len(valid_tokenizer_files) == 0:
+                raise ValueError(f"No tokenizer file found in the directory: {base_path}")
+            # If there are multiple tokenizer files, we use tekken.json if it exists, otherwise the versioned one.
+            if len(valid_tokenizer_files) > 1:
+                if "tekken.json" in valid_tokenizer_files:
+                    tokenizer_file = "tekken.json"
+                else:
+                    tokenizer_file = sorted(valid_tokenizer_files)[-1]
+                logger.warning(
+                    f"Multiple tokenizer files found in {base_path}. Using {tokenizer_file}"
+                )
+            else:
+                tokenizer_file = valid_tokenizer_files[0]
+
+            tokenizer_file_path = base_path / tokenizer_file
 
         self.tokenizer = MistralTokenizer.from_file(
-            base_path / tokenizer_file
+            tokenizer_file_path
         ).instruct_tokenizer.tokenizer
         self.tokenizer_type = (
             MistralTokenizerType.tekken
             if isinstance(self.tokenizer, Tekkenizer)
             else MistralTokenizerType.spm
         )
         self.vocab_size = self.tokenizer.n_words
-        self.fname_tokenizer = base_path / tokenizer_file
+        self.fname_tokenizer = tokenizer_file_path
         self._name = (
             "mistral-" + self.tokenizer_type.value + "-" + self.tokenizer.version
         )
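
For reference, the fallback branch keeps the previous selection rule: prefer tekken.json when several candidates exist, otherwise take the lexicographically last (i.e. highest-versioned) file. A minimal standalone sketch of that rule, with a hypothetical helper name and made-up file names (not part of this commit):

# Standalone sketch of the fallback selection rule (hypothetical helper, illustrative inputs).
def pick_tokenizer_file(valid_tokenizer_files: list[str]) -> str:
    if len(valid_tokenizer_files) == 0:
        raise ValueError("No tokenizer file found")
    if "tekken.json" in valid_tokenizer_files:
        return "tekken.json"  # tekken.json wins when present
    return sorted(valid_tokenizer_files)[-1]  # otherwise the lexicographically last (versioned) file

print(pick_tokenizer_file(["tokenizer.model.v3", "tekken.json"]))         # tekken.json
print(pick_tokenizer_file(["tokenizer.model.v1", "tokenizer.model.v3"]))  # tokenizer.model.v3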
