@@ -31,6 +31,14 @@
 else:
     _mistral_common_installed = True
 
+try:
+    from mistral_common.tokens.tokenizers.utils import (  # pyright: ignore[reportMissingImports]
+        get_one_valid_tokenizer_file,
+    )
+except ImportError:
+    # We still want the conversion to work with older mistral-common versions.
+    get_one_valid_tokenizer_file = None
+
 
 import gguf
 
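The first hunk guards the import so conversion keeps working against mistral-common releases that predate get_one_valid_tokenizer_file. A standalone sketch of how to check which path a given installation will take (illustration only, not part of the patch):

    try:
        from mistral_common.tokens.tokenizers.utils import get_one_valid_tokenizer_file
        print("newer mistral-common: tokenizer-file selection is delegated to the library")
    except ImportError:
        print("older mistral-common: the converter falls back to its local heuristic")
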
@@ -673,32 +681,38 @@ def __init__(self, base_path: Path):
 
         # Find the tokenizer files
         all_files = [f.as_posix() for f in base_path.glob("**/*") if f.is_file()]
-        valid_tokenizer_files = _filter_valid_tokenizer_files(all_files)
-
-        if len(valid_tokenizer_files) == 0:
-            raise ValueError(f"No tokenizer file found in the directory: {base_path}")
-        # If there are multiple tokenizer files, we use tekken.json if it exists, otherwise the versioned one.
-        if len(valid_tokenizer_files) > 1:
-            if "tekken.json" in valid_tokenizer_files:
-                tokenizer_file = "tekken.json"
-            else:
-                tokenizer_file = sorted(valid_tokenizer_files)[-1]
-            logger.warning(
-                f"Multiple tokenizer files found in {base_path}. Using {tokenizer_file}"
-            )
+
+        if get_one_valid_tokenizer_file is not None:
+            tokenizer_file_path = get_one_valid_tokenizer_file(all_files)
         else:
-            tokenizer_file = valid_tokenizer_files[0]
+            valid_tokenizer_files = _filter_valid_tokenizer_files(all_files)
+
+            if len(valid_tokenizer_files) == 0:
+                raise ValueError(f"No tokenizer file found in the directory: {base_path}")
+            # If there are multiple tokenizer files, we use tekken.json if it exists, otherwise the versioned one.
+            if len(valid_tokenizer_files) > 1:
+                if "tekken.json" in valid_tokenizer_files:
+                    tokenizer_file = "tekken.json"
+                else:
+                    tokenizer_file = sorted(valid_tokenizer_files)[-1]
+                logger.warning(
+                    f"Multiple tokenizer files found in {base_path}. Using {tokenizer_file}"
+                )
+            else:
+                tokenizer_file = valid_tokenizer_files[0]
+
+            tokenizer_file_path = base_path / tokenizer_file
 
         self.tokenizer = MistralTokenizer.from_file(
-            base_path / tokenizer_file
+            tokenizer_file_path
        ).instruct_tokenizer.tokenizer
         self.tokenizer_type = (
             MistralTokenizerType.tekken
             if isinstance(self.tokenizer, Tekkenizer)
             else MistralTokenizerType.spm
         )
         self.vocab_size = self.tokenizer.n_words
-        self.fname_tokenizer = base_path / tokenizer_file
+        self.fname_tokenizer = tokenizer_file_path
         self._name = (
             "mistral-" + self.tokenizer_type.value + "-" + self.tokenizer.version
         )
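
For readers skimming the second hunk, the fallback branch's selection heuristic in isolation: fail if nothing matched, prefer tekken.json when several candidates exist, otherwise take the lexicographically last (highest-versioned) name. A minimal sketch assuming _filter_valid_tokenizer_files returns file names relative to base_path (hypothetical standalone form; the real logic lives inside __init__ above):

    from pathlib import Path

    def pick_tokenizer_file(base_path: Path, valid_tokenizer_files: list[str]) -> Path:
        # Mirrors the fallback branch above (warning log omitted).
        if len(valid_tokenizer_files) == 0:
            raise ValueError(f"No tokenizer file found in the directory: {base_path}")
        if len(valid_tokenizer_files) > 1:
            if "tekken.json" in valid_tokenizer_files:
                return base_path / "tekken.json"
            # Versioned names sort so that the newest comes last.
            return base_path / sorted(valid_tokenizer_files)[-1]
        return base_path / valid_tokenizer_files[0]

    # Example: pick_tokenizer_file(Path("model"), ["tokenizer.model.v3", "tekken.json"])
    # returns Path("model/tekken.json").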