Skip to content

Commit 20fc380

Browse files
authored
convert : fix gemma v1 tokenizer convert (#8248)
ggml-ci
1 parent f619024 commit 20fc380

28 files changed

+85 -4 lines changed

convert-hf-to-gguf-update.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -86,6 +86,8 @@ class TOKENIZER_TYPE(IntEnum):
8686
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
8787
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
8888
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
89+
{"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
90+
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
8991
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
9092
]
9193

@@ -273,7 +275,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
273275
"3333333",
274276
"33333333",
275277
"333333333",
276-
# "Cửa Việt", # llama-bpe fails on this
278+
"Cửa Việt", # llama-bpe fails on this
279+
" discards",
277280
chktxt,
278281
]
279282

convert-hf-to-gguf.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2316,6 +2316,8 @@ def set_vocab(self):
23162316
special_vocab._set_special_token("eot", 107)
23172317
special_vocab.add_to_gguf(self.gguf_writer)
23182318

2319+
self.gguf_writer.add_add_space_prefix(False)
2320+
23192321
def set_gguf_parameters(self):
23202322
hparams = self.hparams
23212323
block_count = hparams["num_hidden_layers"]
@@ -2366,6 +2368,7 @@ def set_vocab(self):
23662368

23672369
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
23682370
special_vocab.add_to_gguf(self.gguf_writer)
2371+
23692372
self.gguf_writer.add_add_space_prefix(False)
23702373

23712374
def set_gguf_parameters(self):

models/ggml-vocab-bert-bge.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -91,6 +91,10 @@ __ggml_vocab_test__
9191
__ggml_vocab_test__
9292
333333333
9393
__ggml_vocab_test__
94+
Cửa Việt
95+
__ggml_vocab_test__
96+
discards
97+
__ggml_vocab_test__
9498

9599

96100

models/ggml-vocab-bert-bge.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,4 +40,6 @@
4040
21211 22394 22394
4141
21211 22394 22394 2509
4242
21211 22394 22394 22394
43+
12731 2050 19710
44+
5860 18117
4345
100 1006 3671 1007 100 1006 3674 7861 29147 2483 9530 16280 23854 1007 100 100 1017 3943 21211 21211 2509 21211 22394 21211 22394 2509 21211 22394 22394 21211 22394 22394 2509 1017 1012 1017 1017 1012 1012 1017 1017 1012 1012 1012 1017 100 1029 1855 100 100 6207 100 100 14677 23632 22203 1811 1995 1011 1011 1011 1011 1011 1011 1027 1027 1027 1027 1027 1027 1027 1192 15290 29754 14150 1192 10260 1181 29755 29436 29741 10260 16856 29747 23925 10325 1005 1005 1005 1005 1005 1005 1036 1036 1036 1036 1036 1036 1036 1000 1000 1000 1000 1012 1012 1012 1012 1012 1012 999 999 999 999 999 999 1029 1029 1029 1029 1029 1029 1045 1005 2310 2042 1005 2409 2002 1005 1055 2045 1010 1005 2128 2017 2469 1029 1005 1049 2025 2469 1045 1005 2222 2191 2009 1010 1005 1040 2017 2066 2070 5572 1029 2057 1005 2310 1037 1005 2222

models/ggml-vocab-command-r.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -91,6 +91,10 @@ __ggml_vocab_test__
9191
__ggml_vocab_test__
9292
333333333
9393
__ggml_vocab_test__
94+
Cửa Việt
95+
__ggml_vocab_test__
96+
discards
97+
__ggml_vocab_test__
9498

9599

96100

models/ggml-vocab-command-r.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,4 +40,6 @@
4040
26 26 26 26 26 26 26
4141
26 26 26 26 26 26 26 26
4242
26 26 26 26 26 26 26 26 26
43+
42 30719 12584
44+
3642 4388
4345
127731 51628 205 57788 18494 97469 126134 206 2226 256 230 1737 18258 16 80503 122 35927 2226 242 112 57462 1737 54457 223165 106230 2096 16 48389 11254 107 255 2226 107 255 228 26 228 26 26 228 26 26 26 228 26 26 26 26 228 26 26 26 26 26 228 26 26 26 26 26 26 228 26 26 26 26 26 26 26 228 26 26 26 26 26 26 26 26 228 26 21 26 228 26 2271 26 228 26 3834 26 182018 230 174833 38111 249 86325 241 38111 245 86325 232 38111 252 38111 123 38111 261 165 24629 38111 261 38111 103 174833 38111 235 188568 231 5691 12081 13336 2648 29325 14315 24 26 24 27 24 28 24 5123 18372 8391 158343 3512 40071 2196 3236 8750 1764 37097 41168 29721 32797 25646 3802 4975 4975 116167 57178 10251 154048 27292 1767 5125 2632 2155 91 2378 1919 1914 2782 19 2155 3354 1933 5470 38 2155 52 2068 5470 1767 4961 3059 1894 19 2155 43 1933 3026 2725 23186 38 2930 14 20676 1671 14 83 51

models/ggml-vocab-deepseek-coder.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -91,6 +91,10 @@ __ggml_vocab_test__
9191
__ggml_vocab_test__
9292
333333333
9393
__ggml_vocab_test__
94+
Cửa Việt
95+
__ggml_vocab_test__
96+
discards
97+
__ggml_vocab_test__
9498

9599

96100

models/ggml-vocab-deepseek-coder.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,4 +40,6 @@
4040
18 18 18 18 18 18 18
4141
18 18 18 18 18 18 18 18
4242
18 18 18 18 18 18 18 18 18
43+
34 155 119 242 64 24297 155 119 216 83
44+
1607 2539
4345
185 207 185 185 207 185 185 185 207 12405 459 22758 185 243 185 315 185 251 185 730 185 10047 235 209 334 8760 8 12394 233 114 350 222 10047 221 104 169 116 224 334 4684 3909 992 24330 262 29651 612 8 207 156 237 214 12394 99 234 10047 99 234 207 18 207 18 18 207 18 18 18 207 18 18 18 18 207 18 18 18 18 18 207 18 18 18 18 18 18 207 18 18 18 18 18 18 18 207 18 18 18 18 18 18 18 18 207 18 13 18 207 18 524 18 207 18 1202 18 207 155 239 209 155 239 114 155 239 228 155 240 220 155 239 224 155 240 211 155 239 231 155 239 115 155 239 240 155 240 210 155 239 240 155 239 95 155 239 114 155 239 214 10047 233 210 3015 19100 608 9413 2668 16 18 16 19 16 20 16 1393 169 121 239 18155 374 17194 28 2861 6478 616 2251 14994 31269 4191 6 4686 4686 10252 3358 3358 3409 524 15330 3023 15031 5668 303 6 312 798 651 83 839 362 6 82 741 11 651 1369 340 2037 30 651 44 441 2037 303 6 642 1098 359 11 651 35 340 833 738 10860 30 998 6 10709 245 6 75 43

models/ggml-vocab-deepseek-llm.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -91,6 +91,10 @@ __ggml_vocab_test__
9191
__ggml_vocab_test__
9292
333333333
9393
__ggml_vocab_test__
94+
Cửa Việt
95+
__ggml_vocab_test__
96+
discards
97+
__ggml_vocab_test__
9498

9599

96100

models/ggml-vocab-deepseek-llm.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,4 +40,6 @@
4040
18 18 18 18 18 18 18
4141
18 18 18 18 18 18 18 18
4242
18 18 18 18 18 18 18 18 18
43+
34 32555 242 64 23708 32555 216 83
44+
1763 2550
4345
185 207 185 185 207 185 185 185 207 11969 486 22504 185 243 185 300 185 251 185 663 185 10044 95300 334 8754 8 33701 114 350 222 10044 221 104 46713 334 34732 996 24250 262 80923 8 207 37103 214 12356 99 234 10044 99 234 207 18 207 18 18 207 18 18 18 207 18 18 18 18 207 18 18 18 18 18 207 18 18 18 18 18 18 207 18 18 18 18 18 18 18 207 18 18 18 18 18 18 18 18 207 18 13 18 207 18 526 18 207 18 1204 18 207 71374 209 71374 114 71374 228 155 240 220 71374 224 155 240 211 71374 231 71374 115 71374 240 155 240 210 71374 240 71374 95 71374 114 71374 214 71899 210 3025 19017 612 9407 2681 16 18 16 19 16 20 16 1398 68940 239 78827 55170 76659 620 91754 31116 36804 4885 4885 10897 4390 4390 41047 15278 3033 14986 5675 304 6 313 803 655 33326 362 6 82 745 11 655 1374 340 2049 30 655 44 441 2049 304 6 647 1099 359 11 655 35 340 837 742 10842 30 1003 6 10699 245 6 75 43

0 commit comments

Comments
 (0)