Skip to content

Commit ac5449b

Browse files
committed
update tokenizer out
1 parent 16247c4 commit ac5449b

File tree

107 files changed

+4787
-95
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

107 files changed

+4787
-95
lines changed

convert_hf_to_gguf.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -761,7 +761,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
761761
if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
762762
# ref: https://huggingface.co/facebook/chameleon-7b
763763
res = "chameleon"
764-
if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
764+
if chkhsh == "68fa7e0a33050885cc10a2acfa4df354042188f0afa03b809f7a71c4cde6e373":
765765
# ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
766766
res = "minerva-7b"
767767
if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
@@ -794,10 +794,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
794794
if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
795795
# ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
796796
res = "llama4"
797-
if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
798-
# ref: https://huggingface.co/THUDM/glm-4-9b-chat
799-
res = "chatglm-bpe"
800-
if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
797+
if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
801798
# ref: https://huggingface.co/THUDM/glm-4-9b-hf
802799
res = "glm4"
803800
if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,7 @@ def get_existing_models(convert_py):
247247
else:
248248
# otherwise, compute the hash of the tokenizer
249249
try:
250+
logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
250251
if name == "t5":
251252
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
252253
else:
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Äpfel
4+
__ggml_vocab_test__
5+
6+
__ggml_vocab_test__
7+
8+
__ggml_vocab_test__
9+
10+
__ggml_vocab_test__
11+
12+
__ggml_vocab_test__
13+
14+
__ggml_vocab_test__
15+
16+
17+
__ggml_vocab_test__
18+
19+
20+
21+
__ggml_vocab_test__
22+
23+
24+
25+
26+
__ggml_vocab_test__
27+
28+
29+
__ggml_vocab_test__
30+
Hello world
31+
__ggml_vocab_test__
32+
Hello world
33+
__ggml_vocab_test__
34+
Hello World
35+
__ggml_vocab_test__
36+
Hello World
37+
__ggml_vocab_test__
38+
Hello World!
39+
__ggml_vocab_test__
40+
Hello, world!
41+
__ggml_vocab_test__
42+
Hello, world!
43+
__ggml_vocab_test__
44+
this is 🦙.cpp
45+
__ggml_vocab_test__
46+
w048 7tuijk dsdfhu
47+
__ggml_vocab_test__
48+
нещо на Български
49+
__ggml_vocab_test__
50+
កាន់តែពិសេសអាចខលចេញ
51+
__ggml_vocab_test__
52+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
53+
__ggml_vocab_test__
54+
Hello
55+
__ggml_vocab_test__
56+
Hello
57+
__ggml_vocab_test__
58+
Hello
59+
__ggml_vocab_test__
60+
Hello
61+
__ggml_vocab_test__
62+
Hello
63+
__ggml_vocab_test__
64+
Hello
65+
Hello
66+
__ggml_vocab_test__
67+
(
68+
__ggml_vocab_test__
69+
70+
=
71+
__ggml_vocab_test__
72+
' era
73+
__ggml_vocab_test__
74+
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
75+
__ggml_vocab_test__
76+
!!!!!!
77+
__ggml_vocab_test__
78+
3
79+
__ggml_vocab_test__
80+
33
81+
__ggml_vocab_test__
82+
333
83+
__ggml_vocab_test__
84+
3333
85+
__ggml_vocab_test__
86+
33333
87+
__ggml_vocab_test__
88+
333333
89+
__ggml_vocab_test__
90+
3333333
91+
__ggml_vocab_test__
92+
33333333
93+
__ggml_vocab_test__
94+
333333333
95+
__ggml_vocab_test__
96+
Cửa Việt
97+
__ggml_vocab_test__
98+
discards
99+
__ggml_vocab_test__
100+
101+
102+
103+
104+
105+
106+
107+
108+
109+
110+
111+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
112+
__ggml_vocab_test__
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
1368 220 19 220 24452 3710
2+
56222 34745 316
3+
4+
220
5+
256
6+
305
7+
197
8+
198
9+
198 198
10+
198 198 198
11+
197 198
12+
14455 1931
13+
34994 1931
14+
14455 4832
15+
34994 4832
16+
34994 4832 0
17+
14455 11 1931 0
18+
34994 11 1931 0
19+
501 341 8811 99 247 13 37710
20+
86 15 19 23 220 22 83 5322 43067 27647 5599 18572
21+
107171 31758 3186 23679 109404 31994 8108 23947 31216 26804 3875
22+
34387 222 34387 114 34387 241 80203 233 34387 237 80203 224 34387 244 34387 115 34387 253 80203 223 34387 253 34387 95 34387 114 34387 227 34387 223 34387 249 34387 227 80203 223 34387 231
23+
114055 222 363 12415 8 21803 114 56848 75978 104 25661 363 64398 1098 115815 53600 659 8 44358 227 363 7619 86273 378 723 1097 1645 9775 8
24+
14455
25+
34994
26+
220 34994
27+
256 34994
28+
305 34994
29+
305 34994 198 305 34994
30+
363
31+
198 373
32+
6 16206
33+
14455 11 340 88386 0 2071 449 362 21803 223 3543 17175 401 32164 1557 16 18 16 19 16 20 16 820 7506
34+
15421 4021
35+
18
36+
18 18
37+
18 18 18
38+
18 18 18 18
39+
18 18 18 18 18
40+
18 18 18 18 18 18
41+
18 18 18 18 18 18 18
42+
18 18 18 18 18 18 18 18
43+
18 18 18 18 18 18 18 18 18
44+
34 17136 255 64 34335 82161 83
45+
2133 3082
46+
198 220 198 198 220 198 198 198 220 197 220 456 220 197 198 256 198 305 198 263 198 798 198 114055 222 363 12415 8 21803 114 56848 75978 104 25661 363 64398 1098 115815 53600 659 8 44358 227 8811 99 247 5901 99 247 220 18 220 18 18 220 18 18 18 220 18 18 18 18 220 18 18 18 18 18 220 18 18 18 18 18 18 220 18 18 18 18 18 18 18 220 18 18 18 18 18 18 18 18 220 18 13 18 220 18 491 18 220 18 1152 18 220 34387 222 34387 114 34387 241 80203 233 34387 237 80203 224 34387 244 34387 115 34387 253 80203 223 34387 253 34387 95 34387 114 34387 227 44542 223 3543 17175 401 32164 1557 16 18 16 19 16 20 16 820 7506 93668 1315 25898 41979 31758 3186 23679 109404 31994 8108 23947 31216 26804 3875 36809 54629 44470 2891 7980 7980 13252 15421 4021 34600 10303 331 3868 1007 689 119947 464 698 947 11 689 1364 362 2607 30 689 44 526 2607 331 4164 1253 403 11 689 35 362 1029 971 13197 30 1104 6 54377 259 82191 43
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Äpfel
4+
__ggml_vocab_test__
5+
6+
__ggml_vocab_test__
7+
8+
__ggml_vocab_test__
9+
10+
__ggml_vocab_test__
11+
12+
__ggml_vocab_test__
13+
14+
__ggml_vocab_test__
15+
16+
17+
__ggml_vocab_test__
18+
19+
20+
21+
__ggml_vocab_test__
22+
23+
24+
25+
26+
__ggml_vocab_test__
27+
28+
29+
__ggml_vocab_test__
30+
Hello world
31+
__ggml_vocab_test__
32+
Hello world
33+
__ggml_vocab_test__
34+
Hello World
35+
__ggml_vocab_test__
36+
Hello World
37+
__ggml_vocab_test__
38+
Hello World!
39+
__ggml_vocab_test__
40+
Hello, world!
41+
__ggml_vocab_test__
42+
Hello, world!
43+
__ggml_vocab_test__
44+
this is 🦙.cpp
45+
__ggml_vocab_test__
46+
w048 7tuijk dsdfhu
47+
__ggml_vocab_test__
48+
нещо на Български
49+
__ggml_vocab_test__
50+
កាន់តែពិសេសអាចខលចេញ
51+
__ggml_vocab_test__
52+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
53+
__ggml_vocab_test__
54+
Hello
55+
__ggml_vocab_test__
56+
Hello
57+
__ggml_vocab_test__
58+
Hello
59+
__ggml_vocab_test__
60+
Hello
61+
__ggml_vocab_test__
62+
Hello
63+
__ggml_vocab_test__
64+
Hello
65+
Hello
66+
__ggml_vocab_test__
67+
(
68+
__ggml_vocab_test__
69+
70+
=
71+
__ggml_vocab_test__
72+
' era
73+
__ggml_vocab_test__
74+
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
75+
__ggml_vocab_test__
76+
!!!!!!
77+
__ggml_vocab_test__
78+
3
79+
__ggml_vocab_test__
80+
33
81+
__ggml_vocab_test__
82+
333
83+
__ggml_vocab_test__
84+
3333
85+
__ggml_vocab_test__
86+
33333
87+
__ggml_vocab_test__
88+
333333
89+
__ggml_vocab_test__
90+
3333333
91+
__ggml_vocab_test__
92+
33333333
93+
__ggml_vocab_test__
94+
333333333
95+
__ggml_vocab_test__
96+
Cửa Việt
97+
__ggml_vocab_test__
98+
discards
99+
__ggml_vocab_test__
100+
101+
102+
103+
104+
105+
106+
107+
108+
109+
110+
111+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
112+
__ggml_vocab_test__
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
8469 8168 125 100 10195
2+
9392 9568 8178
3+
4+
5+
6+
7+
8+
9+
10+
11+
12+
8701 8572
13+
8701 8572
14+
8701 8572
15+
8701 8572
16+
8701 8572 106
17+
8701 117 8572 106
18+
8701 117 8572 106
19+
8554 8310 100 119 8337 8187
20+
165 9099 8156 128 10455 8169 8334 8197 11274 8168 8189 9670
21+
100 245 11318 100
22+
100
23+
100 113 8275 13156 8178 114 100 113 13149 10383 13152 8167 9343 8118 11485 10869 12000 9255 114 100 113 10110 13152 8167 9343 9231 11325 12894 157 10799 8228 11285 114
24+
8701
25+
8701
26+
8701
27+
8701
28+
8701
29+
8701 8701
30+
113
31+
134
32+
112 10451 8139
33+
8701 117 167 112 8513 106 9510 8995 8357 100 136 2769 2682 1762 8350 2339 868 9403 9281 9216 1921 8080
34+
106 106 106 106 106 106
35+
124
36+
8226
37+
10745
38+
10745 8152
39+
10745 8921
40+
10745 8921 8152
41+
10745 8921 8921
42+
10745 8921 8921 8152
43+
10745 8921 8921 8921
44+
145 9478 10138 8418
45+
9796 10203 9345 8118
46+
100 113 8275 13156 8178 114 100 113 13149 10383 13152 8167 9343 8118 11485 10869 12000 9255 114 100 100 124 8226 10745 10745 8152 10745 8921 10745 8921 8152 10745 8921 8921 10745 8921 8921 8152 124 119 124 124 119 119 124 124 119 119 119 124 100 136 2769 2682 1762 8350 2339 868 9403 9281 9216 1921 8080 118 118 118 118 118 118 134 134 134 134 134 134 134 100 245 11318 100 112 112 112 112 112 112 100 100 100 100 100 100 100 107 107 107 107 119 119 119 119 119 119 106 106 106 106 106 106 136 136 136 136 136 136 151 112 12810 8815 8329 112 8228 8635 9245 112 161 11136 117 112 8847 8357 11541 8358 136 112 155 9059 11541 8358 151 112 10856 9994 8233 117 112 146 8357 8993 13048 9686 136 8997 112 12810 143 112 10856

models/ggml-vocab-bert-bge.gguf.inp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
ied 4 ½ months
22
__ggml_vocab_test__
3-
Führer
3+
Äpfel
44
__ggml_vocab_test__
55

66
__ggml_vocab_test__

models/ggml-vocab-bert-bge.gguf.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
29464 2094 1018 1092 2706
2-
11865 17875
2+
9706 7959 2140
33

44

55

0 commit comments

Comments
 (0)