Skip to content

Commit a6b9bde

Browse files
committed
update convert_hf_to_gguf to include ruri-large
1 parent f256169 commit a6b9bde

File tree

5 files changed

+165
-1
lines changed

5 files changed

+165
-1
lines changed

convert_hf_to_gguf.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -809,6 +809,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
809809
if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
810810
# ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
811811
res = "minerva-7b"
812+
if chkhsh == "9286a8bdaef9f09da63eae001d8dca3e8b4dcfebfe468807c0c87a831a4a1901":
813+
# ref: https://huggingface.co/cl-nagoya/ruri-large
814+
res = "ruri-large"
812815

813816
if res is None:
814817
logger.warning("\n")

convert_hf_to_gguf_update.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,12 +176,15 @@ def download_model(model):
176176

177177
os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
178178

179-
files = ["config.json", "tokenizer.json", "tokenizer_config.json", "vocab.txt"]
179+
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
180180

181181
if name == "gpt-4o":
182182
# Xenova/gpt-4o is tokenizer-only, it does not contain config.json
183183
files = ["tokenizer.json", "tokenizer_config.json"]
184184

185+
if name == "ruri-large":
186+
files = ["config.json", "tokenizer_config.json", "vocab.txt"]
187+
185188
if tokt == TOKENIZER_TYPE.SPM:
186189
files.append("tokenizer.model")
187190

models/ggml-vocab-ruri-large.gguf

632 KB
Binary file not shown.
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Äpfel
4+
__ggml_vocab_test__
5+
6+
__ggml_vocab_test__
7+
8+
__ggml_vocab_test__
9+
10+
__ggml_vocab_test__
11+
12+
__ggml_vocab_test__
13+
14+
__ggml_vocab_test__
15+
16+
17+
__ggml_vocab_test__
18+
19+
20+
21+
__ggml_vocab_test__
22+
23+
24+
25+
26+
__ggml_vocab_test__
27+
28+
29+
__ggml_vocab_test__
30+
Hello world
31+
__ggml_vocab_test__
32+
Hello world
33+
__ggml_vocab_test__
34+
Hello World
35+
__ggml_vocab_test__
36+
Hello World
37+
__ggml_vocab_test__
38+
Hello World!
39+
__ggml_vocab_test__
40+
Hello, world!
41+
__ggml_vocab_test__
42+
Hello, world!
43+
__ggml_vocab_test__
44+
this is 🦙.cpp
45+
__ggml_vocab_test__
46+
w048 7tuijk dsdfhu
47+
__ggml_vocab_test__
48+
нещо на Български
49+
__ggml_vocab_test__
50+
កាន់តែពិសេសអាចខលចេញ
51+
__ggml_vocab_test__
52+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
53+
__ggml_vocab_test__
54+
Hello
55+
__ggml_vocab_test__
56+
Hello
57+
__ggml_vocab_test__
58+
Hello
59+
__ggml_vocab_test__
60+
Hello
61+
__ggml_vocab_test__
62+
Hello
63+
__ggml_vocab_test__
64+
Hello
65+
Hello
66+
__ggml_vocab_test__
67+
(
68+
__ggml_vocab_test__
69+
70+
=
71+
__ggml_vocab_test__
72+
' era
73+
__ggml_vocab_test__
74+
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
75+
__ggml_vocab_test__
76+
!!!!!!
77+
__ggml_vocab_test__
78+
3
79+
__ggml_vocab_test__
80+
33
81+
__ggml_vocab_test__
82+
333
83+
__ggml_vocab_test__
84+
3333
85+
__ggml_vocab_test__
86+
33333
87+
__ggml_vocab_test__
88+
333333
89+
__ggml_vocab_test__
90+
3333333
91+
__ggml_vocab_test__
92+
33333333
93+
__ggml_vocab_test__
94+
333333333
95+
__ggml_vocab_test__
96+
Cửa Việt
97+
__ggml_vocab_test__
98+
discards
99+
__ggml_vocab_test__
100+
101+
102+
103+
104+
105+
106+
107+
108+
109+
110+
111+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
112+
__ggml_vocab_test__
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
88 13247 35 32 1 33 92 18336 7095 7045
2+
1
3+
4+
5+
6+
7+
8+
9+
10+
11+
12+
32004 29944 102 28789
13+
32004 29944 102 28789
14+
32004 29944 18520
15+
32004 29944 18520
16+
32004 29944 18520 16
17+
32004 29944 27 102 28789 16
18+
32004 29944 27 102 28789 16
19+
14152 12741 23274 1 29 82 16003
20+
102 16435 7187 38 99 7069 25460 7099 83 7045 7094 7222 7095 7069
21+
1 1 1
22+
1
23+
1 23 31304 7048 21907 7071 24 1 1 1 23 92 19760 14698 12835 84 7073 7075 32061 7045 26430 30214 16061 23624 16061 7094 24 1 23 18446 16157 84 7073 7075 32061 14152 12648 22106 7045 21801 7045 94 7070 7044 17253 20903 7044 24
24+
32004 29944
25+
32004 29944
26+
32004 29944
27+
32004 29944
28+
32004 29944
29+
32004 29944 32004 29944
30+
23
31+
44
32+
22 84 14469
33+
32004 29944 27 104 22 28187 16 55 13544 21369 7084 23418 1 46 2366 2263 1448 80 16003 12835 17228 22230 17880 23055 1589 109
34+
16 16 16 16 16 16
35+
34
36+
13590
37+
13590 7083
38+
13590 17209
39+
13590 17209 7083
40+
13590 17209 17209
41+
13590 17209 17209 7083
42+
13590 17209 17209 17209
43+
13590 17209 17209 17209 7083
44+
1 1
45+
23283 23637 14194 7045
46+
1 23 31304 7048 21907 7071 24 1 1 1 23 92 19760 14698 12835 84 7073 7075 32061 7045 26430 30214 16061 23624 16061 7094 24 1 1 34 13590 13590 7083 13590 17209 13590 17209 7083 13590 17209 17209 13590 17209 17209 7083 13590 17209 17209 17209 34 29 34 34 29 29 34 34 29 29 29 34 1 46 2366 2263 1448 80 16003 12835 17228 22230 17880 23055 1589 109 26810 7509 7509 7509 7509 8741 8741 8741 8741 8741 8741 8741 1 1 1 22 22 22 22 22 22 79 79 79 79 79 8669 8669 7329 7329 7329 7329 28042 28042 28042 7508 7508 7508 7508 7508 7508 8134 8134 8134 8134 8134 8134 56 22 101 7084 24992 12620 22 17253 16344 30334 22 98 13891 12940 27 22 18898 23418 31114 12940 46 22 60 31304 7046 31114 12940 56 22 91 7071 92 19897 21801 27 22 51 23418 91 26183 98 19851 30713 7043 46 21452 22 69 7084 80 22 91 7159

0 commit comments

Comments
 (0)