Skip to content

Commit b3b67d6

Browse files
committed
(try) adding helium model
1 parent b4d92a5 commit b3b67d6

File tree

4 files changed

+199
-9
lines changed

4 files changed

+199
-9
lines changed

convert_hf_to_gguf.py

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
696696
if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
697697
# ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
698698
res = "deepseek-v3"
699+
if chkhsh == "1b8c872b06b15bbd59870e70d5d689fa8af17c689f857667d06f7338d0d64f39":
700+
# ref: https://huggingface.co/kyutai/helium-1-preview-2b
701+
res = "helium"
699702

700703
if res is None:
701704
logger.warning("\n")
@@ -1557,19 +1560,47 @@ def prepare_tensors(self):
15571560
raise ValueError(f"Unprocessed norms: {norms}")
15581561

15591562

1560-
@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
1563+
@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "HeliumForCausalLM")
15611564
class LlamaModel(Model):
15621565
model_arch = gguf.MODEL_ARCH.LLAMA
15631566

1567+
def set_vocab_helium(self, n_control_tokens: int = 106):
    """Write the Helium Unigram vocabulary from ``tokenizer.json`` to the GGUF writer.

    Reads ``tokenizer.json`` from ``self.dir_model``, takes the token text and
    score from each entry of ``model.vocab`` (index 0 and index 1 respectively),
    and emits the token list, scores, token types and special-token metadata
    through ``self.gguf_writer``.

    Args:
        n_control_tokens: Number of leading vocabulary ids to mark as
            ``gguf.TokenType.CONTROL``; every later id is marked ``NORMAL``.
            The default of 106 reproduces the previous hard-coded ``i <= 105``
            cutoff.

    Raises:
        ValueError: If the tokenizer model type is not ``"Unigram"``.
    """
    with open(self.dir_model / 'tokenizer.json', "r", encoding="utf-8") as f:
        tok_model = json.load(f)["model"]

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if tok_model["type"] != "Unigram":
        raise ValueError(
            f"Expected a Unigram tokenizer model, got {tok_model['type']!r}"
        )

    vocab = tok_model["vocab"]
    tokens = [entry[0] for entry in vocab]
    scores = [entry[1] for entry in vocab]
    toktypes = [
        gguf.TokenType.CONTROL if idx < n_control_tokens else gguf.TokenType.NORMAL
        for idx in range(len(tokens))
    ]

    self.gguf_writer.add_tokenizer_model("t5")
    self.gguf_writer.add_tokenizer_pre("default")
    self.gguf_writer.add_token_list(tokens)
    self.gguf_writer.add_token_scores(scores)
    self.gguf_writer.add_token_types(toktypes)
    self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
    # NOTE(review): add_space_prefix, remove_extra_whitespaces and
    # precompiled_charsmap metadata are intentionally not written here;
    # confirm whether the Helium tokenizer needs them.

    special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
    special_vocab.add_to_gguf(self.gguf_writer)

    self.gguf_writer.add_add_bos_token(True)
    self.gguf_writer.add_add_eos_token(False)
1593+
15641594
def set_vocab(self):
1565-
try:
1566-
self._set_vocab_sentencepiece()
1567-
except FileNotFoundError:
1568-
try:
1569-
self._set_vocab_llama_hf()
1570-
except (FileNotFoundError, TypeError):
1571-
# Llama 3
1572-
self._set_vocab_gpt2()
1595+
# try:
1596+
# self._set_vocab_sentencepiece()
1597+
# except FileNotFoundError:
1598+
# try:
1599+
# self._set_vocab_llama_hf()
1600+
# except (FileNotFoundError, TypeError):
1601+
# # Llama 3
1602+
# self._set_vocab_gpt2()
1603+
self.set_vocab_helium()
15731604

15741605
# Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
15751606
if self.hparams.get("vocab_size", 32000) == 32016:

include/llama.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ extern "C" {
105105
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
106106
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
107107
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
108+
LLAMA_VOCAB_PRE_TYPE_HELIUM = 29,
108109
};
109110

110111
enum llama_rope_type {

models/ggml-vocab-helium.gguf.inp

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
5+
6+
__ggml_vocab_test__
7+
8+
__ggml_vocab_test__
9+
10+
__ggml_vocab_test__
11+
12+
__ggml_vocab_test__
13+
14+
__ggml_vocab_test__
15+
16+
17+
__ggml_vocab_test__
18+
19+
20+
21+
__ggml_vocab_test__
22+
23+
24+
25+
26+
__ggml_vocab_test__
27+
28+
29+
__ggml_vocab_test__
30+
Hello world
31+
__ggml_vocab_test__
32+
Hello world
33+
__ggml_vocab_test__
34+
Hello World
35+
__ggml_vocab_test__
36+
Hello World
37+
__ggml_vocab_test__
38+
Hello World!
39+
__ggml_vocab_test__
40+
Hello, world!
41+
__ggml_vocab_test__
42+
Hello, world!
43+
__ggml_vocab_test__
44+
this is 🦙.cpp
45+
__ggml_vocab_test__
46+
w048 7tuijk dsdfhu
47+
__ggml_vocab_test__
48+
нещо на Български
49+
__ggml_vocab_test__
50+
កាន់តែពិសេសអាចខលចេញ
51+
__ggml_vocab_test__
52+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
53+
__ggml_vocab_test__
54+
Hello
55+
__ggml_vocab_test__
56+
Hello
57+
__ggml_vocab_test__
58+
Hello
59+
__ggml_vocab_test__
60+
Hello
61+
__ggml_vocab_test__
62+
Hello
63+
__ggml_vocab_test__
64+
Hello
65+
Hello
66+
__ggml_vocab_test__
67+
(
68+
__ggml_vocab_test__
69+
70+
=
71+
__ggml_vocab_test__
72+
' era
73+
__ggml_vocab_test__
74+
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
75+
__ggml_vocab_test__
76+
!!!!!!
77+
__ggml_vocab_test__
78+
3
79+
__ggml_vocab_test__
80+
33
81+
__ggml_vocab_test__
82+
333
83+
__ggml_vocab_test__
84+
3333
85+
__ggml_vocab_test__
86+
33333
87+
__ggml_vocab_test__
88+
333333
89+
__ggml_vocab_test__
90+
3333333
91+
__ggml_vocab_test__
92+
33333333
93+
__ggml_vocab_test__
94+
333333333
95+
__ggml_vocab_test__
96+
Cửa Việt
97+
__ggml_vocab_test__
98+
discards
99+
__ggml_vocab_test__
100+
101+
102+
103+
104+
105+
106+
107+
108+
109+
110+
111+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
112+
__ggml_vocab_test__

models/ggml-vocab-helium.gguf.out

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
518 451 363 386 363 300 295 1599
2+
38260
3+
4+
363 363
5+
363 363 363
6+
2573
7+
363 115
8+
116
9+
116 116
10+
116 116 116
11+
363 115 116
12+
33163 998
13+
363 33163 998
14+
33163 1008
15+
363 33163 1008
16+
363 33163 1008 682
17+
33163 362 998 682
18+
363 33163 362 998 682
19+
363 419 379 363 346 265 272 259 364 529 4551
20+
1829 372 386 388 363 394 1973 17630 471 366 434 593 2772
21+
363 45339 315 243 7237 363 18586 363 314 251 315 244 17878 314 285 27759 32722
22+
363 331 264 234 331 264 288 331 264 253 331 265 245 331 264 249 331 265 236 331 264 256 331 264 289 331 264 265 331 265 235 331 264 265 331 264 268 331 264 288 331 264 239 331 264 235 331 264 261 331 264 239 331 265 235 331 264 243
23+
363 346 265 260 234 377 6322 381 363 346 265 258 288 332 234 247 346 265 246 277 345 290 249 377 38148 498 30570 366 468 32949 1214 381 363 332 262 239 377 8543 498 30570 385 454 530 947 15528 381
24+
33163
25+
363 33163
26+
363 363 33163
27+
363 363 363 33163
28+
2573 33163
29+
2573 33163 116 2573 33163
30+
363 377
31+
116 363 516
32+
363 378 918
33+
33163 362 448 378 2237 682 1259 405 424 363 346 265 258 235 363 420 336 242 251 336 237 285 335 262 274 32993 335 289 271 334 295 262 368 382 368 386 368 383 368 335 270 275 345 295 264
34+
363 23526 23526
35+
363 382
36+
363 382 382
37+
363 382 382 382
38+
363 382 382 382 382
39+
363 382 382 382 382 382
40+
363 382 382 382 382 382 382
41+
363 382 382 382 382 382 382 382
42+
363 382 382 382 382 382 382 382 382
43+
363 382 382 382 382 382 382 382 382 382
44+
501 331 293 279 414 2924 331 293 241 401
45+
363 22391 366
46+
116 363 363 116 116 363 363 116 116 116 363 363 115 363 115 115 363 115 116 363 363 363 116 2573 116 7547 116 12414 116 363 346 265 260 234 377 6322 381 363 346 265 258 288 332 234 247 346 265 246 277 345 290 249 377 38148 498 30570 366 468 32949 1214 381 363 332 262 239 363 346 265 272 259 346 265 272 259 363 382 363 382 382 363 382 382 382 363 382 382 382 382 363 382 382 382 382 382 363 382 382 382 382 382 382 363 382 382 382 382 382 382 382 363 382 382 382 382 382 382 382 382 363 382 364 382 363 382 364 364 382 363 382 719 382 363 331 264 234 331 264 288 331 264 253 331 265 245 331 264 249 331 265 236 331 264 256 331 264 289 331 264 265 331 265 235 331 264 265 331 264 268 331 264 288 331 264 239 346 265 258 235 363 420 336 242 251 336 237 285 335 262 274 32993 335 289 271 334 295 262 368 382 368 386 368 383 368 335 270 275 345 295 264 469 45329 808 808 808 808 808 808 808 363 45339 315 243 7237 363 18586 363 314 251 315 244 17878 314 285 27759 32722 17970 378 378 378 378 2017 47327 47327 413 413 413 656 25481 23526 23526 22389 22389 402 378 635 508 363 378 401 2334 456 378 366 502 362 363 378 8267 424 1384 420 363 378 734 421 1384 402 378 853 700 404 362 363 378 594 424 583 534 10383 420 591 378 16129 369 378 561 753

0 commit comments

Comments
 (0)