
Commit 428db16

feat: Add granite-docling vocab pre enum

Branch: gabe-l-hart/GraniteDocling
Signed-off-by: Gabe Goodhart <[email protected]>
Parent: 64e10f5

2 files changed: +46, -40 lines


src/llama-vocab.cpp

Lines changed: 5 additions & 0 deletions
@@ -347,6 +347,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_OLMO:
             case LLAMA_VOCAB_PRE_TYPE_JAIS:
             case LLAMA_VOCAB_PRE_TYPE_TRILLION:
+            case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING:
                 regex_exprs = {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 };
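The added case above routes GRANITE_DOCLING to the same GPT-2-style split already used by OLMO, JAIS, and TRILLION. A minimal sketch of what that pattern does, assuming the `unicode_regex_split` helper from llama.cpp's `unicode.h` (its exact signature here is an assumption; the snippet is illustrative, not part of the commit):

```cpp
// Illustrative only: apply the shared GPT-2-style pattern to a sample string.
// Assumes unicode_regex_split(text, regex_exprs) from llama.cpp's unicode.h,
// which the BPE tokenizer uses internally to run these expressions.
#include <string>
#include <vector>

#include "unicode.h"

std::vector<std::string> split_like_granite_docling(const std::string & text) {
    const std::vector<std::string> regex_exprs = {
        "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
    };
    // Expected pieces for "Hello, world 123!":
    //   "Hello" | "," | " world" | " 123" | "!"
    return unicode_regex_split(text, regex_exprs);
}
```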
@@ -1961,6 +1962,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "trillion") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "granite-docling") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING;
+                clean_spaces = false;
             } else if (
                 tokenizer_pre == "bailingmoe" ||
                 tokenizer_pre == "llada-moe") {
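Together, the two hunks wire the new type end to end: the `tokenizer.ggml.pre` string read from GGUF metadata selects the enum value and disables space cleanup, and the enum value selects the regex set shown in the first hunk. A self-contained, simplified sketch of that two-step dispatch (the names and reduced set of cases below are illustrative, not llama.cpp's actual API):

```cpp
// Simplified mirror of the control flow this commit extends; the types and
// function below are illustrative placeholders, not llama.cpp's real API.
#include <cstdint>
#include <string>
#include <vector>

enum pre_type : uint32_t { PRE_DEFAULT, PRE_TRILLION, PRE_GRANITE_DOCLING };

struct pre_config {
    pre_type                 type         = PRE_DEFAULT;
    bool                     clean_spaces = true;
    std::vector<std::string> regex_exprs;
};

// Step 1: the "tokenizer.ggml.pre" string from GGUF metadata picks the enum.
// Step 2: the enum picks the regex set (granite-docling reuses TRILLION's).
pre_config configure_pre_tokenizer(const std::string & tokenizer_pre) {
    pre_config cfg;
    if (tokenizer_pre == "trillion" || tokenizer_pre == "granite-docling") {
        cfg.type         = tokenizer_pre == "trillion" ? PRE_TRILLION : PRE_GRANITE_DOCLING;
        cfg.clean_spaces = false;
    }
    switch (cfg.type) {
        case PRE_TRILLION:
        case PRE_GRANITE_DOCLING: // falls through: same GPT-2-style split
            cfg.regex_exprs = {
                "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
            };
            break;
        default:
            break;
    }
    return cfg;
}
```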

src/llama-vocab.h

Lines changed: 41 additions & 40 deletions
@@ -8,46 +8,47 @@
 
 // pre-tokenization types
 enum llama_vocab_pre_type {
-    LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
-    LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
-    LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
-    LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
-    LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
-    LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
-    LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
-    LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
-    LLAMA_VOCAB_PRE_TYPE_STABLELM2      = 10,
-    LLAMA_VOCAB_PRE_TYPE_QWEN2          = 11,
-    LLAMA_VOCAB_PRE_TYPE_OLMO           = 12,
-    LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
-    LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
-    LLAMA_VOCAB_PRE_TYPE_PORO           = 15,
-    LLAMA_VOCAB_PRE_TYPE_CHATGLM3       = 16,
-    LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
-    LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
-    LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
-    LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
-    LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
-    LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
-    LLAMA_VOCAB_PRE_TYPE_BLOOM          = 23,
-    LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
-    LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
-    LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
-    LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
-    LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
-    LLAMA_VOCAB_PRE_TYPE_SUPERBPE       = 30,
-    LLAMA_VOCAB_PRE_TYPE_TRILLION       = 31,
-    LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
-    LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
-    LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
-    LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
-    LLAMA_VOCAB_PRE_TYPE_HUNYUAN        = 36,
-    LLAMA_VOCAB_PRE_TYPE_KIMI_K2        = 37,
-    LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE  = 38,
-    LLAMA_VOCAB_PRE_TYPE_GROK_2         = 39,
+    LLAMA_VOCAB_PRE_TYPE_DEFAULT         = 0,
+    LLAMA_VOCAB_PRE_TYPE_LLAMA3          = 1,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM    = 2,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER  = 3,
+    LLAMA_VOCAB_PRE_TYPE_FALCON          = 4,
+    LLAMA_VOCAB_PRE_TYPE_MPT             = 5,
+    LLAMA_VOCAB_PRE_TYPE_STARCODER       = 6,
+    LLAMA_VOCAB_PRE_TYPE_GPT2            = 7,
+    LLAMA_VOCAB_PRE_TYPE_REFACT          = 8,
+    LLAMA_VOCAB_PRE_TYPE_COMMAND_R       = 9,
+    LLAMA_VOCAB_PRE_TYPE_STABLELM2       = 10,
+    LLAMA_VOCAB_PRE_TYPE_QWEN2           = 11,
+    LLAMA_VOCAB_PRE_TYPE_OLMO            = 12,
+    LLAMA_VOCAB_PRE_TYPE_DBRX            = 13,
+    LLAMA_VOCAB_PRE_TYPE_SMAUG           = 14,
+    LLAMA_VOCAB_PRE_TYPE_PORO            = 15,
+    LLAMA_VOCAB_PRE_TYPE_CHATGLM3        = 16,
+    LLAMA_VOCAB_PRE_TYPE_CHATGLM4        = 17,
+    LLAMA_VOCAB_PRE_TYPE_VIKING          = 18,
+    LLAMA_VOCAB_PRE_TYPE_JAIS            = 19,
+    LLAMA_VOCAB_PRE_TYPE_TEKKEN          = 20,
+    LLAMA_VOCAB_PRE_TYPE_SMOLLM          = 21,
+    LLAMA_VOCAB_PRE_TYPE_CODESHELL       = 22,
+    LLAMA_VOCAB_PRE_TYPE_BLOOM           = 23,
+    LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH    = 24,
+    LLAMA_VOCAB_PRE_TYPE_EXAONE          = 25,
+    LLAMA_VOCAB_PRE_TYPE_CHAMELEON       = 26,
+    LLAMA_VOCAB_PRE_TYPE_MINERVA         = 27,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM   = 28,
+    LLAMA_VOCAB_PRE_TYPE_GPT4O           = 29,
+    LLAMA_VOCAB_PRE_TYPE_SUPERBPE        = 30,
+    LLAMA_VOCAB_PRE_TYPE_TRILLION        = 31,
+    LLAMA_VOCAB_PRE_TYPE_BAILINGMOE      = 32,
+    LLAMA_VOCAB_PRE_TYPE_LLAMA4          = 33,
+    LLAMA_VOCAB_PRE_TYPE_PIXTRAL         = 34,
+    LLAMA_VOCAB_PRE_TYPE_SEED_CODER      = 35,
+    LLAMA_VOCAB_PRE_TYPE_HUNYUAN         = 36,
+    LLAMA_VOCAB_PRE_TYPE_KIMI_K2         = 37,
+    LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE   = 38,
+    LLAMA_VOCAB_PRE_TYPE_GROK_2          = 39,
+    LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
 };
 
 struct LLM_KV;
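The enumerators carry explicit values and new types are appended at the end; GRANITE_DOCLING takes the next free slot after the previous last entry, GROK_2 = 39. A trivial compile-time check along those lines, purely hypothetical and not part of this commit, could read:

```cpp
// Hypothetical sanity check (not in the commit): the new pre-tokenizer type
// occupies the next value after the previously last enumerator.
#include "llama-vocab.h"  // llama.cpp internal header declaring the enum

static_assert(LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING == LLAMA_VOCAB_PRE_TYPE_GROK_2 + 1,
              "granite-docling should take the next free pre-tokenizer slot");
```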
