
Commit ea265fe

support for smoldocling
Signed-off-by: ryan-mangeno <[email protected]>
1 parent: b25e927 · commit: ea265fe

File tree — 6 files changed: +34 −1 lines changed

  convert_hf_to_gguf_update.py
  gguf-py/gguf/tensor_mapping.py
  include/llama.h
  src/llama-model.cpp
  src/llama-model.h
  src/llama-vocab.cpp


convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -128,6 +128,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "llama4",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
     {"name": "pixtral",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
     {"name": "seed-coder",  "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
+    {"name": "smoldocling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ds4sd/SmolDocling-256M-preview", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
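For context, a rough sketch of what the new registry entry feeds into: the update script downloads the tokenizer from the listed repo and fingerprints the token ids it produces for a fixed probe string, and that fingerprint is what convert_hf_to_gguf.py is meant to match later when picking the pre-tokenizer. The snippet below is a simplified illustration under that assumption; the helper name and probe text are made up, not the script's actual ones.

    # simplified sketch (not the actual update script): fingerprint a registered tokenizer
    from hashlib import sha256
    from transformers import AutoTokenizer

    def chkhsh_for(repo: str, probe: str) -> str:
        tok = AutoTokenizer.from_pretrained(repo)     # fetches tokenizer.json and friends
        ids = tok.encode(probe)                       # pre-tokenizer + BPE applied here
        return sha256(str(ids).encode()).hexdigest()  # hash of the resulting id sequence

    # hypothetical probe text; the real script uses its own fixed test string
    print(chkhsh_for("ds4sd/SmolDocling-256M-preview", "Hello world 1234 <loc_12>"))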

gguf-py/gguf/tensor_mapping.py

Lines changed: 16 additions & 1 deletion
@@ -32,6 +32,7 @@ class TensorNameMap:
             "model.word_embeddings",                # bailingmoe
             "language_model.model.embed_tokens",    # llama4
             "encoder",                              # neobert
+            "model.text_model.embed_tokens.weight", # smoldocling
         ),

         # Token type embeddings
@@ -63,7 +64,7 @@ class TensorNameMap:
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                # gptneox
             "lm_head",                  # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe
-            "output",                   # llama-pth bloom internlm2
+            "output",                   # llama-pth bloom internlm2 smoldocling
             "word_embeddings_for_head", # persimmon
             "lm_head.linear",           # phi2
             "output_layer",             # chatglm
@@ -93,6 +94,7 @@ class TensorNameMap:
             "model.ln_out",              # rwkv7
             "backbone.final_layer_norm", # wavtokenizer
             "model.norm",                # llama4
+            "output_norm",               # smoldocling
         ),

         # Rope frequencies
@@ -136,6 +138,7 @@ class TensorNameMap:
             "model.layers.{bid}.ln1",                   # rwkv7
             "model.layers.{bid}.input_layernorm",       # llama4
             "transformer_encoder.{bid}.attention_norm", # neobert
+            "blk.{bid}.attn_norm",                      # smoldocling
         ),

         # Attention norm 2
@@ -179,6 +182,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok
             "transformer.h.{bid}.attn.attention.q_proj",                  # exaone
             "model.layers.{bid}.self_attn.q_proj",                        # llama4
+            "blk.{bid}.attn_q",                                           # smoldocling
         ),

         # Attention key
@@ -195,6 +199,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok
             "transformer.h.{bid}.attn.attention.k_proj",                # exaone
             "model.layers.{bid}.self_attn.k_proj",                      # llama4
+            "blk.{bid}.attn_k",                                         # smoldocling
         ),

         # Attention value
@@ -210,6 +215,8 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok
             "transformer.h.{bid}.attn.attention.v_proj",                  # exaone
             "model.layers.{bid}.self_attn.v_proj",                        # llama4
+            "blk.{bid}.attn_v",                                           # smoldocling
+
         ),

         # Attention output
@@ -240,6 +247,7 @@ class TensorNameMap:
             "transformer.h.{bid}.attn.attention.out_proj", # exaone
             "model.layers.{bid}.self_attn.o_proj",         # llama4
             "transformer_encoder.{bid}.wo",                # neobert
+            "blk.{bid}.attn_output",                       # smoldocling
         ),

         # Attention output norm
@@ -249,6 +257,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.norm1",                     # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_1",     # Grok
             "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
+            "blk.{bid}.attn_norm",                            # smoldocling
         ),

         MODEL_TENSOR.ATTN_POST_NORM: (
@@ -281,6 +290,7 @@ class TensorNameMap:
             "transformer.layers.{bid}.ffn_norm",           # openelm
             "model.layers.{bid}.post_attention_layernorm", # llama4
             "transformer_encoder.{bid}.ffn_norm",          # neobert
+            "blk.{bid}.ffn_norm",                          # smoldocling
         ),

         # Post feed-forward norm
@@ -346,6 +356,7 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.c_fc_1",          # exaone
             "model.layers.{bid}.feed_forward.up_proj", # llama4
             "transformer_encoder.{bid}.ffn.w12",       # neobert
+            "blk.{bid}.ffn_up",                        # smoldocling
         ),

         MODEL_TENSOR.FFN_UP_EXP: (
@@ -383,6 +394,8 @@ class TensorNameMap:
             "model.layers.{bid}.residual_mlp.w1",        # arctic
             "transformer.h.{bid}.mlp.c_fc_0",            # exaone
             "model.layers.{bid}.feed_forward.gate_proj", # llama4
+            "blk.{bid}.ffn_gate",                        # smoldocling
+
         ),

         MODEL_TENSOR.FFN_GATE_EXP: (
@@ -429,6 +442,8 @@ class TensorNameMap:
             "model.layers.h.{bid}.mlp.c_proj",           # exaone
             "model.layers.{bid}.feed_forward.down_proj", # llama4
             "transformer_encoder.{bid}.ffn.w3",          # neobert
+            "blk.{bid}.ffn_down",                        # smoldocling
+
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (
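As a rough illustration of what these mappings do (a simplified stand-in, not the real gguf-py TensorNameMap class): each per-block template containing "{bid}" is expanded for every layer index, so a checkpoint tensor name can be resolved to the corresponding GGUF name. The source names below are a small llama-style subset chosen only for the example.

    # simplified sketch of the {bid}-template expansion behind TensorNameMap
    GGUF_NAMES = {
        "attn_q": "blk.{bid}.attn_q",
        "attn_k": "blk.{bid}.attn_k",
        "attn_v": "blk.{bid}.attn_v",
    }
    SOURCE_TEMPLATES = {
        "attn_q": ("model.layers.{bid}.self_attn.q_proj",),  # llama-style checkpoints
        "attn_k": ("model.layers.{bid}.self_attn.k_proj",),
        "attn_v": ("model.layers.{bid}.self_attn.v_proj",),
    }

    def build_mapping(n_blocks: int) -> dict[str, str]:
        mapping: dict[str, str] = {}
        for tensor, sources in SOURCE_TEMPLATES.items():
            for bid in range(n_blocks):
                for src in sources:
                    mapping[src.format(bid=bid)] = GGUF_NAMES[tensor].format(bid=bid)
        return mapping

    # 30 blocks, matching the n_layer case handled in src/llama-model.cpp below
    mapping = build_mapping(n_blocks=30)
    print(mapping["model.layers.0.self_attn.q_proj"])  # -> blk.0.attn_q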

include/llama.h

Lines changed: 1 addition & 0 deletions
@@ -117,6 +117,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_LLAMA4      = 33,
         LLAMA_VOCAB_PRE_TYPE_PIXTRAL     = 34,
         LLAMA_VOCAB_PRE_TYPE_SEED_CODER  = 35,
+        LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING = 36,
     };

     enum llama_rope_type {

src/llama-model.cpp

Lines changed: 2 additions & 0 deletions
@@ -40,6 +40,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_190M: return "190M";
         case LLM_TYPE_220M: return "220M";
         case LLM_TYPE_250M: return "250M";
+        case LLM_TYPE_256M: return "256M";
         case LLM_TYPE_270M: return "270M";
         case LLM_TYPE_335M: return "335M";
         case LLM_TYPE_410M: return "410M";
@@ -575,6 +576,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 case 22: type = LLM_TYPE_1B; break;
                 case 26: type = LLM_TYPE_3B; break;
                 case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
+                case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
                 // granite uses a vocab with len 49152
                 case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
                 case 36: type = LLM_TYPE_8B; break; // granite

src/llama-model.h

Lines changed: 1 addition & 0 deletions
@@ -32,6 +32,7 @@ enum llm_type {
     LLM_TYPE_190M,
     LLM_TYPE_220M,
     LLM_TYPE_250M,
+    LLM_TYPE_256M, // smoldocling
     LLM_TYPE_270M,
     LLM_TYPE_335M,
     LLM_TYPE_410M,

src/llama-vocab.cpp

Lines changed: 13 additions & 0 deletions
@@ -424,6 +424,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING:
+                // uses digits and byte level pre tokenizers defined in the pre_tokenizer section of
+                // https://huggingface.co/ds4sd/SmolDocling-256M-preview/raw/main/tokenizer.json
+                regex_exprs = {
+                    "[0-9]",
+                    "[a-zA-Z0-9_]+|[^a-zA-Z0-9_\\s]+",
+                };
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -1656,6 +1663,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "seed-coder") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "smoldocling") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING;
+                clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -1839,6 +1850,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<EOT>"
                     || t.first == "_<EOT>"
                     || t.first == "<|end▁of▁sentence|>" // DeepSeek
+                    || t.first == "<end_of_utterance>"  // smoldocling
                     ) {
                     special_eot_id = t.second;
                     if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1998,6 +2010,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<EOT>"
                     || t.first == "_<EOT>"
                     || t.first == "<|end_of_text|>"
+                    || t.first == "<end_of_utterance>" // smoldocling
                     ) {
                     special_eog_ids.insert(t.second);
                     if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
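To see what the two smoldocling patterns do in practice, here is a plain-Python approximation of splitting a string with them applied one after the other (llama.cpp's pre-tokenizer operates on a different internal representation, so this only illustrates the resulting pieces, not the implementation):

    # rough illustration of sequential splitting with the smoldocling patterns
    import re

    PATTERNS = [r"[0-9]", r"[a-zA-Z0-9_]+|[^a-zA-Z0-9_\s]+"]

    def split_pieces(text: str) -> list[str]:
        pieces = [text]
        for pat in PATTERNS:
            next_pieces: list[str] = []
            for piece in pieces:
                # keep both the matches and the text between them, dropping empty strings
                next_pieces += [p for p in re.split(f"({pat})", piece) if p]
            pieces = next_pieces
        return pieces

    print(split_pieces("<loc_42> table, page 3"))
    # ['<', 'loc_', '4', '2', '>', ' ', 'table', ',', ' ', 'page', ' ', '3']

Every digit ends up as its own piece, which lines up with the digits pre-tokenizer referenced in the comment added above, while the second pattern separates word-like runs from punctuation runs.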
