
Commit ea265fe

support for smoldocling
Signed-off-by: ryan-mangeno <[email protected]>
1 parent: b25e927 · commit: ea265fe

File tree — 6 files changed: +34 −1 lines changed

  convert_hf_to_gguf_update.py
  gguf-py/gguf/tensor_mapping.py
  include/llama.h
  src/llama-model.cpp
  src/llama-model.h
  src/llama-vocab.cpp


convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -128,6 +128,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "llama4",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
     {"name": "pixtral",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
     {"name": "seed-coder",  "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
+    {"name": "smoldocling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ds4sd/SmolDocling-256M-preview", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
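For context, a rough sketch of what the new registry entry feeds into: the update script downloads the tokenizer from the listed repo and fingerprints the token ids it produces for a fixed probe string, and that fingerprint is what convert_hf_to_gguf.py is meant to match later when picking the pre-tokenizer. The snippet below is a simplified illustration under that assumption; the helper name and probe text are made up, not the script's actual ones.

    # simplified sketch (not the actual update script): fingerprint a registered tokenizer
    from hashlib import sha256
    from transformers import AutoTokenizer

    def chkhsh_for(repo: str, probe: str) -> str:
        tok = AutoTokenizer.from_pretrained(repo)     # fetches tokenizer.json and friends
        ids = tok.encode(probe)                       # pre-tokenizer + BPE applied here
        return sha256(str(ids).encode()).hexdigest()  # hash of the resulting id sequence

    # hypothetical probe text; the real script uses its own fixed test string
    print(chkhsh_for("ds4sd/SmolDocling-256M-preview", "Hello world 1234 <loc_12>"))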

gguf-py/gguf/tensor_mapping.py

Lines changed: 16 additions & 1 deletion
@@ -32,6 +32,7 @@ class TensorNameMap:
             "model.word_embeddings",                # bailingmoe
             "language_model.model.embed_tokens",    # llama4
             "encoder",                              # neobert
+            "model.text_model.embed_tokens.weight", # smoldocling
         ),

         # Token type embeddings
@@ -63,7 +64,7 @@ class TensorNameMap:
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                # gptneox
             "lm_head",                  # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe
-            "output",                   # llama-pth bloom internlm2
+            "output",                   # llama-pth bloom internlm2 smoldocling
             "word_embeddings_for_head", # persimmon
             "lm_head.linear",           # phi2
             "output_layer",             # chatglm
@@ -93,6 +94,7 @@ class TensorNameMap:
             "model.ln_out",              # rwkv7
             "backbone.final_layer_norm", # wavtokenizer
             "model.norm",                # llama4
+            "output_norm",               # smoldocling
         ),

         # Rope frequencies
@@ -136,6 +138,7 @@ class TensorNameMap:
             "model.layers.{bid}.ln1",                   # rwkv7
             "model.layers.{bid}.input_layernorm",       # llama4
             "transformer_encoder.{bid}.attention_norm", # neobert
+            "blk.{bid}.attn_norm",                      # smoldocling
         ),

         # Attention norm 2
@@ -179,6 +182,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok
             "transformer.h.{bid}.attn.attention.q_proj",                  # exaone
             "model.layers.{bid}.self_attn.q_proj",                        # llama4
+            "blk.{bid}.attn_q",                                           # smoldocling
         ),

         # Attention key
@@ -195,6 +199,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok
             "transformer.h.{bid}.attn.attention.k_proj",                # exaone
             "model.layers.{bid}.self_attn.k_proj",                      # llama4
+            "blk.{bid}.attn_k",                                         # smoldocling
         ),

         # Attention value
@@ -210,6 +215,8 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok
             "transformer.h.{bid}.attn.attention.v_proj",                  # exaone
             "model.layers.{bid}.self_attn.v_proj",                        # llama4
+            "blk.{bid}.attn_v",                                           # smoldocling
+
         ),

         # Attention output
@@ -240,6 +247,7 @@ class TensorNameMap:
             "transformer.h.{bid}.attn.attention.out_proj", # exaone
             "model.layers.{bid}.self_attn.o_proj",         # llama4
             "transformer_encoder.{bid}.wo",                # neobert
+            "blk.{bid}.attn_output",                       # smoldocling
         ),

         # Attention output norm
@@ -249,6 +257,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.norm1",                     # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_1",     # Grok
             "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
+            "blk.{bid}.attn_norm",                            # smoldocling
         ),

         MODEL_TENSOR.ATTN_POST_NORM: (
@@ -281,6 +290,7 @@ class TensorNameMap:
             "transformer.layers.{bid}.ffn_norm",           # openelm
             "model.layers.{bid}.post_attention_layernorm", # llama4
             "transformer_encoder.{bid}.ffn_norm",          # neobert
+            "blk.{bid}.ffn_norm",                          # smoldocling
         ),

         # Post feed-forward norm
@@ -346,6 +356,7 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.c_fc_1",          # exaone
             "model.layers.{bid}.feed_forward.up_proj", # llama4
             "transformer_encoder.{bid}.ffn.w12",       # neobert
+            "blk.{bid}.ffn_up",                        # smoldocling
         ),

         MODEL_TENSOR.FFN_UP_EXP: (
@@ -383,6 +394,8 @@ class TensorNameMap:
             "model.layers.{bid}.residual_mlp.w1",        # arctic
             "transformer.h.{bid}.mlp.c_fc_0",            # exaone
             "model.layers.{bid}.feed_forward.gate_proj", # llama4
+            "blk.{bid}.ffn_gate",                        # smoldocling
+
         ),

         MODEL_TENSOR.FFN_GATE_EXP: (
@@ -429,6 +442,8 @@ class TensorNameMap:
             "model.layers.h.{bid}.mlp.c_proj",           # exaone
             "model.layers.{bid}.feed_forward.down_proj", # llama4
             "transformer_encoder.{bid}.ffn.w3",          # neobert
+            "blk.{bid}.ffn_down",                        # smoldocling
+
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (
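As a rough illustration of what these mappings do (a simplified stand-in, not the real gguf-py TensorNameMap class): each per-block template containing "{bid}" is expanded for every layer index, so a checkpoint tensor name can be resolved to the corresponding GGUF name. The source names below are a small llama-style subset chosen only for the example.

    # simplified sketch of the {bid}-template expansion behind TensorNameMap
    GGUF_NAMES = {
        "attn_q": "blk.{bid}.attn_q",
        "attn_k": "blk.{bid}.attn_k",
        "attn_v": "blk.{bid}.attn_v",
    }
    SOURCE_TEMPLATES = {
        "attn_q": ("model.layers.{bid}.self_attn.q_proj",),  # llama-style checkpoints
        "attn_k": ("model.layers.{bid}.self_attn.k_proj",),
        "attn_v": ("model.layers.{bid}.self_attn.v_proj",),
    }

    def build_mapping(n_blocks: int) -> dict[str, str]:
        mapping: dict[str, str] = {}
        for tensor, sources in SOURCE_TEMPLATES.items():
            for bid in range(n_blocks):
                for src in sources:
                    mapping[src.format(bid=bid)] = GGUF_NAMES[tensor].format(bid=bid)
        return mapping

    # 30 blocks, matching the n_layer case handled in src/llama-model.cpp below
    mapping = build_mapping(n_blocks=30)
    print(mapping["model.layers.0.self_attn.q_proj"])  # -> blk.0.attn_q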

include/llama.h

Lines changed: 1 addition & 0 deletions
@@ -117,6 +117,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_LLAMA4      = 33,
         LLAMA_VOCAB_PRE_TYPE_PIXTRAL     = 34,
         LLAMA_VOCAB_PRE_TYPE_SEED_CODER  = 35,
+        LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING = 36,
     };

     enum llama_rope_type {

src/llama-model.cpp

Lines changed: 2 additions & 0 deletions
@@ -40,6 +40,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_190M: return "190M";
         case LLM_TYPE_220M: return "220M";
         case LLM_TYPE_250M: return "250M";
+        case LLM_TYPE_256M: return "256M";
         case LLM_TYPE_270M: return "270M";
         case LLM_TYPE_335M: return "335M";
         case LLM_TYPE_410M: return "410M";
@@ -575,6 +576,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 case 22: type = LLM_TYPE_1B; break;
                 case 26: type = LLM_TYPE_3B; break;
                 case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
+                case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
                 // granite uses a vocab with len 49152
                 case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
                 case 36: type = LLM_TYPE_8B; break; // granite

src/llama-model.h

Lines changed: 1 addition & 0 deletions
@@ -32,6 +32,7 @@ enum llm_type {
     LLM_TYPE_190M,
     LLM_TYPE_220M,
     LLM_TYPE_250M,
+    LLM_TYPE_256M, // smoldocling
     LLM_TYPE_270M,
     LLM_TYPE_335M,
     LLM_TYPE_410M,

src/llama-vocab.cpp

Lines changed: 13 additions & 0 deletions
@@ -424,6 +424,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING:
+                // uses digits and byte level pre tokenizers defined in the pre_tokenizer section of
+                // https://huggingface.co/ds4sd/SmolDocling-256M-preview/raw/main/tokenizer.json
+                regex_exprs = {
+                    "[0-9]",
+                    "[a-zA-Z0-9_]+|[^a-zA-Z0-9_\\s]+",
+                };
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -1656,6 +1663,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "seed-coder") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "smoldocling") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING;
+                clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -1839,6 +1850,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<EOT>"
                     || t.first == "_<EOT>"
                     || t.first == "<|end▁of▁sentence|>" // DeepSeek
+                    || t.first == "<end_of_utterance>"  // smoldocling
                     ) {
                     special_eot_id = t.second;
                     if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1998,6 +2010,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<EOT>"
                     || t.first == "_<EOT>"
                     || t.first == "<|end_of_text|>"
+                    || t.first == "<end_of_utterance>" // smoldocling
                     ) {
                     special_eog_ids.insert(t.second);
                     if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
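To see what the two smoldocling patterns do in practice, here is a plain-Python approximation of splitting a string with them applied one after the other (llama.cpp's pre-tokenizer operates on a different internal representation, so this only illustrates the resulting pieces, not the implementation):

    # rough illustration of sequential splitting with the smoldocling patterns
    import re

    PATTERNS = [r"[0-9]", r"[a-zA-Z0-9_]+|[^a-zA-Z0-9_\s]+"]

    def split_pieces(text: str) -> list[str]:
        pieces = [text]
        for pat in PATTERNS:
            next_pieces: list[str] = []
            for piece in pieces:
                # keep both the matches and the text between them, dropping empty strings
                next_pieces += [p for p in re.split(f"({pat})", piece) if p]
            pieces = next_pieces
        return pieces

    print(split_pieces("<loc_42> table, page 3"))
    # ['<', 'loc_', '4', '2', '>', ' ', 'table', ',', ' ', 'page', ' ', '3']

Every digit ends up as its own piece, which lines up with the digits pre-tokenizer referenced in the comment added above, while the second pattern separates word-like runs from punctuation runs.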
