Commit c2202d2

fix: Use granite-docling pre
Branch: gabe-l-hart/GraniteDocling
Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 428db16

File tree

2 files changed: +4 −5 lines

convert_hf_to_gguf.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -738,9 +738,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
             # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
             res = "grok-2"
-        if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e":
-            # ref: https://huggingface.co/ibm-granite/granite-docling-258M
-            res = "trillion"
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-bpe"
@@ -894,6 +891,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
             # ref: https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base
             res = "llada-moe"
+        if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e":
+            # ref: https://huggingface.co/ibm-granite/granite-docling-258M
+            res = "granite-docling"
 
         if res is None:
             logger.warning("\n")
```
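For context, `get_vocab_base_pre` identifies which pre-tokenizer to record in the GGUF by hashing the token IDs that the model's tokenizer produces for a fixed test string, then comparing against known values like the ones in the diff above. The sketch below illustrates that idea only; the `chktxt` placeholder and the `compute_chkhsh` helper name are not the upstream code.

```python
# Minimal sketch of the chkhsh mechanism (illustrative; not the upstream implementation).
from hashlib import sha256

from transformers import AutoTokenizer

# Placeholder test string. The real chktxt in convert_hf_to_gguf.py is a longer,
# carefully chosen string that exercises pre-tokenizer edge cases.
chktxt = "Hello world!  123  \t naïve café 🦙"

def compute_chkhsh(model_id: str) -> str:
    """Hash the token IDs produced for the fixed test string."""
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    ids = tokenizer.encode(chktxt)
    return sha256(str(ids).encode()).hexdigest()

# In the real script, the hash "53e32597..." now resolves to res = "granite-docling"
# instead of reusing the "trillion" pre-tokenizer (see the diff above).
```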

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -140,6 +140,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
     {"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
     {"name": "llada-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base", },
+    {"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
 ]
 
 # some models are known to be broken upstream, so we will skip them as exceptions
@@ -160,8 +161,6 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
     {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
     {"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
-    # granite-docling uses gpt-2 pre w/ clean_spaces false which maps to trillion
-    {"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", "chkhsh": "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e"},
 ]
```
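The models list in convert_hf_to_gguf_update.py is what drives regeneration of the hash checks shown earlier: for each entry the update script fetches the tokenizer, computes its hash, and emits a matching `if chkhsh == ...` branch. A rough, self-contained sketch of that loop under the same assumptions as the previous example (the hashing details and generated template here are illustrative, not the upstream source):

```python
# Rough sketch of regenerating chkhsh checks from a models list.
# The chktxt placeholder and the emitted template are illustrative only.
from hashlib import sha256

from transformers import AutoTokenizer

chktxt = "Hello world!  123  \t naïve café 🦙"  # placeholder, see the earlier sketch

models = [
    {"name": "granite-docling",
     "repo": "https://huggingface.co/ibm-granite/granite-docling-258M"},
]

src_func = ""
for model in models:
    repo_id = model["repo"].removeprefix("https://huggingface.co/")
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    chkhsh = sha256(str(tokenizer.encode(chktxt)).encode()).hexdigest()
    # Emit one branch per tokenizer, matching the shape of the checks in
    # convert_hf_to_gguf.py's get_vocab_base_pre.
    src_func += (
        f'        if chkhsh == "{chkhsh}":\n'
        f'            # ref: {model["repo"]}\n'
        f'            res = "{model["name"]}"\n'
    )

print(src_func)
```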