Skip to content

Commit cbc4bc2

Browse files
committed
feat: Add granite-docling preprocessor workaround mapping
It uses the same pretokenizer as granite3, but the vocab is different, so we need a different chkhsh.

Branch: GraniteDocling
Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 703f9e3 commit cbc4bc2

File tree

2 files changed

+5
-0
lines changed

2 files changed

+5
-0
lines changed

convert_hf_to_gguf.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -738,6 +738,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
738738
if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
739739
# ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
740740
res = "grok-2"
741+
if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e":
742+
# ref: https://huggingface.co/ibm-granite/granite-docling-258M
743+
res = "refact"
741744
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
742745
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
743746
res = "llama-bpe"

convert_hf_to_gguf_update.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -160,6 +160,8 @@ class TOKENIZER_TYPE(IntEnum):
160160
{"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
161161
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
162162
{"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
163+
# granite-docling uses granite3 tokenizer w/ additional tokens => refact pretokenizer
164+
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", "chkhsh": "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e"},
163165
]
164166

165167

0 commit comments

Comments
 (0)