@@ -140,6 +140,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "exaone4",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
     {"name": "mellum",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
     {"name": "llada-moe",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base", },
+    {"name": "granite-docling",  "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
 ]
 
 # some models are known to be broken upstream, so we will skip them as exceptions
@@ -160,8 +161,6 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "kimi-k2",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
     {"name": "qwen2",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
     {"name": "grok-2",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
-    # granite-docling uses gpt-2 pre w/ clean_spaces false which maps to trillion
-    {"name": "trillion",  "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", "chkhsh": "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e"},
 ]
 
 
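For context, the `chkhsh` values in the second list are pre-tokenizer fingerprints: the update script loads each entry's tokenizer from its `repo`, encodes a fixed probe string, and hashes the resulting token IDs so that the converter can recognize the pre-tokenizer later. A minimal sketch of that idea, assuming a Hugging Face tokenizer is reachable at the `repo` ID; the probe text and helper name here are illustrative, not the script's actual constants:

```python
# Sketch only -- not the actual convert_hf_to_gguf_update.py code.
# Idea: tokenize a fixed probe string and hash the token-ID list to get a
# fingerprint ("chkhsh") for the model's pre-tokenizer.
from hashlib import sha256

from transformers import AutoTokenizer  # assumption: tokenizer is fetched from the "repo" URL

# Hypothetical probe text; the real script uses its own fixed check string.
CHK_TXT = "Hello world \u00e9\u00e8 \u00f1 \U0001f600 123 ....\n\t  test"

def compute_chkhsh(repo_id: str) -> str:
    tok = AutoTokenizer.from_pretrained(repo_id)
    ids = tok.encode(CHK_TXT)
    # Hash the string form of the token-ID list to produce the fingerprint.
    return sha256(str(ids).encode()).hexdigest()

if __name__ == "__main__":
    # e.g. the entry added in this diff
    print(compute_chkhsh("ibm-granite/granite-docling-258M"))
```

Two tokenizers that pre-tokenize identically produce the same fingerprint, which is why granite-docling previously collided with the "trillion" entry removed above; giving it its own `models[]` entry lets it be hashed and named independently.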