@@ -128,6 +128,7 @@ class TOKENIZER_TYPE(IntEnum):
128128 {"name" : "llama4" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct" , },
129129 {"name" : "pixtral" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/mistral-community/pixtral-12b" , },
130130 {"name" : "seed-coder" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base" , },
131+ {"name" : "a.x-4.0" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/skt/A.X-4.0" , },
131132]
132133
133134# some models are known to be broken upstream, so we will skip them as exceptions
@@ -137,6 +138,12 @@ class TOKENIZER_TYPE(IntEnum):
137138 {"name" : "chatglm-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/THUDM/glm-4-9b-chat" , "chkhsh" : "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516" },
138139 {"name" : "glm4" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/THUDM/glm-4-9b-hf" , "chkhsh" : "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2" },
139140 {"name" : "minerva-7b" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0" , "chkhsh" : "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35" },
141+ {"name" : "hunyuan" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tencent/Hunyuan-A13B-Instruct" , "chkhsh" : "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664" },
142+ # falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
143+ {"name" : "falcon-h1" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base" , "chkhsh" : "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6" },
144+ {"name" : "falcon-h1" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/Falcon-H1-1B-Base" , "chkhsh" : "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86" },
145+ {"name" : "falcon-h1" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/Falcon-H1-7B-Base" , "chkhsh" : "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896" },
146+ {"name" : "falcon-h1" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/Falcon-H1-34B-Base" , "chkhsh" : "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b" },
140147]
141148
142149
0 commit comments