@@ -86,7 +86,6 @@ class TOKENIZER_TYPE(IntEnum):
8686 {"name" : "falcon" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/falcon-7b" , },
8787 {"name" : "bert-bge" , "tokt" : TOKENIZER_TYPE .WPM , "repo" : "https://huggingface.co/BAAI/bge-small-en-v1.5" , },
8888 {"name" : "falcon3" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/Falcon3-7B-Base" , },
89- {"name" : "falcon-h1" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/Falcon-H1-7B-Base" , },
9089 {"name" : "bert-bge-large" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/BAAI/bge-large-zh-v1.5" , },
9190 {"name" : "mpt" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/mosaicml/mpt-7b" , },
9291 {"name" : "starcoder" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/bigcode/starcoder2-3b" , },
@@ -139,6 +138,11 @@ class TOKENIZER_TYPE(IntEnum):
139138 {"name" : "glm4" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/THUDM/glm-4-9b-hf" , "chkhsh" : "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2" },
140139 {"name" : "minerva-7b" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0" , "chkhsh" : "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35" },
141140 {"name" : "hunyuan" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tencent/Hunyuan-A13B-Instruct" , "chkhsh" : "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664" },
141+ # falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
142+ {"name" : "falcon-h1" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base" , "chkhsh" : "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6" },
143+ {"name" : "falcon-h1" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/Falcon-H1-1B-Base" , "chkhsh" : "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86" },
144+ {"name" : "falcon-h1" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/Falcon-H1-7B-Base" , "chkhsh" : "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896" },
145+ {"name" : "falcon-h1" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/Falcon-H1-34B-Base" , "chkhsh" : "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b" },
142146]
143147
144148
0 commit comments