@@ -65,49 +65,50 @@ class TOKENIZER_TYPE(IntEnum):
6565
6666# TODO: add models here, base models preferred
6767models = [
68- {"name" : "llama-spm" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/meta-llama/Llama-2-7b-hf" , },
69- {"name" : "llama-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/meta-llama/Meta-Llama-3-8B" , },
70- {"name" : "phi-3" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct" , },
71- {"name" : "deepseek-llm" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base" , },
72- {"name" : "deepseek-coder" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base" , },
73- {"name" : "falcon" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/falcon-7b" , },
74- {"name" : "bert-bge" , "tokt" : TOKENIZER_TYPE .WPM , "repo" : "https://huggingface.co/BAAI/bge-small-en-v1.5" , },
75- {"name" : "falcon3" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/Falcon3-7B-Base" , },
76- {"name" : "bert-bge-large" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/BAAI/bge-large-zh-v1.5" , },
77- {"name" : "mpt" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/mosaicml/mpt-7b" , },
78- {"name" : "starcoder" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/bigcode/starcoder2-3b" , },
79- {"name" : "gpt-2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/openai-community/gpt2" , },
80- {"name" : "stablelm2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b" , },
81- {"name" : "refact" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/smallcloudai/Refact-1_6-base" , },
82- {"name" : "command-r" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/CohereForAI/c4ai-command-r-v01" , },
83- {"name" : "qwen2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/Qwen/Qwen1.5-7B" , },
84- {"name" : "olmo" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/allenai/OLMo-1.7-7B-hf" , },
85- {"name" : "dbrx" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/databricks/dbrx-base" , },
86- {"name" : "jina-v1-en" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en" , },
87- {"name" : "jina-v2-en" , "tokt" : TOKENIZER_TYPE .WPM , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-en" , }, # WPM!
88- {"name" : "jina-v2-es" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-es" , },
89- {"name" : "jina-v2-de" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-de" , },
90- {"name" : "smaug-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct" , },
91- {"name" : "poro-chat" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/LumiOpen/Poro-34B-chat" , },
92- {"name" : "jina-v2-code" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-code" , },
93- {"name" : "viking" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/LumiOpen/Viking-7B" , }, # Also used for Viking 13B and 33B
94- {"name" : "gemma" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/google/gemma-2b" , },
95- {"name" : "gemma-2" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/google/gemma-2-9b" , },
96- {"name" : "jais" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/core42/jais-13b" , },
97- {"name" : "t5" , "tokt" : TOKENIZER_TYPE .UGM , "repo" : "https://huggingface.co/google-t5/t5-small" , },
98- {"name" : "codeshell" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/WisdomShell/CodeShell-7B" , },
99- {"name" : "tekken" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407" , },
100- {"name" : "smollm" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/HuggingFaceTB/SmolLM-135M" , },
101- {'name' : "bloom" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/bigscience/bloom" , },
102- {'name' : "gpt3-finnish" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/TurkuNLP/gpt3-finnish-small" , },
103- {"name" : "exaone" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct" , },
104- {"name" : "phi-2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/microsoft/phi-2" , },
105- {"name" : "chameleon" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/facebook/chameleon-7b" , },
106- {"name" : "minerva-7b" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0" , },
107- {"name" : "roberta-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/sentence-transformers/stsb-roberta-base" },
108- {"name" : "gigachat" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct" },
109- {"name" : "megrez" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/Infinigence/Megrez-3B-Instruct" },
110- {"name" : "deepseek-v3" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/deepseek-ai/DeepSeek-V3" },
68+ {"name" : "llama-spm" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/meta-llama/Llama-2-7b-hf" , },
69+ {"name" : "llama-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/meta-llama/Meta-Llama-3-8B" , },
70+ {"name" : "phi-3" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct" , },
71+ {"name" : "deepseek-llm" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base" , },
72+ {"name" : "deepseek-coder" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base" , },
73+ {"name" : "falcon" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/falcon-7b" , },
74+ {"name" : "bert-bge" , "tokt" : TOKENIZER_TYPE .WPM , "repo" : "https://huggingface.co/BAAI/bge-small-en-v1.5" , },
75+ {"name" : "falcon3" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/Falcon3-7B-Base" , },
76+ {"name" : "bert-bge-large" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/BAAI/bge-large-zh-v1.5" , },
77+ {"name" : "mpt" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/mosaicml/mpt-7b" , },
78+ {"name" : "starcoder" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/bigcode/starcoder2-3b" , },
79+ {"name" : "gpt-2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/openai-community/gpt2" , },
80+ {"name" : "stablelm2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b" , },
81+ {"name" : "refact" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/smallcloudai/Refact-1_6-base" , },
82+ {"name" : "command-r" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/CohereForAI/c4ai-command-r-v01" , },
83+ {"name" : "qwen2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/Qwen/Qwen1.5-7B" , },
84+ {"name" : "olmo" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/allenai/OLMo-1.7-7B-hf" , },
85+ {"name" : "dbrx" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/databricks/dbrx-base" , },
86+ {"name" : "jina-v1-en" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en" , },
87+ {"name" : "jina-v2-en" , "tokt" : TOKENIZER_TYPE .WPM , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-en" , }, # WPM!
88+ {"name" : "jina-v2-es" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-es" , },
89+ {"name" : "jina-v2-de" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-de" , },
90+ {"name" : "smaug-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct" , },
91+ {"name" : "poro-chat" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/LumiOpen/Poro-34B-chat" , },
92+ {"name" : "jina-v2-code" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-code" , },
93+ {"name" : "viking" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/LumiOpen/Viking-7B" , }, # Also used for Viking 13B and 33B
94+ {"name" : "gemma" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/google/gemma-2b" , },
95+ {"name" : "gemma-2" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/google/gemma-2-9b" , },
96+ {"name" : "jais" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/core42/jais-13b" , },
97+ {"name" : "t5" , "tokt" : TOKENIZER_TYPE .UGM , "repo" : "https://huggingface.co/google-t5/t5-small" , },
98+ {"name" : "codeshell" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/WisdomShell/CodeShell-7B" , },
99+ {"name" : "tekken" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407" , },
100+ {"name" : "smollm" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/HuggingFaceTB/SmolLM-135M" , },
101+ {'name' : "bloom" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/bigscience/bloom" , },
102+ {'name' : "gpt3-finnish" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/TurkuNLP/gpt3-finnish-small" , },
103+ {"name" : "exaone" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct" , },
104+ {"name" : "phi-2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/microsoft/phi-2" , },
105+ {"name" : "chameleon" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/facebook/chameleon-7b" , },
106+ {"name" : "minerva-7b" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0" , },
107+ {"name" : "roberta-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/sentence-transformers/stsb-roberta-base" },
108+ {"name" : "gigachat" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct" },
109+ {"name" : "megrez" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/Infinigence/Megrez-3B-Instruct" },
110+ {"name" : "deepseek-v3" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/deepseek-ai/DeepSeek-V3" },
111+ {"name" : "deepseek-r1-qwen" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" },
111112]
112113
113114
0 commit comments