@@ -62,6 +62,7 @@ class TOKENIZER_TYPE(IntEnum):
6262 {"name" : "mpt" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/mosaicml/mpt-7b" , },
6363 {"name" : "starcoder" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/bigcode/starcoder2-3b" , },
6464 {"name" : "gpt-2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/openai-community/gpt2" , },
65+ {"name" : "command-r" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/CohereForAI/c4ai-command-r-v01" , },
6566]
6667
6768# make directory "models/tokenizers" if it doesn't exist
@@ -102,6 +103,14 @@ def download_file_with_auth(url, token, save_path):
102103 save_path = f"models/tokenizers/{ name } /tokenizer.json"
103104 download_file_with_auth (url , token , save_path )
104105
106+ # if the downloaded file is smaller than 1 KB, we likely need to download the LFS-tracked file instead
107+ if os .path .getsize (save_path ) < 1024 :
108+ # remove the file
109+ os .remove (save_path )
110+ url = f"{ repo } /resolve/main/tokenizer.json"
111+ save_path = f"models/tokenizers/{ name } /tokenizer.json"
112+ download_file_with_auth (url , token , save_path )
113+
105114 if tokt == TOKENIZER_TYPE .SPM :
106115 url = f"{ repo } /resolve/main/tokenizer.model"
107116 save_path = f"models/tokenizers/{ name } /tokenizer.model"
0 commit comments