@@ -64,6 +64,7 @@ class TOKENIZER_TYPE(IntEnum):
6464 {"name" : "starcoder" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/bigcode/starcoder2-3b" , },
6565 {"name" : "gpt-2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/openai-community/gpt2" , },
6666 {"name" : "refact" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/smallcloudai/Refact-1_6-base" , },
67+ {"name" : "command-r" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/CohereForAI/c4ai-command-r-v01" , },
6768]
6869
6970# make directory "models/tokenizers" if it doesn't exist
@@ -104,6 +105,14 @@ def download_file_with_auth(url, token, save_path):
104105 save_path = f"models/tokenizers/{ name } /tokenizer.json"
105106 download_file_with_auth (url , token , save_path )
106107
108+ # if the downloaded file is smaller than 1KB, it is likely a Git LFS pointer rather than the actual tokenizer, so re-download it via the /resolve/ URL
109+ if os .path .getsize (save_path ) < 1024 :
110+ # remove the file
111+ os .remove (save_path )
112+ url = f"{ repo } /resolve/main/tokenizer.json"
113+ save_path = f"models/tokenizers/{ name } /tokenizer.json"
114+ download_file_with_auth (url , token , save_path )
115+
107116 if tokt == TOKENIZER_TYPE .SPM :
108117 url = f"{ repo } /resolve/main/tokenizer.model"
109118 save_path = f"models/tokenizers/{ name } /tokenizer.model"
0 commit comments