Skip to content

Commit a59abea

Browse files
authored
Added better guidance for when the deprecated tokenizer path fails (#1568)
Adds a check for whether the deprecated tokenizer path is being used when the tokenizer path does not exist. That way, the error message can guide users to switch to the supported `hf_assets_path` option and the `download_hf_assets.py` script.
1 parent 297a72a commit a59abea

File tree

1 file changed

+10
-1
lines changed

1 file changed

+10
-1
lines changed

torchtitan/components/tokenizer.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -82,7 +82,16 @@ def _load_config(self, config_path: str) -> Optional[dict]:
8282
def _load_tokenizer_from_path(self, tokenizer_path: str) -> Tokenizer:
8383
"""Load tokenizer from various file formats."""
8484
if not os.path.exists(tokenizer_path):
85-
raise FileNotFoundError(f"Tokenizer path '{tokenizer_path}' does not exist")
85+
if "assets/tokenizer" in tokenizer_path:
86+
raise FileNotFoundError(
87+
"Detected ./assets/tokenizer path which was deprecated in https://github.com/pytorch/torchtitan/pull/1540.\n"
88+
"Remove --model.tokenizer_path and download to --model.hf_assets_path using ./scripts/download_hf_assets.py\n"
89+
"See example: https://github.com/pytorch/torchtitan/tree/main/torchtitan/models/deepseek_v3#download-tokenizer"
90+
)
91+
else:
92+
raise FileNotFoundError(
93+
f"Tokenizer path '{tokenizer_path}' does not exist"
94+
)
8695

8796
# Define paths for different tokenizer file types
8897
tokenizer_json_path = os.path.join(tokenizer_path, "tokenizer.json")

0 commit comments

Comments
 (0)