Commit b5b7ffb
[HF] Deprecate tokenizer_path in Toml Files (#1592)
This PR deprecates `model.tokenizer_path` in .toml files and replaces it with `model.hf_assets_path`. See #1526 for more details.

Reasoning: `tokenizer_path` is still supported in .toml files by naively overriding `hf_assets_path` when it is specified. This preserves backwards compatibility, but it is not meant to be a well-maintained option going forward.

`tokenizer_path` is used for:
- loading a tokenizer

`hf_assets_path` can be used for:
- a one-stop shop for accessing an HF repo's files
- loading a tokenizer (or multiple tokenizers)
- loading safetensors checkpoints
- loading other HF assets within the same repo (encoders, autoencoders, etc.)

We rename `tokenizer_path` to `hf_assets_path` to be consistent with this new functionality, and we additionally change the default path from `assets/tokenizer` to `assets/hf/<model-name>` to reflect the new `download_hf_assets.py` script.

Breaking changes:
- You may have to download or move tokenizers to the new default HF assets path `./assets/hf/<model-name>`.
- You may have to download "duplicate" tokenizers for different versions of the same model, e.g. Llama-3.1-8B and Llama-3.1-405B will each require a tokenizer in their respective HF assets path.
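The backward-compatibility behavior described above (a specified `tokenizer_path` naively overriding `hf_assets_path`) can be sketched as follows. `ModelConfig` and `resolve_hf_assets_path` are hypothetical names chosen for illustration, not torchtitan's actual config code.

```python
import warnings
from dataclasses import dataclass
from typing import Optional


@dataclass
class ModelConfig:
    """Hypothetical stand-in for the [model] section of a .toml config."""
    hf_assets_path: str
    tokenizer_path: Optional[str] = None  # deprecated


def resolve_hf_assets_path(cfg: ModelConfig) -> str:
    """If the deprecated tokenizer_path is set, it naively overrides hf_assets_path."""
    if cfg.tokenizer_path is not None:
        warnings.warn(
            "model.tokenizer_path is deprecated; use model.hf_assets_path instead",
            DeprecationWarning,
        )
        return cfg.tokenizer_path
    return cfg.hf_assets_path


# New-style config resolves to the HF assets path directly:
new_cfg = ModelConfig(hf_assets_path="./assets/hf/Llama-3.1-8B")

# Old-style config still works, and its tokenizer_path wins:
old_cfg = ModelConfig(
    hf_assets_path="./assets/hf/Llama-3.1-8B",
    tokenizer_path="./assets/tokenizer/Llama-3.1-8B",
)
```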
1 parent 9f47ceb commit b5b7ffb

File tree

10 files changed: +10 −10 lines
torchtitan/experiments/deepseek_v3/train_configs/deepseek_v2.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -22,7 +22,7 @@ enable_wandb = false
 name = "deepseek_v2"
 flavor = "deepseek-ai/DeepSeek-V2-Lite"
 # test tokenizer.model, for debug purpose only
-tokenizer_path = "./tests/assets/tokenizer"
+hf_assets_path = "./tests/assets/tokenizer"
 # converters = ["float8"]

 [optimizer]
```

torchtitan/experiments/llama4/train_configs/debug_model.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -22,7 +22,7 @@ enable_wandb = false
 name = "llama4"
 flavor = "debugmodel"
 # test tokenizer.model, for debug purpose only
-tokenizer_path = "./tests/assets/tokenizer"
+hf_assets_path = "./tests/assets/tokenizer"
 # converters = ["float8"]

 [optimizer]
```

torchtitan/experiments/llama4/train_configs/llama4_17bx128e.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -17,7 +17,7 @@ save_tb_folder = "tb"
 [model]
 name = "llama4"
 flavor = "17bx128e"
-tokenizer_path = "./assets/tokenizer/Llama-4-Scout-17B-16E"
+hf_assets_path = "./assets/hf/Llama-4-Scout-17B-128E"
 # converters = ["float8"]

 [optimizer]
```

torchtitan/experiments/llama4/train_configs/llama4_17bx16e.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -17,7 +17,7 @@ save_tb_folder = "tb"
 [model]
 name = "llama4"
 flavor = "17bx16e"
-tokenizer_path = "./assets/tokenizer/Llama-4-Scout-17B-16E"
+hf_assets_path = "./assets/hf/Llama-4-Scout-17B-16E"
 # converters = ["float8"]

 [optimizer]
```

torchtitan/experiments/qwen3/README.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -10,7 +10,7 @@ Other model sizes are added to the args, but toml file configs need to be added
 
 #### Download Qwen3 tokenizer
 
-```python scripts/download_tokenizer.py --repo_id Qwen/Qwen3-0.6B```
+```python scripts/download_hf_assets.py --repo_id Qwen/Qwen3-0.6B --asset tokenizer```
 
 
 #### Parity with HF
````
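The download command above places tokenizer files under the new `./assets/hf/<model-name>` layout. A rough sketch of the equivalent logic using `huggingface_hub` directly is shown below; the file patterns and the `tokenizer_download_plan` helper are assumptions for illustration, and the repo's own `scripts/download_hf_assets.py` remains the supported path.

```python
def tokenizer_download_plan(repo_id: str) -> tuple[str, list[str]]:
    """Return (local_dir, allow_patterns) matching the ./assets/hf/<model-name> layout.

    The pattern list is an assumption about which files tokenizers typically need.
    """
    model_name = repo_id.split("/")[-1]
    local_dir = f"./assets/hf/{model_name}"
    patterns = ["tokenizer*", "*.model", "special_tokens_map.json"]
    return local_dir, patterns


# To actually download (requires the third-party huggingface_hub package and network):
#   from huggingface_hub import snapshot_download
#   local_dir, patterns = tokenizer_download_plan("Qwen/Qwen3-0.6B")
#   snapshot_download(repo_id="Qwen/Qwen3-0.6B", local_dir=local_dir,
#                     allow_patterns=patterns)
```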

torchtitan/models/deepseek_v3/train_configs/debug_model.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -22,7 +22,7 @@ enable_wandb = false
 name = "deepseek_v3"
 flavor = "debugmodel"
 # test tokenizer, for debug purpose only
-tokenizer_path = "./tests/assets/tokenizer"
+hf_assets_path = "./tests/assets/tokenizer"
 # converters = ["float8"]

 [optimizer]
```

torchtitan/models/deepseek_v3/train_configs/deepseek_v3_16b.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -20,7 +20,7 @@ enable_wandb = false
 [model]
 name = "deepseek_v3"
 flavor = "16B"
-tokenizer_path = "./assets/tokenizer/deepseek-moe-16b-base"
+hf_assets_path = "./assets/hf/deepseek-moe-16b-base"
 # converters = ["float8"]

 [optimizer]
```

torchtitan/models/deepseek_v3/train_configs/deepseek_v3_671b.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -20,7 +20,7 @@ enable_wandb = false
 [model]
 name = "deepseek_v3"
 flavor = "671B"
-tokenizer_path = "./assets/tokenizer/DeepSeek-V3"
+hf_assets_path = "./assets/hf/DeepSeek-V3"
 # converters = ["float8"]

 [optimizer]
```

torchtitan/models/llama3/train_configs/llama3_405b.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -18,7 +18,7 @@ save_tb_folder = "tb"
 [model]
 name = "llama3"
 flavor = "405B"
-tokenizer_path = "./assets/tokenizer/Llama-3.1-8B"
+hf_assets_path = "./assets/hf/Llama-3.1-405B"
 converters = ["float8"]

 [optimizer]
```

torchtitan/models/llama3/train_configs/llama3_70b.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -18,7 +18,7 @@ save_tb_folder = "tb"
 [model]
 name = "llama3"
 flavor = "70B"
-tokenizer_path = "./assets/tokenizer/Llama-3.1-8B"
+hf_assets_path = "./assets/hf/Llama-3.1-70B"
 # converters = ["float8"]

 [optimizer]
```
