Skip to content

Commit a91b520

Browse files
Tokenizer: add_prefix_space shouldn't affect self.use_bos (#1328)
1 parent 5de6fdf commit a91b520

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

litgpt/tokenizer.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -73,11 +73,11 @@ def check_if_bos_token_used(self, checkpoint_dir: Path) -> bool:
73 73
return False
74 74
with open(tokenizer_config_path, encoding="utf-8") as fp:
75 75
config = json.load(fp)
76-
if any(config.get(check, False) for check in ("add_bos_token", "add_prefix_space")):
77-
return True
78-
# for examples that also use the Llama tokenizer, but do not have or set add_bos_token to True.
76+
if "add_bos_token" in config:
77+
return config["add_bos_token"]
78+
# if `add_bos_token` isn't in the config file, but LLaMA tokenizer is used - return True.
79 79
# ex: https://huggingface.co/stabilityai/StableBeluga2/blob/main/tokenizer_config.json#L2
80-
return config.get("add_bos_token") is None and config.get("tokenizer_class") == "LlamaTokenizer"
80+
return config.get("tokenizer_class") == "LlamaTokenizer"
81 81

82 82
def encode(
83 83
self,

0 commit comments

Comments
 (0)