Skip to content

Commit 3ae9f16

Browse files
authored
Remove warning for duplicates in string vocab (#38)
* remove warning for duplicates in string vocab
* lint
1 parent 51ec7fe commit 3ae9f16

File tree

1 file changed

+0
-10
lines changed

1 file changed

+0
-10
lines changed

genlm/backend/tokenization/vocab.py

Lines changed: 0 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
"""Functions to get and check HuggingFace tokenizer vocabularies"""
22

3-
import warnings
43
from transformers import AutoTokenizer
54

65
from genlm.backend.tokenization.bytes import ByteVocabError, get_byte_vocab
@@ -91,13 +90,4 @@ def bytes_to_strs(tokenizer, byte_vocab, byte2str_fallback):
9190

9291
str_vocab.append(token)
9392

94-
duplicates = {
95-
token: indices for token, indices in seen_tokens.items() if len(indices) > 1
96-
}
97-
if duplicates:
98-
warnings.warn(
99-
"Duplicate tokens found in string vocabulary. "
100-
"This may lead to downstream issues with the string vocabulary; we recommend using the byte vocabulary."
101-
)
102-
10393
return str_vocab

0 commit comments

Comments
 (0)