Skip to content

Commit e06c5d9

Browse files
committed
fix e5 bug
1 parent 59502a1 commit e06c5d9

File tree

1 file changed

+4
-6
lines changed

1 file changed

+4
-6
lines changed

model2vec/distill/tokenizer.py

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -269,14 +269,14 @@ def _process_internal_tokens(
269 269   # Isolate the prefix. We can't do first_token[0] because we don't know
270 270   # how long the prefix is.
271 271   # e.g., "Ġaaaa" -> "Ġ"
272     - a_index = 0 if "a" not in first_token else first_token.index("a")
    272 + a_index = None if "a" not in first_token else first_token.index("a")
273 273   word_prefix = first_token[:a_index]
274 274   is_byte_prefix = word_prefix == "Ġ"
275 275   second_token = encoded.tokens[1]
276 276   # The second token is the first subword token.
277 277   # If a tokenizer uses subwords, this token will have been prefixed.
278 278   # We don't know how long the prefix is.
279     - a_index = 0 if "a" not in second_token else second_token.index("a")
    279 + a_index = None if "a" not in second_token else second_token.index("a")
280 280   subword_prefix = second_token[:a_index]
281 281
282 282   pre_tokenizer: PreTokenizer | None = backend_tokenizer.pre_tokenizer
@@ -355,12 +355,10 @@ def _create_normalized_form(
355 355   if is_byte_prefix:
356 356   return token
357 357   # We need to check if the token is a subword or not and remove the prefix.
358     - if is_subword and subword_prefix:
    358 + if is_subword:
359 359   return token.removeprefix(subword_prefix)
360 360   # If the token is not a subword, we need to remove the word prefix, and add metaspace.
361     - if word_prefix:
362     - token = token.removeprefix(word_prefix)
363     - return f"▁{token}"
    361 + return f"▁{token.removeprefix(word_prefix)}"
364 362
365 363
366 364   def turn_tokens_into_ids(tokens: list[Token], tokenizer: PreTrainedTokenizerFast, unk_token: str) -> list[list[int]]:

0 commit comments

Comments
 (0)