@@ -269,14 +269,14 @@ def _process_internal_tokens(
269269 # Isolate the prefix. We can't do first_token[0] because we don't know
270270 # how long the prefix is.
271271 # e.g., "Ġaaaa" -> "Ġ"
272- a_index = 0 if "a" not in first_token else first_token .index ("a" )
272+ a_index = None if "a" not in first_token else first_token .index ("a" )
273273 word_prefix = first_token [:a_index ]
274274 is_byte_prefix = word_prefix == "Ġ"
275275 second_token = encoded .tokens [1 ]
276276 # The second token is the first subword token.
277277 # If a tokenizer uses subwords, this token will have been prefixed.
278278 # We don't know how long the prefix is.
279- a_index = 0 if "a" not in second_token else second_token .index ("a" )
279+ a_index = None if "a" not in second_token else second_token .index ("a" )
280280 subword_prefix = second_token [:a_index ]
281281
282282 pre_tokenizer : PreTokenizer | None = backend_tokenizer .pre_tokenizer
@@ -355,12 +355,10 @@ def _create_normalized_form(
355355 if is_byte_prefix :
356356 return token
357357 # We need to check if the token is a subword or not and remove the prefix.
358- if is_subword and subword_prefix :
358+ if is_subword :
359359 return token .removeprefix (subword_prefix )
360360 # If the token is not a subword, remove the word prefix and prepend the metaspace marker (▁).
361- if word_prefix :
362- token = token .removeprefix (word_prefix )
363- return f"▁{ token } "
361+ return f"▁{ token .removeprefix (word_prefix )} "
364362
365363
366364def turn_tokens_into_ids (tokens : list [Token ], tokenizer : PreTrainedTokenizerFast , unk_token : str ) -> list [list [int ]]:
0 commit comments