@@ -5,7 +5,7 @@
 from typing import cast
 
 from skeletoken import TokenizerModel
-from skeletoken.addedtoken import AddedToken
+from skeletoken.addedtoken import AddedToken, AddedTokens
 from skeletoken.models import WordPiece
 from skeletoken.pretokenizers import ByteLevelPreTokenizer, PreTokenizerSequence
 from tokenizers import Tokenizer
@@ -50,7 +50,7 @@ def replace_vocabulary(tokenizer: Tokenizer, new_vocabulary: list[Token]) -> Tok
     tokenizer_model.model.vocab.replace_vocabulary(tokens)
 
     new_added_tokens = []
-    for added_token in tokenizer_model.added_tokens:
+    for added_token in tokenizer_model.added_tokens.root:
         if added_token.content not in {tokenizer_model.unk_token, tokenizer_model.pad_token}:
             continue
         new_added_tokens.append(added_token)
@@ -70,7 +70,7 @@ def replace_vocabulary(tokenizer: Tokenizer, new_vocabulary: list[Token]) -> Tok
     )
 
     pre_tokenized_tokens = [x.normalized_form for x in new_vocabulary]
-    tokenizer_model.added_tokens = _remap_added_tokens(new_added_tokens, pre_tokenized_tokens)
+    tokenizer_model.added_tokens = AddedTokens(_remap_added_tokens(new_added_tokens, pre_tokenized_tokens))
     # Set post processor to None because we don't care about it
     tokenizer_model.post_processor = None
     # We need to re-set the pad and unk tokens to put the correct indices.
@@ -166,7 +166,7 @@ def _process_internal_tokens(
     added_tokens_to_keep: set[str] = {
         x for x in (tokenizer_model.pad_token, tokenizer_model.unk_token) if x is not None
     }
-    added_tokens_to_remove = {x.content for x in tokenizer_model.added_tokens} - added_tokens_to_keep
+    added_tokens_to_remove = {x.content for x in tokenizer_model.added_tokens.root} - added_tokens_to_keep
     cleaned_internal_tokens: list[Token] = []
 
     for token in internal_tokens:
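
Across all three hunks the change is the same: `tokenizer_model.added_tokens` is no longer a bare list but an `AddedTokens` wrapper, so reads go through its `.root` attribute and writes wrap the list in the constructor. A minimal sketch of that wrapper pattern, assuming a pydantic-style `RootModel` (the field shapes below are illustrative guesses, not skeletoken's actual definitions):

from pydantic import BaseModel, RootModel

class AddedToken(BaseModel):
    # Assumed minimal shape; the real AddedToken carries more fields.
    content: str
    special: bool = False

class AddedTokens(RootModel[list[AddedToken]]):
    # RootModel stores the wrapped list under `.root`, which is why the
    # diff above iterates `tokenizer_model.added_tokens.root` and assigns
    # `AddedTokens(...)` rather than a plain list.
    pass

added = AddedTokens([AddedToken(content="[PAD]"), AddedToken(content="[UNK]")])
contents = {t.content for t in added.root}  # {'[PAD]', '[UNK]'}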