@@ -2,14 +2,16 @@
 
 import json
 import logging
+from string import punctuation
 from typing import Any
 
-from tokenizers import Tokenizer
+from tokenizers import Regex, Tokenizer
+from tokenizers.normalizers import Lowercase, Normalizer, Replace, Strip
+from tokenizers.normalizers import Sequence as NormalizerSequence
 from tokenizers.pre_tokenizers import (
     BertPreTokenizer,
     ByteLevel,
     CharDelimiterSplit,
-    Digits,
     Metaspace,
     PreTokenizer,
     Punctuation,
@@ -45,7 +47,7 @@
 }
 
 
-def _pre_tokenize_vocabulary(tokenizer: Tokenizer, tokens: list[Token]) -> list[str]:
+def _pre_tokenize_vocabulary(tokenizer: Tokenizer, tokens: list[Token], subword_prefix: str) -> list[str]:
     """
     Apply pre-tokenization to vocabulary tokens if a pre-tokenizer is present.
 
@@ -54,19 +56,28 @@ def _pre_tokenize_vocabulary(tokenizer: Tokenizer, tokens: list[Token]) -> list[
 
     :param tokenizer: The tokenizer to use.
     :param tokens: The tokens to pre-tokenize.
+    :param subword_prefix: The prefix for subwords.
     :return: The pre-tokenized tokens.
     """
     pre_tokenized_tokens = []
 
     if tokenizer.pre_tokenizer is not None:
         for token in tokens:
-            if token.is_original:
+            if token.is_subword:
                 # Original tokens do not need to be pre-tokenized.
-                pre_tokenized_tokens.append(token.form)
-            else:
+                form = token.form
+                if subword_prefix is not None:
+                    form = token.form.removeprefix(subword_prefix)
+                pre_tokenized_tokens.append(form)
+            elif token.should_be_pretokenized:
                 # Join tokens just to be sure.
+                token.form = tokenizer.normalizer.normalize_str(token.form).rstrip()
                 pretokenized_tokens, _ = zip(*tokenizer.pre_tokenizer.pre_tokenize_str(token.form))
-                pre_tokenized_tokens.append(" ".join(pretokenized_tokens))
+                form = " ".join(pretokenized_tokens)
+                pre_tokenized_tokens.append(form)
+            else:
+                token.form = tokenizer.normalizer.normalize_str(token.form).rstrip()
+                pre_tokenized_tokens.append(token.form)
     else:
         pre_tokenized_tokens = [token.form for token in tokens]
 
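A minimal sketch (not part of this diff) of what the pre-tokenization branch above produces for a multiword vocabulary entry, assuming a BERT-style whitespace/punctuation pre-tokenizer; the example string is made up:

from tokenizers.pre_tokenizers import BertPreTokenizer

pre = BertPreTokenizer()
pieces, _ = zip(*pre.pre_tokenize_str("new york"))
print(" ".join(pieces))  # "new york": the pieces are re-joined with spaces before entering the vocabulary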
@@ -95,12 +106,38 @@ def _remap_added_tokens(
     return special_tokens
 
 
+def _prepare_normalizer(
+    normalizer: Normalizer | None,
+) -> Normalizer:
+    """
+    Prepare the normalizer for the tokenizer.
+
+    This function appends normalizers that surround punctuation with spaces, collapse runs of
+    whitespace, and strip surrounding whitespace. If a normalizer is already set, it is kept
+    and the new steps run after it.
+
+    :param normalizer: The normalizer to prepare, or None if the tokenizer has none.
+    :return: The prepared normalizer.
+    """
+    new_normalizers = []
+    for char in punctuation:
+        new_normalizers.append(Replace(char, f" {char} "))
+    new_normalizers.append(Replace(Regex(r"\s+"), " "))
+    new_normalizers.append(Strip(right=True))
+    if normalizer is None:
+        return NormalizerSequence(new_normalizers)
+
+    return NormalizerSequence([normalizer] + new_normalizers)
+
+
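An illustrative sketch (not from the commit) of what the normalizer built by _prepare_normalizer does to a raw string; the input text is made up:

from string import punctuation
from tokenizers import Regex
from tokenizers.normalizers import Replace, Sequence, Strip

norm = Sequence(
    [Replace(char, f" {char} ") for char in punctuation]
    + [Replace(Regex(r"\s+"), " "), Strip(right=True)]
)
# Punctuation is padded with spaces and runs of whitespace collapse to a single space,
# e.g. "Hello, world!" comes out roughly as "Hello , world !".
print(norm.normalize_str("Hello, world!"))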
 def _fix_single_pretokenizer(pretokenizer: PreTokenizer) -> PreTokenizer | None:
     """Fixes a single pretokenizer to allow multiword units."""
+    if isinstance(pretokenizer, Metaspace):
+        return Metaspace(split=False, replacement=pretokenizer.replacement, prepend_scheme=pretokenizer.prepend_scheme)
     if isinstance(pretokenizer, _FORBIDDEN_PRETOKENIZERS):
-        return Metaspace(split=False, replacement="Ġ")
+        return Metaspace(split=False, replacement="▁")
     elif isinstance(pretokenizer, ByteLevel):
         pretokenizer.use_regex = False
+        pretokenizer.add_prefix_space = True
 
     return pretokenizer
 
@@ -111,68 +148,29 @@ def _fix_pretokenizer_for_super(pre: PreTokenizer | None) -> Tokenizer:
         return pre
 
     if isinstance(pre, Sequence):
-        new_pretokenizers = []
-        for pretokenizer in pre:
-            new_pretokenizers.append(_fix_single_pretokenizer(pretokenizer))
-        return Sequence(new_pretokenizers)
+        return Metaspace(split=False)
 
     return _fix_single_pretokenizer(pre)
 
 
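A hedged sketch of why the Sequence case above now collapses to Metaspace(split=False): with split=False the pre-tokenizer only maps spaces to the replacement character instead of splitting on them, so multiword units survive as a single piece. This assumes a tokenizers release where Metaspace exposes the split argument, as the diff itself does:

from tokenizers.pre_tokenizers import Metaspace

print(Metaspace(split=True).pre_tokenize_str("new york"))   # roughly two pieces: "▁new", "▁york"
print(Metaspace(split=False).pre_tokenize_str("new york"))  # roughly one piece: "▁new▁york"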
-def _make_new_merges_from_vocab(
-    merges: list[tuple[str, str]], tokens: list[str], special_tokens: set[str | None]
-) -> list[tuple[str, str]]:
-    """
-    Generate new merges from a vocabulary.
-
-    This function creates new merge pairs from a given vocabulary of tokens.
-    The merges are used to build or extend a tokenizer's merge table.
-
-    :param merges: The list of existing merges in the form (first, second) where first and second are tokens.
-    :param tokens: The list of tokens (vocabulary) from which to generate new merges.
-    :param special_tokens: Tokens that should not be merged.
-    :return: The list of new merges in the form (first, second) where first and second are tokens.
-    """
-    new_merges = merges.copy()
-    current_vocab = set(tokens) - special_tokens
-    already_merged = set("".join(merge) for merge in merges)
-
-    for token in tokens:
-        if token in special_tokens:
-            continue
-        if token in already_merged:
-            continue
-        if len(token) == 1:
-            continue
-        merges = []
-        for index in range(1, len(token)):
-            first, second = token[:index], token[index:]
-            if first in current_vocab and second in current_vocab:
-                merges.append((first, second))
-        if not merges:
-            logger.warning(f"Token {token} has no merges.")
-            continue
-        new_merges.extend(merges)
-
-    return new_merges
-
-
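For context on the helper deleted above: it enumerated every two-way split of a vocabulary token into halves that were both already in the vocabulary and added those pairs as BPE merges. A rough standalone sketch of that idea, with a made-up toy vocabulary:

vocab = {"play", "ing", "playing"}
token = "playing"
new_merges = [
    (token[:i], token[i:])
    for i in range(1, len(token))
    if token[:i] in vocab and token[i:] in vocab
]
print(new_merges)  # [('play', 'ing')]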
 def _process_wordpiece(
     tokenizer_json: dict[str, Any], pre_tokenized_tokens: list[str], unk_token: str | None
 ) -> dict[str, Any]:
     """Process the WordPiece tokenizer JSON."""
-    tokenizer_json["model"]["unk_token"] = unk_token
-    tokenizer_json["model"]["vocab"] = {token: idx for idx, token in enumerate(pre_tokenized_tokens)}
+    tokenizer_json["model"]["type"] = "Unigram"
+    tokenizer_json["model"]["unk_id"] = pre_tokenized_tokens.index(unk_token) if unk_token else None
+    tokenizer_json["model"]["vocab"] = [(token, 0.0) for token in pre_tokenized_tokens]
 
     return tokenizer_json
 
 
-def _process_bpe(tokenizer_json: dict[str, Any], pre_tokenized_tokens: list[str]) -> dict[str, Any]:
+def _process_bpe(
+    tokenizer_json: dict[str, Any], pre_tokenized_tokens: list[str], unk_token: str | None
+) -> dict[str, Any]:
     """Process the BPE tokenizer JSON."""
-    tokenizer_json = _process_wordpiece(tokenizer_json, pre_tokenized_tokens, None)
-    merges = tokenizer_json["model"]["merges"]
-    merges = _make_new_merges_from_vocab(merges, pre_tokenized_tokens, {"[UNK]", "[PAD]"})
-    tokenizer_json["model"]["merges"] = merges
+    tokenizer_json["model"]["type"] = "Unigram"
+    tokenizer_json["model"]["unk_id"] = pre_tokenized_tokens.index(unk_token) if unk_token else None
+    tokenizer_json["model"]["vocab"] = [(token, 0.0) for token in pre_tokenized_tokens]
 
     return tokenizer_json
 
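Both _process_wordpiece and _process_bpe above now rewrite the model section into the same Unigram layout: the vocab becomes a list of (token, score) pairs with a flat score of 0.0, and unk_id points at the unknown token. A small sketch of the resulting structure, using placeholder tokens:

pre_tokenized_tokens = ["[UNK]", "[PAD]", "hello", "new york"]
model = {
    "type": "Unigram",
    "unk_id": pre_tokenized_tokens.index("[UNK]"),  # 0
    "vocab": [(token, 0.0) for token in pre_tokenized_tokens],
}
print(model["vocab"][:2])  # [('[UNK]', 0.0), ('[PAD]', 0.0)]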
@@ -194,13 +192,16 @@ def replace_vocabulary(
     tokenizer: Tokenizer, new_vocabulary: list[Token], unk_token: str | None, pad_token: str | None
 ) -> Tokenizer:
     """Replace the vocabulary of a tokenizer with a new one."""
+    tokenizer.normalizer = _prepare_normalizer(tokenizer.normalizer)
     tokenizer.pre_tokenizer = _fix_pretokenizer_for_super(tokenizer.pre_tokenizer)
     tokenizer_json: dict[str, Any] = json.loads(tokenizer.to_str())
 
     # NOTE: all tokens have been normalized before.
     # Very careful, we need to pretokenize words before adding them to the vocabulary.
     # But only if they are not part of the original vocabulary.
-    pre_tokenized_tokens = _pre_tokenize_vocabulary(tokenizer, new_vocabulary)
+    subword_prefix = tokenizer_json["model"].get("continuing_subword_prefix", "")
+
+    pre_tokenized_tokens = _pre_tokenize_vocabulary(tokenizer, new_vocabulary, subword_prefix=subword_prefix)
 
     model_type = tokenizer_json["model"]["type"]
     added_tokens: list[dict[str, Any]] = tokenizer_json["added_tokens"]
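A small sketch of where subword_prefix comes from: WordPiece models serialize a "continuing_subword_prefix" field ("##" in BERT-style vocabularies), and _pre_tokenize_vocabulary strips that prefix from subword tokens instead of pre-tokenizing them. The model dict below is a trimmed stand-in for the real tokenizer JSON:

model = {"type": "WordPiece", "continuing_subword_prefix": "##", "unk_token": "[UNK]"}
subword_prefix = model.get("continuing_subword_prefix", "")
print("##ing".removeprefix(subword_prefix))  # "ing"
print("hello".removeprefix(subword_prefix))  # unchanged: "hello"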
@@ -215,7 +216,7 @@
     if model_type == "WordPiece":
         tokenizer_json = _process_wordpiece(tokenizer_json, pre_tokenized_tokens, "[UNK]")
     elif model_type == "BPE":
-        tokenizer_json = _process_bpe(tokenizer_json, pre_tokenized_tokens)
+        tokenizer_json = _process_bpe(tokenizer_json, pre_tokenized_tokens, "[UNK]")
     elif model_type == "Unigram":
         tokenizer_json = _process_unigram(tokenizer_json, pre_tokenized_tokens, "[UNK]")
     else: