 
 """Tokenization classes for InternLM."""
 import os
+import re
 from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple
 
 import sentencepiece as spm
 from paddlenlp.transformers.tokenizer_utils import PretrainedTokenizer
+from paddlenlp.transformers.tokenizer_utils_base import AddedToken, TextInput
+
 from paddlemix.utils.log import logger
 
-VOCAB_FILES_NAMES = {'vocab_file': './tokenizer.model'}
+VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
 # VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
 
 # Modified from transformers.model.llama.tokenization_llama.LlamaTokenizer
@@ -38,18 +41,18 @@ class InternLM2Tokenizer(PretrainedTokenizer):
             Path to the vocabulary file.
     """
 
-    resource_files_names = VOCAB_FILES_NAMES # vocab_files_names in torch
-    pretrained_resource_files_map = {} # pretrained_vocab_files_map in torch
-    model_input_names = ['input_ids', 'attention_mask']
-    _auto_class = 'AutoTokenizer'
+    resource_files_names = VOCAB_FILES_NAMES  # vocab_files_names in torch
+    pretrained_resource_files_map = {}  # pretrained_vocab_files_map in torch
+    model_input_names = ["input_ids", "attention_mask"]
+    _auto_class = "AutoTokenizer"
 
     def __init__(
         self,
         vocab_file,
-        unk_token='<unk>',
-        bos_token='<s>',
-        eos_token='</s>',
-        pad_token='</s>',
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token="</s>",
         sp_model_kwargs: Optional[Dict[str, Any]] = None,
         add_bos_token=True,
         add_eos_token=False,
@@ -78,7 +81,7 @@ def __init__(
     def no_prefix_space_tokens(self):
         if self._no_prefix_space_tokens is None:
             vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
-            self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith('▁')}
+            self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
         return self._no_prefix_space_tokens
 
     @property
@@ -115,20 +118,20 @@ def _convert_id_to_token(self, index):
 
     def _maybe_add_prefix_space(self, tokens, decoded):
         if tokens and tokens[0] not in self.no_prefix_space_tokens:
-            return ' ' + decoded
+            return " " + decoded
         else:
             return decoded
 
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (string) in a single string."""
         current_sub_tokens = []
-        out_string = ''
+        out_string = ""
         prev_is_special = False
         for token in tokens:
             # make sure that special tokens are not decoded using sentencepiece model
             if token in self.all_special_tokens:
                 if not prev_is_special:
-                    out_string += ' '
+                    out_string += " "
                 out_string += self.sp_model.decode(current_sub_tokens) + token
                 prev_is_special = True
                 current_sub_tokens = []
@@ -152,16 +155,16 @@ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None)
             `Tuple(str)`: Paths to the files saved.
         """
         if not os.path.isdir(save_directory):
-            logger.error(f'Vocabulary path ({save_directory}) should be a directory')
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
             return
         out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file']
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
         )
 
         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
         elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, 'wb') as fi:
+            with open(out_vocab_file, "wb") as fi:
                 content_spiece_model = self.sp_model.serialized_model_proto()
                 fi.write(content_spiece_model)
 
@@ -231,3 +234,77 @@ def create_token_type_ids_from_sequences(
         if token_ids_1 is None:
             return len(token_ids_0 + eos) * [0]
         return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
+
+    def tokenize(self, text: TextInput, **kwargs) -> List[str]:
+        """
+        Converts a string into a sequence of tokens, using the tokenizer.
+
+        Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
+        (BPE/SentencePieces/WordPieces). Takes care of added tokens.
+
+        Args:
+            text (`str`):
+                The sequence to be encoded.
+            **kwargs (additional keyword arguments):
+                Passed along to the model-specific `prepare_for_tokenization` preprocessing method.
+
+        Returns:
+            `List[str]`: The list of tokens.
+        """
+        split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens)
+
+        text, kwargs = self.prepare_for_tokenization(text, **kwargs)
+
+        # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
+        all_special_tokens_extended = dict(
+            (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
+        )
+
+        if hasattr(self, "do_lower_case") and self.do_lower_case:
+            # convert non-special tokens to lowercase. Might be super slow as well?
+            escaped_special_toks = [
+                re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
+            ]
+            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
+            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
+
+        if split_special_tokens:
+            no_split_token = []
+            tokens = [text]
+        else:
+            no_split_token = set(self.unique_no_split_tokens)  # don't split on any of the added tokens
+            # "This is something<special_token_1> else"
+            tokens = self.tokens_trie.split(text)
+
+        # ["This is something", "<special_token_1>", " else"]
+        for i, token in enumerate(tokens):
+            if token in no_split_token:
+                tok_extended = all_special_tokens_extended.get(token, None)
+                left = tokens[i - 1] if i > 0 else None
+                right = tokens[i + 1] if i < len(tokens) - 1 else None
+                if isinstance(tok_extended, AddedToken):
+                    if tok_extended.rstrip and right:
+                        # A bit counter-intuitive but we strip the left of the string
+                        # since tok_extended.rstrip means the special token is eating all white spaces on its right
+                        tokens[i + 1] = right.lstrip()
+                    # Strip white spaces on the left
+                    if tok_extended.lstrip and left:
+                        tokens[i - 1] = left.rstrip()  # Opposite here
+                    if tok_extended.single_word and left and left[-1] != " ":
+                        tokens[i - 1] += token
+                        tokens[i] = ""
+                    elif tok_extended.single_word and right and right[0] != " ":
+                        tokens[i + 1] = token + tokens[i + 1]
+                        tokens[i] = ""
+        # ["This is something", "<special_token_1>", "else"]
+        tokenized_text = []
+        for token in tokens:
+            # Need to skip eventual empty (fully stripped) tokens
+            if not token:
+                continue
+            if token in no_split_token:
+                tokenized_text.append(token)
+            else:
+                tokenized_text.extend(self._tokenize(token))
+        # ["This", " is", " something", "<special_token_1>", "else"]
+        return tokenized_text
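
A minimal usage sketch for the new `tokenize` override (not part of the diff; the checkpoint path is an illustrative placeholder and the exact pieces depend on the loaded vocabulary):

    # Assumes a local InternLM2 checkpoint directory containing tokenizer.model.
    tokenizer = InternLM2Tokenizer.from_pretrained("path/to/internlm2-checkpoint")

    # Special tokens survive the trie split intact; ordinary text is segmented
    # by the underlying SentencePiece model.
    print(tokenizer.tokenize("Hello world</s>"))
    # e.g. ['▁Hello', '▁world', '</s>']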