
class ErnieCtmTokenizer(PretrainedTokenizer):
    r"""
-    Construct a ERNIE-CTM tokenizer. It uses a basic tokenizer to do punctuation
-    splitting, lower casing and so on, and follows a WordPiece tokenizer to
-    tokenize as subwords.
+    Construct an ERNIE-CTM tokenizer.

    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, optional):
+        vocab_file (str):
+            The file path of the vocabulary.
+        do_lower_case (bool, optional):
            Whether or not to lowercase the input when tokenizing. Defaults to `True`.
-        do_basic_tokenize (`bool`, optional):
+        do_basic_tokenize (bool, optional):
            Whether or not to do basic tokenization before WordPiece. Defaults to `True`.
|
42 |
| - unk_token (`str`, optional): |
43 |
| - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this |
44 |
| - token instead. Defaults to `"[UNK]"` |
45 |
| - sep_token (`str`, optional): |
46 |
| - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for |
47 |
| - sequence classification or for a text and a question for question answering. It is also used as the last |
48 |
| - token of a sequence built with special tokens. Defaults to `"[SEP]"` |
49 |
| - pad_token (`str`, optional): |
50 |
| - The token used for padding, for example when batching sequences of different lengths. Defaults to `"[PAD]"` |
51 |
| - cls_token_template (`str`, optional) |
52 |
| - The template of summary token for multiple summary placeholders. Defauts to `"[CLS{}]"` |
53 |
| - cls_num (`int`, optional): |
| 40 | + unk_token (str, optional): |
| 41 | + A special token representing the *unknown (out-of-vocabulary)* token. |
| 42 | + An unknown token is set to be `unk_token` inorder to be converted to an ID. |
| 43 | + Defaults to "[UNK]". |
| 44 | + sep_token (str, optional): |
| 45 | + A special token separating two different sentences in the same input. |
| 46 | + Defaults to "[SEP]". |
| 47 | + pad_token (str, optional): |
| 48 | + A special token used to make arrays of tokens the same size for batching purposes. |
| 49 | + Defaults to "[PAD]". |
| 50 | + cls_token_template (str, optional) |
| 51 | + The template of summary token for multiple summary placeholders. Defaults to `"[CLS{}]"` |
| 52 | + cls_num (int, optional): |
54 | 53 | Summary placeholder used in ernie-ctm model. For catching a sentence global feature from multiple aware.
|
55 |
| - Defaults to 1 |
56 |
| - mask_token (`str`, optional): |
57 |
| - The token used for masking values. This is the token used when training this model with masked language |
58 |
| - modeling. This is the token which the model will try to predict. Defaults to `"[MASK]"` |
59 |
| - strip_accents: (`bool`, optional): |
| 54 | + Defaults to `1`. |
| 55 | + mask_token (str, optional): |
| 56 | + A special token representing a masked token. This is the token used in the masked |
| 57 | + language modeling task. This is the token which the model will try to predict the original unmasked ones. |
| 58 | + Defaults to `"[MASK]"`. |
| 59 | + strip_accents: (bool, optional): |
60 | 60 | Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
61 | 61 | value for `lowercase` (as in the original BERT).
+
+    Examples:
+        .. code-block::
+
+            from paddlenlp.transformers import ErnieCtmTokenizer
+            tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm')
+
+            encoded_inputs = tokenizer('He was a puppeteer')
+            # encoded_inputs:
+            # {'input_ids': [101, 98, 153, 150, 99, 168, 146, 164, 99, 146, 99, 161, 166, 161,
+            #                161, 150, 165, 150, 150, 163, 102],
+            #  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
    """
    resource_files_names = {"vocab_file": "vocab.txt"}  # for save_pretrained
    pretrained_resource_files_map = {
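
The `cls_token_template` and `cls_num` arguments above work together: the
template is formatted with each placeholder index to produce the summary
tokens prepended to every sequence. A minimal sketch of that expansion
(plain Python, with values chosen for illustration):

    cls_token_template = "[CLS{}]"
    cls_num = 2
    # Format the template once per placeholder index.
    cls_tokens = [cls_token_template.format(sid) for sid in range(cls_num)]
    print(cls_tokens)  # ['[CLS0]', '[CLS1]']
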
@@ -104,28 +116,58 @@ def __init__(self,

    @property
    def vocab_size(self):
+        """
+        Return the size of the vocabulary.
+
+        Returns:
+            int: The size of the vocabulary.
+        """
        return len(self.vocab)

    def convert_tokens_to_string(self, tokens):
-        # Converts a sequence of tokens (strings for sub-words) in a single string.
+        r"""
+        Converts a sequence of tokens (a list of strings) into a single string. Since
+        WordPiece introduces `##` to mark subwords, the `##` markers are also removed
+        when converting.
+
+        Args:
+            tokens (List[str]): A list of strings representing the tokens to be converted.
+
+        Returns:
+            str: The string converted from the tokens.
+
+        Examples:
+            .. code-block::
+
+                from paddlenlp.transformers import ErnieCtmTokenizer
+                tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm')
+
+                tokens = tokenizer.tokenize('He was a puppeteer')
+                strings = tokenizer.convert_tokens_to_string(tokens)
+                # he was a puppeteer
+
+        """
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string
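
A pure-Python illustration of the join-and-strip logic above; the token list
is a hypothetical WordPiece output, not taken from the real vocabulary:

    # " ##" marks a subword continuation, so it is removed when joining.
    tokens = ["he", "was", "a", "puppet", "##eer"]
    out_string = " ".join(tokens).replace(" ##", "").strip()
    print(out_string)  # he was a puppeteer
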

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """Build model inputs from a sequence or a pair of sequences for sequence classification tasks by
-        concatenating and add special tokens. A ERNIE-CTM sequence has the following format:
+        """
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by
+        concatenating and adding special tokens.

-        - single sequence: [CLS0][CLS1]... X [SEP]
-        - pair of sequences: [CLS0][CLS1]... X [SEP] X [SEP]
+        An ERNIE-CTM sequence has the following format:
+
+        - single sequence: [CLS0][CLS1]... X [SEP]
+        - pair of sequences: [CLS0][CLS1]... X [SEP] X [SEP]

        Args:
-            token_ids_0 (`List`):
+            token_ids_0 (List[int]):
                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List`, optional):
-                second list of IDs for sequence pairs. Defaults to ``None``.
+            token_ids_1 (List[int], optional):
+                Optional second list of IDs for sequence pairs. Defaults to ``None``.

        Returns:
-            List: The input IDs with the appropriate special tokens.
+            List[int]: The input IDs with the appropriate special tokens.
        """
        cls_token_ids = [
            self.convert_tokens_to_ids(self.cls_token_template.format(sid))
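
A minimal sketch of the concatenation this method performs, assuming the
summary-token and separator IDs have already been resolved (the names here
are illustrative, not part of the class):

    def build_inputs_sketch(cls_token_ids, sep_id, token_ids_0, token_ids_1=None):
        # single sequence:  [CLS0][CLS1]... X [SEP]
        out = cls_token_ids + token_ids_0 + [sep_id]
        if token_ids_1 is not None:
            # pair of sequences:  [CLS0][CLS1]... X [SEP] X [SEP]
            out += token_ids_1 + [sep_id]
        return out
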
@@ -178,6 +220,7 @@ def create_token_type_ids_from_sequences(self,
                                                 token_ids_1=None):
        """
        Creates a token_type mask from the input sequences.
+
        If `token_ids_1` is not `None`, then a sequence pair
        token_type mask has the following format:

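
A sketch of the pair-input mask under the conventional BERT-style layout,
which is an assumption here (0 for the first sequence and its special
tokens, 1 for the second), with hypothetical lengths:

    cls_num, len_0, len_1 = 1, 3, 2
    # [CLS0] A A A [SEP] -> segment 0; B B [SEP] -> segment 1
    token_type_ids = [0] * (cls_num + len_0 + 1) + [1] * (len_1 + 1)
    print(token_type_ids)  # [0, 0, 0, 0, 0, 1, 1, 1]
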
@@ -260,7 +303,16 @@ def tokenize(self, text, **kwargs):
        Args:
            text (str):
                The text to be tokenized.
+
        Returns:
            List[str]: A list of strings representing the converted tokens.
+
+        Examples:
+            .. code-block::
+
+                from paddlenlp.transformers import ErnieCtmTokenizer
+                tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm')
+                tokens = tokenizer.tokenize('He was a puppeteer')
+
        """
        return self._tokenize(text, **kwargs)
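
Putting the pieces together, a short round trip through the methods shown in
this diff (the exact tokens and ids depend on the pretrained vocabulary):

    from paddlenlp.transformers import ErnieCtmTokenizer

    tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm')
    tokens = tokenizer.tokenize('He was a puppeteer')
    ids = tokenizer.convert_tokens_to_ids(tokens)      # tokens -> vocab ids
    text = tokenizer.convert_tokens_to_string(tokens)  # tokens -> plain text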