Commit 6b6bcf1

Modify Ernie Docstring (PaddlePaddle#997)
* modify ernie
* modify modeling
* modify ernie-ctm
* modify tokenizer
* modify ernie-ctm tokenizer
* modify modeling
* modify erniemodel
* modify ernie-ctm
* modify ernie-gen
* modify tokenizer
* modify ernie models
* modify ernie-gen
* fix errors
* modify erniemodel
* modify erniemodel
1 parent 375df59 commit 6b6bcf1

File tree

9 files changed: +897 -716 lines


paddlenlp/transformers/ernie/modeling.py: 129 additions & 179 deletions (large diff not rendered by default)

paddlenlp/transformers/ernie/tokenizer.py: 142 additions & 87 deletions (large diff not rendered by default)

paddlenlp/transformers/ernie_ctm/modeling.py: 200 additions & 65 deletions (large diff not rendered by default)

paddlenlp/transformers/ernie_ctm/tokenizer.py: 85 additions & 33 deletions
@@ -28,37 +28,49 @@
 
 class ErnieCtmTokenizer(PretrainedTokenizer):
     r"""
-    Construct a ERNIE-CTM tokenizer. It uses a basic tokenizer to do punctuation
-    splitting, lower casing and so on, and follows a WordPiece tokenizer to
-    tokenize as subwords.
+    Construct an ERNIE-CTM tokenizer.
 
     Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, optional):
+        vocab_file (str):
+            File path of the vocabulary.
+        do_lower_case (bool, optional):
             Whether or not to lowercase the input when tokenizing. Defaults to `True`
-        do_basic_tokenize (`bool`, optional):
+        do_basic_tokenize (bool, optional):
             Whether or not to do basic tokenization before WordPiece. Defaults to `True`
-        unk_token (`str`, optional):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead. Defaults to `"[UNK]"`
-        sep_token (`str`, optional):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens. Defaults to `"[SEP]"`
-        pad_token (`str`, optional):
-            The token used for padding, for example when batching sequences of different lengths. Defaults to `"[PAD]"`
-        cls_token_template (`str`, optional)
-            The template of summary token for multiple summary placeholders. Defauts to `"[CLS{}]"`
-        cls_num (`int`, optional):
+        unk_token (str, optional):
+            A special token representing the *unknown (out-of-vocabulary)* token.
+            An unknown token is set to be `unk_token` in order to be converted to an ID.
+            Defaults to "[UNK]".
+        sep_token (str, optional):
+            A special token separating two different sentences in the same input.
+            Defaults to "[SEP]".
+        pad_token (str, optional):
+            A special token used to make arrays of tokens the same size for batching purposes.
+            Defaults to "[PAD]".
+        cls_token_template (str, optional):
+            The template of the summary token for multiple summary placeholders. Defaults to `"[CLS{}]"`.
+        cls_num (int, optional):
             Summary placeholder used in ernie-ctm model. For catching a sentence global feature from multiple aware.
-            Defaults to 1
-        mask_token (`str`, optional):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict. Defaults to `"[MASK]"`
-        strip_accents: (`bool`, optional):
+            Defaults to `1`.
+        mask_token (str, optional):
+            A special token representing a masked token. This is the token used in the masked
+            language modeling task, which the model will try to predict.
+            Defaults to `"[MASK]"`.
+        strip_accents (bool, optional):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
             value for `lowercase` (as in the original BERT).
+
+    Examples:
+        .. code-block::
+
+            from paddlenlp.transformers import ErnieCtmTokenizer
+            tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm')
+
+            encoded_inputs = tokenizer('He was a puppeteer')
+            # encoded_inputs:
+            # {'input_ids': [101, 98, 153, 150, 99, 168, 146, 164, 99, 146, 99, 161, 166, 161,
+            #                161, 150, 165, 150, 150, 163, 102],
+            #  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
     """
     resource_files_names = {"vocab_file": "vocab.txt"}  # for save_pretrained
     pretrained_resource_files_map = {
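
The cls_token_template and cls_num arguments drive the multi-summary-token layout: the template is formatted with one index per placeholder (a later hunk shows the actual `self.cls_token_template.format(sid)` call). A standalone sketch of that expansion, assuming only the behavior the docstring describes:

    # Standalone sketch (not PaddleNLP code): how cls_token_template and
    # cls_num expand into the indexed summary tokens [CLS0], [CLS1], ...
    cls_token_template = "[CLS{}]"
    cls_num = 2
    cls_tokens = [cls_token_template.format(i) for i in range(cls_num)]
    print(cls_tokens)  # ['[CLS0]', '[CLS1]']
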
@@ -104,28 +116,58 @@ def __init__(self,
 
     @property
     def vocab_size(self):
+        """
+        Return the size of vocabulary.
+
+        Returns:
+            int: The size of vocabulary.
+        """
         return len(self.vocab)
 
     def convert_tokens_to_string(self, tokens):
-        # Converts a sequence of tokens (strings for sub-words) in a single string.
+        r"""
+        Converts a sequence of tokens (list of string) into a single string. Since
+        WordPiece introduces `##` to mark subwords, `##` is also removed when
+        converting.
+
+        Args:
+            tokens (List[str]): A list of string representing tokens to be converted.
+
+        Returns:
+            str: Converted string from tokens.
+
+        Examples:
+            .. code-block::
+
+                from paddlenlp.transformers import ErnieCtmTokenizer
+                tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm')
+
+                tokens = tokenizer.tokenize('He was a puppeteer')
+                strings = tokenizer.convert_tokens_to_string(tokens)
+                # he was a puppeteer
+        """
         out_string = " ".join(tokens).replace(" ##", "").strip()
         return out_string
 
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """Build model inputs from a sequence or a pair of sequences for sequence classification tasks by
-        concatenating and add special tokens. A ERNIE-CTM sequence has the following format:
+        """
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by
+        concatenating and adding special tokens.
 
-        - single sequence: [CLS0][CLS1]... X [SEP]
-        - pair of sequences: [CLS0][CLS1]... X [SEP] X [SEP]
+        An ERNIE-CTM sequence has the following format:
+
+        - single sequence: [CLS0][CLS1]... X [SEP]
+        - pair of sequences: [CLS0][CLS1]... X [SEP] X [SEP]
 
         Args:
-            token_ids_0 (`List`):
+            token_ids_0 (List):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (`List`, optional):
-                second list of IDs for sequence pairs. Defaults to ``None``.
+            token_ids_1 (List, optional):
+                Optional second list of IDs for sequence pairs. Defaults to ``None``.
 
         Returns:
-            List: The input IDs with the appropriate special tokens.
+            List[int]: The input IDs with the appropriate special tokens.
         """
         cls_token_ids = [
             self.convert_tokens_to_ids(self.cls_token_template.format(sid))
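
To make the [CLS0][CLS1]... X [SEP] layout concrete, here is a hedged sketch of what build_inputs_with_special_tokens produces; the function name is hypothetical and the IDs 101 and 102 are placeholders for illustration, not values from the ERNIE-CTM vocab:

    # Sketch of the documented layout; real IDs come from the tokenizer's
    # vocab, and 101/102 here are assumed placeholder values.
    def build_inputs_sketch(token_ids_0, token_ids_1=None,
                            cls_token_ids=(101,), sep_token_id=102):
        # single sequence:   [CLS0][CLS1]... X [SEP]
        # pair of sequences: [CLS0][CLS1]... X [SEP] X [SEP]
        out = list(cls_token_ids) + token_ids_0 + [sep_token_id]
        if token_ids_1 is not None:
            out = out + token_ids_1 + [sep_token_id]
        return out

    print(build_inputs_sketch([7, 8, 9]))       # [101, 7, 8, 9, 102]
    print(build_inputs_sketch([7, 8], [5, 6]))  # [101, 7, 8, 102, 5, 6, 102]
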
@@ -178,6 +220,7 @@ def create_token_type_ids_from_sequences(self,
                                              token_ids_1=None):
         """
         Creates a token_type mask from the input sequences.
+
         If `token_ids_1` is not `None`, then a sequence pair
         token_type mask has the following format:
 
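
The format lines themselves fall outside this hunk. Assuming the usual BERT-style convention (segment 0 for the first sequence and its special tokens, segment 1 for the second sequence and its trailing [SEP]), the mask would look like this sketch:

    # Assumed BERT-style token_type mask; illustration only, since the
    # exact format lines are not shown in this hunk.
    def token_type_ids_sketch(num_ids_0, num_ids_1=None, cls_num=1):
        first = [0] * (cls_num + num_ids_0 + 1)  # [CLS...] X [SEP]
        if num_ids_1 is None:
            return first
        return first + [1] * (num_ids_1 + 1)     # X [SEP]

    print(token_type_ids_sketch(3))     # [0, 0, 0, 0, 0]
    print(token_type_ids_sketch(3, 2))  # [0, 0, 0, 0, 0, 1, 1, 1]
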
@@ -260,7 +303,16 @@ def tokenize(self, text, **kwargs):
         Args:
             text (str):
                 The text to be tokenized.
+
         Returns:
             List(str): A list of string representing converted tokens.
+
+        Examples:
+            .. code-block::
+
+                from paddlenlp.transformers import ErnieCtmTokenizer
+                tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm')
+                tokens = tokenizer.tokenize('He was a puppeteer')
+
         """
         return self._tokenize(text, **kwargs)
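
Putting the documented methods together, a short usage sketch grounded in the docstring examples above (the 'ernie-ctm' checkpoint name comes from the diff; exact outputs depend on the released vocab):

    from paddlenlp.transformers import ErnieCtmTokenizer

    tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm')
    tokens = tokenizer.tokenize('He was a puppeteer')
    text = tokenizer.convert_tokens_to_string(tokens)
    print(text)  # he was a puppeteer, per the docstring example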
