
Commit 8df317a

Modify layers Doc (#1140)

* modify transformer-rst
* modify roformer tokenizer
* delete modifications
* modify datasets
* modify layers
* fix errors
* fix errors
* fix errors
1 parent bb70fd1 commit 8df317a

File tree

5 files changed: +96 -76 lines changed


paddlenlp/datasets/dataset.py

Lines changed: 15 additions & 15 deletions
@@ -77,14 +77,14 @@ def load_dataset(path_or_read_func,
         path_or_read_func (str|callable): Name of the dataset processing script
             in PaddleNLP library or a custom data reading function.
         name (str, optional): Additional name to select a more specific dataset.
-            Default to None.
-        data_files (str|list|tuple|dict, optional): Defineing the path of dataset
-            files. If None. `splits` must be specified. Default to None.
+            Defaults to None.
+        data_files (str|list|tuple|dict, optional): Defining the path of dataset
+            files. If None. `splits` must be specified. Defaults to None.
         splits (str|list|tuple, optional): Which split of the data to load. If None.
-            `data_files` must be specified. Default to None.
-        lazy (bool, optional): Wheather to return `MapDataset` or an `IterDataset`.
+            `data_files` must be specified. Defaults to None.
+        lazy (bool, optional): Weather to return `MapDataset` or an `IterDataset`.
             True for `IterDataset`. False for `MapDataset`. If None, return the
-            default type of this dataset.
+            default type of this dataset. Defaults to None.
         kwargs (dict): Other keyword arguments to be passed to the `DatasetBuilder`.

     Returns:
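
For context (this example is not part of the commit), a minimal sketch of how these arguments are typically combined; the dataset name and split names are illustrative assumptions:

    from paddlenlp.datasets import load_dataset

    # load a built-in dataset by name and pick the splits to return
    train_ds, dev_ds = load_dataset("chnsenticorp", splits=("train", "dev"))

    # with lazy=False a MapDataset is returned, with lazy=True an IterDataset
    train_iter = load_dataset("chnsenticorp", splits="train", lazy=True)
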
@@ -195,7 +195,7 @@ def filter(self, fn, num_workers=0):
             fn (callable): A filter function that takes a sample as input and
                 returns a boolean. Samples that return False would be discarded.
             num_workers(int, optional): Number of processes for multiprocessing. If
-                set to 0, it doesn't use multiprocessing. Defalt: 0.
+                set to 0, it doesn't use multiprocessing. Defaults to `0`.
         """
         assert num_workers >= 0, "num_workers should be a non-negative value"
         if num_workers > 0:
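
A hypothetical `filter` call, assuming a `MapDataset` named `train_ds` whose examples carry a `label` field (the field name is an assumption for illustration):

    # keep only the examples whose label equals 1; runs in the main process (num_workers=0)
    train_ds = train_ds.filter(lambda example: example["label"] == 1, num_workers=0)
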
@@ -241,14 +241,14 @@ def shard(self, num_shards=None, index=None, contiguous=False):
         Args:
             num_shards (int, optional): An integer representing the number of
                 data shards. If None, `num_shards` would be number of trainers.
-                Default: None
+                Defaults to `None`.
             index (int, optional): An integer representing the index of the
                 current shard. If None, `index` would be the current trainer rank
-                id. Default: None.
+                id. Defaults to `None`.
             contiguous: (bool, optional): If true, contiguous chunks of data
                 will be select for sharding. And total number of examples will
                 be the same. Otherwise each shard will contain all examples of
-                dataset whose index mod `num_shards` = `index`. Default: False.
+                dataset whose index mod `num_shards` = `index`. Defaults to `False`.
         """
         if num_shards is None:
             num_shards = dist.get_world_size()
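
A sharding sketch (again assuming the hypothetical `train_ds`); with `contiguous=True` each worker receives one consecutive block of examples:

    # split into 4 shards and take the first one, e.g. for data-parallel training
    shard_0 = train_ds.shard(num_shards=4, index=0, contiguous=True)
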
@@ -280,13 +280,13 @@ def map(self, fn, lazy=True, batched=False, num_workers=0):
             lazy (bool, optional): If True, transformations would be delayed and
                 performed on demand. Otherwise, transforms all samples at once. Note that
                 if `fn` is stochastic, `lazy` should be True or you will get the same
-                result on all epochs. Defalt: False.
+                result on all epochs. Defaults to False.
             batched(bool, optional): If True, transformations would take all examples as
                 input and return a collection of transformed examples. Note that if set
-                True, `lazy` option would be ignored. Defalt: False.
+                True, `lazy` option would be ignored. Defaults to False.
             num_workers(int, optional): Number of processes for multiprocessing. If
                 set to 0, it doesn't use multiprocessing. Note that if set to positive
-                value, `lazy` option would be ignored. Defalt: 0.
+                value, `lazy` option would be ignored. Defaults to 0.
         """

         assert num_workers >= 0, "num_workers should be a non-negative value"
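
A hedged `map` sketch with a tokenizer-style transform; the tokenizer checkpoint and the `text` field are assumptions, not something this commit prescribes:

    from paddlenlp.transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

    def convert_example(example):
        # tokenize lazily, sample by sample
        return tokenizer(example["text"], max_seq_len=128)

    train_ds = train_ds.map(convert_example, lazy=True)
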
@@ -416,10 +416,10 @@ def shard(self, num_shards=None, index=None):
         Args:
             num_shards (int, optional): An integer representing the number of
                 data shards. If None, `num_shards` would be number of trainers.
-                Default: None
+                Defaults to None.
             index (int, optional): An integer representing the index of the
                 current shard. If None, `index` would be the current trainer rank
-                id. Default: None.
+                id. Defaults to None.
         """
         if num_shards is None:
             num_shards = dist.get_world_size()

paddlenlp/layers/crf.py

Lines changed: 25 additions & 26 deletions
@@ -35,11 +35,11 @@ class LinearChainCrf(nn.Layer):
     See https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers for reference.

     Args:
-        num_labels (`int`):
+        num_labels (int):
             The label number.
-        crf_lr (`float`, optional):
+        crf_lr (float, optional):
             The crf layer learning rate. Defaults to ``0.1``.
-        with_start_stop_tag (`bool`, optional):
+        with_start_stop_tag (bool, optional):
             If set to True, the start tag and stop tag will be considered, the transitions params will be a tensor with a shape of `[num_labels+2, num_labels+2]`.
             Else, the transitions params will be a tensor with a shape of `[num_labels, num_labels]`.
     """
@@ -105,14 +105,13 @@ def forward(self, inputs, lengths):
         Further, We can get F(n) is a recursive formula with F(n-1).

         Args:
-            inputs (`Tensor`):
+            inputs (Tensor):
                 The input predicted tensor. Its dtype is float32 and has a shape of `[batch_size, sequence_length, num_tags]`.
-            lengths (`Tensor`):
+            lengths (Tensor):
                 The input length. Its dtype is int64 and has a shape of `[batch_size]`.

         Returns:
-            norm_score (`Tensor`):
-                The normalizers tensor. Its dtype is float32 and has a shape of `[batch_size]`.
+            Tensor: Returns the normalizers tensor `norm_score`. Its dtype is float32 and has a shape of `[batch_size]`.
         """
         batch_size, seq_len, n_labels = inputs.shape
         inputs_t_exp = inputs.transpose([1, 0, 2]).unsqueeze(-1)
@@ -154,16 +153,15 @@ def gold_score(self, inputs, labels, lengths):
         $$ score(x,y) = \\sum_i Emit(x_i,y_i) + Trans(y_{i-1}, y_i) $$

         Args:
-            inputs (`Tensor`):
+            inputs (Tensor):
                 The input predicted tensor. Its dtype is float32 and has a shape of `[batch_size, sequence_length, num_tags]`.
-            labels (`Tensor`) :
+            labels (Tensor):
                 The input label tensor. Its dtype is int64 and has a shape of `[batch_size, sequence_length]`
-            lengths (`Tensor`):
+            lengths (Tensor):
                 The input length. Its dtype is int64 and has a shape of `[batch_size]`.

         Returns:
-            unnorm_score (`Tensor`):
-                The unnormalized sequence scores tensor. Its dtype is float32 and has a shape of `[batch_size]`.
+            Tensor: Returns the unnormalized sequence scores tensor `unnorm_score`. Its dtype is float32 and has a shape of `[batch_size]`.
         """
         unnorm_score = self._point_score(
             inputs, labels, lengths) + self._trans_score(labels, lengths)
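
A hedged sketch of the two scoring methods documented above, reusing the hypothetical `crf` from the previous snippet and assuming the emission layer already outputs `num_labels + 2` scores when start/stop tags are used:

    import paddle

    batch_size, seq_len, num_tags = 2, 6, 5 + 2
    emissions = paddle.rand([batch_size, seq_len, num_tags])
    lengths = paddle.to_tensor([6, 4], dtype="int64")
    labels = paddle.randint(0, 5, [batch_size, seq_len])      # gold tags in [0, num_labels)

    norm_score = crf(emissions, lengths)                      # logZ(x), shape [batch_size]
    gold_score = crf.gold_score(emissions, labels, lengths)   # score(x, y), shape [batch_size]
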
@@ -268,7 +266,7 @@ class LinearChainCrfLoss(nn.Layer):
     The negative log-likelihood for linear chain Conditional Random Field (CRF).

     Args:
-        crf (`LinearChainCrf`):
+        crf (LinearChainCrf):
             The `LinearChainCrf` network object. Its parameter will be used to calculate the loss.
     """

@@ -286,16 +284,16 @@ def forward(self, inputs, lengths, labels, old_version_labels=None):
         then we have $$ loss = -logp(y|x) = -log(exp(score(x,y))/Z(x)) = -score(x,y) + logZ(x) $$

         Args:
-            inputs (`Tensor`):
+            inputs (Tensor):
                 The input predicted tensor. Its dtype is float32 and has a shape of `[batch_size, sequence_length, num_tags]`.
-            lengths (`Tensor`):
+            lengths (Tensor):
                 The input length. Its dtype is int64 and has a shape of `[batch_size]`.
-            labels (`Tensor`) :
+            labels (Tensor) :
                 The input label tensor. Its dtype is int64 and has a shape of `[batch_size, sequence_length]`
-            old_version_labels (`Tensor`, optional): Unnecessary parameter for compatibility with older versions. Defaults to ``None``.
+            old_version_labels (Tensor, optional): Unnecessary parameter for compatibility with older versions. Defaults to ``None``.

         Returns:
-            loss (`Tensor`): The crf loss. Its dtype is float32 and has a shape of `[batch_size]`.
+            Tensor: The crf loss. Its dtype is float32 and has a shape of `[batch_size]`.
         """
         # Note: When closing to convergence, the loss could be a small negative number. This may caused by underflow when calculating exp in logsumexp.
         # We add relu here to avoid negative loss. In theory, the crf loss must be greater than or equal to 0, relu will not impact on it.
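
Continuing the same hypothetical setup, the loss layer wraps the CRF and realizes loss = -score(x, y) + logZ(x):

    from paddlenlp.layers import LinearChainCrfLoss

    crf_loss = LinearChainCrfLoss(crf)
    loss = crf_loss(emissions, lengths, labels)   # per-example loss, shape [batch_size]
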
@@ -318,9 +316,9 @@ class ViterbiDecoder(nn.Layer):
     ViterbiDecoder can decode the highest scoring sequence of tags, it should only be used at test time.

     Args:
-        transitions (`Tensor`):
+        transitions (Tensor):
             The transition matrix. Its dtype is float32 and has a shape of `[num_tags, num_tags]`.
-        with_start_stop_tag (`bool`, optional):
+        with_start_stop_tag (bool, optional):
             If set to True, the last row and the last column of transitions will be considered as start tag,
             the the penultimate row and the penultimate column of transitions will be considered as stop tag.
             Else, all the rows and columns will be considered as the real tag. Defaults to ``None``.
@@ -363,15 +361,16 @@ def forward(self, inputs, lengths):
         Decode the highest scoring sequence of tags.

         Args:
-            inputs (`Tensor`):
+            inputs (Tensor):
                 The unary emission tensor. Its dtype is float32 and has a shape of `[batch_size, sequence_length, num_tags]`.
-            length (`Tensor`):
+            length (Tensor):
                 The input length tensor storing real length of each sequence for correctness. Its dtype is int64 and has a shape of `[batch_size]`.
+
         Returns:
-            scores(`Tensor`):
-                The scores tensor containing the score for the Viterbi sequence. Its dtype is float32 and has a shape of `[batch_size]`.
-            paths(`Tensor`):
-                The paths tensor containing the highest scoring tag indices. Its dtype is int64 and has a shape of `[batch_size, sequence_length`].
+            tuple: Returns tuple (scores, paths). The `scores` tensor containing the score for the Viterbi sequence.
+                Its dtype is float32 and has a shape of `[batch_size]`.
+                The `paths` tensor containing the highest scoring tag indices.
+                Its dtype is int64 and has a shape of `[batch_size, sequence_length]`.
         """
         input_shape = paddle.shape(inputs)
         batch_size = input_shape[0]
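
And a decoding sketch reusing the hypothetical `crf` and `emissions` from above; the decoder consumes the learned transition matrix:

    from paddlenlp.layers import ViterbiDecoder

    decoder = ViterbiDecoder(crf.transitions, with_start_stop_tag=True)
    scores, paths = decoder(emissions, lengths)
    # scores: float32 [batch_size]; paths: int64 [batch_size, sequence_length]
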

paddlenlp/layers/sequence.py

Lines changed: 4 additions & 4 deletions
@@ -18,14 +18,14 @@ def sequence_mask(seq_ids, valid_lengths):
     To boost the performance, this sequence_mask is different with paddle.fluid.layers.sequence_mask

     Args:
-        seq_ids (`Tensor`):
+        seq_ids (Tensor):
             The whole sequence index, a tensor with a shape of [batch_size, sequence_length].
-        valid_lengths (`Tensor`):
+        valid_lengths (Tensor):
             The valid length of every sequence, a tensor with a shape of [batch_size].

     Returns:
-        mask (`Tensor`):
-            The output sequence mask. Its dtype is ``bool`` and has a shpe of [batch_size, sequence_length].
+        Tensor: Returns the output sequence mask `mask`.
+            Its dtype is `bool` and has a shape of [batch_size, sequence_length].
     """
     lengths_exp = valid_lengths.unsqueeze(1)
     mask = seq_ids < lengths_exp
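
The masking trick shown in this diff is a broadcast comparison of position indices against lengths; a standalone sketch with illustrative values:

    import paddle

    seq_ids = paddle.arange(5).unsqueeze(0).expand([2, 5])   # [[0,1,2,3,4], [0,1,2,3,4]]
    valid_lengths = paddle.to_tensor([3, 5])
    mask = seq_ids < valid_lengths.unsqueeze(1)
    # mask -> [[True, True, True, False, False],
    #          [True, True, True, True,  True ]]
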

paddlenlp/layers/tcn.py

Lines changed: 25 additions & 16 deletions
@@ -23,7 +23,7 @@ class Chomp1d(nn.Layer):
     Remove the elements on the right.

     Args:
-        chomp_size (`int`):
+        chomp_size (int):
             The number of elements removed.
     """

@@ -41,19 +41,19 @@ class TemporalBlock(nn.Layer):
     See the Figure 1(b) in https://arxiv.org/pdf/1803.01271.pdf for more details.

     Args:
-        n_inputs (`int`):
+        n_inputs (int):
             The number of channels in the input tensor.
-        n_outputs (`int`):
+        n_outputs (int):
             The number of filters.
-        kernel_size (`int`):
+        kernel_size (int):
             The filter size.
-        stride (`int`):
+        stride (int):
             The stride size.
-        dilation (`int`):
+        dilation (int):
             The dilation size.
-        padding (`int`):
+        padding (int):
             The size of zeros to be padded.
-        dropout (`float`, optional):
+        dropout (float, optional):
             Probability of dropout the units. Defaults to 0.2.
     """

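
A shape-only sketch of a single residual block; channel sizes are arbitrary, and the padding follows the usual (kernel_size - 1) * dilation rule so the sequence length is preserved (an assumption about how the block is meant to be configured):

    import paddle
    from paddlenlp.layers.tcn import TemporalBlock

    block = TemporalBlock(n_inputs=16, n_outputs=32, kernel_size=3,
                          stride=1, dilation=2, padding=(3 - 1) * 2, dropout=0.2)
    x = paddle.rand([8, 16, 100])   # [batch_size, n_inputs, sequence_length]
    y = block(x)                    # expected shape [8, 32, 100]
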
@@ -112,6 +112,12 @@ def init_weights(self):
                 paddle.tensor.normal(0.0, 0.01, self.downsample.weight.shape))

     def forward(self, x):
+        """
+        Args:
+            x (Tensor):
+                The input tensor with a shape of [batch_size, input_channel, sequence_length].
+
+        """
         out = self.net(x)
         res = x if self.downsample is None else self.downsample(x)
         return self.relu(out + res)
@@ -124,10 +130,14 @@ def __init__(self, input_channel, num_channels, kernel_size=2, dropout=0.2):
         such as LSTMs in many tasks. See https://arxiv.org/pdf/1803.01271.pdf for more details.

         Args:
-            input_channel ([int]): The number of channels in the input tensor.
-            num_channels ([list | tuple]): The number of channels in different layer.
-            kernel_size (int, optional): [description]. Defaults to 2.
-            dropout (float, optional): [description]. Defaults to 0.2.
+            input_channel (int):
+                The number of channels in the input tensor.
+            num_channels (list | tuple):
+                The number of channels in different layer.
+            kernel_size (int, optional):
+                The filter size.. Defaults to 2.
+            dropout (float, optional):
+                Probability of dropout the units.. Defaults to 0.2.
         """
         super(TCN, self).__init__()
         layers = nn.LayerList()
@@ -153,12 +163,11 @@ def forward(self, x):
         Apply temporal convolutional networks to the input tensor.

         Args:
-            x (`Tensor`):
-                The input tensor with a shape of [batch_size, input_channel, sequence_length].
+            x (Tensor):
+                The input tensor with a shape of [batch_size, input_channel, sequence_length].

         Returns:
-            output (`Tensor`):
-                The output tensor with a shape of [batch_size, num_channels[-1], sequence_length].
+            Tensor: The `output` tensor with a shape of [batch_size, num_channels[-1], sequence_length].
         """
         output = self.network(x)
         return output
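
A hypothetical end-to-end use of `TCN`; the channel list is arbitrary and only the input/output shapes follow the docstring:

    import paddle
    from paddlenlp.layers.tcn import TCN

    tcn = TCN(input_channel=16, num_channels=[64, 64, 64], kernel_size=2, dropout=0.2)
    x = paddle.rand([8, 16, 100])   # [batch_size, input_channel, sequence_length]
    out = tcn(x)                    # [batch_size, num_channels[-1], sequence_length] -> [8, 64, 100]
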

paddlenlp/losses/rdrop.py

Lines changed: 27 additions & 15 deletions
@@ -18,22 +18,24 @@

 __all__ = ['RDropLoss']

+
 class RDropLoss(nn.Layer):
     """
     R-Drop Loss implementation
     For more information about R-drop please refer to this paper: https://arxiv.org/abs/2106.14448
     Original implementation please refer to this code: https://github.com/dropreg/R-Drop
+
+    Args:
+        reduction(str, optional):
+            Indicate how to average the loss, the candicates are ``'none'``,``'batchmean'``,``'mean'``,``'sum'``.
+            If `reduction` is ``'mean'``, the reduced mean loss is returned;
+            If `reduction` is ``'batchmean'``, the sum loss divided by batch size is returned;
+            If `reduction` is ``'sum'``, the reduced sum loss is returned;
+            If `reduction` is ``'none'``, no reduction will be applied.
+            Defaults to ``'none'``.
     """
+
     def __init__(self, reduction='none'):
-        """
-        reduction(obj:`str`, optional): Indicate how to average the loss,
-            the candicates are ``'none'`` | ``'batchmean'`` | ``'mean'`` | ``'sum'``.
-            If `reduction` is ``'mean'``, the reduced mean loss is returned;
-            If `reduction` is ``'batchmean'``, the sum loss divided by batch size is returned;
-            if `reduction` is ``'sum'``, the reduced sum loss is returned;
-            if `reduction` is ``'none'``, no reduction will be apllied.
-            Default is ``'none'``.
-        """
         super(RDropLoss, self).__init__()
         if reduction not in ['sum', 'mean', 'none', 'batchmean']:
             raise ValueError(
@@ -44,15 +46,25 @@ def __init__(self, reduction='none'):
     def forward(self, p, q, pad_mask=None):
         """
         Args:
-            p(obj:`Tensor`): the first forward logits of training examples.
-            q(obj:`Tensor`): the second forward logits of training examples.
-            pad_mask(obj:`Tensor`, optional): The Tensor containing the binary mask to index with, it's data type is bool.
+            p(Tensor): the first forward logits of training examples.
+            q(Tensor): the second forward logits of training examples.
+            pad_mask(Tensor, optional): The Tensor containing the binary mask to index with, it's data type is bool.

         Returns:
-            loss(obj:`Tensor`): the rdrop loss of p and q
+            Tensor: Returns tensor `loss`, the rdrop loss of p and q.
         """
-        p_loss = F.kl_div(F.log_softmax(p, axis=-1), F.softmax(q, axis=-1), reduction=self.reduction)
-        q_loss = F.kl_div(F.log_softmax(q, axis=-1), F.softmax(p, axis=-1), reduction=self.reduction)
+        p_loss = F.kl_div(
+            F.log_softmax(
+                p, axis=-1),
+            F.softmax(
+                q, axis=-1),
+            reduction=self.reduction)
+        q_loss = F.kl_div(
+            F.log_softmax(
+                q, axis=-1),
+            F.softmax(
+                p, axis=-1),
+            reduction=self.reduction)

         # pad_mask is for seq-level tasks
         if pad_mask is not None:
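
A hedged usage sketch of the symmetric KL loss documented above; in real R-Drop training, `p` and `q` come from two dropout-perturbed forward passes of the same model over the same batch (random logits stand in for them here):

    import paddle
    from paddlenlp.losses import RDropLoss

    rdrop_loss = RDropLoss(reduction='none')

    p = paddle.rand([4, 10])   # logits from forward pass 1
    q = paddle.rand([4, 10])   # logits from forward pass 2
    kl = rdrop_loss(p, q)

    # typically combined with the task loss, e.g. total = ce_loss + alpha * kl
    # (alpha is a tuning assumption, not defined in this commit)
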
