
Commit 008d926

Docstring updates for upcoming doc publish (#146)
* Docstring updates for upcoming doc publish
* More fixes
* fixups
1 parent 68eb2f7 commit 008d926

12 files changed: +147 -92 lines changed

keras_nlp/layers/fnet_encoder.py

Lines changed: 10 additions & 11 deletions

@@ -21,12 +21,11 @@
 class FNetEncoder(keras.layers.Layer):
     """FNet encoder.
 
-    This class follows the architecture of FNet encoder layer in paper
-    "FNet: Mixing Tokens with Fourier Transforms"
-    (https://arxiv.org/abs/2105.03824). Users can instantiate multiple instances
-    of this class to stack up the encoder.
+    This class follows the architecture of FNet encoder layer in the
+    [FNet paper](https://arxiv.org/abs/2105.03824). Users can instantiate
+    multiple instances of this class to stack up the encoder.
 
-    Note on padding: In the official FNet code, padding tokens are added to the
+    Note on masking: In the official FNet code, padding tokens are added to the
     the input. However, the padding masks are deleted, i.e., mixing of
     all tokens is done. This is because certain frequencies will be zeroed
     out if we apply padding masks in every encoder layer. Hence, we don't
@@ -36,14 +35,14 @@ class FNetEncoder(keras.layers.Layer):
         intermediate_dim: int. The hidden size of feedforward network.
         dropout: float, defaults to 0. The dropout value, applied in the
             feedforward network.
-        activation: string or `tf.keras.activations`, defaults to "relu". The
+        activation: string or `keras.activations`, defaults to "relu". The
             activation function of feedforward network.
         layer_norm_epsilon: float, defaults to 1e-5. The epsilon value in layer
             normalization components.
-        kernel_initializer: "string" or `tf.keras.initializers` initializer,
+        kernel_initializer: "string" or `keras.initializers` initializer,
             defaults to "glorot_uniform". The kernel initializer for the dense
             layers.
-        bias_initializer: "string" or `tf.keras.initializers` initializer,
+        bias_initializer: "string" or `keras.initializers` initializer,
             defaults to "zeros". The bias initializer for the dense layers.
         name: string, defaults to None. The name of the layer.
         **kwargs: other keyword arguments.
@@ -56,17 +55,17 @@ class FNetEncoder(keras.layers.Layer):
         intermediate_dim=64)
 
     # Create a simple model containing the encoder.
-    input = tf.keras.Input(shape=[4, 6])
+    input = keras.Input(shape=[10, 64])
     output = encoder(input)
-    model = tf.keras.Model(inputs=input, outputs=output)
+    model = keras.Model(inputs=input, outputs=output)
 
     # Call encoder on the inputs.
     input_data = tf.random.uniform(shape=[1, 10, 64])
     output = model(input_data)
     ```
 
     References:
-        [Lee-Thorp et al., 2021](https://arxiv.org/abs/2105.03824)
+        - [Lee-Thorp et al., 2021](https://arxiv.org/abs/2105.03824)
     """
 
     def __init__(
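For context, the updated docstring notes that multiple `FNetEncoder` instances can be stacked. A minimal sketch of that usage, assuming standard `tensorflow` and `keras_nlp` imports (only the layer names shown in this commit's docstrings are taken from the diff):

```python
import tensorflow as tf
from tensorflow import keras
import keras_nlp

# Stack two FNet encoder layers on top of a feature input.
inputs = keras.Input(shape=[10, 64])
x = keras_nlp.layers.FNetEncoder(intermediate_dim=64)(inputs)
x = keras_nlp.layers.FNetEncoder(intermediate_dim=64)(x)
model = keras.Model(inputs=inputs, outputs=x)

# Run on random data with the same shape used in the docstring example.
output = model(tf.random.uniform(shape=[1, 10, 64]))
```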

keras_nlp/layers/mlm_head.py

Lines changed: 9 additions & 7 deletions

@@ -22,10 +22,11 @@ class MLMHead(keras.layers.Layer):
     """Masked Language Model (MLM) head.
 
     This layer takes two inputs:
+
     - `inputs`: which should be a tensor of encoded tokens with shape
-        `(batch_size, sequence_length, encoding_dim)`.
+      `(batch_size, sequence_length, encoding_dim)`.
     - `mask_positions`: which should be a tensor of integer positions to
-        predict with shape `(batch_size, masks_per_sequence)`.
+      predict with shape `(batch_size, masks_per_sequence)`.
 
     The token encodings should usually be the last output of an encoder model,
     and mask positions should be the interger positions you would like to
@@ -46,17 +47,18 @@ class MLMHead(keras.layers.Layer):
         embedding_weights: Optional. The weights of the word embedding used
             to transform input token ids. The transpose of this weight matrix
             will be used to project a token embedding vector to a prediction
-            over all input words, as described in [1].
+            over all input words, as described
+            [here](https://arxiv.org/abs/1608.05859).
         intermediate_activation: The activation function of inner dense layer.
         activation: The activation function for the outputs of the layer.
             Usually either `None` (return logits), or `"softmax"`
             (return probabilities).
         layer_norm_epsilon: float, defaults to 1e-5. The epsilon value in layer
             normalization components.
-        kernel_initializer: string or tf.keras.initializers initializer,
+        kernel_initializer: string or `keras.initializers` initializer,
            defaults to "glorot_uniform". The kernel initializer for
            the dense and multiheaded attention layers.
-        bias_initializer: string or tf.keras.initializers initializer,
+        bias_initializer: string or `keras.initializers` initializer,
            defaults to "zeros". The bias initializer for
            the dense and multiheaded attention layers.
         name: string, defaults to None. The name of the layer.
@@ -91,7 +93,7 @@ class MLMHead(keras.layers.Layer):
     ```
 
     References:
-        [1] [Press and Wolf, 2016](https://arxiv.org/abs/1608.05859)
+        - [Press and Wolf, 2016](https://arxiv.org/abs/1608.05859)
     """
 
     def __init__(
@@ -147,7 +149,7 @@ def _build(self, input_shape):
             kernel_initializer=self.kernel_initializer,
             bias_initializer=self.bias_initializer,
         )
-        self._layer_norm = tf.keras.layers.LayerNormalization(
+        self._layer_norm = keras.layers.LayerNormalization(
            epsilon=self.layer_norm_epsilon,
        )
        if self.embedding_weights is None:
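The hunks above show only part of the `MLMHead` docstring. As a rough sketch of the two-input call pattern it describes (the `vocabulary_size` argument and the exact keyword name `mask_positions` are assumptions, since the corresponding docstring lines are not in this diff):

```python
import tensorflow as tf
import keras_nlp

batch_size, seq_length, encoding_dim, vocab_size = 4, 12, 32, 100

# Encoded tokens, e.g. the last output of an encoder model.
encoded_tokens = tf.random.normal([batch_size, seq_length, encoding_dim])
# Integer positions to predict, shape (batch_size, masks_per_sequence).
mask_positions = tf.random.uniform([batch_size, 3], maxval=seq_length, dtype="int32")

# Hypothetical construction; `vocabulary_size` is assumed, not shown in the hunks above.
mlm_head = keras_nlp.layers.MLMHead(vocabulary_size=vocab_size, activation="softmax")
predictions = mlm_head(encoded_tokens, mask_positions=mask_positions)
# Expected shape: (batch_size, masks_per_sequence, vocabulary_size).
```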

keras_nlp/layers/mlm_mask_generator.py

Lines changed: 6 additions & 6 deletions

@@ -73,15 +73,15 @@ class MLMMaskGenerator(keras.layers.Layer):
     Examples:
 
     Basic usage.
-    >>> masker = keras_nlp.layers.MLMMaskGenerator( \
-    vocabulary_size=10, mask_selection_rate=0.2, mask_token_id=0, \
-    mask_selection_length=5)
+    >>> masker = keras_nlp.layers.MLMMaskGenerator(
+    ...     vocabulary_size=10, mask_selection_rate=0.2, mask_token_id=0,
+    ...     mask_selection_length=5)
     >>> masker(tf.constant([1, 2, 3, 4, 5]))
 
     Ragged Input:
-    >>> masker = keras_nlp.layers.MLMMaskGenerator( \
-    vocabulary_size=10, mask_selection_rate=0.5, mask_token_id=0, \
-    mask_selection_length=5)
+    >>> masker = keras_nlp.layers.MLMMaskGenerator(
+    ...     vocabulary_size=10, mask_selection_rate=0.5, mask_token_id=0,
+    ...     mask_selection_length=5)
     >>> masker(tf.ragged.constant([[1, 2], [1, 2, 3, 4]]))
     """
 
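A slightly fuller sketch of the doctest above, applying the same masker inside a `tf.data` pipeline; the pipeline wiring is illustrative, not part of this diff:

```python
import tensorflow as tf
import keras_nlp

masker = keras_nlp.layers.MLMMaskGenerator(
    vocabulary_size=10,
    mask_selection_rate=0.2,
    mask_token_id=0,
    mask_selection_length=5,
)

# Map the masker over batches of token ids. The output is a dict holding the
# masked token ids plus the selected positions and their original values
# (exact key names depend on the keras_nlp version in use).
ds = tf.data.Dataset.from_tensor_slices(tf.constant([[1, 2, 3, 4, 5]]))
ds = ds.map(masker)
print(ds.take(1).get_single_element())
```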

keras_nlp/layers/position_embedding.py

Lines changed: 3 additions & 4 deletions

@@ -34,10 +34,10 @@ class PositionEmbedding(keras.layers.Layer):
     Args:
         sequence_length: The maximum length of the dynamic sequence.
         initializer: The initializer to use for the embedding weights. Defaults
-            to "glorot_uniform".
+            to `"glorot_uniform"`.
         seq_axis: The axis of the input tensor where we add the embeddings.
 
-    Example:
+    Examples:
     ```python
     token_embeddings = layers.Embedding(
         input_dim=vocab_size, output_dim=embed_dim
@@ -52,8 +52,7 @@ class PositionEmbedding(keras.layers.Layer):
     ```
 
     Reference:
-        [BERT: Pre-training of Deep Bidirectional Transformers for Language
-        Understanding](https://arxiv.org/abs/1810.04805).
+        - [Devlin et al., 2019](https://arxiv.org/abs/1810.04805)
     """
 
     def __init__(
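A short, self-contained version of the truncated docstring example above: token embeddings plus learned position embeddings, using only the `sequence_length` argument documented in this hunk (the dimension values are illustrative):

```python
import tensorflow as tf
from tensorflow import keras
import keras_nlp

seq_length, vocab_size, embed_dim = 50, 5000, 128

inputs = keras.Input(shape=(seq_length,), dtype="int32")
token_embeddings = keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embed_dim
)(inputs)
# Learned position embeddings, added back onto the token embeddings.
position_embeddings = keras_nlp.layers.PositionEmbedding(
    sequence_length=seq_length
)(token_embeddings)
outputs = token_embeddings + position_embeddings
model = keras.Model(inputs, outputs)
```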

keras_nlp/layers/sine_position_encoding.py

Lines changed: 2 additions & 2 deletions

@@ -35,7 +35,7 @@ class SinePositionEncoding(keras.layers.Layer):
             curves, as described in Attention is All You Need. Defaults to
             10000.
 
-    Example:
+    Examples:
     ```python
     # create a simple embedding layer with sinusoidal positional encoding
     seq_len = 100
@@ -50,7 +50,7 @@ class SinePositionEncoding(keras.layers.Layer):
     ```
 
     References:
-        [Attention is All You Need](https://arxiv.org/abs/1706.03762)
+        - [Vaswani et al., 2017](https://arxiv.org/abs/1706.03762)
     """
 
     def __init__(
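An analogous sketch for the sinusoidal variant: the layer takes the embedded sequence and returns fixed position encodings of the same shape to add back in (the vocab and dimension values are illustrative):

```python
import tensorflow as tf
from tensorflow import keras
import keras_nlp

seq_len, vocab_size, embed_dim = 100, 1000, 32

inputs = keras.Input(shape=(seq_len,), dtype="int32")
embedding = keras.layers.Embedding(vocab_size, embed_dim)(inputs)
# Fixed (non-learned) sinusoidal encodings with the same shape as the embeddings.
positional_encoding = keras_nlp.layers.SinePositionEncoding()(embedding)
outputs = embedding + positional_encoding
model = keras.Model(inputs, outputs)
```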

keras_nlp/layers/token_and_position_embedding.py

Lines changed: 1 addition & 1 deletion

@@ -41,7 +41,7 @@ class TokenAndPositionEmbedding(keras.layers.Layer):
             used in the vocabulary
             (input_dim should equal size of vocabulary + 1).
 
-    Example:
+    Examples:
     ```python
     seq_length = 50
     vocab_size = 5000
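The docstring example is truncated after `vocab_size = 5000` in this hunk. A rough sketch of how the combined layer is typically wired up; the constructor argument names `vocabulary_size`, `sequence_length`, and `embedding_dim` are assumptions not shown in this diff:

```python
from tensorflow import keras
import keras_nlp

seq_length = 50
vocab_size = 5000
embed_dim = 128

inputs = keras.Input(shape=(seq_length,), dtype="int32")
# Sums a token embedding and a learned position embedding in one layer
# (hypothetical argument names, see note above).
x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=vocab_size,
    sequence_length=seq_length,
    embedding_dim=embed_dim,
)(inputs)
model = keras.Model(inputs, x)
```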

keras_nlp/layers/transformer_decoder.py

Lines changed: 18 additions & 12 deletions

@@ -26,23 +26,29 @@
 class TransformerDecoder(keras.layers.Layer):
     """Transformer decoder.
 
-    This class follows the architecture of transformer decoder layer in paper
-    "Attention is All You Need" (https://arxiv.org/abs/1706.03762). Users can
-    instantiate multiple instances of this class to stack up the decoder.
+    This class follows the architecture of the transformer decoder layer in the
+    paper [Attention is All You Need](https://arxiv.org/abs/1706.03762). Users
+    can instantiate multiple instances of this class to stack up a decoder.
+
+    This layer will correctly compute an attention mask from an implicit
+    Keras padding mask (for example, by passing `mask_zero=True` to a
+    `keras.layers.Embedding` layer). See the Masking and Padding
+    [guide](https://keras.io/guides/understanding_masking_and_padding/)
+    for more details.
 
     Args:
         intermediate_dim: int, the hidden size of feedforward network.
         num_heads: int, the number of heads in MultiHeadAttention.
         dropout: float, defaults to 0. the dropout value, shared by
             MultiHeadAttention and feedforward network.
-        activation: string or tf.keras.activations, defaults to "relu". the
+        activation: string or `keras.activations`, defaults to "relu". the
            activation function of feedforward network.
        layer_norm_epsilon: float, defaults to 1e-5. The eps value in layer
            normalization components.
-        kernel_initializer: string or tf.keras.initializers initializer,
+        kernel_initializer: string or `keras.initializers` initializer,
            defaults to "glorot_uniform". The kernel initializer for
            the dense and multiheaded attention layers.
-        bias_initializer: string or tf.keras.initializers initializer,
+        bias_initializer: string or `keras.initializers` initializer,
            defaults to "zeros". The bias initializer for
            the dense and multiheaded attention layers.
         name: string, defaults to None. The name of the layer.
@@ -55,21 +61,21 @@ class TransformerDecoder(keras.layers.Layer):
         intermediate_dim=64, num_heads=8)
 
     # Create a simple model containing the decoder.
-    decoder_input = tf.keras.Input(shape=[4, 6])
-    encoder_input = tf.keras.Input(shape=[4, 6])
+    decoder_input = keras.Input(shape=[10, 64])
+    encoder_input = keras.Input(shape=[10, 64])
     output = decoder(decoder_input, encoder_input)
-    model = tf.keras.Model(inputs=[decoder_input, encoder_input],
+    model = keras.Model(inputs=[decoder_input, encoder_input],
        outputs=output)
 
     # Call decoder on the inputs.
-    decoder_input_data = tf.random.uniform(shape=[1, 10, 64])
-    encoder_input_data = tf.random.uniform(shape=[1, 10, 64])
+    decoder_input_data = tf.random.uniform(shape=[2, 10, 64])
+    encoder_input_data = tf.random.uniform(shape=[2, 10, 64])
     decoder_output = model([decoder_input_data, encoder_input_data])
 
     ```
 
     References:
-        [Vaswani et al., 20XX](https://arxiv.org/abs/1706.03762)
+        - [Vaswani et al., 2017](https://arxiv.org/abs/1706.03762)
 
     """
 
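The new masking note is the main behavioral documentation change in this file. A minimal sketch of what it describes, feeding the decoder from an `Embedding` layer with `mask_zero=True` (vocab size and shapes are illustrative):

```python
import tensorflow as tf
from tensorflow import keras
import keras_nlp

vocab_size, seq_length, hidden_dim = 1000, 10, 64

# `mask_zero=True` produces an implicit Keras padding mask, which the decoder
# turns into an attention mask per the docstring note above.
decoder_tokens = keras.Input(shape=(seq_length,), dtype="int32")
encoder_sequence = keras.Input(shape=(seq_length, hidden_dim))
x = keras.layers.Embedding(vocab_size, hidden_dim, mask_zero=True)(decoder_tokens)
x = keras_nlp.layers.TransformerDecoder(intermediate_dim=64, num_heads=8)(
    x, encoder_sequence
)
model = keras.Model([decoder_tokens, encoder_sequence], x)
```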

keras_nlp/layers/transformer_encoder.py

Lines changed: 19 additions & 12 deletions

@@ -24,23 +24,30 @@
 class TransformerEncoder(keras.layers.Layer):
     """Transformer encoder.
 
-    This class follows the architecture of transformer encoder layer in paper
-    "Attention is All You Need" (https://arxiv.org/abs/1706.03762). Users can
-    instantiate multiple instances of this class to stack up the encoder.
+    This class follows the architecture of the transformer encoder layer in the
+    paper [Attention is All You Need](https://arxiv.org/abs/1706.03762). Users
+    can instantiate multiple instances of this class to stack up an encoder.
+
+    This layer will correctly compute an attention mask from an implicit
+    Keras padding mask (for example, by passing `mask_zero=True` to a
+    `keras.layers.Embedding` layer). See the Masking and Padding
+    [guide](https://keras.io/guides/understanding_masking_and_padding/)
+    for more details.
 
     Args:
         intermediate_dim: int, the hidden size of feedforward network.
-        num_heads: int, the number of heads in MultiHeadAttention.
+        num_heads: int, the number of heads in the
+            `keras.layers.MultiHeadAttention` layer.
         dropout: float, defaults to 0. the dropout value, shared by
-            MultiHeadAttention and feedforward network.
-        activation: string or `tf.keras.activations`, defaults to "relu". the
+            `keras.layers.MultiHeadAttention` and feedforward network.
+        activation: string or `keras.activations`, defaults to "relu". the
            activation function of feedforward network.
        layer_norm_epsilon: float, defaults to 1e-5. The epsilon value in layer
            normalization components.
-        kernel_initializer: string or tf.keras.initializers initializer,
+        kernel_initializer: string or `keras.initializers` initializer,
            defaults to "glorot_uniform". The kernel initializer for
            the dense and multiheaded attention layers.
-        bias_initializer: string or tf.keras.initializers initializer,
+        bias_initializer: string or `keras.initializers` initializer,
            defaults to "zeros". The bias initializer for
            the dense and multiheaded attention layers.
         name: string, defaults to None. The name of the layer.
@@ -54,18 +61,18 @@ class TransformerEncoder(keras.layers.Layer):
         intermediate_dim=64, num_heads=8)
 
     # Create a simple model containing the encoder.
-    input = tf.keras.Input(shape=[4, 6])
+    input = keras.Input(shape=[10, 64])
     output = encoder(input)
-    model = tf.keras.Model(inputs=input, outputs=output)
+    model = keras.Model(inputs=input, outputs=output)
 
     # Call encoder on the inputs.
-    input_data = tf.random.uniform(shape=[1, 10, 64])
+    input_data = tf.random.uniform(shape=[2, 10, 64])
     output = model(input_data)
 
     ```
 
     References:
-        [Vaswani et al., 20XX](https://arxiv.org/abs/1706.03762)
+        - [Vaswani et al., 2017](https://arxiv.org/abs/1706.03762)
     """
 
     def __init__(
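The same masking note applies on the encoder side. A short sketch of the implicit padding mask it describes (vocab size and shapes are illustrative):

```python
import tensorflow as tf
from tensorflow import keras
import keras_nlp

vocab_size, seq_length, hidden_dim = 1000, 10, 64

token_ids = keras.Input(shape=(seq_length,), dtype="int32")
# The padding mask from `mask_zero=True` propagates into the encoder, which
# converts it to an attention mask as described in the docstring note.
x = keras.layers.Embedding(vocab_size, hidden_dim, mask_zero=True)(token_ids)
x = keras_nlp.layers.TransformerEncoder(intermediate_dim=64, num_heads=8)(x)
model = keras.Model(token_ids, x)

# Zero token ids are treated as padding.
output = model(tf.constant([[5, 7, 2, 0, 0, 0, 0, 0, 0, 0]]))
```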

keras_nlp/tokenizers/byte_tokenizer.py

Lines changed: 13 additions & 7 deletions

@@ -30,19 +30,25 @@ class ByteTokenizer(tokenizer.Tokenizer):
     This tokenizer is a vocabulary-free tokenizer which will tokenize text as
     as raw bytes from [0, 256).
 
+    Tokenizer outputs can either be padded and truncated with a
+    `sequence_length` argument, or left un-truncated. The exact output will
+    depend on the rank of the input tensors.
+
     If input is a batch of strings:
     By default, the layer will output a `tf.RaggedTensor` where the last
     dimension of the output is ragged. If `sequence_length` is set, the layer
     will output a dense `tf.Tensor` where all inputs have been padded or
-    truncated to `sequence_length`. The output dtype can be controlled via the
-    `dtype` argument, which should be an integer type
-    (tf.int16, tf.int32, etc.).
+    truncated to `sequence_length`.
 
     If input is a scalar string:
     There are two cases here. If `sequence_length` is set, the output will be
     a dense `tf.Tensor` of shape `[sequence_length]`. Otherwise, the output will
     be a dense `tf.Tensor` of shape `[None]`.
 
+    The output dtype can be controlled via the
+    `dtype` argument, which should be an integer type
+    (tf.int16, tf.int32, etc.).
+
     Args:
         lowercase: boolean. If True, the input text will be converted to
             lowercase before tokenization.
@@ -89,22 +95,22 @@ class ByteTokenizer(tokenizer.Tokenizer):
     <tf.Tensor: shape=(1, 8), dtype=int32, numpy=
     array([[104, 101, 108, 108, 111, 0, 0, 0]], dtype=int32)>
 
-    Tokenize first, then batch the dataset up.
+    Tokenize, then batch for ragged outputs.
     >>> tokenizer = keras_nlp.tokenizers.ByteTokenizer()
     >>> ds = tf.data.Dataset.from_tensor_slices(["hello", "fun"])
     >>> ds = ds.map(tokenizer)
     >>> ds = ds.apply(tf.data.experimental.dense_to_ragged_batch(2))
     >>> ds.take(1).get_single_element()
     <tf.RaggedTensor [[104, 101, 108, 108, 111], [102, 117, 110]]>
 
-    Batch the inputs and then tokenize.
+    Batch, then tokenize for ragged outputs.
     >>> tokenizer = keras_nlp.tokenizers.ByteTokenizer()
     >>> ds = tf.data.Dataset.from_tensor_slices(["hello", "fun"])
     >>> ds = ds.batch(2).map(tokenizer)
     >>> ds.take(1).get_single_element()
     <tf.RaggedTensor [[104, 101, 108, 108, 111], [102, 117, 110]]>
 
-    Tokenize first, then batch the dataset up (`sequence_length` provided).
+    Tokenize, then batch for dense outputs (`sequence_length` provided).
     >>> tokenizer = keras_nlp.tokenizers.ByteTokenizer(sequence_length=5)
     >>> ds = tf.data.Dataset.from_tensor_slices(["hello", "fun"])
     >>> ds = ds.map(tokenizer)
@@ -114,7 +120,7 @@ class ByteTokenizer(tokenizer.Tokenizer):
     array([[104, 101, 108, 108, 111],
     [102, 117, 110, 0, 0]], dtype=int32)>
 
-    Batch the inputs and then tokenize (`sequence_length` provided).
+    Batch, then tokenize for dense outputs. (`sequence_length` provided).
     >>> tokenizer = keras_nlp.tokenizers.ByteTokenizer(sequence_length=5)
     >>> ds = tf.data.Dataset.from_tensor_slices(["hello", "fun"])
     >>> ds = ds.batch(2).map(tokenizer)
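Putting the relocated `dtype` note and the `sequence_length` behavior together in one runnable sketch (the input strings are arbitrary examples):

```python
import tensorflow as tf
import keras_nlp

# Dense output: every input is padded or truncated to `sequence_length`,
# with an explicit integer `dtype`.
dense_tokenizer = keras_nlp.tokenizers.ByteTokenizer(sequence_length=8, dtype="int16")
print(dense_tokenizer(tf.constant(["hello", "fun"])))

# Ragged output: omit `sequence_length` so each string keeps its own length.
ragged_tokenizer = keras_nlp.tokenizers.ByteTokenizer()
print(ragged_tokenizer(tf.constant(["hello", "fun"])))
```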
