
Commit 008d926

Docstring updates for upcoming doc publish (#146)
* Docstring updates for upcoming doc publish
* More fixes
* fixups
1 parent 68eb2f7 commit 008d926

12 files changed: +147 -92 lines changed

keras_nlp/layers/fnet_encoder.py

Lines changed: 10 additions & 11 deletions

@@ -21,12 +21,11 @@
 class FNetEncoder(keras.layers.Layer):
     """FNet encoder.
 
-    This class follows the architecture of FNet encoder layer in paper
-    "FNet: Mixing Tokens with Fourier Transforms"
-    (https://arxiv.org/abs/2105.03824). Users can instantiate multiple instances
-    of this class to stack up the encoder.
+    This class follows the architecture of FNet encoder layer in the
+    [FNet paper](https://arxiv.org/abs/2105.03824). Users can instantiate
+    multiple instances of this class to stack up the encoder.
 
-    Note on padding: In the official FNet code, padding tokens are added to the
+    Note on masking: In the official FNet code, padding tokens are added to the
     the input. However, the padding masks are deleted, i.e., mixing of
     all tokens is done. This is because certain frequencies will be zeroed
     out if we apply padding masks in every encoder layer. Hence, we don't
@@ -36,14 +35,14 @@ class FNetEncoder(keras.layers.Layer):
         intermediate_dim: int. The hidden size of feedforward network.
         dropout: float, defaults to 0. The dropout value, applied in the
             feedforward network.
-        activation: string or `tf.keras.activations`, defaults to "relu". The
+        activation: string or `keras.activations`, defaults to "relu". The
             activation function of feedforward network.
         layer_norm_epsilon: float, defaults to 1e-5. The epsilon value in layer
             normalization components.
-        kernel_initializer: "string" or `tf.keras.initializers` initializer,
+        kernel_initializer: "string" or `keras.initializers` initializer,
             defaults to "glorot_uniform". The kernel initializer for the dense
             layers.
-        bias_initializer: "string" or `tf.keras.initializers` initializer,
+        bias_initializer: "string" or `keras.initializers` initializer,
             defaults to "zeros". The bias initializer for the dense layers.
         name: string, defaults to None. The name of the layer.
         **kwargs: other keyword arguments.
@@ -56,17 +55,17 @@ class FNetEncoder(keras.layers.Layer):
         intermediate_dim=64)
 
     # Create a simple model containing the encoder.
-    input = tf.keras.Input(shape=[4, 6])
+    input = keras.Input(shape=[10, 64])
     output = encoder(input)
-    model = tf.keras.Model(inputs=input, outputs=output)
+    model = keras.Model(inputs=input, outputs=output)
 
     # Call encoder on the inputs.
     input_data = tf.random.uniform(shape=[1, 10, 64])
     output = model(input_data)
     ```
 
     References:
-        [Lee-Thorp et al., 2021](https://arxiv.org/abs/2105.03824)
+        - [Lee-Thorp et al., 2021](https://arxiv.org/abs/2105.03824)
     """
 
     def __init__(
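For context, the updated docstring notes that multiple `FNetEncoder` instances can be stacked. A minimal sketch of that usage, assuming standard `tensorflow` and `keras_nlp` imports (only the layer names shown in this commit's docstrings are taken from the diff):

```python
import tensorflow as tf
from tensorflow import keras
import keras_nlp

# Stack two FNet encoder layers on top of a feature input.
inputs = keras.Input(shape=[10, 64])
x = keras_nlp.layers.FNetEncoder(intermediate_dim=64)(inputs)
x = keras_nlp.layers.FNetEncoder(intermediate_dim=64)(x)
model = keras.Model(inputs=inputs, outputs=x)

# Run on random data with the same shape used in the docstring example.
output = model(tf.random.uniform(shape=[1, 10, 64]))
```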

keras_nlp/layers/mlm_head.py

Lines changed: 9 additions & 7 deletions

@@ -22,10 +22,11 @@ class MLMHead(keras.layers.Layer):
     """Masked Language Model (MLM) head.
 
     This layer takes two inputs:
+
     - `inputs`: which should be a tensor of encoded tokens with shape
-        `(batch_size, sequence_length, encoding_dim)`.
+      `(batch_size, sequence_length, encoding_dim)`.
     - `mask_positions`: which should be a tensor of integer positions to
-        predict with shape `(batch_size, masks_per_sequence)`.
+      predict with shape `(batch_size, masks_per_sequence)`.
 
     The token encodings should usually be the last output of an encoder model,
     and mask positions should be the interger positions you would like to
@@ -46,17 +47,18 @@ class MLMHead(keras.layers.Layer):
         embedding_weights: Optional. The weights of the word embedding used
             to transform input token ids. The transpose of this weight matrix
             will be used to project a token embedding vector to a prediction
-            over all input words, as described in [1].
+            over all input words, as described
+            [here](https://arxiv.org/abs/1608.05859).
         intermediate_activation: The activation function of inner dense layer.
         activation: The activation function for the outputs of the layer.
             Usually either `None` (return logits), or `"softmax"`
             (return probabilities).
         layer_norm_epsilon: float, defaults to 1e-5. The epsilon value in layer
             normalization components.
-        kernel_initializer: string or tf.keras.initializers initializer,
+        kernel_initializer: string or `keras.initializers` initializer,
            defaults to "glorot_uniform". The kernel initializer for
            the dense and multiheaded attention layers.
-        bias_initializer: string or tf.keras.initializers initializer,
+        bias_initializer: string or `keras.initializers` initializer,
            defaults to "zeros". The bias initializer for
            the dense and multiheaded attention layers.
         name: string, defaults to None. The name of the layer.
@@ -91,7 +93,7 @@ class MLMHead(keras.layers.Layer):
     ```
 
     References:
-        [1] [Press and Wolf, 2016](https://arxiv.org/abs/1608.05859)
+        - [Press and Wolf, 2016](https://arxiv.org/abs/1608.05859)
     """
 
     def __init__(
@@ -147,7 +149,7 @@ def _build(self, input_shape):
             kernel_initializer=self.kernel_initializer,
             bias_initializer=self.bias_initializer,
         )
-        self._layer_norm = tf.keras.layers.LayerNormalization(
+        self._layer_norm = keras.layers.LayerNormalization(
            epsilon=self.layer_norm_epsilon,
        )
        if self.embedding_weights is None:
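The hunks above show only part of the `MLMHead` docstring. As a rough sketch of the two-input call pattern it describes (the `vocabulary_size` argument and the exact keyword name `mask_positions` are assumptions, since the corresponding docstring lines are not in this diff):

```python
import tensorflow as tf
import keras_nlp

batch_size, seq_length, encoding_dim, vocab_size = 4, 12, 32, 100

# Encoded tokens, e.g. the last output of an encoder model.
encoded_tokens = tf.random.normal([batch_size, seq_length, encoding_dim])
# Integer positions to predict, shape (batch_size, masks_per_sequence).
mask_positions = tf.random.uniform([batch_size, 3], maxval=seq_length, dtype="int32")

# Hypothetical construction; `vocabulary_size` is assumed, not shown in the hunks above.
mlm_head = keras_nlp.layers.MLMHead(vocabulary_size=vocab_size, activation="softmax")
predictions = mlm_head(encoded_tokens, mask_positions=mask_positions)
# Expected shape: (batch_size, masks_per_sequence, vocabulary_size).
```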

keras_nlp/layers/mlm_mask_generator.py

Lines changed: 6 additions & 6 deletions

@@ -73,15 +73,15 @@ class MLMMaskGenerator(keras.layers.Layer):
     Examples:
 
     Basic usage.
-    >>> masker = keras_nlp.layers.MLMMaskGenerator( \
-    vocabulary_size=10, mask_selection_rate=0.2, mask_token_id=0, \
-    mask_selection_length=5)
+    >>> masker = keras_nlp.layers.MLMMaskGenerator(
+    ...     vocabulary_size=10, mask_selection_rate=0.2, mask_token_id=0,
+    ...     mask_selection_length=5)
     >>> masker(tf.constant([1, 2, 3, 4, 5]))
 
     Ragged Input:
-    >>> masker = keras_nlp.layers.MLMMaskGenerator( \
-    vocabulary_size=10, mask_selection_rate=0.5, mask_token_id=0, \
-    mask_selection_length=5)
+    >>> masker = keras_nlp.layers.MLMMaskGenerator(
+    ...     vocabulary_size=10, mask_selection_rate=0.5, mask_token_id=0,
+    ...     mask_selection_length=5)
     >>> masker(tf.ragged.constant([[1, 2], [1, 2, 3, 4]]))
     """
 
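A slightly fuller sketch of the doctest above, applying the same masker inside a `tf.data` pipeline; the pipeline wiring is illustrative, not part of this diff:

```python
import tensorflow as tf
import keras_nlp

masker = keras_nlp.layers.MLMMaskGenerator(
    vocabulary_size=10,
    mask_selection_rate=0.2,
    mask_token_id=0,
    mask_selection_length=5,
)

# Map the masker over batches of token ids. The output is a dict holding the
# masked token ids plus the selected positions and their original values
# (exact key names depend on the keras_nlp version in use).
ds = tf.data.Dataset.from_tensor_slices(tf.constant([[1, 2, 3, 4, 5]]))
ds = ds.map(masker)
print(ds.take(1).get_single_element())
```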

keras_nlp/layers/position_embedding.py

Lines changed: 3 additions & 4 deletions

@@ -34,10 +34,10 @@ class PositionEmbedding(keras.layers.Layer):
     Args:
         sequence_length: The maximum length of the dynamic sequence.
         initializer: The initializer to use for the embedding weights. Defaults
-            to "glorot_uniform".
+            to `"glorot_uniform"`.
         seq_axis: The axis of the input tensor where we add the embeddings.
 
-    Example:
+    Examples:
     ```python
     token_embeddings = layers.Embedding(
         input_dim=vocab_size, output_dim=embed_dim
@@ -52,8 +52,7 @@ class PositionEmbedding(keras.layers.Layer):
     ```
 
     Reference:
-        [BERT: Pre-training of Deep Bidirectional Transformers for Language
-        Understanding](https://arxiv.org/abs/1810.04805).
+        - [Devlin et al., 2019](https://arxiv.org/abs/1810.04805)
     """
 
     def __init__(
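A short, self-contained version of the truncated docstring example above: token embeddings plus learned position embeddings, using only the `sequence_length` argument documented in this hunk (the dimension values are illustrative):

```python
import tensorflow as tf
from tensorflow import keras
import keras_nlp

seq_length, vocab_size, embed_dim = 50, 5000, 128

inputs = keras.Input(shape=(seq_length,), dtype="int32")
token_embeddings = keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embed_dim
)(inputs)
# Learned position embeddings, added back onto the token embeddings.
position_embeddings = keras_nlp.layers.PositionEmbedding(
    sequence_length=seq_length
)(token_embeddings)
outputs = token_embeddings + position_embeddings
model = keras.Model(inputs, outputs)
```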

keras_nlp/layers/sine_position_encoding.py

Lines changed: 2 additions & 2 deletions

@@ -35,7 +35,7 @@ class SinePositionEncoding(keras.layers.Layer):
             curves, as described in Attention is All You Need. Defaults to
             10000.
 
-    Example:
+    Examples:
     ```python
     # create a simple embedding layer with sinusoidal positional encoding
     seq_len = 100
@@ -50,7 +50,7 @@ class SinePositionEncoding(keras.layers.Layer):
     ```
 
     References:
-        [Attention is All You Need](https://arxiv.org/abs/1706.03762)
+        - [Vaswani et al., 2017](https://arxiv.org/abs/1706.03762)
     """
 
     def __init__(
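An analogous sketch for the sinusoidal variant: the layer takes the embedded sequence and returns fixed position encodings of the same shape to add back in (the vocab and dimension values are illustrative):

```python
import tensorflow as tf
from tensorflow import keras
import keras_nlp

seq_len, vocab_size, embed_dim = 100, 1000, 32

inputs = keras.Input(shape=(seq_len,), dtype="int32")
embedding = keras.layers.Embedding(vocab_size, embed_dim)(inputs)
# Fixed (non-learned) sinusoidal encodings with the same shape as the embeddings.
positional_encoding = keras_nlp.layers.SinePositionEncoding()(embedding)
outputs = embedding + positional_encoding
model = keras.Model(inputs, outputs)
```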

keras_nlp/layers/token_and_position_embedding.py

Lines changed: 1 addition & 1 deletion

@@ -41,7 +41,7 @@ class TokenAndPositionEmbedding(keras.layers.Layer):
             used in the vocabulary
             (input_dim should equal size of vocabulary + 1).
 
-    Example:
+    Examples:
     ```python
     seq_length = 50
     vocab_size = 5000
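The docstring example is truncated after `vocab_size = 5000` in this hunk. A rough sketch of how the combined layer is typically wired up; the constructor argument names `vocabulary_size`, `sequence_length`, and `embedding_dim` are assumptions not shown in this diff:

```python
from tensorflow import keras
import keras_nlp

seq_length = 50
vocab_size = 5000
embed_dim = 128

inputs = keras.Input(shape=(seq_length,), dtype="int32")
# Sums a token embedding and a learned position embedding in one layer
# (hypothetical argument names, see note above).
x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=vocab_size,
    sequence_length=seq_length,
    embedding_dim=embed_dim,
)(inputs)
model = keras.Model(inputs, x)
```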

keras_nlp/layers/transformer_decoder.py

Lines changed: 18 additions & 12 deletions

@@ -26,23 +26,29 @@
 class TransformerDecoder(keras.layers.Layer):
     """Transformer decoder.
 
-    This class follows the architecture of transformer decoder layer in paper
-    "Attention is All You Need" (https://arxiv.org/abs/1706.03762). Users can
-    instantiate multiple instances of this class to stack up the decoder.
+    This class follows the architecture of the transformer decoder layer in the
+    paper [Attention is All You Need](https://arxiv.org/abs/1706.03762). Users
+    can instantiate multiple instances of this class to stack up a decoder.
+
+    This layer will correctly compute an attention mask from an implicit
+    Keras padding mask (for example, by passing `mask_zero=True` to a
+    `keras.layers.Embedding` layer). See the Masking and Padding
+    [guide](https://keras.io/guides/understanding_masking_and_padding/)
+    for more details.
 
     Args:
         intermediate_dim: int, the hidden size of feedforward network.
         num_heads: int, the number of heads in MultiHeadAttention.
         dropout: float, defaults to 0. the dropout value, shared by
             MultiHeadAttention and feedforward network.
-        activation: string or tf.keras.activations, defaults to "relu". the
+        activation: string or `keras.activations`, defaults to "relu". the
            activation function of feedforward network.
        layer_norm_epsilon: float, defaults to 1e-5. The eps value in layer
            normalization components.
-        kernel_initializer: string or tf.keras.initializers initializer,
+        kernel_initializer: string or `keras.initializers` initializer,
            defaults to "glorot_uniform". The kernel initializer for
            the dense and multiheaded attention layers.
-        bias_initializer: string or tf.keras.initializers initializer,
+        bias_initializer: string or `keras.initializers` initializer,
            defaults to "zeros". The bias initializer for
            the dense and multiheaded attention layers.
         name: string, defaults to None. The name of the layer.
@@ -55,21 +61,21 @@ class TransformerDecoder(keras.layers.Layer):
         intermediate_dim=64, num_heads=8)
 
     # Create a simple model containing the decoder.
-    decoder_input = tf.keras.Input(shape=[4, 6])
-    encoder_input = tf.keras.Input(shape=[4, 6])
+    decoder_input = keras.Input(shape=[10, 64])
+    encoder_input = keras.Input(shape=[10, 64])
     output = decoder(decoder_input, encoder_input)
-    model = tf.keras.Model(inputs=[decoder_input, encoder_input],
+    model = keras.Model(inputs=[decoder_input, encoder_input],
        outputs=output)
 
     # Call decoder on the inputs.
-    decoder_input_data = tf.random.uniform(shape=[1, 10, 64])
-    encoder_input_data = tf.random.uniform(shape=[1, 10, 64])
+    decoder_input_data = tf.random.uniform(shape=[2, 10, 64])
+    encoder_input_data = tf.random.uniform(shape=[2, 10, 64])
     decoder_output = model([decoder_input_data, encoder_input_data])
 
     ```
 
     References:
-        [Vaswani et al., 20XX](https://arxiv.org/abs/1706.03762)
+        - [Vaswani et al., 2017](https://arxiv.org/abs/1706.03762)
 
     """
 
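The new masking note is the main behavioral documentation change in this file. A minimal sketch of what it describes, feeding the decoder from an `Embedding` layer with `mask_zero=True` (vocab size and shapes are illustrative):

```python
import tensorflow as tf
from tensorflow import keras
import keras_nlp

vocab_size, seq_length, hidden_dim = 1000, 10, 64

# `mask_zero=True` produces an implicit Keras padding mask, which the decoder
# turns into an attention mask per the docstring note above.
decoder_tokens = keras.Input(shape=(seq_length,), dtype="int32")
encoder_sequence = keras.Input(shape=(seq_length, hidden_dim))
x = keras.layers.Embedding(vocab_size, hidden_dim, mask_zero=True)(decoder_tokens)
x = keras_nlp.layers.TransformerDecoder(intermediate_dim=64, num_heads=8)(
    x, encoder_sequence
)
model = keras.Model([decoder_tokens, encoder_sequence], x)
```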

keras_nlp/layers/transformer_encoder.py

Lines changed: 19 additions & 12 deletions

@@ -24,23 +24,30 @@
 class TransformerEncoder(keras.layers.Layer):
     """Transformer encoder.
 
-    This class follows the architecture of transformer encoder layer in paper
-    "Attention is All You Need" (https://arxiv.org/abs/1706.03762). Users can
-    instantiate multiple instances of this class to stack up the encoder.
+    This class follows the architecture of the transformer encoder layer in the
+    paper [Attention is All You Need](https://arxiv.org/abs/1706.03762). Users
+    can instantiate multiple instances of this class to stack up an encoder.
+
+    This layer will correctly compute an attention mask from an implicit
+    Keras padding mask (for example, by passing `mask_zero=True` to a
+    `keras.layers.Embedding` layer). See the Masking and Padding
+    [guide](https://keras.io/guides/understanding_masking_and_padding/)
+    for more details.
 
     Args:
         intermediate_dim: int, the hidden size of feedforward network.
-        num_heads: int, the number of heads in MultiHeadAttention.
+        num_heads: int, the number of heads in the
+            `keras.layers.MultiHeadAttention` layer.
         dropout: float, defaults to 0. the dropout value, shared by
-            MultiHeadAttention and feedforward network.
-        activation: string or `tf.keras.activations`, defaults to "relu". the
+            `keras.layers.MultiHeadAttention` and feedforward network.
+        activation: string or `keras.activations`, defaults to "relu". the
            activation function of feedforward network.
        layer_norm_epsilon: float, defaults to 1e-5. The epsilon value in layer
            normalization components.
-        kernel_initializer: string or tf.keras.initializers initializer,
+        kernel_initializer: string or `keras.initializers` initializer,
            defaults to "glorot_uniform". The kernel initializer for
            the dense and multiheaded attention layers.
-        bias_initializer: string or tf.keras.initializers initializer,
+        bias_initializer: string or `keras.initializers` initializer,
            defaults to "zeros". The bias initializer for
            the dense and multiheaded attention layers.
         name: string, defaults to None. The name of the layer.
@@ -54,18 +61,18 @@ class TransformerEncoder(keras.layers.Layer):
         intermediate_dim=64, num_heads=8)
 
     # Create a simple model containing the encoder.
-    input = tf.keras.Input(shape=[4, 6])
+    input = keras.Input(shape=[10, 64])
     output = encoder(input)
-    model = tf.keras.Model(inputs=input, outputs=output)
+    model = keras.Model(inputs=input, outputs=output)
 
     # Call encoder on the inputs.
-    input_data = tf.random.uniform(shape=[1, 10, 64])
+    input_data = tf.random.uniform(shape=[2, 10, 64])
     output = model(input_data)
 
     ```
 
     References:
-        [Vaswani et al., 20XX](https://arxiv.org/abs/1706.03762)
+        - [Vaswani et al., 2017](https://arxiv.org/abs/1706.03762)
     """
 
     def __init__(
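The same masking note applies on the encoder side. A short sketch of the implicit padding mask it describes (vocab size and shapes are illustrative):

```python
import tensorflow as tf
from tensorflow import keras
import keras_nlp

vocab_size, seq_length, hidden_dim = 1000, 10, 64

token_ids = keras.Input(shape=(seq_length,), dtype="int32")
# The padding mask from `mask_zero=True` propagates into the encoder, which
# converts it to an attention mask as described in the docstring note.
x = keras.layers.Embedding(vocab_size, hidden_dim, mask_zero=True)(token_ids)
x = keras_nlp.layers.TransformerEncoder(intermediate_dim=64, num_heads=8)(x)
model = keras.Model(token_ids, x)

# Zero token ids are treated as padding.
output = model(tf.constant([[5, 7, 2, 0, 0, 0, 0, 0, 0, 0]]))
```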

keras_nlp/tokenizers/byte_tokenizer.py

Lines changed: 13 additions & 7 deletions

@@ -30,19 +30,25 @@ class ByteTokenizer(tokenizer.Tokenizer):
     This tokenizer is a vocabulary-free tokenizer which will tokenize text as
     as raw bytes from [0, 256).
 
+    Tokenizer outputs can either be padded and truncated with a
+    `sequence_length` argument, or left un-truncated. The exact output will
+    depend on the rank of the input tensors.
+
     If input is a batch of strings:
     By default, the layer will output a `tf.RaggedTensor` where the last
     dimension of the output is ragged. If `sequence_length` is set, the layer
     will output a dense `tf.Tensor` where all inputs have been padded or
-    truncated to `sequence_length`. The output dtype can be controlled via the
-    `dtype` argument, which should be an integer type
-    (tf.int16, tf.int32, etc.).
+    truncated to `sequence_length`.
 
     If input is a scalar string:
     There are two cases here. If `sequence_length` is set, the output will be
     a dense `tf.Tensor` of shape `[sequence_length]`. Otherwise, the output will
     be a dense `tf.Tensor` of shape `[None]`.
 
+    The output dtype can be controlled via the
+    `dtype` argument, which should be an integer type
+    (tf.int16, tf.int32, etc.).
+
     Args:
         lowercase: boolean. If True, the input text will be converted to
             lowercase before tokenization.
@@ -89,22 +95,22 @@ class ByteTokenizer(tokenizer.Tokenizer):
     <tf.Tensor: shape=(1, 8), dtype=int32, numpy=
     array([[104, 101, 108, 108, 111, 0, 0, 0]], dtype=int32)>
 
-    Tokenize first, then batch the dataset up.
+    Tokenize, then batch for ragged outputs.
     >>> tokenizer = keras_nlp.tokenizers.ByteTokenizer()
     >>> ds = tf.data.Dataset.from_tensor_slices(["hello", "fun"])
     >>> ds = ds.map(tokenizer)
     >>> ds = ds.apply(tf.data.experimental.dense_to_ragged_batch(2))
     >>> ds.take(1).get_single_element()
     <tf.RaggedTensor [[104, 101, 108, 108, 111], [102, 117, 110]]>
 
-    Batch the inputs and then tokenize.
+    Batch, then tokenize for ragged outputs.
     >>> tokenizer = keras_nlp.tokenizers.ByteTokenizer()
     >>> ds = tf.data.Dataset.from_tensor_slices(["hello", "fun"])
     >>> ds = ds.batch(2).map(tokenizer)
     >>> ds.take(1).get_single_element()
     <tf.RaggedTensor [[104, 101, 108, 108, 111], [102, 117, 110]]>
 
-    Tokenize first, then batch the dataset up (`sequence_length` provided).
+    Tokenize, then batch for dense outputs (`sequence_length` provided).
     >>> tokenizer = keras_nlp.tokenizers.ByteTokenizer(sequence_length=5)
     >>> ds = tf.data.Dataset.from_tensor_slices(["hello", "fun"])
     >>> ds = ds.map(tokenizer)
@@ -114,7 +120,7 @@ class ByteTokenizer(tokenizer.Tokenizer):
     array([[104, 101, 108, 108, 111],
     [102, 117, 110, 0, 0]], dtype=int32)>
 
-    Batch the inputs and then tokenize (`sequence_length` provided).
+    Batch, then tokenize for dense outputs. (`sequence_length` provided).
     >>> tokenizer = keras_nlp.tokenizers.ByteTokenizer(sequence_length=5)
     >>> ds = tf.data.Dataset.from_tensor_slices(["hello", "fun"])
     >>> ds = ds.batch(2).map(tokenizer)
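Putting the relocated `dtype` note and the `sequence_length` behavior together in one runnable sketch (the input strings are arbitrary examples):

```python
import tensorflow as tf
import keras_nlp

# Dense output: every input is padded or truncated to `sequence_length`,
# with an explicit integer `dtype`.
dense_tokenizer = keras_nlp.tokenizers.ByteTokenizer(sequence_length=8, dtype="int16")
print(dense_tokenizer(tf.constant(["hello", "fun"])))

# Ragged output: omit `sequence_length` so each string keeps its own length.
ragged_tokenizer = keras_nlp.tokenizers.ByteTokenizer()
print(ragged_tokenizer(tf.constant(["hello", "fun"])))
```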
