Commit 4d2e19e

Lingjun Liu authored and committed
documentation
1 parent 3c4cae1 commit 4d2e19e

3 files changed: 353 additions, 362 deletions

tensorlayer/models/transformer/attention_layer.py

Lines changed: 12 additions & 7 deletions
@@ -23,16 +23,21 @@
 
 
 class MultiHeadAttentionLayer(tl.layers.Layer):
-    """Multi-headed attention layer."""
+    """The :class:`MultiHeadAttentionLayer` layer is for multi-head attention computation.
+    The weight computation is between "key" and "query", which will then matmul with "value" to generate information
+    that selectively focuses on the "query" messages.
+    Parameters
+    -----------
+    num_heads : int
+        The number of heads which allow attention computation for different features
+    hidden_size : int
+        Out dim for the layer
+    keep_prob : float
+        Keep probablity for drop-out mechanism between 0 and 1
+    """
 
     def __init__(self, num_heads, hidden_size, keep_prob):
-        """Initialize Attention.
 
-        Args:
-            hidden_size: int, output dim of hidden layer.
-            num_heads: int, number of heads to repeat the same attention structure.
-            keep_prob: float, keep rate for dropout mechanism inside attention for training.
-        """
         if hidden_size % num_heads:
             raise ValueError(
                 "Hidden size ({}) must be divisible by the number of heads ({}).".format(hidden_size, num_heads)
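For readers following the new docstring, a minimal usage sketch of the constructor shown in the diff is given below. The import path comes from the file name above and the argument values are arbitrary examples; the per-head feature split is presumed from the divisibility check, not stated in this commit.

# Minimal sketch, not part of the commit: constructing the layer with the
# parameters documented above. hidden_size must be divisible by num_heads,
# otherwise __init__ raises the ValueError shown in the diff.
from tensorlayer.models.transformer.attention_layer import MultiHeadAttentionLayer

attn = MultiHeadAttentionLayer(num_heads=8, hidden_size=512, keep_prob=0.9)
# Presumably each of the 8 heads then attends over 512 / 8 = 64 features.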