@@ -52,7 +52,7 @@ def img_conv_group(input,
52
52
conv_act = None ,
53
53
param_attr = None ,
54
54
conv_with_batchnorm = False ,
55
- conv_batchnorm_drop_rate = None ,
55
+ conv_batchnorm_drop_rate = 0 ,
56
56
pool_stride = 1 ,
57
57
pool_type = None ):
58
58
"""
@@ -120,21 +120,21 @@ def sequence_conv_pool(input,
120
120
121
121
def glu (input , dim = - 1 ):
122
122
"""
123
- The gated linear unit composed by split, sigmoid activation and elementwise
124
- multiplication. Specifically, Split the input into two equal sized parts
125
- :math:`a` and :math:`b` along the given dimension and then compute as
123
+ The gated linear unit composed by split, sigmoid activation and elementwise
124
+ multiplication. Specifically, Split the input into two equal sized parts
125
+ :math:`a` and :math:`b` along the given dimension and then compute as
126
126
following:
127
127
128
128
.. math::
129
129
130
130
{GLU}(a, b)= a \otimes \sigma(b)
131
131
132
- Refer to `Language Modeling with Gated Convolutional Networks
132
+ Refer to `Language Modeling with Gated Convolutional Networks
133
133
<https://arxiv.org/pdf/1612.08083.pdf>`_.
134
-
134
+
135
135
Args:
136
136
input (Variable): The input variable which is a Tensor or LoDTensor.
137
- dim (int): The dimension along which to split. If :math:`dim < 0`, the
137
+ dim (int): The dimension along which to split. If :math:`dim < 0`, the
138
138
dimension to split along is :math:`rank(input) + dim`.
139
139
140
140
Returns:
@@ -157,24 +157,24 @@ def dot_product_attention(querys, keys, values):
157
157
"""
158
158
The dot-product attention.
159
159
160
- Attention mechanism can be seen as mapping a query and a set of key-value
161
- pairs to an output. The output is computed as a weighted sum of the values,
162
- where the weight assigned to each value is computed by a compatibility
160
+ Attention mechanism can be seen as mapping a query and a set of key-value
161
+ pairs to an output. The output is computed as a weighted sum of the values,
162
+ where the weight assigned to each value is computed by a compatibility
163
163
function (dot-product here) of the query with the corresponding key.
164
-
165
- The dot-product attention can be implemented through (batch) matrix
164
+
165
+ The dot-product attention can be implemented through (batch) matrix
166
166
multiplication as follows:
167
167
168
168
.. math::
169
169
170
170
Attention(Q, K, V)= softmax(QK^\mathrm{T})V
171
171
172
- Refer to `Attention Is All You Need
172
+ Refer to `Attention Is All You Need
173
173
<https://arxiv.org/pdf/1706.03762.pdf>`_.
174
174
175
- Note that batch data containing sequences with different lengths is not
175
+ Note that batch data containing sequences with different lengths is not
176
176
supported by this because of the (batch) matrix multiplication.
177
-
177
+
178
178
Args:
179
179
querys (Variable): The input variable which is a Tensor or LoDTensor.
180
180
keys (Variable): The input variable which is a Tensor or LoDTensor.
0 commit comments