@@ -56,7 +56,7 @@ def img_conv_group(input,
                    conv_act=None,
                    param_attr=None,
                    conv_with_batchnorm=False,
-                   conv_batchnorm_drop_rate=None,
+                   conv_batchnorm_drop_rate=0.0,
                    pool_stride=1,
                    pool_type=None,
                    use_cudnn=True):
@@ -127,21 +127,21 @@ def sequence_conv_pool(input,
 
 def glu(input, dim=-1):
     """
-    The gated linear unit composed by split, sigmoid activation and elementwise
-    multiplication. Specifically, Split the input into two equal sized parts
-    :math:`a` and :math:`b` along the given dimension and then compute as
+    The gated linear unit composed by split, sigmoid activation and elementwise
+    multiplication. Specifically, Split the input into two equal sized parts
+    :math:`a` and :math:`b` along the given dimension and then compute as
     following:
 
         .. math::
 
             {GLU}(a, b)= a \otimes \sigma(b)
 
-    Refer to `Language Modeling with Gated Convolutional Networks
+    Refer to `Language Modeling with Gated Convolutional Networks
     <https://arxiv.org/pdf/1612.08083.pdf>`_.
-
+
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int): The dimension along which to split. If :math:`dim < 0`, the
+        dim (int): The dimension along which to split. If :math:`dim < 0`, the
             dimension to split along is :math:`rank(input) + dim`.
 
     Returns:
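
Note on the GLU docstring touched in this hunk: it describes splitting the input into two equal parts a and b along the given dimension and returning a ⊗ σ(b). A minimal NumPy sketch of that computation, for reference only (the helper name and the use of NumPy are assumptions here, not the fluid implementation):

import numpy as np

def glu_reference(x, dim=-1):
    # Split into two equal-sized parts a and b along `dim`,
    # then gate a with sigmoid(b): GLU(a, b) = a * sigmoid(b).
    a, b = np.split(x, 2, axis=dim)
    return a * (1.0 / (1.0 + np.exp(-b)))

x = np.random.randn(4, 6)
print(glu_reference(x).shape)  # (4, 3): the split dimension is halved
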
@@ -164,24 +164,24 @@ def dot_product_attention(querys, keys, values):
     """
     The dot-product attention.
 
-    Attention mechanism can be seen as mapping a query and a set of key-value
-    pairs to an output. The output is computed as a weighted sum of the values,
-    where the weight assigned to each value is computed by a compatibility
+    Attention mechanism can be seen as mapping a query and a set of key-value
+    pairs to an output. The output is computed as a weighted sum of the values,
+    where the weight assigned to each value is computed by a compatibility
     function (dot-product here) of the query with the corresponding key.
-
-    The dot-product attention can be implemented through (batch) matrix
+
+    The dot-product attention can be implemented through (batch) matrix
     multipication as follows:
 
         .. math::
 
             Attention(Q, K, V)= softmax(QK^\mathrm{T})V
 
-    Refer to `Attention Is All You Need
+    Refer to `Attention Is All You Need
     <https://arxiv.org/pdf/1706.03762.pdf>`_.
 
-    Note that batch data containing sequences with different lengths is not
+    Note that batch data containing sequences with different lengths is not
     supported by this because of the (batch) matrix multipication.
-
+
     Args:
         query (Variable): The input variable which is a Tensor or LoDTensor.
         key (Variable): The input variable which is a Tensor or LoDTensor.
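
For reference, the dot-product attention documented in this hunk is Attention(Q, K, V) = softmax(QK^T)V, computed with batch matrix multiplication over equal-length sequences. A minimal NumPy sketch of that computation (illustrative only; the helper name, the shapes, and the use of NumPy are assumptions, not the fluid API):

import numpy as np

def dot_product_attention_reference(q, k, v):
    # scores[b, i, j] = <q_i, k_j> for each batch b, i.e. Q K^T.
    scores = q @ k.transpose(0, 2, 1)
    # Row-wise softmax over the key dimension (max subtracted for stability).
    scores = scores - scores.max(axis=-1, keepdims=True)
    weights = np.exp(scores)
    weights = weights / weights.sum(axis=-1, keepdims=True)
    # Weighted sum of the values.
    return weights @ v

q = np.random.randn(2, 5, 8)    # (batch, query_len, d_k)
k = np.random.randn(2, 7, 8)    # (batch, key_len, d_k)
v = np.random.randn(2, 7, 16)   # (batch, key_len, d_v)
print(dot_product_attention_reference(q, k, v).shape)  # (2, 5, 16)
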