
Commit b1bfbf2

[API] Fix matmul api useage. (#4058)
1 parent c4ee791 commit b1bfbf2

5 files changed, +33 -42 lines changed


applications/document_intelligence/doc_vqa/Rerank/src/model/ernie.py

Lines changed: 5 additions & 9 deletions
@@ -13,19 +13,15 @@
 # limitations under the License.
 """Ernie model."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-from __future__ import absolute_import
+from __future__ import absolute_import, division, print_function, unicode_literals

 import json
-import six
 import logging
 from io import open

+import paddle
 import paddle.fluid as fluid
-
+import six
 from model.transformer_encoder import encoder, pre_process_layer

 log = logging.getLogger(__name__)
@@ -140,7 +136,7 @@ def _build_model(self, model_name, src_ids, position_ids, sentence_ids, task_ids

         emb_out = pre_process_layer(emb_out, "nd", self._prepostprocess_dropout, name=model_name + "pre_encoder")

-        self_attn_mask = fluid.layers.matmul(x=input_mask, y=input_mask, transpose_y=True)
+        self_attn_mask = paddle.matmul(x=input_mask, y=input_mask, transpose_y=True)

         self_attn_mask = fluid.layers.scale(x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
         n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] * self._n_head, axis=1)
@@ -226,7 +222,7 @@ def get_lm_output(self, mask_label, mask_pos):
             name="mask_lm_out_fc.b_0", initializer=fluid.initializer.Constant(value=0.0)
         )
         if self._weight_sharing:
-            fc_out = fluid.layers.matmul(
+            fc_out = paddle.matmul(
                 x=mask_trans_feat,
                 y=fluid.default_main_program().global_block().var(self._word_emb_name),
                 transpose_y=True,
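In both hunks above, paddle.matmul is a drop-in replacement for fluid.layers.matmul because no alpha scaling is involved. A minimal sketch of the attention-mask hunk with made-up shapes (batch of 2, sequence length 5), showing that the product of input_mask with its own transpose yields the [batch, seq_len, seq_len] visibility matrix that the following scale call turns into an additive bias:

import paddle

# illustrative mask: 1.0 marks real tokens, 0.0 marks padding; shape [batch, seq_len, 1]
input_mask = paddle.to_tensor(
    [[[1.0], [1.0], [1.0], [0.0], [0.0]],
     [[1.0], [1.0], [0.0], [0.0], [0.0]]]
)

# [batch, seq_len, 1] x [batch, 1, seq_len] -> [batch, seq_len, seq_len]
self_attn_mask = paddle.matmul(x=input_mask, y=input_mask, transpose_y=True)

# same rescaling as the line that follows in the model: 1 -> 0 (visible), 0 -> -10000 (masked)
attn_bias = paddle.scale(x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)

print(self_attn_mask.shape, attn_bias.shape)  # [2, 5, 5] [2, 5, 5]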

applications/document_intelligence/doc_vqa/Rerank/src/model/transformer_encoder.py

Lines changed: 4 additions & 5 deletions
@@ -13,12 +13,11 @@
 # limitations under the License.
 """Transformer encoder."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function

 from functools import partial

+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers

@@ -111,15 +110,15 @@ def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
         Scaled Dot-Product Attention
         """
         scaled_q = layers.scale(x=q, scale=d_key**-0.5)
-        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
+        product = paddle.matmul(x=scaled_q, y=k, transpose_y=True)
         if attn_bias:
             product += attn_bias
         weights = layers.softmax(product)
         if dropout_rate:
             weights = layers.dropout(
                 weights, dropout_prob=dropout_rate, dropout_implementation="upscale_in_train", is_test=False
             )
-        out = layers.matmul(weights, v)
+        out = paddle.matmul(weights, v)
         return out

     q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
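For reference, a self-contained dygraph-style sketch of the updated scaled_dot_product_attention path; the shapes and the use of paddle.nn.functional in place of the static-graph layers calls are illustrative, not taken from this file:

import paddle
import paddle.nn.functional as F

def scaled_dot_product_attention(q, k, v, attn_bias=None, d_key=64, dropout_rate=0.0):
    # scale the query first, then use paddle.matmul for both products, as in the diff above
    scaled_q = paddle.scale(x=q, scale=d_key**-0.5)
    product = paddle.matmul(x=scaled_q, y=k, transpose_y=True)
    if attn_bias is not None:
        product += attn_bias
    weights = F.softmax(product)
    if dropout_rate:
        weights = F.dropout(weights, p=dropout_rate, mode="upscale_in_train")
    return paddle.matmul(weights, v)

q = k = v = paddle.rand([2, 8, 16, 64])  # [batch, n_head, seq_len, d_key], made-up sizes
print(scaled_dot_product_attention(q, k, v).shape)  # [2, 8, 16, 64]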

examples/language_model/gpt-3/dygraph/modeling.py

Lines changed: 11 additions & 11 deletions
@@ -13,25 +13,25 @@
 # limitations under the License.

 import collections
-import math

-import numpy as np
 import paddle
+import paddle.incubate as incubate
 import paddle.nn as nn
 import paddle.nn.functional as F
 import paddle.tensor as tensor
+from paddle.distributed import fleet
+from paddle.distributed.fleet.meta_parallel import (
+    LayerDesc,
+    PipelineLayer,
+    SharedLayerDesc,
+    get_rng_state_tracker,
+)
+from paddle.distributed.fleet.utils import recompute
 from paddle.fluid import layers
 from paddle.nn.layer.transformer import _convert_param_attr_to_list

 from paddlenlp.transformers import PretrainedModel, register_base_model

-import paddlenlp
-from paddle.distributed import fleet
-from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker
-from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer, SharedLayerDesc
-import paddle.incubate as incubate
-from paddle.distributed.fleet.utils import recompute
-
 __all__ = [
     "GPTModel",
     "GPTPretrainedModel",
@@ -46,7 +46,7 @@ def parallel_matmul(lm_output, logit_weights, parallel_output):
     hcg = fleet.get_hybrid_communicate_group()
     model_parallel_group = hcg.get_model_parallel_group()
     world_size = hcg.get_model_parallel_world_size()
-    rank = hcg.get_model_parallel_rank()
+    # rank = hcg.get_model_parallel_rank()

     if world_size > 1:
         input_parallel = paddle.distributed.collective._c_identity(lm_output, group=model_parallel_group)
@@ -215,7 +215,7 @@ def forward(self, query, key, value, attn_mask=None, use_cache=False, cache=None
         else:
             q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, cache)
         # scale dot product attention
-        product = layers.matmul(x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
+        product = paddle.matmul(x=q * (self.head_dim**-0.5), y=k, transpose_y=True)

         # if attn_mask is not None:
         #     product = product + attn_mask
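paddle.matmul has no alpha argument, so the 1/sqrt(head_dim) factor that fluid.layers.matmul applied internally is folded into q before the product. A quick check with made-up shapes that pre-scaling q is numerically equivalent (up to float rounding) to scaling the product:

import paddle

head_dim = 64
q = paddle.rand([2, 16, 128, head_dim])  # [batch, n_head, seq_len, head_dim], illustrative
k = paddle.rand([2, 16, 128, head_dim])

# new form: fold the softmax scale into q
product = paddle.matmul(x=q * (head_dim**-0.5), y=k, transpose_y=True)

# scaling the product afterwards gives the same result, since the scale is a scalar
reference = paddle.matmul(x=q, y=k, transpose_y=True) * (head_dim**-0.5)

print(paddle.allclose(product, reference, atol=1e-5))  # True (as a boolean tensor)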

examples/language_model/gpt-3/static/modeling.py

Lines changed: 9 additions & 12 deletions
@@ -13,20 +13,18 @@
 # limitations under the License.

 import collections
-import math

-import numpy as np
 import paddle
+import paddle.incubate as incubate
 import paddle.nn as nn
 import paddle.nn.functional as F
 import paddle.tensor as tensor
+from paddle.distributed.fleet import fleet
 from paddle.fluid import layers
 from paddle.nn.layer.transformer import _convert_param_attr_to_list
-from paddle.distributed.fleet import fleet
-import paddle.incubate as incubate

-from paddlenlp.transformers import PretrainedModel, register_base_model
 import paddlenlp
+from paddlenlp.transformers import PretrainedModel, register_base_model

 __all__ = ["GPTModel", "GPTForPretraining", "GPTPretrainingCriterion", "GPTForGeneration"]

@@ -154,7 +152,7 @@ def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):
             k = tensor.concat([cache.k, k], axis=2)
             v = tensor.concat([cache.v, v], axis=2)

-            ## if not assign here, assign in While loop
+            # if not assign here, assign in While loop
             # layers.assign(k, cache.k) # update caches
             # layers.assign(v, cache.v)

@@ -220,7 +218,7 @@ def forward(self, query, key, value, attn_mask=None, use_cache=False, cache=None
         else:
             q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, cache)
         # scale dot product attention
-        product = layers.matmul(x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
+        product = paddle.matmul(x=q * (self.head_dim**-0.5), y=k, transpose_y=True)

         if self.training:
             weights = incubate.softmax_mask_fuse_upper_triangle(product)
@@ -424,7 +422,7 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):
         if isinstance(cache, self.Cache):
             attn_output, cache_kv_out = self.self_attn(tgt, attn_mask=tgt_mask, cache=cache.kv)

-            ## if not assign here, update caches in While loop
+            # if not assign here, update caches in While loop
             # layers.assign(cache_kv_out, cache.kv)
             if use_cache:
                 cache = self.Cache(cache_kv_out)
@@ -1069,7 +1067,7 @@ def forward(self, inputs, use_cache=False, cache=None):
             inputs (dict): include src_ids.
                 pos_ids, input_mask and max_dec_len are optional.
         """
-        ######### forward context #########
+        # forward context
         input_ids = inputs["src_ids"]
        position_ids = inputs["pos_ids"] if "pos_ids" in inputs else None
        attention_mask = inputs["input_mask"] if "input_mask" in inputs else None
@@ -1092,13 +1090,12 @@
             logits, cached_kvs = self.model(input_ids, position_ids, encode_mask, use_cache=True, cache=gen_caches)

             next_id = paddle.argmax(logits[:, -1, :], axis=-1).reshape([-1, 1])
-            ####################################

         if "max_dec_len" not in inputs:
             max_len = layers.fill_constant([1], dtype=int_type, value=self.max_dec_len, force_cpu=True)
         else:
             max_len = inputs["max_dec_len"]
-        min_len = layers.fill_constant(shape=[1], dtype=int_type, value=self.min_dec_len, force_cpu=True)
+        # min_len = layers.fill_constant(shape=[1], dtype=int_type, value=self.min_dec_len, force_cpu=True)
         step_idx = layers.fill_constant(shape=[1], value=0, dtype="int64", force_cpu=True)

         placehold_ids = layers.fill_constant_batch_size_like(
@@ -1108,7 +1105,7 @@ def forward(self, inputs, use_cache=False, cache=None):

         if "max_dec_len" in inputs:
             max_len = paddle.tensor.creation._memcpy(max_len, place=paddle.CPUPlace())
-        cond_int = paddle.full([1], 0, dtype=int_type, name="cond_int")
+        # cond_int = paddle.full([1], 0, dtype=int_type, name="cond_int")
         cond = paddle.less_than(step_idx, max_len)

         if attention_mask is not None:
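The same alpha-folding applies here, feeding into incubate.softmax_mask_fuse_upper_triangle on the training path. A rough, unfused stand-in (plain lower-triangular mask plus softmax) with invented shapes, just to show what the scaled product plus causal masking computes; it is not the fused kernel the file actually calls:

import paddle
import paddle.nn.functional as F

head_dim, seq_len = 64, 8
q = paddle.rand([2, 4, seq_len, head_dim])  # [batch, n_head, seq_len, head_dim], invented
k = paddle.rand([2, 4, seq_len, head_dim])

# scale folded into q, as in the updated attention code
product = paddle.matmul(x=q * (head_dim**-0.5), y=k, transpose_y=True)

# causal mask: keep positions on/below the diagonal, push the rest to a large negative bias
causal = paddle.tril(paddle.ones([seq_len, seq_len]))
weights = F.softmax(product + (1.0 - causal) * -1e4, axis=-1)

print(weights.shape)  # [2, 4, 8, 8]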

paddlenlp/transformers/ofa_utils.py

Lines changed: 4 additions & 5 deletions
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import os
+
 import numpy as np
 import paddle
 import paddle.nn as nn
@@ -30,7 +30,7 @@

 def prepare_qkv_ofa(self, query, key, value, cache=None):
     q = self.q_proj(query)
-    if hasattr(self.q_proj, "fn") and self.q_proj.fn.cur_config["expand_ratio"] != None:
+    if hasattr(self.q_proj, "fn") and self.q_proj.fn.cur_config["expand_ratio"] is not None:
         self.num_heads = int(self.num_heads * self.q_proj.fn.cur_config["expand_ratio"])
     q = paddle.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
     q = paddle.transpose(x=q, perm=[0, 2, 1, 3])
@@ -64,8 +64,7 @@ def mha_ofa_forward(self, query, key, value, attn_mask=None, cache=None):
     q, k, v, cache = self._prepare_qkv(query, key, value, cache)

     # scale dot product attention
-    # TODO: use paddle.matmul, however it doesn't support `alpha`
-    product = paddle.fluid.layers.matmul(x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
+    product = paddle.matmul(x=q * (self.head_dim**-0.5), y=k, transpose_y=True)
     if attn_mask[0] is not None:
         # TODO(guosheng): support bool mask
         product = product + attn_mask[0]
@@ -91,7 +90,7 @@ def mha_ofa_forward(self, query, key, value, attn_mask=None, cache=None):
     if cache is not None:
         outs.append(cache)

-    if hasattr(self.q_proj, "fn") and self.q_proj.fn.cur_config["expand_ratio"] != None:
+    if hasattr(self.q_proj, "fn") and self.q_proj.fn.cur_config["expand_ratio"] is not None:
         self.num_heads = int(float(self.num_heads) / self.q_proj.fn.cur_config["expand_ratio"])
     return out if len(outs) == 1 else tuple(outs)
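Besides the matmul change, the != None checks become is not None. The difference matters because != dispatches to __ne__, which a value may override, while is not None is a plain identity test. A contrived, hypothetical illustration:

class OddConfigValue:
    # hypothetical class that overrides __ne__ and claims to equal everything
    def __ne__(self, other):
        return False

value = OddConfigValue()
print(value != None)      # False -- the equality-style check would skip the branch
print(value is not None)  # True  -- the identity check is not fooled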
