
Commit 205e500

[ehealth] merge and move create_dataloader to utils.py
2 parents 61bfdd5 + 92d82a8

7 files changed (+68 −48 lines)

README.md

Lines changed: 1 addition & 2 deletions

@@ -317,12 +317,11 @@ PaddleNLP provides multi-granularity, multi-scenario NLP application examples for dynamic graph mode
 - Join the PaddleNLP technical discussion group now and talk about NLP together! ⬇️
 
 <div align="center">
-<img src="https://user-images.githubusercontent.com/11793384/156118227-78837467-5087-40ab-9717-5ab92855cf57.JPG" width="230" height="300" />
+<img src="https://user-images.githubusercontent.com/11793384/156540272-353d3d80-f2ec-410d-b863-b51f2d156a72.jpg" width="230" height="300" />
 </div>
 
 
 
-
 ## Release Notes
 
 For more release notes, see [ChangeLog](./docs/changelog.md)

README_en.md

Lines changed: 1 addition & 1 deletion

@@ -217,7 +217,7 @@ To connect with other users and contributors, welcome to join our [Slack channel
 Scan the QR code below with your Wechat⬇️. You can access to official technical exchange group. Look forward to your participation.
 
 <div align="center">
-<img src="https://user-images.githubusercontent.com/11793384/156119400-1bdbfb6f-9af0-4886-8f98-7d17f386638f.jpg" width="210" height="200" />
+<img src="https://user-images.githubusercontent.com/11793384/156540669-c9453a1a-3ed1-4434-a68e-73b9e2f5f771.jpg" width="210" height="200" />
 </div>
 
 

examples/biomedical/cblue/train_classification.py

Lines changed: 1 addition & 24 deletions

@@ -30,7 +30,7 @@
 from paddlenlp.metrics import MultiLabelsMetric, AccuracyAndF1
 from paddlenlp.ops.optimizer import ExponentialMovingAverage
 
-from utils import convert_example
+from utils import convert_example, create_dataloader
 
 METRIC_CLASSES = {
     'KUAKE-QIC': Accuracy,
@@ -109,29 +109,6 @@ def evaluate(model, criterion, metric, data_loader):
     metric.reset()
 
 
-def create_dataloader(dataset,
-                      mode='train',
-                      batch_size=1,
-                      batchify_fn=None,
-                      trans_fn=None):
-    if trans_fn:
-        dataset = dataset.map(trans_fn)
-
-    shuffle = True if mode == 'train' else False
-    if mode == 'train':
-        batch_sampler = paddle.io.DistributedBatchSampler(
-            dataset, batch_size=batch_size, shuffle=shuffle)
-    else:
-        batch_sampler = paddle.io.BatchSampler(
-            dataset, batch_size=batch_size, shuffle=shuffle)
-
-    return paddle.io.DataLoader(
-        dataset=dataset,
-        batch_sampler=batch_sampler,
-        collate_fn=batchify_fn,
-        return_list=True)
-
-
 def do_train():
     paddle.set_device(args.device)
     rank = paddle.distributed.get_rank()

examples/biomedical/cblue/utils.py

Lines changed: 24 additions & 0 deletions

@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import numpy as np
+import paddle
 
 from paddlenlp.transformers import normalize_chars, tokenize_special_chars
 
@@ -78,3 +79,26 @@ def convert_example(example, tokenizer, max_seq_length=512, is_test=False):
         return input_ids, token_type_ids, position_ids
     label = np.array([example['label']], dtype='int64')
     return input_ids, token_type_ids, position_ids, label
+
+
+def create_dataloader(dataset,
+                      mode='train',
+                      batch_size=1,
+                      batchify_fn=None,
+                      trans_fn=None):
+    if trans_fn:
+        dataset = dataset.map(trans_fn)
+
+    shuffle = True if mode == 'train' else False
+    if mode == 'train':
+        batch_sampler = paddle.io.DistributedBatchSampler(
+            dataset, batch_size=batch_size, shuffle=shuffle)
+    else:
+        batch_sampler = paddle.io.BatchSampler(
+            dataset, batch_size=batch_size, shuffle=shuffle)
+
+    return paddle.io.DataLoader(
+        dataset=dataset,
+        batch_sampler=batch_sampler,
+        collate_fn=batchify_fn,
+        return_list=True)
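
For orientation, a minimal sketch of how the relocated helper is typically wired up in a CBLUE classification script. The dataset arguments, tokenizer choice, and batch size below are illustrative assumptions, not part of this commit; only `convert_example` and `create_dataloader` come from `utils.py`:

```python
from functools import partial

from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import ErnieTokenizer

from utils import convert_example, create_dataloader

# Assumed dataset/tokenizer choices, for illustration only.
train_ds = load_dataset("cblue", name="KUAKE-QIC", splits="train")
tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")

trans_fn = partial(convert_example, tokenizer=tokenizer, max_seq_length=128)
# convert_example returns (input_ids, token_type_ids, position_ids, label),
# so the collate function pads the first three fields and stacks the label.
batchify_fn = Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input_ids
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type_ids
    Pad(axis=0, pad_val=0),                            # position_ids
    Stack(dtype="int64"),                              # label
)

train_data_loader = create_dataloader(
    train_ds,
    mode="train",
    batch_size=32,
    batchify_fn=batchify_fn,
    trans_fn=trans_fn)
```

In `'train'` mode the helper shuffles through a `DistributedBatchSampler`, so the same call works unchanged under multi-GPU launch.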

examples/language_model/gpt/README.md

Lines changed: 4 additions & 2 deletions

@@ -31,8 +31,11 @@ GPT-[2](https://cdn.openai.com/better-language-models/language_models_are_unsupe
 - tqdm
 - visualdl
 - paddlepaddle-gpu >= 2.2rc
+- pybind11
+- lac (optional)
+- zstandard (optional)
 
-Installation command: `pip install regex sentencepiece tqdm visualdl`
+Installation command: `pip install regex sentencepiece tqdm visualdl pybind11 lac zstandard`
 Note: PaddlePaddle >= 2.2rc (or the latest develop build) is required; see the Paddle [official site](https://www.paddlepaddle.org.cn) for installation instructions
 
 ### Data preparation
@@ -50,7 +53,6 @@ tar -xvf openwebtext2.json.zst.tar -C /path/to/openwebtext
 ```
 
 Then use the `create_pretraining_data.py` script under [data_tools](../data_tools) to build the dataset:
-
 ```
 python -u create_pretraining_data.py \
     --model_name gpt2-en \

paddlenlp/transformers/convbert/modeling.py

Lines changed: 29 additions & 17 deletions

@@ -21,11 +21,18 @@
 from .. import PretrainedModel, register_base_model
 
 __all__ = [
-    "ConvBertModel", "ConvBertPretrainedModel", "ConvBertForTotalPretraining",
-    "ConvBertDiscriminator", "ConvBertGenerator", "ConvBertClassificationHead",
-    "ConvBertForSequenceClassification", "ConvBertForTokenClassification",
-    "ConvBertPretrainingCriterion", "ConvBertForQuestionAnswering",
-    "ConvBertForMultipleChoice", "ConvBertForPretraining"
+    "ConvBertModel",
+    "ConvBertPretrainedModel",
+    "ConvBertForTotalPretraining",
+    "ConvBertDiscriminator",
+    "ConvBertGenerator",
+    "ConvBertClassificationHead",
+    "ConvBertForSequenceClassification",
+    "ConvBertForTokenClassification",
+    "ConvBertPretrainingCriterion",
+    "ConvBertForQuestionAnswering",
+    "ConvBertForMultipleChoice",
+    "ConvBertForPretraining",
 ]
 dtype_float = paddle.get_default_dtype()
 
@@ -115,7 +122,8 @@ def __init__(
         self.need_weights = need_weights
         self.head_dim = embed_dim // num_heads
         self.scale = self.head_dim**-0.5
-        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+        assert self.head_dim * \
+            num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
 
         new_num_attention_heads = num_heads // head_ratio
         if num_heads // head_ratio < 1:
@@ -140,9 +148,7 @@ def __init__(
         self.conv_kernel_layer = nn.Linear(
             self.all_head_size, self.num_heads * self.conv_kernel_size)
         self.conv_out_layer = nn.Linear(embed_dim, self.all_head_size)
-        self.unfold = nn.Unfold(
-            kernel_sizes=[self.conv_kernel_size, 1],
-            paddings=[(self.conv_kernel_size - 1) // 2, 0], )
+        self.padding = (self.conv_kernel_size - 1) // 2
 
     def forward(self, query, key=None, value=None, attn_mask=None, cache=None):
         key = query if key is None else key
@@ -153,28 +159,34 @@ def forward(self, query, key=None, value=None, attn_mask=None, cache=None):
         v = self.v_proj(value)
 
         if self.conv_type == "sdconv":
+            bs = paddle.shape(q)[0]
+            seqlen = paddle.shape(q)[1]
             mixed_key_conv_attn_layer = self.key_conv_attn_layer(query)
             conv_attn_layer = mixed_key_conv_attn_layer * q
-            batch_size = q.shape[0]
+
             # conv_kernel_layer
             conv_kernel_layer = self.conv_kernel_layer(conv_attn_layer)
             conv_kernel_layer = tensor.reshape(
                 conv_kernel_layer, shape=[-1, self.conv_kernel_size, 1])
             conv_kernel_layer = F.softmax(conv_kernel_layer, axis=1)
-            # conv_out
             conv_out_layer = self.conv_out_layer(query)
-            conv_out_layer = tensor.reshape(
-                conv_out_layer, [batch_size, -1, self.all_head_size, 1])
-            conv_out_layer = tensor.transpose(conv_out_layer, perm=[0, 2, 1, 3])
-            conv_out_layer = self.unfold(conv_out_layer)
-            conv_out_layer = tensor.transpose(conv_out_layer, perm=[0, 2, 1])
+            conv_out_layer = F.pad(conv_out_layer,
+                                   pad=[self.padding, self.padding],
+                                   data_format="NLC")
+            conv_out_layer = paddle.stack(
+                [
+                    paddle.slice(
+                        conv_out_layer, axes=[1], starts=[i],
+                        ends=[i + seqlen]) for i in range(self.conv_kernel_size)
+                ],
+                axis=-1)
             conv_out_layer = tensor.reshape(
                 conv_out_layer,
                 shape=[-1, self.head_dim, self.conv_kernel_size])
             conv_out_layer = tensor.matmul(conv_out_layer, conv_kernel_layer)
             conv_out = tensor.reshape(
                 conv_out_layer,
-                shape=[batch_size, -1, self.num_heads, self.head_dim])
+                shape=[bs, seqlen, self.num_heads, self.head_dim])
 
         q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
         q = tensor.transpose(x=q, perm=[0, 2, 1, 3])
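
The final hunk replaces `nn.Unfold` with an explicit pad-then-stack of shifted slices to gather each token's convolution window, which also lets the batch and sequence sizes come from `paddle.shape` at run time. A standalone sketch of the pattern (tensor sizes here are made up for illustration):

```python
import paddle
import paddle.nn.functional as F

# Made-up sizes: batch of 2, sequence length 5, 8 channels, kernel 3.
bs, seqlen, channels, kernel_size = 2, 5, 8, 3
padding = (kernel_size - 1) // 2

x = paddle.randn([bs, seqlen, channels])  # [batch, length, channels]

# Pad the length dimension on both sides, as the new code does.
x_padded = F.pad(x, pad=[padding, padding], data_format="NLC")

# One slice per kernel offset; slice i is the sequence shifted by i.
windows = paddle.stack(
    [paddle.slice(x_padded, axes=[1], starts=[i], ends=[i + seqlen])
     for i in range(kernel_size)],
    axis=-1)

print(windows.shape)  # [2, 5, 8, 3]: a length-3 window per position
```

Each position now carries its `kernel_size`-wide neighborhood along the last axis, which is what the subsequent reshape and `matmul` against the softmaxed kernel weights consume.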

paddlenlp/transformers/roberta/modeling.py

Lines changed: 8 additions & 2 deletions

@@ -354,6 +354,12 @@ def forward(self,
             (input_ids == self.pad_token_id
              ).astype(self.pooler.dense.weight.dtype) * -1e4,
             axis=[1, 2])
+        elif attention_mask.ndim == 2:
+            attention_mask = paddle.unsqueeze(
+                attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype())
+            attention_mask = (1.0 - attention_mask) * -1e4
+            attention_mask.stop_gradient = True
+
         embedding_output = self.embeddings(
             input_ids=input_ids,
             position_ids=position_ids,
@@ -679,7 +685,7 @@ def forward(self,
 
         tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext')
         model = RobertaForMaskedLM.from_pretrained('roberta-wwm-ext')
-
+
         inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
         inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()}
 
@@ -777,7 +783,7 @@ def forward(self,
 
         tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext')
         model = RobertaForCausalLM.from_pretrained('roberta-wwm-ext')
-
+
         inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
         inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()}
 
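
For reference, the new `elif` branch lets callers pass a conventional 2D padding mask (1 for real tokens, 0 for padding) and converts it to the additive 4D form the encoder expects. A minimal sketch with made-up mask values:

```python
import paddle

# Made-up example: one sequence of 5 tokens, the last two are padding.
attention_mask = paddle.to_tensor([[1, 1, 1, 0, 0]])

# Mirror of the added branch: lift to [batch, 1, 1, seq_len], then turn
# the 1/0 token flags into 0 / -1e4 additive attention biases.
mask = paddle.unsqueeze(
    attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype())
mask = (1.0 - mask) * -1e4

print(mask.shape)   # [1, 1, 1, 5]
print(mask.numpy()) # zeros for real tokens, -10000 for padding
```

Adding -1e4 to the pre-softmax attention logits effectively zeroes out attention to the padded positions.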
