Skip to content

Commit efde614

Browse files
authored
Merge branch 'develop' into op_path
2 parents 7c19d59 + 70649b1 commit efde614

File tree

258 files changed

+21196
-1163
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

258 files changed

+21196
-1163
lines changed

applications/neural_search/recall/domain_adaptive_pretraining/data_tools/dataset_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -759,7 +759,7 @@ def get_samples_mapping(indexed_dataset, data_prefix, num_epochs,
759759
# device_index=rank which is not the case for model
760760
# parallel case
761761
if paddle.distributed.get_world_size() > 1:
762-
if paddle.fluid.framework.in_dygraph_mode():
762+
if paddle.in_dynamic_mode():
763763
paddle.distributed.barrier()
764764

765765
# Load indexed dataset.

applications/neural_search/recall/milvus/feature_extract.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,13 @@ def predict(self, data, tokenizer):
156156
logits = self.output_handle.copy_to_cpu()
157157
all_embeddings.append(logits)
158158
examples = []
159-
159+
if (len(examples) > 0):
160+
input_ids, segment_ids = batchify_fn(examples)
161+
self.input_handles[0].copy_from_cpu(input_ids)
162+
self.input_handles[1].copy_from_cpu(segment_ids)
163+
self.predictor.run()
164+
logits = self.output_handle.copy_to_cpu()
165+
all_embeddings.append(logits)
160166
all_embeddings = np.concatenate(all_embeddings, axis=0)
161167
np.save('corpus_embedding', all_embeddings)
162168

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
CUDA_VISIBLE_DEVICES=2 python feature_extract.py \
22
--model_dir=./output \
3-
--corpus_file "data/milvus_data.csv"
3+
--corpus_file "milvus/milvus_data.csv"

applications/neural_search/recall/simcse/deploy/python/predict.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,6 @@ def predict(self, data, tokenizer):
280280
id2corpus = {0: '国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'}
281281
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
282282
res = predictor.extract_embedding(corpus_list, tokenizer)
283-
res = predictor.predict(corpus_list, tokenizer)
284283
print(res.shape)
285284
print(res)
286285
corpus_list = [['中西方语言与文化的差异', '中西方文化差异以及语言体现中西方文化,差异,语言体现'],

applications/question_answering/faq_system/README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,9 @@ python vector_insert.py
269269

270270
### Paddle Serving 部署
271271

272+
Paddle Serving 的安装可以参考[Paddle Serving 安装文档](https://github.com/PaddlePaddle/Serving#installation)。需要在服务端和客户端安装相关的依赖,安装完依赖后就可以执行下面的步骤。
273+
274+
272275
首先把生成的静态图模型导出为 Paddle Serving的格式,命令如下:
273276

274277
```
@@ -353,4 +356,4 @@ Search milvus time cost is 0.004535675048828125 seconds
353356
华新镇“亮牌分批复工”工作方案具体内容是什么? 所有店铺一律先贴“红牌”禁止经营,经相关部门审批後,再换贴“蓝牌”准许复工。 0.7162970900535583
354357
.....
355358
```
356-
输出的结果包括特征提取和检索的时间,还包含检索出来的问答对
359+
输出的结果包括特征提取和检索的时间,还包含检索出来的问答对

applications/sentiment_analysis/README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,29 @@ sh run_predict.sh
158158

159159
在训练后,如果需要进行高性能预测,可参考(3)进行动转静,然后基于Paddle Inference 进行高性能预测。
160160

161+
### 4.3 数据标注说明
162+
如果您想标注自己的业务数据,并利用新标注的数据重新训练本项目中的模型,推荐使用 [doccano](https://github.com/doccano/doccano) 作为数据标注平台。本项目打通了从标注到训练的通道:doccano 导出的数据可通过 [doccano.py](./doccano.py) 脚本轻松转换为模型输入所需的形式,实现无缝衔接。为达到这个目的,您需要按以下标注规则在 doccano 平台上标注数据:
163+
164+
<div align="center">
165+
<img src="./imgs/labeling_example.png" />
166+
<p>图2 数据标注样例图</p>
167+
</div>
168+
169+
- 在doccano平台上,定义标签 Pos-Aspect、 Neg-Aspect 和 Opinion,其中 Pos-Aspect 表示 Aspect 的情感极性为正向;Neg-Aspect 表示 Aspect 的情感极性为负向;Opinion 表示相应的观点词。
170+
- 使用以上定义的标签开始标注数据,图2展示了一个标注样例。
171+
- 当标注完成后,在 doccano 平台上导出 `jsonl` 形式的文件,并将其重命名为 `doccano.json` 后,放入 `./data` 目录下。
172+
- 通过 [doccano.py](./doccano.py) 脚本进行数据形式转换,然后便可以开始进行相应模型训练。
173+
174+
```shell
175+
python doccano.py \
176+
--doccano_file ./data/doccano.json \
177+
--save_ext_dir ./data/ext_data \
178+
--save_cls_dir ./data/cls_data
179+
```
180+
181+
**备注:**
182+
- 默认情况下 [doccano.py](./doccano.py) 脚本会按照比例将数据划分为 train/dev/test 数据集
183+
- 每次执行 [doccano.py](./doccano.py) 脚本,将会覆盖已有的同名数据文件
161184

162185
## 5. 小模型优化策略
163186
以上实验中,无论是评论观点抽取模型,还是属性级情感分类模型,使用的均是 Large 版的 SKEP 模型,考虑到企业用户在线上部署时会考虑到模型预测效率,本项目提供了一套基于 [PP-MiniLM](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/model_compression/pp-minilm) 中文特色小模型的解决方案。PP-MiniLM 提供了一套完整的小模型优化方案:首先使用 Task-agnostic 的方式进行模型蒸馏、然后依托于 [PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim) 进行模型裁剪、模型量化等模型压缩技术,有效减小了模型的规模,加快了模型运行速度。
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import os
16+
import json
17+
import argparse
18+
import numpy as np
19+
from utils import decoding, concate_aspect_and_opinion, save_examples, save_dict
20+
21+
22+
def doccano2SA(doccano_file, save_ext_dir, save_cls_dir, splits=(0.8, 0.9), is_shuffle=True):
    """Convert a doccano export into the ext/cls data files used by this application.

    Args:
        doccano_file: Path of the annotated JSON-lines file exported from the
            doccano labeling platform. Each non-blank line must be a JSON
            object with a "data" text and a "label" list of
            (start, end, tag) spans.
        save_ext_dir: Directory that receives the ext data files.
        save_cls_dir: Directory that receives the cls data files.
        splits: Either an empty sequence (everything is written to a single
            ``doccano.txt``) or two increasing fractions in (0, 1) giving the
            train/dev and dev/test boundaries.
        is_shuffle: Whether to shuffle the examples before writing them.

    Raises:
        ValueError: If ``doccano_file`` does not exist or ``splits`` is not
            of the accepted form.
    """
    if not os.path.exists(doccano_file):
        raise ValueError("Please input the correct path of doccano file.")

    # Validate arguments before touching the filesystem.
    if len(splits) != 0 and len(splits) != 2:
        raise ValueError("Only []/ len(splits)==2 accepted for splits.")

    if len(splits) == 2 and not 0 < splits[0] < splits[1] < 1:
        raise ValueError("Please set correct splits, the element in it should be in (0,1), and splits[1]>splits[0].")

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(save_ext_dir, exist_ok=True)
    os.makedirs(save_cls_dir, exist_ok=True)

    ext_examples, cls_examples = [], []
    with open(doccano_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline) in the export.
                continue
            items = json.loads(line)
            text, label_terms = items["data"], items["label"]

            # Label ext data: one tag per character of the text.
            ext_label = ["O"] * len(text)
            aspect_mapper = {}
            for start, end, tag in label_terms:
                _label_ext_with_label_term(ext_label, start, end, tag)
                if tag == "Pos-Aspect":
                    aspect_mapper[text[start:end]] = "1"
                elif tag == "Neg-Aspect":
                    aspect_mapper[text[start:end]] = "0"
            ext_examples.append((text, " ".join(ext_label)))

            # Label cls data from the decoded aspect/opinion groups.
            for ap in decoding(text, ext_label):
                # dict.fromkeys de-duplicates while keeping first-seen order,
                # so the generated text is reproducible between runs.
                aspect, opinions = ap[0], list(dict.fromkeys(ap[1:]))
                if aspect not in aspect_mapper:
                    continue
                aspect_text = concate_aspect_and_opinion(text, aspect, opinions)
                cls_examples.append((aspect_mapper[aspect], aspect_text, text))

    # Index arrays decide the order in which examples are written.
    ext_idx = np.arange(len(ext_examples))
    cls_idx = np.arange(len(cls_examples))

    if is_shuffle:
        ext_idx = np.random.permutation(ext_idx)
        cls_idx = np.random.permutation(cls_idx)

    _save_doccano_examples(ext_examples, ext_idx, save_ext_dir, splits, "ext")
    _save_doccano_examples(cls_examples, cls_idx, save_cls_dir, splits, "cls")

    # Save the label dictionaries next to the data.
    ext_dict_path = os.path.join(save_ext_dir, "label.dict")
    cls_dict_path = os.path.join(save_cls_dir, "label.dict")
    save_dict(ext_dict_path, "ext")
    save_dict(cls_dict_path, "cls")
    print(f"\next: save dict to {ext_dict_path}.")
    print(f"cls: save dict to {cls_dict_path}.")


def _label_ext_with_label_term(ext_label, start, end, tag):
    """Mark ext_label[start:end] in place with B-/I- tags for the given span tag."""
    # Only "Opinion" maps to the Opinion tags; every other tag is an Aspect.
    kind = "Opinion" if tag == "Opinion" else "Aspect"
    ext_label[start] = f"B-{kind}"
    for i in range(start + 1, end):
        ext_label[i] = f"I-{kind}"


def _save_doccano_examples(examples, order, save_dir, splits, name):
    """Write examples in the given order to save_dir, whole or as train/dev/test."""
    if len(splits) == 0:
        save_path = os.path.join(save_dir, "doccano.txt")
        save_examples(examples, save_path, order)
        print(f"\n{name}: save data to {save_path}.")
        return

    th1, th2 = int(len(examples) * splits[0]), int(len(examples) * splits[1])
    parts = (("train", order[:th1]), ("dev", order[th1:th2]), ("test", order[th2:]))
    saved = []
    for part, idxs in parts:
        save_path = os.path.join(save_dir, f"{part}.txt")
        save_examples(examples, save_path, idxs)
        saved.append((part, save_path))

    for i, (part, save_path) in enumerate(saved):
        # Only the first message of each group is preceded by a blank line.
        lead = "\n" if i == 0 else ""
        print(f"{lead}{name}: save {part} data to {save_path}.")
137+
138+
139+
# Command-line entry point: parse the paths and convert the doccano export.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--doccano_file",
        type=str,
        default="./data/doccano.json",
        help="The doccano file exported from doccano platform.",
    )
    parser.add_argument(
        "--save_ext_dir",
        type=str,
        default="./data/ext_data1",
        help="The path of ext data that you wanna save.",
    )
    parser.add_argument(
        "--save_cls_dir",
        type=str,
        default="./data/cls_data1",
        help="The path of cls data that you wanna save.",
    )
    args = parser.parse_args()

    doccano2SA(args.doccano_file, args.save_ext_dir, args.save_cls_dir, is_shuffle=True)
147+
144 KB
Loading

applications/sentiment_analysis/utils.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,3 +98,36 @@ def decoding(text, tag_seq):
9898
aps.append(no_a_words)
9999

100100
return aps
101+
102+
103+
def concate_aspect_and_opinion(text, aspect, opinions):
    """Pair an aspect with each opinion, ordered as they first occur in ``text``.

    Args:
        text: The sentence both the aspect and the opinions are looked up in.
        aspect: The aspect term.
        opinions: Iterable of opinion terms to pair with the aspect.

    Returns:
        The pairs joined by ",". In each pair the aspect comes first when its
        first occurrence in ``text`` is not after the opinion's first
        occurrence; otherwise the opinion comes first. An empty string is
        returned when there are no opinions.
    """
    # The aspect position does not depend on the opinion, so look it up once.
    aspect_pos = text.find(aspect)
    pairs = [
        aspect + opinion if aspect_pos <= text.find(opinion) else opinion + aspect
        for opinion in opinions
    ]
    return ",".join(pairs)
113+
114+
def save_examples(examples, save_path, idxs):
    """Write the selected examples to ``save_path``, one tab-separated line each.

    Args:
        examples: Indexable collection whose items are sequences of strings.
        save_path: Path of the output file; an existing file is overwritten.
        idxs: Iterable of indices into ``examples``, in the order to write.
    """
    with open(save_path, "w", encoding="utf-8") as f:
        f.writelines("\t".join(examples[idx]) + "\n" for idx in idxs)
119+
120+
def save_dict(dict_path, dict_type):
    """Write the label dictionary for the given task, one label per line.

    Args:
        dict_path: Path of the output file; an existing file is overwritten.
        dict_type: "ext" for the B-/I- tagging labels, "cls" for the two
            polarity labels.

    Raises:
        ValueError: If ``dict_type`` is neither "ext" nor "cls".
    """
    # Validate before opening so a bad type never creates or truncates a file.
    if dict_type not in ["ext", "cls"]:
        raise ValueError("Only ext/cls should be accepted for dict_type.")

    if dict_type == "ext":
        label_list = ["O", "B-Aspect", "I-Aspect", "B-Opinion", "I-Opinion"]
    else:
        label_list = ["负向", "正向"]

    with open(dict_path, "w", encoding="utf-8") as f:
        f.writelines(label + "\n" for label in label_list)
132+
133+

docs/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
:caption: 模型库
4646

4747
Transformer预训练模型 <model_zoo/transformers>
48-
TaskFlow <model_zoo/taskflow>
48+
一键预测功能 <model_zoo/taskflow>
4949
预训练词向量 <model_zoo/embeddings>
5050

5151
.. toctree::

0 commit comments

Comments
 (0)