PaddlePaddle
diff --git a/‎applications/neural_search/recall/domain_adaptive_pretraining/data_tools/dataset_utils.py‎
Lines changed: 1 addition & 1 deletion b/‎applications/neural_search/recall/domain_adaptive_pretraining/data_tools/dataset_utils.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎applications/neural_search/recall/milvus/feature_extract.py‎
Lines changed: 7 additions & 1 deletion b/‎applications/neural_search/recall/milvus/feature_extract.py‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎applications/neural_search/recall/milvus/scripts/feature_extract.sh‎
Lines changed: 1 addition & 1 deletion b/‎applications/neural_search/recall/milvus/scripts/feature_extract.sh‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎applications/neural_search/recall/simcse/deploy/python/predict.py‎
Lines changed: 0 additions & 1 deletion b/‎applications/neural_search/recall/simcse/deploy/python/predict.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎applications/question_answering/faq_system/README.md‎
Lines changed: 4 additions & 1 deletion b/‎applications/question_answering/faq_system/README.md‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎applications/sentiment_analysis/README.md‎
Lines changed: 23 additions & 0 deletions b/‎applications/sentiment_analysis/README.md‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎applications/sentiment_analysis/doccano.py‎
Lines changed: 147 additions & 0 deletions b/‎applications/sentiment_analysis/doccano.py‎
Lines changed: 147 additions & 0 deletions
diff --git a/‎applications/sentiment_analysis/imgs/labeling_example.png‎
144 KB b/‎applications/sentiment_analysis/imgs/labeling_example.png‎
144 KB
diff --git a/‎applications/sentiment_analysis/utils.py‎
Lines changed: 33 additions & 0 deletions b/‎applications/sentiment_analysis/utils.py‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎docs/index.rst‎
Lines changed: 1 addition & 1 deletion b/‎docs/index.rst‎
Lines changed: 1 addition & 1 deletion
@@ -759,7 +759,7 @@ def get_samples_mapping(indexed_dataset, data_prefix, num_epochs,
     # device_index=rank which is not the case for model
     # parallel case
     if paddle.distributed.get_world_size() > 1:
-        if paddle.fluid.framework.in_dygraph_mode():
+        if paddle.in_dynamic_mode():
             paddle.distributed.barrier()
 
     # Load indexed dataset.
 
@@ -156,7 +156,13 @@ def predict(self, data, tokenizer):
                 logits = self.output_handle.copy_to_cpu()
                 all_embeddings.append(logits)
                 examples = []
-
+        if (len(examples) > 0):
+            input_ids, segment_ids = batchify_fn(examples)
+            self.input_handles[0].copy_from_cpu(input_ids)
+            self.input_handles[1].copy_from_cpu(segment_ids)
+            self.predictor.run()
+            logits = self.output_handle.copy_to_cpu()
+            all_embeddings.append(logits)
         all_embeddings = np.concatenate(all_embeddings, axis=0)
         np.save('corpus_embedding', all_embeddings)
 
 
@@ -1,3 +1,3 @@
 CUDA_VISIBLE_DEVICES=2 python feature_extract.py \
         --model_dir=./output \
-        --corpus_file "data/milvus_data.csv" 
+        --corpus_file "milvus/milvus_data.csv" 
@@ -280,7 +280,6 @@ def predict(self, data, tokenizer):
     id2corpus = {0: '国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'}
     corpus_list = [{idx: text} for idx, text in id2corpus.items()]
     res = predictor.extract_embedding(corpus_list, tokenizer)
-    res = predictor.predict(corpus_list, tokenizer)
     print(res.shape)
     print(res)
     corpus_list = [['中西方语言与文化的差异', '中西方文化差异以及语言体现中西方文化,差异,语言体现'],
 
@@ -269,6 +269,9 @@ python vector_insert.py
 
 ### Paddle Serving 部署
 
+Paddle Serving 的安装可以参考[Paddle Serving 安装文档](https://github.com/PaddlePaddle/Serving#installation)。需要在服务端和客户端安装相关的依赖，安装完依赖后就可以执行下面的步骤。
+
+
 首先把生成的静态图模型导出为 Paddle Serving的格式，命令如下：
 
 ```
@@ -353,4 +356,4 @@ Search milvus time cost is 0.004535675048828125 seconds
 华新镇“亮牌分批复工”工作方案具体内容是什么？    所有店铺一律先贴“红牌”禁止经营，经相关部门审批後，再换贴“蓝牌”准许复工。 0.7162970900535583
 .....
 ```
-输出的结果包括特征提取和检索的时间，还包含检索出来的问答对，
+输出的结果包括特征提取和检索的时间，还包含检索出来的问答对。
@@ -158,6 +158,29 @@ sh run_predict.sh
 
 在训练后，如果需要进行高性能预测，可参考（3）进行动转静，然后基于Paddle Inference 进行高性能预测。
 
+### 4.3 数据标注说明
+如果您想标注自己的业务数据，并利用标注的新数据重新训练本项目中的模型，推荐使用 [doccano](https://github.com/doccano/doccano) 数据标注平台进行标注。本项目打通了从标注到训练的通道：从 doccano 导出数据后，可通过 [doccano.py](./doccano.py) 脚本轻松将其转换为模型输入所需的形式，实现无缝衔接。为达到这个目的，您需要按以下标注规则在 doccano 平台上标注数据：
+
+<div align="center">
+    <img src="./imgs/labeling_example.png" />
+    <p>图2 数据标注样例图</p>
+</div>
+
+- 在doccano平台上，定义标签 Pos-Aspect、 Neg-Aspect 和 Opinion，其中 Pos-Aspect 表示 Aspect 的情感极性为正向；Neg-Aspect 表示 Aspect 的情感极性为负向；Opinion 表示相应的观点词。
+- 使用以上定义的标签开始标注数据，图2展示了一个标注样例。
+- 当标注完成后，在 doccano 平台上导出 `jsonl` 形式的文件，并将其重命名为 `doccano.json` 后，放入 `./data` 目录下。
+- 通过 [doccano.py](./doccano.py) 脚本进行数据形式转换，然后便可以开始进行相应模型训练。
+
+```shell
+python doccano.py \
+    --doccano_file ./data/doccano.json \
+    --save_ext_dir ./data/ext_data \
+    --save_cls_dir ./data/cls_data
+```
+
+**备注：** 
+- 默认情况下 [doccano.py](./doccano.py) 脚本会按照 8:1:1 的比例将数据划分为 train/dev/test 数据集
+- 每次执行 [doccano.py](./doccano.py) 脚本，将会覆盖已有的同名数据文件
 
 ## 5. 小模型优化策略
 以上实验中，无论是评论观点抽取模型，还是属性级情感分类模型，使用的均是 Large 版的 SKEP 模型，考虑到企业用户在线上部署时会考虑到模型预测效率，本项目提供了一套基于 [PP-MiniLM](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/model_compression/pp-minilm) 中文特色小模型的解决方案。PP-MiniLM 提供了一套完整的小模型优化方案：首先使用 Task-agnostic 的方式进行模型蒸馏、然后依托于 [PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim) 进行模型裁剪、模型量化等模型压缩技术，有效减小了模型的规模，加快了模型运行速度。
 
@@ -0,0 +1,147 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import json
+import argparse
+import numpy as np
+from utils import decoding, concate_aspect_and_opinion, save_examples, save_dict
+
+
+def doccano2SA(doccano_file, save_ext_dir, save_cls_dir, splits=(0.8, 0.9), is_shuffle=True):
+    """
+        @Description: Convert a doccano export file to the data formats consumed by this application.
+            Every non-empty line of the file must be a JSON object whose "data" field is the text and
+            whose "label" field is a list of [start, end, tag] spans.
+            An ext example is (text, space-joined per-character B-/I-/O tags); a cls example is
+            (polarity, aspect/opinion text, text), with polarity "1" for Pos-Aspect and "0" for Neg-Aspect.
+        @Param doccano_file: The annotated file exported from doccano labeling platform.
+        @Param save_ext_dir: The directory to save ext data to (created if it does not exist).
+        @Param save_cls_dir: The directory to save cls data to (created if it does not exist).
+        @Param splits: Either empty (all examples are written to doccano.txt) or two ascending fractions
+            in (0, 1): examples before splits[0] go to train.txt, those up to splits[1] go to dev.txt and
+            the remainder goes to test.txt.
+        @Param is_shuffle: Whether to shuffle examples before they are written.
+        @Raise ValueError: If doccano_file does not exist or splits is malformed.
+    """
+    if not os.path.exists(doccano_file):
+        raise ValueError("Please input the correct path of doccano file.")
+
+    # Validate splits before touching the file system so bad input leaves no directories behind.
+    if len(splits) not in (0, 2):
+        raise ValueError("Only []/ len(splits)==2 accepted for splits.")
+
+    if splits and not 0 < splits[0] < splits[1] < 1:
+        raise ValueError("Please set correct splits, the element in it should be in (0,1), and splits[1]>splits[0].")
+
+    os.makedirs(save_ext_dir, exist_ok=True)
+    os.makedirs(save_cls_dir, exist_ok=True)
+
+    def label_ext_with_label_term(ext_label, start, end, tag):
+        # Every tag other than "Opinion" (e.g. Pos-Aspect / Neg-Aspect) is labeled as an aspect.
+        kind = "Opinion" if tag == "Opinion" else "Aspect"
+        ext_label[start] = f"B-{kind}"
+        for i in range(start + 1, end):
+            ext_label[i] = f"I-{kind}"
+
+    def save_split(examples, idxs, save_dir, name):
+        # Write one data set (ext or cls), either whole or as train/dev/test, and report the paths.
+        if not splits:
+            save_path = os.path.join(save_dir, "doccano.txt")
+            save_examples(examples, save_path, idxs)
+            print(f"\n{name}: save data to {save_path}.")
+            return
+
+        th1, th2 = (int(len(examples) * ratio) for ratio in splits)
+        parts = (("train", idxs[:th1]), ("dev", idxs[th1:th2]), ("test", idxs[th2:]))
+        for pos, (part, part_idxs) in enumerate(parts):
+            save_path = os.path.join(save_dir, f"{part}.txt")
+            save_examples(examples, save_path, part_idxs)
+            # Only the first message of each group starts with a blank line.
+            lead = "\n" if pos == 0 else ""
+            print(f"{lead}{name}: save {part} data to {save_path}.")
+
+    ext_examples, cls_examples = [], []
+    # start to label for ext and cls data
+    with open(doccano_file, "r", encoding="utf-8") as f:
+        for line in f:
+            # Skip blank lines (e.g. a trailing newline) instead of failing inside json.loads.
+            if not line.strip():
+                continue
+            items = json.loads(line)
+            text, label_terms = items["data"], items["label"]
+            # label ext data with label_terms
+            ext_label = ["O"] * len(text)
+            aspect_mapper = {}
+            for start, end, tag in label_terms:
+                label_ext_with_label_term(ext_label, start, end, tag)
+                if tag == "Pos-Aspect":
+                    aspect_mapper[text[start:end]] = "1"
+                elif tag == "Neg-Aspect":
+                    aspect_mapper[text[start:end]] = "0"
+            ext_examples.append((text, " ".join(ext_label)))
+            # label cls data
+            for ap in decoding(text, ext_label):
+                # De-duplicate opinions while keeping their order so the output is reproducible.
+                aspect, opinions = ap[0], list(dict.fromkeys(ap[1:]))
+                if aspect not in aspect_mapper:
+                    continue
+                aspect_text = concate_aspect_and_opinion(text, aspect, opinions)
+                cls_examples.append((aspect_mapper[aspect], aspect_text, text))
+
+    # index for saving data
+    ext_idx = np.arange(len(ext_examples))
+    cls_idx = np.arange(len(cls_examples))
+
+    if is_shuffle:
+        ext_idx = np.random.permutation(ext_idx)
+        cls_idx = np.random.permutation(cls_idx)
+
+    save_split(ext_examples, ext_idx, save_ext_dir, "ext")
+    save_split(cls_examples, cls_idx, save_cls_dir, "cls")
+
+    # save ext and cls label dicts
+    ext_dict_path = os.path.join(save_ext_dir, "label.dict")
+    cls_dict_path = os.path.join(save_cls_dir, "label.dict")
+    save_dict(ext_dict_path, "ext")
+    save_dict(cls_dict_path, "cls")
+    print(f"\next: save dict to {ext_dict_path}.")
+    print(f"cls: save dict to {cls_dict_path}.")
+    
+
+if __name__ == "__main__":
+    # Command-line entry point: every option is a string path with a default value.
+    # NOTE(review): the default output dirs end in "1" while the README example passes
+    # ./data/ext_data and ./data/cls_data -- confirm which names are intended.
+    cli_options = (
+        ("--doccano_file", "./data/doccano.json", "The doccano file exported from doccano platform."),
+        ("--save_ext_dir", "./data/ext_data1", "The path of ext data that you wanna save."),
+        ("--save_cls_dir", "./data/cls_data1", "The path of cls data that you wanna save."),
+    )
+    arg_parser = argparse.ArgumentParser()
+    for flag, default_value, help_text in cli_options:
+        arg_parser.add_argument(flag, type=str, default=default_value, help=help_text)
+    cli_args = arg_parser.parse_args()
+
+    doccano2SA(cli_args.doccano_file, cli_args.save_ext_dir, cli_args.save_cls_dir, is_shuffle=True)
+
@@ -98,3 +98,36 @@ def decoding(text, tag_seq):
         aps.append(no_a_words)
 
     return aps
+
+
+def concate_aspect_and_opinion(text, aspect, opinions):
+    """Join an aspect with each of its opinions into one comma-separated string.
+
+    For every opinion the aspect and the opinion are concatenated in the order of
+    their first occurrence in ``text`` (aspect first on a tie); the resulting pairs
+    are joined with the full-width comma "，".
+
+    Args:
+        text: The sentence that the aspect and the opinions are looked up in.
+        aspect: The aspect term.
+        opinions: The opinion terms paired with the aspect.
+
+    Returns:
+        The joined string, or "" when ``opinions`` is empty.
+    """
+    # The aspect position does not depend on the opinion, so look it up only once.
+    aspect_pos = text.find(aspect)
+    pairs = (
+        aspect + opinion if aspect_pos <= text.find(opinion) else opinion + aspect
+        for opinion in opinions
+    )
+    return "，".join(pairs)
+
+def save_examples(examples, save_path, idxs):
+    """Write the examples selected by ``idxs`` to ``save_path``, one per line.
+
+    The fields of each example are joined with tabs; the file is opened in write
+    mode with UTF-8 encoding, so existing content is replaced.
+    """
+    rows = ("\t".join(examples[pos]) + "\n" for pos in idxs)
+    with open(save_path, "w", encoding="utf-8") as out_file:
+        out_file.writelines(rows)
+
+def save_dict(dict_path, dict_type):
+    """Write the label vocabulary for ``dict_type`` to ``dict_path``, one label per line.
+
+    ``dict_type`` must be "ext" or "cls"; any other value raises ValueError before
+    the file is opened.
+    """
+    if dict_type not in ("ext", "cls"):
+        raise ValueError("Only ext/cls should be accepted for dict_type.")
+
+    labels_by_type = {
+        "ext": ["O", "B-Aspect", "I-Aspect", "B-Opinion", "I-Opinion"],
+        "cls": ["负向", "正向"],
+    }
+    content = "".join(f"{label}\n" for label in labels_by_type[dict_type])
+    with open(dict_path, "w", encoding="utf-8") as out_file:
+        out_file.write(content)
+        
+
@@ -45,7 +45,7 @@
    :caption: 模型库
 
    Transformer预训练模型 <model_zoo/transformers>
-   TaskFlow <model_zoo/taskflow>
+   一键预测功能 <model_zoo/taskflow>
    预训练词向量 <model_zoo/embeddings>
 
 .. toctree::