Update usage of wordtag

linjieccc · linjieccc · commit 18f3a7c35637 · 2021-12-09T13:03:05.000Z
diff --git a/docs/model_zoo/taskflow.md b/docs/model_zoo/taskflow.md
@@ -111,6 +111,7 @@ my_seg("平原上的火焰计划于年末上映")
 #### 可配置参数说明
 
 * `batch_size`：批处理大小，请结合机器情况进行调整，默认为1。
+* `custom_vocab`：用户自定义词典文件，默认为None。
 
 ### 词性标注
 
@@ -169,6 +170,7 @@ my_pos("赛里木湖是新疆海拔最高的高山湖泊")
 #### 可配置参数说明
 
 * `batch_size`：批处理大小，请结合机器情况进行调整，默认值为1。
+* `custom_vocab`：用户自定义词典文件，默认为None。
 
 ### 命名实体识别
 
@@ -183,9 +185,68 @@ ner(["热梅茶是一道以梅子为主要原料制作的茶饮", "《孤女》
 >>> [[('热梅茶', '饮食类_饮品'), ('是', '肯定词'), ('一道', '数量词'), ('以', '介词'), ('梅子', '饮食类'), ('为', '肯定词'), ('主要原料', '物体类'), ('制作', '场景事件'), ('的', '助词'), ('茶饮', '饮食类_饮品')], [('《', 'w'), ('孤女', '作品类_实体'), ('》', 'w'), ('是', '肯定词'), ('2010年', '时间类'), ('九州出版社', '组织机构类'), ('出版', '场景事件'), ('的', '助词'), ('小说', '作品类_概念'), ('，', 'w'), ('作者', '人物类_概念'), ('是', '肯定词'), ('余兼羽', '人物类_实体')]]
 ```
 
+- 标签集合：
+
+|人物类_实体|物体类|生物类_动物|医学术语类|链接地址|肯定词|
+|:---|:---|:---|:---|:---|:---|
+|人物类_概念|物体类_兵器|品牌名|术语类_生物体|个性特征|否定词|
+|作品类_实体|物体类_化学物质|场所类|疾病损伤类|感官特征|数量词|
+|作品类_概念|其他角色类|场所类_交通场所|疾病损伤类_植物病虫害|场景事件|叹词|
+|组织机构类|文化类|位置方位|宇宙类|介词|拟声词|
+|组织机构类_企事业单位|文化类_语言文字|世界地区类|事件类|介词_方位介词|修饰词|
+|组织机构类_医疗卫生机构|文化类_奖项赛事活动|饮食类|时间类|助词|外语单词|
+|组织机构类_国家机关|文化类_制度政策协议|饮食类_菜品|时间类_特殊日|代词|英语单词|
+|组织机构类_体育组织机构|文化类_姓氏与人名|饮食类_饮品|术语类|连词|汉语拼音|
+|组织机构类_教育组织机构|生物类|药物类|术语类_符号指标类|副词|词汇用语|
+|组织机构类_军事组织机构|生物类_植物|药物类_中药|信息资料|疑问词|w(标点)|
+
+#### 自定义词典
+
+用户可以通过装载自定义词典来定制化分词和命名实体识别结果。如下例所示，词典文件每一行为一个自定义条目，条目内用空格切分词语，词语后可用`/`附加标签；未附加标签的词语沿用模型预测的标签。
+
+词典文件`custom_ner.txt`示例：
+
+```text
+长津湖/电影类_实体
+收/词汇用语 尾/术语类
+最 大
+海外票仓
+```
+
+以"《长津湖》收尾，北美是最大海外票仓"为例，原本的输出结果为：
+
+```text
+[('《', 'w'), ('长津湖', '作品类_实体'), ('》', 'w'), ('收尾', '场景事件'), ('，', 'w'), ('北美', '世界地区类'), ('是', '肯定词'), ('最大', '修饰词'), ('海外', '场所类'), ('票仓', '词汇用语')]
+```
+
+装载自定义词典及输出结果示例：
+
+```python
+from paddlenlp import Taskflow
+
+my_ner = Taskflow("ner", custom_vocab="custom_ner.txt")
+my_ner("《长津湖》收尾，北美是最大海外票仓")
+>>> [('《', 'w'), ('长津湖', '电影类_实体'), ('》', 'w'), ('收', '词汇用语'), ('尾', '术语类'), ('，', 'w'), ('北美', '世界地区类'), ('是', '肯定词'), ('最', '修饰词'), ('大', '修饰词'), ('海外票仓', '场所类')]
+```
+
+#### 自定义NER模型
+
+用户可以使用自己的数据训练自定义NER模型，参考[NER-WordTag增量训练示例](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/text_to_knowledge/ernie-ctm)。
+
+使用Taskflow加载自定义模型进行一键预测：
+
+```python
+from paddlenlp import Taskflow
+
+my_ner = Taskflow("ner", params_path="/path/to/your/params", tag_path="/path/to/your/tag")
+```
+
 #### 可配置参数说明
 
 * `batch_size`：批处理大小，请结合机器情况进行调整，默认为1。
+* `custom_vocab`：用户自定义词典文件，默认为None。
+* `params_path`：模型参数文件路径，默认为None。
+* `tag_path`：标签文件路径，默认为None。
 
 ### 文本纠错
 
diff --git a/examples/text_to_knowledge/ernie-ctm/README.md b/examples/text_to_knowledge/ernie-ctm/README.md
@@ -81,15 +81,6 @@ data/
 《/w 全球化与中国：理论与发展趋势/作品类_实体 》/w 是/肯定词 2010年/时间类 经济管理出版社/组织机构类 出版/场景事件 的/助词 图书/作品类_概念 ，/w 作者/人物类_概念 是/肯定词 余永定/人物类_实体 、/w 路爱国/人物类_实体 、/w 高海红/人物类_实体 。/w
 ```
 
-WordTag模型使用了**BIOES标注体系**，用户可以在标签文件中（该示例为`tags.txt`）按照该标注体系自定义添加词性或命名实体类别，标签文件示例：
-
-```text
-B-组织机构类_企事业单位
-I-组织机构类_企事业单位
-E-组织机构类_企事业单位
-S-组织机构类_企事业单位
-```
-
 #### 模型训练
 
 ```shell
diff --git a/examples/text_to_knowledge/ernie-ctm/predict.py b/examples/text_to_knowledge/ernie-ctm/predict.py
@@ -78,8 +78,7 @@ def do_predict(data,
         input_ids = paddle.to_tensor(input_ids)
         token_type_ids = paddle.to_tensor(token_type_ids)
         seq_len = paddle.to_tensor(seq_len)
-        logits, _ = model(input_ids, token_type_ids)
-        _, pred_tags = viterbi_decoder(logits, seq_len)
+        pred_tags = model(input_ids, token_type_ids, lengths=seq_len)
         all_pred_tags.extend(pred_tags.numpy().tolist())
     results = decode(data, all_pred_tags, summary_num, idx_to_tags)
     return results
@@ -95,14 +94,9 @@ def do_predict(data,
     tags_to_idx = load_dict(os.path.join(args.data_dir, "tags.txt"))
     idx_to_tags = dict(zip(*(tags_to_idx.values(), tags_to_idx.keys())))
 
-    crf = LinearChainCrf(len(tags_to_idx), 100, with_start_stop_tag=False)
-    viterbi_decoder = ViterbiDecoder(crf.transitions, False)
-
     model = ErnieCtmWordtagModel.from_pretrained(
         "wordtag",
-        num_tag=len(tags_to_idx),
-        num_cls_label=4,
-        ignore_index=tags_to_idx["O"])
+        num_tag=len(tags_to_idx))
     tokenizer = ErnieCtmTokenizer.from_pretrained("wordtag")
 
     if args.params_path and os.path.isfile(args.params_path):
@@ -113,7 +107,7 @@ def do_predict(data,
     results = do_predict(data, 
                          model, 
                          tokenizer, 
-                         viterbi_decoder, 
+                         model.viterbi_decoder, 
                          tags_to_idx, 
                          idx_to_tags,
                          batch_size=args.batch_size)
diff --git a/examples/text_to_knowledge/ernie-ctm/train.py b/examples/text_to_knowledge/ernie-ctm/train.py
@@ -64,17 +64,17 @@ def set_seed(seed):
 
 
 @paddle.no_grad()
-def evaluate(model, metric, criterion, data_loader, tags, tags_to_idx):
+def evaluate(model, metric, data_loader, tags, tags_to_idx):
     model.eval()
     metric.reset()
     losses = []
     for batch in data_loader():
         input_ids, token_type_ids, seq_len, tags = batch
-        seq_logits, _ = model(input_ids,
+        loss, seq_logits = model(input_ids,
                               token_type_ids,
                               lengths=seq_len,
                               tag_labels=tags)
-        loss = criterion(seq_logits, seq_len, tags).mean()
+        loss = loss.mean()
         losses.append(loss.numpy())
         
         correct = metric.compute(
@@ -109,9 +109,9 @@ def do_train(args):
     tokenizer = ErnieCtmTokenizer.from_pretrained("wordtag")
     model = ErnieCtmWordtagModel.from_pretrained(
         "wordtag",
-        num_tag=len(tags_to_idx),
-        num_cls_label=4,
-        ignore_index=tags_to_idx["O"])
+        num_tag=len(tags_to_idx))
+    model.crf_loss = LinearChainCrfLoss(
+        LinearChainCrf(len(tags_to_idx), 0.1, with_start_stop_tag=False))
 
     trans_func = partial(
         convert_example,
@@ -170,9 +170,6 @@ def do_train(args):
     logger.info("WarmUp steps: %s" % warmup)
 
     metric = SequenceAccuracy()
-    crf_lr = 0.1
-    crf = LinearChainCrf(len(tags_to_idx), crf_lr, with_start_stop_tag=False)
-    criterion = LinearChainCrfLoss(crf)
 
     total_loss = 0
     global_step = 0
@@ -185,12 +182,11 @@ def do_train(args):
             global_step += 1
             input_ids, token_type_ids, seq_len, tags = batch
 
-            seq_logits, _ = model(
+            loss, _ = model(
                 input_ids,
                 token_type_ids,
                 lengths=seq_len,
                 tag_labels=tags)
-            loss = criterion(seq_logits, seq_len, tags)
             loss = loss.mean()
             total_loss += loss
             loss.backward()
@@ -219,7 +215,7 @@ def do_train(args):
                 model_to_save.save_pretrained(output_dir)
                 tokenizer.save_pretrained(output_dir)
 
-        evaluate(model, metric, criterion, dev_data_loader, tags, tags_to_idx)
+        evaluate(model, metric, dev_data_loader, tags, tags_to_idx)
 
 
 def print_arguments(args):
diff --git a/paddlenlp/taskflow/knowledge_mining.py b/paddlenlp/taskflow/knowledge_mining.py
@@ -128,7 +128,7 @@
           from paddlenlp import Taskflow 
 
           # 默认使用WordTag词类知识标注工具
-          wordtag = Taskflow("knowledge_mining")
+          wordtag = Taskflow("knowledge_mining", model="wordtag")
           wordtag("《孤女》是2010年九州出版社出版的小说，作者是余兼羽")
           '''
           [{'text': '《孤女》是2010年九州出版社出版的小说，作者是余兼羽', 'items': [{'item': '《', 'offset': 0, 'wordtag_label': 'w', 'length': 1}, {'item': '孤女', 'offset': 1, 'wordtag_label': '作品类_实体', 'length': 2}, {'item': '》', 'offset': 3, 'wordtag_label': 'w', 'length': 1}, {'item': '是', 'offset': 4, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '2010年', 'offset': 5, 'wordtag_label': '时间类', 'length': 5, 'termid': '时间阶段_cb_2010年'}, {'item': '九州出版社', 'offset': 10, 'wordtag_label': '组织机构类', 'length': 5, 'termid': '组织机构_eb_九州出版社'}, {'item': '出版', 'offset': 15, 'wordtag_label': '场景事件', 'length': 2, 'termid': '场景事件_cb_出版'}, {'item': '的', 'offset': 17, 'wordtag_label': '助词', 'length': 1, 'termid': '助词_cb_的'}, {'item': '小说', 'offset': 18, 'wordtag_label': '作品类_概念', 'length': 2, 'termid': '小说_cb_小说'}, {'item': '，', 'offset': 20, 'wordtag_label': 'w', 'length': 1}, {'item': '作者', 'offset': 21, 'wordtag_label': '人物类_概念', 'length': 2, 'termid': '人物_cb_作者'}, {'item': '是', 'offset': 23, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '余兼羽', 'offset': 24, 'wordtag_label': '人物类_实体', 'length': 3}]}]
@@ -207,8 +207,6 @@ def __init__(self,
         self._termtree = TermTree.from_dir(term_schema_path, term_data_path,
                                            self._linking)
         
-        self.crf = LinearChainCrf(len(self._tags_to_index), 100, with_start_stop_tag=False)
-        self._viterbi_decoder = ViterbiDecoder(self.crf.transitions, False)
         self._usage = usage
         self._summary_num = 2
 
@@ -510,6 +508,9 @@ def _construct_input_spec(self):
             paddle.static.InputSpec(shape=[None, None],
                                     dtype="int64",
                                     name="token_type_ids"),  # token_type_ids
+            paddle.static.InputSpec(shape=[None],
+                                    dtype="int64",
+                                    name="seq_len"),  # seq_len
         ]
 
     def _construct_model(self, model):
@@ -518,9 +519,7 @@ def _construct_model(self, model):
         """
         model_instance = ErnieCtmWordtagModel.from_pretrained(
             model,
-            num_cls_label=4,
-            num_tag=len(self._tags_to_index),
-            ignore_index=self._tags_to_index["O"])
+            num_tag=len(self._tags_to_index))
         config_keys = ErnieCtmWordtagModel.pretrained_init_configuration[
             self.model]
         self.kwargs.update(config_keys)
@@ -554,11 +553,10 @@ def _run_model(self, inputs):
             input_ids, token_type_ids, seq_len = batch
             self.input_handles[0].copy_from_cpu(input_ids.numpy())
             self.input_handles[1].copy_from_cpu(token_type_ids.numpy())
+            self.input_handles[2].copy_from_cpu(seq_len.numpy())
             self.predictor.run()
-            logits = self.output_handle[0].copy_to_cpu()
-            score, pred_tags = self._viterbi_decoder(
-                paddle.to_tensor(logits), seq_len)
-            all_pred_tags.extend(pred_tags.numpy().tolist())
+            pred_tags = self.output_handle[0].copy_to_cpu()
+            all_pred_tags.extend(pred_tags.tolist())
         inputs['all_pred_tags'] = all_pred_tags
         return inputs
 
diff --git a/paddlenlp/taskflow/named_entity_recognition.py b/paddlenlp/taskflow/named_entity_recognition.py
@@ -24,6 +24,7 @@
 from .utils import download_file
 from .utils import TermTree
 from .knowledge_mining import WordTagTask
+from .utils import Customization
 
 usage = r"""
           from paddlenlp import Taskflow 
@@ -35,8 +36,7 @@
           '''
 
           ner = Taskflow("ner")
-          ner(["热梅茶是一道以梅子为主要原料制作的茶饮",
-               "《孤女》是2010年九州出版社出版的小说，作者是余兼羽"])
+          ner(["热梅茶是一道以梅子为主要原料制作的茶饮", "《孤女》是2010年九州出版社出版的小说，作者是余兼羽"])
           '''
           [[('热梅茶', '饮食类_饮品'), ('是', '肯定词'), ('一道', '数量词'), ('以', '介词'), ('梅子', '饮食类'), ('为', '肯定词'), ('主要原料', '物体类'), ('制作', '场景事件'), ('的', '助词'), ('茶饮', '饮食类_饮品')], [('《', 'w'), ('孤女', '作品类_实体'), ('》', 'w'), ('是', '肯定词'), ('2010年', '时间类'), ('九州出版社', '组织机构类'), ('出版', '场景事件'), ('的', '助词'), ('小说', '作品类_概念'), ('，', 'w'), ('作者', '人物类_概念'), ('是', '肯定词'), ('余兼羽', '人物类_实体')]]
           '''
@@ -56,6 +56,13 @@ class NERTask(WordTagTask):
 
     def __init__(self, model, task, **kwargs):
         super().__init__(model=model, task=task, **kwargs)
+        self._custom_vocab = self.kwargs[
+            'custom_vocab'] if 'custom_vocab' in self.kwargs else None
+        if self._custom_vocab:
+            self._custom = Customization()
+            self._custom.load_customization(self._custom_vocab)
+        else:
+            self._custom = None
 
     def _decode(self, batch_texts, batch_pred_tags):
         batch_results = []
@@ -65,7 +72,8 @@ def _decode(self, batch_texts, batch_pred_tags):
                 for index in batch_pred_tags[sent_index][self.summary_num:-1]
             ]
             sent = batch_texts[sent_index]
-            
+            if self._custom:
+                self._custom.parse_customization(sent, tags, prefix=True)
             sent_out = []
             tags_out = []
             partial_word = ""
diff --git a/paddlenlp/taskflow/utils.py b/paddlenlp/taskflow/utils.py
@@ -694,7 +694,7 @@ def load_customization(self, filename, sep=None):
                 self.dictitem[phrase] = (tags, offset)
                 self.ac.add_word(phrase)
 
-    def parse_customization(self, query, lac_tags):
+    def parse_customization(self, query, lac_tags, prefix=False):
         """Use custom vocab to modify the lac results"""
         if not self.ac:
             logging.warning("customization dict is not load")
@@ -706,16 +706,30 @@ def parse_customization(self, query, lac_tags):
             index = begin
 
             tags, offsets = self.dictitem[phrase]
-            for tag, offset in zip(tags, offsets):
-                while index < begin + offset:
-                    if len(tag) == 0:
-                        lac_tags[index] = lac_tags[index][:-1] + 'I'
-                    else:
-                        lac_tags[index] = tag + "-I"
-                    index += 1
-
-            lac_tags[begin] = lac_tags[begin][:-1] + 'B'
-            for offset in offsets:
-                index = begin + offset
-                if index < len(lac_tags):
-                    lac_tags[index] = lac_tags[index][:-1] + 'B'
+            
+            if prefix:
+                for tag, offset in zip(tags, offsets):
+                    while index < begin + offset:
+                        if len(tag) == 0:
+                            lac_tags[index] = "I" + lac_tags[index][1:]
+                        else:
+                            lac_tags[index] = "I-" + tag
+                        index += 1
+                lac_tags[begin] = "B" + lac_tags[begin][1:]
+                for offset in offsets:
+                    index = begin + offset
+                    if index < len(lac_tags):
+                        lac_tags[index] = "B" + lac_tags[index][1:]
+            else:
+                for tag, offset in zip(tags, offsets):
+                    while index < begin + offset:
+                        if len(tag) == 0:
+                            lac_tags[index] = lac_tags[index][:-1] + "I"
+                        else:
+                            lac_tags[index] = tag + "-I"
+                        index += 1
+                lac_tags[begin] = lac_tags[begin][:-1] + "B"
+                for offset in offsets:
+                    index = begin + offset
+                    if index < len(lac_tags):
+                        lac_tags[index] = lac_tags[index][:-1] + "B"
diff --git a/paddlenlp/transformers/ernie_ctm/modeling.py b/paddlenlp/transformers/ernie_ctm/modeling.py