Support segmented inputs for ddparser (#1351)

linjieccc · web-flow · commit e27ac6a04664 · 2021-12-09T17:28:21.000+08:00
* Support segmented inputs for ddparser

* Update usage

* Update ddparser segmode

* Update README.md

* Update README.md

* Update taskflow.md
diff --git a/docs/model_zoo/taskflow.md b/docs/model_zoo/taskflow.md
@@ -207,6 +207,8 @@ corrector(['遇到逆竟时，我们必须勇于面对，而且要愈挫愈勇
 
 ### 句法分析
 
+未分词输入:
+
 ```python
 from paddlenlp import Taskflow
 
@@ -216,21 +218,33 @@ ddp("9月9日上午纳达尔在亚瑟·阿什球场击败俄罗斯球员梅德
 
 ddp(["9月9日上午纳达尔在亚瑟·阿什球场击败俄罗斯球员梅德韦杰夫", "他送了一本书"])
 >>> [{'word': ['9月9日', '上午', '纳达尔', '在', '亚瑟·阿什球场', '击败', '俄罗斯', '球员', '梅德韦杰夫'], 'head': [2, 6, 6, 5, 6, 0, 8, 9, 6], 'deprel': ['ATT', 'ADV', 'SBV', 'MT', 'ADV', 'HED', 'ATT', 'ATT', 'VOB']}, {'word': ['他', '送', '了', '一本', '书'], 'head': [2, 0, 2, 5, 2], 'deprel': ['SBV', 'HED', 'MT', 'ATT', 'VOB']}]
+```
 
-# 输出概率值和词性标签
+输出概率值和词性标签:
+
+```python
 ddp = Taskflow("dependency_parsing", prob=True, use_pos=True)
 ddp("9月9日上午纳达尔在亚瑟·阿什球场击败俄罗斯球员梅德韦杰夫")
 >>> [{'word': ['9月9日', '上午', '纳达尔', '在', '亚瑟·阿什', '球场', '击败', '俄罗斯', '球员', '梅德韦杰夫'], 'head': [2, 7, 7, 6, 6, 7, 0, 9, 10, 7], 'deprel': ['ATT', 'ADV', 'SBV', 'MT', 'ATT', 'ADV', 'HED', 'ATT', 'ATT', 'VOB'], 'postag': ['TIME', 'TIME', 'PER', 'p', 'PER', 'n', 'v', 'LOC', 'n', 'PER'], 'prob': [0.79, 0.98, 1.0, 0.49, 0.97, 0.86, 1.0, 0.85, 0.97, 0.99]}]
+```
+
+使用ddparser-ernie-1.0进行预测:
 
-# 使用ddparser-ernie-1.0进行预测
+```python
 ddp = Taskflow("dependency_parsing", model="ddparser-ernie-1.0")
 ddp("9月9日上午纳达尔在亚瑟·阿什球场击败俄罗斯球员梅德韦杰夫")
 >>> [{'word': ['9月9日', '上午', '纳达尔', '在', '亚瑟·阿什球场', '击败', '俄罗斯', '球员', '梅德韦杰夫'], 'head': [2, 6, 6, 5, 6, 0, 8, 9, 6], 'deprel': ['ATT', 'ADV', 'SBV', 'MT', 'ADV', 'HED', 'ATT', 'ATT', 'VOB']}]
 ```
 
-#### 依存关系可视化
+使用分词结果来输入:
 
-句法树可视化示例：
+```python
+ddp = Taskflow("dependency_parsing")
+ddp.from_segments([['9月9日', '上午', '纳达尔', '在', '亚瑟·阿什球场', '击败', '俄罗斯', '球员', '梅德韦杰夫']])
+>>> [{'word': ['9月9日', '上午', '纳达尔', '在', '亚瑟·阿什球场', '击败', '俄罗斯', '球员', '梅德韦杰夫'], 'head': [2, 6, 6, 5, 6, 0, 8, 9, 6], 'deprel': ['ATT', 'ADV', 'SBV', 'MT', 'ADV', 'HED', 'ATT', 'ATT', 'VOB']}]
+```
+
+#### 依存关系可视化
 
 ```python
 from paddlenlp import Taskflow
@@ -241,6 +255,25 @@ import cv2
 cv2.imwrite('test.png', result)
 ```
 
+#### 标注关系说明
+
+| Label |  关系类型  | 说明                     | 示例                           |
+| :---: | :--------: | :----------------------- | :----------------------------- |
+|  SBV  |  主谓关系  | 主语与谓词间的关系       | 他送了一本书(他<--送)          |
+|  VOB  |  动宾关系  | 宾语与谓词间的关系       | 他送了一本书(送-->书)          |
+|  POB  |  介宾关系  | 介词与宾语间的关系       | 我把书卖了(把-->书)            |
+|  ADV  |  状中关系  | 状语与中心词间的关系     | 我昨天买书了(昨天<--买)        |
+|  CMP  |  动补关系  | 补语与中心词间的关系     | 我都吃完了(吃-->完)            |
+|  ATT  |  定中关系  | 定语与中心词间的关系     | 他送了一本书(一本<--书)        |
+|   F   |  方位关系  | 方位词与中心词的关系     | 在公园里玩耍(公园-->里)        |
+|  COO  |  并列关系  | 同类型词语间关系        | 叔叔阿姨(叔叔-->阿姨)          |
+|  DBL  |  兼语结构  | 主谓短语做宾语的结构     | 他请我吃饭(请-->我，请-->吃饭) |
+|  DOB  | 双宾语结构 | 谓语后出现两个宾语       | 他送我一本书(送-->我，送-->书) |
+|  VV   |  连谓结构  | 同主语的多个谓词间关系   | 他外出吃饭(外出-->吃饭)        |
+|  IC   |  子句结构  | 两个结构独立或关联的单句  | 你好，书店怎么走？(你好<--走)  |
+|  MT   |  虚词成分  | 虚词与中心词间的关系     | 他送了一本书(送-->了)          |
+|  HED  |  核心关系  | 指整个句子的核心         | 他送了一本书(送)               |
+
 #### 可配置参数说明
 
 * `batch_size`：批处理大小，请结合机器情况进行调整，默认为1。
@@ -250,6 +283,7 @@ cv2.imwrite('test.png', result)
 * `use_cuda`：是否使用GPU进行切词，默认为False。
 * `return_visual`：是否返回句法树的可视化结果，默认为False。
 
+
 ### 情感分析
 
 ```python
@@ -384,4 +418,4 @@ from paddlenlp import Taskflow
 
 ner = Taskflow("ner", home_path="/workspace")
 ```
-通过以上方式即可将ner任务相关文件保存至`/workspace`路径下。
+通过以上方式即可将ner任务相关文件保存至`/workspace`路径下。
diff --git a/paddlenlp/taskflow/dependency_parsing.py b/paddlenlp/taskflow/dependency_parsing.py
@@ -43,33 +43,43 @@
            from paddlenlp import Taskflow 
 
            ddp = Taskflow("dependency_parsing")
-           ddp("百度是一家高科技公司")
+           ddp("三亚是一座美丽的城市")
            '''
-           [{'word': ['百度', '是', '一家', '高科技', '公司'], 'head': ['2', '0', '5', '5', '2'], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'VOB']}]
+           [{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 'VOB']}]
            '''
-           ddp(["百度是一家高科技公司", "他送了一本书"])
-           '''
-           [{'word': ['百度', '是', '一家', '高科技', '公司'], 'head': ['2', '0', '5', '5', '2'], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'VOB']}, {'word': ['他', '送', '了', '一本', '书'], 'head': ['2', '0', '2', '5', '2'], 'deprel': ['SBV', 'HED', 'MT', 'ATT', 'VOB']}]
+           ddp(["三亚是一座美丽的城市", "他送了一本书"])
            '''
+           [{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 'VOB']}, {'word': ['他', '送', '了', '一本', '书'], 'head': [2, 0, 2, 5, 2], 'deprel': ['SBV', 'HED', 'MT', 'ATT', 'VOB']}]
+           '''
 
            ddp = Taskflow("dependency_parsing", prob=True, use_pos=True)
-           ddp("百度是一家高科技公司")
+           ddp("三亚是一座美丽的城市")
            '''
-           [{'word': ['百度', '是', '一家', '高科技', '公司'], 'postag': ['ORG', 'v', 'm', 'n', 'n'], 'head': ['2', '0', '5', '5', '2'], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'VOB'], 'prob': [1.0, 1.0, 1.0, 1.0, 1.0]}]
+           [{'word': ['三亚', '是', '一座', '美丽的城市'], 'head': [2, 0, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'VOB'], 'postag': ['LOC', 'v', 'm', 'n'], 'prob': [1.0, 1.0, 1.0, 1.0]}]
            '''
 
            ddp = Taskflow("dependency_parsing", model="ddparser-ernie-1.0")
-           ddp("百度是一家高科技公司")
+           ddp("三亚是一座美丽的城市")
            '''
-           [{'word': ['百度', '是', '一家', '高科技', '公司'], 'head': ['2', '0', '5', '5', '2'], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'VOB']}]
+           [{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 'VOB']}]
            '''
 
            ddp = Taskflow("dependency_parsing", model="ddparser-ernie-gram-zh")
-           ddp("百度是一家高科技公司")
+           ddp("三亚是一座美丽的城市")
            '''
-           [{'word': ['百度', '是', '一家', '高科技', '公司'], 'head': ['2', '0', '5', '5', '2'], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'VOB']}]
+           [{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 'VOB']}]
            '''
 
+           # 已分词输入
+           ddp = Taskflow("dependency_parsing")
+           ddp.from_segments([["三亚", "是", "一座", "美丽", "的", "城市"]])
+           '''
+           [{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 'VOB']}]
+           '''
+           ddp.from_segments([['三亚', '是', '一座', '美丽', '的', '城市'], ['他', '送', '了', '一本', '书']])
+           '''
+           [{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 'VOB']}, {'word': ['他', '送', '了', '一本', '书'], 'head': [2, 0, 2, 5, 2], 'deprel': ['SBV', 'HED', 'MT', 'ATT', 'VOB']}]
+           '''
          """
 
 
@@ -83,7 +93,7 @@ class DDParserTask(Task):
         prob(bool): Whether to return the probability of predicted heads.
         use_pos(bool): Whether to return the postag.
         batch_size(int): Numbers of examples a batch.
-        return_visual(bool): If set True, the result will contain the dependency visualization.
+        return_visual(bool): If True, the result will contain the dependency visualization.
         kwargs (dict, optional): Additional keyword arguments passed along to the specific task. 
     """
 
@@ -141,12 +151,27 @@ def __init__(self,
 
         self.use_cuda = use_cuda
         self.lac = LAC(mode="lac" if self.use_pos else "seg",
-                       use_cuda=self.use_cuda)
+                    use_cuda=self.use_cuda)
         if self.static_mode:
             self._get_inference_model()
         else:
             self._construct_model(model)
 
+    def _check_segmented_words(self, inputs):
+        inputs = inputs[0]
+        if not all([isinstance(i, list) and i and all(i) for i in inputs]):
+            raise TypeError("Invalid input format.")
+        return inputs    
+
+    def from_segments(self, segmented_words):
+        segmented_words = self._check_segmented_words(segmented_words)
+        inputs = {}
+        inputs['words'] = segmented_words
+        inputs = self._preprocess_words(inputs)
+        outputs = self._run_model(inputs)
+        results = self._postprocess(outputs)
+        return results
+
     def _construct_input_spec(self):
         """
         Construct the input spec for the convert dygraph model to static model.
@@ -182,55 +207,61 @@ def _construct_tokenizer(self, model):
         """
         return None
 
+    def _preprocess_words(self, inputs):
+        examples = []
+        for text in inputs['words']:
+            example = {"FORM": text}
+            example = convert_example(
+                example,
+                vocabs=[self.word_vocab, self.rel_vocab])
+            examples.append(example)
+
+        batches = [
+            examples[idx:idx + self.batch_size]
+            for idx in range(0, len(examples), self.batch_size)
+        ]
+
+        def batchify_fn(batch):
+            raw_batch = [raw for raw in zip(*batch)]
+            batch = [pad_sequence(data) for data in raw_batch]
+            return batch
+
+        batches = [flat_words(batchify_fn(batch)[0]) for batch in batches]
+
+        inputs['data_loader'] = batches
+        return inputs
+
     def _preprocess(self, inputs):
         """
         Transform the raw text to the model inputs, two steps involved:
            1) Transform the raw text to token ids.
            2) Generate the other model inputs from the raw text and token ids.
         """
-        inputs = self._check_input_text(inputs)
+
         # Get the config from the kwargs
         num_workers = self.kwargs[
             'num_workers'] if 'num_workers' in self.kwargs else 0
         lazy_load = self.kwargs[
             'lazy_load'] if 'lazy_load' in self.kwargs else False
 
+        outputs = {}
+
         lac_results = []
         position = 0
 
+        inputs = self._check_input_text(inputs)
         while position < len(inputs):
             lac_results += self.lac.run(inputs[position:position +
-                                               self.batch_size])
+                                            self.batch_size])
             position += self.batch_size
 
-        outputs = {}
         if not self.use_pos:
             outputs['words'] = lac_results
         else:
             outputs['words'], outputs[
                 'postags'] = [raw for raw in zip(*lac_results)]
 
-        examples = []
-        for text in outputs['words']:
-            example = {"FORM": text, }
-            example = convert_example(
-                example,
-                vocabs=[self.word_vocab, self.rel_vocab], )
-            examples.append(example)
-
-        batches = [
-            examples[idx:idx + self.batch_size]
-            for idx in range(0, len(examples), self.batch_size)
-        ]
-
-        def batchify_fn(batch):
-            raw_batch = [raw for raw in zip(*batch)]
-            batch = [pad_sequence(data) for data in raw_batch]
-            return batch
-
-        batches = [flat_words(batchify_fn(batch)[0]) for batch in batches]
-
-        outputs['data_loader'] = batches
+        outputs = self._preprocess_words(outputs)
         return outputs
 
     def _run_model(self, inputs):
diff --git a/paddlenlp/taskflow/taskflow.py b/paddlenlp/taskflow/taskflow.py
@@ -243,3 +243,7 @@ def tasks():
         """
         task_list = list(TASKS.keys())
         return task_list
+
+    def from_segments(self, *inputs):
+        results = self.task_instance.from_segments(inputs)
+        return results