# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
import hashlib
import os

from paddle.dataset.common import md5file
from paddlenlp.utils.downloader import get_path_from_url, _decompress
from paddlenlp.utils.env import DATA_HOME
from paddlenlp.utils.log import logger
from . import DatasetBuilder


class CnnDailymail(DatasetBuilder):
    """
    CNN/DailyMail non-anonymized summarization dataset.
    The CNN / DailyMail Dataset is an English-language dataset containing
    just over 300k unique news articles as written by journalists at CNN
    and the Daily Mail. The current version supports both extractive and
    abstractive summarization, though the original version was created
    for machine reading and comprehension and abstractive question answering.

    Version 1.0.0 aimed to support supervised neural methodologies for machine
    reading and question answering with a large amount of real natural language
    training data and released about 313k unique articles and nearly 1M Cloze
    style questions to go with the articles.
    Versions 2.0.0 and 3.0.0 changed the structure of the dataset to support
    summarization rather than question answering. Version 3.0.0 provided a
    non-anonymized version of the data, whereas both the previous versions were
    preprocessed to replace named entities with unique identifier labels.

    An updated version of the code that does not anonymize the data is available
    at https://github.com/abisee/cnn-dailymail.
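
    Example:
        A minimal usage sketch; it assumes this builder is exposed through
        ``paddlenlp.datasets.load_dataset`` under the name "cnn_dailymail"::

            from paddlenlp.datasets import load_dataset

            train_ds, dev_ds = load_dataset("cnn_dailymail", splits=("train", "dev"))
            print(train_ds[0]["article"])
            print(train_ds[0]["highlights"])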
    """
    lazy = False
    META_INFO = collections.namedtuple("META_INFO", ("file", "url", "md5"))
    SPLITS = {
        "train": META_INFO(
            "all_train.txt",
            "https://paddlenlp.bj.bcebos.com/datasets/cnn_dailymail/all_train.txt",
            "c8ca98cfcb6cf3f99a404552568490bc"),
        "dev": META_INFO(
            "all_val.txt",
            "https://paddlenlp.bj.bcebos.com/datasets/cnn_dailymail/all_val.txt",
            "83a3c483b3ed38b1392285bed668bfee"),
        "test": META_INFO(
            "all_test.txt",
            "https://paddlenlp.bj.bcebos.com/datasets/cnn_dailymail/all_test.txt",
            "4f3ac04669934dbc746b7061e68a0258")
    }
    cnn_dailymail = {
        "cnn": {
            "url":
            "https://paddlenlp.bj.bcebos.com/datasets/cnn_dailymail/cnn_stories.tgz",
            "md5": "85ac23a1926a831e8f46a6b8eaf57263",
            "file_num": 92579
        },
        "dailymail": {
            "url":
            "https://paddlenlp.bj.bcebos.com/datasets/cnn_dailymail/dailymail_stories.tgz",
            "md5": "f9c5f565e8abe86c38bfa4ae8f96fd72",
            "file_num": 219506
        }
    }

    def _read_text_file(self, text_file):
        lines = []
        with open(text_file, "r", encoding="utf8") as f:
            for line in f:
                lines.append(line.strip())
        return lines

    def _get_url_hashes(self, path):
        """Get hashes of urls in file."""
        urls = self._read_text_file(path)

        def url_hash(u):
            h = hashlib.sha1()
            try:
                u = u.encode("utf-8")
            except UnicodeEncodeError:
                # str.encode raises UnicodeEncodeError (not UnicodeDecodeError)
                # in Python 3.
                logger.error("Cannot hash url: %s", u)
            h.update(u)
            return h.hexdigest()

        return {url_hash(u): True for u in urls}

    def _get_hash_from_path(self, p):
        """Extract hash from path."""
        basename = os.path.basename(p)
        return basename[0:basename.find(".story")]
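
    # Together, the two helpers above implement the corpus naming scheme:
    # each .story file is named with the SHA-1 hex digest of its source URL.
    # A small illustration (the URL itself is made up):
    #
    #   import hashlib
    #   url = "https://www.cnn.com/2015/.../index.html"
    #   name = hashlib.sha1(url.encode("utf-8")).hexdigest() + ".story"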

    def _find_files(self, dl_paths, publisher, url_dict):
        """Find files corresponding to urls."""
        if publisher == "cnn":
            top_dir = os.path.join(dl_paths["cnn"], "stories")
        elif publisher == "dailymail":
            top_dir = os.path.join(dl_paths["dailymail"], "stories")
        else:
            # Raise instead of only logging: without a valid top_dir the
            # os.listdir call below would fail with a NameError.
            raise ValueError("Unsupported publisher: %s" % publisher)
        files = sorted(os.listdir(top_dir))

        ret_files = []
        for p in files:
            if self._get_hash_from_path(p) in url_dict:
                ret_files.append(os.path.join(top_dir, p))
        return ret_files

    def _subset_filenames(self, dl_paths, split):
        """Get filenames for a particular split."""
        urls = self._get_url_hashes(dl_paths[split])
        cnn = self._find_files(dl_paths, "cnn", urls)
        dm = self._find_files(dl_paths, "dailymail", urls)
        return cnn + dm

    def _get_art_abs(self, story_file, version):
        """Get abstract (highlights) and article from a story file path."""
        # Based on https://github.com/abisee/cnn-dailymail/blob/master/
        # make_datafiles.py

        lines = self._read_text_file(story_file)

        # The original GitHub code lowercased the text; that step was removed
        # in version 3.0.0.

        # Put periods on the ends of lines that are missing them
        # (this is a problem in the dataset because many image captions don't
        # end in periods; consequently they end up in the body of the article
        # as run-on sentences).
        def fix_missing_period(line):
            """Adds a period to a line that is missing a period."""
            if "@highlight" in line:
                return line
            if not line:
                return line
            if line[-1] in [
                    ".", "!", "?", "...", "'", "`", '"', "\u2019", "\u201d", ")"
            ]:
                return line
            return line + " ."

        lines = [fix_missing_period(line) for line in lines]

        # Separate out article and abstract sentences
        article_lines = []
        highlights = []
        next_is_highlight = False
        for line in lines:
            if not line:
                continue  # empty line
            elif line.startswith("@highlight"):
                next_is_highlight = True
            elif next_is_highlight:
                highlights.append(line)
            else:
                article_lines.append(line)

        # Make article into a single string
        article = " ".join(article_lines)

        if version >= "2.0.0":
            abstract = "\n".join(highlights)
        else:
            abstract = " ".join(highlights)

        return article, abstract
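
    # For reference, a raw .story file interleaves article text with
    # "@highlight" markers; a tiny made-up sample:
    #
    #   (CNN) -- First article sentence .
    #   Second article sentence .
    #
    #   @highlight
    #
    #   First summary point
    #
    #   @highlight
    #
    #   Second summary point
    #
    # _get_art_abs joins the article lines with spaces and, for versions
    # >= 2.0.0, joins the highlights with newlines.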

    def _get_data(self, mode):
        """Check and download the dataset."""
        dl_paths = {}
        version = self.config.get("version", "3.0.0")
        if version not in ["1.0.0", "2.0.0", "3.0.0"]:
            raise ValueError("Unsupported version: %s" % version)
        dl_paths["version"] = version
        default_root = os.path.join(DATA_HOME, self.__class__.__name__)
        for k, v in self.cnn_dailymail.items():
            dir_path = os.path.join(default_root, k)
            if not os.path.exists(dir_path):
                get_path_from_url(v["url"], default_root, v["md5"])
            file_num = len(os.listdir(os.path.join(dir_path, "stories")))
            if file_num != v["file_num"]:
                logger.warning(
                    "Number of %s stories is %d != %d, decompress again." %
                    (k, file_num, v["file_num"]))
                _decompress(
                    os.path.join(default_root, os.path.basename(v["url"])))
            dl_paths[k] = dir_path
        filename, url, data_hash = self.SPLITS[mode]
        fullname = os.path.join(default_root, filename)
        if not os.path.exists(fullname) or (data_hash and
                                            not md5file(fullname) == data_hash):
            get_path_from_url(url, default_root, data_hash)
        dl_paths[mode] = fullname
        return dl_paths
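
    # Shape of the returned mapping, sketched for illustration only (the
    # actual prefix depends on DATA_HOME):
    #   {
    #       "version": "3.0.0",
    #       "cnn": ".../CnnDailymail/cnn",
    #       "dailymail": ".../CnnDailymail/dailymail",
    #       "train": ".../CnnDailymail/all_train.txt",
    #   }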

    def _read(self, dl_paths, split):
        files = self._subset_filenames(dl_paths, split)
        for p in files:
            article, highlights = self._get_art_abs(p, dl_paths["version"])
            if not article or not highlights:
                continue
            yield {
                "article": article,
                "highlights": highlights,
                "id": self._get_hash_from_path(p),
            }