# coding=utf-8
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""CNN/DailyMail Summarization dataset, non-anonymized version."""

import hashlib
import os

import datasets

logger = datasets.logging.get_logger(__name__)

_DESCRIPTION = """\
CNN/DailyMail non-anonymized summarization dataset.

There are two features:
  - article: text of news article, used as the document to be summarized
  - highlights: joined text of highlights with <s> and </s> around each
    highlight, which is the target summary
"""
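
# A minimal usage sketch (illustrative, not part of the loader): assuming this
# script is saved locally, e.g. as ``cnn_dailymail.py``, and that the mirror
# URLs in ``_DL_URLS`` below are reachable, the dataset could be loaded with
# something like:
#
#   import datasets
#   ds = datasets.load_dataset("path/to/cnn_dailymail.py", "3.0.0", split="train")
#   print(ds[0]["article"][:200])
#   print(ds[0]["highlights"])
#
# The exact call depends on the installed ``datasets`` version.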

# The second citation introduces the source data, while the first
# introduces the specific form (non-anonymized) we use here.
_CITATION = """\
@article{DBLP:journals/corr/SeeLM17,
  author    = {Abigail See and
               Peter J. Liu and
               Christopher D. Manning},
  title     = {Get To The Point: Summarization with Pointer-Generator Networks},
  journal   = {CoRR},
  volume    = {abs/1704.04368},
  year      = {2017},
  url       = {http://arxiv.org/abs/1704.04368},
  archivePrefix = {arXiv},
  eprint    = {1704.04368},
  timestamp = {Mon, 13 Aug 2018 16:46:08 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/SeeLM17},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{hermann2015teaching,
  title={Teaching machines to read and comprehend},
  author={Hermann, Karl Moritz and Kocisky, Tomas and Grefenstette, Edward and Espeholt, Lasse and Kay, Will and Suleyman, Mustafa and Blunsom, Phil},
  booktitle={Advances in neural information processing systems},
  pages={1693--1701},
  year={2015}
}
"""

_DL_URLS = {
    # pylint: disable=line-too-long
    "cnn_stories":
    "https://bj.bcebos.com/paddlenlp/datasets/cnn_dailymail/cnn_stories.tgz",
    "dm_stories":
    "https://bj.bcebos.com/paddlenlp/datasets/cnn_dailymail/dailymail_stories.tgz",
    "test_urls":
    "https://bj.bcebos.com/paddlenlp/datasets/cnn_dailymail/all_test.txt",
    "train_urls":
    "https://bj.bcebos.com/paddlenlp/datasets/cnn_dailymail/all_train.txt",
    "val_urls":
    "https://bj.bcebos.com/paddlenlp/datasets/cnn_dailymail/all_val.txt",
    # pylint: enable=line-too-long
}
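
# Note (based on the ``datasets`` DownloadManager API): calling
# ``dl_manager.download_and_extract(_DL_URLS)`` in ``_split_generators`` below
# returns a dict with the same keys, mapping each entry to a local path: the
# two ``*_stories`` archives to their extracted directories and the three
# ``*_urls`` entries to plain text files listing article URLs, one per line.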

_HIGHLIGHTS = "highlights"
_ARTICLE = "article"

_SUPPORTED_VERSIONS = [
    # Using cased version.
    datasets.Version("3.0.0", "Using cased version."),
    # Same data as 0.0.2
    datasets.Version("1.0.0", ""),
    # Having the model predict newline separators makes it easier to evaluate
    # using summary-level ROUGE.
    datasets.Version("2.0.0", "Separate target sentences with newline."),
]

_DEFAULT_VERSION = datasets.Version("3.0.0", "Using cased version.")


class CnnDailymailConfig(datasets.BuilderConfig):
    """BuilderConfig for CnnDailymail."""

    def __init__(self, **kwargs):
        """BuilderConfig for CnnDailymail.

        Args:
            **kwargs: keyword arguments forwarded to super.
        """
        super().__init__(**kwargs)


def _get_url_hashes(path):
    """Get hashes of urls in file."""
    urls = _read_text_file(path)

    def url_hash(u):
        """Return the SHA-1 hex digest of a url string."""
        h = hashlib.sha1()
        try:
            u = u.encode("utf-8")
        except UnicodeEncodeError:
            # ``str.encode`` raises UnicodeEncodeError (not UnicodeDecodeError)
            # when the text cannot be encoded.
            logger.error("Cannot hash url: %s", u)
        h.update(u)
        return h.hexdigest()

    return {url_hash(u): True for u in urls}


def _get_hash_from_path(p):
    """Extract hash from path."""
    basename = os.path.basename(p)
    return basename[0:basename.find(".story")]


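# How the two helpers above fit together (based on the upstream CNN/DailyMail
# release): each ``.story`` file is named after the SHA-1 hex digest of its
# source article URL. ``_get_url_hashes`` therefore yields the set of digests
# for a split's URL list, and ``_get_hash_from_path`` recovers the digest from
# a filename of the form ``<40-hex-char-digest>.story`` so the two can be
# matched in ``_find_files`` below.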
def _find_files(dl_paths, publisher, url_dict):
    """Find files corresponding to urls."""
    if publisher == "cnn":
        top_dir = os.path.join(dl_paths["cnn_stories"], "cnn", "stories")
    elif publisher == "dm":
        top_dir = os.path.join(dl_paths["dm_stories"], "dailymail", "stories")
    else:
        logger.fatal("Unsupported publisher: %s", publisher)
        raise ValueError("Unsupported publisher: {}".format(publisher))
    files = sorted(os.listdir(top_dir))

    ret_files = []
    for p in files:
        if _get_hash_from_path(p) in url_dict:
            ret_files.append(os.path.join(top_dir, p))
    return ret_files


def _subset_filenames(dl_paths, split):
    """Get filenames for a particular split."""
    assert isinstance(dl_paths, dict), dl_paths
    if split == datasets.Split.TRAIN:
        urls = _get_url_hashes(dl_paths["train_urls"])
    elif split == datasets.Split.VALIDATION:
        urls = _get_url_hashes(dl_paths["val_urls"])
    elif split == datasets.Split.TEST:
        urls = _get_url_hashes(dl_paths["test_urls"])
    else:
        logger.fatal("Unsupported split: %s", split)
        raise ValueError("Unsupported split: {}".format(split))
    cnn = _find_files(dl_paths, "cnn", urls)
    dm = _find_files(dl_paths, "dm", urls)
    return cnn + dm


DM_SINGLE_CLOSE_QUOTE = "\u2019"  # unicode
DM_DOUBLE_CLOSE_QUOTE = "\u201d"
# Acceptable ways to end a sentence.
END_TOKENS = [
    ".", "!", "?", "...", "'", "`", '"', DM_SINGLE_CLOSE_QUOTE,
    DM_DOUBLE_CLOSE_QUOTE, ")"
]


def _read_text_file(text_file):
    lines = []
    with open(text_file, "r", encoding="utf-8") as f:
        for line in f:
            lines.append(line.strip())
    return lines


def _get_art_abs(story_file, tfds_version):
    """Get abstract (highlights) and article from a story file path."""
    # Based on https://github.com/abisee/cnn-dailymail/blob/master/make_datafiles.py

    lines = _read_text_file(story_file)

    # The upstream GitHub code lowercases the text; that step was removed in
    # version 3.0.0 (the "cased" version).

    # Put periods on the ends of lines that are missing them
    # (this is a problem in the dataset because many image captions don't end
    # in periods; consequently they end up in the body of the article as
    # run-on sentences).
    def fix_missing_period(line):
        """Adds a period to a line that is missing a period."""
        if "@highlight" in line:
            return line
        if not line:
            return line
        if line[-1] in END_TOKENS:
            return line
        return line + " ."

    lines = [fix_missing_period(line) for line in lines]

    # Separate out article and abstract sentences.
    article_lines = []
    highlights = []
    next_is_highlight = False
    for line in lines:
        if not line:
            continue  # empty line
        elif line.startswith("@highlight"):
            next_is_highlight = True
        elif next_is_highlight:
            highlights.append(line)
        else:
            article_lines.append(line)

    # Make article into a single string.
    article = " ".join(article_lines)

    if tfds_version >= "2.0.0":
        abstract = "\n".join(highlights)
    else:
        abstract = " ".join(highlights)

    return article, abstract
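
# Shape of a raw ``.story`` file consumed by ``_get_art_abs`` (illustrative
# content, not a real article):
#
#   London (CNN) -- First paragraph of the article body.
#   Second paragraph, without trailing punctuation
#
#   @highlight
#
#   First summary bullet
#
#   @highlight
#
#   Second summary bullet
#
# For version 3.0.0 this would yield roughly:
#   article    == "London (CNN) -- First paragraph of the article body. "
#                 "Second paragraph, without trailing punctuation ."
#   highlights == "First summary bullet .\nSecond summary bullet ."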


class CnnDailymail(datasets.GeneratorBasedBuilder):
    """CNN/DailyMail non-anonymized summarization dataset."""

    BUILDER_CONFIGS = [
        CnnDailymailConfig(
            name=str(version), description="Plain text", version=version)
        for version in _SUPPORTED_VERSIONS
    ]

    def _info(self):
        # Should return a datasets.DatasetInfo object.
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                _ARTICLE: datasets.Value("string"),
                _HIGHLIGHTS: datasets.Value("string"),
                "id": datasets.Value("string"),
            }),
            supervised_keys=None,
            homepage="https://github.com/abisee/cnn-dailymail",
            citation=_CITATION,
        )

    def _vocab_text_gen(self, paths):
        for _, ex in self._generate_examples(paths):
            yield " ".join([ex[_ARTICLE], ex[_HIGHLIGHTS]])

    def _split_generators(self, dl_manager):
        dl_paths = dl_manager.download_and_extract(_DL_URLS)
        train_files = _subset_filenames(dl_paths, datasets.Split.TRAIN)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN, gen_kwargs={"files": train_files}),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "files": _subset_filenames(dl_paths,
                                               datasets.Split.VALIDATION)
                }),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "files": _subset_filenames(dl_paths, datasets.Split.TEST)
                }),
        ]

    def _generate_examples(self, files):
        for p in files:
            article, highlights = _get_art_abs(p, self.config.version)
            if not article or not highlights:
                continue
            fname = os.path.basename(p)
            yield fname, {
                _ARTICLE: article,
                _HIGHLIGHTS: highlights,
                "id": _get_hash_from_path(fname),
            }
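
# A possible local smoke test (hypothetical, not part of the loader; the exact
# builder constructor arguments depend on the installed ``datasets`` version):
#
#   if __name__ == "__main__":
#       builder = CnnDailymail(config_name="3.0.0")  # may be ``name=`` on older versions
#       builder.download_and_prepare()
#       ds = builder.as_dataset(split="train")
#       print(ds[0]["id"], ds[0]["highlights"])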