Skip to content

Commit dc8c8f2

Browse files
KingZeyuChen
andauthored
Add PAWS-X Dataset (#559)
* #530 add parameter verification * skip illegal lines in dataset likes paws-x-zh * Add PAWS-X Dataset #447 * Add PAWS-X Dataset #447 Co-authored-by: Zeyu Chen <[email protected]>
1 parent 0c8afd0 commit dc8c8f2

File tree

1 file changed

+75
-0
lines changed

1 file changed

+75
-0
lines changed

paddlenlp/datasets/paws-x.py

Lines changed: 75 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,75 @@
1+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import collections
16+
import json
17+
import os
18+
19+
from paddle.dataset.common import md5file
20+
from paddle.utils.download import get_path_from_url
21+
from paddlenlp.utils.env import DATA_HOME
22+
from . import DatasetBuilder
23+
24+
__all__ = ['PAWS']


class PAWS(DatasetBuilder):
    """
    Dataset builder for PAWS-X, a cross-lingual adversarial dataset for
    paraphrase identification (see `https://arxiv.org/abs/1908.11828`).

    Only the simplified Chinese (zh) archive is referenced here. Each split
    is a tab-separated file whose rows are turned into dictionaries with the
    keys `sentence1`, `sentence2` and `label`.
    """
    # Archive holding every split, and the checksum passed along with it
    # to `get_path_from_url`.
    URL = "https://dataset-bj.cdn.bcebos.com/qianyan/paws-x-zh.zip"
    MD5 = "f1c6f2ab8afb1f29fe04a0c929e3ab1c"

    # Per-split record: path relative to the dataset root directory and the
    # md5 digest expected for that file.
    META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
    SPLITS = {
        'train': META_INFO(
            file=os.path.join('paws-x-zh', 'paws-x-zh', 'train.tsv'),
            md5='3422ba98e5151c91bbb0a785c4873a4c'),
        'dev': META_INFO(
            file=os.path.join('paws-x-zh', 'paws-x-zh', 'dev.tsv'),
            md5='dc163453e728cf118e17b4065d6602c8'),
        'test': META_INFO(
            file=os.path.join('paws-x-zh', 'paws-x-zh', 'test.tsv'),
            md5='5b7320760e70559591092cb01b6f5955'),
    }

    def _get_data(self, mode, **kwargs):
        """
        Return the local path of the file backing the split `mode`.

        `get_path_from_url` is invoked when the file is absent or when its
        md5 digest differs from the one recorded in `SPLITS`.

        Args:
            mode: A key of `SPLITS` ('train', 'dev' or 'test'); any other
                value raises `KeyError`.
            **kwargs: Accepted but not used by this method.

        Returns:
            The path of the split file under `DATA_HOME/<class name>`.
        """
        root_dir = os.path.join(DATA_HOME, self.__class__.__name__)
        split_info = self.SPLITS[mode]
        local_path = os.path.join(root_dir, split_info.file)

        # The digest is only computed for a file that exists and has a
        # recorded md5.
        stale = not os.path.exists(local_path)
        if not stale and split_info.md5:
            stale = md5file(local_path) != split_info.md5

        if stale:
            # Presumably downloads and unpacks the archive below root_dir;
            # confirm against the paddle download utility.
            get_path_from_url(self.URL, root_dir, self.MD5)

        return local_path

    def _read(self, filename):
        """
        Yield one example dictionary per usable row of `filename`.

        Rows with three tab-separated columns give sentence1, sentence2 and
        label; rows with two columns give both sentences and an empty
        label. Rows with any other column count are skipped.
        """
        with open(filename, 'r', encoding='utf-8') as reader:
            for raw_line in reader:
                # NOTE(review): strip() also removes leading and trailing
                # tabs, so a row whose first or last column is empty loses
                # that column before the split. Confirm this is intended.
                columns = raw_line.strip().split("\t")
                if len(columns) not in (2, 3):
                    continue
                first, second = columns[0], columns[1]
                tag = columns[2] if len(columns) == 3 else ''
                yield {"sentence1": first, "sentence2": second, "label": tag}

    def get_labels(self):
        """
        Return the list of label strings used by the PAWS-X object.
        """
        return list(("0", "1"))

0 commit comments

Comments
 (0)