Skip to content

Commit 07d414a

Browse files
authored
update chnsenticorp and lcqmc to qianyan format (#482)
* update chnsenticorp and lcqmc to qianyan format
* update md5 check
1 parent 3ce34a1 commit 07d414a

File tree

3 files changed

+34
-22
lines changed

3 files changed

+34
-22
lines changed

docs/data_prepare/dataset_list.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,11 @@ PaddleNLP提供了以下数据集的快速读取API,实际使用时请根据
6969
| ---- | --------- | ------ |
7070
| [BSTC](https://aistudio.baidu.com/aistudio/competition/detail/44/) | 千言数据集:机器同传,包括transcription_translation和asr | `paddlenlp.datasets.load_dataset('bstc', 'asr')`|
7171

72+
## 对话系统
73+
74+
| 数据集名称 | 简介 | 调用方法 |
75+
| ---- | --------- | ------ |
76+
| [DuConv](https://aistudio.baidu.com/aistudio/competition/detail/48/) | 千言数据集:开放域对话,中文知识型对话数据集 | `paddlenlp.datasets.load_dataset('duconv')`|
7277

7378
## 文本生成
7479

paddlenlp/datasets/chnsenticorp.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -31,19 +31,19 @@ class ChnSentiCorp(DatasetBuilder):
3131
3232
"""
3333

34-
URL = "https://bj.bcebos.com/paddlehub-dataset/chnsenticorp.tar.gz"
35-
MD5 = "fbb3217aeac76a2840d2d5cd19688b07"
34+
URL = "https://dataset-bj.cdn.bcebos.com/qianyan/ChnSentiCorp.zip"
35+
MD5 = "7ef61b08ad10fbddf2ba97613f071561"
3636
META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
3737
SPLITS = {
3838
'train': META_INFO(
39-
os.path.join('chnsenticorp', 'train.tsv'),
39+
os.path.join('ChnSentiCorp', 'ChnSentiCorp', 'train.tsv'),
4040
'689360c4a4a9ce8d8719ed500ae80907'),
4141
'dev': META_INFO(
42-
os.path.join('chnsenticorp', 'dev.tsv'),
43-
'05e4b02561c2a327833e05bbe8156cec'),
42+
os.path.join('ChnSentiCorp', 'ChnSentiCorp', 'dev.tsv'),
43+
'20c77cc2371634731a367996b097ec0a'),
4444
'test': META_INFO(
45-
os.path.join('chnsenticorp', 'test.tsv'),
46-
'917dfc6fbce596bb01a91abaa6c86f9e'),
45+
os.path.join('ChnSentiCorp', 'ChnSentiCorp', 'test.tsv'),
46+
'9b4dc7d1e4ada48c645b7e938592f49c'),
4747
}
4848

4949
def _get_data(self, mode, **kwargs):
@@ -57,7 +57,7 @@ def _get_data(self, mode, **kwargs):
5757

5858
return fullname
5959

60-
def _read(self, filename):
60+
def _read(self, filename, split):
6161
"""Reads data."""
6262
with open(filename, 'r', encoding='utf-8') as f:
6363
head = None
@@ -66,8 +66,15 @@ def _read(self, filename):
6666
if not head:
6767
head = data
6868
else:
69-
label, text = data
70-
yield {"text": text, "label": label}
69+
if split == 'train':
70+
label, text = data
71+
yield {"text": text, "label": label, "qid": ''}
72+
elif split == 'dev':
73+
qid, label, text = data
74+
yield {"text": text, "label": label, "qid": qid}
75+
elif split == 'test':
76+
qid, text = data
77+
yield {"text": text, "label": '', "qid": qid}
7178

7279
def get_labels(self):
7380
"""

paddlenlp/datasets/lcqmc.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,19 +31,19 @@ class LCQMC(DatasetBuilder):
3131
3232
"""
3333

34-
URL = "https://bj.bcebos.com/paddlehub-dataset/lcqmc.tar.gz"
35-
MD5 = "62a7ba36f786a82ae59bbde0b0a9af0c"
34+
URL = "https://dataset-bj.cdn.bcebos.com/qianyan/lcqmc.zip"
35+
MD5 = "7069fa0cffbd2110845869c61f83814a"
3636
META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
3737
SPLITS = {
3838
'train': META_INFO(
39-
os.path.join('lcqmc', 'train.tsv'),
40-
'2193c022439b038ac12c0ae918b211a1'),
39+
os.path.join('lcqmc', 'lcqmc', 'train.tsv'),
40+
'479d94fe575981f236319f2a5b8b3c03'),
4141
'dev': META_INFO(
42-
os.path.join('lcqmc', 'dev.tsv'),
43-
'c5dcba253cb4105d914964fd8b3c0e94'),
42+
os.path.join('lcqmc', 'lcqmc', 'dev.tsv'),
43+
'089329fb44ef26155baef9c9c8c823ba'),
4444
'test': META_INFO(
45-
os.path.join('lcqmc', 'test.tsv'),
46-
'8f4b71e15e67696cc9e112a459ec42bd'),
45+
os.path.join('lcqmc', 'lcqmc', 'test.tsv'),
46+
'a4a483f2f871d57e0f3894fca0d0f8f0'),
4747
}
4848

4949
def _get_data(self, mode, **kwargs):
@@ -59,14 +59,14 @@ def _get_data(self, mode, **kwargs):
5959
def _read(self, filename):
6060
"""Reads data."""
6161
with open(filename, 'r', encoding='utf-8') as f:
62-
head = None
6362
for line in f:
6463
data = line.strip().split("\t")
65-
if not head:
66-
head = data
67-
else:
64+
if len(data) == 3:
6865
query, title, label = data
6966
yield {"query": query, "title": title, "label": label}
67+
else:
68+
query, title = data
69+
yield {"query": query, "title": title, "label": ''}
7070

7171
def get_labels(self):
7272
"""

0 commit comments

Comments
 (0)