Skip to content

Commit 9a45f83

Browse files
author
jiaqianjing
authored
Add COTE-BD & COTE-MFW Dataset (#593)
* Add COTE-BD Dataset #447
* Update cote.py: Add COTE-MFW Dataset #447
* Update cote.py: add annotation of COTE-MFW
* update
* update
1 parent a145bca commit 9a45f83

File tree

1 file changed

+40
-11
lines changed

1 file changed

+40
-11
lines changed

paddlenlp/datasets/cote.py

Lines changed: 40 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@
2525

2626
class Cote(DatasetBuilder):
2727
"""
28-
COTE_DP dataset for Opinion Role Labeling task.
28+
COTE_DP/COTE-BD/COTE-MFW dataset for Opinion Role Labeling task.
2929
More information please refer to https://aistudio.baidu.com/aistudio/competition/detail/50/?isFromLuge=1.
3030
3131
"""
@@ -37,22 +37,52 @@ class Cote(DatasetBuilder):
3737
'splits': {
3838
'train': [
3939
os.path.join('COTE-DP', 'train.tsv'),
40-
'17d11ca91b7979f2c2023757650096e5', (0, 1), 1
40+
'17d11ca91b7979f2c2023757650096e5'
4141
],
4242
'test': [
4343
os.path.join('COTE-DP', 'test.tsv'),
44-
'5bb9b9ccaaee6bcc1ac7a6c852b46f66', (1, ), 1
44+
'5bb9b9ccaaee6bcc1ac7a6c852b46f66'
4545
],
4646
},
4747
'labels': ["B", "I", "O"]
4848
},
49+
'bd': {
50+
'url': "https://dataset-bj.cdn.bcebos.com/qianyan/COTE-BD.zip",
51+
'md5': "8d87ff9bb6f5e5d46269d72632a1b01f",
52+
'splits': {
53+
'train': [
54+
os.path.join('COTE-BD', 'train.tsv'),
55+
'4c08ccbcc373cb3bf05c3429d435f608'
56+
],
57+
'test': [
58+
os.path.join('COTE-BD', 'test.tsv'),
59+
'aeb5c9af61488dadb12cbcc1d2180667'
60+
],
61+
},
62+
'labels': ["B", "I", "O"]
63+
},
64+
'mfw': {
65+
'url': "https://dataset-bj.cdn.bcebos.com/qianyan/COTE-MFW.zip",
66+
'md5': "c85326bf2be4424d03373ea70cb32c3f",
67+
'splits': {
68+
'train': [
69+
os.path.join('COTE-MFW', 'train.tsv'),
70+
'01fc90b9098d35615df6b8d257eb46ca'
71+
],
72+
'test': [
73+
os.path.join('COTE-MFW', 'test.tsv'),
74+
'c61a475917a461089db141c59c688343'
75+
],
76+
},
77+
'labels': ["B", "I", "O"]
78+
}
4979
}
5080

5181
def _get_data(self, mode, **kwargs):
5282
"""Downloads dataset."""
5383
builder_config = self.BUILDER_CONFIGS[self.name]
54-
default_root = os.path.join(DATA_HOME, 'COTE-DP')
55-
filename, data_hash, _, _ = builder_config['splits'][mode]
84+
default_root = os.path.join(DATA_HOME, f'COTE-{self.name.upper()}')
85+
filename, data_hash = builder_config['splits'][mode]
5686
fullname = os.path.join(default_root, filename)
5787
if not os.path.exists(fullname) or (data_hash and
5888
not md5file(fullname) == data_hash):
@@ -64,21 +94,19 @@ def _get_data(self, mode, **kwargs):
6494

6595
def _read(self, filename, split):
6696
"""Reads data"""
67-
_, _, field_indices, num_discard_samples = self.BUILDER_CONFIGS[
68-
self.name]['splits'][split]
6997
with open(filename, 'r', encoding='utf-8') as f:
7098
for idx, line in enumerate(f):
71-
if idx < num_discard_samples:
99+
if idx == 0:
100+
# ignore first line about title
72101
continue
73102
line_stripped = line.strip().split('\t')
74103
if not line_stripped:
75104
continue
76-
example = [line_stripped[indice] for indice in field_indices]
77105
if split == "test":
78-
yield {"tokens": list(example[0])}
106+
yield {"tokens": list(line_stripped[1])}
79107
else:
80108
try:
81-
entity, text = example[0], example[1]
109+
entity, text = line_stripped[0], line_stripped[1]
82110
start_idx = text.index(entity)
83111
except:
84112
# drop the dirty data
@@ -94,6 +122,7 @@ def _read(self, filename, split):
94122
"entity": entity
95123
}
96124

125+
97126
def get_labels(self):
98127
"""
99128
Return labels of the COTE.

0 commit comments

Comments
 (0)