25
25
26
26
class Cote (DatasetBuilder ):
27
27
"""
28
- COTE_DP dataset for Opinion Role Labeling task.
28
+ COTE_DP/COTE-BD/COTE-MFW dataset for Opinion Role Labeling task.
29
29
More information please refer to https://aistudio.baidu.com/aistudio/competition/detail/50/?isFromLuge=1.
30
30
31
31
"""
@@ -37,22 +37,52 @@ class Cote(DatasetBuilder):
37
37
'splits' : {
38
38
'train' : [
39
39
os .path .join ('COTE-DP' , 'train.tsv' ),
40
- '17d11ca91b7979f2c2023757650096e5' , ( 0 , 1 ), 1
40
+ '17d11ca91b7979f2c2023757650096e5'
41
41
],
42
42
'test' : [
43
43
os .path .join ('COTE-DP' , 'test.tsv' ),
44
- '5bb9b9ccaaee6bcc1ac7a6c852b46f66' , ( 1 , ), 1
44
+ '5bb9b9ccaaee6bcc1ac7a6c852b46f66'
45
45
],
46
46
},
47
47
'labels' : ["B" , "I" , "O" ]
48
48
},
49
+ 'bd' : {
50
+ 'url' : "https://dataset-bj.cdn.bcebos.com/qianyan/COTE-BD.zip" ,
51
+ 'md5' : "8d87ff9bb6f5e5d46269d72632a1b01f" ,
52
+ 'splits' : {
53
+ 'train' : [
54
+ os .path .join ('COTE-BD' , 'train.tsv' ),
55
+ '4c08ccbcc373cb3bf05c3429d435f608'
56
+ ],
57
+ 'test' : [
58
+ os .path .join ('COTE-BD' , 'test.tsv' ),
59
+ 'aeb5c9af61488dadb12cbcc1d2180667'
60
+ ],
61
+ },
62
+ 'labels' : ["B" , "I" , "O" ]
63
+ },
64
+ 'mfw' : {
65
+ 'url' : "https://dataset-bj.cdn.bcebos.com/qianyan/COTE-MFW.zip" ,
66
+ 'md5' : "c85326bf2be4424d03373ea70cb32c3f" ,
67
+ 'splits' : {
68
+ 'train' : [
69
+ os .path .join ('COTE-MFW' , 'train.tsv' ),
70
+ '01fc90b9098d35615df6b8d257eb46ca'
71
+ ],
72
+ 'test' : [
73
+ os .path .join ('COTE-MFW' , 'test.tsv' ),
74
+ 'c61a475917a461089db141c59c688343'
75
+ ],
76
+ },
77
+ 'labels' : ["B" , "I" , "O" ]
78
+ }
49
79
}
50
80
51
81
def _get_data (self , mode , ** kwargs ):
52
82
"""Downloads dataset."""
53
83
builder_config = self .BUILDER_CONFIGS [self .name ]
54
- default_root = os .path .join (DATA_HOME , 'COTE-DP ' )
55
- filename , data_hash , _ , _ = builder_config ['splits' ][mode ]
84
+ default_root = os .path .join (DATA_HOME , f 'COTE-{ self . name . upper () } ' )
85
+ filename , data_hash = builder_config ['splits' ][mode ]
56
86
fullname = os .path .join (default_root , filename )
57
87
if not os .path .exists (fullname ) or (data_hash and
58
88
not md5file (fullname ) == data_hash ):
@@ -64,21 +94,19 @@ def _get_data(self, mode, **kwargs):
64
94
65
95
def _read (self , filename , split ):
66
96
"""Reads data"""
67
- _ , _ , field_indices , num_discard_samples = self .BUILDER_CONFIGS [
68
- self .name ]['splits' ][split ]
69
97
with open (filename , 'r' , encoding = 'utf-8' ) as f :
70
98
for idx , line in enumerate (f ):
71
- if idx < num_discard_samples :
99
+ if idx == 0 :
100
+ # ignore first line about title
72
101
continue
73
102
line_stripped = line .strip ().split ('\t ' )
74
103
if not line_stripped :
75
104
continue
76
- example = [line_stripped [indice ] for indice in field_indices ]
77
105
if split == "test" :
78
- yield {"tokens" : list (example [ 0 ])}
106
+ yield {"tokens" : list (line_stripped [ 1 ])}
79
107
else :
80
108
try :
81
- entity , text = example [0 ], example [1 ]
109
+ entity , text = line_stripped [0 ], line_stripped [1 ]
82
110
start_idx = text .index (entity )
83
111
except :
84
112
# drop the dirty data
@@ -94,6 +122,7 @@ def _read(self, filename, split):
94
122
"entity" : entity
95
123
}
96
124
125
+
97
126
def get_labels (self ):
98
127
"""
99
128
Return labels of the COTE.
0 commit comments