Skip to content

Commit 0d8192f

Browse files
authored
Update conll05.py
the label file in the conll05 dataset has a wrong order
1 parent c490f1b commit 0d8192f

File tree

1 file changed

+23
-1
lines changed

1 file changed

+23
-1
lines changed

python/paddle/v2/dataset/conll05.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,28 @@
4141
UNK_IDX = 0
4242

4343

44+
def load_label_dict(filename):
    """Build a label-to-index dictionary from a BIO label file.

    The file is read line by line. Every line of the form ``B-<tag>`` or
    ``I-<tag>`` contributes ``<tag>`` to the set of known tags; all other
    lines are ignored. Each tag then receives two consecutive ids, first
    ``B-<tag>`` and then ``I-<tag>``, and the label ``"O"`` always receives
    the last id, whether or not it appears in the file.

    Tags are numbered in sorted order so that the mapping is identical on
    every run and every interpreter.

    :param filename: path to the label file, one label per line.
    :return: dict mapping each label string to its integer id.
    """
    tags = set()
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            # "B-X" and "I-X" both name the bare tag "X"; a tag seen with
            # only one of the two prefixes still gets both labels below.
            if line.startswith(("B-", "I-")):
                tags.add(line[2:])

    d = dict()
    # Iterating the set directly would follow string-hash order, which
    # differs between processes when hash randomization is enabled and
    # would make label ids irreproducible. Sorting pins the order.
    for position, tag in enumerate(sorted(tags)):
        d["B-" + tag] = 2 * position
        d["I-" + tag] = 2 * position + 1
    d["O"] = 2 * len(tags)
    return d
64+
65+
4466
def load_dict(filename):
4567
d = dict()
4668
with open(filename, 'r') as f:
@@ -188,7 +210,7 @@ def get_dict():
188210
verb_dict = load_dict(
189211
paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st',
190212
VERBDICT_MD5))
191-
label_dict = load_dict(
213+
label_dict = load_label_dict(
192214
paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st',
193215
TRGDICT_MD5))
194216
return word_dict, verb_dict, label_dict

0 commit comments

Comments
 (0)