Skip to content

Commit daa8ed3

Browse files
Merge pull request #8495 from PaddlePaddle/fix_conll05_bug
Update conll05.py
2 parents c02f773 + 97094e4 commit daa8ed3

File tree

1 file changed

+21
-1
lines changed

1 file changed

+21
-1
lines changed

python/paddle/v2/dataset/conll05.py

Lines changed: 21 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,26 @@
4141
UNK_IDX = 0
4242

4343

44+
def load_label_dict(filename):
    """Build a label-to-index dictionary from a label file.

    Each line of the file is stripped of surrounding whitespace. Lines that
    start with "B-" or "I-" contribute the text after that prefix as a tag;
    all other lines are ignored. Every distinct tag then receives two
    consecutive indices, one for "B-<tag>" followed by one for "I-<tag>",
    and the key "O" receives the final index.

    Args:
        filename: Path of the label file to read.

    Returns:
        A dict mapping "B-<tag>", "I-<tag>" and "O" to integer indices in
        the range 0 .. 2 * number_of_tags.
    """
    tags = set()
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            # Both prefixes are two characters long, so the tag is line[2:].
            if line.startswith(("B-", "I-")):
                tags.add(line[2:])

    d = dict()
    index = 0
    # Iterate in sorted order: the iteration order of a set of strings is
    # not guaranteed to be the same from one interpreter run to the next,
    # which would make the label ids unstable across runs.
    for tag in sorted(tags):
        d["B-" + tag] = index
        index += 1
        d["I-" + tag] = index
        index += 1
    d["O"] = index
    return d
62+
63+
4464
def load_dict(filename):
4565
d = dict()
4666
with open(filename, 'r') as f:
@@ -188,7 +208,7 @@ def get_dict():
188208
verb_dict = load_dict(
189209
paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st',
190210
VERBDICT_MD5))
191-
label_dict = load_dict(
211+
label_dict = load_label_dict(
192212
paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st',
193213
TRGDICT_MD5))
194214
return word_dict, verb_dict, label_dict

0 commit comments

Comments
 (0)