Skip to content

Commit d9e5b17

Browse files
committed
Merge adjacent Target NERs to handle multi-word entities.
1 parent 2272256 commit d9e5b17

File tree

1 file changed

+26
-0
lines changed

1 file changed

+26
-0
lines changed

src/parserindexer/corenlpparser.py

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,32 @@ def parse_names(self, text, meta):
4141
'source': 'corenlp'
4242
}
4343
names.append(name)
44+
45+
# Handle multi-word tokens:
46+
# Merge any adjacent Target tokens, if of the same type and
47+
# separated by a space, into one span.
48+
new_names = []
49+
skip_names = []
50+
for n in names:
51+
if n in skip_names:
52+
continue
53+
next_name = [n2 for n2 in names if \
54+
n['label'] == 'Target' and
55+
n2['label'] == n['label'] and
56+
int(n2['begin']) == int(n['end']) + 1 and
57+
text[int(n['end'])] == ' ']
58+
if len(next_name) > 0:
59+
print('Merging %s and %s' % (n['text'], next_name[0]['text']))
60+
n['text'] += ' ' + next_name[0]['text']
61+
n['end'] = next_name[0]['end']
62+
skip_names.append(next_name[0])
63+
64+
# Either way, save this one
65+
new_names.append(n)
66+
67+
if len(names) != len(new_names):
68+
print('%d -> %d NERs' % (len(names), len(new_names)))
69+
4470
if names:
4571
meta['ner'] = names
4672
meta['X-Parsed-By'].append(CoreNLPParser.CORENLP_PARSER)

0 commit comments

Comments
 (0)