Skip to content

Commit a389ffa

Browse files
committed
Added support for UTF-8 inputs.
Added check for whether NER model file exists. Updated logic for merging adjacent Target NPs.
1 parent 6e8afbd commit a389ffa

File tree

1 file changed

+16
-6
lines changed

1 file changed

+16
-6
lines changed

src/parserindexer/corenlpparser.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
from __future__ import print_function
22

3+
import sys
4+
# The following two lines make CoreNLP happy
5+
reload(sys)
6+
sys.setdefaultencoding('UTF8')
37
from parser import *
48
from journalparser import *
59
from pycorenlp import StanfordCoreNLP
@@ -12,20 +16,25 @@ def __init__(self, **kwargs):
1216
super(CoreNLPParser, self).__init__(**kwargs)
1317
self.corenlp = StanfordCoreNLP(kwargs['corenlp_url'] )
1418
self.props = {
15-
'annotators': 'ner',
19+
'annotators': 'tokenize,ssplit,lemma,pos,ner',
1620
'outputFormat': 'json',
1721
'ner.useSUTime': False, # dont want SUTime model
1822
'ner.applyNumericClassifiers': False, # Dont want numeric classifier
1923
}
2024
if kwargs.get('ner_model'): # set NER model from CLI
25+
if not os.path.exists(kwargs.get('ner_model')):
26+
print('Error: Could not find NER model %s.' %
27+
kwargs.get('ner_model'))
28+
sys.exit(1)
2129
self.props['ner.model'] = kwargs['ner_model']
2230
print("CoreNLP Properties : ", self.props)
2331

2432
def parse_names(self, text, meta):
2533
if type(text) != str:
26-
text = text.encode('ascii', errors='ignore')
34+
text = text.encode('utf8') #, errors='ignore')
2735
if text[0].isspace(): # dont strip white spaces
2836
text = '.' + text[1:]
37+
2938
output = self.corenlp.annotate(text, properties=self.props)
3039
# flatten sentences and tokens
3140
tokenlists = [s['tokens'] for s in output['sentences']]
@@ -53,11 +62,12 @@ def parse_names(self, text, meta):
5362
continue
5463
next_name = [n2 for n2 in names if \
5564
n['label'] == 'Target' and
56-
n2['label'] == n['label'] and
57-
int(n2['begin']) == int(n['end']) + 1 and
58-
text[int(n['end'])] == ' ']
65+
n2['label'] == 'Target' and
66+
int(n2['begin']) == int(n['end']) + 1]
5967
if len(next_name) > 0:
60-
print('Merging %s and %s' % (n['text'], next_name[0]['text']))
68+
print('%s: Merging %s and %s' %
69+
(meta['resourceName'],
70+
n['text'], next_name[0]['text']))
6171
n['text'] += ' ' + next_name[0]['text']
6272
n['end'] = next_name[0]['end']
6373
skip_names.append(next_name[0])

0 commit comments

Comments
 (0)