11from __future__ import print_function
22
3+ import sys
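4+ # Python 2 workaround: reload(sys) re-exposes sys.setdefaultencoding, which is then used to make UTF-8 the default string encoding (keeps CoreNLP happy with non-ASCII text)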
5+ reload (sys )
6+ sys .setdefaultencoding ('UTF8' )
37from parser import *
48from journalparser import *
59from pycorenlp import StanfordCoreNLP
@@ -12,20 +16,25 @@ def __init__(self, **kwargs):
1216 super (CoreNLPParser , self ).__init__ (** kwargs )
1317 self .corenlp = StanfordCoreNLP (kwargs ['corenlp_url' ] )
1418 self .props = {
15- 'annotators' : 'ner' ,
19+ 'annotators' : 'tokenize,ssplit,lemma,pos, ner' ,
1620 'outputFormat' : 'json' ,
1721 'ner.useSUTime' : False , # dont want SUTime model
1822 'ner.applyNumericClassifiers' : False , # Dont want numeric classifier
1923 }
2024 if kwargs .get ('ner_model' ): # set NER model from CLI
25+ if not os .path .exists (kwargs .get ('ner_model' )):
26+ print ('Error: Could not find NER model %s.' %
27+ kwargs .get ('ner_model' ))
28+ sys .exit (1 )
2129 self .props ['ner.model' ] = kwargs ['ner_model' ]
2230 print ("CoreNLP Properties : " , self .props )
2331
2432 def parse_names (self , text , meta ):
2533 if type (text ) != str :
26- text = text .encode ('ascii' , errors = 'ignore' )
34+ text = text .encode ('utf8' ) # , errors='ignore')
2735 if text [0 ].isspace (): # dont strip white spaces
2836 text = '.' + text [1 :]
37+
2938 output = self .corenlp .annotate (text , properties = self .props )
3039 # flatten sentences and tokens
3140 tokenlists = [s ['tokens' ] for s in output ['sentences' ]]
@@ -53,11 +62,12 @@ def parse_names(self, text, meta):
5362 continue
5463 next_name = [n2 for n2 in names if \
5564 n ['label' ] == 'Target' and
56- n2 ['label' ] == n ['label' ] and
57- int (n2 ['begin' ]) == int (n ['end' ]) + 1 and
58- text [int (n ['end' ])] == ' ' ]
65+ n2 ['label' ] == 'Target' and
66+ int (n2 ['begin' ]) == int (n ['end' ]) + 1 ]
5967 if len (next_name ) > 0 :
60- print ('Merging %s and %s' % (n ['text' ], next_name [0 ]['text' ]))
68+ print ('%s: Merging %s and %s' %
69+ (meta ['resourceName' ],
70+ n ['text' ], next_name [0 ]['text' ]))
6171 n ['text' ] += ' ' + next_name [0 ]['text' ]
6272 n ['end' ] = next_name [0 ]['end' ]
6373 skip_names .append (next_name [0 ])
0 commit comments