Skip to content

Commit 2b88469

Browse files
authored
Merge pull request #18 from USCDataScience/issue16-parsing
Improve LPSC header handling and degree symbol
2 parents d42f56f + a3e710a commit 2b88469

File tree

3 files changed

+28
-9
lines changed

3 files changed

+28
-9
lines changed

src/parserindexer/corenlpparser.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# The following two lines make CoreNLP happy
55
reload(sys)
66
sys.setdefaultencoding('UTF8')
7+
import urllib
78
from parser import *
89
from journalparser import *
910
from pycorenlp import StanfordCoreNLP
@@ -35,6 +36,8 @@ def parse_names(self, text, meta):
3536
if text[0].isspace(): # don't strip whitespace
3637
text = '.' + text[1:]
3738

39+
# Quote (with percent-encoding) reserved characters in URL for CoreNLP
40+
text = urllib.quote(text)
3841
output = self.corenlp.annotate(text, properties=self.props)
3942
# flatten sentences and tokens
4043
tokenlists = [s['tokens'] for s in output['sentences']]

src/parserindexer/journalparser.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def parse_file(self, path):
4444
0x201C:0x22, 0x201D:0x22, # double quote
4545
0x2010:0x2d, 0x2011:0x2d, 0x2012:0x2d, 0x2013:0x2d, # hyphens
4646
0xFF0C:0x2c, # comma
47+
0xF0B0:0xb0, # degree
4748
0x00A0:0x20, # space
4849
0x2219:0x2e, 0x2022:0x2e, # bullets
4950
}
@@ -63,12 +64,13 @@ def parse_file(self, path):
6364
content_ann = re.sub(r'([0-9][0-9][0-9][0-9].PDF)', '', content_ann,
6465
flags=re.IGNORECASE)
6566
# And "xx(th|st) Lunar and Planetary Science Conference ((19|20)xx)"
66-
content_ann = re.sub(r'([0-9][0-9].. Lunar and Planetary Science Conference \((19|20)[0-9][0-9]\)) ?',
67+
# with optional parentheses, optional LPI contrib
68+
content_ann = re.sub(r'([0-9][0-9].. Lunar and Planetary Science Conference \(?(19|20)[0-9][0-9]\)?)( \(LPI Contrib. No. [0-9][0-9][0-9][0-9]\))? ?',
6769
'', content_ann,
6870
flags=re.IGNORECASE)
6971
# And "Lunar and Planetary Science XXXIII (2002)"
7072
# with Roman numeral and optional year
71-
content_ann = re.sub(r'(Lunar and Planetary Science [CDILVXM]+( \((19|20)[0-9][0-9]\))?) ?',
73+
content_ann = re.sub(r'(Lunar and Planetary Science [CDILVXM]+ (\((19|20)[0-9][0-9]\))?) ?',
7274
'', content_ann,
7375
flags=re.IGNORECASE)
7476

@@ -80,9 +82,11 @@ def parse_file(self, path):
8082

8183
# 6. Move references to their own field (references)
8284
refs = extract_references(content_ann)
83-
for ref_id in refs: # preserve length; insert whitespace
84-
content_ann = content_ann.replace(refs[ref_id],
85-
' ' * len(refs[ref_id]))
85+
# This does weird things to citations, not just references,
86+
# so disable it for now.
87+
#for ref_id in refs: # preserve length; insert whitespace
88+
# content_ann = content_ann.replace(refs[ref_id],
89+
# ' ' * len(refs[ref_id]))
8690
parsed['references'] = refs.values()
8791

8892
# Store the modified content

src/parserindexer/json2brat.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,28 @@ def convert_json_to_brat(jsonfile, outdir):
2323

2424
# Iterate over documents
2525
for d in docs:
26+
res_name = d['metadata']['resourceName']
27+
if type(res_name) == list:
28+
# Sometimes Tika returns this as something like
29+
# "resourceName": ["2005_1725.pdf", "High Quality.joboptions"]
30+
res_name = res_name[0]
31+
32+
# Output text into a .txt file
33+
text = d['content_ann_s']
34+
outfn = os.path.join(outdir, res_name[:-4] + '.txt')
35+
with io.open(outfn, 'w', encoding='utf8') as outf:
36+
print('Writing text to %s' % outfn)
37+
outf.write(text + '\n')
38+
2639
if 'ner' not in d['metadata']:
27-
print 'No named entities found for', d['file']
40+
print('No named entities found for %s' % d['file'])
2841
continue
2942

3043
# Output relevant annotations into a brat .ann file
3144
ners = d['metadata']['ner']
32-
outfn = os.path.join(outdir,
33-
d['metadata']['resourceName'][:-4] + '.ann')
45+
outfn = os.path.join(outdir, res_name[:-4] + '.ann')
3446
outf = io.open(outfn, 'w', encoding='utf8')
35-
print 'Writing to', outfn
47+
print('Writing annotations to %s' % outfn)
3648
for (t, n) in enumerate(ners):
3749
outf.write('T%d\t%s %s %s\t%s\n' % \
3850
(t+1, n['label'], n['begin'], n['end'], n['text']))

0 commit comments

Comments
 (0)