Merge pull request #18 from USCDataScience/issue16-parsing

wkiri · web-flow · commit 2b88469063b3 · 2021-06-02T10:24:54.000-07:00
Improve LPSC header handling and degree symbol
diff --git a/src/parserindexer/corenlpparser.py b/src/parserindexer/corenlpparser.py
@@ -4,6 +4,7 @@
 # The following two lines make CoreNLP happy
 reload(sys)
 sys.setdefaultencoding('UTF8')
+import urllib
 from parser import *
 from journalparser import *
 from pycorenlp import StanfordCoreNLP
@@ -35,6 +36,8 @@ def parse_names(self, text, meta):
         if text[0].isspace(): # dont strip white spaces
             text = '.' + text[1:]
 
+        # Quote (with percent-encoding) reserved characters in URL for CoreNLP
+        text = urllib.quote(text)
         output = self.corenlp.annotate(text, properties=self.props)
         # flatten sentences and tokens
         tokenlists = [s['tokens'] for s in output['sentences']]
diff --git a/src/parserindexer/journalparser.py b/src/parserindexer/journalparser.py
@@ -44,6 +44,7 @@ def parse_file(self, path):
                  0x201C:0x22, 0x201D:0x22, # double quote
                  0x2010:0x2d, 0x2011:0x2d, 0x2012:0x2d, 0x2013:0x2d, # hyphens
                  0xFF0C:0x2c, # comma
+                 0xF0B0:0xb0, # degree
                  0x00A0:0x20, # space
                  0x2219:0x2e, 0x2022:0x2e, # bullets
                  }
@@ -63,12 +64,13 @@ def parse_file(self, path):
         content_ann = re.sub(r'([0-9][0-9][0-9][0-9].PDF)', '', content_ann,
                          flags=re.IGNORECASE)
         # And "xx(th|st) Lunar and Planetary Science Conference ((19|20)xx)"
-        content_ann = re.sub(r'([0-9][0-9].. Lunar and Planetary Science Conference \((19|20)[0-9][0-9]\)) ?', 
+        # with optional parentheses around the year and an optional "(LPI Contrib. No. NNNN)" suffix
+        content_ann = re.sub(r'([0-9][0-9].. Lunar and Planetary Science Conference \(?(19|20)[0-9][0-9]\)?)( \(LPI Contrib. No. [0-9][0-9][0-9][0-9]\))? ?', 
                          '', content_ann,
                          flags=re.IGNORECASE)
         # And "Lunar and Planetary Science XXXIII (2002)"
         # with Roman numeral and optional year
-        content_ann = re.sub(r'(Lunar and Planetary Science [CDILVXM]+( \((19|20)[0-9][0-9]\))?) ?', 
+        content_ann = re.sub(r'(Lunar and Planetary Science [CDILVXM]+ (\((19|20)[0-9][0-9]\))?) ?', 
                          '', content_ann,
                          flags=re.IGNORECASE)
 
@@ -80,9 +82,11 @@ def parse_file(self, path):
 
         # 6. Move references to their own field (references)
         refs = extract_references(content_ann)
-        for ref_id in refs:  # preserve length; insert whitespace
-            content_ann = content_ann.replace(refs[ref_id],
-                                              ' ' * len(refs[ref_id]))
+        # This does weird things to citations, not just references,
+        # so disable it for now.
+        #for ref_id in refs:  # preserve length; insert whitespace
+        #    content_ann = content_ann.replace(refs[ref_id],
+        #                                      ' ' * len(refs[ref_id]))
         parsed['references'] = refs.values()
 
         # Store the modified content
diff --git a/src/parserindexer/json2brat.py b/src/parserindexer/json2brat.py
@@ -23,16 +23,28 @@ def convert_json_to_brat(jsonfile, outdir):
 
     # Iterate over documents
     for d in docs:
+        res_name = d['metadata']['resourceName']
+        if type(res_name) == list:
+            # Sometimes Tika returns this as something like
+            # "resourceName": ["2005_1725.pdf", "High Quality.joboptions"]
+            res_name = res_name[0]
+
+        # Output text into a .txt file
+        text = d['content_ann_s']
+        outfn = os.path.join(outdir, res_name[:-4] + '.txt')
+        with io.open(outfn, 'w', encoding='utf8') as outf:
+            print('Writing text to %s' % outfn)
+            outf.write(text + '\n')
+
         if 'ner' not in d['metadata']:
-            print 'No named entities found for', d['file']
+            print('No named entities found for %s' % d['file'])
             continue
 
         # Output relevant annotations into a brat .ann file
         ners = d['metadata']['ner']
-        outfn = os.path.join(outdir, 
-                             d['metadata']['resourceName'][:-4] + '.ann')
+        outfn = os.path.join(outdir, res_name[:-4] + '.ann')
         outf = io.open(outfn, 'w', encoding='utf8')
-        print 'Writing to', outfn
+        print('Writing annotations to %s' % outfn)
         for (t, n) in enumerate(ners):
             outf.write('T%d\t%s %s %s\t%s\n' % \
                        (t+1, n['label'], n['begin'], n['end'], n['text']))