Skip to content

Commit 12eb699

Browse files
committed
Updated UTF-8 text parsing to match that in extract_text_utf8.py in the MTE repo.
1 parent a83fb5d commit 12eb699

File tree

1 file changed

+63
-6
lines changed

1 file changed

+63
-6
lines changed

src/parserindexer/journalparser.py

Lines changed: 63 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import print_function
22

33
from parser import *
4+
from brat_ann_indexer import extract_references
45

56
class JournalParser(Parser):
67
"""
@@ -22,12 +23,67 @@ def parse_file(self, path):
2223
parsed = super(JournalParser, self).parse_file(path)
2324
pdf_md = parsed['metadata']
2425
assert pdf_md['Content-Type'] == JournalParser._PDF_TYPE
25-
assert JournalParser._JOURNAL_PARSER in set(pdf_md['X-Parsed-By'])
26+
# Why would we check that it's already been parsed before doing so?
27+
#assert JournalParser._JOURNAL_PARSER in set(pdf_md['X-Parsed-By'])
28+
29+
# Result of Tika parsing is in parsed['content']
30+
#content = parsed['content'].strip()
31+
#parsed['content'] = content # stripped off the whitespaces
32+
#assert type(content) == str or type(content) == unicode
33+
34+
# Improve parsing and save in parsed['content_ann_s']
35+
content_ann = parsed['content']
36+
assert type(content_ann) == str or type(content_ann) == unicode
37+
38+
#### New parsing (after extract_text_utf8.py)
39+
# 0. Translate some Unicode punctuation (given as code points) to ASCII equivalents
40+
punc = { 0x2018:0x27, 0x2019:0x27, # single quote
41+
0x201C:0x22, 0x201D:0x22, # double quote
42+
0x2010:0x2d, 0x2011:0x2d, 0x2012:0x2d, 0x2013:0x2d, # hyphens
43+
0xFF0C:0x2c, # comma
44+
0x00A0:0x20, # space
45+
0x2219:0x2e, 0x2022:0x2e, # bullets
46+
}
47+
content_ann = content_ann.translate(punc)
48+
49+
# 1. Replace newlines that separate words with a space (unless the line ends in a hyphen)
50+
content_ann = re.sub(r'([^\s-])[\r|\n]+([^\s])','\\1 \\2', content_ann)
51+
52+
# 2. Remove hyphenation at the end of lines
53+
# (this is sometimes wrong: a real hyphen is lost, e.g. "Fe-\nrich" ends up as "Ferich")
54+
content_ann = content_ann.replace('-\n','\n')
55+
56+
# 3. Remove all newlines
57+
content_ann = re.sub(r'[\r|\n]+','', content_ann)
58+
59+
# 4. Remove four-digit PDF file names (e.g. 1234.PDF)
60+
content_ann = re.sub(r'([0-9][0-9][0-9][0-9].PDF)', '', content_ann,
61+
flags=re.IGNORECASE)
62+
# And "Lunar and Planetary Science Conference (201x)"
63+
content_ann = re.sub(r'([0-9][0-9].. Lunar and Planetary Science Conference \(201[0-9]\))',
64+
'', content_ann,
65+
flags=re.IGNORECASE)
66+
67+
# 5. Remove mailto: links
68+
content_ann = re.sub(r'mailto:[^\s]+','', content_ann)
69+
70+
#print(content_ann)
71+
#raw_input()
72+
73+
# 6. Move references to their own field (references)
74+
refs = extract_references(content_ann)
75+
for ref_id in refs: # preserve length; insert whitespace
76+
content_ann = content_ann.replace(refs[ref_id],
77+
' ' * len(refs[ref_id]))
78+
parsed['references'] = refs.values()
79+
80+
# Store the modified content
81+
parsed['content_ann_s'] = content_ann
82+
83+
# Find named entities
84+
self.parse_names(content_ann, pdf_md)
85+
#self.parse_names(content, pdf_md)
2686

27-
content = parsed['content'].strip()
28-
parsed['content'] = content # stripped off the whitespaces
29-
assert type(content) == str or type(content) == unicode
30-
self.parse_names(content, pdf_md)
3187
return parsed
3288

3389
def parse_names(self, content, meta):
@@ -41,7 +97,8 @@ def parse_names(self, content, meta):
4197
for entity_type in ner_keys:
4298
meta[entity_type] = ner_md[entity_type]
4399
# Merged NER and Journal Parsers
44-
meta['X-Parsed-By'].append(JournalParser._NER_PARSER)
100+
# This was NER_PARSER, which makes no sense. Now JOURNAL_PARSER.
101+
meta['X-Parsed-By'].append(JournalParser._JOURNAL_PARSER)
45102

46103
if __name__ == '__main__':
47104
args = vars(CliParser(JournalParser).parse_args())

0 commit comments

Comments
 (0)