11from __future__ import print_function
22
33from parser import *
4+ from brat_ann_indexer import extract_references
45
56class JournalParser (Parser ):
67 """
@@ -22,12 +23,67 @@ def parse_file(self, path):
2223 parsed = super (JournalParser , self ).parse_file (path )
2324 pdf_md = parsed ['metadata' ]
2425 assert pdf_md ['Content-Type' ] == JournalParser ._PDF_TYPE
25- assert JournalParser ._JOURNAL_PARSER in set (pdf_md ['X-Parsed-By' ])
26+ # Why would we check that it's already been parsed before doing so?
27+ #assert JournalParser._JOURNAL_PARSER in set(pdf_md['X-Parsed-By'])
28+
29+ # Result of Tika parsing is in parsed['content']
30+ #content = parsed['content'].strip()
31+ #parsed['content'] = content # stripped off the whitespaces
32+ #assert type(content) == str or type(content) == unicode
33+
34+ # Improve parsing and save in parsed['content_ann_s']
35+ content_ann = parsed ['content' ]
36+ assert type (content_ann ) == str or type (content_ann ) == unicode
37+
38+ #### New parsing (after extract_text_utf8.py)
39+ # 0. Translate some Unicode punctuation (keyed by code point) to ASCII
40+ punc = { 0x2018 :0x27 , 0x2019 :0x27 , # single quote
41+ 0x201C :0x22 , 0x201D :0x22 , # double quote
42+ 0x2010 :0x2d , 0x2011 :0x2d , 0x2012 :0x2d , 0x2013 :0x2d , # hyphens
43+ 0xFF0C :0x2c , # comma
44+ 0x00A0 :0x20 , # space
45+ 0x2219 :0x2e , 0x2022 :0x2e , # bullets
46+ }
47+ content_ann = content_ann .translate (punc )
48+
49+ # 1. Replace newlines that separate words with a space (unless hyphen)
50+ content_ann = re .sub (r'([^\s-])[\r|\n]+([^\s])' ,'\\ 1 \\ 2' , content_ann )
51+
52+ # 2. Remove hyphenation at the end of lines
53+ # (this is sometimes bad, as with "Fe-\nrich")
54+ content_ann = content_ann .replace ('-\n ' ,'\n ' )
55+
56+ # 3. Remove all newlines
57+ content_ann = re .sub (r'[\r|\n]+' ,'' , content_ann )
58+
59+ # 4. Remove xxxx.PDF
60+ content_ann = re .sub (r'([0-9][0-9][0-9][0-9].PDF)' , '' , content_ann ,
61+ flags = re .IGNORECASE )
62+ # And "Lunar and Planetary Science Conference (201x)"
63+ content_ann = re .sub (r'([0-9][0-9].. Lunar and Planetary Science Conference \(201[0-9]\))' ,
64+ '' , content_ann ,
65+ flags = re .IGNORECASE )
66+
67+ # 5. Remove mailto: links
68+ content_ann = re .sub (r'mailto:[^\s]+' ,'' , content_ann )
69+
70+ #print(content_ann)
71+ #raw_input()
72+
73+ # 6. Move references to their own field (references)
74+ refs = extract_references (content_ann )
75+ for ref_id in refs : # preserve length; insert whitespace
76+ content_ann = content_ann .replace (refs [ref_id ],
77+ ' ' * len (refs [ref_id ]))
78+ parsed ['references' ] = refs .values ()
79+
80+ # Store the modified content
81+ parsed ['content_ann_s' ] = content_ann
82+
83+ # Find named entities
84+ self .parse_names (content_ann , pdf_md )
85+ #self.parse_names(content, pdf_md)
2686
27- content = parsed ['content' ].strip ()
28- parsed ['content' ] = content # stripped off the whitespaces
29- assert type (content ) == str or type (content ) == unicode
30- self .parse_names (content , pdf_md )
3187 return parsed
3288
3389 def parse_names (self , content , meta ):
@@ -41,7 +97,8 @@ def parse_names(self, content, meta):
4197 for entity_type in ner_keys :
4298 meta [entity_type ] = ner_md [entity_type ]
4399 # Merged NER and Journal Parsers
44- meta ['X-Parsed-By' ].append (JournalParser ._NER_PARSER )
100+ # This was NER_PARSER, which makes no sense. Now JOURNAL_PARSER.
101+ meta ['X-Parsed-By' ].append (JournalParser ._JOURNAL_PARSER )
45102
46103if __name__ == '__main__' :
47104 args = vars (CliParser (JournalParser ).parse_args ())
0 commit comments