@@ -44,6 +44,7 @@ def parse_file(self, path):
4444 0x201C :0x22 , 0x201D :0x22 , # double quote
4545 0x2010 :0x2d , 0x2011 :0x2d , 0x2012 :0x2d , 0x2013 :0x2d , # hyphens
4646 0xFF0C :0x2c , # comma
47+ 0xF0B0 :0xb0 , # degree
4748 0x00A0 :0x20 , # space
4849 0x2219 :0x2e , 0x2022 :0x2e , # bullets
4950 }
@@ -63,12 +64,13 @@ def parse_file(self, path):
6364 content_ann = re .sub (r'([0-9][0-9][0-9][0-9].PDF)' , '' , content_ann ,
6465 flags = re .IGNORECASE )
6566 # And "xx(th|st) Lunar and Planetary Science Conference ((19|20)xx)"
66- content_ann = re .sub (r'([0-9][0-9].. Lunar and Planetary Science Conference \((19|20)[0-9][0-9]\)) ?' ,
67+ # with optional parentheses, optional LPI contrib
68+ content_ann = re .sub (r'([0-9][0-9].. Lunar and Planetary Science Conference \(?(19|20)[0-9][0-9]\)?)( \(LPI Contrib. No. [0-9][0-9][0-9][0-9]\))? ?' ,
6769 '' , content_ann ,
6870 flags = re .IGNORECASE )
6971 # And "Lunar and Planetary Science XXXIII (2002)"
7072 # with Roman numeral and optional year
71- content_ann = re .sub (r'(Lunar and Planetary Science [CDILVXM]+( \((19|20)[0-9][0-9]\))?) ?' ,
73+ content_ann = re .sub (r'(Lunar and Planetary Science [CDILVXM]+ ( \((19|20)[0-9][0-9]\))?) ?' ,
7274 '' , content_ann ,
7375 flags = re .IGNORECASE )
7476
@@ -80,9 +82,11 @@ def parse_file(self, path):
8082
8183 # 6. Copy references to their own field (references); they are no longer blanked out of content_ann
8284 refs = extract_references (content_ann )
83- for ref_id in refs : # preserve length; insert whitespace
84- content_ann = content_ann .replace (refs [ref_id ],
85- ' ' * len (refs [ref_id ]))
85+ # This does weird things to citations, not just references,
86+ # so disable it for now.
87+ #for ref_id in refs: # preserve length; insert whitespace
88+ # content_ann = content_ann.replace(refs[ref_id],
89+ # ' ' * len(refs[ref_id]))
8690 parsed ['references' ] = refs .values ()
8791
8892 # Store the modified content
0 commit comments