Skip to content

Commit fa6de1c

Browse files
committed
Update regular expression to parse other LPSC header formats. (#16)
1 parent d3e0d35 commit fa6de1c

File tree

1 file changed

+7
-2
lines changed

1 file changed

+7
-2
lines changed

src/parserindexer/journalparser.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -59,8 +59,13 @@ def parse_file(self, path):
5959
# 4. Remove xxxx.PDF
6060
content_ann = re.sub(r'([0-9][0-9][0-9][0-9].PDF)', '', content_ann,
6161
flags=re.IGNORECASE)
62-
# And "Lunar and Planetary Science Conference (201x)"
63-
content_ann = re.sub(r'([0-9][0-9].. Lunar and Planetary Science Conference \(201[0-9]\))',
62+
# And "xx(th|st) Lunar and Planetary Science Conference ((19|20)xx)"
63+
content_ann = re.sub(r'([0-9][0-9].. Lunar and Planetary Science Conference \((19|20)[0-9][0-9]\)) ?',
64+
'', content_ann,
65+
flags=re.IGNORECASE)
66+
# And "Lunar and Planetary Science XXXIII (2002)"
67+
# with Roman numeral and optional year
68+
content_ann = re.sub(r'(Lunar and Planetary Science [CDILVXM]+( \((19|20)[0-9][0-9]\))?) ?',
6469
'', content_ann,
6570
flags=re.IGNORECASE)
6671

0 commit comments

Comments
 (0)