Skip to content

Commit dea4fae

Browse files
authored
Merge pull request #37 from ehenneken/OUPFTxml_parser
OUP parser for references extracted from fulltext
2 parents 67236ea + b99a7bc commit dea4fae

File tree

4 files changed

+366
-0
lines changed

4 files changed

+366
-0
lines changed

adsrefpipe/refparsers/OUPFTxml.py

Lines changed: 329 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,329 @@
1+
2+
import sys, os
3+
import regex as re
4+
import argparse
5+
from typing import List, Dict
6+
7+
from adsputils import setup_logging, load_config
8+
# module-level logger shared by everything in this parser module
logger = setup_logging('refparsers')
# private copy of whatever load_config() returns
config = dict(load_config())
11+
12+
from adsrefpipe.refparsers.reference import XMLreference, ReferenceError
13+
from adsrefpipe.refparsers.toREFs import XMLtoREFs
14+
from adsrefpipe.refparsers.unicode import tostr
15+
16+
17+
class OUPFTreference(XMLreference):
    """
    This class handles parsing OUP references in XML format. It extracts citation information such as authors,
    year, journal, title, volume, pages, DOI, and eprint, and stores the parsed details.

    Examples from MNRAS:

    1. Common cases:
    <ref id="B22"> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <string-name name-style="western"> <surname>Abadi</surname> <given-names>M.</given-names> </string-name> <etal>et al</etal>.</person-group>, <year>2016</year>, <comment>preprint (<ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1603.04467" xmlns:xlink="http://www.w3.org/1999/xlink">arXiv:1603.04467</ext-link>)</comment></mixed-citation> </ref>
    <ref id="B2"> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <collab>Astropy Collaboration</collab> <etal>et al</etal>.</person-group>, <year>2013</year>, <source/>A&amp;A, <volume>558</volume>, <fpage>A33</fpage></mixed-citation> </ref>

    2. Rarer: lack of <person-group>
    <ref id="B44"> <mixed-citation publication-type="journal"> <collab>Planck Collaboration</collab>VI, <year>2018</year>, <comment>preprint (<ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1807.06209" xmlns:xlink="http://www.w3.org/1999/xlink">arXiv:1807.06209</ext-link>)</comment></mixed-citation> </ref>

    Note: unfortunately for PTEP we have volumes where the xlink namespace is not defined in each reference,
    so we have to add it ourselves to satisfy the XML parser. Here's an example:

    <ref id="B24"> <label>[24]</label> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <string-name name-style="western"> <surname>Audenaert</surname> <given-names>K.</given-names> </string-name>, <string-name name-style="western"><surname>Eisert</surname><given-names>J.</given-names></string-name>, <string-name name-style="western"><surname>Plenio</surname><given-names>M. B.</given-names></string-name>, and <string-name name-style="western"><surname>Werner</surname><given-names>R. F.</given-names></string-name></person-group>, <source/>Phys. Rev. A<volume>66</volume>, <fpage>042327</fpage> (<year>2002</year>) [<ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/quant-ph/0205025">arXiv:quant-ph/0205025</ext-link>] [<ext-link ext-link-type="uri" xlink:href="http://www.inspirehep.net/search?p=find+EPRINT+quant-ph/0205025">Search <sc>in</sc>SPIRE</ext-link>]. (<comment><ext-link ext-link-type="doi" xlink:href="http://doi.org/10.1103/PhysRevA.66.042327">http://dx.doi.org/10.1103/PhysRevA.66.042327</ext-link></comment>)</mixed-citation> </ref>
    """

    # to match the `__amp` placeholder, with an optional trailing semicolon
    # (presumably how an escaped ampersand arrives here -- confirm against the upstream cleanup)
    re_match_amp = re.compile(r'__amp;?')
    # to match <etal> tags and their contents (case-insensitive)
    re_replace_etal = re.compile(r'<etal>.*</etal>', flags=re.IGNORECASE)
    # to match and remove unnecessary XML processing instructions
    re_replace_useless_tag = re.compile(r'(<\?[^\?>]*\?>)')
    # to match a leading semicolon, together with the whitespace around it, at the start of a string
    re_replace_extra_space = re.compile(r'^\s*;\s*')
    # to match "ASP Conf. Ser. Vol. <number>" pattern
    re_ASPC = re.compile(r'ASP Conf[.] Ser[.] Vol[.] (\d+)')
    # to match "Astrophysics and Space Science Library, Vol. <number>" pattern, in either word order
    re_ASSL = re.compile(r'Astrophysics and Space Science Library, Vol[.] (\d+)|Vol[.] (\d+) of Astrophysics and Space Science Library')
    # to match any alphabetic characters in a year string
    re_char_in_year = re.compile(r'[A-Za-z]')
    # to match the words 'thesis' or 'dissertation' (case-insensitive)
    re_thesis = re.compile(r'(thesis|dissertation)', flags=re.IGNORECASE)

    def parse(self) -> None:
        """
        parse the OUPFT reference and extract citation information such as authors, year, title, and DOI

        the extracted fields are stored on the object itself (self['authors'], self['year'], ...);
        self.parsed is set to 0 on entry and to 1 once every field has been stored

        :return:
        """
        self.parsed = 0

        refstr = self.dexml(self.reference_str.toxml())

        authors = self.parse_authors()
        year = self.xmlnode_nodecontents('year')
        if year:
            # drop any letters found in the year string
            year = self.re_char_in_year.sub('', year)

        title = self.xmlnode_nodecontents('article-title') or self.xmlnode_nodecontents('chapter-title') or self.xmlnode_nodecontents('bookTitle')

        comment = self.xmlnode_nodecontents('comment')

        # volume implied by a recognized book series citation (ASPC/ASSL);
        # only used when the reference has no explicit <volume> tag
        series_volume = ''

        journal = self.xmlnode_nodecontents('source')
        if journal:
            journal = self.re_match_amp.sub('&', journal)
        if not journal:
            match = self.re_ASPC.search(refstr)
            if match:
                journal = 'ASPC'
                series_volume = match.group(1)
            else:
                match = self.re_ASSL.search(refstr)
                if match:
                    journal = 'ASSL'
                    # the number is in group 1 or group 2 depending on which word order matched
                    series_volume = match.group(1) or match.group(2) or ''
        if not journal:
            journal = self.xmlnode_nodecontents('conf-name')
        if not journal:
            # see if it is thesis
            if self.re_thesis.search(refstr):
                journal = 'Thesis'

        volume = self.xmlnode_nodecontents('volume')
        if not volume and series_volume:
            # previously the series volume was always discarded by the <volume> lookup above
            volume = series_volume
        pages = self.xmlnode_nodecontents('fpage')
        series = self.xmlnode_nodecontents('series')

        cittype = self.xmlnode_attribute('nlm-citation', 'citation-type') or self.xmlnode_attribute('citation', 'citation-type') or self.xmlnode_attribute('mixed-citation', 'publication-type')
        if comment and cittype in ['journal', 'confproc'] and not volume and not pages:
            # a comment made of exactly two whitespace-separated tokens is read as "<volume> <page>";
            # any other shape is left alone
            parts = comment.split()
            if len(parts) == 2:
                volume, pages = parts

        # these fields are already formatted the way we expect them
        self['authors'] = authors
        self['year'] = year
        self['jrlstr'] = journal
        self['ttlstr'] = title
        self['volume'] = self.parse_volume(volume)
        self['page'], self['qualifier'] = self.parse_pages(pages, letters="ABCDEFGHIJKLMNOPQRSTUVWXYZ")
        self['pages'] = self.combine_page_qualifier(self['page'], self['qualifier'])
        self['series'] = series

        doi = self.parse_doi(refstr, comment)
        eprint = self.parse_eprint(refstr)

        if doi:
            self['doi'] = doi
        if eprint:
            self['eprint'] = eprint

        self['refstr'] = self.get_reference_str()
        if not self['refstr']:
            # no reference string could be built from the fields, keep a plain text version instead
            self['refplaintext'] = self.get_reference_plain_text(self.to_ascii(refstr))

        self.parsed = 1

    def _clean_name_part(self, text) -> str:
        """
        remove junk from one part (surname or given names) of an author name

        :param text: contents of a <surname> or <given-names> tag
        :return: the text with XML processing instructions and a leading semicolon removed
        """
        return self.re_replace_extra_space.sub('', self.re_replace_useless_tag.sub('', tostr(text)))

    def parse_authors(self) -> str:
        """
        parse the authors from the reference string and format them accordingly

        :return: a formatted string of authors
        """
        # take the first of these lookups that yields anything
        authors = self.xmlnode_nodescontents('person-group', attrs={'person-group-type': 'author'}, keepxml=1) or \
                  self.xmlnode_nodescontents('name', keepxml=1) or \
                  self.xmlnode_nodescontents('string-name', keepxml=1)

        collab = self.xmlnode_nodescontents('collab')

        author_list = []
        for author in authors:
            an_author = ''
            # some of name tags include junk xml tags, remove them
            # <person-group person-group-type='author'><name name-style='western'><surname><?A3B2 twb 0.2w?><?A3B2 tlsb -0.01w?>Cunningham</surname>
            author, lastname = self.extract_tag(author, 'surname')
            author, givennames = self.extract_tag(author, 'given-names')
            if lastname:
                an_author = self._clean_name_part(lastname)
            if an_author and givennames:
                an_author += ', ' + self._clean_name_part(givennames)
            if an_author:
                author_list.append(an_author)
            else:
                # when there is no tag (ie, <person-group person-group-type='author'>Schultheis M.<etal>et al</etal>.)
                author_list.append(self.re_replace_etal.sub(' et. al', author))

        if collab:
            # collaborations are listed ahead of the individual authors
            author_list = collab + author_list

        authors = ", ".join(author_list)
        authors = self.re_match_amp.sub('', authors)

        return authors

    def parse_doi(self, refstr: str, comment: str) -> str:
        """
        parse the DOI from the reference string or comment field, falling back to extracting it from the refstr

        attempts to extract a DOI from different sources: first, from the 'pub-id' XML node content; if not found,
        it checks the comment field; if neither contains the DOI, it tries to extract it from the reference string.

        :param refstr: the reference string potentially containing the DOI
        :param comment: a comment related to the reference that may contain the DOI
        :return: the extracted DOI if found, or an empty string if not
        """
        doi = self.match_doi(self.xmlnode_nodecontents('pub-id', attrs={'pub-id-type': 'doi'}))
        if doi:
            return doi
        # see if there is a doi in the comment field
        doi = self.match_doi(comment)
        if doi:
            return doi
        # attempt to extract it from refstr
        doi = self.match_doi(refstr)
        if doi:
            return doi
        return ''

    def parse_eprint(self, refstr: str) -> str:
        """
        parse the eprint from the reference string

        attempts to extract the eprint from the 'pub-id' and 'elocation-id' XML nodes,
        then tries to extract it from the reference string if not found in the XML nodes

        :param refstr: the reference string potentially containing the eprint
        :return: the extracted eprint prefixed with `arXiv:` if found, or an empty string if not
        """
        # note that the id might have been identified incorrectly, hence verify it
        # <pub-id pub-id-type="arxiv">arXiv:10.1029/2001JB000553</pub-id>
        eprint = self.match_arxiv_id(self.xmlnode_nodecontents('pub-id', attrs={'pub-id-type': 'arxiv'}))
        if eprint:
            return f"arXiv:{eprint}"
        # <elocation-id content-type="arxiv">arXiv:1309.6955</elocation-id>
        eprint = self.match_arxiv_id(self.xmlnode_nodecontents('elocation-id', attrs={'content-type': 'arxiv'}))
        if eprint:
            return f"arXiv:{eprint}"
        # attempt to extract it from refstr
        eprint = self.match_arxiv_id(refstr)
        if eprint:
            return f"arXiv:{eprint}"
        return ''
212+
213+
214+
class OUPFTtoREFs(XMLtoREFs):
    """
    This class converts OUP XML references to a standardized reference format. It processes raw OUP references from
    either a file or a buffer and outputs parsed references, including bibcodes, authors, volume, pages, and DOI.
    """

    # (pattern, replacement) pairs handed to XMLtoREFs as its `cleanup` argument
    block_cleanup = [
        (re.compile(r'</?ext-link.*?>'), ''),
        (re.compile(r'</?uri.*?>'), ''),
        (re.compile(r'<etal\s*/>'), '<surname>et al.</surname>'),
    ]
    # (pattern, replacement) pairs applied, in this order, to every single reference by cleanup()
    reference_cleanup = [
        (re.compile(r'</?(ext-link|x).*?>'), ''),
        (re.compile(r'\sxlink:type="simple"'), ''),
        (re.compile(r'\s+xlink:href='), ' href='),
        (re.compile(r'<inline-formula>.*?</inline-formula>'), ''),
        (re.compile(r'\s+xlink:type='), ' type='),
        (re.compile(r'</?x.*?>'), ''),
    ]

    # to match <person-group> tags and their contents
    re_author_tag = re.compile(r'(<person-group.*</person-group>)')
    # to match author placeholder represented by three or more hyphens
    re_author_placeholder = re.compile(r'(-{3,})')

    def __init__(self, filename: str, buffer: str):
        """
        initialize the OUPFTtoREFs object to process OUP references

        the main program of this module passes None for whichever of the two sources is not used

        :param filename: the path to the source file
        :param buffer: the XML references as a buffer
        """
        XMLtoREFs.__init__(self, filename, buffer, parsername=OUPFTtoREFs, tag='ref', cleanup=self.block_cleanup, encoding='ISO-8859-1')

    def cleanup(self, reference: str) -> str:
        """
        clean up the reference string by replacing specific patterns

        :param reference: the raw reference string to clean
        :return: cleaned reference string
        """
        for (compiled_re, replace_str) in self.reference_cleanup:
            reference = compiled_re.sub(replace_str, reference)
        return reference

    def missing_authors(self, prev_reference: str, cur_reference: str) -> str:
        """
        replace author placeholder in the current reference with authors from the previous reference

        :param prev_reference: the previous reference containing the author information
        :param cur_reference: the current reference containing the author placeholder
        :return: the current reference with the author placeholder replaced, or the original current reference if no placeholder is found
        """
        if prev_reference and self.re_author_placeholder.search(cur_reference):
            match = self.re_author_tag.search(prev_reference)
            if match:
                prev_authors = match.group(0)
                # use a callable so the author XML is inserted literally; given as a plain
                # replacement string, any backslash in it would be read as a template escape
                return self.re_author_placeholder.sub(lambda _: prev_authors, cur_reference)
        return cur_reference

    def process_and_dispatch(self) -> List[Dict[str, List[Dict[str, str]]]]:
        """
        perform reference cleaning and parsing, then dispatch the parsed references

        a reference that raises ReferenceError is logged and left out of the output

        :return: a list of dictionaries containing bibcodes and parsed references
        """
        references = []
        for raw_block_references in self.raw_references:
            bibcode = raw_block_references['bibcode']
            block_references = raw_block_references['block_references']
            item_nums = raw_block_references.get('item_nums', [])

            parsed_references = []
            prev_reference = ''
            for i, raw_reference in enumerate(block_references):
                reference = self.cleanup(raw_reference)
                reference = self.missing_authors(prev_reference, reference)
                # remember the reference with its authors filled in, so that a run of
                # consecutive placeholders keeps inheriting the same authors
                prev_reference = reference

                logger.debug("OUPFTxml: parsing %s", reference)
                try:
                    oup_reference = OUPFTreference(reference)
                    parsed_references.append(self.merge({**oup_reference.get_parsed_reference(), 'refraw': raw_reference}, self.any_item_num(item_nums, i)))
                except ReferenceError as error_desc:
                    logger.error("OUPFTxml: error parsing reference: %s", error_desc)

            references.append({'bibcode': bibcode, 'references': parsed_references})
            logger.debug("%s: parsed %d references out of %d found references", bibcode, len(parsed_references), len(block_references))

        return references
305+
306+
307+
# Manual test driver for the OUPFT reference parser.
# With -f/--filename the references are read from that file, with -b/--buffer from the
# given string; in both cases the parsed result is printed.
# With neither option, the stub file test.oupft.xml is parsed and the result is compared
# with the expected parsed references, printing whether the test passed or failed.
# NOTE(review): the comparison uses `parsed_references.parsed_oup` although the fixture is
# test.oupft.xml -- confirm this is the intended expected-results variable.
if __name__ == '__main__':  # pragma: no cover
    from adsrefpipe.tests.unittests.stubdata import parsed_references

    arg_parser = argparse.ArgumentParser(description='Parse OUPFT references')
    arg_parser.add_argument('-f', '--filename', help='the path to source file')
    arg_parser.add_argument('-b', '--buffer', help='xml reference(s)')
    cli = arg_parser.parse_args()

    if cli.filename:
        print(OUPFTtoREFs(filename=cli.filename, buffer=None).process_and_dispatch())
    elif cli.buffer:
        print(OUPFTtoREFs(buffer=cli.buffer, filename=None).process_and_dispatch())
    else:
        # if no reference source is provided, just run the source test file
        stub_path = os.path.abspath(os.path.dirname(__file__) + '/../tests/unittests/stubdata/test.oupft.xml')
        outcome = OUPFTtoREFs(filename=stub_path, buffer=None).process_and_dispatch()
        print('Test passed!' if outcome == parsed_references.parsed_oup else 'Test failed!')
    sys.exit(0)

adsrefpipe/tests/unittests/stubdata/parsed_references.py

Lines changed: 2 additions & 0 deletions
Large diffs are not rendered by default.
adsrefpipe/tests/unittests/stubdata/test.oupft.xml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
<ADSBIBCODE>2001FOO...999..999X</ADSBIBCODE>
2+
<ref id="bib1"> <mixed-citation publication-type="book" xlink:type="simple" xmlns:xlink="http://www.w3.org/1999/xlink"> <person-group person-group-type="author"> <name> <surname>Abramowitz</surname> <given-names>M.</given-names> </name> <name> <surname>Stegun</surname> <given-names>I.</given-names> </name> </person-group>, <year>1964</year>, <source>Handbook of Mathematical Functions</source>. <publisher-name>Dover</publisher-name>, <publisher-loc>New York</publisher-loc></mixed-citation> </ref>
3+
<ref id="bib2"> <mixed-citation publication-type="journal" xlink:type="simple" xmlns:xlink="http://www.w3.org/1999/xlink"> <person-group person-group-type="author"> <name> <surname>Andredakis</surname> <given-names>Y.</given-names> </name> <name> <surname>Peletier</surname> <given-names>R.</given-names> </name> <name> <surname>Balcells</surname> <given-names>M.</given-names> </name> </person-group>, <year>1995</year>, <source>MNRAS</source>, <volume>275</volume>, <fpage>874</fpage></mixed-citation> </ref>
4+
<ref id="bib3"> <mixed-citation publication-type="book" xlink:type="simple" xmlns:xlink="http://www.w3.org/1999/xlink"> <person-group person-group-type="author"> <name> <surname>Hubble</surname> <given-names>E.</given-names> </name> </person-group>, <year>1936</year>, <source>The Realm of the Nebulae</source>, <publisher-name>Yale Univ. Press</publisher-name>. <publisher-loc>New Haven, CT</publisher-loc></mixed-citation> </ref>
5+
<ref id="bib4"> <mixed-citation publication-type="other" xlink:type="simple" xmlns:xlink="http://www.w3.org/1999/xlink"> <person-group person-group-type="author"> <name> <surname>Prieto</surname> <given-names>M.</given-names> </name> <name> <surname>Aguerri</surname> <given-names>J. A. L.</given-names> </name> <name> <surname>Varela</surname> <given-names>A. M.</given-names> </name> <name> <surname>Munoz-Tun&oacute;n</surname> <given-names>C.</given-names> </name> </person-group>, <year>2000</year>, <source>A&amp;A, in press</source></mixed-citation> </ref>
6+
<ref id="bib5"><mixed-citation publication-type="book"><person-group person-group-type="author"><collab>Lightkurve Collaboration</collab></person-group><etal>et al</etal>., <year>2018</year>, <source>Lightkurve: Kepler and TESS time series analysis in Python, Astrophysics Source Code Library</source>. <comment>record (ascl:1812.013)</comment></mixed-citation></ref>
7+
<ref id="bib6"> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <string-name> <surname>Colberg</surname> &nbsp;<given-names>J. M.</given-names></string-name> &nbsp;<etal>et al.</etal></person-group>, <year>2008</year>, <source>MNRAS</source>, <volume>387</volume>, <fpage>933</fpage> &nbsp;<pub-id pub-id-type="doi">10.1111/j.1365-2966.2008.13307.x</pub-id></mixed-citation> </ref>
8+
<ref id="bib7"><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Schmidt</surname> <given-names>S. P.</given-names></string-name> <etal>et al.</etal></person-group>, <year>2025</year>, <comment>preprint</comment> (<pub-id pub-id-type="arxiv">arXiv:2501.18477</pub-id>)</mixed-citation></ref>

0 commit comments

Comments
 (0)