Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
340 changes: 340 additions & 0 deletions adsrefpipe/refparsers/OUPFTxml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,340 @@

import sys, os
import regex as re
import argparse
from typing import List, Dict

from adsputils import setup_logging, load_config
# module-level logger for this parser, created under the 'refparsers' name
logger = setup_logging('refparsers')
# module-level configuration dictionary, filled with whatever load_config() returns
config = {}
config.update(load_config())

from adsrefpipe.refparsers.reference import XMLreference, ReferenceError
from adsrefpipe.refparsers.toREFs import XMLtoREFs
from adsrefpipe.refparsers.unicode import tostr


class OUPFTreference(XMLreference):
    """
    This class handles parsing OUP references in XML format. It extracts citation information such as authors,
    year, journal, title, volume, pages, DOI, and eprint, and stores the parsed details.

    Examples from MNRAS:

    1. Common cases:
    <ref id="B22"> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <string-name name-style="western"> <surname>Abadi</surname> <given-names>M.</given-names> </string-name> <etal>et al</etal>.</person-group>, <year>2016</year>, <comment>preprint (<ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1603.04467" xmlns:xlink="http://www.w3.org/1999/xlink">arXiv:1603.04467</ext-link>)</comment></mixed-citation> </ref>
    <ref id="B2"> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <collab>Astropy Collaboration</collab> <etal>et al</etal>.</person-group>, <year>2013</year>, <source/>A&amp;A, <volume>558</volume>, <fpage>A33</fpage></mixed-citation> </ref>

    2. Rarer: lack of <person-group>
    <ref id="B44"> <mixed-citation publication-type="journal"> <collab>Planck Collaboration</collab>VI, <year>2018</year>, <comment>preprint (<ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1807.06209" xmlns:xlink="http://www.w3.org/1999/xlink">arXiv:1807.06209</ext-link>)</comment></mixed-citation> </ref>

    Note: unfortunately for PTEP we have volumes where the xlink namespace is not defined in each reference,
    so we have to add it ourself to satisfy the XML parser. Here's an example:

    <ref id="B24"> <label>[24]</label> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <string-name name-style="western"> <surname>Audenaert</surname> <given-names>K.</given-names> </string-name>, <string-name name-style="western"><surname>Eisert</surname><given-names>J.</given-names></string-name>, <string-name name-style="western"><surname>Plenio</surname><given-names>M. B.</given-names></string-name>, and <string-name name-style="western"><surname>Werner</surname><given-names>R. F.</given-names></string-name></person-group>, <source/>Phys. Rev. A<volume>66</volume>, <fpage>042327</fpage> (<year>2002</year>) [<ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/quant-ph/0205025">arXiv:quant-ph/0205025</ext-link>] [<ext-link ext-link-type="uri" xlink:href="http://www.inspirehep.net/search?p=find+EPRINT+quant-ph/0205025">Search <sc>in</sc>SPIRE</ext-link>]. (<comment><ext-link ext-link-type="doi" xlink:href="http://doi.org/10.1103/PhysRevA.66.042327">http://dx.doi.org/10.1103/PhysRevA.66.042327</ext-link></comment>)</mixed-citation> </ref>
    """

    # to match `amp`
    re_match_amp = re.compile(r'__amp;?')
    # to match and remove <etal> tags and their contents (case-insensitive)
    re_replace_etal = re.compile(r'<etal>.*</etal>', flags=re.IGNORECASE)
    # to match and remove unnecessary XML processing instructions
    re_replace_useless_tag = re.compile(r'(<\?[^\?>]*\?>)')
    # to match and remove extra spaces before a semicolon
    re_replace_extra_space = re.compile(r'^\s*;\s*')
    # to match "ASP Conf. Ser. Vol. <number>" pattern
    # (raw string: `\d` is an invalid escape sequence in a plain string literal)
    re_ASPC = re.compile(r'ASP Conf[.] Ser[.] Vol[.] (\d+)')
    # to match "Astrophysics and Space Science Library, Vol. <number>" pattern;
    # the number lands in group 1 or group 2 depending on which alternative matched
    re_ASSL = re.compile(r'Astrophysics and Space Science Library, Vol[.] (\d+)|Vol[.] (\d+) of Astrophysics and Space Science Library')
    # to match any alphabetic characters in a year string
    re_char_in_year = re.compile(r'[A-Za-z]')
    # to match the words 'thesis' or 'dissertation' (case-insensitive)
    re_thesis = re.compile(r'(thesis|dissertation)', flags=re.IGNORECASE)

    def parse(self):
        """
        parse the OUPFT reference and extract citation information such as authors, year, title, and DOI

        sets `self.parsed` to 0 on entry and to 1 once every field has been stored

        :return:
        """
        self.parsed = 0

        refstr = self.dexml(self.reference_str.toxml())

        authors = self.parse_authors()
        year = self.xmlnode_nodecontents('year')
        if year:
            # drop letter suffixes such as the `a` in `2016a`
            year = self.re_char_in_year.sub('', year)

        title = self.xmlnode_nodecontents('article-title') or self.xmlnode_nodecontents('chapter-title') or self.xmlnode_nodecontents('bookTitle')

        comment = self.xmlnode_nodecontents('comment')

        volume = ''
        journal = self.xmlnode_nodecontents('source')
        if journal:
            journal = self.re_match_amp.sub('&', journal)
        if not journal:
            match = self.re_ASPC.search(refstr)
            if match:
                journal = 'ASPC'
                volume = match.group(1)
            else:
                match = self.re_ASSL.search(refstr)
                if match:
                    journal = 'ASSL'
                    # only one of the two groups is populated, depending on the alternative that matched
                    volume = match.group(1) or match.group(2)
        if not journal:
            journal = self.xmlnode_nodecontents('conf-name')
        if not journal:
            # see if it is thesis
            if self.re_thesis.search(refstr):
                journal = 'Thesis'

        if not volume:
            # guard with `or ''` so a missing node cannot break the string calls
            volume = (self.xmlnode_nodecontents('volume') or '').lower().replace('vol', '').strip()

        pages = self.xmlnode_nodecontents('fpage')
        series = self.xmlnode_nodecontents('series')

        citation_type = self.xmlnode_attribute('nlm-citation', 'citation-type') or self.xmlnode_attribute('citation', 'citation-type')
        if comment and citation_type in ['journal', 'confproc'] and not volume and not pages:
            # best effort: a two-token comment is taken to be "<volume> <pages>"
            try:
                volume, pages = comment.split()
            except (AttributeError, ValueError):
                pass

        self['series'] = series

        doi = self.parse_doi(refstr, comment)
        eprint = self.parse_eprint(refstr)

        # these fields are already formatted the way we expect them
        # NOTE(review): the original assigned these fields twice; the second assignment always won, so
        # only those effective values are kept. The overwritten first pass used parse_volume(volume),
        # parse_pages(..., letters=...) and journal.replace('amp', '&') -- confirm whether any of that
        # was the intended behavior before re-introducing it.
        self['authors'] = authors
        self['year'] = year
        self['jrlstr'] = journal
        self['ttlstr'] = title
        self['volume'] = volume
        self['page'], self['qualifier'] = self.parse_pages(pages)
        self['pages'] = self.combine_page_qualifier(self['page'], self['qualifier'])

        if doi:
            self['doi'] = doi
        if eprint:
            self['eprint'] = eprint

        self['refstr'] = self.get_reference_str()
        if not self['refstr']:
            self['refplaintext'] = self.get_reference_plain_text(self.to_ascii(refstr))

        self.parsed = 1

    def _clean_name_part(self, name_part) -> str:
        """
        strip XML processing instructions and a leading semicolon from a piece of an author name

        :param name_part: raw content of a <surname> or <given-names> tag
        :return: the cleaned-up name part
        """
        return self.re_replace_extra_space.sub('', self.re_replace_useless_tag.sub('', tostr(name_part)))

    def parse_authors(self) -> str:
        """
        parse the authors from the reference string and format them accordingly

        author nodes are looked up as <person-group person-group-type="author">, then <name>, then
        <string-name>; any <collab> entries are placed ahead of the individual authors

        :return: a formatted string of authors
        """
        authors = self.xmlnode_nodescontents('person-group', attrs={'person-group-type': 'author'}, keepxml=1) or \
                  self.xmlnode_nodescontents('name', keepxml=1) or \
                  self.xmlnode_nodescontents('string-name', keepxml=1)

        collab = self.xmlnode_nodescontents('collab')

        author_list = []
        for author in authors:
            an_author = ''
            # some of name tags include junk xml tags, remove them
            # <person-group person-group-type='author'><name name-style='western'><surname><?A3B2 twb 0.2w?><?A3B2 tlsb -0.01w?>Cunningham</surname>
            author, lastname = self.extract_tag(author, 'surname')
            author, givennames = self.extract_tag(author, 'given-names')
            if lastname:
                an_author = self._clean_name_part(lastname)
            if an_author and givennames:
                an_author += ', ' + self._clean_name_part(givennames)
            if an_author:
                author_list.append(an_author)
            else:
                # when there is no tag (ie, <person-group person-group-type='author'>Schultheis M.<etal>et al</etal>.)
                author_list.append(self.re_replace_etal.sub(' et. al', author))

        if collab:
            author_list = collab + author_list

        authors = ", ".join(author_list)
        authors = self.re_match_amp.sub('', authors)

        return authors

    def parse_doi(self, refstr: str, comment: str) -> str:
        """
        parse the DOI from the reference string or comment field, falling back to extracting it from the refstr

        attempts to extract a DOI from different sources: first, from the 'pub-id' XML node content; if not found,
        it checks the comment field; if neither contains the DOI, it tries to extract it from the reference string.

        :param refstr: the reference string potentially containing the DOI
        :param comment: a comment related to the reference that may contain the DOI
        :return: the extracted DOI if found, or an empty string if not
        """
        doi = self.match_doi(self.xmlnode_nodecontents('pub-id', attrs={'pub-id-type': 'doi'}))
        if doi:
            return doi
        # see if there is a doi in the comment field
        doi = self.match_doi(comment)
        if doi:
            return doi
        # attempt to extract it from refstr
        doi = self.match_doi(refstr)
        if doi:
            return doi
        return ''

    def parse_eprint(self, refstr: str) -> str:
        """
        parse the eprint from the reference string

        attempts to extract the eprint from the 'pub-id' and 'elocation-id' XML nodes,
        then tries to extract it from the reference string if not found in the XML nodes

        :param refstr: the reference string potentially containing the eprint
        :return: the extracted eprint prefixed with `arXiv:` if found, or an empty string if not
        """
        # note that the id might have been identified incorrectly, hence verify it
        # <pub-id pub-id-type="arxiv">arXiv:10.1029/2001JB000553</pub-id>
        eprint = self.match_arxiv_id(self.xmlnode_nodecontents('pub-id', attrs={'pub-id-type': 'arxiv'}))
        if eprint:
            return f"arXiv:{eprint}"
        # <elocation-id content-type="arxiv">arXiv:1309.6955</elocation-id>
        eprint = self.match_arxiv_id(self.xmlnode_nodecontents('elocation-id', attrs={'content-type': 'arxiv'}))
        if eprint:
            return f"arXiv:{eprint}"
        # attempt to extract it from refstr
        eprint = self.match_arxiv_id(refstr)
        if eprint:
            return f"arXiv:{eprint}"
        return ''


class OUPFTtoREFs(XMLtoREFs):
    """
    This class converts OUP XML references to a standardized reference format. It processes raw OUP references from
    either a file or a buffer and outputs parsed references, including bibcodes, authors, volume, pages, and DOI.
    """

    # to clean up XML blocks by removing certain tags
    block_cleanup = [
        (re.compile(r'</?ext-link.*?>'), ''),
        (re.compile(r'</?uri.*?>'), ''),
        (re.compile(r'<etal\s*/>'), '<surname>et al.</surname>'),
    ]
    # to clean up references by replacing certain patterns
    reference_cleanup = [
        (re.compile(r'</?(ext-link|x).*?>'), ''),
        (re.compile(r'\sxlink:type="simple"'), ''),
        (re.compile(r'\s+xlink:href='), ' href='),
        (re.compile(r'<inline-formula>.*?</inline-formula>'), ''),
        (re.compile(r'\s+xlink:type='), ' type='),
        (re.compile(r'</?x.*?>'), ''),
    ]

    # to match <person-group> tags and their contents
    re_author_tag = re.compile(r'(<person-group.*</person-group>)')
    # to match author placeholder represented by three or more hyphens
    re_author_placeholder = re.compile(r'(-{3,})')

    def __init__(self, filename: str, buffer: str):
        """
        initialize the OUPFTtoREFs object to process OUP references

        :param filename: the path to the source file
        :param buffer: the XML references as a buffer
        """
        XMLtoREFs.__init__(self, filename, buffer, parsername=OUPFTtoREFs, tag='ref', cleanup=self.block_cleanup, encoding='ISO-8859-1')

    def cleanup(self, reference: str) -> str:
        """
        clean up the reference string by replacing specific patterns

        the substitutions in `reference_cleanup` are applied one after another, in list order

        :param reference: the raw reference string to clean
        :return: cleaned reference string
        """
        for (compiled_re, replace_str) in self.reference_cleanup:
            reference = compiled_re.sub(replace_str, reference)
        return reference

    def missing_authors(self, prev_reference: str, cur_reference: str) -> str:
        """
        replace author placeholder in the current reference with authors from the previous reference

        :param prev_reference: the previous reference containing the author information
        :param cur_reference: the current reference containing the author placeholder
        :return: the current reference with the author placeholder replaced, or the original current reference if no placeholder is found
        """
        if prev_reference and self.re_author_placeholder.search(cur_reference):
            match = self.re_author_tag.search(prev_reference)
            if match:
                return self.re_author_placeholder.sub(match.group(0), cur_reference)
        return cur_reference

    def process_and_dispatch(self) -> List[Dict[str, List[Dict[str, str]]]]:
        """
        perform reference cleaning and parsing, then dispatch the parsed references

        a reference that raises ReferenceError is logged and skipped; the remaining references of the
        block are still processed

        :return: a list of dictionaries containing bibcodes and parsed references
        """
        references = []
        for raw_block_references in self.raw_references:
            bibcode = raw_block_references['bibcode']
            block_references = raw_block_references['block_references']
            item_nums = raw_block_references.get('item_nums', [])

            parsed_references = []
            prev_reference = ''
            for i, raw_reference in enumerate(block_references):
                reference = self.cleanup(raw_reference)
                reference = self.missing_authors(prev_reference, reference)
                # remember the reference with its authors filled in, so a run of placeholders keeps resolving
                prev_reference = reference

                logger.debug("OUPFTxml: parsing %s", reference)
                try:
                    oup_reference = OUPFTreference(reference)
                    parsed_references.append(self.merge({**oup_reference.get_parsed_reference(), 'refraw': raw_reference}, self.any_item_num(item_nums, i)))
                except ReferenceError as error_desc:
                    logger.error("OUPFTxml: error parsing reference: %s", error_desc)

            references.append({'bibcode': bibcode, 'references': parsed_references})
            # report the number of references parsed for this bibcode, not the number of blocks seen so far
            logger.debug("%s: parsed %d references", bibcode, len(parsed_references))

        return references


# This is the main program used for manual testing and verification of OUPFT XML references.
# It allows parsing references from either a file or a buffer, and if no input is provided,
# it runs the stub test file and compares the output against the expected parsed results.
# The outcome is printed to indicate whether the parsing was successful or not.
from adsrefpipe.tests.unittests.stubdata import parsed_references
if __name__ == '__main__':      # pragma: no cover
    # command-line driver: parse from a file, from an inline buffer, or fall back to the stub test file
    arg_parser = argparse.ArgumentParser(description='Parse OUPFT references')
    arg_parser.add_argument('-f', '--filename', help='the path to source file')
    arg_parser.add_argument('-b', '--buffer', help='xml reference(s)')
    cli_args = arg_parser.parse_args()

    if cli_args.filename:
        print(OUPFTtoREFs(filename=cli_args.filename, buffer=None).process_and_dispatch())
    elif cli_args.buffer:
        print(OUPFTtoREFs(buffer=cli_args.buffer, filename=None).process_and_dispatch())
    else:
        # if no reference source is provided, just run the source test file
        stub_path = os.path.abspath(os.path.dirname(__file__) + '/../tests/unittests/stubdata/test.oupft.xml')
        outcome = OUPFTtoREFs(filename=stub_path, buffer=None).process_and_dispatch()
        # NOTE(review): the expected value is `parsed_references.parsed_oup`; confirm this is the
        # fixture meant for the OUPFT stub file rather than the one for the plain OUP parser
        print('Test passed!' if outcome == parsed_references.parsed_oup else 'Test failed!')
    sys.exit(0)
2 changes: 2 additions & 0 deletions adsrefpipe/tests/unittests/stubdata/parsed_references.py

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions adsrefpipe/tests/unittests/stubdata/test.oupft.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<ADSBIBCODE>2001FOO...999..999X</ADSBIBCODE>
<ref id="bib1"> <mixed-citation publication-type="book" xlink:type="simple" xmlns:xlink="http://www.w3.org/1999/xlink"> <person-group person-group-type="author"> <name> <surname>Abramowitz</surname> <given-names>M.</given-names> </name> <name> <surname>Stegun</surname> <given-names>I.</given-names> </name> </person-group>, <year>1964</year>, <source>Handbook of Mathematical Functions</source>. <publisher-name>Dover</publisher-name>, <publisher-loc>New York</publisher-loc></mixed-citation> </ref>
<ref id="bib2"> <mixed-citation publication-type="journal" xlink:type="simple" xmlns:xlink="http://www.w3.org/1999/xlink"> <person-group person-group-type="author"> <name> <surname>Andredakis</surname> <given-names>Y.</given-names> </name> <name> <surname>Peletier</surname> <given-names>R.</given-names> </name> <name> <surname>Balcells</surname> <given-names>M.</given-names> </name> </person-group>, <year>1995</year>, <source>MNRAS</source>, <volume>275</volume>, <fpage>874</fpage></mixed-citation> </ref>
<ref id="bib3"> <mixed-citation publication-type="book" xlink:type="simple" xmlns:xlink="http://www.w3.org/1999/xlink"> <person-group person-group-type="author"> <name> <surname>Hubble</surname> <given-names>E.</given-names> </name> </person-group>, <year>1936</year>, <source>The Realm of the Nebulae</source>, <publisher-name>Yale Univ. Press</publisher-name>. <publisher-loc>New Haven, CT</publisher-loc></mixed-citation> </ref>
<ref id="bib4"> <mixed-citation publication-type="other" xlink:type="simple" xmlns:xlink="http://www.w3.org/1999/xlink"> <person-group person-group-type="author"> <name> <surname>Prieto</surname> <given-names>M.</given-names> </name> <name> <surname>Aguerri</surname> <given-names>J. A. L.</given-names> </name> <name> <surname>Varela</surname> <given-names>A. M.</given-names> </name> <name> <surname>Munoz-Tun&oacute;n</surname> <given-names>C.</given-names> </name> </person-group>, <year>2000</year>, <source>A&amp;A, in press</source></mixed-citation> </ref>
<ref id="bib5"><mixed-citation publication-type="book"><person-group person-group-type="author"><collab>Lightkurve Collaboration</collab></person-group><etal>et al</etal>., <year>2018</year>, <source>Lightkurve: Kepler and TESS time series analysis in Python, Astrophysics Source Code Library</source>. <comment>record (ascl:1812.013)</comment></mixed-citation></ref>
<ref id="bib6"> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <string-name> <surname>Colberg</surname> &nbsp;<given-names>J. M.</given-names></string-name> &nbsp;<etal>et al.</etal></person-group>, <year>2008</year>, <source>MNRAS</source>, <volume>387</volume>, <fpage>933</fpage> &nbsp;<pub-id pub-id-type="doi">10.1111/j.1365-2966.2008.13307.x</pub-id></mixed-citation> </ref>
<ref id="bib7"><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Schmidt</surname> <given-names>S. P.</given-names></string-name> <etal>et al.</etal></person-group>, <year>2025</year>, <comment>preprint</comment> (<pub-id pub-id-type="arxiv">arXiv:2501.18477</pub-id>)</mixed-citation></ref>
Loading
Loading