adsabs
diff --git a/‎adsrefpipe/refparsers/OUPFTxml.py‎
Lines changed: 340 additions & 0 deletions b/‎adsrefpipe/refparsers/OUPFTxml.py‎
Lines changed: 340 additions & 0 deletions
diff --git a/‎adsrefpipe/tests/unittests/stubdata/parsed_references.py‎
Lines changed: 2 additions & 0 deletions b/‎adsrefpipe/tests/unittests/stubdata/parsed_references.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎adsrefpipe/tests/unittests/stubdata/test.oupft.xml‎
Lines changed: 8 additions & 0 deletions b/‎adsrefpipe/tests/unittests/stubdata/test.oupft.xml‎
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,340 @@
+
+import sys, os
+import regex as re
+import argparse
+from typing import List, Dict
+
+from adsputils import setup_logging, load_config
+logger = setup_logging('refparsers')
+config = {}
+config.update(load_config())
+
+from adsrefpipe.refparsers.reference import XMLreference, ReferenceError
+from adsrefpipe.refparsers.toREFs import XMLtoREFs
+from adsrefpipe.refparsers.unicode import tostr
+
+
+class OUPFTreference(XMLreference):
+    """
+    This class handles parsing OUP references in XML format. It extracts citation information such as authors,
+    year, journal, title, volume, pages, DOI, and eprint, and stores the parsed details.
+    
+    Examples from MNRAS:
+
+    1. Common cases:
+    <ref id="B22"> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <string-name name-style="western"> <surname>Abadi</surname> <given-names>M.</given-names> </string-name> <etal>et al</etal>.</person-group>, <year>2016</year>, <comment>preprint (<ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1603.04467" xmlns:xlink="http://www.w3.org/1999/xlink">arXiv:1603.04467</ext-link>)</comment></mixed-citation> </ref>
+    <ref id="B2"> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <collab>Astropy Collaboration</collab> <etal>et al</etal>.</person-group>, <year>2013</year>, <source/>A&amp;A, <volume>558</volume>, <fpage>A33</fpage></mixed-citation> </ref>
+
+    2. Rarer: lack of <person-group>
+    <ref id="B44"> <mixed-citation publication-type="journal"> <collab>Planck Collaboration</collab>VI, <year>2018</year>, <comment>preprint (<ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1807.06209" xmlns:xlink="http://www.w3.org/1999/xlink">arXiv:1807.06209</ext-link>)</comment></mixed-citation> </ref>
+
+    Note: unfortunately for PTEP we have volumes where the xlink namespace is not defined in each reference, 
+    so we have to add it ourself to satisfy the XML parser. Here's an example:
+
+    <ref id="B24"> <label>[24]</label> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <string-name name-style="western"> <surname>Audenaert</surname> <given-names>K.</given-names> </string-name>, <string-name name-style="western"><surname>Eisert</surname><given-names>J.</given-names></string-name>, <string-name name-style="western"><surname>Plenio</surname><given-names>M. B.</given-names></string-name>, and <string-name name-style="western"><surname>Werner</surname><given-names>R. F.</given-names></string-name></person-group>, <source/>Phys. Rev. A<volume>66</volume>, <fpage>042327</fpage> (<year>2002</year>) [<ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/quant-ph/0205025">arXiv:quant-ph/0205025</ext-link>] [<ext-link ext-link-type="uri" xlink:href="http://www.inspirehep.net/search?p=find+EPRINT+quant-ph/0205025">Search <sc>in</sc>SPIRE</ext-link>]. (<comment><ext-link ext-link-type="doi" xlink:href="http://doi.org/10.1103/PhysRevA.66.042327">http://dx.doi.org/10.1103/PhysRevA.66.042327</ext-link></comment>)</mixed-citation> </ref>
+    """
+
+    # to match `amp`
+    re_match_amp = re.compile(r'__amp;?')
+    # to match and remove <etal> tags and their contents (case-insensitive)
+    re_replace_etal = re.compile(r'<etal>.*</etal>', flags=re.IGNORECASE)
+    # to match and remove unnecessary XML processing instructions
+    re_replace_useless_tag = re.compile(r'(<\?[^\?>]*\?>)')
+    # to match and remove extra spaces before a semicolon
+    re_replace_extra_space = re.compile(r'^\s*;\s*')
+    # to match "ASP Conf. Ser. Vol. <number>" pattern
+    re_ASPC = re.compile('ASP Conf[.] Ser[.] Vol[.] (\d+)')
+    # to match "Astrophysics and Space Science Library, Vol. <number>" pattern
+    re_ASSL = re.compile('Astrophysics and Space Science Library, Vol[.] (\d+)|Vol[.] (\d+) of Astrophysics and Space Science Library')
+    # to match any alphabetic characters in a year string
+    re_char_in_year = re.compile('[A-Za-z]')
+    # to match the words 'thesis' or 'dissertation' (case-insensitive)
+    re_thesis = re.compile('(thesis|dissertation)', flags=re.IGNORECASE)
+
+    def parse(self):
+        """
+        parse the OUPFT reference and extract citation information such as authors, year, title, and DOI
+
+        :return:
+        """
+        self.parsed = 0
+
+        refstr = self.dexml(self.reference_str.toxml())
+
+        authors = self.parse_authors()
+        year = self.xmlnode_nodecontents('year')
+        if year:
+            year = self.re_char_in_year.sub('', year)
+
+        title = self.xmlnode_nodecontents('article-title') or self.xmlnode_nodecontents('chapter-title') or self.xmlnode_nodecontents('bookTitle')
+
+        comment = self.xmlnode_nodecontents('comment')
+
+        volume = ''
+        journal = self.xmlnode_nodecontents('source')
+        if journal:
+            journal = self.re_match_amp.sub('&', journal)
+        if not journal:
+            match = self.re_ASPC.search(refstr)
+            if match:
+                journal = 'ASPC'
+                volume = match.group(1)
+            else:
+                match = self.re_ASSL.search(refstr)
+                if match:
+                    journal = 'ASSL'
+                    volume = match.group(1)
+        if not journal:
+            journal = self.xmlnode_nodecontents('conf-name')
+            if not journal:
+                # see if it is thesis
+                if self.re_thesis.search(refstr):
+                    journal = 'Thesis'
+
+        if not volume:
+            volume = self.xmlnode_nodecontents('volume').lower().replace('vol', '').strip()
+
+        pages = self.xmlnode_nodecontents('fpage')
+        series = self.xmlnode_nodecontents('series')
+
+        type = self.xmlnode_attribute('nlm-citation', 'citation-type') or self.xmlnode_attribute('citation', 'citation-type')
+        if comment and type in ['journal', 'confproc'] and not volume and not pages:
+            try:
+                volume, pages = comment.split()
+            except:
+                pass
+
+        # these fields are already formatted the way we expect them
+        self['authors'] = authors
+        self['year'] = year
+        self['jrlstr'] = journal.replace('amp', '&')
+        self['ttlstr'] = title
+        self['volume'] = self.parse_volume(volume)
+        self['page'], self['qualifier'] = self.parse_pages(pages, letters="ABCDEFGHIJKLMNOPQRSTUVWXYZ")
+        self['pages'] = self.combine_page_qualifier(self['page'], self['qualifier'])
+        self['series'] = series
+
+        doi = self.parse_doi(refstr, comment)
+        eprint = self.parse_eprint(refstr)
+
+        # these fields are already formatted the way we expect them
+        self['authors'] = authors
+        self['year'] = year
+        self['jrlstr'] = journal
+        self['ttlstr'] = title
+        self['volume'] = volume
+        self['page'], self['qualifier'] = self.parse_pages(pages)
+        self['pages'] = self.combine_page_qualifier(self['page'], self['qualifier'])
+
+        if doi:
+            self['doi'] = doi
+        if eprint:
+            self['eprint'] = eprint
+
+        self['refstr'] = self.get_reference_str()
+        if not self['refstr']:
+            self['refplaintext'] = self.get_reference_plain_text(self.to_ascii(refstr))
+
+        self.parsed = 1
+
+    def parse_authors(self) -> str:
+        """
+        parse the authors from the reference string and format them accordingly
+
+        :return: a formatted string of authors
+        """
+        authors = self.xmlnode_nodescontents('person-group', attrs={'person-group-type': 'author'}, keepxml=1) or \
+                  self.xmlnode_nodescontents('name', keepxml=1) or \
+                  self.xmlnode_nodescontents('string-name', keepxml=1)
+
+        collab = self.xmlnode_nodescontents('collab')
+
+        author_list = []
+        for author in authors:
+            an_author = ''
+            # some of name tags include junk xml tags, remove them
+            # <person-group person-group-type='author'><name name-style='western'><surname><?A3B2 twb 0.2w?><?A3B2 tlsb -0.01w?>Cunningham</surname>
+            author, lastname = self.extract_tag(author, 'surname')
+            author, givennames = self.extract_tag(author, 'given-names')
+            if lastname: an_author = self.re_replace_extra_space.sub('', self.re_replace_useless_tag.sub('', tostr(lastname)))
+            if an_author and givennames: an_author += ', ' + self.re_replace_extra_space.sub('', self.re_replace_useless_tag.sub('', tostr(givennames)))
+            if an_author:
+                author_list.append(an_author)
+            else:
+                # when there is no tag (ie, <person-group person-group-type='author'>Schultheis M.<etal>et al</etal>.)
+                author_list.append(self.re_replace_etal.sub(' et. al', author))
+
+        if collab:
+            author_list = collab + author_list
+
+        authors = ", ".join(author_list)
+        authors = self.re_match_amp.sub('', authors)
+
+        return authors
+
+    def parse_doi(self, refstr: str, comment: str) -> str:
+        """
+        parse the DOI from the reference string or comment field, falling back to extracting it from the refstr
+
+        attempts to extract a DOI from different sources: first, from the 'pub-id' XML node content; if not found,
+        it checks the comment field; if neither contains the DOI, it tries to extract it from the reference string.
+
+        :param refstr: the reference string potentially containing the DOI
+        :param comment: a comment related to the reference that may contain the DOI
+        :return: the extracted DOI if found, or an empty string if not
+        """
+        doi = self.match_doi(self.xmlnode_nodecontents('pub-id', attrs={'pub-id-type': 'doi'}))
+        if doi:
+            return doi
+        # see if there is a doi in the comment field
+        doi = self.match_doi(comment)
+        if doi:
+            return doi
+        # attempt to extract it from refstr
+        doi = self.match_doi(refstr)
+        if doi:
+            return doi
+        return ''
+
+    def parse_eprint(self, refstr: str) -> str:
+        """
+        parse the eprint from the reference string
+
+        attempts to extract the eprint from the 'pub-id' and 'elocation-id' XML nodes,
+        then tries to extract it from the reference string if not found in the XML nodes
+
+        :param refstr: the reference string potentially containing the eprint
+        :return: the extracted eprint if found, or an empty string if not
+        """
+        # note that the id might have been identified incorrectly, hence verify it
+        # <pub-id pub-id-type="arxiv">arXiv:10.1029/2001JB000553</pub-id>
+        eprint = self.match_arxiv_id(self.xmlnode_nodecontents('pub-id', attrs={'pub-id-type': 'arxiv'}))
+        if eprint:
+            return f"arXiv:{eprint}"
+        # <elocation-id content-type="arxiv">arXiv:1309.6955</elocation-id>
+        eprint = self.match_arxiv_id(self.xmlnode_nodecontents('elocation-id', attrs={'content-type': 'arxiv'}))
+        if eprint:
+            return f"arXiv:{eprint}"
+        # attempt to extract it from refstr
+        eprint = self.match_arxiv_id(refstr)
+        if eprint:
+            return f"arXiv:{eprint}"
+        return ''
+
+
+class OUPFTtoREFs(XMLtoREFs):
+    """
+    This class converts OUP XML references to a standardized reference format. It processes raw OUP references from
+    either a file or a buffer and outputs parsed references, including bibcodes, authors, volume, pages, and DOI.
+    """
+
+    # to clean up XML blocks by removing certain tags
+    block_cleanup = [
+        (re.compile(r'</?ext-link.*?>'), ''),
+        (re.compile(r'</?uri.*?>'), ''),
+        (re.compile(r'<etal\s*/>'), '<surname>et al.</surname>'),
+    ]
+    # to clean up references by replacing certain patterns
+    reference_cleanup = [
+        (re.compile(r'</?(ext-link|x).*?>'), ''),
+        (re.compile(r'\sxlink:type="simple"'), ''),
+        (re.compile(r'\s+xlink:href='), ' href='),
+        (re.compile(r'<inline-formula>.*?</inline-formula>'), ''),
+        (re.compile(r'\s+xlink:type='), ' type='),
+        (re.compile(r'</?x.*?>'), ''),
+    ]
+
+    # to match <person-group> tags and their contents
+    re_author_tag = re.compile(r'(<person-group.*</person-group>)')
+    # to match author placeholder represented by three or more hyphens
+    re_author_placeholder = re.compile(r'(-{3,})')
+
+    def __init__(self, filename: str, buffer: str):
+        """
+        initialize the OUPtoREFs object to process OUP references
+
+        :param filename: the path to the source file
+        :param buffer: the XML references as a buffer
+        """
+        XMLtoREFs.__init__(self, filename, buffer, parsername=OUPFTtoREFs, tag='ref', cleanup=self.block_cleanup, encoding='ISO-8859-1')
+
+    def cleanup(self, reference: str) -> str:
+        """
+        clean up the reference string by replacing specific patterns
+
+        :param reference: the raw reference string to clean
+        :return: cleaned reference string
+        """
+        for (compiled_re, replace_str) in self.reference_cleanup:
+            reference = compiled_re.sub(replace_str, reference)
+        return reference
+
+    def missing_authors(self, prev_reference: str, cur_reference: str) -> str:
+        """
+        replace author placeholder in the current reference with authors from the previous reference
+
+        :param prev_reference: the previous reference containing the author information
+        :param cur_reference: the current reference containing the author placeholder
+        :return: the current reference with the author placeholder replaced, or the original current reference if no placeholder is found
+        """
+        if prev_reference and self.re_author_placeholder.search(cur_reference):
+            match = self.re_author_tag.search(prev_reference)
+            if match:
+                return self.re_author_placeholder.sub(match.group(0), cur_reference)
+        return cur_reference
+
+    def process_and_dispatch(self) -> List[Dict[str, List[Dict[str, str]]]]:
+        """
+        perform reference cleaning and parsing, then dispatch the parsed references
+
+        :return: a list of dictionaries containing bibcodes and parsed references
+        """
+        references = []
+        for raw_block_references in self.raw_references:
+            bibcode = raw_block_references['bibcode']
+            block_references = raw_block_references['block_references']
+            item_nums = raw_block_references.get('item_nums', [])
+
+            parsed_references = []
+            prev_reference = ''
+            for i, raw_reference in enumerate(block_references):
+                reference = self.cleanup(raw_reference)
+                reference = self.missing_authors(prev_reference, reference)
+                prev_reference = reference
+
+                logger.debug("OUPxml: parsing %s" % reference)
+                try:
+                    oup_reference = OUPFTreference(reference)
+                    parsed_references.append(self.merge({**oup_reference.get_parsed_reference(), 'refraw': raw_reference}, self.any_item_num(item_nums, i)))
+                except ReferenceError as error_desc:
+                    logger.error("OUPFTxml: error parsing reference: %s" % error_desc)
+
+            references.append({'bibcode': bibcode, 'references': parsed_references})
+            logger.debug("%s: parsed %d references" % (bibcode, len(references)))
+
+        return references
+
+
+# This is the main program used for manual testing and verification of the OUPFT XML reference parser.
+# It parses references from either a file or a buffer; if no input is provided,
+# it parses the bundled stub data file and compares the output against the expected parsed results.
+# The outcome of that comparison is printed to indicate whether the parsing was successful or not.
+from adsrefpipe.tests.unittests.stubdata import parsed_references
+if __name__ == '__main__':  # pragma: no cover
+    parser = argparse.ArgumentParser(description='Parse OUPFT references')
+    parser.add_argument('-f', '--filename', help='the path to source file')
+    parser.add_argument('-b', '--buffer', help='xml reference(s)')
+    args = parser.parse_args()
+    if args.filename:
+        print(OUPFTtoREFs(filename=args.filename, buffer=None).process_and_dispatch())
+    elif args.buffer:
+        print(OUPFTtoREFs(buffer=args.buffer, filename=None).process_and_dispatch())
+    # if no reference source is provided, just run the source test file
+    elif not args.filename and not args.buffer:
+        filename = os.path.abspath(os.path.dirname(__file__) + '/../tests/unittests/stubdata/test.oupft.xml')
+        result = OUPFTtoREFs(filename=filename, buffer=None).process_and_dispatch()
+        if result == parsed_references.parsed_oup:
+            print('Test passed!')
+        else:
+            print('Test failed!')
+    sys.exit(0)
@@ -0,0 +1,8 @@
+<ADSBIBCODE>2001FOO...999..999X</ADSBIBCODE>
+<ref id="bib1"> <mixed-citation publication-type="book" xlink:type="simple" xmlns:xlink="http://www.w3.org/1999/xlink"> <person-group person-group-type="author"> <name> <surname>Abramowitz</surname> <given-names>M.</given-names> </name> <name> <surname>Stegun</surname> <given-names>I.</given-names> </name> </person-group>, <year>1964</year>, <source>Handbook of Mathematical Functions</source>. <publisher-name>Dover</publisher-name>, <publisher-loc>New York</publisher-loc></mixed-citation> </ref>
+<ref id="bib2"> <mixed-citation publication-type="journal" xlink:type="simple" xmlns:xlink="http://www.w3.org/1999/xlink"> <person-group person-group-type="author"> <name> <surname>Andredakis</surname> <given-names>Y.</given-names> </name> <name> <surname>Peletier</surname> <given-names>R.</given-names> </name> <name> <surname>Balcells</surname> <given-names>M.</given-names> </name> </person-group>, <year>1995</year>, <source>MNRAS</source>, <volume>275</volume>, <fpage>874</fpage></mixed-citation> </ref>
+<ref id="bib3"> <mixed-citation publication-type="book" xlink:type="simple" xmlns:xlink="http://www.w3.org/1999/xlink"> <person-group person-group-type="author"> <name> <surname>Hubble</surname> <given-names>E.</given-names> </name> </person-group>, <year>1936</year>, <source>The Realm of the Nebulae</source>, <publisher-name>Yale Univ. Press</publisher-name>. <publisher-loc>New Haven, CT</publisher-loc></mixed-citation> </ref>
+<ref id="bib4"> <mixed-citation publication-type="other" xlink:type="simple" xmlns:xlink="http://www.w3.org/1999/xlink"> <person-group person-group-type="author"> <name> <surname>Prieto</surname> <given-names>M.</given-names> </name> <name> <surname>Aguerri</surname> <given-names>J. A. L.</given-names> </name> <name> <surname>Varela</surname> <given-names>A. M.</given-names> </name> <name> <surname>Munoz-Tun&oacute;n</surname> <given-names>C.</given-names> </name> </person-group>, <year>2000</year>, <source>A&amp;A, in press</source></mixed-citation> </ref>
+<ref id="bib5"><mixed-citation publication-type="book"><person-group person-group-type="author"><collab>Lightkurve Collaboration</collab></person-group><etal>et al</etal>., <year>2018</year>, <source>Lightkurve: Kepler and TESS time series analysis in Python, Astrophysics Source Code Library</source>. <comment>record (ascl:1812.013)</comment></mixed-citation></ref>
+<ref id="bib6"> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <string-name> <surname>Colberg</surname> &nbsp;<given-names>J. M.</given-names></string-name> &nbsp;<etal>et al.</etal></person-group>, <year>2008</year>, <source>MNRAS</source>, <volume>387</volume>, <fpage>933</fpage> &nbsp;<pub-id pub-id-type="doi">10.1111/j.1365-2966.2008.13307.x</pub-id></mixed-citation> </ref>
+<ref id="bib7"><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Schmidt</surname> <given-names>S. P.</given-names></string-name> <etal>et al.</etal></person-group>, <year>2025</year>, <comment>preprint</comment> (<pub-id pub-id-type="arxiv">arXiv:2501.18477</pub-id>)</mixed-citation></ref>