# Source: ADSReferencePipeline/adsrefpipe/refparsers/OUPFTxml.py at 2310e2e41a0a602e6ceced17f4800cffcedde7f9 (ehenneken/ADSReferencePipeline, GitHub)

import sys, os
import regex as re
import argparse
from typing import List, Dict

from adsputils import setup_logging, load_config
logger = setup_logging('refparsers')
config = {}
config.update(load_config())

from adsrefpipe.refparsers.reference import XMLreference, ReferenceError
from adsrefpipe.refparsers.toREFs import XMLtoREFs
from adsrefpipe.refparsers.unicode import tostr


class OUPFTreference(XMLreference):
    """
    This class handles parsing OUP references in XML format. It extracts citation information such as authors,
    year, journal, title, volume, pages, DOI, and eprint, and stores the parsed details.

    Examples from MNRAS:

    1. Common cases:
    <ref id="B22"> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <string-name name-style="western"> <surname>Abadi</surname> <given-names>M.</given-names> </string-name> <etal>et al</etal>.</person-group>, <year>2016</year>, <comment>preprint (<ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1603.04467" xmlns:xlink="http://www.w3.org/1999/xlink">arXiv:1603.04467</ext-link>)</comment></mixed-citation> </ref>
    <ref id="B2"> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <collab>Astropy Collaboration</collab> <etal>et al</etal>.</person-group>, <year>2013</year>, <source/>A&amp;A, <volume>558</volume>, <fpage>A33</fpage></mixed-citation> </ref>

    2. Rarer: lack of <person-group>
    <ref id="B44"> <mixed-citation publication-type="journal"> <collab>Planck Collaboration</collab>VI, <year>2018</year>, <comment>preprint (<ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1807.06209" xmlns:xlink="http://www.w3.org/1999/xlink">arXiv:1807.06209</ext-link>)</comment></mixed-citation> </ref>

    Note: unfortunately for PTEP we have volumes where the xlink namespace is not defined in each reference,
    so we have to add it ourselves to satisfy the XML parser. Here's an example:

    <ref id="B24"> <label>[24]</label> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <string-name name-style="western"> <surname>Audenaert</surname> <given-names>K.</given-names> </string-name>, <string-name name-style="western"><surname>Eisert</surname><given-names>J.</given-names></string-name>, <string-name name-style="western"><surname>Plenio</surname><given-names>M. B.</given-names></string-name>, and <string-name name-style="western"><surname>Werner</surname><given-names>R. F.</given-names></string-name></person-group>, <source/>Phys. Rev. A<volume>66</volume>, <fpage>042327</fpage> (<year>2002</year>) [<ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/quant-ph/0205025">arXiv:quant-ph/0205025</ext-link>] [<ext-link ext-link-type="uri" xlink:href="http://www.inspirehep.net/search?p=find+EPRINT+quant-ph/0205025">Search <sc>in</sc>SPIRE</ext-link>]. (<comment><ext-link ext-link-type="doi" xlink:href="http://doi.org/10.1103/PhysRevA.66.042327">http://dx.doi.org/10.1103/PhysRevA.66.042327</ext-link></comment>)</mixed-citation> </ref>
    """

    # to match the `__amp` placeholder (with an optional trailing semicolon)
    re_match_amp = re.compile(r'__amp;?')
    # to match and remove <etal> tags and their contents (case-insensitive)
    re_replace_etal = re.compile(r'<etal>.*</etal>', flags=re.IGNORECASE)
    # to match and remove unnecessary XML processing instructions
    re_replace_useless_tag = re.compile(r'(<\?[^\?>]*\?>)')
    # to match and remove a leading semicolon together with the whitespace around it
    re_replace_extra_space = re.compile(r'^\s*;\s*')
    # to match "ASP Conf. Ser. Vol. <number>" pattern
    re_ASPC = re.compile(r'ASP Conf[.] Ser[.] Vol[.] (\d+)')
    # to match "Astrophysics and Space Science Library, Vol. <number>" pattern (volume in group 1)
    # or "Vol. <number> of Astrophysics and Space Science Library" (volume in group 2)
    re_ASSL = re.compile(r'Astrophysics and Space Science Library, Vol[.] (\d+)|Vol[.] (\d+) of Astrophysics and Space Science Library')
    # to match any alphabetic characters in a year string
    re_char_in_year = re.compile(r'[A-Za-z]')
    # to match the words 'thesis' or 'dissertation' (case-insensitive)
    re_thesis = re.compile(r'(thesis|dissertation)', flags=re.IGNORECASE)

    def parse(self):
        """
        parse the OUPFT reference and extract citation information such as authors, year, title, and DOI

        `self.parsed` is set to 0 on entry and to 1 once every field has been stored

        :return:
        """
        self.parsed = 0

        refstr = self.dexml(self.reference_str.toxml())

        authors = self.parse_authors()
        year = self.xmlnode_nodecontents('year')
        if year:
            # drop letter suffixes such as the `a` in `2016a`
            year = self.re_char_in_year.sub('', year)

        title = self.xmlnode_nodecontents('article-title') or self.xmlnode_nodecontents('chapter-title') or self.xmlnode_nodecontents('bookTitle')

        comment = self.xmlnode_nodecontents('comment')

        volume = ''
        journal = self.xmlnode_nodecontents('source')
        if journal:
            journal = self.re_match_amp.sub('&', journal)
        if not journal:
            # no <source>: look for well-known series names in the plain reference text
            match = self.re_ASPC.search(refstr)
            if match:
                journal = 'ASPC'
                volume = match.group(1)
            else:
                match = self.re_ASSL.search(refstr)
                if match:
                    journal = 'ASSL'
                    # the pattern has two alternatives, each with its own capture group
                    volume = match.group(1) or match.group(2)
        if not journal:
            journal = self.xmlnode_nodecontents('conf-name')
            if not journal:
                # see if it is thesis
                if self.re_thesis.search(refstr):
                    journal = 'Thesis'

        if not volume:
            volume = self.xmlnode_nodecontents('volume').lower().replace('vol', '').strip()

        pages = self.xmlnode_nodecontents('fpage')
        series = self.xmlnode_nodecontents('series')

        citation_type = self.xmlnode_attribute('nlm-citation', 'citation-type') or self.xmlnode_attribute('citation', 'citation-type')
        if comment and citation_type in ['journal', 'confproc'] and not volume and not pages:
            # best effort: a two-token comment is taken to be "<volume> <pages>"
            try:
                volume, pages = comment.split()
            except ValueError:
                # comment does not consist of exactly two tokens, leave volume/pages empty
                pass

        # these fields are already formatted the way we expect them
        # NOTE(review): this block used to be assigned twice; the first pass (which went through
        # parse_volume, passed `letters` to parse_pages and ran journal.replace('amp', '&')) was always
        # overwritten by the values below, so only the effective assignments are kept. Confirm whether
        # the parse_volume/letters variant was the intended one.
        self['authors'] = authors
        self['year'] = year
        self['jrlstr'] = journal
        self['ttlstr'] = title
        self['volume'] = volume
        self['page'], self['qualifier'] = self.parse_pages(pages)
        self['pages'] = self.combine_page_qualifier(self['page'], self['qualifier'])
        self['series'] = series

        doi = self.parse_doi(refstr, comment)
        eprint = self.parse_eprint(refstr)

        if doi:
            self['doi'] = doi
        if eprint:
            self['eprint'] = eprint

        self['refstr'] = self.get_reference_str()
        if not self['refstr']:
            self['refplaintext'] = self.get_reference_plain_text(self.to_ascii(refstr))

        self.parsed = 1

    def _clean_name_part(self, name_part: str) -> str:
        """
        strip XML processing instructions and a leading semicolon from a surname/given-names fragment

        :param name_part: raw contents of a <surname> or <given-names> tag
        :return: the cleaned name fragment
        """
        return self.re_replace_extra_space.sub('', self.re_replace_useless_tag.sub('', tostr(name_part)))

    def parse_authors(self) -> str:
        """
        parse the authors from the reference string and format them accordingly

        authors are read from <person-group person-group-type="author">, falling back to <name> and then
        <string-name>; any <collab> entries are placed in front of the individual authors

        :return: a formatted string of authors
        """
        authors = self.xmlnode_nodescontents('person-group', attrs={'person-group-type': 'author'}, keepxml=1) or \
                  self.xmlnode_nodescontents('name', keepxml=1) or \
                  self.xmlnode_nodescontents('string-name', keepxml=1)

        collab = self.xmlnode_nodescontents('collab')

        author_list = []
        # `or []` guards against a falsy non-list result from the lookups above
        for author in authors or []:
            an_author = ''
            # some of name tags include junk xml tags, remove them
            # <person-group person-group-type='author'><name name-style='western'><surname><?A3B2 twb 0.2w?><?A3B2 tlsb -0.01w?>Cunningham</surname>
            author, lastname = self.extract_tag(author, 'surname')
            author, givennames = self.extract_tag(author, 'given-names')
            if lastname:
                an_author = self._clean_name_part(lastname)
            if an_author and givennames:
                an_author += ', ' + self._clean_name_part(givennames)
            if an_author:
                author_list.append(an_author)
            else:
                # when there is no tag (ie, <person-group person-group-type='author'>Schultheis M.<etal>et al</etal>.)
                author_list.append(self.re_replace_etal.sub(' et. al', author))

        if collab:
            author_list = collab + author_list

        authors = ", ".join(author_list)
        authors = self.re_match_amp.sub('', authors)

        return authors

    def parse_doi(self, refstr: str, comment: str) -> str:
        """
        parse the DOI from the reference string or comment field, falling back to extracting it from the refstr

        attempts to extract a DOI from different sources: first, from the 'pub-id' XML node content; if not found,
        it checks the comment field; if neither contains the DOI, it tries to extract it from the reference string.

        :param refstr: the reference string potentially containing the DOI
        :param comment: a comment related to the reference that may contain the DOI
        :return: the extracted DOI if found, or an empty string if not
        """
        # candidates in priority order; match_doi is only called until the first hit
        candidates = (
            self.xmlnode_nodecontents('pub-id', attrs={'pub-id-type': 'doi'}),
            comment,
            refstr,
        )
        for candidate in candidates:
            doi = self.match_doi(candidate)
            if doi:
                return doi
        return ''

    def parse_eprint(self, refstr: str) -> str:
        """
        parse the eprint from the reference string

        attempts to extract the eprint from the 'pub-id' and 'elocation-id' XML nodes,
        then tries to extract it from the reference string if not found in the XML nodes

        :param refstr: the reference string potentially containing the eprint
        :return: the extracted eprint prefixed with `arXiv:` if found, or an empty string if not
        """
        # note that the id might have been identified incorrectly, hence verify it
        # <pub-id pub-id-type="arxiv">arXiv:10.1029/2001JB000553</pub-id>
        eprint = self.match_arxiv_id(self.xmlnode_nodecontents('pub-id', attrs={'pub-id-type': 'arxiv'}))
        if eprint:
            return f"arXiv:{eprint}"
        # <elocation-id content-type="arxiv">arXiv:1309.6955</elocation-id>
        eprint = self.match_arxiv_id(self.xmlnode_nodecontents('elocation-id', attrs={'content-type': 'arxiv'}))
        if eprint:
            return f"arXiv:{eprint}"
        # attempt to extract it from refstr
        eprint = self.match_arxiv_id(refstr)
        if eprint:
            return f"arXiv:{eprint}"
        return ''


class OUPFTtoREFs(XMLtoREFs):
    """
    This class converts OUP XML references to a standardized reference format. It processes raw OUP references from
    either a file or a buffer and outputs parsed references, including bibcodes, authors, volume, pages, and DOI.
    """

    # to clean up XML blocks by removing certain tags
    block_cleanup = [
        (re.compile(r'</?ext-link.*?>'), ''),
        (re.compile(r'</?uri.*?>'), ''),
        (re.compile(r'<etal\s*/>'), '<surname>et al.</surname>'),
    ]
    # to clean up references by replacing certain patterns
    reference_cleanup = [
        (re.compile(r'</?(ext-link|x).*?>'), ''),
        (re.compile(r'\sxlink:type="simple"'), ''),
        (re.compile(r'\s+xlink:href='), ' href='),
        (re.compile(r'<inline-formula>.*?</inline-formula>'), ''),
        (re.compile(r'\s+xlink:type='), ' type='),
        (re.compile(r'</?x.*?>'), ''),
    ]

    # to match <person-group> tags and their contents
    re_author_tag = re.compile(r'(<person-group.*</person-group>)')
    # to match author placeholder represented by three or more hyphens
    re_author_placeholder = re.compile(r'(-{3,})')

    def __init__(self, filename: str, buffer: str):
        """
        initialize the OUPFTtoREFs object to process OUP references

        :param filename: the path to the source file
        :param buffer: the XML references as a buffer
        """
        XMLtoREFs.__init__(self, filename, buffer, parsername=OUPFTtoREFs, tag='ref', cleanup=self.block_cleanup, encoding='ISO-8859-1')

    def cleanup(self, reference: str) -> str:
        """
        clean up the reference string by applying every (pattern, replacement) pair of `reference_cleanup` in order

        :param reference: the raw reference string to clean
        :return: cleaned reference string
        """
        for (compiled_re, replace_str) in self.reference_cleanup:
            reference = compiled_re.sub(replace_str, reference)
        return reference

    def missing_authors(self, prev_reference: str, cur_reference: str) -> str:
        """
        replace author placeholder in the current reference with authors from the previous reference

        :param prev_reference: the previous reference containing the author information
        :param cur_reference: the current reference containing the author placeholder
        :return: the current reference with the author placeholder replaced, or the original current reference if no placeholder is found
        """
        if prev_reference and self.re_author_placeholder.search(cur_reference):
            match = self.re_author_tag.search(prev_reference)
            if match:
                author_xml = match.group(0)
                # use a callable so the author XML is inserted literally; a plain replacement string
                # is treated as a template and any backslash in it would be read as an escape/group reference
                return self.re_author_placeholder.sub(lambda _placeholder: author_xml, cur_reference)
        return cur_reference

    def process_and_dispatch(self) -> List[Dict[str, List[Dict[str, str]]]]:
        """
        perform reference cleaning and parsing, then dispatch the parsed references

        references that raise ReferenceError while parsing are logged and left out of the result

        :return: a list of dictionaries containing bibcodes and parsed references
        """
        references = []
        for raw_block_references in self.raw_references:
            bibcode = raw_block_references['bibcode']
            block_references = raw_block_references['block_references']
            item_nums = raw_block_references.get('item_nums', [])

            parsed_references = []
            prev_reference = ''
            for i, raw_reference in enumerate(block_references):
                reference = self.cleanup(raw_reference)
                reference = self.missing_authors(prev_reference, reference)
                # remember the (author-completed) reference so the next one can borrow its authors
                prev_reference = reference

                logger.debug("OUPFTxml: parsing %s", reference)
                try:
                    oup_reference = OUPFTreference(reference)
                    parsed_references.append(self.merge({**oup_reference.get_parsed_reference(), 'refraw': raw_reference}, self.any_item_num(item_nums, i)))
                except ReferenceError as error_desc:
                    logger.error("OUPFTxml: error parsing reference: %s", error_desc)

            references.append({'bibcode': bibcode, 'references': parsed_references})
            # report the number of references parsed for this bibcode, not the number of blocks so far
            logger.debug("%s: parsed %d references", bibcode, len(parsed_references))

        return references


# Command-line entry point for manually checking the OUPFT reference parser.
# References can be supplied through a file (-f) or directly as an XML buffer (-b);
# the parsed result is printed. With neither option, the bundled stub file
# test.oupft.xml is parsed and compared against the expected parsed_oup data,
# and a pass/fail line is printed.
# NOTE(review): the stub data import below runs on every import of this module,
# not only when executed as a script - confirm whether that is intended.
from adsrefpipe.tests.unittests.stubdata import parsed_references
if __name__ == '__main__':  # pragma: no cover
    arg_parser = argparse.ArgumentParser(description='Parse OUPFT references')
    arg_parser.add_argument('-f', '--filename', help='the path to source file')
    arg_parser.add_argument('-b', '--buffer', help='xml reference(s)')
    cli_args = arg_parser.parse_args()

    if cli_args.filename:
        print(OUPFTtoREFs(filename=cli_args.filename, buffer=None).process_and_dispatch())
    elif cli_args.buffer:
        print(OUPFTtoREFs(buffer=cli_args.buffer, filename=None).process_and_dispatch())
    else:
        # no reference source was provided, so run the source test file
        stub_path = os.path.abspath(os.path.dirname(__file__) + '/../tests/unittests/stubdata/test.oupft.xml')
        parsed = OUPFTtoREFs(filename=stub_path, buffer=None).process_and_dispatch()
        print('Test passed!' if parsed == parsed_references.parsed_oup else 'Test failed!')
    sys.exit(0)