|
| 1 | + |
| 2 | +import sys, os |
| 3 | +import regex as re |
| 4 | +import argparse |
| 5 | +from typing import List, Dict |
| 6 | + |
| 7 | +from adsputils import setup_logging, load_config |
| 8 | +logger = setup_logging('refparsers') |
| 9 | +config = {} |
| 10 | +config.update(load_config()) |
| 11 | + |
| 12 | +from adsrefpipe.refparsers.reference import XMLreference, ReferenceError |
| 13 | +from adsrefpipe.refparsers.toREFs import XMLtoREFs |
| 14 | +from adsrefpipe.refparsers.unicode import tostr |
| 15 | + |
| 16 | + |
class OUPFTreference(XMLreference):
    """
    This class handles parsing OUP references in XML format. It extracts citation information such as authors,
    year, journal, title, volume, pages, DOI, and eprint, and stores the parsed details.

    Examples from MNRAS:

    1. Common cases:
    <ref id="B22"> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <string-name name-style="western"> <surname>Abadi</surname> <given-names>M.</given-names> </string-name> <etal>et al</etal>.</person-group>, <year>2016</year>, <comment>preprint (<ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1603.04467" xmlns:xlink="http://www.w3.org/1999/xlink">arXiv:1603.04467</ext-link>)</comment></mixed-citation> </ref>
    <ref id="B2"> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <collab>Astropy Collaboration</collab> <etal>et al</etal>.</person-group>, <year>2013</year>, <source/>A&A, <volume>558</volume>, <fpage>A33</fpage></mixed-citation> </ref>

    2. Rarer: lack of <person-group>
    <ref id="B44"> <mixed-citation publication-type="journal"> <collab>Planck Collaboration</collab>VI, <year>2018</year>, <comment>preprint (<ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1807.06209" xmlns:xlink="http://www.w3.org/1999/xlink">arXiv:1807.06209</ext-link>)</comment></mixed-citation> </ref>

    Note: unfortunately for PTEP we have volumes where the xlink namespace is not defined in each reference,
    so we have to add it ourselves to satisfy the XML parser. Here's an example:

    <ref id="B24"> <label>[24]</label> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <string-name name-style="western"> <surname>Audenaert</surname> <given-names>K.</given-names> </string-name>, <string-name name-style="western"><surname>Eisert</surname><given-names>J.</given-names></string-name>, <string-name name-style="western"><surname>Plenio</surname><given-names>M. B.</given-names></string-name>, and <string-name name-style="western"><surname>Werner</surname><given-names>R. F.</given-names></string-name></person-group>, <source/>Phys. Rev. A<volume>66</volume>, <fpage>042327</fpage> (<year>2002</year>) [<ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/quant-ph/0205025">arXiv:quant-ph/0205025</ext-link>] [<ext-link ext-link-type="uri" xlink:href="http://www.inspirehep.net/search?p=find+EPRINT+quant-ph/0205025">Search <sc>in</sc>SPIRE</ext-link>]. (<comment><ext-link ext-link-type="doi" xlink:href="http://doi.org/10.1103/PhysRevA.66.042327">http://dx.doi.org/10.1103/PhysRevA.66.042327</ext-link></comment>)</mixed-citation> </ref>
    """

    # to match `amp`
    re_match_amp = re.compile(r'__amp;?')
    # to match and remove <etal> tags and their contents (case-insensitive)
    re_replace_etal = re.compile(r'<etal>.*</etal>', flags=re.IGNORECASE)
    # to match and remove unnecessary XML processing instructions
    re_replace_useless_tag = re.compile(r'(<\?[^\?>]*\?>)')
    # to match and remove a leading semicolon together with the whitespace around it
    re_replace_extra_space = re.compile(r'^\s*;\s*')
    # to match "ASP Conf. Ser. Vol. <number>" pattern
    # (raw string: `\d` in a plain string literal is an invalid escape sequence)
    re_ASPC = re.compile(r'ASP Conf[.] Ser[.] Vol[.] (\d+)')
    # to match "Astrophysics and Space Science Library, Vol. <number>" pattern, in either word order
    re_ASSL = re.compile(r'Astrophysics and Space Science Library, Vol[.] (\d+)|Vol[.] (\d+) of Astrophysics and Space Science Library')
    # to match any alphabetic characters in a year string
    re_char_in_year = re.compile(r'[A-Za-z]')
    # to match the words 'thesis' or 'dissertation' (case-insensitive)
    re_thesis = re.compile(r'(thesis|dissertation)', flags=re.IGNORECASE)

    def parse(self):
        """
        parse the OUPFT reference and extract citation information such as authors, year, title, and DOI

        the extracted fields are stored on this object under the keys 'authors', 'year', 'jrlstr', 'ttlstr',
        'volume', 'page', 'qualifier', 'pages', 'series', 'refstr' and, when found, 'doi' and 'eprint';
        `self.parsed` is 0 while parsing and set to 1 once every field has been stored

        :return:
        """
        self.parsed = 0

        refstr = self.dexml(self.reference_str.toxml())

        authors = self.parse_authors()
        year = self.xmlnode_nodecontents('year')
        if year:
            # drop letters from the year string (e.g. the suffix in '2016a')
            year = self.re_char_in_year.sub('', year)

        title = self.xmlnode_nodecontents('article-title') or self.xmlnode_nodecontents('chapter-title') or self.xmlnode_nodecontents('bookTitle')

        comment = self.xmlnode_nodecontents('comment')

        # volume recovered from an ASPC/ASSL series pattern in the reference text;
        # used further down only when the reference carries no <volume> tag
        series_volume = None

        journal = self.xmlnode_nodecontents('source')
        if journal:
            journal = self.re_match_amp.sub('&', journal)
        if not journal:
            match = self.re_ASPC.search(refstr)
            if match:
                journal = 'ASPC'
                series_volume = match.group(1)
            else:
                match = self.re_ASSL.search(refstr)
                if match:
                    journal = 'ASSL'
                    # the number is in group 1 or group 2 depending on which alternative matched
                    series_volume = match.group(1) or match.group(2) or ''
        if not journal:
            journal = self.xmlnode_nodecontents('conf-name')
        if not journal and self.re_thesis.search(refstr):
            # see if it is thesis
            journal = 'Thesis'

        volume = self.xmlnode_nodecontents('volume')
        if not volume and series_volume:
            # keep the series volume instead of discarding it when there is no explicit <volume> tag
            volume = series_volume
        pages = self.xmlnode_nodecontents('fpage')
        series = self.xmlnode_nodecontents('series')

        cittype = self.xmlnode_attribute('nlm-citation', 'citation-type') or self.xmlnode_attribute('citation', 'citation-type') or self.xmlnode_attribute('mixed-citation', 'publication-type')
        if comment and cittype in ['journal', 'confproc'] and not volume and not pages:
            # best effort: a comment made of exactly two whitespace-separated tokens is read as "<volume> <pages>"
            try:
                volume, pages = comment.split()
            except ValueError:
                # the comment does not split into exactly two tokens, leave volume and pages as they are
                pass

        # these fields are already formatted the way we expect them
        self['authors'] = authors
        self['year'] = year
        self['jrlstr'] = journal
        self['ttlstr'] = title
        self['volume'] = self.parse_volume(volume)
        self['page'], self['qualifier'] = self.parse_pages(pages, letters="ABCDEFGHIJKLMNOPQRSTUVWXYZ")
        self['pages'] = self.combine_page_qualifier(self['page'], self['qualifier'])
        self['series'] = series

        doi = self.parse_doi(refstr, comment)
        eprint = self.parse_eprint(refstr)

        if doi:
            self['doi'] = doi
        if eprint:
            self['eprint'] = eprint

        self['refstr'] = self.get_reference_str()
        if not self['refstr']:
            # no reference string could be built from the parsed fields, fall back to the plain text
            self['refplaintext'] = self.get_reference_plain_text(self.to_ascii(refstr))

        self.parsed = 1

    def parse_authors(self) -> str:
        """
        parse the authors from the reference string and format them accordingly

        author nodes are looked up in order of preference: <person-group person-group-type="author">,
        then <name>, then <string-name>; any <collab> entries are placed in front of the individual authors

        :return: a formatted string of authors
        """
        authors = self.xmlnode_nodescontents('person-group', attrs={'person-group-type': 'author'}, keepxml=1) or \
                  self.xmlnode_nodescontents('name', keepxml=1) or \
                  self.xmlnode_nodescontents('string-name', keepxml=1)

        collab = self.xmlnode_nodescontents('collab')

        author_list = []
        for author in authors:
            an_author = ''
            # some of name tags include junk xml tags, remove them
            # <person-group person-group-type='author'><name name-style='western'><surname><?A3B2 twb 0.2w?><?A3B2 tlsb -0.01w?>Cunningham</surname>
            author, lastname = self.extract_tag(author, 'surname')
            author, givennames = self.extract_tag(author, 'given-names')
            if lastname:
                an_author = self.re_replace_extra_space.sub('', self.re_replace_useless_tag.sub('', tostr(lastname)))
            if an_author and givennames:
                an_author += ', ' + self.re_replace_extra_space.sub('', self.re_replace_useless_tag.sub('', tostr(givennames)))
            if an_author:
                author_list.append(an_author)
            else:
                # when there is no tag (ie, <person-group person-group-type='author'>Schultheis M.<etal>et al</etal>.)
                author_list.append(self.re_replace_etal.sub(' et. al', author))

        if collab:
            author_list = collab + author_list

        authors = ", ".join(author_list)
        # strip any leftover `__amp` placeholder from the joined author string
        authors = self.re_match_amp.sub('', authors)

        return authors

    def parse_doi(self, refstr: str, comment: str) -> str:
        """
        parse the DOI from the reference string or comment field, falling back to extracting it from the refstr

        attempts to extract a DOI from different sources: first, from the 'pub-id' XML node content; if not found,
        it checks the comment field; if neither contains the DOI, it tries to extract it from the reference string.

        :param refstr: the reference string potentially containing the DOI
        :param comment: a comment related to the reference that may contain the DOI
        :return: the extracted DOI if found, or an empty string if not
        """
        doi = self.match_doi(self.xmlnode_nodecontents('pub-id', attrs={'pub-id-type': 'doi'}))
        if doi:
            return doi
        # see if there is a doi in the comment field
        doi = self.match_doi(comment)
        if doi:
            return doi
        # attempt to extract it from refstr
        doi = self.match_doi(refstr)
        if doi:
            return doi
        return ''

    def parse_eprint(self, refstr: str) -> str:
        """
        parse the eprint from the reference string

        attempts to extract the eprint from the 'pub-id' and 'elocation-id' XML nodes,
        then tries to extract it from the reference string if not found in the XML nodes

        :param refstr: the reference string potentially containing the eprint
        :return: the extracted eprint prefixed with 'arXiv:' if found, or an empty string if not
        """
        # note that the id might have been identified incorrectly, hence verify it
        # <pub-id pub-id-type="arxiv">arXiv:10.1029/2001JB000553</pub-id>
        eprint = self.match_arxiv_id(self.xmlnode_nodecontents('pub-id', attrs={'pub-id-type': 'arxiv'}))
        if eprint:
            return f"arXiv:{eprint}"
        # <elocation-id content-type="arxiv">arXiv:1309.6955</elocation-id>
        eprint = self.match_arxiv_id(self.xmlnode_nodecontents('elocation-id', attrs={'content-type': 'arxiv'}))
        if eprint:
            return f"arXiv:{eprint}"
        # attempt to extract it from refstr
        eprint = self.match_arxiv_id(refstr)
        if eprint:
            return f"arXiv:{eprint}"
        return ''
| 212 | + |
| 213 | + |
class OUPFTtoREFs(XMLtoREFs):
    """
    This class converts OUP XML references to a standardized reference format. It processes raw OUP references from
    either a file or a buffer and outputs parsed references, including bibcodes, authors, volume, pages, and DOI.
    """

    # to clean up XML blocks by removing certain tags
    block_cleanup = [
        (re.compile(r'</?ext-link.*?>'), ''),
        (re.compile(r'</?uri.*?>'), ''),
        (re.compile(r'<etal\s*/>'), '<surname>et al.</surname>'),
    ]
    # to clean up references by replacing certain patterns
    reference_cleanup = [
        (re.compile(r'</?(ext-link|x).*?>'), ''),
        (re.compile(r'\sxlink:type="simple"'), ''),
        (re.compile(r'\s+xlink:href='), ' href='),
        (re.compile(r'<inline-formula>.*?</inline-formula>'), ''),
        (re.compile(r'\s+xlink:type='), ' type='),
        (re.compile(r'</?x.*?>'), ''),
    ]

    # to match <person-group> tags and their contents
    re_author_tag = re.compile(r'(<person-group.*</person-group>)')
    # to match author placeholder represented by three or more hyphens
    re_author_placeholder = re.compile(r'(-{3,})')

    def __init__(self, filename: str, buffer: str):
        """
        initialize the OUPFTtoREFs object to process OUP references

        :param filename: the path to the source file
        :param buffer: the XML references as a buffer
        """
        XMLtoREFs.__init__(self, filename, buffer, parsername=OUPFTtoREFs, tag='ref', cleanup=self.block_cleanup, encoding='ISO-8859-1')

    def cleanup(self, reference: str) -> str:
        """
        clean up the reference string by replacing specific patterns

        the (pattern, replacement) pairs of `reference_cleanup` are applied one after the other, in list order

        :param reference: the raw reference string to clean
        :return: cleaned reference string
        """
        for (compiled_re, replace_str) in self.reference_cleanup:
            reference = compiled_re.sub(replace_str, reference)
        return reference

    def missing_authors(self, prev_reference: str, cur_reference: str) -> str:
        """
        replace author placeholder in the current reference with authors from the previous reference

        :param prev_reference: the previous reference containing the author information
        :param cur_reference: the current reference containing the author placeholder
        :return: the current reference with the author placeholder replaced, or the original current reference
                 if there is no placeholder or the previous reference has no <person-group> block
        """
        if prev_reference and self.re_author_placeholder.search(cur_reference):
            match = self.re_author_tag.search(prev_reference)
            if match:
                author_block = match.group(0)
                # pass a callable rather than the string itself: a replacement string is treated as a
                # template, so a backslash in the author XML would be interpreted as an escape or raise
                return self.re_author_placeholder.sub(lambda _: author_block, cur_reference)
        return cur_reference

    def process_and_dispatch(self) -> List[Dict[str, List[Dict[str, str]]]]:
        """
        perform reference cleaning and parsing, then dispatch the parsed references

        references that raise ReferenceError while being parsed are logged and left out of the result

        :return: a list of dictionaries containing bibcodes and parsed references
        """
        references = []
        for raw_block_references in self.raw_references:
            bibcode = raw_block_references['bibcode']
            block_references = raw_block_references['block_references']
            item_nums = raw_block_references.get('item_nums', [])

            parsed_references = []
            prev_reference = ''
            for i, raw_reference in enumerate(block_references):
                reference = self.cleanup(raw_reference)
                reference = self.missing_authors(prev_reference, reference)
                # remember the reference with its authors filled in, so a run of placeholders keeps resolving
                prev_reference = reference

                logger.debug("OUPxml: parsing %s", reference)
                try:
                    oup_reference = OUPFTreference(reference)
                    parsed_references.append(self.merge({**oup_reference.get_parsed_reference(), 'refraw': raw_reference}, self.any_item_num(item_nums, i)))
                except ReferenceError as error_desc:
                    logger.error("OUPFTxml: error parsing reference: %s", error_desc)

            references.append({'bibcode': bibcode, 'references': parsed_references})
            logger.debug("%s: parsed %d references out of %d found references", bibcode, len(parsed_references), len(block_references))

        return references
| 305 | + |
| 306 | + |
# Manual test and verification driver for OUPFT xml references.
# References are parsed from a file (-f) or from a buffer (-b). When neither is given,
# the bundled stub test file is parsed and the outcome is compared against the expected
# parsed results, printing whether the comparison passed or failed.
if __name__ == '__main__':  # pragma: no cover
    from adsrefpipe.tests.unittests.stubdata import parsed_references

    arg_parser = argparse.ArgumentParser(description='Parse OUPFT references')
    arg_parser.add_argument('-f', '--filename', help='the path to source file')
    arg_parser.add_argument('-b', '--buffer', help='xml reference(s)')
    cli_args = arg_parser.parse_args()

    if cli_args.filename:
        print(OUPFTtoREFs(filename=cli_args.filename, buffer=None).process_and_dispatch())
    elif cli_args.buffer:
        print(OUPFTtoREFs(buffer=cli_args.buffer, filename=None).process_and_dispatch())
    else:
        # no reference source was provided, so just run the source test file
        stub_path = os.path.abspath(os.path.dirname(__file__) + '/../tests/unittests/stubdata/test.oupft.xml')
        outcome = OUPFTtoREFs(filename=stub_path, buffer=None).process_and_dispatch()
        print('Test passed!' if outcome == parsed_references.parsed_oup else 'Test failed!')
    sys.exit(0)
0 commit comments