Skip to content

Commit 2310e2e

Browse files
committed
OUP parser for references extracted from fulltext
1 parent c1c2363 commit 2310e2e

File tree

4 files changed

+377
-0
lines changed

4 files changed

+377
-0
lines changed

adsrefpipe/refparsers/OUPFTxml.py

Lines changed: 340 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,340 @@
1+
2+
import sys, os
3+
import regex as re
4+
import argparse
5+
from typing import List, Dict
6+
7+
from adsputils import setup_logging, load_config
8+
logger = setup_logging('refparsers')
9+
config = {}
10+
config.update(load_config())
11+
12+
from adsrefpipe.refparsers.reference import XMLreference, ReferenceError
13+
from adsrefpipe.refparsers.toREFs import XMLtoREFs
14+
from adsrefpipe.refparsers.unicode import tostr
15+
16+
17+
class OUPFTreference(XMLreference):
18+
"""
19+
This class handles parsing OUP references in XML format. It extracts citation information such as authors,
20+
year, journal, title, volume, pages, DOI, and eprint, and stores the parsed details.
21+
22+
Examples from MNRAS:
23+
24+
1. Common cases:
25+
<ref id="B22"> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <string-name name-style="western"> <surname>Abadi</surname> <given-names>M.</given-names> </string-name> <etal>et al</etal>.</person-group>, <year>2016</year>, <comment>preprint (<ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1603.04467" xmlns:xlink="http://www.w3.org/1999/xlink">arXiv:1603.04467</ext-link>)</comment></mixed-citation> </ref>
26+
<ref id="B2"> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <collab>Astropy Collaboration</collab> <etal>et al</etal>.</person-group>, <year>2013</year>, <source/>A&amp;A, <volume>558</volume>, <fpage>A33</fpage></mixed-citation> </ref>
27+
28+
2. Rarer: lack of <person-group>
29+
<ref id="B44"> <mixed-citation publication-type="journal"> <collab>Planck Collaboration</collab>VI, <year>2018</year>, <comment>preprint (<ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1807.06209" xmlns:xlink="http://www.w3.org/1999/xlink">arXiv:1807.06209</ext-link>)</comment></mixed-citation> </ref>
30+
31+
Note: unfortunately for PTEP we have volumes where the xlink namespace is not defined in each reference,
32+
so we have to add it ourselves to satisfy the XML parser. Here's an example:
33+
34+
<ref id="B24"> <label>[24]</label> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <string-name name-style="western"> <surname>Audenaert</surname> <given-names>K.</given-names> </string-name>, <string-name name-style="western"><surname>Eisert</surname><given-names>J.</given-names></string-name>, <string-name name-style="western"><surname>Plenio</surname><given-names>M. B.</given-names></string-name>, and <string-name name-style="western"><surname>Werner</surname><given-names>R. F.</given-names></string-name></person-group>, <source/>Phys. Rev. A<volume>66</volume>, <fpage>042327</fpage> (<year>2002</year>) [<ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/quant-ph/0205025">arXiv:quant-ph/0205025</ext-link>] [<ext-link ext-link-type="uri" xlink:href="http://www.inspirehep.net/search?p=find+EPRINT+quant-ph/0205025">Search <sc>in</sc>SPIRE</ext-link>]. (<comment><ext-link ext-link-type="doi" xlink:href="http://doi.org/10.1103/PhysRevA.66.042327">http://dx.doi.org/10.1103/PhysRevA.66.042327</ext-link></comment>)</mixed-citation> </ref>
35+
"""
36+
37+
# to match the `__amp` / `__amp;` marker
re_match_amp = re.compile(r'__amp;?')
# to match each <etal>...</etal> element together with its contents (case-insensitive);
# non-greedy, so two <etal> elements never swallow the text that sits between them
re_replace_etal = re.compile(r'<etal>.*?</etal>', flags=re.IGNORECASE)
# to match XML processing instructions (ie, <?A3B2 twb 0.2w?>) so that they can be removed
re_replace_useless_tag = re.compile(r'(<\?[^\?>]*\?>)')
# to match a leading semicolon together with the whitespace around it
re_replace_extra_space = re.compile(r'^\s*;\s*')
# to match "ASP Conf. Ser. Vol. <number>" pattern, the volume number is captured in group 1
re_ASPC = re.compile(r'ASP Conf[.] Ser[.] Vol[.] (\d+)')
# to match "Astrophysics and Space Science Library, Vol. <number>" (volume number in group 1)
# or "Vol. <number> of Astrophysics and Space Science Library" (volume number in group 2)
re_ASSL = re.compile(r'Astrophysics and Space Science Library, Vol[.] (\d+)|Vol[.] (\d+) of Astrophysics and Space Science Library')
# to match any alphabetic characters in a year string
re_char_in_year = re.compile(r'[A-Za-z]')
# to match the words 'thesis' or 'dissertation' (case-insensitive)
re_thesis = re.compile(r'(thesis|dissertation)', flags=re.IGNORECASE)
53+
54+
def parse(self):
    """
    parse the OUPFT reference and extract citation information such as authors, year, title, and DOI

    the extracted values are stored on this object under the keys `authors`, `year`, `jrlstr`, `ttlstr`,
    `volume`, `page`, `qualifier`, `pages` and `series`, plus `doi` and `eprint` when they are found;
    `self.parsed` is 0 while parsing is in progress and is set to 1 once parsing has completed

    :return:
    """
    self.parsed = 0

    refstr = self.dexml(self.reference_str.toxml())

    authors = self.parse_authors()
    year = self.xmlnode_nodecontents('year')
    if year:
        # drop any letter attached to the year (ie, the `a` of `2001a`)
        year = self.re_char_in_year.sub('', year)

    title = self.xmlnode_nodecontents('article-title') or self.xmlnode_nodecontents('chapter-title') or self.xmlnode_nodecontents('bookTitle')

    comment = self.xmlnode_nodecontents('comment')

    volume = ''
    journal = self.xmlnode_nodecontents('source')
    if journal:
        journal = self.re_match_amp.sub('&', journal)
    if not journal:
        # no <source>, look for a known book series in the text of the reference
        match = self.re_ASPC.search(refstr)
        if match:
            journal = 'ASPC'
            volume = match.group(1)
        else:
            match = self.re_ASSL.search(refstr)
            if match:
                journal = 'ASSL'
                # the volume number is in group 1 or in group 2, depending on which wording matched
                volume = match.group(1) or match.group(2)
    if not journal:
        journal = self.xmlnode_nodecontents('conf-name')
    if not journal and self.re_thesis.search(refstr):
        # see if it is thesis
        journal = 'Thesis'

    if not volume:
        volume = self.xmlnode_nodecontents('volume').lower().replace('vol', '').strip()

    pages = self.xmlnode_nodecontents('fpage')
    series = self.xmlnode_nodecontents('series')

    citation_type = self.xmlnode_attribute('nlm-citation', 'citation-type') or self.xmlnode_attribute('citation', 'citation-type')
    if comment and citation_type in ['journal', 'confproc'] and not volume and not pages:
        # a comment made of exactly two tokens is taken to be "<volume> <pages>",
        # a comment of any other shape is left alone
        tokens = comment.split()
        if len(tokens) == 2:
            volume, pages = tokens

    # these fields are stored once, `journal` has already had its ampersand marker restored above,
    # so no further substitution is applied to it (a blanket replace of `amp` would damage
    # any name that merely contains those letters)
    self['authors'] = authors
    self['year'] = year
    self['jrlstr'] = journal
    self['ttlstr'] = title
    self['volume'] = self.parse_volume(volume)
    # NOTE(review): every capital letter is passed as an allowed page letter, presumably for
    # page ids such as `A33` -- confirm against parse_pages
    self['page'], self['qualifier'] = self.parse_pages(pages, letters="ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    self['pages'] = self.combine_page_qualifier(self['page'], self['qualifier'])
    self['series'] = series

    doi = self.parse_doi(refstr, comment)
    eprint = self.parse_eprint(refstr)

    if doi:
        self['doi'] = doi
    if eprint:
        self['eprint'] = eprint

    self['refstr'] = self.get_reference_str()
    if not self['refstr']:
        # fall back to the plain text of the reference when no reference string could be built
        self['refplaintext'] = self.get_reference_plain_text(self.to_ascii(refstr))

    self.parsed = 1
139+
140+
def parse_authors(self) -> str:
    """
    collect the author names of the reference and join them into one string

    author nodes are looked up as `person-group` (of type author) first, then `name`, then `string-name`;
    the contents of any `collab` nodes are placed in front of the individual authors

    :return: a formatted string of authors, separated by commas
    """
    author_nodes = self.xmlnode_nodescontents('person-group', attrs={'person-group-type': 'author'}, keepxml=1)
    if not author_nodes:
        author_nodes = self.xmlnode_nodescontents('name', keepxml=1)
    if not author_nodes:
        author_nodes = self.xmlnode_nodescontents('string-name', keepxml=1)

    collaborations = self.xmlnode_nodescontents('collab')

    def tidy(fragment):
        # some of name tags include junk xml tags, remove them, then drop a leading semicolon
        # <person-group person-group-type='author'><name name-style='western'><surname><?A3B2 twb 0.2w?><?A3B2 tlsb -0.01w?>Cunningham</surname>
        without_junk = self.re_replace_useless_tag.sub('', tostr(fragment))
        return self.re_replace_extra_space.sub('', without_junk)

    names = []
    for node in author_nodes:
        node, surname = self.extract_tag(node, 'surname')
        node, given = self.extract_tag(node, 'given-names')

        formatted = tidy(surname) if surname else ''
        if formatted and given:
            formatted = formatted + ', ' + tidy(given)
        if not formatted:
            # when there is no tag (ie, <person-group person-group-type='author'>Schultheis M.<etal>et al</etal>.)
            formatted = self.re_replace_etal.sub(' et. al', node)
        names.append(formatted)

    if collaborations:
        names = collaborations + names

    joined = ", ".join(names)
    return self.re_match_amp.sub('', joined)
174+
175+
def parse_doi(self, refstr: str, comment: str) -> str:
    """
    find the DOI of the reference, trying three sources in order

    the contents of the `pub-id` node of type doi is checked first, then the comment field,
    and finally the reference string itself; the first source that yields a DOI wins

    :param refstr: the reference string potentially containing the DOI
    :param comment: a comment related to the reference that may contain the DOI
    :return: the extracted DOI if found, or an empty string if not
    """
    tagged = self.xmlnode_nodecontents('pub-id', attrs={'pub-id-type': 'doi'})
    # `or` short-circuits, so the comment and then refstr are only examined when needed
    return self.match_doi(tagged) or self.match_doi(comment) or self.match_doi(refstr) or ''
198+
199+
def parse_eprint(self, refstr: str) -> str:
    """
    find the arXiv id of the reference, trying three sources in order

    the `pub-id` node of type arxiv is checked first, then the `elocation-id` node with content type arxiv,
    and finally the reference string itself; each source is only read when the ones before it gave nothing

    :param refstr: the reference string potentially containing the eprint
    :return: the extracted eprint, prefixed with `arXiv:`, if found, or an empty string if not
    """
    def sources():
        # note that the id might have been identified incorrectly, hence every candidate is verified
        # <pub-id pub-id-type="arxiv">arXiv:10.1029/2001JB000553</pub-id>
        yield self.xmlnode_nodecontents('pub-id', attrs={'pub-id-type': 'arxiv'})
        # <elocation-id content-type="arxiv">arXiv:1309.6955</elocation-id>
        yield self.xmlnode_nodecontents('elocation-id', attrs={'content-type': 'arxiv'})
        # attempt to extract it from refstr
        yield refstr

    for candidate in sources():
        arxiv_id = self.match_arxiv_id(candidate)
        if arxiv_id:
            return f"arXiv:{arxiv_id}"
    return ''
223+
224+
225+
class OUPFTtoREFs(XMLtoREFs):
226+
"""
227+
This class converts OUP XML references to a standardized reference format. It processes raw OUP references from
228+
either a file or a buffer and outputs parsed references, including bibcodes, authors, volume, pages, and DOI.
229+
"""
230+
231+
# substitutions handed to the base class as `cleanup`: strip <ext-link> and <uri> tags,
# and turn an empty <etal/> element into a surname reading "et al."
block_cleanup = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'</?ext-link.*?>', ''),
        (r'</?uri.*?>', ''),
        (r'<etal\s*/>', '<surname>et al.</surname>'),
    )
]
# substitutions applied, in this order, to each individual reference by `cleanup`
reference_cleanup = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'</?(ext-link|x).*?>', ''),
        (r'\sxlink:type="simple"', ''),
        (r'\s+xlink:href=', ' href='),
        (r'<inline-formula>.*?</inline-formula>', ''),
        (r'\s+xlink:type=', ' type='),
        (r'</?x.*?>', ''),
    )
]

# to match everything from the first <person-group to the last </person-group> tag
re_author_tag = re.compile(r'(<person-group.*</person-group>)')
# to match author placeholder represented by three or more hyphens
re_author_placeholder = re.compile(r'(-{3,})')
251+
252+
def __init__(self, filename: str, buffer: str):
    """
    initialize the OUPFTtoREFs object to process OUP references

    the base class is set up to split the input on the `ref` tag, to apply `block_cleanup`,
    and to use the ISO-8859-1 encoding

    :param filename: the path to the source file
    :param buffer: the XML references as a buffer
    """
    XMLtoREFs.__init__(
        self,
        filename,
        buffer,
        parsername=OUPFTtoREFs,
        tag='ref',
        cleanup=self.block_cleanup,
        encoding='ISO-8859-1',
    )
260+
261+
def cleanup(self, reference: str) -> str:
    """
    apply every substitution of `reference_cleanup`, in order, to the reference string

    :param reference: the raw reference string to clean
    :return: cleaned reference string
    """
    cleaned = reference
    for pattern, replacement in self.reference_cleanup:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned
271+
272+
def missing_authors(self, prev_reference: str, cur_reference: str) -> str:
    """
    replace author placeholder in the current reference with authors from the previous reference

    the current reference is returned unchanged when there is no previous reference, when it holds
    no placeholder (a run of three or more hyphens), or when the previous reference has no
    <person-group> element to copy from

    :param prev_reference: the previous reference containing the author information
    :param cur_reference: the current reference containing the author placeholder
    :return: the current reference with the author placeholder replaced, or the original current reference if no placeholder is found
    """
    if not prev_reference or not self.re_author_placeholder.search(cur_reference):
        return cur_reference

    match = self.re_author_tag.search(prev_reference)
    if not match:
        return cur_reference

    authors_xml = match.group(0)
    # a callable is used as the replacement so that the author xml is inserted verbatim;
    # given as a plain string it would be read as a replacement template, and any backslash
    # in a name (ie, M{\o}ller) would be treated as an escape sequence
    return self.re_author_placeholder.sub(lambda _placeholder: authors_xml, cur_reference)
285+
286+
def process_and_dispatch(self) -> List[Dict[str, List[Dict[str, str]]]]:
    """
    perform reference cleaning and parsing, then dispatch the parsed references

    every block of `self.raw_references` yields one entry in the result; a reference that raises
    ReferenceError while being parsed is logged and left out of that block's list

    :return: a list of dictionaries containing bibcodes and parsed references
    """
    references = []
    for raw_block_references in self.raw_references:
        bibcode = raw_block_references['bibcode']
        block_references = raw_block_references['block_references']
        item_nums = raw_block_references.get('item_nums', [])

        parsed_references = []
        prev_reference = ''
        for i, raw_reference in enumerate(block_references):
            reference = self.cleanup(raw_reference)
            reference = self.missing_authors(prev_reference, reference)
            # remembered after the placeholder is filled in, so that a run of placeholder
            # references keeps inheriting the same authors
            prev_reference = reference

            logger.debug("OUPFTxml: parsing %s", reference)
            try:
                oup_reference = OUPFTreference(reference)
                parsed_references.append(self.merge({**oup_reference.get_parsed_reference(), 'refraw': raw_reference}, self.any_item_num(item_nums, i)))
            except ReferenceError as error_desc:
                logger.error("OUPFTxml: error parsing reference: %s", error_desc)

        references.append({'bibcode': bibcode, 'references': parsed_references})
        # report the number of references parsed for this bibcode, not the number of blocks seen so far
        logger.debug("%s: parsed %d references", bibcode, len(parsed_references))

    return references
316+
317+
318+
# Manual test driver for OUPFTxml references.
# References are parsed from a file (-f) or from a buffer (-b) and the result is printed.
# When neither is given, the stub file test.oupft.xml is parsed and the result is compared
# with the expected parsed references; the outcome of that comparison is printed.
from adsrefpipe.tests.unittests.stubdata import parsed_references
if __name__ == '__main__':  # pragma: no cover
    parser = argparse.ArgumentParser(description='Parse OUPFT references')
    parser.add_argument('-f', '--filename', help='the path to source file')
    parser.add_argument('-b', '--buffer', help='xml reference(s)')
    args = parser.parse_args()
    if args.filename:
        print(OUPFTtoREFs(filename=args.filename, buffer=None).process_and_dispatch())
    elif args.buffer:
        print(OUPFTtoREFs(buffer=args.buffer, filename=None).process_and_dispatch())
    else:
        # if no reference source is provided, just run the source test file
        stub_dir = os.path.dirname(__file__)
        filename = os.path.abspath(stub_dir + '/../tests/unittests/stubdata/test.oupft.xml')
        result = OUPFTtoREFs(filename=filename, buffer=None).process_and_dispatch()
        # NOTE(review): the expected value is `parsed_oup` although the stub file is test.oupft.xml;
        # confirm that this is the intended entry of parsed_references
        passed = result == parsed_references.parsed_oup
        print('Test passed!' if passed else 'Test failed!')
    sys.exit(0)

adsrefpipe/tests/unittests/stubdata/parsed_references.py

Lines changed: 2 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
<ADSBIBCODE>2001FOO...999..999X</ADSBIBCODE>
2+
<ref id="bib1"> <mixed-citation publication-type="book" xlink:type="simple" xmlns:xlink="http://www.w3.org/1999/xlink"> <person-group person-group-type="author"> <name> <surname>Abramowitz</surname> <given-names>M.</given-names> </name> <name> <surname>Stegun</surname> <given-names>I.</given-names> </name> </person-group>, <year>1964</year>, <source>Handbook of Mathematical Functions</source>. <publisher-name>Dover</publisher-name>, <publisher-loc>New York</publisher-loc></mixed-citation> </ref>
3+
<ref id="bib2"> <mixed-citation publication-type="journal" xlink:type="simple" xmlns:xlink="http://www.w3.org/1999/xlink"> <person-group person-group-type="author"> <name> <surname>Andredakis</surname> <given-names>Y.</given-names> </name> <name> <surname>Peletier</surname> <given-names>R.</given-names> </name> <name> <surname>Balcells</surname> <given-names>M.</given-names> </name> </person-group>, <year>1995</year>, <source>MNRAS</source>, <volume>275</volume>, <fpage>874</fpage></mixed-citation> </ref>
4+
<ref id="bib3"> <mixed-citation publication-type="book" xlink:type="simple" xmlns:xlink="http://www.w3.org/1999/xlink"> <person-group person-group-type="author"> <name> <surname>Hubble</surname> <given-names>E.</given-names> </name> </person-group>, <year>1936</year>, <source>The Realm of the Nebulae</source>, <publisher-name>Yale Univ. Press</publisher-name>. <publisher-loc>New Haven, CT</publisher-loc></mixed-citation> </ref>
5+
<ref id="bib4"> <mixed-citation publication-type="other" xlink:type="simple" xmlns:xlink="http://www.w3.org/1999/xlink"> <person-group person-group-type="author"> <name> <surname>Prieto</surname> <given-names>M.</given-names> </name> <name> <surname>Aguerri</surname> <given-names>J. A. L.</given-names> </name> <name> <surname>Varela</surname> <given-names>A. M.</given-names> </name> <name> <surname>Munoz-Tun&oacute;n</surname> <given-names>C.</given-names> </name> </person-group>, <year>2000</year>, <source>A&amp;A, in press</source></mixed-citation> </ref>
6+
<ref id="bib5"><mixed-citation publication-type="book"><person-group person-group-type="author"><collab>Lightkurve Collaboration</collab></person-group><etal>et al</etal>., <year>2018</year>, <source>Lightkurve: Kepler and TESS time series analysis in Python, Astrophysics Source Code Library</source>. <comment>record (ascl:1812.013)</comment></mixed-citation></ref>
7+
<ref id="bib6"> <mixed-citation publication-type="journal"> <person-group person-group-type="author"> <string-name> <surname>Colberg</surname> &nbsp;<given-names>J. M.</given-names></string-name> &nbsp;<etal>et al.</etal></person-group>, <year>2008</year>, <source>MNRAS</source>, <volume>387</volume>, <fpage>933</fpage> &nbsp;<pub-id pub-id-type="doi">10.1111/j.1365-2966.2008.13307.x</pub-id></mixed-citation> </ref>
8+
<ref id="bib7"><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Schmidt</surname> <given-names>S. P.</given-names></string-name> <etal>et al.</etal></person-group>, <year>2025</year>, <comment>preprint</comment> (<pub-id pub-id-type="arxiv">arXiv:2501.18477</pub-id>)</mixed-citation></ref>

0 commit comments

Comments
 (0)