Skip to content

Commit 43e1c25

Browse files
authored
Merge pull request #15 from golnazads/master
Issues #3, #4, #14, and #15
2 parents 161aeed + c97a8c6 commit 43e1c25

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

63 files changed

+7791
-2465
lines changed

.github/workflows/python_actions.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ jobs:
4545

4646
- name: Install dependencies
4747
run: |
48-
python -m pip install --upgrade setuptools pip
48+
python -m pip install --upgrade setuptools "pip<24.1"
4949
pip install -r requirements.txt
5050
pip install -r dev-requirements.txt
5151
- name: Test with pytest

README.md

100644100755
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ This pipeline is to process source reference files, if xml to parse them first a
5454
python run.py RESOLVE -s <list of source filenames separated by spaces>
5555
```
5656
57-
2. Specify a directory, and file extension, to recursively search all sub directories for this type of reference file, and queue them all for processing, use the command
57+
2. To recursively search all subdirectories of a directory for reference files with a given extension (e.g. -e *.raw) and queue them all for processing, use the command
5858
```
5959
python run.py RESOLVE -p <source files path> -e <source files extension>
6060
```

adsrefpipe/app.py

100644100755
Lines changed: 391 additions & 244 deletions
Large diffs are not rendered by default.

adsrefpipe/models.py

100644100755
Lines changed: 90 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -20,17 +20,19 @@ class Action(Base):
2020
__tablename__ = 'action'
2121
status = Column(String, primary_key=True)
2222

23-
def get_status_new(self):
23+
def get_status_new(self) -> str:
2424
"""
25+
returns the initial status
2526
26-
:return:
27+
:return: string indicating the initial status
2728
"""
2829
return 'initial'
2930

30-
def get_status_retry(self):
31+
def get_status_retry(self) -> str:
3132
"""
33+
returns the retry status
3234
33-
:return:
35+
:return: string indicating the retry status
3436
"""
3537
return 'retry'
3638

@@ -49,50 +51,57 @@ class Parser(Base):
4951
reference_service_endpoint = Column(String)
5052
matches = Column(JSONB, default=dict)
5153

52-
def __init__(self, name, extension_pattern, reference_service_endpoint, matches=[]):
54+
def __init__(self, name: str, extension_pattern: str, reference_service_endpoint: str, matches: list = []):
5355
"""
56+
initializes a parser object
5457
55-
:param name:
56-
:param extension_pattern:
57-
:param reference_service_endpoint:
58-
:param matches:
58+
:param name: name of the parser
59+
:param extension_pattern: reference file extension pattern used by the parser
60+
:param reference_service_endpoint: endpoint for the reference service
61+
:param matches: list of matches for the parser-reference file mapping
5962
"""
6063
self.name = name
6164
self.extension_pattern = extension_pattern
6265
self.reference_service_endpoint = reference_service_endpoint
6366
self.matches = matches
6467

65-
def get_name(self):
68+
def get_name(self) -> str:
6669
"""
70+
returns the name of the parser
6771
68-
:return:
72+
:return: string indicating the name of the parser
6973
"""
7074
return self.name
7175

72-
def get_extension_pattern(self):
76+
def get_extension_pattern(self) -> str:
7377
"""
78+
returns the extension pattern of the reference files processed by the parser
7479
75-
:return:
80+
:return: string indicating the file extension pattern
7681
"""
7782
return self.extension_pattern
7883

79-
def get_endpoint(self):
84+
def get_endpoint(self) -> str:
8085
"""
86+
returns the reference service endpoint to resolve references
8187
82-
:return:
88+
:return: string indicating the reference service endpoint
8389
"""
8490
return self.reference_service_endpoint
8591

86-
def get_matches(self):
92+
def get_matches(self) -> list:
8793
"""
94+
returns the list of mappings for the parser
8895
89-
:return:
96+
:return: list of matches
9097
"""
9198
return self.matches
9299

93-
def toJSON(self):
100+
def toJSON(self) -> dict:
94101
"""
95-
:return: values formatted as python dict
102+
converts the parser object to a JSON dictionary
103+
104+
:return: dictionary containing the parser details
96105
"""
97106
return {
98107
'name': self.name,
@@ -103,28 +112,37 @@ def toJSON(self):
103112

104113

105114
class ReferenceSource(Base):
115+
"""
116+
This class represents the source of a reference in the database,
117+
each entry links a source file with its resolved version and
118+
the parser used to process the reference.
119+
It serves as the initial record for the reference processing pipeline.
120+
"""
106121
__tablename__ = 'reference_source'
107122
bibcode = Column(String, primary_key=True)
108123
source_filename = Column(String, primary_key=True)
109124
resolved_filename = Column(String)
110125
parser_name = Column(String, ForeignKey('parser.name'))
111126

112-
def __init__(self, bibcode, source_filename, resolved_filename, parser_name):
127+
def __init__(self, bibcode: str, source_filename: str, resolved_filename: str, parser_name: str):
113128
"""
129+
initializes a reference source object
114130
115-
:param bibcode:
116-
:param source_filename:
117-
:param resolved_filename:
118-
:param parser_name:
131+
:param bibcode: unique bibcode for the reference source
132+
:param source_filename: name of the reference file
133+
:param resolved_filename: name of the resolved file for future use
134+
:param parser_name: name of the parser used
119135
"""
120136
self.bibcode = bibcode
121137
self.source_filename = source_filename
122138
self.resolved_filename = resolved_filename
123139
self.parser_name = parser_name
124140

125-
def toJSON(self):
141+
def toJSON(self) -> dict:
126142
"""
127-
:return: values formatted as python dict, if no values found returns empty structure, not None
143+
converts the reference source object to a JSON dictionary
144+
145+
:return: dictionary containing reference source details
128146
"""
129147
return {
130148
'bibcode': self.bibcode,
@@ -135,6 +153,10 @@ def toJSON(self):
135153

136154

137155
class ProcessedHistory(Base):
156+
"""
157+
This class tracks the processing history of a resolved reference, recording details about the processing status,
158+
reference file timestamp, and the total number of references parsed.
159+
"""
138160
__tablename__ = 'processed_history'
139161
__table_args__ = (ForeignKeyConstraint( ['bibcode', 'source_filename'], ['reference_source.bibcode', 'reference_source.source_filename']),)
140162
id = Column(Integer, primary_key=True)
@@ -145,15 +167,16 @@ class ProcessedHistory(Base):
145167
date = Column(DateTime, default=func.now())
146168
total_ref = Column(Integer)
147169

148-
def __init__(self, bibcode, source_filename, source_modified, status, date, total_ref):
170+
def __init__(self, bibcode: str, source_filename: str, source_modified: DateTime, status: str, date: DateTime, total_ref: int):
149171
"""
172+
initializes a processed history object
150173
151-
:param bibcode:
152-
:param source_filename:
153-
:param source_modified:
154-
:param status:
155-
:param date:
156-
:param total_ref:
174+
:param bibcode: bibcode for the reference source
175+
:param source_filename: name of the source reference file
176+
:param source_modified: timestamp of the reference file at the time it was read
177+
:param status: first time processing, or reprocessing this list of references
178+
:param date: date of processing
179+
:param total_ref: total number of references parsed
157180
"""
158181
self.bibcode = bibcode
159182
self.source_filename = source_filename
@@ -162,9 +185,11 @@ def __init__(self, bibcode, source_filename, source_modified, status, date, tota
162185
self.date = date
163186
self.total_ref = total_ref
164187

165-
def toJSON(self):
188+
def toJSON(self) -> dict:
166189
"""
167-
:return: values formatted as python dict, if no values found returns empty structure, not None
190+
converts the processed history object to a JSON dictionary
191+
192+
:return: dictionary containing processed history details
168193
"""
169194
return {
170195
'bibcode': self.bibcode,
@@ -177,6 +202,10 @@ def toJSON(self):
177202

178203

179204
class ResolvedReference(Base):
205+
"""
206+
This class stores information about references that have been resolved, including the reference string, score,
207+
and its associated history entry.
208+
"""
180209
__tablename__ = 'resolved_reference'
181210
history_id = Column(Integer, ForeignKey('processed_history.id'), primary_key=True)
182211
item_num = Column(Integer, primary_key=True)
@@ -185,14 +214,16 @@ class ResolvedReference(Base):
185214
score = Column(Numeric)
186215
reference_raw = Column(String)
187216

188-
def __init__(self, history_id, item_num, reference_str, bibcode, score, reference_raw):
217+
def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: str, score: float, reference_raw: str):
189218
"""
219+
initializes a resolved reference object
190220
191-
:param history_id:
192-
:param item_num
193-
:param reference_str:
194-
:param bibcode:
195-
:param score:
221+
:param history_id: ID of the related processed history entry
222+
:param item_num: order of the reference within the source
223+
:param reference_str: reference string
224+
:param bibcode: resolved bibcode
225+
:param score: confidence score of the resolved reference
226+
:param reference_raw: raw reference string
196227
"""
197228
self.history_id = history_id
198229
self.item_num = item_num
@@ -201,35 +232,28 @@ def __init__(self, history_id, item_num, reference_str, bibcode, score, referenc
201232
self.score = score
202233
self.reference_raw = reference_raw
203234

204-
def toJSON(self):
235+
def toJSON(self) -> dict:
205236
"""
206-
:return: values formatted as python dict, if no values found returns empty structure, not None
237+
converts the resolved reference object to a JSON dictionary
238+
239+
:return: dictionary containing resolved reference details
207240
"""
208-
if self.reference_raw:
209-
return {
210-
'history_id': self.history_id,
211-
'reference_str': self.reference_str,
212-
'bibcode': self.bibcode,
213-
'score': self.score,
214-
'item_num': self.item_num,
215-
'reference_raw': self.reference_raw
216-
}
217-
# do not include reference_raw if it is None
218241
return {
219242
'history_id': self.history_id,
220243
'reference_str': self.reference_str,
221244
'bibcode': self.bibcode,
222245
'score': self.score,
223246
'item_num': self.item_num,
247+
**({'reference_raw': self.reference_raw} if self.reference_raw else {})
224248
}
225249

226250

227251
class CompareClassic(Base):
228252
"""
229253
This table is for comparing classic resolver with service reference,
230254
keeps track of service reference that matched classic reference
231-
bibcode and score here is for classic
232-
255+
bibcode and score here are for classic; this should be a temporary class
256+
only used during development/testing and verification
233257
"""
234258
__tablename__ = 'compare_classic'
235259
history_id = Column(Integer, ForeignKey('processed_history.id'), primary_key=True)
@@ -238,24 +262,27 @@ class CompareClassic(Base):
238262
score = Column(Numeric)
239263
state = Column(String)
240264

241-
def __init__(self, history_id, item_num, bibcode, score, state):
265+
def __init__(self, history_id: int, item_num: int, bibcode: str, score: Numeric, state: str):
242266
"""
267+
initializes a compare classic object
243268
244-
:param history_id:
245-
:param item_num:
246-
:param bibcode:
247-
:param classic_score:
248-
:param state:
269+
:param history_id: ID of the related processed history entry
270+
:param item_num: order of the reference within the source
271+
:param bibcode: resolved bibcode
272+
:param score: confidence score of the resolved reference
273+
:param state: comparison state (e.g. matched, unmatched)
249274
"""
250275
self.history_id = history_id
251276
self.item_num = item_num
252277
self.bibcode = bibcode
253278
self.score = score
254279
self.state = state
255280

256-
def toJSON(self):
281+
def toJSON(self) -> dict:
257282
"""
258-
:return: values formatted as python dict, if no values found returns empty structure, not None
283+
converts the compare classic object to a JSON dictionary
284+
285+
:return: dictionary containing compare classic details
259286
"""
260287
return {
261288
'history_id': self.history_id,

adsrefpipe/refparsers/AASxml.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11

22
import sys, os
3-
import regex as re
43
import argparse
4+
from typing import List, Dict
55

66
from adsputils import setup_logging, load_config
7-
87
logger = setup_logging('refparsers')
98
config = {}
109
config.update(load_config())
@@ -15,9 +14,14 @@
1514

1615

1716
class AASreference(XMLreference):
17+
"""
18+
This class handles parsing AAS references in XML format. It extracts citation information such as authors,
19+
year, journal, title, volume, pages, DOI, and eprint, and stores the parsed details.
20+
"""
1821

1922
def parse(self):
2023
"""
24+
parse the AAS reference and extract citation information such as authors, year, title, and DOI
2125
2226
:return:
2327
"""
@@ -50,22 +54,26 @@ def parse(self):
5054

5155

5256
class AAStoREFs(XMLtoREFs):
57+
"""
58+
This class converts AAS XML references to a standardized reference format. It processes raw AAS references from
59+
either a file or a buffer and outputs parsed references, including bibcodes, authors, volume, pages, and DOI.
60+
"""
5361

54-
def __init__(self, filename, buffer):
62+
def __init__(self, filename: str, buffer: str):
5563
"""
64+
initialize the AAStoREFs object to process AAS references
5665
57-
:param filename:
58-
:param buffer:
59-
:param unicode:
60-
:param tag:
66+
:param filename: the path to the source file
67+
:param buffer: the XML references as a buffer
6168
"""
6269
XMLtoREFs.__init__(self, filename, buffer, parsername=AAStoREFs, tag='CITATION')
6370

6471

65-
def process_and_dispatch(self):
72+
def process_and_dispatch(self) -> List[Dict[str, List[Dict[str, str]]]]:
6673
"""
74+
perform reference cleaning and parsing, then dispatch the parsed references
6775
68-
:return:
76+
:return: a list of dictionaries containing bibcodes and parsed references
6977
"""
7078
references = []
7179
for raw_block_references in self.raw_references:
@@ -90,6 +98,10 @@ def process_and_dispatch(self):
9098
return references
9199

92100

101+
# This is the main program used for manual testing and verification of AASxml references.
102+
# It allows parsing references from either a file or a buffer, and if no input is provided,
103+
# it runs a source test file to verify the functionality against expected parsed results.
104+
# The test results are printed to indicate whether the parsing is successful or not.
93105
from adsrefpipe.tests.unittests.stubdata import parsed_references
94106
if __name__ == '__main__': # pragma: no cover
95107
parser = argparse.ArgumentParser(description='Parse AAS references')

0 commit comments

Comments
 (0)