Skip to content

Commit 6a64965

Browse files
Issue 16 (#31)
* add external_identifier column * alembic upgrade add external identifier * alembic update external_identifier * add external identifier to unittest mock data * update for unit tests * check external_identifier column * external identifier update * unit test fix
1 parent c1c2363 commit 6a64965

File tree

5 files changed

+170
-41
lines changed

5 files changed

+170
-41
lines changed

adsrefpipe/app.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,15 @@
2222

2323
from texttable import Texttable
2424

25+
def _ensure_list(x):
26+
if x is None:
27+
return None
28+
# treat strings as scalars, not iterables
29+
if isinstance(x, (str, bytes)):
30+
return [x]
31+
# already list-like
32+
return list(x)
33+
2534
class ADSReferencePipelineCelery(ADSCelery):
2635
"""
2736
celery-based pipeline for processing and resolving references
@@ -306,6 +315,7 @@ def query_resolved_reference_tbl(self, history_id_list: List = None) -> List:
306315

307316
return results
308317

318+
309319
def diagnostic_query(self, bibcode_list: List = None, source_filename_list: List = None) -> List:
310320
"""
311321
perform a diagnostic query to retrieve combined reference records
@@ -315,6 +325,8 @@ def diagnostic_query(self, bibcode_list: List = None, source_filename_list: List
315325
:return: List of combined records from multiple tables
316326
"""
317327
results = []
328+
bibcode_list = _ensure_list(bibcode_list)
329+
source_filename_list = _ensure_list(source_filename_list)
318330

319331
reference_source = self.query_reference_source_tbl(bibcode_list, source_filename_list)
320332
processed_history = self.query_processed_history_tbl(bibcode_list, source_filename_list)
@@ -404,6 +416,31 @@ def update_resolved_reference_records(self, session: object, resolved_list: List
404416
self.logger.debug("Added `ResolvedReference` records successfully.")
405417
return True
406418

419+
def update_resolved_reference_records(self, session: object, resolved_list: List[ResolvedReference]) -> bool:
    """
    bulk-update resolved reference rows from the given records

    :param session: database session used for the bulk update and the flush
    :param resolved_list: ResolvedReference records carrying the values to write
    :return: True once the mappings have been handed to the session and flushed
    """
    # NOTE(review): the diff context suggests a method with this same name is
    # defined directly above; a second definition in one class body replaces
    # the first -- confirm the earlier one is meant to be superseded.

    def _as_mapping(record):
        # one dict per record, in the shape bulk_update_mappings consumes
        return {
            # must include PK columns for bulk_update_mappings
            "history_id": record.history_id,
            "item_num": record.item_num,
            "reference_str": record.reference_str,

            # fields to update
            "bibcode": record.bibcode,
            "score": record.score,
            "reference_raw": record.reference_raw,
            # a missing or None identifier is written as an empty list
            "external_identifier": _ensure_list(getattr(record, "external_identifier", None)) or [],
        }

    rows = [_as_mapping(record) for record in resolved_list]
    session.bulk_update_mappings(ResolvedReference, rows)
    session.flush()
    # NOTE(review): message says "Added" although this path updates rows -- confirm wording
    self.logger.debug("Added `ResolvedReference` records successfully.")
    return True
442+
443+
407444
def insert_compare_records(self, session: object, compared_list: List[CompareClassic]) -> bool:
408445
"""
409446
insert records into the compare classic table
@@ -537,7 +574,8 @@ def populate_tables_post_resolved(self, resolved_reference: List, source_bibcode
537574
reference_str=ref.get('refstring', None),
538575
bibcode=ref.get('bibcode', None),
539576
score=ref.get('score', None),
540-
reference_raw=ref.get('refstring', None))
577+
reference_raw=ref.get('refstring', None),
578+
external_identifier=_ensure_list(ref.get('external_identifier', None)) or [])
541579
resolved_records.append(resolved_record)
542580
if resolved_classic:
543581
compare_record = CompareClassic(history_id=history_id,

adsrefpipe/models.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33

44
from sqlalchemy import Integer, String, Column, ForeignKey, DateTime, func, Numeric, ForeignKeyConstraint
5-
from sqlalchemy.dialects.postgresql import JSONB
5+
from sqlalchemy.dialects.postgresql import JSONB, ARRAY
66
from sqlalchemy.ext.declarative import declarative_base
77

88

@@ -213,8 +213,9 @@ class ResolvedReference(Base):
213213
bibcode = Column(String)
214214
score = Column(Numeric)
215215
reference_raw = Column(String)
216+
external_identifier = Column(ARRAY(String))
216217

217-
def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: str, score: float, reference_raw: str):
218+
def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: str, score: float, reference_raw: str, external_identifier: list = None):
218219
"""
219220
initializes a resolved reference object
220221
@@ -224,13 +225,15 @@ def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode:
224225
:param bibcode: resolved bibcode
225226
:param score: confidence score of the resolved reference
226227
:param reference_raw: raw reference string
228+
:param external_identifier: list of external identifiers associated with the reference, e.g. ["doi:...", "arxiv:...", "ascl:..."]
227229
"""
228230
self.history_id = history_id
229231
self.item_num = item_num
230232
self.reference_str = reference_str
231233
self.bibcode = bibcode
232234
self.score = score
233235
self.reference_raw = reference_raw
236+
self.external_identifier = external_identifier or []
234237

235238
def toJSON(self) -> dict:
236239
"""
@@ -244,7 +247,8 @@ def toJSON(self) -> dict:
244247
'bibcode': self.bibcode,
245248
'score': self.score,
246249
'item_num': self.item_num,
247-
**({'reference_raw': self.reference_raw} if self.reference_raw else {})
250+
**({'reference_raw': self.reference_raw} if self.reference_raw else {}),
251+
'external_identifier': self.external_identifier
248252
}
249253

250254

adsrefpipe/tests/unittests/test_app.py

Lines changed: 93 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,17 @@
3030
from adsrefpipe.refparsers.handler import verify
3131
from adsrefpipe.tests.unittests.stubdata.dbdata import actions_records, parsers_records
3232

33+
import testing.postgresql
34+
35+
def _get_external_identifier(rec):
36+
"""
37+
Works whether rec is a dict (bulk mappings) or an ORM object.
38+
"""
39+
if rec is None:
40+
return []
41+
if isinstance(rec, dict):
42+
return rec.get("external_identifier") or []
43+
return getattr(rec, "external_identifier", None) or []
3344

3445
class TestDatabase(unittest.TestCase):
3546

@@ -39,18 +50,13 @@ class TestDatabase(unittest.TestCase):
3950

4051
maxDiff = None
4152

42-
postgresql_url_dict = {
43-
'port': 5432,
44-
'host': '127.0.0.1',
45-
'user': 'postgres',
46-
'database': 'postgres'
47-
}
48-
postgresql_url = 'postgresql://{user}:{user}@{host}:{port}/{database}' \
49-
.format(user=postgresql_url_dict['user'],
50-
host=postgresql_url_dict['host'],
51-
port=postgresql_url_dict['port'],
52-
database=postgresql_url_dict['database']
53-
)
53+
_postgresql = testing.postgresql.Postgresql()
54+
postgresql_url = _postgresql.url()
55+
56+
@classmethod
def tearDownClass(cls):
    """
    run the inherited class teardown, then stop the class-level temporary Postgres instance
    """
    try:
        super().tearDownClass()
    finally:
        # always stop the server held in cls._postgresql, even if the
        # inherited teardown raises, so it is not left running
        cls._postgresql.stop()
5460

5561
def setUp(self):
5662
self.test_dir = os.path.join(project_home, 'adsrefpipe/tests')
@@ -88,16 +94,22 @@ def add_stub_data(self):
8894

8995
resolved_reference = [
9096
[
91-
('J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. ','2011LRR....14....2U',1.0),
92-
('C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.','2017RPPh...80l6902M',1.0)
97+
('J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. ',
98+
'2011LRR....14....2U', 1.0, ['arxiv:1009.5514']),
99+
('C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.',
100+
'2017RPPh...80l6902M', 1.0, ['arxiv:1709.02923'])
93101
],
94102
[
95-
('Alsubai, K. A., Parley, N. R., Bramich, D. M., et al. 2011, MNRAS, 417, 709.','2011MNRAS.417..709A',1.0),
96-
('Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 2019, A&A, 625, A136 ','2019A&A...625A.136A',1.0)
103+
('Alsubai, K. A., Parley, N. R., Bramich, D. M., et al. 2011, MNRAS, 417, 709.',
104+
'2011MNRAS.417..709A', 1.0, ['doi:10.0000/mnras.417.709']),
105+
('Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 2019, A&A, 625, A136 ',
106+
'2019A&A...625A.136A', 1.0, ['doi:10.0000/aa.625.A136'])
97107
],
98108
[
99-
('Abellan, F. J., Indebetouw, R., Marcaide, J. M., et al. 2017, ApJL, 842, L24','2017ApJ...842L..24A',1.0),
100-
('Ackermann, M., Albert, A., Atwood, W. B., et al. 2016, A&A, 586, A71 ','2016A&A...586A..71A',1.0)
109+
('Abellan, F. J., Indebetouw, R., Marcaide, J. M., et al. 2017, ApJL, 842, L24',
110+
'2017ApJ...842L..24A', 1.0, ['ascl:1701.001']),
111+
('Ackermann, M., Albert, A., Atwood, W. B., et al. 2016, A&A, 586, A71 ',
112+
'2016A&A...586A..71A', 1.0, ['doi:10.0000/aa.586.A71'])
101113
],
102114
]
103115

@@ -117,8 +129,13 @@ def add_stub_data(self):
117129
]
118130

119131
with self.app.session_scope() as session:
120-
session.bulk_save_objects(actions_records)
121-
session.bulk_save_objects(parsers_records)
132+
session.query(Action).delete()
133+
session.query(Parser).delete()
134+
session.commit()
135+
if session.query(Action).count() == 0:
136+
session.bulk_save_objects(actions_records)
137+
if session.query(Parser).count() == 0:
138+
session.bulk_save_objects(parsers_records)
122139
session.commit()
123140

124141
for i, (a_reference,a_history) in enumerate(zip(reference_source,processed_history)):
@@ -453,9 +470,22 @@ def test_populate_tables_post_resolved_with_classic(self):
453470
""" test populate_tables_post_resolved when resolved_classic is available """
454471

455472
resolved_reference = [
456-
{'id': 'H1I1', 'refstring': 'Reference 1', 'bibcode': '2023A&A...657A...1X', 'score': 1.0},
457-
{'id': 'H1I2', 'refstring': 'Reference 2', 'bibcode': '2023A&A...657A...2X', 'score': 0.8}
473+
{
474+
'id': 'H1I1',
475+
'refstring': 'Reference 1',
476+
'bibcode': '2023A&A...657A...1X',
477+
'score': 1.0,
478+
'external_identifier': ['doi:10.1234/abc', 'arxiv:2301.00001'],
479+
},
480+
{
481+
'id': 'H1I2',
482+
'refstring': 'Reference 2',
483+
'bibcode': '2023A&A...657A...2X',
484+
'score': 0.8,
485+
'external_identifier': ['ascl:2301.001', 'doi:10.9999/xyz'],
486+
}
458487
]
488+
459489
source_bibcode = "2023A&A...657A...1X"
460490
classic_resolved_filename = "classic_results.txt"
461491
classic_resolved_reference = [
@@ -476,6 +506,12 @@ def test_populate_tables_post_resolved_with_classic(self):
476506
mock_insert.assert_called_once()
477507
mock_logger.assert_called_with("Updated 2 resolved reference records successfully.")
478508

509+
# Check whether external_identifier is populated with correct data
510+
_, resolved_records = mock_update.call_args[0]
511+
self.assertEqual(len(resolved_records), 2)
512+
self.assertEqual(_get_external_identifier(resolved_records[0]), ['doi:10.1234/abc', 'arxiv:2301.00001'])
513+
self.assertEqual(_get_external_identifier(resolved_records[1]), ['ascl:2301.001', 'doi:10.9999/xyz'])
514+
479515
@patch("adsrefpipe.app.ProcessedHistory")
480516
@patch("adsrefpipe.app.ResolvedReference")
481517
@patch("adsrefpipe.app.CompareClassic")
@@ -745,18 +781,13 @@ class TestDatabaseNoStubdata(unittest.TestCase):
745781

746782
maxDiff = None
747783

748-
postgresql_url_dict = {
749-
'port': 5432,
750-
'host': '127.0.0.1',
751-
'user': 'postgres',
752-
'database': 'postgres'
753-
}
754-
postgresql_url = 'postgresql://{user}:{user}@{host}:{port}/{database}' \
755-
.format(user=postgresql_url_dict['user'],
756-
host=postgresql_url_dict['host'],
757-
port=postgresql_url_dict['port'],
758-
database=postgresql_url_dict['database']
759-
)
784+
_postgresql = testing.postgresql.Postgresql()
785+
postgresql_url = _postgresql.url()
786+
787+
@classmethod
def tearDownClass(cls):
    """
    run the inherited class teardown, then stop the class-level temporary Postgres instance
    """
    try:
        super().tearDownClass()
    finally:
        # always stop the server held in cls._postgresql, even if the
        # inherited teardown raises, so it is not left running
        cls._postgresql.stop()
760791

761792
def setUp(self):
762793
self.test_dir = os.path.join(project_home, 'adsrefpipe/tests')
@@ -805,26 +836,36 @@ def test_populate_tables(self):
805836
"refraw": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.",
806837
"id": "H1I2"}
807838
]
839+
840+
# IMPORTANT: use the real column name expected by app/models: external_identifier (list)
808841
resolved_references = [
809842
{
810843
"score": "1.0",
811844
"bibcode": "2011LRR....14....2U",
812845
"refstring": "J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. ",
813846
"refraw": "J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. ",
814-
"id": "H1I1"
847+
"id": "H1I1",
848+
"external_identifier": ["arxiv:1009.5514", "doi:10.1234/abc"]
815849
},
816850
{
817851
"score": "1.0",
818852
"bibcode": "2017RPPh...80l6902M",
819853
"refstring": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.",
820854
"refraw": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.",
821855
"id": "H1I2",
856+
"external_identifier": ["arxiv:1709.02923", "ascl:2301.001"]
822857
}
823858
]
859+
824860
arXiv_stubdata_dir = os.path.join(self.test_dir, 'unittests/stubdata/txt/arXiv/0/')
825861
with self.app.session_scope() as session:
826-
session.bulk_save_objects(actions_records)
827-
session.bulk_save_objects(parsers_records)
862+
session.query(Action).delete()
863+
session.query(Parser).delete()
864+
session.commit()
865+
if session.query(Action).count() == 0:
866+
session.bulk_save_objects(actions_records)
867+
if session.query(Parser).count() == 0:
868+
session.bulk_save_objects(parsers_records)
828869
session.commit()
829870

830871
references = self.app.populate_tables_pre_resolved_initial_status(
@@ -842,6 +883,20 @@ def test_populate_tables(self):
842883
classic_resolved_filename=os.path.join(arXiv_stubdata_dir, '00001.raw.result'))
843884
self.assertTrue(status == True)
844885

886+
# Verify external_identifier was persisted on ResolvedReference rows
887+
# We know history_id should be 1 for the first inserted ProcessedHistory in an empty DB.
888+
rows = (
889+
session.query(ResolvedReference)
890+
.filter(ResolvedReference.history_id == 1)
891+
.order_by(ResolvedReference.item_num.asc())
892+
.all()
893+
)
894+
self.assertEqual(len(rows), 2)
895+
self.assertEqual(rows[0].item_num, 1)
896+
self.assertEqual(rows[1].item_num, 2)
897+
self.assertEqual(rows[0].external_identifier, ["arxiv:1009.5514", "doi:10.1234/abc"])
898+
self.assertEqual(rows[1].external_identifier, ["arxiv:1709.02923", "ascl:2301.001"])
899+
845900
def test_get_parser_error(self):
846901
""" test get_parser when it errors for unrecognized source filename """
847902
with patch.object(self.app.logger, 'error') as mock_error:
@@ -851,3 +906,4 @@ def test_get_parser_error(self):
851906

852907
if __name__ == '__main__':
853908
unittest.main()
909+

adsrefpipe/tests/unittests/test_tasks.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ def add_stub_data(self):
7474
]
7575

7676
with self.app.session_scope() as session:
77+
session.query(Action).delete()
78+
session.query(Parser).delete()
7779
session.bulk_save_objects(actions_records)
7880
session.bulk_save_objects(parsers_records)
7981
session.commit()
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
"""add_external_identifier
2+
3+
Revision ID: 08ca70bd6f5f
4+
Revises: e3d6e15c3b8c
5+
Create Date: 2026-01-05 11:16:27.454389
6+
7+
"""
8+
from alembic import op
9+
import sqlalchemy as sa
10+
from sqlalchemy.dialects import postgresql
11+
12+
13+
# revision identifiers, used by Alembic.
14+
revision = '08ca70bd6f5f'
15+
down_revision = 'e3d6e15c3b8c'
16+
branch_labels = None
17+
depends_on = None
18+
19+
20+
def upgrade():
    """Add the ``external_identifier`` string-array column to ``resolved_reference``."""
    # no nullable flag or server default is specified for the new column
    identifier_column = sa.Column("external_identifier", postgresql.ARRAY(sa.String()))
    op.add_column('resolved_reference', identifier_column)
25+
26+
27+
def downgrade():
    """Drop the ``external_identifier`` column from ``resolved_reference``."""
    table, column = 'resolved_reference', 'external_identifier'
    op.drop_column(table, column)
29+

0 commit comments

Comments
 (0)