diff --git a/adsmp/app.py b/adsmp/app.py
index b3fbf9a..c2cd9fd 100644
--- a/adsmp/app.py
+++ b/adsmp/app.py
@@ -181,14 +181,6 @@ def update_storage(self, bibcode, type, payload):
record.scix_id = "scix:" + str(self.generate_scix_id(record.bib_data))
out = record.toJSON()
session.commit()
-
- # Send payload to Boost pipeline
- if type != 'boost' and not self._config.get('TESTING_MODE', False):
- try:
- self.generate_boost_request_message(bibcode)
- except Exception as e:
- self.logger.exception('Error generating boost request message for bibcode %s: %s', bibcode, e)
-
return out
except exc.IntegrityError:
self.logger.exception('error in app.update_storage while updating database for bibcode {}, type {}'.format(bibcode, type))
@@ -196,7 +188,11 @@ def update_storage(self, bibcode, type, payload):
raise
def generate_scix_id(self, bib_data):
-        return scix_id.generate_scix_id(bib_data)
+        """Generate a scix id for bib_data, passing SCIX_ID_GENERATION_FIELDS when configured."""
+        # A missing or empty SCIX_ID_GENERATION_FIELDS setting is forwarded as None,
+        # so a single lookup with an `or None` fallback covers both cases.
+        user_fields = self._config.get('SCIX_ID_GENERATION_FIELDS') or None
+        return scix_id.generate_scix_id(bib_data, user_fields=user_fields)
def delete_by_bibcode(self, bibcode):
with self.session_scope() as session:
@@ -894,7 +890,7 @@ def should_include_in_sitemap(self, record):
3. If processed, processing isn't too stale
Args:
- record: Dictionary with record data including bib_data, status, timestamps
+ record: Dictionary with record data including has_bib_data, status, timestamps
Returns:
bool: True if record should be included in sitemap, False otherwise
@@ -903,14 +899,14 @@ def should_include_in_sitemap(self, record):
# Extract values from record dictionary
bibcode = record.get('bibcode', None)
- bib_data = record.get('bib_data', None)
+ has_bib_data = record.get('has_bib_data', None)
bib_data_updated = record.get('bib_data_updated')
solr_processed = record.get('solr_processed')
status = record.get('status')
# Must have bibliographic data
- if not bib_data or not bibcode or (isinstance(bib_data, str) and not bib_data.strip()):
- self.logger.debug('Excluding %s from sitemap: No bibcode or bib_data', bibcode)
+ if not has_bib_data or not bibcode:
+ self.logger.debug('Excluding %s from sitemap: No bibcode or has_bib_data is False', bibcode)
return False
# Exclude if SOLR failed or if record is being retried (previously failed)
@@ -959,6 +955,8 @@ def get_records_bulk(self, bibcodes, session, load_only=None):
record_data = {}
for field in (load_only or ['id', 'bibcode', 'bib_data', 'bib_data_updated', 'solr_processed', 'status']):
record_data[field] = getattr(record, field, None)
+ # Add has_bib_data boolean for sitemap checks
+ record_data['has_bib_data'] = bool(record_data.get('bib_data'))
records_dict[record.bibcode] = record_data
return records_dict
diff --git a/adsmp/solr_updater.py b/adsmp/solr_updater.py
index 808a961..eccfc5f 100644
--- a/adsmp/solr_updater.py
+++ b/adsmp/solr_updater.py
@@ -179,6 +179,7 @@ def extract_classifications_pipeline(db_classifications, solrdoc):
"""retrieve expected classifier collections
classifications is a solr virtual field so it should never be set"""
+    db_classifications = [element for element in (db_classifications or []) if element] # remove empty strings; None becomes []
if db_classifications is None or len(db_classifications) == 0:
return {"database" : solrdoc.get("database", None)}
diff --git a/adsmp/tasks.py b/adsmp/tasks.py
index efdfc3e..53a6cef 100644
--- a/adsmp/tasks.py
+++ b/adsmp/tasks.py
@@ -42,10 +42,73 @@
Queue('update-sitemap-files', app.exchange, routing_key='update-sitemap-files'),
Queue('update-scixid', app.exchange, routing_key='update-scixid'),
Queue('boost-request', app.exchange, routing_key='boost-request'),
+ Queue('augment-record', app.exchange, routing_key='augment-record'),
)
# ============================= TASKS ============================================= #
+@app.task(queue='augment-record')
+def task_augment_record(msg):
+    """Store an augmentation payload, then queue a boost request per saved bibcode.
+
+    @param msg: protobuf that contains at minimum
+        - bibcode
+        - and specific payload
+    """
+    logger.debug('Updating record: %s', msg)
+    status = app.get_msg_status(msg)
+    logger.debug(f'Message status: {status}')
+    msg_type = app.get_msg_type(msg)
+    logger.debug(f'Message type: {msg_type}')
+    # bibcodes for which app.update_storage returned a record
+    saved = []
+
+    if status == 'active':
+        # save into a database
+        # passed msg may contain details on one bibcode or a list of bibcodes
+        if msg_type == 'nonbib_records':
+            for m in msg.nonbib_records:
+                m = Msg(m, None, None) # m is a raw protobuf, TODO: return proper instance from .nonbib_records
+                record = app.update_storage(m.bibcode, 'nonbib_data', m.toJSON())
+                if record:
+                    saved.append(m.bibcode)
+                    logger.debug('Saved record from list: %s', record)
+        elif msg_type == 'metrics_records':
+            for m in msg.metrics_records:
+                m = Msg(m, None, None)
+                record = app.update_storage(m.bibcode, 'metrics', m.toJSON(including_default_value_fields=True))
+                if record:
+                    saved.append(m.bibcode)
+                    logger.debug('Saved record from list: %s', record)
+        elif msg_type == 'augment':
+            record = app.update_storage(msg.bibcode, 'augment',
+                                        msg.toJSON(including_default_value_fields=True))
+            if record:
+                saved.append(msg.bibcode)
+                logger.debug('Saved augment message: %s', msg)
+        elif msg_type == 'classify':
+            payload = msg.toJSON(including_default_value_fields=True)
+            logger.debug(f'message to JSON: {payload}')
+            payload = payload['collections']
+            record = app.update_storage(msg.bibcode, 'classify',payload)
+            if record:
+                saved.append(msg.bibcode)
+                logger.debug('Saved classify message: %s', msg)
+        else:
+            # here when record has a single bibcode
+            record = app.update_storage(msg.bibcode, msg_type, msg.toJSON())
+            if record:
+                saved.append(msg.bibcode)
+                logger.debug('Saved record: %s', record)
+        # Send payload to Boost pipeline, once for every bibcode that was actually saved
+        if msg_type != 'boost' and not app._config.get('TESTING_MODE', False):
+            for bibcode in saved:
+                try:
+                    task_boost_request.apply_async(args=(bibcode,))
+                except Exception as e:
+                    app.logger.exception('Error generating boost request message for bibcode %s: %s', bibcode, e)
+    else:
+        logger.error('Received a message with unclear status: %s', msg)
@app.task(queue='update-record')
def task_update_record(msg):
@@ -94,6 +157,7 @@ def task_update_record(msg):
record = app.update_storage(m.bibcode, 'nonbib_data', m.toJSON())
if record:
logger.debug('Saved record from list: %s', record)
+ _generate_boost_request(m, type)
elif type == 'metrics_records':
for m in msg.metrics_records:
m = Msg(m, None, None)
@@ -101,12 +165,14 @@ def task_update_record(msg):
record = app.update_storage(m.bibcode, 'metrics', m.toJSON(including_default_value_fields=True))
if record:
logger.debug('Saved record from list: %s', record)
+ _generate_boost_request(m, type)
elif type == 'augment':
bibcodes.append(msg.bibcode)
record = app.update_storage(msg.bibcode, 'augment',
msg.toJSON(including_default_value_fields=True))
if record:
logger.debug('Saved augment message: %s', msg)
+ _generate_boost_request(msg, type)
elif type == 'classify':
bibcodes.append(msg.bibcode)
logger.debug(f'message to JSON: {msg.toJSON(including_default_value_fields=True)}')
@@ -115,21 +181,32 @@ def task_update_record(msg):
record = app.update_storage(msg.bibcode, 'classify',payload)
if record:
logger.debug('Saved classify message: %s', msg)
+ _generate_boost_request(msg, type)
else:
# here when record has a single bibcode
bibcodes.append(msg.bibcode)
record = app.update_storage(msg.bibcode, type, msg.toJSON())
if record:
logger.debug('Saved record: %s', record)
+ _generate_boost_request(msg, type)
if type == 'metadata':
# with new bib data we request to augment the affiliation
# that pipeline will eventually respond with a msg to task_update_record
logger.debug('requesting affilation augmentation for %s', msg.bibcode)
app.request_aff_augment(msg.bibcode)
-
else:
logger.error('Received a message with unclear status: %s', msg)
+def _generate_boost_request(msg, msg_type):
+    """Queue a boost request for msg.bibcode unless msg_type is ignored or TESTING_MODE is set."""
+    if msg_type not in app._config.get('IGNORED_BOOST_PAYLOAD_TYPES', ['boost']) and not app._config.get('TESTING_MODE', False):
+        try:
+            task_boost_request.apply_async(args=(msg.bibcode,))
+        except Exception as e:
+            app.logger.exception('Error generating boost request message for bibcode %s: %s', msg.bibcode, e)
+    else:
+        app.logger.debug("Message for bibcode %s has type: %s, Skipping.", msg.bibcode, msg_type)
+
@app.task(queue='update-scixid')
def task_update_scixid(bibcodes, flag):
"""Receives bibcodes to add scix id to the record.
@@ -428,7 +505,7 @@ def task_cleanup_invalid_sitemaps():
session.query(
SitemapInfo.id,
SitemapInfo.bibcode,
- Records.bib_data,
+ (Records.bib_data.isnot(None)).label('has_bib_data'),
Records.bib_data_updated,
Records.solr_processed,
Records.status
@@ -457,7 +534,7 @@ def task_cleanup_invalid_sitemaps():
# Convert to dict for should_include_in_sitemap function
record_dict = {
'bibcode': record_data.bibcode,
- 'bib_data': record_data.bib_data,
+ 'has_bib_data': record_data.has_bib_data,
'bib_data_updated': record_data.bib_data_updated,
'solr_processed': record_data.solr_processed,
'status': record_data.status
@@ -626,7 +703,7 @@ def task_manage_sitemap(bibcodes, action):
# Apply SOLR filtering - convert record to dict for should_include_in_sitemap
record_dict = {
'bibcode': record.bibcode,
- 'bib_data': record.bib_data,
+ 'has_bib_data': bool(record.bib_data),
'bib_data_updated': record.bib_data_updated,
'solr_processed': record.solr_processed,
'status': record.status
@@ -688,7 +765,6 @@ def task_manage_sitemap(bibcodes, action):
logger.info('Bootstrap completed: %d successful, %d failed out of %d total records',
successful_count, failed_count, processed)
logger.info('All records marked with update_flag=True')
- logger.info('Run --update-sitemap-files to generate sitemap XML files')
return
elif action in ['add', 'force-update']:
diff --git a/adsmp/tests/test_app.py b/adsmp/tests/test_app.py
index 9c4bb21..178982d 100644
--- a/adsmp/tests/test_app.py
+++ b/adsmp/tests/test_app.py
@@ -1,56 +1,59 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-import mock
-from mock import patch
-import unittest
-import os
-import sys
import json
+import os
import re
+import sys
import tempfile
import time
+import unittest
+from datetime import timedelta
import adsputils
-from adsmp import app, models
-from adsmp.models import Base, MetricsBase, Records, SitemapInfo, ChangeLog
-from adsputils import get_date
+import mock
import testing.postgresql
+from adsputils import get_date
+from mock import patch
from sqlalchemy.exc import IntegrityError, SQLAlchemyError
-from datetime import timedelta
+
+from adsmp import app, models
+from adsmp.models import Base, ChangeLog, MetricsBase, Records, SitemapInfo
class TestAdsOrcidCelery(unittest.TestCase):
"""
Tests the appliction's methods
"""
-
+
@classmethod
def setUpClass(cls):
- cls.postgresql = \
- testing.postgresql.Postgresql(host='127.0.0.1', port=15678, user='postgres',
- database='test')
+ cls.postgresql = testing.postgresql.Postgresql(
+ host="127.0.0.1", port=15678, user="postgres", database="test"
+ )
@classmethod
def tearDownClass(cls):
cls.postgresql.stop()
-
+
def setUp(self):
unittest.TestCase.setUp(self)
-
- proj_home = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
- with mock.patch.dict('os.environ', {'ADS_API_TOKEN': 'fixme'}):
- self.app = app.ADSMasterPipelineCelery('test', local_config=\
- {
- 'SQLALCHEMY_URL': 'sqlite:///',
- 'METRICS_SQLALCHEMY_URL': 'postgresql://postgres@127.0.0.1:15678/test',
- 'SQLALCHEMY_ECHO': False,
- 'PROJ_HOME' : proj_home,
- 'TEST_DIR' : os.path.join(proj_home, 'adsmp/tests'),
- })
+
+ proj_home = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
+ with mock.patch.dict("os.environ", {"ADS_API_TOKEN": "fixme"}):
+ self.app = app.ADSMasterPipelineCelery(
+ "test",
+ local_config={
+ "SQLALCHEMY_URL": "sqlite:///",
+ "METRICS_SQLALCHEMY_URL": "postgresql://postgres@127.0.0.1:15678/test",
+ "SQLALCHEMY_ECHO": False,
+ "PROJ_HOME": proj_home,
+ "TEST_DIR": os.path.join(proj_home, "adsmp/tests"),
+ },
+ )
Base.metadata.bind = self.app._session.get_bind()
Base.metadata.create_all()
-
+
MetricsBase.metadata.bind = self.app._metrics_engine
MetricsBase.metadata.create_all()
@@ -61,228 +64,306 @@ def tearDown(self):
self.app.close_app()
def test_app(self):
- assert self.app._config.get('SQLALCHEMY_URL') == 'sqlite:///'
- assert self.app.conf.get('SQLALCHEMY_URL') == 'sqlite:///'
+ assert self.app._config.get("SQLALCHEMY_URL") == "sqlite:///"
+ assert self.app.conf.get("SQLALCHEMY_URL") == "sqlite:///"
def test_mark_processed(self):
- self.app.mark_processed(['abc'], 'solr', checksums=['jkl'], status='success')
- r = self.app.get_record('abc')
+ self.app.mark_processed(["abc"], "solr", checksums=["jkl"], status="success")
+ r = self.app.get_record("abc")
self.assertEqual(r, None)
-
- self.app.update_storage('abc', 'bib_data', {'bibcode': 'abc', 'hey': 1})
- self.app.mark_processed(['abc'], 'solr', checksums=['jkl'], status='success')
- r = self.app.get_record('abc')
-
- self.assertTrue(r['solr_processed'])
- self.assertTrue(r['status'])
-
- self.app.mark_processed(['abc'], 'solr', checksums=['jkl'], status='solr-failed')
- r = self.app.get_record('abc')
- self.assertTrue(r['solr_processed'])
- self.assertTrue(r['processed'])
- self.assertEqual(r['status'], 'solr-failed')
+
+ self.app.update_storage("abc", "bib_data", {"bibcode": "abc", "hey": 1})
+ self.app.mark_processed(["abc"], "solr", checksums=["jkl"], status="success")
+ r = self.app.get_record("abc")
+
+ self.assertTrue(r["solr_processed"])
+ self.assertTrue(r["status"])
+
+ self.app.mark_processed(
+ ["abc"], "solr", checksums=["jkl"], status="solr-failed"
+ )
+ r = self.app.get_record("abc")
+ self.assertTrue(r["solr_processed"])
+ self.assertTrue(r["processed"])
+ self.assertEqual(r["status"], "solr-failed")
def test_index_solr(self):
- self.app.update_storage('abc', 'bib_data', {'bibcode': 'abc', 'hey': 1, 'test': 'test'})
- self.app.update_storage('foo', 'bib_data', {'bibcode': 'foo', 'hey': 1})
-
- with mock.patch('adsmp.solr_updater.update_solr', return_value=[200]):
- self.app.index_solr([{'bibcode': 'abc'},
- {'bibcode': 'foo'}],
- ['checksum1', 'checksum2'],
- ['http://solr1'])
+ self.app.update_storage(
+ "abc",
+ "bib_data",
+ {
+ "bibcode": "abc",
+ "hey": 1,
+ "test": "test",
+ "title": "Test record abc solr",
+ },
+ )
+ self.app.update_storage(
+ "foo",
+ "bib_data",
+ {"bibcode": "foo", "hey": 1, "title": "Test record foo solr"},
+ )
+
+ with mock.patch("adsmp.solr_updater.update_solr", return_value=[200]):
+ self.app.index_solr(
+ [{"bibcode": "abc"}, {"bibcode": "foo"}],
+ ["checksum1", "checksum2"],
+ ["http://solr1"],
+ )
with self.app.session_scope() as session:
- for x in ['abc', 'foo']:
+ for x in ["abc", "foo"]:
r = session.query(models.Records).filter_by(bibcode=x).first()
self.assertTrue(r.processed)
self.assertFalse(r.metrics_processed)
self.assertTrue(r.solr_processed)
-
+
# pretend group failure and then success when records sent individually
- with mock.patch('adsmp.solr_updater.update_solr') as us, \
- mock.patch.object(self.app, 'mark_processed') as mp:
+ with mock.patch("adsmp.solr_updater.update_solr") as us, mock.patch.object(
+ self.app, "mark_processed"
+ ) as mp:
us.side_effect = [[503], [200], [200]]
- self.app.index_solr([{'bibcode': 'abc'},
- {'bibcode': 'foo'}],
- ['checksum1', 'checksum2'],
- ['http://solr1'])
+ self.app.index_solr(
+ [{"bibcode": "abc"}, {"bibcode": "foo"}],
+ ["checksum1", "checksum2"],
+ ["http://solr1"],
+ )
# self.assertTrue(len(failed) == 0)
x = str(mp.call_args_list[0])
- self.assertTrue('abc' in x)
- self.assertTrue('success' in x)
- self.assertTrue('solr' in x)
+ self.assertTrue("abc" in x)
+ self.assertTrue("success" in x)
+ self.assertTrue("solr" in x)
self.assertEqual(us.call_count, 3)
x = str(mp.call_args_list[1])
- self.assertTrue('foo' in x)
- self.assertTrue('success' in x)
- self.assertTrue('solr' in x)
+ self.assertTrue("foo" in x)
+ self.assertTrue("success" in x)
+ self.assertTrue("solr" in x)
# pretend failure and success without body
# update_solr should try to send two records together and then
# each record by itself twice: once as is and once without fulltext
- with mock.patch('adsmp.solr_updater.update_solr') as us, \
- mock.patch.object(self.app, 'mark_processed') as mp:
- us.side_effect = [[503, 503], Exception('body failed'), 200, Exception('body failed'), 200]
- self.app.index_solr([{'bibcode': 'abc', 'body': 'BAD BODY'},
- {'bibcode': 'foo', 'body': 'BAD BODY'}],
- ['checksum1', 'checksum2'],
- ['http://solr1'])
+ with mock.patch("adsmp.solr_updater.update_solr") as us, mock.patch.object(
+ self.app, "mark_processed"
+ ) as mp:
+ us.side_effect = [
+ [503, 503],
+ Exception("body failed"),
+ 200,
+ Exception("body failed"),
+ 200,
+ ]
+ self.app.index_solr(
+ [
+ {"bibcode": "abc", "body": "BAD BODY"},
+ {"bibcode": "foo", "body": "BAD BODY"},
+ ],
+ ["checksum1", "checksum2"],
+ ["http://solr1"],
+ )
self.assertEqual(us.call_count, 5)
# self.assertTrue(len(failed) == 0)
self.assertEqual(mp.call_count, 2)
x = str(us.call_args_list[-2])
- self.assertTrue('http://solr1' in x)
- self.assertTrue('foo' in x)
- self.assertTrue('body' in x)
- self.assertTrue('BAD BODY' in x)
+ self.assertTrue("http://solr1" in x)
+ self.assertTrue("foo" in x)
+ self.assertTrue("body" in x)
+ self.assertTrue("BAD BODY" in x)
x = str(us.call_args_list[-1])
- self.assertTrue('http://solr1' in x)
- self.assertTrue('foo' in x)
+ self.assertTrue("http://solr1" in x)
+ self.assertTrue("foo" in x)
# pretend failure and then lots more failure
# update_solr should try to send two records together and then
# each record by itself twice: once as is and once without fulltext
- with mock.patch('adsmp.solr_updater.update_solr') as us:
- us.side_effect = [[503, 503],
- Exception('body failed'), Exception('body failed'),
- Exception('body failed'), Exception('body failed')]
- self.app.index_solr([{'bibcode': 'abc', 'body': 'bad body'},
- {'bibcode': 'foo', 'body': 'bad body'}],
- ['checksum1', 'checksum2'],
- ['http://solr1'])
+ with mock.patch("adsmp.solr_updater.update_solr") as us:
+ us.side_effect = [
+ [503, 503],
+ Exception("body failed"),
+ Exception("body failed"),
+ Exception("body failed"),
+ Exception("body failed"),
+ ]
+ self.app.index_solr(
+ [
+ {"bibcode": "abc", "body": "bad body"},
+ {"bibcode": "foo", "body": "bad body"},
+ ],
+ ["checksum1", "checksum2"],
+ ["http://solr1"],
+ )
self.assertEqual(us.call_count, 5)
# pretend failure and and then failure for a mix of reasons
- with mock.patch('adsmp.solr_updater.update_solr') as us:
- us.side_effect = [[503, 503], Exception('body failed'), Exception('failed'), Exception('failed')]
- self.app.index_solr([{'bibcode': 'abc', 'body': 'bad body'},
- {'bibcode': 'foo', 'body': 'good body'}],
- ['checksum1', 'checksum2'],
- ['http://solr1'])
+ with mock.patch("adsmp.solr_updater.update_solr") as us:
+ us.side_effect = [
+ [503, 503],
+ Exception("body failed"),
+ Exception("failed"),
+ Exception("failed"),
+ ]
+ self.app.index_solr(
+ [
+ {"bibcode": "abc", "body": "bad body"},
+ {"bibcode": "foo", "body": "good body"},
+ ],
+ ["checksum1", "checksum2"],
+ ["http://solr1"],
+ )
self.assertEqual(us.call_count, 4)
if sys.version_info > (3,):
call_dict = "{'bibcode': 'foo', 'body': 'good body'}"
else:
call_dict = "{'body': 'good body', 'bibcode': 'foo'}"
- self.assertEqual(str(us.call_args_list[-1]), "call([%s], ['http://solr1'], commit=False, ignore_errors=False)" % call_dict)
+ self.assertEqual(
+ str(us.call_args_list[-1]),
+ "call([%s], ['http://solr1'], commit=False, ignore_errors=False)"
+ % call_dict,
+ )
# pretend failure and and then a mix of failure and success
- with mock.patch('adsmp.solr_updater.update_solr') as us, \
- mock.patch.object(self.app, 'mark_processed') as mp:
- us.side_effect = [[503, 503], Exception('body failed'), [200]]
- self.app.index_solr([{'bibcode': 'abc', 'body': 'bad body'},
- {'bibcode': 'foo', 'body': 'good body'}],
- ['checksum1', 'checksum2'],
- ['http://solr1'])
+ with mock.patch("adsmp.solr_updater.update_solr") as us, mock.patch.object(
+ self.app, "mark_processed"
+ ) as mp:
+ us.side_effect = [[503, 503], Exception("body failed"), [200]]
+ self.app.index_solr(
+ [
+ {"bibcode": "abc", "body": "bad body"},
+ {"bibcode": "foo", "body": "good body"},
+ ],
+ ["checksum1", "checksum2"],
+ ["http://solr1"],
+ )
self.assertEqual(us.call_count, 4)
# self.assertTrue(len(failed) == 1)
self.assertEqual(us.call_count, 4)
self.assertEqual(mp.call_count, 2)
x = str(us.call_args_list[-1])
- self.assertTrue('foo' in x)
- self.assertTrue('good body' in x)
- self.assertTrue('http://solr1' in x)
+ self.assertTrue("foo" in x)
+ self.assertTrue("good body" in x)
+ self.assertTrue("http://solr1" in x)
def test_update_metrics(self):
- self.app.update_storage('abc', 'metrics', {
- 'author_num': 1,
- 'bibcode': 'abc',
- })
- self.app.update_storage('foo', 'metrics', {
- 'bibcode': 'foo',
- 'citation_num': 6,
- 'author_num': 3,
- })
-
- batch_metrics = [self.app.get_record('abc')['metrics'], self.app.get_record('foo')['metrics']]
- batch_checksum = ['checksum1', 'checksum2']
+ self.app.update_storage(
+ "abc",
+ "metrics",
+ {
+ "author_num": 1,
+ "bibcode": "abc",
+ },
+ )
+ self.app.update_storage(
+ "foo",
+ "metrics",
+ {
+ "bibcode": "foo",
+ "citation_num": 6,
+ "author_num": 3,
+ },
+ )
+
+ batch_metrics = [
+ self.app.get_record("abc")["metrics"],
+ self.app.get_record("foo")["metrics"],
+ ]
+ batch_checksum = ["checksum1", "checksum2"]
self.app.index_metrics(batch_metrics, batch_checksum)
-
- for x in ['abc', 'foo']:
+
+ for x in ["abc", "foo"]:
r = self.app.get_record(x)
- self.assertTrue(r['processed'])
- self.assertTrue(r['metrics_processed'])
- self.assertFalse(r['solr_processed'])
-
+ self.assertTrue(r["processed"])
+ self.assertTrue(r["metrics_processed"])
+ self.assertFalse(r["solr_processed"])
+
def test_delete_metrics(self):
"""Makes sure we can delete a metrics record by bibcode"""
- self.app.update_storage('abc', 'metrics', {
- 'author_num': 1,
- 'bibcode': 'abc',
- })
- r = self.app.get_record('abc')
- self.app.index_metrics([r], ['checksum'])
- m = self.app.get_metrics('abc')
- self.assertTrue(m, 'intialized metrics data')
- self.app.metrics_delete_by_bibcode('abc')
- m = self.app.get_metrics('abc')
- self.assertFalse(m, 'deleted metrics data')
-
+ self.app.update_storage(
+ "abc",
+ "metrics",
+ {
+ "author_num": 1,
+ "bibcode": "abc",
+ },
+ )
+ r = self.app.get_record("abc")
+ self.app.index_metrics([r], ["checksum"])
+ m = self.app.get_metrics("abc")
+ self.assertTrue(m, "intialized metrics data")
+ self.app.metrics_delete_by_bibcode("abc")
+ m = self.app.get_metrics("abc")
+ self.assertFalse(m, "deleted metrics data")
+
def test_update_records(self):
"""Makes sure we can write recs into the storage."""
now = adsputils.get_date()
last_time = adsputils.get_date()
- for k in ['bib_data', 'nonbib_data', 'orcid_claims']:
- self.app.update_storage('abc', k, {'foo': 'bar', 'hey': 1})
+ for k in ["bib_data", "nonbib_data", "orcid_claims"]:
+ self.app.update_storage(
+ "abc", k, {"foo": "bar", "hey": 1, "title": "Test record abc"}
+ )
with self.app.session_scope() as session:
- r = session.query(models.Records).filter_by(bibcode='abc').first()
+ r = session.query(models.Records).filter_by(bibcode="abc").first()
self.assertTrue(r.id == 1)
- self.assertTrue(r.scix_id == 'scix:0RW9-X19B-XHYY')
+ self.assertEqual(r.scix_id, "scix:50RZ-VNK5-03S7")
j = r.toJSON()
- self.assertEqual(j[k], {'foo': 'bar', 'hey': 1})
- t = j[k + '_updated']
+ self.assertEqual(
+ j[k], {"foo": "bar", "hey": 1, "title": "Test record abc"}
+ )
+ t = j[k + "_updated"]
self.assertTrue(now < t)
- self.assertTrue(last_time < j['updated'])
- last_time = j['updated']
-
- self.app.update_storage('abc', 'fulltext', {'body': 'foo bar'})
+ self.assertTrue(last_time < j["updated"])
+ last_time = j["updated"]
+
+ self.app.update_storage("abc", "fulltext", {"body": "foo bar"})
with self.app.session_scope() as session:
- r = session.query(models.Records).filter_by(bibcode='abc').first()
+ r = session.query(models.Records).filter_by(bibcode="abc").first()
self.assertTrue(r.id == 1)
- self.assertTrue(r.scix_id == 'scix:0RW9-X19B-XHYY')
+ self.assertEqual(r.scix_id, "scix:50RZ-VNK5-03S7")
j = r.toJSON()
- self.assertEqual(j['fulltext'], {'body': 'foo bar'})
- t = j['fulltext_updated']
+ self.assertEqual(j["fulltext"], {"body": "foo bar"})
+ t = j["fulltext_updated"]
self.assertTrue(now < t)
-
- r = self.app.get_record('abc')
- self.assertEqual(r['id'], 1)
- self.assertEqual(r['scix_id'],'scix:0RW9-X19B-XHYY')
- self.assertEqual(r['processed'], None)
-
- r = self.app.get_record(['abc'])
- self.assertEqual(r[0]['id'], 1)
- self.assertEqual(r[0]['scix_id'],'scix:0RW9-X19B-XHYY')
- self.assertEqual(r[0]['processed'], None)
-
- r = self.app.get_record('abc', load_only=['id'])
- self.assertEqual(r['id'], 1)
- self.assertFalse('processed' in r)
+
+ r = self.app.get_record("abc")
+ self.assertEqual(r["id"], 1)
+ self.assertEqual(r["scix_id"], "scix:50RZ-VNK5-03S7")
+ self.assertEqual(r["processed"], None)
+
+ r = self.app.get_record(["abc"])
+ self.assertEqual(r[0]["id"], 1)
+ self.assertEqual(r[0]["scix_id"], "scix:50RZ-VNK5-03S7")
+ self.assertEqual(r[0]["processed"], None)
+
+ r = self.app.get_record("abc", load_only=["id"])
+ self.assertEqual(r["id"], 1)
+ self.assertFalse("processed" in r)
with self.assertRaises(ValueError) as e:
- self.app.mark_processed(['abc'], 'foobar')
- self.assertTrue('foobar' in e.exception)
-
+ self.app.mark_processed(["abc"], "foobar")
+ self.assertTrue("foobar" in e.exception)
+
# now delete it
- self.app.delete_by_bibcode('abc')
- r = self.app.get_record('abc')
+ self.app.delete_by_bibcode("abc")
+ r = self.app.get_record("abc")
self.assertTrue(r is None)
with self.app.session_scope() as session:
-            r = session.query(models.ChangeLog).filter_by(key='bibcode:abc').first()
-            self.assertTrue(r.key, 'abc')
+            r = session.query(models.ChangeLog).filter_by(key="bibcode:abc").first()
+            self.assertEqual(r.key, "bibcode:abc")
def test_index_metrics_database_failure(self):
"""
- verify handles failure from database
- send one bibcode, verify there are two commits
+ verify handles failure from database
+ send one bibcode, verify there are two commits
"""
- self.app.update_storage('abc', 'metrics', {
- 'author_num': 1,
- 'bibcode': 'abc',
- })
+ self.app.update_storage(
+ "abc",
+ "metrics",
+ {
+ "author_num": 1,
+ "bibcode": "abc",
+ },
+ )
trans = mock.Mock()
- trans.commit.side_effect = SQLAlchemyError('test')
+ trans.commit.side_effect = SQLAlchemyError("test")
m = mock.Mock()
m.begin_nested.return_value = trans
m.__exit__ = mock.Mock()
@@ -290,559 +371,689 @@ def test_index_metrics_database_failure(self):
m.__enter__.return_value = mock.Mock()
m.__enter__.return_value.begin_nested.return_value = trans
# init database so timestamps and checksum can be updated
- with mock.patch('adsmp.app.ADSMasterPipelineCelery.metrics_session_scope', return_value=m) as p:
- metrics_payload = {'bibcode': 'abc', 'author_num': 1}
- checksum = 'checksum'
+ with mock.patch(
+ "adsmp.app.ADSMasterPipelineCelery.metrics_session_scope", return_value=m
+ ) as p:
+ metrics_payload = {"bibcode": "abc", "author_num": 1}
+ checksum = "checksum"
self.app.index_metrics([metrics_payload], [checksum])
self.assertEqual(trans.commit.call_count, 2)
def test_index_datalinks_success(self):
"""verify passed data sent to resolver service
- verify handles success from service
- verify records table updated with processed, status and checksum
+ verify handles success from service
+ verify records table updated with processed, status and checksum
"""
m = mock.Mock()
m.status_code = 200
# init database so timestamps and checksum can be updated
- nonbib_data = {'data_links_rows': [{'baz': 0}]}
- self.app.update_storage('linkstest', 'nonbib_data', nonbib_data)
- with mock.patch('requests.put', return_value=m) as p:
- datalinks_payload = {u'bibcode': u'linkstest', u'data_links_rows': [{u'baz': 0}]}
- checksum = 'thechecksum'
+ nonbib_data = {"data_links_rows": [{"baz": 0}]}
+ self.app.update_storage("linkstest", "nonbib_data", nonbib_data)
+ with mock.patch("requests.put", return_value=m) as p:
+ datalinks_payload = {
+ "bibcode": "linkstest",
+ "data_links_rows": [{"baz": 0}],
+ }
+ checksum = "thechecksum"
self.app.index_datalinks([datalinks_payload], [checksum])
- p.assert_called_with('http://localhost:8080/update',
- data=json.dumps([{'bibcode': 'linkstest', 'data_links_rows': [{'baz': 0}]}]),
- headers={'Authorization': 'Bearer fixme'})
+ p.assert_called_with(
+ "http://localhost:8080/update",
+ data=json.dumps(
+ [{"bibcode": "linkstest", "data_links_rows": [{"baz": 0}]}]
+ ),
+ headers={"Authorization": "Bearer fixme"},
+ )
self.assertEqual(p.call_count, 1)
# verify database updated
- rec = self.app.get_record(bibcode='linkstest')
- self.assertEqual(rec['datalinks_checksum'], 'thechecksum')
- self.assertEqual(rec['solr_checksum'], None)
- self.assertEqual(rec['metrics_checksum'], None)
- self.assertEqual(rec['status'], 'success')
- self.assertTrue(rec['datalinks_processed'])
+ rec = self.app.get_record(bibcode="linkstest")
+ self.assertEqual(rec["datalinks_checksum"], "thechecksum")
+ self.assertEqual(rec["solr_checksum"], None)
+ self.assertEqual(rec["metrics_checksum"], None)
+ self.assertEqual(rec["status"], "success")
+ self.assertTrue(rec["datalinks_processed"])
def test_index_datalinks_service_failure(self):
"""
- verify handles failure from service
+ verify handles failure from service
"""
m = mock.Mock()
m.status_code = 500
# init database so timestamps and checksum can be updated
- nonbib_data = {'data_links_rows': [{'baz': 0}]}
- self.app.update_storage('linkstest', 'nonbib_data', nonbib_data)
- with mock.patch('requests.put', return_value=m) as p:
- datalinks_payload = {u'bibcode': u'linkstest', u'data_links_rows': [{u'baz': 0}]}
- checksum = 'thechecksum'
+ nonbib_data = {"data_links_rows": [{"baz": 0}]}
+ self.app.update_storage("linkstest", "nonbib_data", nonbib_data)
+ with mock.patch("requests.put", return_value=m) as p:
+ datalinks_payload = {
+ "bibcode": "linkstest",
+ "data_links_rows": [{"baz": 0}],
+ }
+ checksum = "thechecksum"
self.app.index_datalinks([datalinks_payload], [checksum])
- p.assert_called_with('http://localhost:8080/update',
- data=json.dumps([{'bibcode': 'linkstest', 'data_links_rows': [{'baz': 0}]}]),
- headers={'Authorization': 'Bearer fixme'})
+ p.assert_called_with(
+ "http://localhost:8080/update",
+ data=json.dumps(
+ [{"bibcode": "linkstest", "data_links_rows": [{"baz": 0}]}]
+ ),
+ headers={"Authorization": "Bearer fixme"},
+ )
- rec = self.app.get_record(bibcode='linkstest')
+ rec = self.app.get_record(bibcode="linkstest")
self.assertEqual(p.call_count, 2)
- self.assertEqual(rec['datalinks_checksum'], None)
- self.assertEqual(rec['solr_checksum'], None)
- self.assertEqual(rec['metrics_checksum'], None)
- self.assertEqual(rec['status'], 'links-failed')
- self.assertTrue(rec['datalinks_processed'])
+ self.assertEqual(rec["datalinks_checksum"], None)
+ self.assertEqual(rec["solr_checksum"], None)
+ self.assertEqual(rec["metrics_checksum"], None)
+ self.assertEqual(rec["status"], "links-failed")
+ self.assertTrue(rec["datalinks_processed"])
def test_index_datalinks_service_only_batch_failure(self):
# init database so timestamps and checksum can be updated
- nonbib_data = {'data_links_rows': [{'baz': 0}]}
- self.app.update_storage('linkstest', 'nonbib_data', nonbib_data)
- with mock.patch('requests.put') as p:
+ nonbib_data = {"data_links_rows": [{"baz": 0}]}
+ self.app.update_storage("linkstest", "nonbib_data", nonbib_data)
+ with mock.patch("requests.put") as p:
bad = mock.Mock()
bad.status_code = 500
good = mock.Mock()
good.status_code = 200
p.side_effect = [bad, good]
- datalinks_payload = {u'bibcode': u'linkstest', u'data_links_rows': [{u'baz': 0}]}
- checksum = 'thechecksum'
+ datalinks_payload = {
+ "bibcode": "linkstest",
+ "data_links_rows": [{"baz": 0}],
+ }
+ checksum = "thechecksum"
self.app.index_datalinks([datalinks_payload], [checksum])
- p.assert_called_with('http://localhost:8080/update',
- data=json.dumps([{'bibcode': 'linkstest', 'data_links_rows': [{'baz': 0}]}]),
- headers={'Authorization': 'Bearer fixme'})
+ p.assert_called_with(
+ "http://localhost:8080/update",
+ data=json.dumps(
+ [{"bibcode": "linkstest", "data_links_rows": [{"baz": 0}]}]
+ ),
+ headers={"Authorization": "Bearer fixme"},
+ )
self.assertEqual(p.call_count, 2)
# verify database updated
- rec = self.app.get_record(bibcode='linkstest')
- self.assertEqual(rec['datalinks_checksum'], 'thechecksum')
- self.assertEqual(rec['solr_checksum'], None)
- self.assertEqual(rec['metrics_checksum'], None)
- self.assertEqual(rec['status'], 'success')
- self.assertTrue(rec['datalinks_processed'])
+ rec = self.app.get_record(bibcode="linkstest")
+ self.assertEqual(rec["datalinks_checksum"], "thechecksum")
+ self.assertEqual(rec["solr_checksum"], None)
+ self.assertEqual(rec["metrics_checksum"], None)
+ self.assertEqual(rec["status"], "success")
+ self.assertTrue(rec["datalinks_processed"])
def test_index_datalinks_update_processed_false(self):
m = mock.Mock()
m.status_code = 200
# init database so timestamps and checksum can be updated
- nonbib_data = {'data_links_rows': [{'baz': 0}]}
- self.app.update_storage('linkstest', 'nonbib_data', nonbib_data)
- with mock.patch('requests.put', return_value=m) as p:
- datalinks_payload = {u'bibcode': u'linkstest', u'data_links_rows': [{u'baz': 0}]}
- checksum = 'thechecksum'
- self.app.index_datalinks([datalinks_payload], [checksum], update_processed=False)
- p.assert_called_with('http://localhost:8080/update',
- data=json.dumps([{'bibcode': 'linkstest', 'data_links_rows': [{'baz': 0}]}]),
- headers={'Authorization': 'Bearer fixme'})
+ nonbib_data = {"data_links_rows": [{"baz": 0}]}
+ self.app.update_storage("linkstest", "nonbib_data", nonbib_data)
+ with mock.patch("requests.put", return_value=m) as p:
+ datalinks_payload = {
+ "bibcode": "linkstest",
+ "data_links_rows": [{"baz": 0}],
+ }
+ checksum = "thechecksum"
+ self.app.index_datalinks(
+ [datalinks_payload], [checksum], update_processed=False
+ )
+ p.assert_called_with(
+ "http://localhost:8080/update",
+ data=json.dumps(
+ [{"bibcode": "linkstest", "data_links_rows": [{"baz": 0}]}]
+ ),
+ headers={"Authorization": "Bearer fixme"},
+ )
# verify database updated
- rec = self.app.get_record(bibcode='linkstest')
- self.assertEqual(rec['datalinks_checksum'], None)
- self.assertEqual(rec['solr_checksum'], None)
- self.assertEqual(rec['metrics_checksum'], None)
- self.assertEqual(rec['status'], None)
- self.assertEqual(rec['datalinks_processed'], None)
+ rec = self.app.get_record(bibcode="linkstest")
+ self.assertEqual(rec["datalinks_checksum"], None)
+ self.assertEqual(rec["solr_checksum"], None)
+ self.assertEqual(rec["metrics_checksum"], None)
+ self.assertEqual(rec["status"], None)
+ self.assertEqual(rec["datalinks_processed"], None)
def test_update_records_db_error(self):
"""test database exception IntegrityError is caught"""
- with mock.patch('sqlalchemy.orm.session.Session.commit', side_effect=[IntegrityError('a', 'b', 'c', 'd'), None]):
- self.assertRaises(IntegrityError, self.app.update_storage, 'abc', 'nonbib_data', '{}')
-
+ with mock.patch(
+ "sqlalchemy.orm.session.Session.commit",
+ side_effect=[IntegrityError("a", "b", "c", "d"), None],
+ ):
+ self.assertRaises(
+ IntegrityError, self.app.update_storage, "abc", "nonbib_data", "{}"
+ )
+
def test_rename_bibcode(self):
- self.app.update_storage('abc', 'metadata', {'foo': 'bar', 'hey': 1})
- r = self.app.get_record('abc')
-
- self.app.rename_bibcode('abc', 'def')
-
+ self.app.update_storage("abc", "metadata", {"foo": "bar", "hey": 1})
+ r = self.app.get_record("abc")
+
+ self.app.rename_bibcode("abc", "def")
+
with self.app.session_scope() as session:
- ref = session.query(models.IdentifierMapping).filter_by(key='abc').first()
- self.assertTrue(ref.target, 'def')
-
- self.assertTrue(self.app.get_changelog('abc'), [{'target': u'def', 'key': u'abc'}])
+ ref = session.query(models.IdentifierMapping).filter_by(key="abc").first()
+ self.assertTrue(ref.target, "def")
+
+ self.assertTrue(
+ self.app.get_changelog("abc"), [{"target": "def", "key": "abc"}]
+ )
def test_generate_links_for_resolver(self):
- only_nonbib = {'bibcode': 'asdf',
- 'nonbib_data':
- {'data_links_rows': [{'url': ['http://arxiv.org/abs/1902.09522']}]}}
+ only_nonbib = {
+ "bibcode": "asdf",
+ "nonbib_data": {
+ "data_links_rows": [{"url": ["http://arxiv.org/abs/1902.09522"]}]
+ },
+ }
links = self.app.generate_links_for_resolver(only_nonbib)
- self.assertEqual(only_nonbib['bibcode'], links['bibcode'])
- self.assertEqual(only_nonbib['nonbib_data']['data_links_rows'], links['data_links_rows'])
+ self.assertEqual(only_nonbib["bibcode"], links["bibcode"])
+ self.assertEqual(
+ only_nonbib["nonbib_data"]["data_links_rows"], links["data_links_rows"]
+ )
- only_bib = {'bibcode': 'asdf',
- 'bib_data':
- {'links_data': ['{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://arxiv.org/abs/1902.09522"}']}}
+ only_bib = {
+ "bibcode": "asdf",
+ "bib_data": {
+ "links_data": [
+ '{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://arxiv.org/abs/1902.09522"}'
+ ]
+ },
+ }
links = self.app.generate_links_for_resolver(only_bib)
- self.assertEqual(only_bib['bibcode'], links['bibcode'])
- first = links['data_links_rows'][0]
- self.assertEqual('http://arxiv.org/abs/1902.09522', first['url'][0])
- self.assertEqual('ESOURCE', first['link_type'])
- self.assertEqual('EPRINT_HTML', first['link_sub_type'])
- self.assertEqual([''], first['title'])
- self.assertEqual(0, first['item_count'])
-
- bib_and_nonbib = {'bibcode': 'asdf',
- 'bib_data':
- {'links_data': ['{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://arxiv.org/abs/1902.09522zz"}']},
- 'nonbib_data':
- {'data_links_rows': [{'url': ['http://arxiv.org/abs/1902.09522']}]}}
+ self.assertEqual(only_bib["bibcode"], links["bibcode"])
+ first = links["data_links_rows"][0]
+ self.assertEqual("http://arxiv.org/abs/1902.09522", first["url"][0])
+ self.assertEqual("ESOURCE", first["link_type"])
+ self.assertEqual("EPRINT_HTML", first["link_sub_type"])
+ self.assertEqual([""], first["title"])
+ self.assertEqual(0, first["item_count"])
+
+ bib_and_nonbib = {
+ "bibcode": "asdf",
+ "bib_data": {
+ "links_data": [
+ '{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://arxiv.org/abs/1902.09522zz"}'
+ ]
+ },
+ "nonbib_data": {
+ "data_links_rows": [{"url": ["http://arxiv.org/abs/1902.09522"]}]
+ },
+ }
links = self.app.generate_links_for_resolver(bib_and_nonbib)
- self.assertEqual(only_nonbib['bibcode'], links['bibcode'])
- self.assertEqual(only_nonbib['nonbib_data']['data_links_rows'], links['data_links_rows'])
+ self.assertEqual(only_nonbib["bibcode"], links["bibcode"])
+ self.assertEqual(
+ only_nonbib["nonbib_data"]["data_links_rows"], links["data_links_rows"]
+ )
# string in database
- only_bib = {'bibcode': 'asdf',
- 'bib_data':
- {'links_data': [u'{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://arxiv.org/abs/1902.09522"}']}}
+ only_bib = {
+ "bibcode": "asdf",
+ "bib_data": {
+ "links_data": [
+ '{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://arxiv.org/abs/1902.09522"}'
+ ]
+ },
+ }
links = self.app.generate_links_for_resolver(only_bib)
- self.assertEqual(only_bib['bibcode'], links['bibcode'])
- first = links['data_links_rows'][0]
- self.assertEqual('http://arxiv.org/abs/1902.09522', first['url'][0])
- self.assertEqual('ESOURCE', first['link_type'])
- self.assertEqual('EPRINT_HTML', first['link_sub_type'])
-
+ self.assertEqual(only_bib["bibcode"], links["bibcode"])
+ first = links["data_links_rows"][0]
+ self.assertEqual("http://arxiv.org/abs/1902.09522", first["url"][0])
+ self.assertEqual("ESOURCE", first["link_type"])
+ self.assertEqual("EPRINT_HTML", first["link_sub_type"])
+
# bad string in database
- with mock.patch.object(self.app.logger, 'error') as m:
- only_bib = {'bibcode': 'testbib',
- 'bib_data':
- {'links_data': u'foobar[!)'}}
+ with mock.patch.object(self.app.logger, "error") as m:
+ only_bib = {"bibcode": "testbib", "bib_data": {"links_data": "foobar[!)"}}
links = self.app.generate_links_for_resolver(only_bib)
self.assertEqual(None, links)
self.assertEqual(1, m.call_count)
m_args = m.call_args_list
- self.assertTrue('testbib' in str(m_args[0]))
- self.assertTrue('foobar' in str(m_args[0]))
+ self.assertTrue("testbib" in str(m_args[0]))
+ self.assertTrue("foobar" in str(m_args[0]))
def test_should_include_in_sitemap_comprehensive(self):
"""Test all code paths and scenarios in should_include_in_sitemap function"""
-
+
base_time = adsputils.get_date()
-
-
+
# Test 1: Record with no bib_data (should be excluded)
record_no_data = {
- 'bibcode': '2023NoData..1..1A',
- 'bib_data': None,
- 'status': 'success'
+ "bibcode": "2023NoData..1..1A",
+ "has_bib_data": False,
+ "status": "success",
}
- self.assertFalse(self.app.should_include_in_sitemap(record_no_data),
- "Record without bib_data should be excluded")
-
+ self.assertFalse(
+ self.app.should_include_in_sitemap(record_no_data),
+ "Record without bib_data should be excluded",
+ )
+
# Test 2: Record with empty bib_data string (should be excluded)
record_empty_data = {
- 'bibcode': '2023Empty..1..1A',
- 'bib_data': '',
- 'status': 'success'
+ "bibcode": "2023Empty..1..1A",
+ "has_bib_data": False,
+ "status": "success",
}
- self.assertFalse(self.app.should_include_in_sitemap(record_empty_data),
- "Record with empty bib_data should be excluded")
-
+ self.assertFalse(
+ self.app.should_include_in_sitemap(record_empty_data),
+ "Record with empty bib_data should be excluded",
+ )
+
# Test 3: Record with solr-failed status (should be excluded)
record_solr_failed = {
- 'bibcode': '2023Failed..1..1A',
- 'bib_data': '{"title": "Test"}',
- 'status': 'solr-failed'
+ "bibcode": "2023Failed..1..1A",
+ "has_bib_data": True,
+ "status": "solr-failed",
}
- self.assertFalse(self.app.should_include_in_sitemap(record_solr_failed),
- "Record with solr-failed status should be excluded")
-
+ self.assertFalse(
+ self.app.should_include_in_sitemap(record_solr_failed),
+ "Record with solr-failed status should be excluded",
+ )
+
# Test 4: Record with retrying status (should be excluded)
record_retrying = {
- 'bibcode': '2023Retrying..1..1A',
- 'bib_data': {'title': 'Test'},
- 'status': 'retrying'
+ "bibcode": "2023Retrying..1..1A",
+ "has_bib_data": True,
+ "status": "retrying",
}
- self.assertFalse(self.app.should_include_in_sitemap(record_retrying),
- "Record with retrying status should be excluded")
-
+ self.assertFalse(
+ self.app.should_include_in_sitemap(record_retrying),
+ "Record with retrying status should be excluded",
+ )
+
# Test 5: Record with None status (should be included)
record_none_status = {
- 'bibcode': '2023NoneStatus..1..1A',
- 'bib_data': {'title': 'Test'},
- 'status': None
+ "bibcode": "2023NoneStatus..1..1A",
+ "has_bib_data": True,
+ "status": None,
}
- self.assertTrue(self.app.should_include_in_sitemap(record_none_status),
- "Record with None status should be included")
-
+ self.assertTrue(
+ self.app.should_include_in_sitemap(record_none_status),
+ "Record with None status should be included",
+ )
+
# Test 6: Record with success status (should be included)
record_success = {
- 'bibcode': '2023Success..1..1A',
- 'bib_data': '{"title": "Test"}',
- 'status': 'success',
- 'bib_data_updated': base_time - timedelta(days=1)
+ "bibcode": "2023Success..1..1A",
+ "has_bib_data": True,
+ "status": "success",
+ "bib_data_updated": base_time - timedelta(days=1),
}
- self.assertTrue(self.app.should_include_in_sitemap(record_success),
- "Record with success status should be included")
-
+ self.assertTrue(
+ self.app.should_include_in_sitemap(record_success),
+ "Record with success status should be included",
+ )
+
# Test 7: Record with metrics-failed status (should be included - not SOLR-related)
record_metrics_failed = {
- 'bibcode': '2023MetricsFailed..1..1A',
- 'bib_data': {'title': 'Test'},
- 'status': 'metrics-failed'
+ "bibcode": "2023MetricsFailed..1..1A",
+ "has_bib_data": True,
+ "status": "metrics-failed",
}
- self.assertTrue(self.app.should_include_in_sitemap(record_metrics_failed),
- "Record with metrics-failed status should be included (not SOLR-related)")
-
+ self.assertTrue(
+ self.app.should_include_in_sitemap(record_metrics_failed),
+ "Record with metrics-failed status should be included (not SOLR-related)",
+ )
+
# Test 8: Record with links-failed status (should be included - not SOLR-related)
record_links_failed = {
- 'bibcode': '2023LinksFailed..1..1A',
- 'bib_data': {'title': 'Test'},
- 'status': 'links-failed'
+ "bibcode": "2023LinksFailed..1..1A",
+ "has_bib_data": True,
+ "status": "links-failed",
}
- self.assertTrue(self.app.should_include_in_sitemap(record_links_failed),
- "Record with links-failed status should be included (not SOLR-related)")
-
+ self.assertTrue(
+ self.app.should_include_in_sitemap(record_links_failed),
+ "Record with links-failed status should be included (not SOLR-related)",
+ )
+
# Test 9: Record with None status and no solr_processed (should be included - not yet processed)
record_not_processed = {
- 'bibcode': '2023NotProcessed..1..1A',
- 'bib_data': {'title': 'Test'},
- 'status': None,
- 'solr_processed': None
+ "bibcode": "2023NotProcessed..1..1A",
+ "has_bib_data": True,
+ "status": None,
+ "solr_processed": None,
}
- self.assertTrue(self.app.should_include_in_sitemap(record_not_processed),
- "Record not yet processed by SOLR should be included")
-
+ self.assertTrue(
+ self.app.should_include_in_sitemap(record_not_processed),
+ "Record not yet processed by SOLR should be included",
+ )
+
# Test 10: Record with recent solr_processed (should be included)
record_recent_solr = {
- 'bibcode': '2023Recent..1..1A',
- 'bib_data': {'title': 'Test'},
- 'status': 'success',
- 'bib_data_updated': base_time - timedelta(days=1),
- 'solr_processed': base_time # More recent than bib_data_updated
+ "bibcode": "2023Recent..1..1A",
+ "has_bib_data": True,
+ "status": "success",
+ "bib_data_updated": base_time - timedelta(days=1),
+ "solr_processed": base_time, # More recent than bib_data_updated
}
- self.assertTrue(self.app.should_include_in_sitemap(record_recent_solr),
- "Record with recent SOLR processing should be included")
-
+ self.assertTrue(
+ self.app.should_include_in_sitemap(record_recent_solr),
+ "Record with recent SOLR processing should be included",
+ )
+
# Test 11: Record with stale solr_processed (should be included with warning)
record_stale_solr = {
- 'bibcode': '2023Stale..1..1A',
- 'bib_data': {'title': 'Test'},
- 'status': 'success',
- 'bib_data_updated': base_time,
- 'solr_processed': base_time - timedelta(days=6) # 6 days stale (> 5 day threshold)
+ "bibcode": "2023Stale..1..1A",
+ "has_bib_data": True,
+ "status": "success",
+ "bib_data_updated": base_time,
+ "solr_processed": base_time
+ - timedelta(days=6), # 6 days stale (> 5 day threshold)
}
- self.assertTrue(self.app.should_include_in_sitemap(record_stale_solr),
- "Record with stale SOLR processing should still be included (with warning)")
-
+ self.assertTrue(
+ self.app.should_include_in_sitemap(record_stale_solr),
+ "Record with stale SOLR processing should still be included (with warning)",
+ )
+
# Test 12: Record with exactly 5+ days staleness (boundary condition)
record_boundary = {
- 'bibcode': '2023Boundary..1..1A',
- 'bib_data': {'title': 'Test'},
- 'status': 'success',
- 'bib_data_updated': base_time,
- 'solr_processed': base_time - timedelta(days=5, seconds=1) # Just over 5 days
+ "bibcode": "2023Boundary..1..1A",
+ "has_bib_data": True,
+ "status": "success",
+ "bib_data_updated": base_time,
+ "solr_processed": base_time
+ - timedelta(days=5, seconds=1), # Just over 5 days
}
- self.assertTrue(self.app.should_include_in_sitemap(record_boundary),
- "Record with exactly 5+ days staleness should be included with warning")
-
+ self.assertTrue(
+ self.app.should_include_in_sitemap(record_boundary),
+ "Record with exactly 5+ days staleness should be included with warning",
+ )
+
# Test 13: Record with no timestamps (should be included)
record_no_timestamps = {
- 'bibcode': '2023NoTimestamps..1..1A',
- 'bib_data': {'title': 'Test'},
- 'status': 'success',
- 'bib_data_updated': None,
- 'solr_processed': None
+ "bibcode": "2023NoTimestamps..1..1A",
+ "has_bib_data": True,
+ "status": "success",
+ "bib_data_updated": None,
+ "solr_processed": None,
}
- self.assertTrue(self.app.should_include_in_sitemap(record_no_timestamps),
- "Record with no timestamps should be included")
-
+ self.assertTrue(
+ self.app.should_include_in_sitemap(record_no_timestamps),
+ "Record with no timestamps should be included",
+ )
+
# Test 14: Record with bib_data_updated but no solr_processed (should be included)
record_no_solr_time = {
- 'bibcode': '2023NoSolrTime..1..1A',
- 'bib_data': {'title': 'Test'},
- 'status': 'success',
- 'bib_data_updated': base_time,
- 'solr_processed': None
+ "bibcode": "2023NoSolrTime..1..1A",
+ "has_bib_data": True,
+ "status": "success",
+ "bib_data_updated": base_time,
+ "solr_processed": None,
}
- self.assertTrue(self.app.should_include_in_sitemap(record_no_solr_time),
- "Record with bib_data_updated but no solr_processed should be included")
-
+ self.assertTrue(
+ self.app.should_include_in_sitemap(record_no_solr_time),
+ "Record with bib_data_updated but no solr_processed should be included",
+ )
+
# Test 15: Record with solr_processed but no bib_data_updated (should be included)
record_no_bib_time = {
- 'bibcode': '2023NoBibTime..1..1A',
- 'bib_data': {'title': 'Test'},
- 'status': 'success',
- 'bib_data_updated': None,
- 'solr_processed': base_time
+ "bibcode": "2023NoBibTime..1..1A",
+ "has_bib_data": True,
+ "status": "success",
+ "bib_data_updated": None,
+ "solr_processed": base_time,
}
- self.assertTrue(self.app.should_include_in_sitemap(record_no_bib_time),
- "Record with solr_processed but no bib_data_updated should be included")
-
+ self.assertTrue(
+ self.app.should_include_in_sitemap(record_no_bib_time),
+ "Record with solr_processed but no bib_data_updated should be included",
+ )
+
# Test 16: Record with very fresh processing (should be included)
record_fresh = {
- 'bibcode': '2023Fresh..1..1A',
- 'bib_data': {'title': 'Test'},
- 'status': 'success',
- 'bib_data_updated': base_time - timedelta(minutes=30),
- 'solr_processed': base_time
+ "bibcode": "2023Fresh..1..1A",
+ "has_bib_data": True,
+ "status": "success",
+ "bib_data_updated": base_time - timedelta(minutes=30),
+ "solr_processed": base_time,
}
- self.assertTrue(self.app.should_include_in_sitemap(record_fresh),
- "Record with very fresh processing should be included")
-
+ self.assertTrue(
+ self.app.should_include_in_sitemap(record_fresh),
+ "Record with very fresh processing should be included",
+ )
+
# Test 17: Record with moderate lag (2 days, should be included without warning)
record_moderate_lag = {
- 'bibcode': '2023Moderate..1..1A',
- 'bib_data': {'title': 'Test'},
- 'status': 'success',
- 'bib_data_updated': base_time - timedelta(days=2),
- 'solr_processed': base_time
+ "bibcode": "2023Moderate..1..1A",
+ "has_bib_data": True,
+ "status": "success",
+ "bib_data_updated": base_time - timedelta(days=2),
+ "solr_processed": base_time,
}
- self.assertTrue(self.app.should_include_in_sitemap(record_moderate_lag),
- "Record with moderate processing lag should be included")
+ self.assertTrue(
+ self.app.should_include_in_sitemap(record_moderate_lag),
+ "Record with moderate processing lag should be included",
+ )
def test_get_records_bulk_performance(self):
"""Test get_records_bulk with a considerable number of records"""
-
+
# Create 1000 test records
test_bibcodes = []
-
+
for i in range(1000):
- bibcode = f'2023Bulk..{i:04d}..{i:04d}A'
+ bibcode = f"2023Bulk..{i:04d}..{i:04d}A"
test_bibcodes.append(bibcode)
-
+
# Simple test data
- bib_data = {
- 'title': f'Test Paper {i}',
- 'year': 2023
- }
-
+ bib_data = {"title": f"Test Paper {i}", "year": 2023}
+
# Store record in database
- self.app.update_storage(bibcode, 'bib_data', bib_data)
-
+ self.app.update_storage(bibcode, "bib_data", bib_data)
+
# Test 1: Get all records with default fields
with self.app.session_scope() as session:
start_time = adsputils.get_date()
-
+
result = self.app.get_records_bulk(test_bibcodes, session)
-
+
end_time = adsputils.get_date()
query_time = (end_time - start_time).total_seconds()
-
+
# Performance assertion - should complete within reasonable time
- self.assertLess(query_time, 10.0, f"Bulk query took {query_time:.2f}s, should be under 10s")
-
+ self.assertLess(
+ query_time,
+ 10.0,
+ f"Bulk query took {query_time:.2f}s, should be under 10s",
+ )
+
# Verify all records returned
self.assertEqual(len(result), 1000, "Should return all 1000 records")
-
+
# Verify basic structure
for bibcode in test_bibcodes[:5]: # Check first 5 records
self.assertIn(bibcode, result, f"Should contain record {bibcode}")
record = result[bibcode]
-
+
# Check required fields are present
- self.assertIn('id', record, "Should contain id field")
- self.assertIn('bibcode', record, "Should contain bibcode field")
- self.assertIn('bib_data', record, "Should contain bib_data field")
-
+ self.assertIn("id", record, "Should contain id field")
+ self.assertIn("bibcode", record, "Should contain bibcode field")
+ self.assertIn("bib_data", record, "Should contain bib_data field")
+
# Verify bibcode matches
- self.assertEqual(record['bibcode'], bibcode, "Bibcode should match")
-
- print(f" get_records_bulk performance: 1000 records retrieved in {query_time:.2f}s")
-
+ self.assertEqual(record["bibcode"], bibcode, "Bibcode should match")
+
+ print(
+ f" get_records_bulk performance: 1000 records retrieved in {query_time:.2f}s"
+ )
+
# Test 2: Test load_only functionality
with self.app.session_scope() as session:
result_limited = self.app.get_records_bulk(
- test_bibcodes[:10],
- session,
- load_only=['bibcode', 'bib_data_updated']
+ test_bibcodes[:10], session, load_only=["bibcode", "bib_data_updated"]
)
-
+
# Verify correct fields returned
for bibcode in test_bibcodes[:5]:
record = result_limited[bibcode]
-
+
# Should have requested fields
- self.assertIn('bibcode', record, "Should contain bibcode field")
- self.assertIn('bib_data_updated', record, "Should contain bib_data_updated field")
-
+ self.assertIn("bibcode", record, "Should contain bibcode field")
+ self.assertIn(
+ "bib_data_updated", record, "Should contain bib_data_updated field"
+ )
+
# Should not have other fields (they should be None)
- self.assertIsNone(record.get('bib_data'), "bib_data should be None when not requested")
-
+ self.assertIsNone(
+ record.get("bib_data"), "bib_data should be None when not requested"
+ )
+
# Test 3: Empty bibcode list
with self.app.session_scope() as session:
empty_result = self.app.get_records_bulk([], session)
- self.assertEqual(empty_result, {}, "Empty bibcode list should return empty dict")
-
+ self.assertEqual(
+ empty_result, {}, "Empty bibcode list should return empty dict"
+ )
+
# Test 4: Non-existent bibcodes
- fake_bibcodes = ['2023Fake..1..1A', '2023Fake..1..2B']
+ fake_bibcodes = ["2023Fake..1..1A", "2023Fake..1..2B"]
with self.app.session_scope() as session:
fake_result = self.app.get_records_bulk(fake_bibcodes, session)
- self.assertEqual(fake_result, {}, "Non-existent bibcodes should return empty dict")
+ self.assertEqual(
+ fake_result, {}, "Non-existent bibcodes should return empty dict"
+ )
def test_get_sitemap_info_bulk_performance(self):
"""Test get_sitemap_info_bulk with a considerable number of sitemaps"""
-
+
# Create 1000 test records and sitemap entries
test_bibcodes = []
-
+
for i in range(1000):
- bibcode = f'2023Sitemap..{i:04d}..{i:04d}A'
+ bibcode = f"2023Sitemap..{i:04d}..{i:04d}A"
test_bibcodes.append(bibcode)
-
+
# Simple test data
- bib_data = {
- 'title': f'Test Sitemap Paper {i}',
- 'year': 2023
- }
-
+ bib_data = {"title": f"Test Sitemap Paper {i}", "year": 2023}
+
# Store record in database
- self.app.update_storage(bibcode, 'bib_data', bib_data)
-
+ self.app.update_storage(bibcode, "bib_data", bib_data)
+
# Create sitemap entries for these records
with self.app.session_scope() as session:
# Get record IDs
- records = session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all()
+ records = (
+ session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all()
+ )
record_map = {r.bibcode: r.id for r in records}
-
+
# Create sitemap info entries
for i, bibcode in enumerate(test_bibcodes):
sitemap_info = SitemapInfo(
record_id=record_map[bibcode],
bibcode=bibcode,
- sitemap_filename=f'sitemap_bib_{(i // 50) + 1}.xml', # 50 records per file
+ sitemap_filename=f"sitemap_bib_{(i // 50) + 1}.xml", # 50 records per file
filename_lastmoddate=adsputils.get_date(),
- update_flag=False
+ update_flag=False,
)
session.add(sitemap_info)
session.commit()
-
+
# Test 1: Get all sitemap infos with performance timing
with self.app.session_scope() as session:
start_time = adsputils.get_date()
-
+
result = self.app.get_sitemap_info_bulk(test_bibcodes, session)
-
+
end_time = adsputils.get_date()
query_time = (end_time - start_time).total_seconds()
-
+
# Performance assertion - should complete within reasonable time
- self.assertLess(query_time, 10.0, f"Bulk sitemap query took {query_time:.2f}s, should be under 10s")
-
+ self.assertLess(
+ query_time,
+ 10.0,
+ f"Bulk sitemap query took {query_time:.2f}s, should be under 10s",
+ )
+
# Verify all sitemap infos returned
self.assertEqual(len(result), 1000, "Should return all 1000 sitemap infos")
-
+
# Verify basic structure
for bibcode in test_bibcodes[:5]: # Check first 5 records
- self.assertIn(bibcode, result, f"Should contain sitemap info for {bibcode}")
+ self.assertIn(
+ bibcode, result, f"Should contain sitemap info for {bibcode}"
+ )
sitemap_data = result[bibcode]
-
+
# Check required fields are present (toJSON() format)
- self.assertIn('bibcode', sitemap_data, "Should contain bibcode field")
- self.assertIn('sitemap_filename', sitemap_data, "Should contain sitemap_filename field")
- self.assertIn('update_flag', sitemap_data, "Should contain update_flag field")
-
+ self.assertIn("bibcode", sitemap_data, "Should contain bibcode field")
+ self.assertIn(
+ "sitemap_filename",
+ sitemap_data,
+ "Should contain sitemap_filename field",
+ )
+ self.assertIn(
+ "update_flag", sitemap_data, "Should contain update_flag field"
+ )
+
# Verify bibcode matches
- self.assertEqual(sitemap_data['bibcode'], bibcode, "Bibcode should match")
-
+ self.assertEqual(
+ sitemap_data["bibcode"], bibcode, "Bibcode should match"
+ )
+
# Verify filename format
- self.assertTrue(sitemap_data['sitemap_filename'].startswith('sitemap_bib_'),
- "Filename should have correct format")
-
- print(f"get_sitemap_info_bulk performance: 1000 sitemap infos retrieved in {query_time:.2f}s")
-
+ self.assertTrue(
+ sitemap_data["sitemap_filename"].startswith("sitemap_bib_"),
+ "Filename should have correct format",
+ )
+
+ print(
+ f"get_sitemap_info_bulk performance: 1000 sitemap infos retrieved in {query_time:.2f}s"
+ )
+
# Test 2: Empty bibcode list
with self.app.session_scope() as session:
empty_result = self.app.get_sitemap_info_bulk([], session)
- self.assertEqual(empty_result, {}, "Empty bibcode list should return empty dict")
-
+ self.assertEqual(
+ empty_result, {}, "Empty bibcode list should return empty dict"
+ )
+
# Test 3: Non-existent bibcodes
- fake_bibcodes = ['2023FakeSitemap..1..1A', '2023FakeSitemap..1..2B']
+ fake_bibcodes = ["2023FakeSitemap..1..1A", "2023FakeSitemap..1..2B"]
with self.app.session_scope() as session:
fake_result = self.app.get_sitemap_info_bulk(fake_bibcodes, session)
- self.assertEqual(fake_result, {}, "Non-existent bibcodes should return empty dict")
+ self.assertEqual(
+ fake_result, {}, "Non-existent bibcodes should return empty dict"
+ )
def test_get_current_sitemap_state_performance(self):
"""Test get_current_sitemap_state with multiple sitemaps and records"""
-
+
# Create test records across multiple sitemap files
test_bibcodes = []
-
- for i in range(500):
- bibcode = f'2023State..{i:04d}..{i:04d}A'
+
+ for i in range(500):
+ bibcode = f"2023State..{i:04d}..{i:04d}A"
test_bibcodes.append(bibcode)
-
+
# Create highly unique bib_data to ensure different scix_ids
-
+
bib_data = {
- 'title': f'Test State Paper {i} - Unique Content {i*17} - {bibcode}',
- 'year': 2023 + (i % 10), # Vary the year
- 'bibcode': bibcode, # Include bibcode for uniqueness
- 'abstract': f'This is a unique abstract for paper {i} with specific content {i*23} and bibcode {bibcode}',
- 'authors': [f'Author{i}_{bibcode}', f'CoAuthor{i*2}_{bibcode}'],
- 'unique_field': f'unique_value_{i}_{i*37}_{bibcode}_{int(time.time()*1000000) % 1000000}',
- 'doi': f'10.1000/test.{i}.{i*41}',
- 'page': f'{i*100}-{i*100+10}',
- 'volume': str(i % 100 + 1),
- 'issue': str(i % 12 + 1)
+ "title": f"Test State Paper {i} - Unique Content {i*17} - {bibcode}",
+ "year": 2023 + (i % 10), # Vary the year
+ "bibcode": bibcode, # Include bibcode for uniqueness
+ "abstract": f"This is a unique abstract for paper {i} with specific content {i*23} and bibcode {bibcode}",
+ "authors": [f"Author{i}_{bibcode}", f"CoAuthor{i*2}_{bibcode}"],
+ "unique_field": f"unique_value_{i}_{i*37}_{bibcode}_{int(time.time()*1000000) % 1000000}",
+ "doi": f"10.1000/test.{i}.{i*41}",
+ "page": f"{i*100}-{i*100+10}",
+ "volume": str(i % 100 + 1),
+ "issue": str(i % 12 + 1),
}
-
+
# Store record in database
- self.app.update_storage(bibcode, 'bib_data', bib_data)
-
+ self.app.update_storage(bibcode, "bib_data", bib_data)
+
# Test Scenario 1: Last file has EQUAL records (100 each)
with self.app.session_scope() as session:
# Get record IDs
- records = session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all()
+ records = (
+ session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all()
+ )
record_map = {r.bibcode: r.id for r in records}
-
+
# Create sitemap info entries - all files have 100 records each
sitemap_distributions_equal = [
- ('sitemap_bib_1.xml', 100), # 100 records
- ('sitemap_bib_2.xml', 100), # 100 records
- ('sitemap_bib_3.xml', 100), # 100 records
- ('sitemap_bib_4.xml', 100), # 100 records
- ('sitemap_bib_5.xml', 100), # 100 records (equal - should be returned as highest)
+ ("sitemap_bib_1.xml", 100), # 100 records
+ ("sitemap_bib_2.xml", 100), # 100 records
+ ("sitemap_bib_3.xml", 100), # 100 records
+ ("sitemap_bib_4.xml", 100), # 100 records
+ (
+ "sitemap_bib_5.xml",
+ 100,
+ ), # 100 records (equal - should be returned as highest)
]
-
+
bibcode_index = 0
for filename, record_count in sitemap_distributions_equal:
for _ in range(record_count):
@@ -853,49 +1064,65 @@ def test_get_current_sitemap_state_performance(self):
bibcode=bibcode,
sitemap_filename=filename,
filename_lastmoddate=adsputils.get_date(),
- update_flag=False
+ update_flag=False,
)
session.add(sitemap_info)
bibcode_index += 1
session.commit()
-
+
# Test 1: Get current sitemap state with performance timing (EQUAL scenario)
with self.app.session_scope() as session:
start_time = adsputils.get_date()
-
+
result = self.app.get_current_sitemap_state(session)
-
+
end_time = adsputils.get_date()
query_time = (end_time - start_time).total_seconds()
-
+
# Performance assertion - should complete quickly
- self.assertLess(query_time, 2.0, f"Sitemap state query took {query_time:.3f}s, should be under 2s")
-
+ self.assertLess(
+ query_time,
+ 2.0,
+ f"Sitemap state query took {query_time:.3f}s, should be under 2s",
+ )
+
# Verify it returns the latest filename (highest index) when all have equal records
- self.assertEqual(result['filename'], 'sitemap_bib_5.xml',
- "Should return the highest numbered sitemap file when all have equal records")
- self.assertEqual(result['count'], 100,
- "Should return 100 records for the latest file (equal scenario)")
- self.assertEqual(result['index'], 5,
- "Should return index 5 for the latest file")
-
- print(f"get_current_sitemap_state performance (EQUAL): query completed in {query_time:.3f}s")
-
+ self.assertEqual(
+ result["filename"],
+ "sitemap_bib_5.xml",
+ "Should return the highest numbered sitemap file when all have equal records",
+ )
+ self.assertEqual(
+ result["count"],
+ 100,
+ "Should return 100 records for the latest file (equal scenario)",
+ )
+ self.assertEqual(
+ result["index"], 5, "Should return index 5 for the latest file"
+ )
+
+ print(
+ f"get_current_sitemap_state performance (EQUAL): query completed in {query_time:.3f}s"
+ )
+
# Test Scenario 2: Last file has FEWER records (100, 100, 100, 100, 80)
with self.app.session_scope() as session:
# Clear existing sitemap info
session.query(SitemapInfo).delete(synchronize_session=False)
session.commit()
-
+
# Create new distribution where last file has fewer records
sitemap_distributions_fewer = [
- ('sitemap_bib_1.xml', 100), # 100 records
- ('sitemap_bib_2.xml', 100), # 100 records
- ('sitemap_bib_3.xml', 100), # 100 records
- ('sitemap_bib_4.xml', 100), # 100 records
- ('sitemap_bib_5.xml', 80), # 80 records (fewer - should still be returned as highest)
+ ("sitemap_bib_1.xml", 100), # 100 records
+ ("sitemap_bib_2.xml", 100), # 100 records
+ ("sitemap_bib_3.xml", 100), # 100 records
+ ("sitemap_bib_4.xml", 100), # 100 records
+ (
+ "sitemap_bib_5.xml",
+ 80,
+ ), # 80 records (fewer - should still be returned as highest)
]
-
+
bibcode_index = 0
for filename, record_count in sitemap_distributions_fewer:
for _ in range(record_count):
@@ -906,62 +1133,83 @@ def test_get_current_sitemap_state_performance(self):
bibcode=bibcode,
sitemap_filename=filename,
filename_lastmoddate=adsputils.get_date(),
- update_flag=False
+ update_flag=False,
)
session.add(sitemap_info)
bibcode_index += 1
session.commit()
-
+
# Test with fewer records in last file
start_time = adsputils.get_date()
-
+
result = self.app.get_current_sitemap_state(session)
-
+
end_time = adsputils.get_date()
query_time_fewer = (end_time - start_time).total_seconds()
-
+
# Performance assertion
- self.assertLess(query_time_fewer, 2.0, f"Sitemap state query took {query_time_fewer:.3f}s, should be under 2s")
-
+ self.assertLess(
+ query_time_fewer,
+ 2.0,
+ f"Sitemap state query took {query_time_fewer:.3f}s, should be under 2s",
+ )
+
# Verify it still returns the latest filename even with fewer records
- self.assertEqual(result['filename'], 'sitemap_bib_5.xml',
- "Should return the highest numbered sitemap file even when it has fewer records")
- self.assertEqual(result['count'], 80,
- "Should return 80 records for the latest file (fewer scenario)")
- self.assertEqual(result['index'], 5,
- "Should return index 5 for the latest file")
-
+ self.assertEqual(
+ result["filename"],
+ "sitemap_bib_5.xml",
+ "Should return the highest numbered sitemap file even when it has fewer records",
+ )
+ self.assertEqual(
+ result["count"],
+ 80,
+ "Should return 80 records for the latest file (fewer scenario)",
+ )
+ self.assertEqual(
+ result["index"], 5, "Should return index 5 for the latest file"
+ )
+
# Test 3: Verify state reflects the actual database content (using fewer scenario data)
with self.app.session_scope() as session:
# Verify the count matches actual database records
- actual_count = session.query(SitemapInfo).filter(
- SitemapInfo.sitemap_filename == 'sitemap_bib_5.xml'
- ).count()
-
+ actual_count = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.sitemap_filename == "sitemap_bib_5.xml")
+ .count()
+ )
+
result = self.app.get_current_sitemap_state(session)
- self.assertEqual(result['count'], actual_count,
- "State count should match actual database count")
- self.assertEqual(result['count'], 80,
- "Should reflect the fewer records scenario (80 records)")
-
+ self.assertEqual(
+ result["count"],
+ actual_count,
+ "State count should match actual database count",
+ )
+ self.assertEqual(
+ result["count"],
+ 80,
+ "Should reflect the fewer records scenario (80 records)",
+ )
+
# Test 4: Test with files that have None filenames (should be filtered out)
with self.app.session_scope() as session:
# Add some records with None filenames
- none_bibcodes = ['2023None..1..1A', '2023None..2..2A']
+ none_bibcodes = ["2023None..1..1A", "2023None..2..2A"]
for i, bibcode in enumerate(none_bibcodes):
bib_data = {
- 'title': f'Test None {i} - {bibcode}',
- 'year': 2024 + i,
- 'bibcode': bibcode,
- 'unique_field': f'none_test_{i}_{bibcode}_{int(time.time()*1000000) % 1000000}',
- 'abstract': f'Unique abstract for none test {i} with bibcode {bibcode}',
- 'authors': [f'NoneAuthor{i}_{bibcode}']
+ "title": f"Test None {i} - {bibcode}",
+ "year": 2024 + i,
+ "bibcode": bibcode,
+ "unique_field": f"none_test_{i}_{bibcode}_{int(time.time()*1000000) % 1000000}",
+ "abstract": f"Unique abstract for none test {i} with bibcode {bibcode}",
+ "authors": [f"NoneAuthor{i}_{bibcode}"],
}
- self.app.update_storage(bibcode, 'bib_data', bib_data)
-
+ self.app.update_storage(bibcode, "bib_data", bib_data)
+
# Get the record IDs
- none_records = session.query(Records).filter(Records.bibcode.in_(none_bibcodes)).all()
-
+ none_records = (
+ session.query(Records).filter(Records.bibcode.in_(none_bibcodes)).all()
+ )
+
# Add SitemapInfo entries with None filenames
for record in none_records:
sitemap_info = SitemapInfo(
@@ -969,1415 +1217,2167 @@ def test_get_current_sitemap_state_performance(self):
bibcode=record.bibcode,
sitemap_filename=None, # None filename should be filtered out
filename_lastmoddate=adsputils.get_date(),
- update_flag=False
+ update_flag=False,
)
session.add(sitemap_info)
session.commit()
-
+
# Should still return sitemap_bib_5.xml, ignoring None filenames
result = self.app.get_current_sitemap_state(session)
- self.assertEqual(result['filename'], 'sitemap_bib_5.xml',
- "Should ignore None filenames and return highest valid filename")
- self.assertEqual(result['count'], 80,
- "Should still return 80 records from the valid highest file")
-
+ self.assertEqual(
+ result["filename"],
+ "sitemap_bib_5.xml",
+ "Should ignore None filenames and return highest valid filename",
+ )
+ self.assertEqual(
+ result["count"],
+ 80,
+ "Should still return 80 records from the valid highest file",
+ )
+
# Test 5: Empty database state (edge case)
with self.app.session_scope() as session:
# Clear all sitemap info
session.query(SitemapInfo).delete(synchronize_session=False)
session.commit()
-
+
result = self.app.get_current_sitemap_state(session)
-
+
# Should return default state
- self.assertEqual(result['filename'], 'sitemap_bib_1.xml',
- "Should return default filename when no records exist")
- self.assertEqual(result['count'], 0,
- "Should return 0 count when no records exist")
- self.assertEqual(result['index'], 1,
- "Should return default index 1 when no records exist")
+ self.assertEqual(
+ result["filename"],
+ "sitemap_bib_1.xml",
+ "Should return default filename when no records exist",
+ )
+ self.assertEqual(
+ result["count"], 0, "Should return 0 count when no records exist"
+ )
+ self.assertEqual(
+ result["index"],
+ 1,
+ "Should return default index 1 when no records exist",
+ )
def test_process_sitemap_batch_session_persistence(self):
"""Test _process_sitemap_batch with session management and persistence"""
-
+
# Create test records for batch processing
test_bibcodes = []
-
+
for i in range(100):
- bibcode = f'2023Batch..{i:04d}..{i:04d}A'
+ bibcode = f"2023Batch..{i:04d}..{i:04d}A"
test_bibcodes.append(bibcode)
-
+
# Simple test data
- bib_data = {
- 'title': f'Test Batch Paper {i}',
- 'year': 2023
- }
-
+ bib_data = {"title": f"Test Batch Paper {i}", "year": 2023}
+
# Store record in database
- self.app.update_storage(bibcode, 'bib_data', bib_data)
-
+ self.app.update_storage(bibcode, "bib_data", bib_data)
+
# Test session persistence
with self.app.session_scope() as session:
start_time = adsputils.get_date()
-
+
# Get initial sitemap state
sitemap_state = self.app.get_current_sitemap_state(session)
-
+
# Test 1: Process first batch of 50 bibcodes
batch_bibcodes_1 = test_bibcodes[:50]
batch_stats, updated_state_1 = self.app._process_sitemap_batch(
- batch_bibcodes_1, 'add', session, sitemap_state
+ batch_bibcodes_1, "add", session, sitemap_state
)
-
+
end_time = adsputils.get_date()
query_time = (end_time - start_time).total_seconds()
-
+
# Performance assertion
- self.assertLess(query_time, 5.0, f"Batch processing took {query_time:.3f}s, should be under 5s")
-
+ self.assertLess(
+ query_time,
+ 5.0,
+ f"Batch processing took {query_time:.3f}s, should be under 5s",
+ )
+
# Verify first batch results
- self.assertEqual(batch_stats['successful'], 50, "Should successfully process all 50 bibcodes")
- self.assertEqual(batch_stats['failed'], 0, "Should have no failed bibcodes")
- self.assertEqual(len(batch_stats['sitemap_records']), 50, "Should return 50 sitemap records")
-
- print(f"process_sitemap_batch performance (ADD): processed 50 records in {query_time:.3f}s")
-
+ self.assertEqual(
+ batch_stats["successful"],
+ 50,
+ "Should successfully process all 50 bibcodes",
+ )
+ self.assertEqual(batch_stats["failed"], 0, "Should have no failed bibcodes")
+ self.assertEqual(
+ len(batch_stats["sitemap_records"]),
+ 50,
+ "Should return 50 sitemap records",
+ )
+
+ print(
+ f"process_sitemap_batch performance (ADD): processed 50 records in {query_time:.3f}s"
+ )
+
# Test 2: Verify session persistence
- created_records_1 = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(batch_bibcodes_1)
- ).all()
-
- self.assertEqual(len(created_records_1), 50,
- "All 50 sitemap records should be visible in same session")
-
+ created_records_1 = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(batch_bibcodes_1))
+ .all()
+ )
+
+ self.assertEqual(
+ len(created_records_1),
+ 50,
+ "All 50 sitemap records should be visible in same session",
+ )
+
# Test 3: Process second batch using updated_state from first batch
batch_bibcodes_2 = test_bibcodes[50:80]
batch_stats, updated_state_2 = self.app._process_sitemap_batch(
- batch_bibcodes_2, 'force-update', session, updated_state_1
+ batch_bibcodes_2, "force-update", session, updated_state_1
)
-
+
# Verify second batch results
- self.assertEqual(batch_stats['successful'], 30, "Should successfully process all 30 bibcodes")
- self.assertEqual(batch_stats['failed'], 0, "Should have no failed bibcodes")
- self.assertEqual(len(batch_stats['sitemap_records']), 30, "Should return 30 sitemap records")
-
+ self.assertEqual(
+ batch_stats["successful"],
+ 30,
+ "Should successfully process all 30 bibcodes",
+ )
+ self.assertEqual(batch_stats["failed"], 0, "Should have no failed bibcodes")
+ self.assertEqual(
+ len(batch_stats["sitemap_records"]),
+ 30,
+ "Should return 30 sitemap records",
+ )
+
# Test 4: Verify session consistency - state should be cumulative
- initial_count = sitemap_state['count']
- if initial_count + 80 <= self.app.conf.get('MAX_RECORDS_PER_SITEMAP', 50000):
+ initial_count = sitemap_state["count"]
+ if initial_count + 80 <= self.app.conf.get(
+ "MAX_RECORDS_PER_SITEMAP", 50000
+ ):
# Should be same file with cumulative records
- self.assertEqual(updated_state_2['filename'], sitemap_state['filename'],
- "Should use same filename when under limit")
- self.assertEqual(updated_state_2['count'], initial_count + 80,
- "Count should be cumulative across batches")
-
+ self.assertEqual(
+ updated_state_2["filename"],
+ sitemap_state["filename"],
+ "Should use same filename when under limit",
+ )
+ self.assertEqual(
+ updated_state_2["count"],
+ initial_count + 80,
+ "Count should be cumulative across batches",
+ )
+
# Test 5: Verify all records are visible in same session (no commits yet!)
- all_records_in_session = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(test_bibcodes[:80])
- ).all()
-
- self.assertEqual(len(all_records_in_session), 80,
- "All 80 records should be visible in same session before commit")
-
+ all_records_in_session = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(test_bibcodes[:80]))
+ .all()
+ )
+
+ self.assertEqual(
+ len(all_records_in_session),
+ 80,
+ "All 80 records should be visible in same session before commit",
+ )
+
# Test 6: Verify state consistency within session
current_state_in_session = self.app.get_current_sitemap_state(session)
- self.assertEqual(current_state_in_session['count'], updated_state_2['count'],
- "Current state should match updated state within same session")
-
-
+ self.assertEqual(
+ current_state_in_session["count"],
+ updated_state_2["count"],
+ "Current state should match updated state within same session",
+ )
+
# Now commit everything at once
session.commit()
-
+
# Test 7: Verify data persisted after session ends
with self.app.session_scope() as new_session:
- verification_records = new_session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(test_bibcodes[:80])
- ).count()
-
- self.assertEqual(verification_records, 80,
- "New session should see all committed records")
-
-
+ verification_records = (
+ new_session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(test_bibcodes[:80]))
+ .count()
+ )
+
+ self.assertEqual(
+ verification_records, 80, "New session should see all committed records"
+ )
+
# Test 6: Test empty batch edge case
with self.app.session_scope() as session:
empty_state = self.app.get_current_sitemap_state(session)
batch_stats, empty_updated_state = self.app._process_sitemap_batch(
- [], 'add', session, empty_state
+ [], "add", session, empty_state
+ )
+ self.assertEqual(
+ batch_stats["successful"], 0, "Empty batch should return 0 successful"
+ )
+ self.assertEqual(
+ batch_stats["failed"], 0, "Empty batch should return 0 failed"
+ )
+ self.assertEqual(
+ len(batch_stats["sitemap_records"]),
+ 0,
+ "Empty batch should return empty records list",
+ )
+ self.assertEqual(
+ empty_updated_state,
+ empty_state,
+ "Empty batch should return unchanged state",
)
- self.assertEqual(batch_stats['successful'], 0, "Empty batch should return 0 successful")
- self.assertEqual(batch_stats['failed'], 0, "Empty batch should return 0 failed")
- self.assertEqual(len(batch_stats['sitemap_records']), 0, "Empty batch should return empty records list")
- self.assertEqual(empty_updated_state, empty_state, "Empty batch should return unchanged state")
def test_process_sitemap_batch_solr_filtering(self):
"""Test SOLR status filtering logic in _process_sitemap_batch"""
-
+
# Create records with different statuses to test all should_include_in_sitemap logic
test_bibcodes = [
- '2023Success..1..1A', # success - should be included
- '2023SolrFailed..1..1A', # solr-failed - should be excluded
- '2023Retrying..1..1A', # retrying - should be excluded
- '2023MetricsFailed..1..1A', # metrics-failed - should be included (not SOLR-related)
- '2023LinksFailed..1..1A', # links-failed - should be included (not SOLR-related)
- '2023NoBibData..1..1A' # will have no bib_data - should be excluded
+ "2023Success..1..1A", # success - should be included
+ "2023SolrFailed..1..1A", # solr-failed - should be excluded
+ "2023Retrying..1..1A", # retrying - should be excluded
+ "2023MetricsFailed..1..1A", # metrics-failed - should be included (not SOLR-related)
+ "2023LinksFailed..1..1A", # links-failed - should be included (not SOLR-related)
+ "2023NoBibData..1..1A", # will have no bib_data - should be excluded
]
-
+
for i, bibcode in enumerate(test_bibcodes):
- if bibcode != '2023NoBibData..1..1A': # Skip creating bib_data for this one
- bib_data = {'title': f'Test Paper {i}', 'year': 2023}
- self.app.update_storage(bibcode, 'bib_data', bib_data)
-
+ if bibcode != "2023NoBibData..1..1A": # Skip creating bib_data for this one
+ bib_data = {"title": f"Test Paper {i}", "year": 2023}
+ self.app.update_storage(bibcode, "bib_data", bib_data)
+
# Set different statuses
- self.app.mark_processed(['2023SolrFailed..1..1A'], 'solr', checksums=['checksum_failed'], status='solr-failed')
- self.app.mark_processed(['2023Retrying..1..1A'], 'solr', checksums=['checksum_retrying'], status='retrying')
- self.app.mark_processed(['2023MetricsFailed..1..1A'], 'solr', checksums=['checksum_metrics'], status='metrics-failed')
- self.app.mark_processed(['2023LinksFailed..1..1A'], 'solr', checksums=['checksum_links'], status='links-failed')
+ self.app.mark_processed(
+ ["2023SolrFailed..1..1A"],
+ "solr",
+ checksums=["checksum_failed"],
+ status="solr-failed",
+ )
+ self.app.mark_processed(
+ ["2023Retrying..1..1A"],
+ "solr",
+ checksums=["checksum_retrying"],
+ status="retrying",
+ )
+ self.app.mark_processed(
+ ["2023MetricsFailed..1..1A"],
+ "solr",
+ checksums=["checksum_metrics"],
+ status="metrics-failed",
+ )
+ self.app.mark_processed(
+ ["2023LinksFailed..1..1A"],
+ "solr",
+ checksums=["checksum_links"],
+ status="links-failed",
+ )
# 2023Success..1..1A gets default 'success' status
# 2023NoBibData..1..1A will have no bib_data at all
-
+
# Test 'add' action
with self.app.session_scope() as session:
- initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 0, 'index': 1}
-
+ initial_state = {"filename": "sitemap_bib_1.xml", "count": 0, "index": 1}
+
batch_stats, updated_state_add = self.app._process_sitemap_batch(
- test_bibcodes, 'add', session, initial_state
+ test_bibcodes, "add", session, initial_state
)
-
+
# Should include: success, metrics-failed, links-failed = 3 successful
# Should exclude: solr-failed, retrying, no-bib-data = 3 failed
- self.assertEqual(batch_stats['successful'], 3, "Add: Should include success, metrics-failed, links-failed statuses")
- self.assertEqual(batch_stats['failed'], 3, "Add: Should exclude solr-failed, retrying, and no-bib-data records")
- self.assertEqual(len(batch_stats['sitemap_records']), 3, "Add: Should return 3 sitemap records")
- self.assertEqual(updated_state_add['count'], 3, "Add: State should reflect only successful records")
-
+ self.assertEqual(
+ batch_stats["successful"],
+ 3,
+ "Add: Should include success, metrics-failed, links-failed statuses",
+ )
+ self.assertEqual(
+ batch_stats["failed"],
+ 3,
+ "Add: Should exclude solr-failed, retrying, and no-bib-data records",
+ )
+ self.assertEqual(
+ len(batch_stats["sitemap_records"]),
+ 3,
+ "Add: Should return 3 sitemap records",
+ )
+ self.assertEqual(
+ updated_state_add["count"],
+ 3,
+ "Add: State should reflect only successful records",
+ )
+
# Test 'force-update' action - should have same filtering results
with self.app.session_scope() as session:
- initial_state = {'filename': 'sitemap_bib_2.xml', 'count': 0, 'index': 2}
-
+ initial_state = {"filename": "sitemap_bib_2.xml", "count": 0, "index": 2}
+
batch_stats, updated_state_force = self.app._process_sitemap_batch(
- test_bibcodes, 'force-update', session, initial_state
+ test_bibcodes, "force-update", session, initial_state
)
-
+
# Force-update should have same filtering results as add
- self.assertEqual(batch_stats['successful'], 3, "Force-update: Should include success, metrics-failed, links-failed statuses")
- self.assertEqual(batch_stats['failed'], 3, "Force-update: Should exclude solr-failed, retrying, and no-bib-data records")
- self.assertEqual(len(batch_stats['sitemap_records']), 3, "Force-update: Should return updated sitemap records for reporting")
- self.assertEqual(updated_state_force['count'], 0, "Force-update: State count should remain 0 (updating existing, not adding new)")
-
- # Results should be identical for filtering
- self.assertEqual(batch_stats['successful'], batch_stats['successful'], "Both actions should have same successful count")
- self.assertEqual(batch_stats['failed'], batch_stats['failed'], "Both actions should have same failed count")
+ self.assertEqual(
+ batch_stats["successful"],
+ 3,
+ "Force-update: Should include success, metrics-failed, links-failed statuses",
+ )
+ self.assertEqual(
+ batch_stats["failed"],
+ 3,
+ "Force-update: Should exclude solr-failed, retrying, and no-bib-data records",
+ )
+ self.assertEqual(
+ len(batch_stats["sitemap_records"]),
+ 3,
+ "Force-update: Should return updated sitemap records for reporting",
+ )
+ self.assertEqual(
+ updated_state_force["count"],
+ 0,
+ "Force-update: State count should remain 0 (updating existing, not adding new)",
+ )
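+ # NOTE(review): batch_stats was rebound by the force-update call, so the two checks below compare it with itself; keep the 'add' stats in a separate variable to compare both actions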
+ self.assertEqual(
+ batch_stats["successful"],
+ batch_stats["successful"],
+ "Both actions should have same successful count",
+ )
+ self.assertEqual(
+ batch_stats["failed"],
+ batch_stats["failed"],
+ "Both actions should have same failed count",
+ )
def test_process_sitemap_batch_new_vs_existing_records(self):
"""Test handling of new records vs existing sitemap entries"""
-
+
# Create test records with specific timestamps
base_time = adsputils.get_date()
- new_bibcode = '2023New..1..1A'
- existing_recent_bibcode = '2023ExistingRecent..1..1A'
- existing_stale_bibcode = '2023ExistingStale..1..1A'
-
+ new_bibcode = "2023New..1..1A"
+ existing_recent_bibcode = "2023ExistingRecent..1..1A"
+ existing_stale_bibcode = "2023ExistingStale..1..1A"
+
test_bibcodes = [new_bibcode, existing_recent_bibcode, existing_stale_bibcode]
-
+
# Create records with specific bib_data_updated timestamps
for i, bibcode in enumerate(test_bibcodes):
- bib_data = {'title': f'Test Paper {bibcode}', 'year': 2023}
- self.app.update_storage(bibcode, 'bib_data', bib_data)
-
+ bib_data = {"title": f"Test Paper {bibcode}", "year": 2023}
+ self.app.update_storage(bibcode, "bib_data", bib_data)
+
# Update bib_data_updated timestamps
with self.app.session_scope() as session:
- session.query(Records).filter(Records.bibcode == bibcode).update({
- 'bib_data_updated': base_time - timedelta(hours=i) # Different timestamps
- }, synchronize_session=False)
+ session.query(Records).filter(Records.bibcode == bibcode).update(
+ {
+ "bib_data_updated": base_time
+ - timedelta(hours=i) # Different timestamps
+ },
+ synchronize_session=False,
+ )
session.commit()
-
+
# Create existing sitemap entries
with self.app.session_scope() as session:
- records = session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all()
+ records = (
+ session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all()
+ )
record_map = {r.bibcode: r.id for r in records}
-
+
# Recent sitemap entry - filename_lastmoddate is NEWER than bib_data_updated
recent_sitemap = SitemapInfo(
record_id=record_map[existing_recent_bibcode],
bibcode=existing_recent_bibcode,
- sitemap_filename='sitemap_bib_1.xml',
- filename_lastmoddate=base_time + timedelta(hours=1), # NEWER than bib_data_updated
- update_flag=False
+ sitemap_filename="sitemap_bib_1.xml",
+ filename_lastmoddate=base_time
+ + timedelta(hours=1), # NEWER than bib_data_updated
+ update_flag=False,
)
-
+
# Stale sitemap entry - filename_lastmoddate is OLDER than bib_data_updated
stale_sitemap = SitemapInfo(
record_id=record_map[existing_stale_bibcode],
bibcode=existing_stale_bibcode,
- sitemap_filename='sitemap_bib_1.xml',
- filename_lastmoddate=base_time - timedelta(days=10), # OLDER than bib_data_updated
- update_flag=False
+ sitemap_filename="sitemap_bib_1.xml",
+ filename_lastmoddate=base_time
+ - timedelta(days=10), # OLDER than bib_data_updated
+ update_flag=False,
)
-
+
session.add(recent_sitemap)
session.add(stale_sitemap)
session.commit()
-
+
with self.app.session_scope() as session:
- initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 5, 'index': 1}
-
+ initial_state = {"filename": "sitemap_bib_1.xml", "count": 5, "index": 1}
+
batch_stats, updated_state = self.app._process_sitemap_batch(
- test_bibcodes, 'add', session, initial_state
+ test_bibcodes, "add", session, initial_state
)
-
+
# All 3 should be successful
- self.assertEqual(batch_stats['successful'], 3, "All records should be processed successfully")
- self.assertEqual(batch_stats['failed'], 0, "No records should fail")
-
+ self.assertEqual(
+ batch_stats["successful"],
+ 3,
+ "All records should be processed successfully",
+ )
+ self.assertEqual(batch_stats["failed"], 0, "No records should fail")
+
# Only NEW record increments count (1 new record)
- self.assertEqual(updated_state['count'], 6, "Only new record should increment count (5 + 1 = 6)")
-
+ self.assertEqual(
+ updated_state["count"],
+ 6,
+ "Only new record should increment count (5 + 1 = 6)",
+ )
+
# Check that update_flags are set correctly
with self.app.session_scope() as session:
- batch_stats['sitemap_records'] = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(test_bibcodes)
- ).all()
-
- for record in batch_stats['sitemap_records']:
+ batch_stats["sitemap_records"] = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(test_bibcodes))
+ .all()
+ )
+
+ for record in batch_stats["sitemap_records"]:
if record.bibcode == new_bibcode:
# New record should have update_flag = True
- self.assertTrue(record.update_flag, f"New record {record.bibcode} should have update_flag=True")
+ self.assertTrue(
+ record.update_flag,
+ f"New record {record.bibcode} should have update_flag=True",
+ )
elif record.bibcode == existing_recent_bibcode:
# Recent record should NOT be updated (filename_lastmoddate > bib_data_updated)
- self.assertFalse(record.update_flag, f"Recent record {record.bibcode} should have update_flag=False")
+ self.assertFalse(
+ record.update_flag,
+ f"Recent record {record.bibcode} should have update_flag=False",
+ )
elif record.bibcode == existing_stale_bibcode:
# Stale record should be updated (filename_lastmoddate < bib_data_updated)
- self.assertTrue(record.update_flag, f"Stale record {record.bibcode} should have update_flag=True")
+ self.assertTrue(
+ record.update_flag,
+ f"Stale record {record.bibcode} should have update_flag=True",
+ )
def test_process_sitemap_batch_add_action_with_recent_file(self):
"""Test 'add' action when file is newer than data (should NOT update)"""
-
+
base_time = adsputils.get_date()
- test_bibcode = '2023AddRecent..1..1A'
- bib_data = {'title': 'Test Add Recent Paper', 'year': 2023}
- self.app.update_storage(test_bibcode, 'bib_data', bib_data)
-
+ test_bibcode = "2023AddRecent..1..1A"
+ bib_data = {"title": "Test Add Recent Paper", "year": 2023}
+ self.app.update_storage(test_bibcode, "bib_data", bib_data)
+
# Set bib_data_updated to be OLDER than filename_lastmoddate
with self.app.session_scope() as session:
- session.query(Records).filter(Records.bibcode == test_bibcode).update({
- 'bib_data_updated': base_time - timedelta(hours=2) # 2 hours ago (OLDER)
- }, synchronize_session=False)
+ session.query(Records).filter(Records.bibcode == test_bibcode).update(
+ {
+ "bib_data_updated": base_time
+ - timedelta(hours=2) # 2 hours ago (OLDER)
+ },
+ synchronize_session=False,
+ )
session.commit()
-
+
# Create existing sitemap entry with NEWER timestamp
with self.app.session_scope() as session:
- record = session.query(Records).filter(Records.bibcode == test_bibcode).first()
-
+ record = (
+ session.query(Records).filter(Records.bibcode == test_bibcode).first()
+ )
+
sitemap_info = SitemapInfo(
record_id=record.id,
bibcode=test_bibcode,
- sitemap_filename='sitemap_bib_1.xml',
+ sitemap_filename="sitemap_bib_1.xml",
filename_lastmoddate=base_time, # NEWER than bib_data_updated
- update_flag=False
+ update_flag=False,
)
session.add(sitemap_info)
session.commit()
-
+
# Store original sitemap_info values for comparison
with self.app.session_scope() as session:
- original_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first()
+ original_record = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode == test_bibcode)
+ .first()
+ )
original_filename_lastmoddate = original_record.filename_lastmoddate
original_sitemap_filename = original_record.sitemap_filename
original_update_flag = original_record.update_flag
-
+
# Test 'add' action
with self.app.session_scope() as session:
- initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 0, 'index': 1}
-
+ initial_state = {"filename": "sitemap_bib_1.xml", "count": 0, "index": 1}
+
batch_stats, _ = self.app._process_sitemap_batch(
- [test_bibcode], 'add', session, initial_state
+ [test_bibcode], "add", session, initial_state
)
-
+
# Check that sitemap_info record remains unchanged
- sitemap_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first()
-
- self.assertEqual(batch_stats['successful'], 1, "Record should be processed successfully")
- self.assertEqual(batch_stats['failed'], 0, "No records should fail")
-
+ sitemap_record = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode == test_bibcode)
+ .first()
+ )
+
+ self.assertEqual(
+ batch_stats["successful"], 1, "Record should be processed successfully"
+ )
+ self.assertEqual(batch_stats["failed"], 0, "No records should fail")
+
# Verify the record was not modified
- self.assertFalse(sitemap_record.update_flag, "'add' should NOT set update_flag when file is newer than data")
- self.assertEqual(sitemap_record.filename_lastmoddate, original_filename_lastmoddate, "filename_lastmoddate should remain unchanged")
- self.assertEqual(sitemap_record.sitemap_filename, original_sitemap_filename, "sitemap_filename should remain unchanged")
- self.assertEqual(sitemap_record.update_flag, original_update_flag, "update_flag should remain unchanged (False)")
+ self.assertFalse(
+ sitemap_record.update_flag,
+ "'add' should NOT set update_flag when file is newer than data",
+ )
+ self.assertEqual(
+ sitemap_record.filename_lastmoddate,
+ original_filename_lastmoddate,
+ "filename_lastmoddate should remain unchanged",
+ )
+ self.assertEqual(
+ sitemap_record.sitemap_filename,
+ original_sitemap_filename,
+ "sitemap_filename should remain unchanged",
+ )
+ self.assertEqual(
+ sitemap_record.update_flag,
+ original_update_flag,
+ "update_flag should remain unchanged (False)",
+ )
def test_process_sitemap_batch_add_action_with_stale_file(self):
"""Test 'add' action when data is newer than file (should update)"""
-
+
base_time = adsputils.get_date()
- test_bibcode = '2023AddStale..1..1A'
- bib_data = {'title': 'Test Add Stale Paper', 'year': 2023}
- self.app.update_storage(test_bibcode, 'bib_data', bib_data)
-
+ test_bibcode = "2023AddStale..1..1A"
+ bib_data = {"title": "Test Add Stale Paper", "year": 2023}
+ self.app.update_storage(test_bibcode, "bib_data", bib_data)
+
# Set bib_data_updated to be NEWER than filename_lastmoddate
with self.app.session_scope() as session:
- session.query(Records).filter(Records.bibcode == test_bibcode).update({
- 'bib_data_updated': base_time # Current time (NEWER)
- }, synchronize_session=False)
+ session.query(Records).filter(Records.bibcode == test_bibcode).update(
+ {"bib_data_updated": base_time}, # Current time (NEWER)
+ synchronize_session=False,
+ )
session.commit()
-
+
# Create existing sitemap entry with OLDER timestamp
with self.app.session_scope() as session:
- record = session.query(Records).filter(Records.bibcode == test_bibcode).first()
-
+ record = (
+ session.query(Records).filter(Records.bibcode == test_bibcode).first()
+ )
+
sitemap_info = SitemapInfo(
record_id=record.id,
bibcode=test_bibcode,
- sitemap_filename='sitemap_bib_1.xml',
- filename_lastmoddate=base_time - timedelta(hours=3), # OLDER than bib_data_updated
- update_flag=False
+ sitemap_filename="sitemap_bib_1.xml",
+ filename_lastmoddate=base_time
+ - timedelta(hours=3), # OLDER than bib_data_updated
+ update_flag=False,
)
session.add(sitemap_info)
session.commit()
-
+
# Store original sitemap_info values for comparison
with self.app.session_scope() as session:
- original_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first()
+ original_record = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode == test_bibcode)
+ .first()
+ )
original_filename_lastmoddate = original_record.filename_lastmoddate
original_sitemap_filename = original_record.sitemap_filename
original_update_flag = original_record.update_flag
-
+
# Test 'add' action
with self.app.session_scope() as session:
- initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 0, 'index': 1}
-
+ initial_state = {"filename": "sitemap_bib_1.xml", "count": 0, "index": 1}
+
batch_stats, _ = self.app._process_sitemap_batch(
- [test_bibcode], 'add', session, initial_state
+ [test_bibcode], "add", session, initial_state
)
-
+
# Check that sitemap_info record was updated appropriately
- sitemap_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first()
-
- self.assertEqual(batch_stats['successful'], 1, "Record should be processed successfully")
- self.assertEqual(batch_stats['failed'], 0, "No records should fail")
-
+ sitemap_record = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode == test_bibcode)
+ .first()
+ )
+
+ self.assertEqual(
+ batch_stats["successful"], 1, "Record should be processed successfully"
+ )
+ self.assertEqual(batch_stats["failed"], 0, "No records should fail")
+
# Verify the record was updated correctly
- self.assertTrue(sitemap_record.update_flag, "'add' should set update_flag when data is newer than file")
- self.assertEqual(sitemap_record.filename_lastmoddate, base_time, "filename_lastmoddate should be updated to bib_data_updated")
- self.assertEqual(sitemap_record.sitemap_filename, original_sitemap_filename, "sitemap_filename should remain unchanged")
- self.assertNotEqual(sitemap_record.update_flag, original_update_flag, "update_flag should have changed from False to True")
- self.assertNotEqual(sitemap_record.filename_lastmoddate, original_filename_lastmoddate, "filename_lastmoddate should have been updated")
+ self.assertTrue(
+ sitemap_record.update_flag,
+ "'add' should set update_flag when data is newer than file",
+ )
+ self.assertEqual(
+ sitemap_record.filename_lastmoddate,
+ base_time,
+ "filename_lastmoddate should be updated to bib_data_updated",
+ )
+ self.assertEqual(
+ sitemap_record.sitemap_filename,
+ original_sitemap_filename,
+ "sitemap_filename should remain unchanged",
+ )
+ self.assertNotEqual(
+ sitemap_record.update_flag,
+ original_update_flag,
+ "update_flag should have changed from False to True",
+ )
+ self.assertNotEqual(
+ sitemap_record.filename_lastmoddate,
+ original_filename_lastmoddate,
+ "filename_lastmoddate should have been updated",
+ )
def test_process_sitemap_batch_add_action_with_never_generated_file(self):
"""Test 'add' action when file has never been generated (filename_lastmoddate is None)"""
-
+
base_time = adsputils.get_date()
- test_bibcode = '2023AddNeverGenerated..1..1A'
- bib_data = {'title': 'Test Never Generated Paper', 'year': 2023}
- self.app.update_storage(test_bibcode, 'bib_data', bib_data)
-
+ test_bibcode = "2023AddNeverGenerated..1..1A"
+ bib_data = {"title": "Test Never Generated Paper", "year": 2023}
+ self.app.update_storage(test_bibcode, "bib_data", bib_data)
+
# Set bib_data_updated to any time (doesn't matter since filename_lastmoddate is None)
with self.app.session_scope() as session:
- session.query(Records).filter(Records.bibcode == test_bibcode).update({
- 'bib_data_updated': base_time - timedelta(hours=1) # 1 hour ago
- }, synchronize_session=False)
+ session.query(Records).filter(Records.bibcode == test_bibcode).update(
+ {"bib_data_updated": base_time - timedelta(hours=1)}, # 1 hour ago
+ synchronize_session=False,
+ )
session.commit()
-
+
# Create existing sitemap entry with None filename_lastmoddate (never generated)
with self.app.session_scope() as session:
- record = session.query(Records).filter(Records.bibcode == test_bibcode).first()
-
+ record = (
+ session.query(Records).filter(Records.bibcode == test_bibcode).first()
+ )
+
sitemap_info = SitemapInfo(
record_id=record.id,
bibcode=test_bibcode,
- sitemap_filename='sitemap_bib_1.xml',
+ sitemap_filename="sitemap_bib_1.xml",
filename_lastmoddate=None, # Never been generated
- update_flag=False
+ update_flag=False,
)
session.add(sitemap_info)
session.commit()
-
+
# Store original sitemap_info values for comparison
with self.app.session_scope() as session:
- original_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first()
+ original_record = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode == test_bibcode)
+ .first()
+ )
original_filename_lastmoddate = original_record.filename_lastmoddate
original_sitemap_filename = original_record.sitemap_filename
original_update_flag = original_record.update_flag
-
+
# Test 'add' action
with self.app.session_scope() as session:
- initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 0, 'index': 1}
-
+ initial_state = {"filename": "sitemap_bib_1.xml", "count": 0, "index": 1}
+
batch_stats, _ = self.app._process_sitemap_batch(
- [test_bibcode], 'add', session, initial_state
+ [test_bibcode], "add", session, initial_state
)
-
+
# Check that sitemap_info record was updated appropriately
- sitemap_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first()
-
- self.assertEqual(batch_stats['successful'], 1, "Record should be processed successfully")
- self.assertEqual(batch_stats['failed'], 0, "No records should fail")
-
+ sitemap_record = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode == test_bibcode)
+ .first()
+ )
+
+ self.assertEqual(
+ batch_stats["successful"], 1, "Record should be processed successfully"
+ )
+ self.assertEqual(batch_stats["failed"], 0, "No records should fail")
+
# Verify the record was updated correctly
- self.assertTrue(sitemap_record.update_flag, "'add' should set update_flag when file has never been generated")
- self.assertEqual(sitemap_record.filename_lastmoddate, base_time - timedelta(hours=1), "filename_lastmoddate should be updated to bib_data_updated")
- self.assertEqual(sitemap_record.sitemap_filename, original_sitemap_filename, "sitemap_filename should remain unchanged")
- self.assertNotEqual(sitemap_record.update_flag, original_update_flag, "update_flag should have changed from False to True")
- self.assertIsNone(original_filename_lastmoddate, "Original filename_lastmoddate should have been None")
- self.assertIsNotNone(sitemap_record.filename_lastmoddate, "filename_lastmoddate should now be set")
+ self.assertTrue(
+ sitemap_record.update_flag,
+ "'add' should set update_flag when file has never been generated",
+ )
+ self.assertEqual(
+ sitemap_record.filename_lastmoddate,
+ base_time - timedelta(hours=1),
+ "filename_lastmoddate should be updated to bib_data_updated",
+ )
+ self.assertEqual(
+ sitemap_record.sitemap_filename,
+ original_sitemap_filename,
+ "sitemap_filename should remain unchanged",
+ )
+ self.assertNotEqual(
+ sitemap_record.update_flag,
+ original_update_flag,
+ "update_flag should have changed from False to True",
+ )
+ self.assertIsNone(
+ original_filename_lastmoddate,
+ "Original filename_lastmoddate should have been None",
+ )
+ self.assertIsNotNone(
+ sitemap_record.filename_lastmoddate,
+ "filename_lastmoddate should now be set",
+ )
def test_process_sitemap_batch_force_update_with_recent_file(self):
"""Test 'force-update' action when file is newer than data (should still update)"""
-
+
base_time = adsputils.get_date()
- test_bibcode = '2023ForceRecent..1..1A'
- bib_data = {'title': 'Test Force Recent Paper', 'year': 2023}
- self.app.update_storage(test_bibcode, 'bib_data', bib_data)
-
+ test_bibcode = "2023ForceRecent..1..1A"
+ bib_data = {"title": "Test Force Recent Paper", "year": 2023}
+ self.app.update_storage(test_bibcode, "bib_data", bib_data)
+
# Set bib_data_updated to be OLDER than filename_lastmoddate
with self.app.session_scope() as session:
- session.query(Records).filter(Records.bibcode == test_bibcode).update({
- 'bib_data_updated': base_time - timedelta(hours=4) # 4 hours ago (OLDER)
- }, synchronize_session=False)
+ session.query(Records).filter(Records.bibcode == test_bibcode).update(
+ {
+ "bib_data_updated": base_time
+ - timedelta(hours=4) # 4 hours ago (OLDER)
+ },
+ synchronize_session=False,
+ )
session.commit()
-
+
# Create existing sitemap entry with NEWER timestamp
with self.app.session_scope() as session:
- record = session.query(Records).filter(Records.bibcode == test_bibcode).first()
-
+ record = (
+ session.query(Records).filter(Records.bibcode == test_bibcode).first()
+ )
+
sitemap_info = SitemapInfo(
record_id=record.id,
bibcode=test_bibcode,
- sitemap_filename='sitemap_bib_1.xml',
+ sitemap_filename="sitemap_bib_1.xml",
filename_lastmoddate=base_time, # NEWER than bib_data_updated
- update_flag=False
+ update_flag=False,
)
session.add(sitemap_info)
session.commit()
-
+
# Store original sitemap_info values for comparison
with self.app.session_scope() as session:
- original_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first()
+ original_record = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode == test_bibcode)
+ .first()
+ )
original_filename_lastmoddate = original_record.filename_lastmoddate
original_sitemap_filename = original_record.sitemap_filename
original_update_flag = original_record.update_flag
-
+
# Test 'force-update' action
with self.app.session_scope() as session:
- initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 0, 'index': 1}
-
+ initial_state = {"filename": "sitemap_bib_1.xml", "count": 0, "index": 1}
+
batch_stats, _ = self.app._process_sitemap_batch(
- [test_bibcode], 'force-update', session, initial_state
+ [test_bibcode], "force-update", session, initial_state
)
-
+
# Check that sitemap_info record was updated appropriately
- sitemap_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first()
-
- self.assertEqual(batch_stats['successful'], 1, "Record should be processed successfully")
- self.assertEqual(batch_stats['failed'], 0, "No records should fail")
-
+ sitemap_record = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode == test_bibcode)
+ .first()
+ )
+
+ self.assertEqual(
+ batch_stats["successful"], 1, "Record should be processed successfully"
+ )
+ self.assertEqual(batch_stats["failed"], 0, "No records should fail")
+
# Verify the record was updated correctly
- self.assertTrue(sitemap_record.update_flag, "'force-update' should ALWAYS set update_flag, even when file is newer")
- self.assertEqual(sitemap_record.filename_lastmoddate, base_time - timedelta(hours=4), "filename_lastmoddate should be updated to bib_data_updated")
- self.assertEqual(sitemap_record.sitemap_filename, original_sitemap_filename, "sitemap_filename should remain unchanged")
- self.assertNotEqual(sitemap_record.update_flag, original_update_flag, "update_flag should have changed from False to True")
- self.assertNotEqual(sitemap_record.filename_lastmoddate, original_filename_lastmoddate, "filename_lastmoddate should have been updated")
+ self.assertTrue(
+ sitemap_record.update_flag,
+ "'force-update' should ALWAYS set update_flag, even when file is newer",
+ )
+ self.assertEqual(
+ sitemap_record.filename_lastmoddate,
+ base_time - timedelta(hours=4),
+ "filename_lastmoddate should be updated to bib_data_updated",
+ )
+ self.assertEqual(
+ sitemap_record.sitemap_filename,
+ original_sitemap_filename,
+ "sitemap_filename should remain unchanged",
+ )
+ self.assertNotEqual(
+ sitemap_record.update_flag,
+ original_update_flag,
+ "update_flag should have changed from False to True",
+ )
+ self.assertNotEqual(
+ sitemap_record.filename_lastmoddate,
+ original_filename_lastmoddate,
+ "filename_lastmoddate should have been updated",
+ )
def test_process_sitemap_batch_force_update_with_stale_file(self):
"""Test 'force-update' action when data is newer than file (should still update)"""
-
+
base_time = adsputils.get_date()
- test_bibcode = '2023ForceStale..1..1A'
- bib_data = {'title': 'Test Force Stale Paper', 'year': 2023}
- self.app.update_storage(test_bibcode, 'bib_data', bib_data)
-
+ test_bibcode = "2023ForceStale..1..1A"
+ bib_data = {"title": "Test Force Stale Paper", "year": 2023}
+ self.app.update_storage(test_bibcode, "bib_data", bib_data)
+
# Set bib_data_updated to be NEWER than filename_lastmoddate
with self.app.session_scope() as session:
- session.query(Records).filter(Records.bibcode == test_bibcode).update({
- 'bib_data_updated': base_time # Current time (NEWER)
- }, synchronize_session=False)
+ session.query(Records).filter(Records.bibcode == test_bibcode).update(
+ {"bib_data_updated": base_time}, # Current time (NEWER)
+ synchronize_session=False,
+ )
session.commit()
-
+
# Create existing sitemap entry with OLDER timestamp
with self.app.session_scope() as session:
- record = session.query(Records).filter(Records.bibcode == test_bibcode).first()
-
+ record = (
+ session.query(Records).filter(Records.bibcode == test_bibcode).first()
+ )
+
sitemap_info = SitemapInfo(
record_id=record.id,
bibcode=test_bibcode,
- sitemap_filename='sitemap_bib_1.xml',
- filename_lastmoddate=base_time - timedelta(hours=2), # OLDER than bib_data_updated
- update_flag=False
+ sitemap_filename="sitemap_bib_1.xml",
+ filename_lastmoddate=base_time
+ - timedelta(hours=2), # OLDER than bib_data_updated
+ update_flag=False,
)
session.add(sitemap_info)
session.commit()
-
+
# Store original sitemap_info values for comparison
with self.app.session_scope() as session:
- original_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first()
+ original_record = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode == test_bibcode)
+ .first()
+ )
original_filename_lastmoddate = original_record.filename_lastmoddate
original_sitemap_filename = original_record.sitemap_filename
original_update_flag = original_record.update_flag
-
+
# Test 'force-update' action
with self.app.session_scope() as session:
- initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 0, 'index': 1}
-
+ initial_state = {"filename": "sitemap_bib_1.xml", "count": 0, "index": 1}
+
batch_stats, _ = self.app._process_sitemap_batch(
- [test_bibcode], 'force-update', session, initial_state
+ [test_bibcode], "force-update", session, initial_state
)
-
+
# Check that sitemap_info record was updated appropriately
- sitemap_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first()
-
- self.assertEqual(batch_stats['successful'], 1, "Record should be processed successfully")
- self.assertEqual(batch_stats['failed'], 0, "No records should fail")
-
- # Verify the record was updated correctly
- self.assertTrue(sitemap_record.update_flag, "'force-update' should ALWAYS set update_flag, regardless of timestamps")
- self.assertEqual(sitemap_record.filename_lastmoddate, base_time, "filename_lastmoddate should be updated to bib_data_updated")
- self.assertEqual(sitemap_record.sitemap_filename, original_sitemap_filename, "sitemap_filename should remain unchanged")
- self.assertNotEqual(sitemap_record.update_flag, original_update_flag, "update_flag should have changed from False to True")
- self.assertNotEqual(sitemap_record.filename_lastmoddate, original_filename_lastmoddate, "filename_lastmoddate should have been updated")
+ sitemap_record = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode == test_bibcode)
+ .first()
+ )
-
+ self.assertEqual(
+ batch_stats["successful"], 1, "Record should be processed successfully"
+ )
+ self.assertEqual(batch_stats["failed"], 0, "No records should fail")
+
+ # Verify the record was updated correctly
+ self.assertTrue(
+ sitemap_record.update_flag,
+ "'force-update' should ALWAYS set update_flag, regardless of timestamps",
+ )
+ self.assertEqual(
+ sitemap_record.filename_lastmoddate,
+ base_time,
+ "filename_lastmoddate should be updated to bib_data_updated",
+ )
+ self.assertEqual(
+ sitemap_record.sitemap_filename,
+ original_sitemap_filename,
+ "sitemap_filename should remain unchanged",
+ )
+ self.assertNotEqual(
+ sitemap_record.update_flag,
+ original_update_flag,
+ "update_flag should have changed from False to True",
+ )
+ self.assertNotEqual(
+ sitemap_record.filename_lastmoddate,
+ original_filename_lastmoddate,
+ "filename_lastmoddate should have been updated",
+ )
def test_process_sitemap_batch_file_rollover(self):
"""Test sitemap file rollover when MAX_RECORDS_PER_SITEMAP is exceeded"""
-
+
# Create new records for rollover test
- rollover_bibcodes = ['2023Rollover..1..1A', '2023Rollover..2..2A']
+ rollover_bibcodes = ["2023Rollover..1..1A", "2023Rollover..2..2A"]
for bibcode in rollover_bibcodes:
- bib_data = {'title': f'Rollover Paper {bibcode}', 'year': 2023}
- self.app.update_storage(bibcode, 'bib_data', bib_data)
-
+ bib_data = {"title": f"Rollover Paper {bibcode}", "year": 2023}
+ self.app.update_storage(bibcode, "bib_data", bib_data)
+
# Set low limit to trigger rollover
- original_max = self.app.conf.get('MAX_RECORDS_PER_SITEMAP', 50000)
- self.app.conf['MAX_RECORDS_PER_SITEMAP'] = 1 # Very low limit
-
+ original_max = self.app.conf.get("MAX_RECORDS_PER_SITEMAP", 50000)
+ self.app.conf["MAX_RECORDS_PER_SITEMAP"] = 1 # Very low limit
+
try:
with self.app.session_scope() as session:
initial_state = {
- 'filename': 'sitemap_bib_3.xml',
- 'count': 1, # At limit
- 'index': 3
+ "filename": "sitemap_bib_3.xml",
+ "count": 1, # At limit
+ "index": 3,
}
-
+
batch_stats, updated_state = self.app._process_sitemap_batch(
- rollover_bibcodes, 'add', session, initial_state
+ rollover_bibcodes, "add", session, initial_state
)
-
+
# Should roll over to next file (final state after processing both records)
- self.assertEqual(updated_state['filename'], 'sitemap_bib_5.xml',
- "Final filename should be sitemap_bib_5.xml after both rollovers")
- self.assertEqual(updated_state['index'], 5, "Final index should be 5 after both rollovers")
- self.assertEqual(updated_state['count'], 1, "Final count should be 1 (second record in sitemap_bib_5.xml)")
- self.assertEqual(batch_stats['successful'], 2, "Both records should be processed successfully")
-
+ self.assertEqual(
+ updated_state["filename"],
+ "sitemap_bib_5.xml",
+ "Final filename should be sitemap_bib_5.xml after both rollovers",
+ )
+ self.assertEqual(
+ updated_state["index"],
+ 5,
+ "Final index should be 5 after both rollovers",
+ )
+ self.assertEqual(
+ updated_state["count"],
+ 1,
+ "Final count should be 1 (second record in sitemap_bib_5.xml)",
+ )
+ self.assertEqual(
+ batch_stats["successful"],
+ 2,
+ "Both records should be processed successfully",
+ )
+
# Verify database was updated correctly
- sitemap_records_db = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(rollover_bibcodes)
- ).order_by(SitemapInfo.bibcode).all()
-
- self.assertEqual(len(sitemap_records_db), 2, "Should have 2 records in database")
-
+ sitemap_records_db = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(rollover_bibcodes))
+ .order_by(SitemapInfo.bibcode)
+ .all()
+ )
+
+ self.assertEqual(
+ len(sitemap_records_db), 2, "Should have 2 records in database"
+ )
+
# Check first record (should be in sitemap_bib_4.xml after first rollover)
first_record = sitemap_records_db[0] # 2023Rollover..1..1A
- self.assertEqual(first_record.bibcode, '2023Rollover..1..1A', "First record bibcode should match")
- self.assertEqual(first_record.sitemap_filename, 'sitemap_bib_4.xml', "First record should be in sitemap_bib_4.xml")
- self.assertTrue(first_record.update_flag, "First record should have update_flag=True")
- self.assertIsNone(first_record.filename_lastmoddate, "First record should have filename_lastmoddate=None (new record)")
-
+ self.assertEqual(
+ first_record.bibcode,
+ "2023Rollover..1..1A",
+ "First record bibcode should match",
+ )
+ self.assertEqual(
+ first_record.sitemap_filename,
+ "sitemap_bib_4.xml",
+ "First record should be in sitemap_bib_4.xml",
+ )
+ self.assertTrue(
+ first_record.update_flag,
+ "First record should have update_flag=True",
+ )
+ self.assertIsNone(
+ first_record.filename_lastmoddate,
+ "First record should have filename_lastmoddate=None (new record)",
+ )
+
# Check second record (should be in sitemap_bib_5.xml after second rollover)
second_record = sitemap_records_db[1] # 2023Rollover..2..2A
- self.assertEqual(second_record.bibcode, '2023Rollover..2..2A', "Second record bibcode should match")
- self.assertEqual(second_record.sitemap_filename, 'sitemap_bib_5.xml', "Second record should be in sitemap_bib_5.xml")
- self.assertTrue(second_record.update_flag, "Second record should have update_flag=True")
- self.assertIsNone(second_record.filename_lastmoddate, "Second record should have filename_lastmoddate=None (new record)")
-
+ self.assertEqual(
+ second_record.bibcode,
+ "2023Rollover..2..2A",
+ "Second record bibcode should match",
+ )
+ self.assertEqual(
+ second_record.sitemap_filename,
+ "sitemap_bib_5.xml",
+ "Second record should be in sitemap_bib_5.xml",
+ )
+ self.assertTrue(
+ second_record.update_flag,
+ "Second record should have update_flag=True",
+ )
+ self.assertIsNone(
+ second_record.filename_lastmoddate,
+ "Second record should have filename_lastmoddate=None (new record)",
+ )
+
# Verify both records have valid record_id links
- self.assertIsNotNone(first_record.record_id, "First record should have valid record_id")
- self.assertIsNotNone(second_record.record_id, "Second record should have valid record_id")
-
+ self.assertIsNotNone(
+ first_record.record_id, "First record should have valid record_id"
+ )
+ self.assertIsNotNone(
+ second_record.record_id, "Second record should have valid record_id"
+ )
+
# Verify the Records table entries exist
- records_db = session.query(Records).filter(Records.bibcode.in_(rollover_bibcodes)).all()
- self.assertEqual(len(records_db), 2, "Should have 2 records in Records table")
-
+ records_db = (
+ session.query(Records)
+ .filter(Records.bibcode.in_(rollover_bibcodes))
+ .all()
+ )
+ self.assertEqual(
+ len(records_db), 2, "Should have 2 records in Records table"
+ )
+
# Verify record_id relationships are correct
record_ids = {r.bibcode: r.id for r in records_db}
- self.assertEqual(first_record.record_id, record_ids['2023Rollover..1..1A'], "First sitemap record_id should match Records table")
- self.assertEqual(second_record.record_id, record_ids['2023Rollover..2..2A'], "Second sitemap record_id should match Records table")
-
+ self.assertEqual(
+ first_record.record_id,
+ record_ids["2023Rollover..1..1A"],
+ "First sitemap record_id should match Records table",
+ )
+ self.assertEqual(
+ second_record.record_id,
+ record_ids["2023Rollover..2..2A"],
+ "Second sitemap record_id should match Records table",
+ )
+
finally:
# Restore original limit
- self.app.conf['MAX_RECORDS_PER_SITEMAP'] = original_max
+ self.app.conf["MAX_RECORDS_PER_SITEMAP"] = original_max
def test_process_sitemap_batch_error_handling(self):
"""Test error handling for non-existent records and exceptions"""
-
+
# Test 1: Non-existent record
- non_existent_bibcode = '2023Missing..1..1A'
-
+ non_existent_bibcode = "2023Missing..1..1A"
+
with self.app.session_scope() as session:
- initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 0, 'index': 1}
-
+ initial_state = {"filename": "sitemap_bib_1.xml", "count": 0, "index": 1}
+
batch_stats, updated_state = self.app._process_sitemap_batch(
- [non_existent_bibcode], 'add', session, initial_state
+ [non_existent_bibcode], "add", session, initial_state
)
-
- self.assertEqual(batch_stats['successful'], 0, "Non-existent record should not be processed")
- self.assertEqual(batch_stats['failed'], 1, "Non-existent record should be counted as failed")
- self.assertEqual(updated_state, initial_state, "State should not change for failed records")
-
+
+ self.assertEqual(
+ batch_stats["successful"],
+ 0,
+ "Non-existent record should not be processed",
+ )
+ self.assertEqual(
+ batch_stats["failed"],
+ 1,
+ "Non-existent record should be counted as failed",
+ )
+ self.assertEqual(
+ updated_state,
+ initial_state,
+ "State should not change for failed records",
+ )
+
# Test 2: Exception during processing
- problematic_bibcode = '2023Problem..1..1A'
- bib_data = {'title': 'Problematic Paper'}
- self.app.update_storage(problematic_bibcode, 'bib_data', bib_data)
-
+ problematic_bibcode = "2023Problem..1..1A"
+ bib_data = {"title": "Problematic Paper"}
+ self.app.update_storage(problematic_bibcode, "bib_data", bib_data)
+
# Mock should_include_in_sitemap to raise an exception
original_method = self.app.should_include_in_sitemap
+
def mock_should_include(record):
- if record.get('bibcode') == problematic_bibcode:
+ if record.get("bibcode") == problematic_bibcode:
raise Exception("Test exception")
return original_method(record)
-
+
self.app.should_include_in_sitemap = mock_should_include
-
+
try:
with self.app.session_scope() as session:
batch_stats, updated_state = self.app._process_sitemap_batch(
- [problematic_bibcode], 'add', session, initial_state
+ [problematic_bibcode], "add", session, initial_state
+ )
+
+ self.assertEqual(
+ batch_stats["successful"],
+ 0,
+ "Exception should result in 0 successful",
)
-
- self.assertEqual(batch_stats['successful'], 0, "Exception should result in 0 successful")
- self.assertEqual(batch_stats['failed'], 1, "Exception should result in 1 failed")
-
+ self.assertEqual(
+ batch_stats["failed"], 1, "Exception should result in 1 failed"
+ )
+
finally:
# Restore original method
self.app.should_include_in_sitemap = original_method
def test_process_sitemap_batch_empty_input(self):
"""Test handling of empty bibcode list"""
-
+
with self.app.session_scope() as session:
- initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 5, 'index': 1}
-
+ initial_state = {"filename": "sitemap_bib_1.xml", "count": 5, "index": 1}
+
batch_stats, updated_state = self.app._process_sitemap_batch(
- [], 'add', session, initial_state
+ [], "add", session, initial_state
+ )
+
+ self.assertEqual(
+ batch_stats["successful"], 0, "Empty batch should have 0 successful"
+ )
+ self.assertEqual(
+ batch_stats["failed"], 0, "Empty batch should have 0 failed"
+ )
+ self.assertEqual(
+ len(batch_stats["sitemap_records"]),
+ 0,
+ "Empty batch should return empty records",
+ )
+ self.assertEqual(
+ updated_state, initial_state, "Empty batch should not change state"
)
-
- self.assertEqual(batch_stats['successful'], 0, "Empty batch should have 0 successful")
- self.assertEqual(batch_stats['failed'], 0, "Empty batch should have 0 failed")
- self.assertEqual(len(batch_stats['sitemap_records']), 0, "Empty batch should return empty records")
- self.assertEqual(updated_state, initial_state, "Empty batch should not change state")
def test_process_sitemap_batch_integration(self):
"""Integration test combining multiple scenarios in realistic workflow"""
-
+
# Create a mix of different record types (realistic scenario)
test_data = [
- ('2023Integration..1..1A', 'success', 'new'), # New valid record
- ('2023Integration..2..2A', 'success', 'existing'), # Existing valid record
- ('2023Integration..3..3A', 'solr-failed', 'new'), # New but SOLR failed
+ ("2023Integration..1..1A", "success", "new"), # New valid record
+ ("2023Integration..2..2A", "success", "existing"), # Existing valid record
+ ("2023Integration..3..3A", "solr-failed", "new"), # New but SOLR failed
]
-
+
# Setup records
for bibcode, status, record_type in test_data:
- bib_data = {'title': f'Integration Test {bibcode}', 'year': 2023}
- self.app.update_storage(bibcode, 'bib_data', bib_data)
-
- if status != 'success':
- self.app.mark_processed([bibcode], 'solr', checksums=[f'checksum_{bibcode}'], status=status)
-
+ bib_data = {"title": f"Integration Test {bibcode}", "year": 2023}
+ self.app.update_storage(bibcode, "bib_data", bib_data)
+
+ if status != "success":
+ self.app.mark_processed(
+ [bibcode], "solr", checksums=[f"checksum_{bibcode}"], status=status
+ )
+
# Create existing sitemap entry for one record
with self.app.session_scope() as session:
- records = session.query(Records).filter(Records.bibcode.like('2023Integration%')).all()
+ records = (
+ session.query(Records)
+ .filter(Records.bibcode.like("2023Integration%"))
+ .all()
+ )
record_map = {r.bibcode: r.id for r in records}
-
+
existing_sitemap = SitemapInfo(
- record_id=record_map['2023Integration..2..2A'],
- bibcode='2023Integration..2..2A',
- sitemap_filename='sitemap_bib_1.xml',
+ record_id=record_map["2023Integration..2..2A"],
+ bibcode="2023Integration..2..2A",
+ sitemap_filename="sitemap_bib_1.xml",
filename_lastmoddate=adsputils.get_date() - timedelta(days=5), # Stale
- update_flag=False
+ update_flag=False,
)
session.add(existing_sitemap)
session.commit()
-
+
# Run the integration test
test_bibcodes = [item[0] for item in test_data]
-
+
with self.app.session_scope() as session:
- initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 10, 'index': 1}
-
+ initial_state = {"filename": "sitemap_bib_1.xml", "count": 10, "index": 1}
+
batch_stats, updated_state = self.app._process_sitemap_batch(
- test_bibcodes, 'add', session, initial_state
+ test_bibcodes, "add", session, initial_state
)
-
+
# Expected: 2 successful (1 new valid + 1 existing valid), 1 failed (solr-failed)
- self.assertEqual(batch_stats['successful'], 2, "Should process 1 new + 1 existing valid record")
- self.assertEqual(batch_stats['failed'], 1, "Should fail 1 solr-failed record")
+ self.assertEqual(
+ batch_stats["successful"],
+ 2,
+ "Should process 1 new + 1 existing valid record",
+ )
+ self.assertEqual(
+ batch_stats["failed"], 1, "Should fail 1 solr-failed record"
+ )
# Only 1 new record should increment count
- self.assertEqual(updated_state['count'], 11, "Only new record should increment count")
- self.assertEqual(updated_state['filename'], 'sitemap_bib_1.xml', "Should stay in same file")
-
+ self.assertEqual(
+ updated_state["count"], 11, "Only new record should increment count"
+ )
+ self.assertEqual(
+ updated_state["filename"],
+ "sitemap_bib_1.xml",
+ "Should stay in same file",
+ )
+
# Verify database state
- sitemap_records_db = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.like('2023Integration%')
- ).order_by(SitemapInfo.bibcode).all()
-
+ sitemap_records_db = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.like("2023Integration%"))
+ .order_by(SitemapInfo.bibcode)
+ .all()
+ )
+
# Should have 2 records in database (1 new + 1 existing, solr-failed was not added)
- self.assertEqual(len(sitemap_records_db), 2, "Should have 2 sitemap records in database")
-
+ self.assertEqual(
+ len(sitemap_records_db), 2, "Should have 2 sitemap records in database"
+ )
+
# Check new record (2023Integration..1..1A)
- new_record = next((r for r in sitemap_records_db if r.bibcode == '2023Integration..1..1A'), None)
+ new_record = next(
+ (
+ r
+ for r in sitemap_records_db
+ if r.bibcode == "2023Integration..1..1A"
+ ),
+ None,
+ )
self.assertIsNotNone(new_record, "New record should exist in database")
- self.assertEqual(new_record.sitemap_filename, 'sitemap_bib_1.xml', "New record should be in sitemap_bib_1.xml")
- self.assertTrue(new_record.update_flag, "New record should have update_flag=True")
- self.assertIsNone(new_record.filename_lastmoddate, "New record should have filename_lastmoddate=None")
- self.assertIsNotNone(new_record.record_id, "New record should have valid record_id")
-
- # Check existing record (2023Integration..2..2A)
- existing_record = next((r for r in sitemap_records_db if r.bibcode == '2023Integration..2..2A'), None)
- self.assertIsNotNone(existing_record, "Existing record should still exist in database")
- self.assertEqual(existing_record.sitemap_filename, 'sitemap_bib_1.xml', "Existing record should stay in sitemap_bib_1.xml")
- self.assertTrue(existing_record.update_flag, "Existing record should have update_flag=True (was updated)")
+ self.assertEqual(
+ new_record.sitemap_filename,
+ "sitemap_bib_1.xml",
+ "New record should be in sitemap_bib_1.xml",
+ )
+ self.assertTrue(
+ new_record.update_flag, "New record should have update_flag=True"
+ )
+ self.assertIsNone(
+ new_record.filename_lastmoddate,
+ "New record should have filename_lastmoddate=None",
+ )
+ self.assertIsNotNone(
+ new_record.record_id, "New record should have valid record_id"
+ )
+
+ # Check existing record (2023Integration..2..2A)
+ existing_record = next(
+ (
+ r
+ for r in sitemap_records_db
+ if r.bibcode == "2023Integration..2..2A"
+ ),
+ None,
+ )
+ self.assertIsNotNone(
+ existing_record, "Existing record should still exist in database"
+ )
+ self.assertEqual(
+ existing_record.sitemap_filename,
+ "sitemap_bib_1.xml",
+ "Existing record should stay in sitemap_bib_1.xml",
+ )
+ self.assertTrue(
+ existing_record.update_flag,
+ "Existing record should have update_flag=True (was updated)",
+ )
# filename_lastmoddate should be updated to bib_data_updated for existing record
- self.assertIsNotNone(existing_record.filename_lastmoddate, "Existing record should have filename_lastmoddate updated")
- self.assertIsNotNone(existing_record.record_id, "Existing record should have valid record_id")
-
+ self.assertIsNotNone(
+ existing_record.filename_lastmoddate,
+ "Existing record should have filename_lastmoddate updated",
+ )
+ self.assertIsNotNone(
+ existing_record.record_id, "Existing record should have valid record_id"
+ )
+
# Verify solr-failed record is NOT in sitemap database
- failed_record = next((r for r in sitemap_records_db if r.bibcode == '2023Integration..3..3A'), None)
- self.assertIsNone(failed_record, "SOLR-failed record should NOT be in sitemap database")
-
+ failed_record = next(
+ (
+ r
+ for r in sitemap_records_db
+ if r.bibcode == "2023Integration..3..3A"
+ ),
+ None,
+ )
+ self.assertIsNone(
+ failed_record, "SOLR-failed record should NOT be in sitemap database"
+ )
+
# Verify Records table has all 3 records (including the failed one)
- records_db = session.query(Records).filter(Records.bibcode.like('2023Integration%')).all()
- self.assertEqual(len(records_db), 3, "Should have 3 records in Records table (including failed one)")
-
+ records_db = (
+ session.query(Records)
+ .filter(Records.bibcode.like("2023Integration%"))
+ .all()
+ )
+ self.assertEqual(
+ len(records_db),
+ 3,
+ "Should have 3 records in Records table (including failed one)",
+ )
+
# Verify record_id relationships are correct
record_ids = {r.bibcode: r.id for r in records_db}
- self.assertEqual(new_record.record_id, record_ids['2023Integration..1..1A'], "New record record_id should match")
- self.assertEqual(existing_record.record_id, record_ids['2023Integration..2..2A'], "Existing record record_id should match")
-
-
+ self.assertEqual(
+ new_record.record_id,
+ record_ids["2023Integration..1..1A"],
+ "New record record_id should match",
+ )
+ self.assertEqual(
+ existing_record.record_id,
+ record_ids["2023Integration..2..2A"],
+ "Existing record record_id should match",
+ )
def test_bulk_insert_and_update_operations(self):
"""Test both bulk_insert_sitemap_records and bulk_update_sitemap_records in single batch"""
-
+
# Create test data - mix of new records and records that will need updates
test_bibcodes = [
- '2023BulkOps..1..1A', # Will be new (insert)
- '2023BulkOps..2..2A', # Will be new (insert)
- '2023BulkOps..3..3A', # Will be existing (update)
- '2023BulkOps..4..4A', # Will be existing (update)
+ "2023BulkOps..1..1A", # Will be new (insert)
+ "2023BulkOps..2..2A", # Will be new (insert)
+ "2023BulkOps..3..3A", # Will be existing (update)
+ "2023BulkOps..4..4A", # Will be existing (update)
]
-
+
# Create Records entries
for bibcode in test_bibcodes:
- bib_data = {'title': f'Bulk Operations Test {bibcode}', 'year': 2023}
- self.app.update_storage(bibcode, 'bib_data', bib_data)
-
+ bib_data = {"title": f"Bulk Operations Test {bibcode}", "year": 2023}
+ self.app.update_storage(bibcode, "bib_data", bib_data)
+
# Create existing SitemapInfo entries for records 3 and 4 (these will be updates)
with self.app.session_scope() as session:
- records = session.query(Records).filter(Records.bibcode.like('2023BulkOps%')).all()
+ records = (
+ session.query(Records)
+ .filter(Records.bibcode.like("2023BulkOps%"))
+ .all()
+ )
record_map = {r.bibcode: r.id for r in records}
-
+
existing_entries = [
SitemapInfo(
- record_id=record_map['2023BulkOps..3..3A'],
- bibcode='2023BulkOps..3..3A',
- sitemap_filename='sitemap_bib_1.xml',
- filename_lastmoddate=adsputils.get_date() - timedelta(days=5), # Stale
- update_flag=False
+ record_id=record_map["2023BulkOps..3..3A"],
+ bibcode="2023BulkOps..3..3A",
+ sitemap_filename="sitemap_bib_1.xml",
+ filename_lastmoddate=adsputils.get_date()
+ - timedelta(days=5), # Stale
+ update_flag=False,
),
SitemapInfo(
- record_id=record_map['2023BulkOps..4..4A'],
- bibcode='2023BulkOps..4..4A',
- sitemap_filename='sitemap_bib_1.xml',
- filename_lastmoddate=adsputils.get_date() - timedelta(days=3), # Stale
- update_flag=False
- )
+ record_id=record_map["2023BulkOps..4..4A"],
+ bibcode="2023BulkOps..4..4A",
+ sitemap_filename="sitemap_bib_1.xml",
+ filename_lastmoddate=adsputils.get_date()
+ - timedelta(days=3), # Stale
+ update_flag=False,
+ ),
]
-
+
for entry in existing_entries:
session.add(entry)
session.commit()
-
+
# Mock the bulk operations to verify they're called correctly
- with patch.object(self.app, 'bulk_insert_sitemap_records') as mock_insert, \
- patch.object(self.app, 'bulk_update_sitemap_records') as mock_update:
-
+ with patch.object(
+ self.app, "bulk_insert_sitemap_records"
+ ) as mock_insert, patch.object(
+ self.app, "bulk_update_sitemap_records"
+ ) as mock_update:
# Run the batch processing
with self.app.session_scope() as session:
- initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 10, 'index': 1}
-
+ initial_state = {
+ "filename": "sitemap_bib_1.xml",
+ "count": 10,
+ "index": 1,
+ }
+
batch_stats, updated_state = self.app._process_sitemap_batch(
- test_bibcodes, 'add', session, initial_state
+ test_bibcodes, "add", session, initial_state
)
-
+
# Verify results
- self.assertEqual(batch_stats['successful'], 4, "Should process all 4 records successfully")
- self.assertEqual(batch_stats['failed'], 0, "Should have no failures")
- self.assertEqual(updated_state['count'], 12, "Should increment count by 2 (new records only)")
-
+ self.assertEqual(
+ batch_stats["successful"],
+ 4,
+ "Should process all 4 records successfully",
+ )
+ self.assertEqual(batch_stats["failed"], 0, "Should have no failures")
+ self.assertEqual(
+ updated_state["count"],
+ 12,
+ "Should increment count by 2 (new records only)",
+ )
+
# Verify bulk_insert_sitemap_records was called with new records
- self.assertTrue(mock_insert.called, "bulk_insert_sitemap_records should be called")
+ self.assertTrue(
+ mock_insert.called, "bulk_insert_sitemap_records should be called"
+ )
insert_call_args = mock_insert.call_args[0]
insert_records = insert_call_args[0] # First argument: new_records list
insert_session = insert_call_args[1] # Second argument: session
-
+
# Should have 2 new records (records 1 and 2)
self.assertEqual(len(insert_records), 2, "Should insert 2 new records")
- insert_bibcodes = {r['bibcode'] for r in insert_records}
- expected_new = {'2023BulkOps..1..1A', '2023BulkOps..2..2A'}
- self.assertEqual(insert_bibcodes, expected_new, "Should insert correct new records")
-
+ insert_bibcodes = {r["bibcode"] for r in insert_records}
+ expected_new = {"2023BulkOps..1..1A", "2023BulkOps..2..2A"}
+ self.assertEqual(
+ insert_bibcodes, expected_new, "Should insert correct new records"
+ )
+
# Verify session parameter
- self.assertIs(insert_session, session, "Should pass correct session to bulk_insert")
-
+ self.assertIs(
+ insert_session,
+ session,
+ "Should pass correct session to bulk_insert",
+ )
+
# Verify bulk_update_sitemap_records was called with existing records
- self.assertTrue(mock_update.called, "bulk_update_sitemap_records should be called")
+ self.assertTrue(
+ mock_update.called, "bulk_update_sitemap_records should be called"
+ )
update_call_args = mock_update.call_args[0]
- update_records = update_call_args[0] # First argument: update_records list
+ update_records = update_call_args[
+ 0
+ ] # First argument: update_records list
update_session = update_call_args[1] # Second argument: session
-
+
# Should have 2 update records (records 3 and 4)
- self.assertEqual(len(update_records), 2, "Should update 2 existing records")
- update_bibcodes = {r[0]['bibcode'] for r in update_records} # r[0] is sitemap_record
- expected_updates = {'2023BulkOps..3..3A', '2023BulkOps..4..4A'}
- self.assertEqual(update_bibcodes, expected_updates, "Should update correct existing records")
-
+ self.assertEqual(
+ len(update_records), 2, "Should update 2 existing records"
+ )
+ update_bibcodes = {
+ r[0]["bibcode"] for r in update_records
+ } # r[0] is sitemap_record
+ expected_updates = {"2023BulkOps..3..3A", "2023BulkOps..4..4A"}
+ self.assertEqual(
+ update_bibcodes,
+ expected_updates,
+ "Should update correct existing records",
+ )
+
# Verify session parameter
- self.assertIs(update_session, session, "Should pass correct session to bulk_update")
-
+ self.assertIs(
+ update_session,
+ session,
+ "Should pass correct session to bulk_update",
+ )
+
# Verify update records have correct properties
for sitemap_record, sitemap_info in update_records: # Unpack tuple
- self.assertTrue(sitemap_record['update_flag'], f"Update record {sitemap_record['bibcode']} should have update_flag=True")
- self.assertIsNotNone(sitemap_record['filename_lastmoddate'], f"Update record {sitemap_record['bibcode']} should have filename_lastmoddate updated")
-
+ self.assertTrue(
+ sitemap_record["update_flag"],
+ f"Update record {sitemap_record['bibcode']} should have update_flag=True",
+ )
+ self.assertIsNotNone(
+ sitemap_record["filename_lastmoddate"],
+ f"Update record {sitemap_record['bibcode']} should have filename_lastmoddate updated",
+ )
+
# Verify insert records have correct properties
for record in insert_records:
- self.assertTrue(record['update_flag'], f"Insert record {record['bibcode']} should have update_flag=True")
- self.assertIsNone(record['filename_lastmoddate'], f"Insert record {record['bibcode']} should have filename_lastmoddate=None")
- self.assertEqual(record['sitemap_filename'], 'sitemap_bib_1.xml', f"Insert record {record['bibcode']} should be in correct file")
-
-
+ self.assertTrue(
+ record["update_flag"],
+ f"Insert record {record['bibcode']} should have update_flag=True",
+ )
+ self.assertIsNone(
+ record["filename_lastmoddate"],
+ f"Insert record {record['bibcode']} should have filename_lastmoddate=None",
+ )
+ self.assertEqual(
+ record["sitemap_filename"],
+ "sitemap_bib_1.xml",
+ f"Insert record {record['bibcode']} should be in correct file",
+ )
def test_bulk_operations_error_handling(self):
"""Test error handling in bulk database operations during _process_sitemap_batch"""
-
+
# Create test data
- test_bibcodes = ['2023BulkError..1..1A', '2023BulkError..2..2A']
-
+ test_bibcodes = ["2023BulkError..1..1A", "2023BulkError..2..2A"]
+
for bibcode in test_bibcodes:
- bib_data = {'title': f'Bulk Error Test {bibcode}', 'year': 2023}
- self.app.update_storage(bibcode, 'bib_data', bib_data)
-
+ bib_data = {"title": f"Bulk Error Test {bibcode}", "year": 2023}
+ self.app.update_storage(bibcode, "bib_data", bib_data)
+
# Mock bulk_insert to raise an exception
- with patch.object(self.app, 'bulk_insert_sitemap_records', side_effect=Exception("Database insert failed")):
-
+ with patch.object(
+ self.app,
+ "bulk_insert_sitemap_records",
+ side_effect=Exception("Database insert failed"),
+ ):
with self.app.session_scope() as session:
- initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 10, 'index': 1}
-
+ initial_state = {
+ "filename": "sitemap_bib_1.xml",
+ "count": 10,
+ "index": 1,
+ }
+
# Should raise the exception from bulk operations
with self.assertRaises(Exception) as context:
- self.app._process_sitemap_batch(test_bibcodes, 'add', session, initial_state)
-
+ self.app._process_sitemap_batch(
+ test_bibcodes, "add", session, initial_state
+ )
+
self.assertIn("Database insert failed", str(context.exception))
def test_bulk_operations_empty_scenarios(self):
"""Test bulk operations when there are no records to insert or update"""
-
+
# Create test records that will all be filtered out by SOLR status
- test_bibcodes = ['2023BulkEmpty..1..1A', '2023BulkEmpty..2..2A']
-
+ test_bibcodes = ["2023BulkEmpty..1..1A", "2023BulkEmpty..2..2A"]
+
for bibcode in test_bibcodes:
- bib_data = {'title': f'Bulk Empty Test {bibcode}', 'year': 2023}
- self.app.update_storage(bibcode, 'bib_data', bib_data)
+ bib_data = {"title": f"Bulk Empty Test {bibcode}", "year": 2023}
+ self.app.update_storage(bibcode, "bib_data", bib_data)
# Mark as solr-failed so they get filtered out
- self.app.mark_processed([bibcode], 'solr', checksums=[f'checksum_{bibcode}'], status='solr-failed')
-
+ self.app.mark_processed(
+ [bibcode],
+ "solr",
+ checksums=[f"checksum_{bibcode}"],
+ status="solr-failed",
+ )
+
# Mock the bulk operations to verify they're not called
- with patch.object(self.app, 'bulk_insert_sitemap_records') as mock_insert, \
- patch.object(self.app, 'bulk_update_sitemap_records') as mock_update:
-
+ with patch.object(
+ self.app, "bulk_insert_sitemap_records"
+ ) as mock_insert, patch.object(
+ self.app, "bulk_update_sitemap_records"
+ ) as mock_update:
with self.app.session_scope() as session:
- initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 10, 'index': 1}
-
+ initial_state = {
+ "filename": "sitemap_bib_1.xml",
+ "count": 10,
+ "index": 1,
+ }
+
batch_stats, updated_state = self.app._process_sitemap_batch(
- test_bibcodes, 'add', session, initial_state
+ test_bibcodes, "add", session, initial_state
)
-
+
# Verify results
- self.assertEqual(batch_stats['successful'], 0, "Should have no successful records")
- self.assertEqual(batch_stats['failed'], 2, "Should have 2 failed records (filtered out)")
- self.assertEqual(updated_state['count'], 10, "Count should not change")
-
+ self.assertEqual(
+ batch_stats["successful"], 0, "Should have no successful records"
+ )
+ self.assertEqual(
+ batch_stats["failed"],
+ 2,
+ "Should have 2 failed records (filtered out)",
+ )
+ self.assertEqual(updated_state["count"], 10, "Count should not change")
+
# Verify bulk operations were not called (no valid records to process)
- self.assertFalse(mock_insert.called, "bulk_insert_sitemap_records should not be called")
- self.assertFalse(mock_update.called, "bulk_update_sitemap_records should not be called")
-
+ self.assertFalse(
+ mock_insert.called,
+ "bulk_insert_sitemap_records should not be called",
+ )
+ self.assertFalse(
+ mock_update.called,
+ "bulk_update_sitemap_records should not be called",
+ )
def test_bulk_update_sitemap_records(self):
"""Test bulk_update_sitemap_records method with performance timing"""
-
+
# Create test records
test_bibcodes = []
for i in range(100):
- bibcode = f'2023BulkUpdate..{i:04d}..{i:04d}A'
+ bibcode = f"2023BulkUpdate..{i:04d}..{i:04d}A"
test_bibcodes.append(bibcode)
- bib_data = {'title': f'Bulk Update Test {i}', 'year': 2023}
- self.app.update_storage(bibcode, 'bib_data', bib_data)
-
+ bib_data = {"title": f"Bulk Update Test {i}", "year": 2023}
+ self.app.update_storage(bibcode, "bib_data", bib_data)
+
# Create initial sitemap entries
with self.app.session_scope() as session:
- records = session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all()
+ records = (
+ session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all()
+ )
record_map = {r.bibcode: r.id for r in records}
-
+
for i, bibcode in enumerate(test_bibcodes):
sitemap_info = SitemapInfo(
record_id=record_map[bibcode],
bibcode=bibcode,
- sitemap_filename=f'sitemap_bib_{(i // 50) + 1}.xml', # 50 per file
- filename_lastmoddate=adsputils.get_date() - timedelta(hours=i), # Different timestamps
- update_flag=False
+ sitemap_filename=f"sitemap_bib_{(i // 50) + 1}.xml", # 50 per file
+ filename_lastmoddate=adsputils.get_date()
+ - timedelta(hours=i), # Different timestamps
+ update_flag=False,
)
session.add(sitemap_info)
session.commit()
-
+
# Prepare update records (tuples of sitemap_record, sitemap_info)
update_records = []
new_timestamp = adsputils.get_date()
-
+
with self.app.session_scope() as session:
- sitemap_infos = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(test_bibcodes)
- ).all()
-
+ sitemap_infos = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(test_bibcodes))
+ .all()
+ )
+
for sitemap_info in sitemap_infos:
# Create sitemap_record dict with updated values
sitemap_record = {
- 'bibcode': sitemap_info.bibcode,
- 'bib_data_updated': new_timestamp,
- 'sitemap_filename': sitemap_info.sitemap_filename,
- 'filename_lastmoddate': new_timestamp, # Updated timestamp
- 'update_flag': True # Mark for regeneration
+ "bibcode": sitemap_info.bibcode,
+ "bib_data_updated": new_timestamp,
+ "sitemap_filename": sitemap_info.sitemap_filename,
+ "filename_lastmoddate": new_timestamp, # Updated timestamp
+ "update_flag": True, # Mark for regeneration
}
-
+
# Create sitemap_info dict with id for bulk update
sitemap_info_dict = {
- 'id': sitemap_info.id,
- 'bibcode': sitemap_info.bibcode,
- 'sitemap_filename': sitemap_info.sitemap_filename
+ "id": sitemap_info.id,
+ "bibcode": sitemap_info.bibcode,
+ "sitemap_filename": sitemap_info.sitemap_filename,
}
-
+
update_records.append((sitemap_record, sitemap_info_dict))
-
+
# Test bulk update with performance timing
with self.app.session_scope() as session:
start_time = adsputils.get_date()
-
+
self.app.bulk_update_sitemap_records(update_records, session)
session.commit()
-
+
end_time = adsputils.get_date()
update_time = (end_time - start_time).total_seconds()
-
+
# Performance assertion
- self.assertLess(update_time, 5.0, f"Bulk update took {update_time:.3f}s, should be under 5s")
-
- print(f"bulk_update_sitemap_records performance: 100 records updated in {update_time:.3f}s")
-
+ self.assertLess(
+ update_time,
+ 5.0,
+ f"Bulk update took {update_time:.3f}s, should be under 5s",
+ )
+
+ print(
+ f"bulk_update_sitemap_records performance: 100 records updated in {update_time:.3f}s"
+ )
+
# Verify all records were updated correctly
with self.app.session_scope() as session:
- updated_records = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(test_bibcodes)
- ).all()
-
- self.assertEqual(len(updated_records), 100, "All 100 records should still exist")
-
+ updated_records = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(test_bibcodes))
+ .all()
+ )
+
+ self.assertEqual(
+ len(updated_records), 100, "All 100 records should still exist"
+ )
+
for record in updated_records:
# Verify update_flag was set to True
- self.assertTrue(record.update_flag, f"Record {record.bibcode} should have update_flag=True")
-
+ self.assertTrue(
+ record.update_flag,
+ f"Record {record.bibcode} should have update_flag=True",
+ )
+
# Verify filename_lastmoddate was updated (should be close to new_timestamp)
- time_diff = abs((record.filename_lastmoddate - new_timestamp).total_seconds())
- self.assertLess(time_diff, 60, f"Record {record.bibcode} filename_lastmoddate should be updated")
-
+ time_diff = abs(
+ (record.filename_lastmoddate - new_timestamp).total_seconds()
+ )
+ self.assertLess(
+ time_diff,
+ 60,
+ f"Record {record.bibcode} filename_lastmoddate should be updated",
+ )
+
# Verify bib_data_updated was updated
- bib_time_diff = abs((record.bib_data_updated - new_timestamp).total_seconds())
- self.assertLess(bib_time_diff, 60, f"Record {record.bibcode} bib_data_updated should be updated")
-
+ bib_time_diff = abs(
+ (record.bib_data_updated - new_timestamp).total_seconds()
+ )
+ self.assertLess(
+ bib_time_diff,
+ 60,
+ f"Record {record.bibcode} bib_data_updated should be updated",
+ )
+
# Verify sitemap_filename remains unchanged
- expected_filename = f'sitemap_bib_{(test_bibcodes.index(record.bibcode) // 50) + 1}.xml'
- self.assertEqual(record.sitemap_filename, expected_filename,
- f"Record {record.bibcode} sitemap_filename should remain unchanged")
-
+ expected_filename = (
+ f"sitemap_bib_{(test_bibcodes.index(record.bibcode) // 50) + 1}.xml"
+ )
+ self.assertEqual(
+ record.sitemap_filename,
+ expected_filename,
+ f"Record {record.bibcode} sitemap_filename should remain unchanged",
+ )
+
# Test edge cases
-
+
# Test 1: Empty update_records list
with self.app.session_scope() as session:
# Should not raise an exception
self.app.bulk_update_sitemap_records([], session)
-
+
# Test 2: Single record update
single_update = [(update_records[0][0], update_records[0][1])] # First record
-
+
with self.app.session_scope() as session:
# Change update_flag back to False for testing
session.query(SitemapInfo).filter(
SitemapInfo.bibcode == test_bibcodes[0]
- ).update({'update_flag': False}, synchronize_session=False)
+ ).update({"update_flag": False}, synchronize_session=False)
session.commit()
-
+
# Update with new values
- single_update[0][0]['update_flag'] = True
- single_update[0][0]['filename_lastmoddate'] = adsputils.get_date()
-
+ single_update[0][0]["update_flag"] = True
+ single_update[0][0]["filename_lastmoddate"] = adsputils.get_date()
+
self.app.bulk_update_sitemap_records(single_update, session)
session.commit()
-
+
# Verify single record was updated
- updated_record = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode == test_bibcodes[0]
- ).first()
-
- self.assertTrue(updated_record.update_flag, "Single record should have update_flag=True")
-
+ updated_record = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode == test_bibcodes[0])
+ .first()
+ )
+
+ self.assertTrue(
+ updated_record.update_flag, "Single record should have update_flag=True"
+ )
+
# Test 3: Partial field updates (only some fields provided in sitemap_record)
with self.app.session_scope() as session:
# Get the record ID within the active session
- second_record = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode == test_bibcodes[1]
- ).first()
-
+ second_record = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode == test_bibcodes[1])
+ .first()
+ )
+
partial_update_record = {
- 'bibcode': test_bibcodes[1],
- 'update_flag': False # Only updating this field
- }
-
- partial_sitemap_info = {
- 'id': second_record.id,
- 'bibcode': test_bibcodes[1]
+ "bibcode": test_bibcodes[1],
+ "update_flag": False, # Only updating this field
}
-
+
+ partial_sitemap_info = {"id": second_record.id, "bibcode": test_bibcodes[1]}
+
partial_updates = [(partial_update_record, partial_sitemap_info)]
-
+
self.app.bulk_update_sitemap_records(partial_updates, session)
session.commit()
-
+
# Verify only update_flag was changed
- partially_updated = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode == test_bibcodes[1]
- ).first()
-
- self.assertFalse(partially_updated.update_flag, "Partial update should set update_flag=False")
+ partially_updated = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode == test_bibcodes[1])
+ .first()
+ )
+
+ self.assertFalse(
+ partially_updated.update_flag,
+ "Partial update should set update_flag=False",
+ )
# Other fields should remain as they were (not None)
- self.assertIsNotNone(partially_updated.sitemap_filename, "sitemap_filename should not be cleared")
- self.assertIsNotNone(partially_updated.filename_lastmoddate, "filename_lastmoddate should not be cleared")
-
+ self.assertIsNotNone(
+ partially_updated.sitemap_filename,
+ "sitemap_filename should not be cleared",
+ )
+ self.assertIsNotNone(
+ partially_updated.filename_lastmoddate,
+ "filename_lastmoddate should not be cleared",
+ )
def test_bulk_insert_sitemap_records(self):
"""Test bulk_insert_sitemap_records method with performance timing"""
-
+
# Create test records in Records table first
test_bibcodes = []
for i in range(200):
- bibcode = f'2023BulkInsert..{i:04d}..{i:04d}A'
+ bibcode = f"2023BulkInsert..{i:04d}..{i:04d}A"
test_bibcodes.append(bibcode)
- bib_data = {'title': f'Bulk Insert Test {i}', 'year': 2023}
- self.app.update_storage(bibcode, 'bib_data', bib_data)
-
+ bib_data = {"title": f"Bulk Insert Test {i}", "year": 2023}
+ self.app.update_storage(bibcode, "bib_data", bib_data)
+
# Get record IDs for foreign key relationships
with self.app.session_scope() as session:
- records = session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all()
+ records = (
+ session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all()
+ )
record_map = {r.bibcode: r.id for r in records}
-
+
# Prepare sitemap records for bulk insert
sitemap_records = []
base_timestamp = adsputils.get_date()
-
+
for i, bibcode in enumerate(test_bibcodes):
sitemap_record = {
- 'record_id': record_map[bibcode],
- 'bibcode': bibcode,
- 'sitemap_filename': f'sitemap_bib_{(i // 100) + 1}.xml', # 100 per file
- 'bib_data_updated': base_timestamp - timedelta(minutes=i), # Different timestamps
- 'filename_lastmoddate': None, # New records start with None
- 'update_flag': True # New records need file generation
+ "record_id": record_map[bibcode],
+ "bibcode": bibcode,
+ "sitemap_filename": f"sitemap_bib_{(i // 100) + 1}.xml", # 100 per file
+ "bib_data_updated": base_timestamp
+ - timedelta(minutes=i), # Different timestamps
+ "filename_lastmoddate": None, # New records start with None
+ "update_flag": True, # New records need file generation
}
sitemap_records.append(sitemap_record)
-
+
# Test bulk insert with performance timing
with self.app.session_scope() as session:
# Verify no sitemap records exist initially
- initial_count = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(test_bibcodes)
- ).count()
+ initial_count = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(test_bibcodes))
+ .count()
+ )
self.assertEqual(initial_count, 0, "Should start with no sitemap records")
-
+
start_time = adsputils.get_date()
-
+
self.app.bulk_insert_sitemap_records(sitemap_records, session)
session.commit()
-
+
end_time = adsputils.get_date()
insert_time = (end_time - start_time).total_seconds()
-
+
# Performance assertion
- self.assertLess(insert_time, 5.0, f"Bulk insert took {insert_time:.3f}s, should be under 5s")
-
- print(f"bulk_insert_sitemap_records performance: 200 records inserted in {insert_time:.3f}s")
-
+ self.assertLess(
+ insert_time,
+ 5.0,
+ f"Bulk insert took {insert_time:.3f}s, should be under 5s",
+ )
+
+ print(
+ f"bulk_insert_sitemap_records performance: 200 records inserted in {insert_time:.3f}s"
+ )
+
# Verify all records were inserted correctly
with self.app.session_scope() as session:
- inserted_records = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(test_bibcodes)
- ).order_by(SitemapInfo.bibcode).all()
-
- self.assertEqual(len(inserted_records), 200, "All 200 records should be inserted")
-
+ inserted_records = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(test_bibcodes))
+ .order_by(SitemapInfo.bibcode)
+ .all()
+ )
+
+ self.assertEqual(
+ len(inserted_records), 200, "All 200 records should be inserted"
+ )
+
# Verify record distribution across files
file_counts = {}
for record in inserted_records:
filename = record.sitemap_filename
file_counts[filename] = file_counts.get(filename, 0) + 1
-
+
# Should have 2 files with 100 records each
self.assertEqual(len(file_counts), 2, "Should have exactly 2 sitemap files")
- self.assertEqual(file_counts.get('sitemap_bib_1.xml', 0), 100, "First file should have 100 records")
- self.assertEqual(file_counts.get('sitemap_bib_2.xml', 0), 100, "Second file should have 100 records")
-
+ self.assertEqual(
+ file_counts.get("sitemap_bib_1.xml", 0),
+ 100,
+ "First file should have 100 records",
+ )
+ self.assertEqual(
+ file_counts.get("sitemap_bib_2.xml", 0),
+ 100,
+ "Second file should have 100 records",
+ )
+
# Verify individual record properties
for i, record in enumerate(inserted_records):
expected_bibcode = test_bibcodes[i]
- self.assertEqual(record.bibcode, expected_bibcode, f"Record {i} bibcode should match")
-
+ self.assertEqual(
+ record.bibcode, expected_bibcode, f"Record {i} bibcode should match"
+ )
+
# Verify foreign key relationship
- self.assertEqual(record.record_id, record_map[expected_bibcode],
- f"Record {expected_bibcode} should have correct record_id")
-
+ self.assertEqual(
+ record.record_id,
+ record_map[expected_bibcode],
+ f"Record {expected_bibcode} should have correct record_id",
+ )
+
# Verify initial values for new records
- self.assertTrue(record.update_flag, f"Record {expected_bibcode} should have update_flag=True")
- self.assertIsNone(record.filename_lastmoddate, f"Record {expected_bibcode} should have filename_lastmoddate=None")
-
+ self.assertTrue(
+ record.update_flag,
+ f"Record {expected_bibcode} should have update_flag=True",
+ )
+ self.assertIsNone(
+ record.filename_lastmoddate,
+ f"Record {expected_bibcode} should have filename_lastmoddate=None",
+ )
+
# Verify sitemap filename assignment
- expected_filename = f'sitemap_bib_{(i // 100) + 1}.xml'
- self.assertEqual(record.sitemap_filename, expected_filename,
- f"Record {expected_bibcode} should be in {expected_filename}")
-
+ expected_filename = f"sitemap_bib_{(i // 100) + 1}.xml"
+ self.assertEqual(
+ record.sitemap_filename,
+ expected_filename,
+ f"Record {expected_bibcode} should be in {expected_filename}",
+ )
+
# Verify timestamp was set
- self.assertIsNotNone(record.bib_data_updated, f"Record {expected_bibcode} should have bib_data_updated")
-
+ self.assertIsNotNone(
+ record.bib_data_updated,
+ f"Record {expected_bibcode} should have bib_data_updated",
+ )
+
# Verify timestamp precision (should be within expected range)
expected_time = base_timestamp - timedelta(minutes=i)
- time_diff = abs((record.bib_data_updated - expected_time).total_seconds())
- self.assertLess(time_diff, 60, f"Record {expected_bibcode} timestamp should be accurate")
-
+ time_diff = abs(
+ (record.bib_data_updated - expected_time).total_seconds()
+ )
+ self.assertLess(
+ time_diff,
+ 60,
+ f"Record {expected_bibcode} timestamp should be accurate",
+ )
+
# Test edge cases
-
+
# Test 1: Empty batch_stats['sitemap_records'] list
with self.app.session_scope() as session:
# Should not raise an exception
self.app.bulk_insert_sitemap_records([], session)
session.commit()
-
+
# Test 2: Single record insert
- single_bibcode = '2023SingleInsert..1..1A'
- single_bib_data = {'title': 'Single Insert Test', 'year': 2023}
- self.app.update_storage(single_bibcode, 'bib_data', single_bib_data)
-
+ single_bibcode = "2023SingleInsert..1..1A"
+ single_bib_data = {"title": "Single Insert Test", "year": 2023}
+ self.app.update_storage(single_bibcode, "bib_data", single_bib_data)
+
with self.app.session_scope() as session:
- single_record = session.query(Records).filter(Records.bibcode == single_bibcode).first()
-
+ single_record = (
+ session.query(Records).filter(Records.bibcode == single_bibcode).first()
+ )
+
single_sitemap_record = {
- 'record_id': single_record.id,
- 'bibcode': single_bibcode,
- 'sitemap_filename': 'sitemap_bib_single.xml',
- 'bib_data_updated': adsputils.get_date(),
- 'filename_lastmoddate': None,
- 'update_flag': True
+ "record_id": single_record.id,
+ "bibcode": single_bibcode,
+ "sitemap_filename": "sitemap_bib_single.xml",
+ "bib_data_updated": adsputils.get_date(),
+ "filename_lastmoddate": None,
+ "update_flag": True,
}
-
+
self.app.bulk_insert_sitemap_records([single_sitemap_record], session)
session.commit()
-
+
# Verify single record was inserted
- inserted_single = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode == single_bibcode
- ).first()
-
+ inserted_single = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode == single_bibcode)
+ .first()
+ )
+
self.assertIsNotNone(inserted_single, "Single record should be inserted")
- self.assertEqual(inserted_single.bibcode, single_bibcode, "Single record bibcode should match")
- self.assertEqual(inserted_single.sitemap_filename, 'sitemap_bib_single.xml', "Single record filename should match")
- self.assertTrue(inserted_single.update_flag, "Single record should have update_flag=True")
-
+ self.assertEqual(
+ inserted_single.bibcode,
+ single_bibcode,
+ "Single record bibcode should match",
+ )
+ self.assertEqual(
+ inserted_single.sitemap_filename,
+ "sitemap_bib_single.xml",
+ "Single record filename should match",
+ )
+ self.assertTrue(
+ inserted_single.update_flag,
+ "Single record should have update_flag=True",
+ )
+
# Test 3: Minimal required fields (test with only required fields)
- minimal_bibcode = '2023MinimalInsert..1..1A'
- minimal_bib_data = {'title': 'Minimal Insert Test', 'year': 2023}
- self.app.update_storage(minimal_bibcode, 'bib_data', minimal_bib_data)
-
+ minimal_bibcode = "2023MinimalInsert..1..1A"
+ minimal_bib_data = {"title": "Minimal Insert Test", "year": 2023}
+ self.app.update_storage(minimal_bibcode, "bib_data", minimal_bib_data)
+
with self.app.session_scope() as session:
- minimal_record = session.query(Records).filter(Records.bibcode == minimal_bibcode).first()
-
+ minimal_record = (
+ session.query(Records)
+ .filter(Records.bibcode == minimal_bibcode)
+ .first()
+ )
+
minimal_sitemap_record = {
- 'record_id': minimal_record.id,
- 'bibcode': minimal_bibcode,
+ "record_id": minimal_record.id,
+ "bibcode": minimal_bibcode,
# Only required fields, test defaults
}
-
+
self.app.bulk_insert_sitemap_records([minimal_sitemap_record], session)
session.commit()
-
+
# Verify minimal record was inserted with defaults
- inserted_minimal = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode == minimal_bibcode
- ).first()
-
+ inserted_minimal = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode == minimal_bibcode)
+ .first()
+ )
+
self.assertIsNotNone(inserted_minimal, "Minimal record should be inserted")
- self.assertEqual(inserted_minimal.bibcode, minimal_bibcode, "Minimal record bibcode should match")
- self.assertEqual(inserted_minimal.record_id, minimal_record.id, "Minimal record should have correct record_id")
+ self.assertEqual(
+ inserted_minimal.bibcode,
+ minimal_bibcode,
+ "Minimal record bibcode should match",
+ )
+ self.assertEqual(
+ inserted_minimal.record_id,
+ minimal_record.id,
+ "Minimal record should have correct record_id",
+ )
# Other fields should have their database defaults
- self.assertIsNone(inserted_minimal.sitemap_filename, "Minimal record should have default sitemap_filename")
- self.assertFalse(inserted_minimal.update_flag, "Minimal record should have default update_flag=False")
-
+ self.assertIsNone(
+ inserted_minimal.sitemap_filename,
+ "Minimal record should have default sitemap_filename",
+ )
+ self.assertFalse(
+ inserted_minimal.update_flag,
+ "Minimal record should have default update_flag=False",
+ )
+
# Test 4: Verify no duplicate inserts (attempt to insert same bibcode twice should fail)
- duplicate_bibcode = '2023DuplicateTest..1..1A'
- duplicate_bib_data = {'title': 'Duplicate Test', 'year': 2023}
- self.app.update_storage(duplicate_bibcode, 'bib_data', duplicate_bib_data)
-
+ duplicate_bibcode = "2023DuplicateTest..1..1A"
+ duplicate_bib_data = {"title": "Duplicate Test", "year": 2023}
+ self.app.update_storage(duplicate_bibcode, "bib_data", duplicate_bib_data)
+
with self.app.session_scope() as session:
- duplicate_record = session.query(Records).filter(Records.bibcode == duplicate_bibcode).first()
-
+ duplicate_record = (
+ session.query(Records)
+ .filter(Records.bibcode == duplicate_bibcode)
+ .first()
+ )
+
duplicate_sitemap_record = {
- 'record_id': duplicate_record.id,
- 'bibcode': duplicate_bibcode,
- 'sitemap_filename': 'sitemap_bib_duplicate.xml',
- 'update_flag': True
+ "record_id": duplicate_record.id,
+ "bibcode": duplicate_bibcode,
+ "sitemap_filename": "sitemap_bib_duplicate.xml",
+ "update_flag": True,
}
-
+
# First insert should succeed
self.app.bulk_insert_sitemap_records([duplicate_sitemap_record], session)
session.commit()
-
+
# Second insert of same bibcode should fail due to UNIQUE constraint
with self.assertRaises(Exception): # Should raise IntegrityError or similar
- with self.app.session_scope() as new_session:
- self.app.bulk_insert_sitemap_records([duplicate_sitemap_record], new_session)
+ with self.app.session_scope() as new_session:
+ self.app.bulk_insert_sitemap_records(
+ [duplicate_sitemap_record], new_session
+ )
new_session.commit()
-
def test_delete_contents(self):
"""Test delete_contents method"""
-
+
# Create test records in SitemapInfo table
- test_bibcodes = ['2023DeleteTest..1..1A', '2023DeleteTest..2..2A', '2023DeleteTest..3..3A']
-
+ test_bibcodes = [
+ "2023DeleteTest..1..1A",
+ "2023DeleteTest..2..2A",
+ "2023DeleteTest..3..3A",
+ ]
+
for bibcode in test_bibcodes:
- bib_data = {'title': f'Delete Test {bibcode}', 'year': 2023}
- self.app.update_storage(bibcode, 'bib_data', bib_data)
-
+ bib_data = {"title": f"Delete Test {bibcode}", "year": 2023}
+ self.app.update_storage(bibcode, "bib_data", bib_data)
+
# Create sitemap entries
with self.app.session_scope() as session:
- records = session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all()
-
+ records = (
+ session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all()
+ )
+
for record in records:
sitemap_info = SitemapInfo(
record_id=record.id,
bibcode=record.bibcode,
- sitemap_filename='sitemap_bib_test.xml',
- update_flag=True
+ sitemap_filename="sitemap_bib_test.xml",
+ update_flag=True,
)
session.add(sitemap_info)
session.commit()
-
+
# Verify records exist before deletion
with self.app.session_scope() as session:
- initial_count = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(test_bibcodes)
- ).count()
- self.assertEqual(initial_count, 3, "Should have 3 sitemap records before deletion")
-
+ initial_count = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(test_bibcodes))
+ .count()
+ )
+ self.assertEqual(
+ initial_count, 3, "Should have 3 sitemap records before deletion"
+ )
+
# Test delete_contents
self.app.delete_contents(SitemapInfo)
-
+
# Verify all records were deleted
with self.app.session_scope() as session:
final_count = session.query(SitemapInfo).count()
self.assertEqual(final_count, 0, "All sitemap records should be deleted")
-
+
# Verify Records table is unaffected
- records_count = session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).count()
+ records_count = (
+ session.query(Records)
+ .filter(Records.bibcode.in_(test_bibcodes))
+ .count()
+ )
self.assertEqual(records_count, 3, "Records table should be unaffected")
-
def test_backup_sitemap_files(self):
"""Test backup_sitemap_files method"""
-
# Create temporary directory for test
with tempfile.TemporaryDirectory() as temp_dir:
# Create test sitemap files
- test_files = ['sitemap_bib_1.xml', 'sitemap_bib_2.xml', 'sitemap_index.xml']
-
+ test_files = ["sitemap_bib_1.xml", "sitemap_bib_2.xml", "sitemap_index.xml"]
+
for filename in test_files:
file_path = os.path.join(temp_dir, filename)
- with open(file_path, 'w') as f:
- f.write(f'Test content for {filename}')
-
+ with open(file_path, "w") as f:
+ f.write(f"Test content for {filename}")
+
# Verify files exist before backup
initial_files = os.listdir(temp_dir)
- self.assertEqual(len(initial_files), 3, "Should have 3 test files before backup")
+ self.assertEqual(
+ len(initial_files), 3, "Should have 3 test files before backup"
+ )
for filename in test_files:
- self.assertIn(filename, initial_files, f"File {filename} should exist before backup")
-
+ self.assertIn(
+ filename,
+ initial_files,
+ f"File {filename} should exist before backup",
+ )
+
# Mock os.system to capture the backup commands
backup_commands = []
original_system = os.system
-
+
def mock_system(command):
backup_commands.append(command)
# Execute mkdir command but skip mv command for testing
- if command.startswith('mkdir'):
+ if command.startswith("mkdir"):
return original_system(command)
return 0 # Success for mv command
-
+
# Test backup_sitemap_files with mocked os.system
- with patch('os.system', side_effect=mock_system):
+ with patch("os.system", side_effect=mock_system):
self.app.backup_sitemap_files(temp_dir)
-
+
# Verify backup commands were called
- self.assertEqual(len(backup_commands), 2, "Should execute 2 commands (mkdir + mv)")
-
+ self.assertEqual(
+ len(backup_commands), 2, "Should execute 2 commands (mkdir + mv)"
+ )
+
# Check mkdir command
mkdir_command = backup_commands[0]
- self.assertTrue(mkdir_command.startswith('mkdir -p /app/logs/tmp/sitemap_'),
- "First command should create backup directory")
-
+ self.assertTrue(
+ mkdir_command.startswith("mkdir -p /app/logs/tmp/sitemap_"),
+ "First command should create backup directory",
+ )
+
# Check mv command
mv_command = backup_commands[1]
- self.assertTrue(mv_command.startswith(f'mv {temp_dir}/*'),
- "Second command should move files from source directory")
- self.assertIn('/app/logs/tmp/sitemap_', mv_command,
- "Move command should target backup directory")
-
+ self.assertTrue(
+ mv_command.startswith(f"mv {temp_dir}/*"),
+ "Second command should move files from source directory",
+ )
+ self.assertIn(
+ "/app/logs/tmp/sitemap_",
+ mv_command,
+ "Move command should target backup directory",
+ )
+
# Verify backup directory path format (contains date components)
-
- date_pattern = r'/app/logs/tmp/sitemap_\d{4}_\d{1,2}_\d{1,2}-'
- self.assertTrue(re.search(date_pattern, mkdir_command),
- "Backup directory should contain date components")
-
+
+ date_pattern = r"/app/logs/tmp/sitemap_\d{4}_\d{1,2}_\d{1,2}-"
+ self.assertTrue(
+ re.search(date_pattern, mkdir_command),
+ "Backup directory should contain date components",
+ )
def test_execute_remove_action_basic_functionality(self):
"""Test basic functionality of _execute_remove_action method"""
-
+
# Create test records and sitemap entries
test_bibcodes = [
- '2023RemoveTest..1..1A',
- '2023RemoveTest..1..2A',
- '2023RemoveTest..1..3A'
+ "2023RemoveTest..1..1A",
+ "2023RemoveTest..1..2A",
+ "2023RemoveTest..1..3A",
]
-
+
with self.app.session_scope() as session:
# Clean up any existing test data
- session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023RemoveTest%')).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode.like('2023RemoveTest%')).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode.like("2023RemoveTest%")
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(
+ Records.bibcode.like("2023RemoveTest%")
+ ).delete(synchronize_session=False)
session.commit()
-
+
# Create Records entries
records = []
for i, bibcode in enumerate(test_bibcodes):
@@ -2385,72 +3385,107 @@ def test_execute_remove_action_basic_functionality(self):
bibcode=bibcode,
bib_data='{"title": "Test Record"}',
bib_data_updated=get_date(),
- status='success'
+ status="success",
)
session.add(record)
records.append(record)
-
+
session.flush() # Get record IDs
-
+
# Create SitemapInfo entries
sitemap_records = []
for i, (bibcode, record) in enumerate(zip(test_bibcodes, records)):
sitemap_record = SitemapInfo(
record_id=record.id,
bibcode=bibcode,
- sitemap_filename=f'sitemap_bib_{i+1}.xml',
+ sitemap_filename=f"sitemap_bib_{i+1}.xml",
bib_data_updated=get_date(),
filename_lastmoddate=get_date(),
- update_flag=False
+ update_flag=False,
)
session.add(sitemap_record)
sitemap_records.append(sitemap_record)
-
+
session.commit()
-
+
# Verify initial state
- initial_count = session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023RemoveTest%')).count()
- self.assertEqual(initial_count, 3, "Should have 3 sitemap records initially")
-
+ initial_count = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.like("2023RemoveTest%"))
+ .count()
+ )
+ self.assertEqual(
+ initial_count, 3, "Should have 3 sitemap records initially"
+ )
+
# Test removing 2 bibcodes
bibcodes_to_remove = test_bibcodes[:2] # Remove first 2
- removed_count, files_to_delete, _ = self.app._execute_remove_action(session, bibcodes_to_remove)
-
+ removed_count, files_to_delete, _ = self.app._execute_remove_action(
+ session, bibcodes_to_remove
+ )
+
# Verify results
self.assertEqual(removed_count, 2, "Should remove exactly 2 bibcodes")
- self.assertEqual(files_to_delete, {'sitemap_bib_1.xml', 'sitemap_bib_2.xml'},
- "Should identify 2 files for deletion")
-
+ self.assertEqual(
+ files_to_delete,
+ {"sitemap_bib_1.xml", "sitemap_bib_2.xml"},
+ "Should identify 2 files for deletion",
+ )
+
# Verify database state
- remaining_count = session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023RemoveTest%')).count()
- self.assertEqual(remaining_count, 1, "Should have 1 sitemap record remaining")
-
- remaining_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023RemoveTest%')).first()
- self.assertEqual(remaining_record.bibcode, test_bibcodes[2], "Should keep the third bibcode")
- self.assertFalse(remaining_record.update_flag, "Remaining record should have update_flag=False")
-
+ remaining_count = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.like("2023RemoveTest%"))
+ .count()
+ )
+ self.assertEqual(
+ remaining_count, 1, "Should have 1 sitemap record remaining"
+ )
+
+ remaining_record = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.like("2023RemoveTest%"))
+ .first()
+ )
+ self.assertEqual(
+ remaining_record.bibcode,
+ test_bibcodes[2],
+ "Should keep the third bibcode",
+ )
+ self.assertFalse(
+ remaining_record.update_flag,
+ "Remaining record should have update_flag=False",
+ )
+
# Clean up
- session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023RemoveTest%')).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode.like('2023RemoveTest%')).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode.like("2023RemoveTest%")
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(
+ Records.bibcode.like("2023RemoveTest%")
+ ).delete(synchronize_session=False)
session.commit()
-
def test_execute_remove_action_empty_files_detection(self):
"""Test that _execute_remove_action correctly identifies empty files"""
-
+
test_bibcodes = [
- '2023EmptyTest..1..1A',
- '2023EmptyTest..1..2A',
- '2023EmptyTest..1..3A',
- '2023EmptyTest..1..4A'
+ "2023EmptyTest..1..1A",
+ "2023EmptyTest..1..2A",
+ "2023EmptyTest..1..3A",
+ "2023EmptyTest..1..4A",
]
-
+
with self.app.session_scope() as session:
# Clean up any existing test data
- session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023EmptyTest%')).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode.like('2023EmptyTest%')).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode.like("2023EmptyTest%")
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(
+ Records.bibcode.like("2023EmptyTest%")
+ ).delete(synchronize_session=False)
session.commit()
-
+
# Create Records entries
records = []
for bibcode in test_bibcodes:
@@ -2458,21 +3493,21 @@ def test_execute_remove_action_empty_files_detection(self):
bibcode=bibcode,
bib_data='{"title": "Test Record"}',
bib_data_updated=get_date(),
- status='success'
+ status="success",
)
session.add(record)
records.append(record)
-
+
session.flush()
-
+
# Create SitemapInfo entries - 2 records in file1, 1 record in file2, 1 record in file3
sitemap_assignments = [
- ('sitemap_bib_1.xml', test_bibcodes[0]), # File 1: 2 records
- ('sitemap_bib_1.xml', test_bibcodes[1]),
- ('sitemap_bib_2.xml', test_bibcodes[2]), # File 2: 1 record
- ('sitemap_bib_3.xml', test_bibcodes[3]) # File 3: 1 record
+ ("sitemap_bib_1.xml", test_bibcodes[0]), # File 1: 2 records
+ ("sitemap_bib_1.xml", test_bibcodes[1]),
+ ("sitemap_bib_2.xml", test_bibcodes[2]), # File 2: 1 record
+ ("sitemap_bib_3.xml", test_bibcodes[3]), # File 3: 1 record
]
-
+
for i, (filename, bibcode) in enumerate(sitemap_assignments):
sitemap_record = SitemapInfo(
record_id=records[i].id,
@@ -2480,73 +3515,113 @@ def test_execute_remove_action_empty_files_detection(self):
sitemap_filename=filename,
bib_data_updated=get_date(),
filename_lastmoddate=get_date(),
- update_flag=False
+ update_flag=False,
)
session.add(sitemap_record)
-
+
session.commit()
-
+
# Remove records that will make file2 and file3 empty, but leave file1 with 1 record
- bibcodes_to_remove = [test_bibcodes[1], test_bibcodes[2], test_bibcodes[3]] # Remove from file1, all of file2, all of file3
- removed_count, files_to_delete, files_to_update = self.app._execute_remove_action(session, bibcodes_to_remove)
-
+ bibcodes_to_remove = [
+ test_bibcodes[1],
+ test_bibcodes[2],
+ test_bibcodes[3],
+ ] # Remove from file1, all of file2, all of file3
+ (
+ removed_count,
+ files_to_delete,
+ files_to_update,
+ ) = self.app._execute_remove_action(session, bibcodes_to_remove)
+
# Verify results
self.assertEqual(removed_count, 3, "Should remove exactly 3 bibcodes")
- self.assertEqual(files_to_delete, {'sitemap_bib_2.xml', 'sitemap_bib_3.xml'},
- "Should identify files 2 and 3 as empty")
-
+ self.assertEqual(
+ files_to_delete,
+ {"sitemap_bib_2.xml", "sitemap_bib_3.xml"},
+ "Should identify files 2 and 3 as empty",
+ )
+
# Verify file1 is in files_to_update (needs regeneration but not deletion)
- self.assertIn('sitemap_bib_1.xml', files_to_update, "File 1 should be marked for update")
-
+ self.assertIn(
+ "sitemap_bib_1.xml",
+ files_to_update,
+ "File 1 should be marked for update",
+ )
+
# Verify file1 still has records
- file1_records = session.query(SitemapInfo).filter(
- SitemapInfo.sitemap_filename == 'sitemap_bib_1.xml'
- ).all()
- self.assertEqual(len(file1_records), 1, "File 1 should have 1 remaining record")
-
+ file1_records = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.sitemap_filename == "sitemap_bib_1.xml")
+ .all()
+ )
+ self.assertEqual(
+ len(file1_records), 1, "File 1 should have 1 remaining record"
+ )
+
# Clean up
- session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023EmptyTest%')).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode.like('2023EmptyTest%')).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode.like("2023EmptyTest%")
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(
+ Records.bibcode.like("2023EmptyTest%")
+ ).delete(synchronize_session=False)
session.commit()
def test_execute_remove_action_no_matching_records(self):
"""Test _execute_remove_action with bibcodes that don't exist"""
-
+
with self.app.session_scope() as session:
# Test with non-existent bibcodes
- non_existent_bibcodes = ['2023NonExistent..1..1A', '2023NonExistent..1..2A']
- removed_count, files_to_delete, files_to_update = self.app._execute_remove_action(session, non_existent_bibcodes)
-
+ non_existent_bibcodes = ["2023NonExistent..1..1A", "2023NonExistent..1..2A"]
+ (
+ removed_count,
+ files_to_delete,
+ files_to_update,
+ ) = self.app._execute_remove_action(session, non_existent_bibcodes)
+
# Should return zero results
- self.assertEqual(removed_count, 0, "Should remove 0 bibcodes when none exist")
- self.assertEqual(files_to_delete, set(), "Should return empty set for files to delete")
-
+ self.assertEqual(
+ removed_count, 0, "Should remove 0 bibcodes when none exist"
+ )
+ self.assertEqual(
+ files_to_delete, set(), "Should return empty set for files to delete"
+ )
+
def test_execute_remove_action_empty_input(self):
"""Test _execute_remove_action with empty input"""
-
+
with self.app.session_scope() as session:
# Test with empty list
- removed_count, files_to_delete, files_to_update = self.app._execute_remove_action(session, [])
-
+ (
+ removed_count,
+ files_to_delete,
+ files_to_update,
+ ) = self.app._execute_remove_action(session, [])
+
# Should return zero results immediately
- self.assertEqual(removed_count, 0, "Should remove 0 bibcodes with empty input")
- self.assertEqual(files_to_delete, set(), "Should return empty set for files to delete")
-
+ self.assertEqual(
+ removed_count, 0, "Should remove 0 bibcodes with empty input"
+ )
+ self.assertEqual(
+ files_to_delete, set(), "Should return empty set for files to delete"
+ )
+
def test_execute_remove_action_mixed_scenarios(self):
"""Test _execute_remove_action with mixed existing/non-existing bibcodes"""
-
- test_bibcodes = [
- '2023MixedTest..1..1A',
- '2023MixedTest..1..2A'
- ]
- non_existent_bibcodes = ['2023NonExist..1..1A', '2023NonExist..1..2A']
-
+
+ test_bibcodes = ["2023MixedTest..1..1A", "2023MixedTest..1..2A"]
+ non_existent_bibcodes = ["2023NonExist..1..1A", "2023NonExist..1..2A"]
+
with self.app.session_scope() as session:
# Clean up any existing test data
- session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023MixedTest%')).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode.like('2023MixedTest%')).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode.like("2023MixedTest%")
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(
+ Records.bibcode.like("2023MixedTest%")
+ ).delete(synchronize_session=False)
session.commit()
-
+
# Create Records entries
records = []
for bibcode in test_bibcodes:
@@ -2554,60 +3629,82 @@ def test_execute_remove_action_mixed_scenarios(self):
bibcode=bibcode,
bib_data='{"title": "Test Record"}',
bib_data_updated=get_date(),
- status='success'
+ status="success",
)
session.add(record)
records.append(record)
-
+
session.flush()
-
+
# Create SitemapInfo entries
for i, (bibcode, record) in enumerate(zip(test_bibcodes, records)):
sitemap_record = SitemapInfo(
record_id=record.id,
bibcode=bibcode,
- sitemap_filename='sitemap_bib_1.xml',
+ sitemap_filename="sitemap_bib_1.xml",
bib_data_updated=get_date(),
filename_lastmoddate=get_date(),
- update_flag=False
+ update_flag=False,
)
session.add(sitemap_record)
-
+
session.commit()
-
+
# Test removing mix of existing and non-existing bibcodes
mixed_bibcodes = test_bibcodes + non_existent_bibcodes
- removed_count, files_to_delete, files_to_update = self.app._execute_remove_action(session, mixed_bibcodes)
-
+ (
+ removed_count,
+ files_to_delete,
+ files_to_update,
+ ) = self.app._execute_remove_action(session, mixed_bibcodes)
+
# Should only remove the existing ones
- self.assertEqual(removed_count, 2, "Should remove only the 2 existing bibcodes")
- self.assertEqual(files_to_delete, {'sitemap_bib_1.xml'}, "Should identify 1 file for deletion")
-
+ self.assertEqual(
+ removed_count, 2, "Should remove only the 2 existing bibcodes"
+ )
+ self.assertEqual(
+ files_to_delete,
+ {"sitemap_bib_1.xml"},
+ "Should identify 1 file for deletion",
+ )
+
# Verify database state
- remaining_count = session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023MixedTest%')).count()
- self.assertEqual(remaining_count, 0, "Should have no sitemap records remaining")
-
+ remaining_count = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.like("2023MixedTest%"))
+ .count()
+ )
+ self.assertEqual(
+ remaining_count, 0, "Should have no sitemap records remaining"
+ )
+
# Clean up
- session.query(Records).filter(Records.bibcode.like('2023MixedTest%')).delete(synchronize_session=False)
+ session.query(Records).filter(
+ Records.bibcode.like("2023MixedTest%")
+ ).delete(synchronize_session=False)
session.commit()
-
+
def test_execute_remove_action_partial_file_removal(self):
"""Test _execute_remove_action when only some records are removed from files"""
-
+
test_bibcodes = [
- '2023PartialTest..1..1A',
- '2023PartialTest..1..2A',
- '2023PartialTest..1..3A',
- '2023PartialTest..1..4A',
- '2023PartialTest..1..5A'
+ "2023PartialTest..1..1A",
+ "2023PartialTest..1..2A",
+ "2023PartialTest..1..3A",
+ "2023PartialTest..1..4A",
+ "2023PartialTest..1..5A",
]
-
+
with self.app.session_scope() as session:
# Clean up any existing test data
- session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023PartialTest%')).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode.like('2023PartialTest%')).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode.like("2023PartialTest%")
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(
+ Records.bibcode.like("2023PartialTest%")
+ ).delete(synchronize_session=False)
session.commit()
-
+
# Create Records entries
records = []
for bibcode in test_bibcodes:
@@ -2615,22 +3712,22 @@ def test_execute_remove_action_partial_file_removal(self):
bibcode=bibcode,
bib_data='{"title": "Test Record"}',
bib_data_updated=get_date(),
- status='success'
+ status="success",
)
session.add(record)
records.append(record)
-
+
session.flush()
-
+
# Create SitemapInfo entries - distribute across 2 files
sitemap_assignments = [
- ('sitemap_bib_1.xml', test_bibcodes[0]), # File 1: 3 records
- ('sitemap_bib_1.xml', test_bibcodes[1]),
- ('sitemap_bib_1.xml', test_bibcodes[2]),
- ('sitemap_bib_2.xml', test_bibcodes[3]), # File 2: 2 records
- ('sitemap_bib_2.xml', test_bibcodes[4])
+ ("sitemap_bib_1.xml", test_bibcodes[0]), # File 1: 3 records
+ ("sitemap_bib_1.xml", test_bibcodes[1]),
+ ("sitemap_bib_1.xml", test_bibcodes[2]),
+ ("sitemap_bib_2.xml", test_bibcodes[3]), # File 2: 2 records
+ ("sitemap_bib_2.xml", test_bibcodes[4]),
]
-
+
for i, (filename, bibcode) in enumerate(sitemap_assignments):
sitemap_record = SitemapInfo(
record_id=records[i].id,
@@ -2638,53 +3735,88 @@ def test_execute_remove_action_partial_file_removal(self):
sitemap_filename=filename,
bib_data_updated=get_date(),
filename_lastmoddate=get_date(),
- update_flag=False
+ update_flag=False,
)
session.add(sitemap_record)
-
+
session.commit()
-
+
# Remove 1 record from file1 and 1 record from file2 (partial removal)
- bibcodes_to_remove = [test_bibcodes[1], test_bibcodes[3]] # 1 from each file
- removed_count, files_to_delete, files_to_update = self.app._execute_remove_action(session, bibcodes_to_remove)
-
+ bibcodes_to_remove = [
+ test_bibcodes[1],
+ test_bibcodes[3],
+ ] # 1 from each file
+ (
+ removed_count,
+ files_to_delete,
+ files_to_update,
+ ) = self.app._execute_remove_action(session, bibcodes_to_remove)
+
# Verify results
self.assertEqual(removed_count, 2, "Should remove exactly 2 bibcodes")
- self.assertEqual(files_to_delete, set(), "Should not delete any files (both still have records)")
-
+ self.assertEqual(
+ files_to_delete,
+ set(),
+ "Should not delete any files (both still have records)",
+ )
+
# Verify both files are in files_to_update
- self.assertIn('sitemap_bib_1.xml', files_to_update, "File 1 should be marked for update")
- self.assertIn('sitemap_bib_2.xml', files_to_update, "File 2 should be marked for update")
-
+ self.assertIn(
+ "sitemap_bib_1.xml",
+ files_to_update,
+ "File 1 should be marked for update",
+ )
+ self.assertIn(
+ "sitemap_bib_2.xml",
+ files_to_update,
+ "File 2 should be marked for update",
+ )
+
# Verify both files still have records
- file1_records = session.query(SitemapInfo).filter(
- SitemapInfo.sitemap_filename == 'sitemap_bib_1.xml'
- ).all()
- file2_records = session.query(SitemapInfo).filter(
- SitemapInfo.sitemap_filename == 'sitemap_bib_2.xml'
- ).all()
-
- self.assertEqual(len(file1_records), 2, "File 1 should have 2 remaining records")
- self.assertEqual(len(file2_records), 1, "File 2 should have 1 remaining record")
-
+ file1_records = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.sitemap_filename == "sitemap_bib_1.xml")
+ .all()
+ )
+ file2_records = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.sitemap_filename == "sitemap_bib_2.xml")
+ .all()
+ )
+
+ self.assertEqual(
+ len(file1_records), 2, "File 1 should have 2 remaining records"
+ )
+ self.assertEqual(
+ len(file2_records), 1, "File 2 should have 1 remaining record"
+ )
+
# Clean up
- session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023PartialTest%')).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode.like('2023PartialTest%')).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode.like("2023PartialTest%")
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(
+ Records.bibcode.like("2023PartialTest%")
+ ).delete(synchronize_session=False)
session.commit()
-
+
def test_execute_remove_action_performance_with_large_batch(self):
"""Test _execute_remove_action performance with larger batch sizes"""
-
+
# Create a larger batch for performance testing
batch_size = 1000
- test_bibcodes = [f'2023PerfTest..{i:03d}..{i:03d}A' for i in range(batch_size)]
-
+ test_bibcodes = [f"2023PerfTest..{i:03d}..{i:03d}A" for i in range(batch_size)]
+
with self.app.session_scope() as session:
# Clean up any existing test data
- session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023PerfTest%')).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode.like('2023PerfTest%')).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode.like("2023PerfTest%")
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(Records.bibcode.like("2023PerfTest%")).delete(
+ synchronize_session=False
+ )
session.commit()
-
+
# Create Records entries
records = []
for bibcode in test_bibcodes:
@@ -2692,289 +3824,383 @@ def test_execute_remove_action_performance_with_large_batch(self):
bibcode=bibcode,
bib_data='{"title": "Performance Test Record"}',
bib_data_updated=get_date(),
- status='success'
+ status="success",
)
session.add(record)
records.append(record)
-
+
session.flush()
-
+
# Create SitemapInfo entries - distribute across multiple files
for i, (bibcode, record) in enumerate(zip(test_bibcodes, records)):
file_index = (i // 10) + 1 # 10 records per file
sitemap_record = SitemapInfo(
record_id=record.id,
bibcode=bibcode,
- sitemap_filename=f'sitemap_bib_{file_index}.xml',
+ sitemap_filename=f"sitemap_bib_{file_index}.xml",
bib_data_updated=get_date(),
filename_lastmoddate=get_date(),
- update_flag=False
+ update_flag=False,
)
session.add(sitemap_record)
-
+
session.commit()
-
+
# Time the removal operation
start_time = time.time()
- removed_count, files_to_delete, files_to_update = self.app._execute_remove_action(session, test_bibcodes)
+ (
+ removed_count,
+ files_to_delete,
+ files_to_update,
+ ) = self.app._execute_remove_action(session, test_bibcodes)
end_time = time.time()
-
+
execution_time = end_time - start_time
-
+
# Verify results
- self.assertEqual(removed_count, batch_size, f"Should remove all {batch_size} bibcodes")
- self.assertEqual(len(files_to_delete), 100, "Should identify 100 files for deletion")
-
+ self.assertEqual(
+ removed_count, batch_size, f"Should remove all {batch_size} bibcodes"
+ )
+ self.assertEqual(
+ len(files_to_delete), 100, "Should identify 100 files for deletion"
+ )
+
# Performance assertion - should complete reasonably quickly
- self.assertLess(execution_time, 5.0, f"Removal of {batch_size} records should complete in under 5 seconds")
-
+ self.assertLess(
+ execution_time,
+ 5.0,
+ f"Removal of {batch_size} records should complete in under 5 seconds",
+ )
+
# Verify database state
- remaining_count = session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023PerfTest%')).count()
- self.assertEqual(remaining_count, 0, "Should have no sitemap records remaining")
-
+ remaining_count = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.like("2023PerfTest%"))
+ .count()
+ )
+ self.assertEqual(
+ remaining_count, 0, "Should have no sitemap records remaining"
+ )
+
# Clean up
- session.query(Records).filter(Records.bibcode.like('2023PerfTest%')).delete(synchronize_session=False)
+ session.query(Records).filter(Records.bibcode.like("2023PerfTest%")).delete(
+ synchronize_session=False
+ )
session.commit()
-
- print(f"_execute_remove_action performance test completed in {execution_time:.3f} seconds for {batch_size} records")
-
+
+ print(
+ f"_execute_remove_action performance test completed in {execution_time:.3f} seconds for {batch_size} records"
+ )
+
def test_delete_sitemap_files(self):
"""Test delete_sitemap_files method"""
-
+
# Create temporary directory structure for test
with tempfile.TemporaryDirectory() as temp_dir:
# Mock SITES configuration
sites_config = {
- 'ads': {'base_url': 'https://ui.adsabs.harvard.edu/'},
- 'scix': {'base_url': 'https://scixplorer.org/'}
+ "ads": {"base_url": "https://ui.adsabs.harvard.edu/"},
+ "scix": {"base_url": "https://scixplorer.org/"},
}
-
+
# Create site directories and test files
- test_files = ['sitemap_bib_1.xml', 'sitemap_bib_2.xml', 'sitemap_index.xml']
+ test_files = ["sitemap_bib_1.xml", "sitemap_bib_2.xml", "sitemap_index.xml"]
created_files = []
-
+
for site_key in sites_config.keys():
site_dir = os.path.join(temp_dir, site_key)
os.makedirs(site_dir)
-
+
for filename in test_files:
file_path = os.path.join(site_dir, filename)
- with open(file_path, 'w') as f:
- f.write(f'Test content for {filename} in {site_key}')
+ with open(file_path, "w") as f:
+ f.write(
+ f"Test content for {filename} in {site_key}"
+ )
created_files.append(file_path)
-
+
# Verify all files exist before deletion
for file_path in created_files:
- self.assertTrue(os.path.exists(file_path), f"File {file_path} should exist before deletion")
-
+ self.assertTrue(
+ os.path.exists(file_path),
+ f"File {file_path} should exist before deletion",
+ )
+
# Mock the SITES configuration
- original_sites = self.app.conf.get('SITES', {})
- self.app.conf['SITES'] = sites_config
-
+ original_sites = self.app.conf.get("SITES", {})
+ self.app.conf["SITES"] = sites_config
+
try:
# Test delete_sitemap_files - delete first 2 files
- files_to_delete = {'sitemap_bib_1.xml', 'sitemap_bib_2.xml'}
-
+ files_to_delete = {"sitemap_bib_1.xml", "sitemap_bib_2.xml"}
+
self.app.delete_sitemap_files(files_to_delete, temp_dir)
-
+
# Verify deleted files are gone
for site_key in sites_config.keys():
for filename in files_to_delete:
file_path = os.path.join(temp_dir, site_key, filename)
- self.assertFalse(os.path.exists(file_path),
- f"File {file_path} should be deleted")
-
+ self.assertFalse(
+ os.path.exists(file_path),
+ f"File {file_path} should be deleted",
+ )
+
# Verify remaining files still exist
for site_key in sites_config.keys():
- remaining_file = os.path.join(temp_dir, site_key, 'sitemap_index.xml')
- self.assertTrue(os.path.exists(remaining_file),
- f"File {remaining_file} should still exist")
-
+ remaining_file = os.path.join(
+ temp_dir, site_key, "sitemap_index.xml"
+ )
+ self.assertTrue(
+ os.path.exists(remaining_file),
+ f"File {remaining_file} should still exist",
+ )
+
# Test empty files_to_delete set (should do nothing)
- remaining_count_before = sum(len(os.listdir(os.path.join(temp_dir, site)))
- for site in sites_config.keys())
-
+ remaining_count_before = sum(
+ len(os.listdir(os.path.join(temp_dir, site)))
+ for site in sites_config.keys()
+ )
+
self.app.delete_sitemap_files(set(), temp_dir)
-
- remaining_count_after = sum(len(os.listdir(os.path.join(temp_dir, site)))
- for site in sites_config.keys())
-
- self.assertEqual(remaining_count_before, remaining_count_after,
- "Empty files_to_delete should not change file count")
-
+
+ remaining_count_after = sum(
+ len(os.listdir(os.path.join(temp_dir, site)))
+ for site in sites_config.keys()
+ )
+
+ self.assertEqual(
+ remaining_count_before,
+ remaining_count_after,
+ "Empty files_to_delete should not change file count",
+ )
+
# Test non-existent files (should not raise error)
- non_existent_files = {'non_existent_1.xml', 'non_existent_2.xml'}
-
+ non_existent_files = {"non_existent_1.xml", "non_existent_2.xml"}
+
# Should not raise an exception
self.app.delete_sitemap_files(non_existent_files, temp_dir)
-
+
# Remaining files should still exist
- final_count = sum(len(os.listdir(os.path.join(temp_dir, site)))
- for site in sites_config.keys())
- self.assertEqual(final_count, 2, "Should still have 2 files (1 per site)")
-
+ final_count = sum(
+ len(os.listdir(os.path.join(temp_dir, site)))
+ for site in sites_config.keys()
+ )
+ self.assertEqual(
+ final_count, 2, "Should still have 2 files (1 per site)"
+ )
+
finally:
# Restore original SITES configuration
- self.app.conf['SITES'] = original_sites
-
+ self.app.conf["SITES"] = original_sites
def test_chunked(self):
"""Test chunked method"""
-
+
# Test 1: Normal chunking with exact division
data = list(range(10)) # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
chunks = list(self.app.chunked(data, 5))
-
+
self.assertEqual(len(chunks), 2, "Should create 2 chunks")
- self.assertEqual(chunks[0], [0, 1, 2, 3, 4], "First chunk should contain first 5 elements")
- self.assertEqual(chunks[1], [5, 6, 7, 8, 9], "Second chunk should contain last 5 elements")
-
+ self.assertEqual(
+ chunks[0], [0, 1, 2, 3, 4], "First chunk should contain first 5 elements"
+ )
+ self.assertEqual(
+ chunks[1], [5, 6, 7, 8, 9], "Second chunk should contain last 5 elements"
+ )
+
# Test 2: Chunking with remainder
data = list(range(7)) # [0, 1, 2, 3, 4, 5, 6]
chunks = list(self.app.chunked(data, 3))
-
+
self.assertEqual(len(chunks), 3, "Should create 3 chunks")
self.assertEqual(chunks[0], [0, 1, 2], "First chunk should have 3 elements")
self.assertEqual(chunks[1], [3, 4, 5], "Second chunk should have 3 elements")
- self.assertEqual(chunks[2], [6], "Third chunk should have 1 element (remainder)")
-
+ self.assertEqual(
+ chunks[2], [6], "Third chunk should have 1 element (remainder)"
+ )
+
# Test 3: Single chunk (chunk_size larger than data)
data = [1, 2, 3]
chunks = list(self.app.chunked(data, 10))
-
+
self.assertEqual(len(chunks), 1, "Should create 1 chunk")
- self.assertEqual(chunks[0], [1, 2, 3], "Single chunk should contain all elements")
-
+ self.assertEqual(
+ chunks[0], [1, 2, 3], "Single chunk should contain all elements"
+ )
+
# Test 4: Empty iterable
data = []
chunks = list(self.app.chunked(data, 5))
-
+
self.assertEqual(len(chunks), 0, "Empty iterable should produce no chunks")
-
+
# Test 5: Chunk size of 1
- data = ['a', 'b', 'c']
+ data = ["a", "b", "c"]
chunks = list(self.app.chunked(data, 1))
-
+
self.assertEqual(len(chunks), 3, "Should create 3 chunks with size 1")
- self.assertEqual(chunks[0], ['a'], "First chunk should contain 'a'")
- self.assertEqual(chunks[1], ['b'], "Second chunk should contain 'b'")
- self.assertEqual(chunks[2], ['c'], "Third chunk should contain 'c'")
-
+ self.assertEqual(chunks[0], ["a"], "First chunk should contain 'a'")
+ self.assertEqual(chunks[1], ["b"], "Second chunk should contain 'b'")
+ self.assertEqual(chunks[2], ["c"], "Third chunk should contain 'c'")
+
# Test 6: Memory efficiency test with generator (doesn't copy data)
def large_generator():
for i in range(1000):
yield f"item_{i}"
-
+
chunks = list(self.app.chunked(large_generator(), 100))
-
+
self.assertEqual(len(chunks), 10, "Should create 10 chunks from 1000 items")
self.assertEqual(len(chunks[0]), 100, "Each chunk should have 100 items")
self.assertEqual(len(chunks[-1]), 100, "Last chunk should also have 100 items")
self.assertEqual(chunks[0][0], "item_0", "First item should be 'item_0'")
self.assertEqual(chunks[-1][-1], "item_999", "Last item should be 'item_999'")
-
+
# Test 7: String chunking
data = "abcdefghij"
chunks = list(self.app.chunked(data, 4))
-
+
self.assertEqual(len(chunks), 3, "Should create 3 chunks from string")
- self.assertEqual(chunks[0], ['a', 'b', 'c', 'd'], "First chunk should contain first 4 chars")
- self.assertEqual(chunks[1], ['e', 'f', 'g', 'h'], "Second chunk should contain next 4 chars")
- self.assertEqual(chunks[2], ['i', 'j'], "Third chunk should contain remaining 2 chars")
-
+ self.assertEqual(
+ chunks[0], ["a", "b", "c", "d"], "First chunk should contain first 4 chars"
+ )
+ self.assertEqual(
+ chunks[1], ["e", "f", "g", "h"], "Second chunk should contain next 4 chars"
+ )
+ self.assertEqual(
+ chunks[2], ["i", "j"], "Third chunk should contain remaining 2 chars"
+ )
+
# Test 8: Different data types
- data = [1, 'two', 3.0, [4, 5], {'six': 6}]
+ data = [1, "two", 3.0, [4, 5], {"six": 6}]
chunks = list(self.app.chunked(data, 2))
-
+
self.assertEqual(len(chunks), 3, "Should create 3 chunks from mixed data types")
- self.assertEqual(chunks[0], [1, 'two'], "First chunk should contain first 2 items")
- self.assertEqual(chunks[1], [3.0, [4, 5]], "Second chunk should contain next 2 items")
- self.assertEqual(chunks[2], [{'six': 6}], "Third chunk should contain last item")
-
+ self.assertEqual(
+ chunks[0], [1, "two"], "First chunk should contain first 2 items"
+ )
+ self.assertEqual(
+ chunks[1], [3.0, [4, 5]], "Second chunk should contain next 2 items"
+ )
+ self.assertEqual(
+ chunks[2], [{"six": 6}], "Third chunk should contain last item"
+ )
def test_delete_by_bibcode_with_sitemap(self):
"""Test delete_by_bibcode function with sitemap records (database deletion only)"""
# TEST CASE 1: Delete record with both Records and SitemapInfo entries
- test_bibcode = '2023DeleteSitemapTest..1..1A'
- bib_data = {'title': 'Test Record for Sitemap Deletion', 'year': 2023}
-
+ test_bibcode = "2023DeleteSitemapTest..1..1A"
+ bib_data = {"title": "Test Record for Sitemap Deletion", "year": 2023}
+
# Create test record
- self.app.update_storage(test_bibcode, 'bib_data', bib_data)
-
+ self.app.update_storage(test_bibcode, "bib_data", bib_data)
+
# Create sitemap entry
with self.app.session_scope() as session:
record = session.query(Records).filter_by(bibcode=test_bibcode).first()
self.assertIsNotNone(record, "Test record should exist")
-
+
sitemap_info = SitemapInfo(
record_id=record.id,
bibcode=test_bibcode,
- sitemap_filename='sitemap_bib_delete_test.xml',
+ sitemap_filename="sitemap_bib_delete_test.xml",
update_flag=False,
- bib_data_updated=record.bib_data_updated
+ bib_data_updated=record.bib_data_updated,
)
session.add(sitemap_info)
session.commit()
-
+
# Verify setup: record and sitemap entry exist
with self.app.session_scope() as session:
- record_count = session.query(Records).filter_by(bibcode=test_bibcode).count()
- sitemap_count = session.query(SitemapInfo).filter_by(bibcode=test_bibcode).count()
- self.assertEqual(record_count, 1, "Should have 1 Records entry before deletion")
- self.assertEqual(sitemap_count, 1, "Should have 1 SitemapInfo entry before deletion")
-
+ record_count = (
+ session.query(Records).filter_by(bibcode=test_bibcode).count()
+ )
+ sitemap_count = (
+ session.query(SitemapInfo).filter_by(bibcode=test_bibcode).count()
+ )
+ self.assertEqual(
+ record_count, 1, "Should have 1 Records entry before deletion"
+ )
+ self.assertEqual(
+ sitemap_count, 1, "Should have 1 SitemapInfo entry before deletion"
+ )
+
# Delete the record
result = self.app.delete_by_bibcode(test_bibcode)
- self.assertTrue(result, "delete_by_bibcode should return True for successful deletion")
-
+ self.assertTrue(
+ result, "delete_by_bibcode should return True for successful deletion"
+ )
+
# Verify both Records and SitemapInfo entries are deleted
with self.app.session_scope() as session:
record = session.query(Records).filter_by(bibcode=test_bibcode).first()
self.assertIsNone(record, "Records entry should be deleted")
-
+
# Verify ChangeLog entry was created
- changelog = session.query(ChangeLog).filter_by(key=f'bibcode:{test_bibcode}').first()
+ changelog = (
+ session.query(ChangeLog)
+ .filter_by(key=f"bibcode:{test_bibcode}")
+ .first()
+ )
self.assertIsNotNone(changelog, "ChangeLog entry should be created")
- self.assertEqual(changelog.type, 'deleted', "ChangeLog type should be 'deleted'")
-
+ self.assertEqual(
+ changelog.type, "deleted", "ChangeLog type should be 'deleted'"
+ )
+
# With application-level cascade, SitemapInfo should be deleted
- sitemap_info = session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first()
- self.assertIsNone(sitemap_info, "SitemapInfo entry should be deleted by application logic")
-
+ sitemap_info = (
+ session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first()
+ )
+ self.assertIsNone(
+ sitemap_info, "SitemapInfo entry should be deleted by application logic"
+ )
+
# TEST CASE 2: Delete when only SitemapInfo exists (Records already deleted)
- test_bibcode_2 = '2023DeleteSitemapTest..2..2A'
-
+ test_bibcode_2 = "2023DeleteSitemapTest..2..2A"
+
# Create only SitemapInfo entry (no Records entry)
with self.app.session_scope() as session:
sitemap_info_2 = SitemapInfo(
record_id=999999, # Non-existent record_id
bibcode=test_bibcode_2,
- sitemap_filename='sitemap_bib_orphan.xml',
- update_flag=False
+ sitemap_filename="sitemap_bib_orphan.xml",
+ update_flag=False,
)
session.add(sitemap_info_2)
session.commit()
-
+
# Verify setup: no Records entry, but SitemapInfo exists
with self.app.session_scope() as session:
- record_count = session.query(Records).filter_by(bibcode=test_bibcode_2).count()
- sitemap_count = session.query(SitemapInfo).filter_by(bibcode=test_bibcode_2).count()
+ record_count = (
+ session.query(Records).filter_by(bibcode=test_bibcode_2).count()
+ )
+ sitemap_count = (
+ session.query(SitemapInfo).filter_by(bibcode=test_bibcode_2).count()
+ )
self.assertEqual(record_count, 0, "Should have 0 Records entries")
self.assertEqual(sitemap_count, 1, "Should have 1 SitemapInfo entry")
-
+
# Delete orphaned sitemap entry
result_2 = self.app.delete_by_bibcode(test_bibcode_2)
- self.assertTrue(result_2, "delete_by_bibcode should return True for SitemapInfo deletion")
-
+ self.assertTrue(
+ result_2, "delete_by_bibcode should return True for SitemapInfo deletion"
+ )
+
# Verify SitemapInfo entry is deleted
with self.app.session_scope() as session:
- sitemap_info = session.query(SitemapInfo).filter_by(bibcode=test_bibcode_2).first()
- self.assertIsNone(sitemap_info, "Orphaned SitemapInfo entry should be deleted")
-
+ sitemap_info = (
+ session.query(SitemapInfo).filter_by(bibcode=test_bibcode_2).first()
+ )
+ self.assertIsNone(
+ sitemap_info, "Orphaned SitemapInfo entry should be deleted"
+ )
+
# TEST CASE 3: Delete non-existent bibcode
- result_3 = self.app.delete_by_bibcode('2023NonExistent..1..1A')
- self.assertIsNone(result_3, "delete_by_bibcode should return None for non-existent bibcode")
+ result_3 = self.app.delete_by_bibcode("2023NonExistent..1..1A")
+ self.assertIsNone(
+ result_3, "delete_by_bibcode should return None for non-existent bibcode"
+ )
-if __name__ == '__main__':
+if __name__ == "__main__":
unittest.main()
diff --git a/adsmp/tests/test_solr_updater.py b/adsmp/tests/test_solr_updater.py
index 114917d..7dc8d85 100644
--- a/adsmp/tests/test_solr_updater.py
+++ b/adsmp/tests/test_solr_updater.py
@@ -109,6 +109,7 @@ def test_solr_transformer(self):
"page": ["283"],
# u'property': [u'OPENACCESS', u'ADS_OPENACCESS', u'ARTICLE', u'NOT REFEREED'],
"pub": "Astronomical Data Analysis Software and Systems XII",
+ "pub_abbrev": "ADASS XII",
"pub_raw": "Astronomical Data Analysis Software and Systems XII ASP Conference Series, Vol. 295, 2003 H. E. Payne, R. I. Jedrzejewski, and R. N. Hook, eds., p.283",
"pubdate": "2003-00-00",
"title": ["Chandra Data Archive Download and Usage Database"],
@@ -128,7 +129,7 @@ def test_solr_transformer(self):
"boost_factor": 0.5142857142857143,
"astronomy_final_boost": 0.5142857142857143,
"physics_final_boost": 0.5142857142857143,
- }
+ },
)
self.app.update_storage(
"bibcode",
@@ -367,8 +368,8 @@ def test_solr_transformer(self):
"volume",
],
)
- self.assertEqual(x["scix_id"], "scix:42MM-89VE-90A0")
- self.assertEqual(round(x["doctype_boost"],3),0.857)
+ self.assertEqual(x["scix_id"], "scix:2VD6-M93T-HEGP")
+ self.assertEqual(round(x["doctype_boost"], 3), 0.857)
self.app.update_storage(
"bibcode",
@@ -382,13 +383,13 @@ def test_solr_transformer(self):
"boost_factor": 0.5142857142857143,
"astronomy_final_boost": 0.5142857142857143,
"physics_final_boost": 0.5142857142857143,
- }
+ },
)
rec = self.app.get_record("bibcode")
x = solr_updater.transform_json_record(rec)
- self.assertEqual(x["scix_id"], "scix:42MM-89VE-90A0")
- self.assertEqual(round(x["doctype_boost"],3),0.857)
- self.assertEqual(round(x["astronomy_final_boost"],3), 0.514)
+ self.assertEqual(x["scix_id"], "scix:2VD6-M93T-HEGP")
+ self.assertEqual(round(x["doctype_boost"], 3), 0.857)
+ self.assertEqual(round(x["astronomy_final_boost"], 3), 0.514)
self.app.update_storage(
"bibcode",
@@ -503,6 +504,7 @@ def test_solr_transformer(self):
"property": ["OPENACCESS", "ADS_OPENACCESS", "ARTICLE", "NOT REFEREED"],
"pub": "Astronomical Data Analysis Software and Systems XII",
"pub_raw": "Astronomical Data Analysis Software and Systems XII ASP Conference Series, Vol. 295, 2003 H. E. Payne, R. I. Jedrzejewski, and R. N. Hook, eds., p.283",
+ "pub_abbrev": "ADASS XII",
"pubdate": "2003-00-00",
"read_count": 0,
"reference": [
@@ -609,8 +611,7 @@ def test_solr_transformer(self):
"volume",
],
)
- self.assertEqual(round(x["doctype_boost"],3),0.857)
-
+ self.assertEqual(round(x["doctype_boost"], 3), 0.857)
def test_links_data_merge(self):
# links_data only from bib
diff --git a/adsmp/tests/test_tasks.py b/adsmp/tests/test_tasks.py
index bf1072f..6ace026 100644
--- a/adsmp/tests/test_tasks.py
+++ b/adsmp/tests/test_tasks.py
@@ -1,13 +1,14 @@
import copy
import html
import json
+import logging
import os
import shutil
-import unittest
-from datetime import datetime, timedelta, timezone
import tempfile
import time
-from unittest.mock import patch, MagicMock
+import unittest
+from datetime import datetime, timedelta, timezone
+from unittest.mock import MagicMock, patch
import mock
from adsmsg import (
@@ -21,15 +22,15 @@
)
from adsmsg.orcid_claims import OrcidClaims
from adsputils import get_date
-from mock import Mock, patch, MagicMock
+from mock import MagicMock, Mock, patch
from adsmp import app, tasks
-from adsmp.models import Base, Records, SitemapInfo, ChangeLog
-from adsmp.tasks import update_sitemap_index, update_robots_files
+from adsmp.models import Base, ChangeLog, Records, SitemapInfo
+from adsmp.tasks import update_robots_files, update_sitemap_index
-import logging
logger = logging.getLogger(__name__)
+
def unwind_task_index_solr_apply_async(args=None, kwargs=None, priority=None):
tasks.task_index_solr(args[0], args[1], kwargs)
@@ -706,18 +707,23 @@ def test_index_metrics_no_data(self):
x.assert_not_called()
def test_task_update_scixid(self):
- self.app.update_storage("bibcode", "bib_data", {"title":"abc test 123"})
- self.assertEqual(self.app.get_record("bibcode")["scix_id"], "scix:5RNB-CG0M-EQYN")
+ self.app.update_storage("bibcode", "bib_data", {"title": "abc test 123"})
+ self.assertEqual(
+ self.app.get_record("bibcode")["scix_id"], "scix:8KM7-38V2-N637"
+ )
tasks.task_update_scixid(bibcodes=["bibcode"], flag="force")
# scixid should not change since bib_data has not changed
- self.assertEqual(self.app.get_record("bibcode")["scix_id"], "scix:5RNB-CG0M-EQYN")
+ self.assertEqual(
+ self.app.get_record("bibcode")["scix_id"], "scix:8KM7-38V2-N637"
+ )
- self.app.update_storage("bibcode", "bib_data", {"title":"abc test 456"})
+ self.app.update_storage("bibcode", "bib_data", {"title": "abc test 456"})
tasks.task_update_scixid(bibcodes=["bibcode"], flag="force")
# scix_id should change since bib_data has changed and we used the force flag to create a new scix_id
- self.assertEqual(self.app.get_record("bibcode")["scix_id"], "scix:3BPZ-TQ3C-HFMU")
-
+ self.assertEqual(
+ self.app.get_record("bibcode")["scix_id"], "scix:6Z3P-MJ87-67A1"
+ )
with self.app.session_scope() as session:
r = session.query(Records).filter_by(bibcode="bibcode").first()
@@ -726,18 +732,16 @@ def test_task_update_scixid(self):
tasks.task_update_scixid(bibcodes=["bibcode"], flag="update")
# bibcode should still be the same as above since bib_data has not changed
- self.assertEqual(self.app.get_record("bibcode")["scix_id"], "scix:3BPZ-TQ3C-HFMU")
-
-
-
-
+ self.assertEqual(
+ self.app.get_record("bibcode")["scix_id"], "scix:6Z3P-MJ87-67A1"
+ )
class TestSitemapWorkflow(unittest.TestCase):
"""
Comprehensive tests for the complete sitemap workflow
"""
-
+
def setUp(self):
unittest.TestCase.setUp(self)
self.proj_home = os.path.join(os.path.dirname(__file__), "../..")
@@ -756,78 +760,80 @@ def setUp(self):
tasks.app = self.app # monkey-patch the app object
Base.metadata.bind = self.app._session.get_bind()
Base.metadata.create_all()
-
+
# Drop and recreate tables to ensure they have proper schema with indexes
try:
SitemapInfo.__table__.drop(self.app._session.get_bind(), checkfirst=True)
Records.__table__.drop(self.app._session.get_bind(), checkfirst=True)
except:
pass # Tables might not exist
-
+
# Recreate tables with current schema (including indexes)
Records.__table__.create(self.app._session.get_bind())
SitemapInfo.__table__.create(self.app._session.get_bind())
-
+
# Configure app for sitemap testing
- self.app.conf.update({
- 'SITEMAP_DIR': '/tmp/test_sitemap/',
- 'SITES': {
- 'ads': {
- 'name': 'ADS',
- 'base_url': 'https://ui.adsabs.harvard.edu/',
- 'sitemap_url': 'https://ui.adsabs.harvard.edu/sitemap',
- 'abs_url_pattern': 'https://ui.adsabs.harvard.edu/abs/{bibcode}'
+ self.app.conf.update(
+ {
+ "SITEMAP_DIR": "/tmp/test_sitemap/",
+ "SITES": {
+ "ads": {
+ "name": "ADS",
+ "base_url": "https://ui.adsabs.harvard.edu/",
+ "sitemap_url": "https://ui.adsabs.harvard.edu/sitemap",
+ "abs_url_pattern": "https://ui.adsabs.harvard.edu/abs/{bibcode}",
+ },
+ "scix": {
+ "name": "SciX",
+ "base_url": "https://scixplorer.org/",
+ "sitemap_url": "https://scixplorer.org/sitemap",
+ "abs_url_pattern": "https://scixplorer.org/abs/{bibcode}",
+ },
},
- 'scix': {
- 'name': 'SciX',
- 'base_url': 'https://scixplorer.org/',
- 'sitemap_url': 'https://scixplorer.org/sitemap',
- 'abs_url_pattern': 'https://scixplorer.org/abs/{bibcode}'
- }
}
- })
-
+ )
+
# Set up test data
self.test_records = [
{
- 'bibcode': '2023ApJ...123..456A',
- 'id': 1,
- 'bib_data': '{"title": "Test Paper A"}',
- 'bib_data_updated': get_date() - timedelta(days=1)
+ "bibcode": "2023ApJ...123..456A",
+ "id": 1,
+ "bib_data": '{"title": "Test Paper A"}',
+ "bib_data_updated": get_date() - timedelta(days=1),
},
{
- 'bibcode': '2023ApJ...123..457B',
- 'id': 2,
- 'bib_data': '{"title": "Test Paper B"}',
- 'bib_data_updated': get_date() - timedelta(days=2)
+ "bibcode": "2023ApJ...123..457B",
+ "id": 2,
+ "bib_data": '{"title": "Test Paper B"}',
+ "bib_data_updated": get_date() - timedelta(days=2),
},
{
- 'bibcode': '2023ApJ...123..458C',
- 'id': 3,
- 'bib_data': '{"title": "Test Paper C"}',
- 'bib_data_updated': get_date() - timedelta(days=3)
+ "bibcode": "2023ApJ...123..458C",
+ "id": 3,
+ "bib_data": '{"title": "Test Paper C"}',
+ "bib_data_updated": get_date() - timedelta(days=3),
},
{
- 'bibcode': '2023ApJ...123..459D',
- 'id': 4,
- 'bib_data': '{"title": "Test Paper D"}',
- 'bib_data_updated': get_date()
- }
+ "bibcode": "2023ApJ...123..459D",
+ "id": 4,
+ "bib_data": '{"title": "Test Paper D"}',
+ "bib_data_updated": get_date(),
+ },
]
-
+
# Clean database and insert test records
with self.app.session_scope() as session:
# Clear existing records
session.query(Records).delete()
session.commit()
-
+
# Insert test records with specified IDs
for record_data in self.test_records:
record = Records(
- id=record_data['id'],
- bibcode=record_data['bibcode'],
- bib_data=record_data['bib_data'],
- bib_data_updated=record_data['bib_data_updated']
+ id=record_data["id"],
+ bibcode=record_data["bibcode"],
+ bib_data=record_data["bib_data"],
+ bib_data_updated=record_data["bib_data_updated"],
)
session.add(record)
session.commit()
@@ -847,42 +853,48 @@ def tearDown(self):
self.app.close_app()
tasks.app = self._app
-
-
def test_task_cleanup_invalid_sitemaps(self):
"""Test the task_cleanup_invalid_sitemaps function thoroughly"""
-
+
# Setup test data - create records with different statuses
- valid_bibcodes = ['2023CleanValid1A', '2023CleanValid2B']
- invalid_bibcodes = ['2023CleanInvalid1C', '2023CleanInvalid2D', '2023CleanInvalid3E']
+ valid_bibcodes = ["2023CleanValid1A", "2023CleanValid2B"]
+ invalid_bibcodes = [
+ "2023CleanInvalid1C",
+ "2023CleanInvalid2D",
+ "2023CleanInvalid3E",
+ ]
all_bibcodes = valid_bibcodes + invalid_bibcodes
-
+
with self.app.session_scope() as session:
# Verify clean state
total_records_before = session.query(SitemapInfo).count()
- self.assertEqual(total_records_before, 0, "Should start with empty sitemap table")
-
+ self.assertEqual(
+ total_records_before, 0, "Should start with empty sitemap table"
+ )
+
# Create valid records (should remain in sitemap)
for bibcode in valid_bibcodes:
record = Records()
record.bibcode = bibcode
record.bib_data = '{"title": "Valid Test Record"}'
record.bib_data_updated = get_date() - timedelta(days=1)
- record.solr_processed = get_date() - timedelta(hours=12) # Recently processed
- record.status = 'success'
+ record.solr_processed = get_date() - timedelta(
+ hours=12
+ ) # Recently processed
+ record.status = "success"
session.add(record)
session.flush()
-
+
# Create sitemap entry
sitemap_record = SitemapInfo()
sitemap_record.bibcode = bibcode
sitemap_record.record_id = record.id
- sitemap_record.sitemap_filename = 'sitemap_bib_valid.xml'
+ sitemap_record.sitemap_filename = "sitemap_bib_valid.xml"
sitemap_record.update_flag = False
session.add(sitemap_record)
-
+
# Create invalid records (should be removed from sitemap)
- statuses = ['solr-failed', 'retrying', 'solr-failed']
+ statuses = ["solr-failed", "retrying", "solr-failed"]
for i, bibcode in enumerate(invalid_bibcodes):
record = Records()
record.bibcode = bibcode
@@ -892,117 +904,155 @@ def test_task_cleanup_invalid_sitemaps(self):
record.status = statuses[i]
session.add(record)
session.flush()
-
+
# Create sitemap entry
sitemap_record = SitemapInfo()
sitemap_record.bibcode = bibcode
sitemap_record.record_id = record.id
- sitemap_record.sitemap_filename = 'sitemap_bib_invalid.xml'
+ sitemap_record.sitemap_filename = "sitemap_bib_invalid.xml"
sitemap_record.update_flag = False
session.add(sitemap_record)
-
+
session.commit()
-
+
# Verify we have exactly 5 records
final_count = session.query(SitemapInfo).count()
- self.assertEqual(final_count, 5, "Should have exactly 5 sitemap records after setup")
-
+ self.assertEqual(
+ final_count, 5, "Should have exactly 5 sitemap records after setup"
+ )
+
# Execute cleanup with small batch size for testing
- original_batch_size = self.app.conf.get('SITEMAP_BOOTSTRAP_BATCH_SIZE', 50000)
- self.app.conf['SITEMAP_BOOTSTRAP_BATCH_SIZE'] = 2 # Small batch for testing
-
+ original_batch_size = self.app.conf.get("SITEMAP_BOOTSTRAP_BATCH_SIZE", 50000)
+ self.app.conf["SITEMAP_BOOTSTRAP_BATCH_SIZE"] = 2 # Small batch for testing
+
try:
- # Mock delete_sitemap_files
- with patch.object(self.app, 'delete_sitemap_files') as mock_delete_files:
+ # Mock delete_sitemap_files
+ with patch.object(self.app, "delete_sitemap_files") as mock_delete_files:
result = tasks.task_cleanup_invalid_sitemaps()
finally:
# Restore original batch size
- self.app.conf['SITEMAP_BOOTSTRAP_BATCH_SIZE'] = original_batch_size
-
+ self.app.conf["SITEMAP_BOOTSTRAP_BATCH_SIZE"] = original_batch_size
+
# Verify result structure and content
self.assertIsInstance(result, dict, "Should return result dictionary")
- self.assertIn('total_processed', result, "Should include total_processed count")
- self.assertIn('invalid_removed', result, "Should include invalid_removed count")
- self.assertIn('batches_processed', result, "Should include batches_processed count")
- self.assertIn('files_regenerated', result, "Should include files_regenerated flag")
- self.assertIn('files_flagged', result, "Should include files_flagged count")
-
+ self.assertIn("total_processed", result, "Should include total_processed count")
+ self.assertIn("invalid_removed", result, "Should include invalid_removed count")
+ self.assertIn(
+ "batches_processed", result, "Should include batches_processed count"
+ )
+ self.assertIn(
+ "files_regenerated", result, "Should include files_regenerated flag"
+ )
+ self.assertIn("files_flagged", result, "Should include files_flagged count")
+
# Verify cleanup results - should have processed exactly our 5 records
- self.assertEqual(result['total_processed'], 5, "Should have processed exactly 5 records")
- self.assertEqual(result['invalid_removed'], 3, "Should have removed exactly 3 invalid records")
- self.assertGreaterEqual(result['batches_processed'], 1, "Should have processed at least 1 batch")
- self.assertTrue(result['files_regenerated'], "Should indicate files need regeneration")
+ self.assertEqual(
+ result["total_processed"], 5, "Should have processed exactly 5 records"
+ )
+ self.assertEqual(
+ result["invalid_removed"],
+ 3,
+ "Should have removed exactly 3 invalid records",
+ )
+ self.assertGreaterEqual(
+ result["batches_processed"], 1, "Should have processed at least 1 batch"
+ )
+ self.assertTrue(
+ result["files_regenerated"], "Should indicate files need regeneration"
+ )
# files_flagged may be 0 if all invalid records were in files that became completely empty
-
+
# Verify delete_sitemap_files was called to clean up empty files
- self.assertTrue(mock_delete_files.called, "delete_sitemap_files should have been called")
+ self.assertTrue(
+ mock_delete_files.called, "delete_sitemap_files should have been called"
+ )
# Verify it was called with a non-empty set of files to delete
call_args = mock_delete_files.call_args[0]
files_to_delete = call_args[0] # First argument is the files_to_delete set
- self.assertIsInstance(files_to_delete, set, "Should pass a set of files to delete")
+ self.assertIsInstance(
+ files_to_delete, set, "Should pass a set of files to delete"
+ )
self.assertEqual(len(files_to_delete), 1, "Should have files to delete")
-
+
# Verify database state after cleanup
with self.app.session_scope() as session:
# Should have exactly 2 records remaining
total_remaining = session.query(SitemapInfo).count()
- self.assertEqual(total_remaining, 2, "Should have exactly 2 records remaining")
-
+ self.assertEqual(
+ total_remaining, 2, "Should have exactly 2 records remaining"
+ )
+
# Valid records should remain
- valid_remaining = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(valid_bibcodes)
- ).all()
- self.assertEqual(len(valid_remaining), 2, "Valid records should remain in sitemap")
-
+ valid_remaining = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(valid_bibcodes))
+ .all()
+ )
+ self.assertEqual(
+ len(valid_remaining), 2, "Valid records should remain in sitemap"
+ )
+
# Invalid records should be removed
- invalid_remaining = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(invalid_bibcodes)
- ).all()
- self.assertEqual(len(invalid_remaining), 0, "Invalid records should be removed from sitemap")
-
+ invalid_remaining = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(invalid_bibcodes))
+ .all()
+ )
+ self.assertEqual(
+ len(invalid_remaining),
+ 0,
+ "Invalid records should be removed from sitemap",
+ )
+
# Verify remaining records have correct properties
for sitemap_record in valid_remaining:
self.assertIn(sitemap_record.bibcode, valid_bibcodes)
- self.assertEqual(sitemap_record.sitemap_filename, 'sitemap_bib_valid.xml')
-
+ self.assertEqual(
+ sitemap_record.sitemap_filename, "sitemap_bib_valid.xml"
+ )
+
# Verify the Records table is unchanged (cleanup should only affect SitemapInfo)
with self.app.session_scope() as session:
- all_records = session.query(Records).filter(
- Records.bibcode.in_(all_bibcodes)
- ).all()
+ all_records = (
+ session.query(Records).filter(Records.bibcode.in_(all_bibcodes)).all()
+ )
self.assertEqual(len(all_records), 5, "All Records should still exist")
-
+
# Verify record statuses are unchanged
valid_records = [r for r in all_records if r.bibcode in valid_bibcodes]
invalid_records = [r for r in all_records if r.bibcode in invalid_bibcodes]
-
+
for record in valid_records:
- self.assertEqual(record.status, 'success')
-
+ self.assertEqual(record.status, "success")
+
for record in invalid_records:
- self.assertIn(record.status, ['solr-failed', 'retrying'])
+ self.assertIn(record.status, ["solr-failed", "retrying"])
def test_task_cleanup_invalid_sitemaps_with_file_flagging(self):
"""Test that cleanup correctly flags files for regeneration when some records remain"""
-
+
# Setup: Create TWO files:
# File 1 (mixed): Has both valid and invalid records - should be flagged when invalid ones removed
# File 2 (invalid only): Has only invalid records - should be deleted entirely
test_bibcodes = [
- '2023FlagTest1A', # Valid - will remain in file1
- '2023FlagTest2B', # Valid - will remain in file1
- '2023FlagTest3C', # Invalid - will be removed from file1
- '2023FlagTest4D', # Invalid - will be removed from file2
+ "2023FlagTest1A", # Valid - will remain in file1
+ "2023FlagTest2B", # Valid - will remain in file1
+ "2023FlagTest3C", # Invalid - will be removed from file1
+ "2023FlagTest4D", # Invalid - will be removed from file2
]
valid_bibcodes = test_bibcodes[:2]
invalid_bibcodes = test_bibcodes[2:]
-
+
with self.app.session_scope() as session:
# Clean up any existing test data
- session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023FlagTest%')).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode.like('2023FlagTest%')).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode.like("2023FlagTest%")
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(Records.bibcode.like("2023FlagTest%")).delete(
+ synchronize_session=False
+ )
session.commit()
-
+
# Create valid records (should remain in sitemap)
for bibcode in valid_bibcodes:
record = Records()
@@ -1010,17 +1060,17 @@ def test_task_cleanup_invalid_sitemaps_with_file_flagging(self):
record.bib_data = '{"title": "Valid Test Record"}'
record.bib_data_updated = get_date() - timedelta(days=1)
record.solr_processed = get_date() - timedelta(hours=12)
- record.status = 'success'
+ record.status = "success"
session.add(record)
session.flush()
-
+
sitemap_record = SitemapInfo()
sitemap_record.bibcode = bibcode
sitemap_record.record_id = record.id
- sitemap_record.sitemap_filename = 'sitemap_bib_mixed.xml' # File 1
+ sitemap_record.sitemap_filename = "sitemap_bib_mixed.xml" # File 1
sitemap_record.update_flag = False
session.add(sitemap_record)
-
+
# Create invalid records (should be removed from sitemap)
for i, bibcode in enumerate(invalid_bibcodes):
record = Records()
@@ -1028,94 +1078,146 @@ def test_task_cleanup_invalid_sitemaps_with_file_flagging(self):
record.bib_data = '{"title": "Invalid Test Record"}'
record.bib_data_updated = get_date() - timedelta(days=1)
record.solr_processed = get_date() - timedelta(days=2)
- record.status = 'solr-failed'
+ record.status = "solr-failed"
session.add(record)
session.flush()
-
+
sitemap_record = SitemapInfo()
sitemap_record.bibcode = bibcode
sitemap_record.record_id = record.id
# First invalid goes to file1 (mixed), second to file2 (will be deleted)
- sitemap_record.sitemap_filename = 'sitemap_bib_mixed.xml' if i == 0 else 'sitemap_bib_invalid_only.xml'
+ sitemap_record.sitemap_filename = (
+ "sitemap_bib_mixed.xml"
+ if i == 0
+ else "sitemap_bib_invalid_only.xml"
+ )
sitemap_record.update_flag = False
session.add(sitemap_record)
-
+
session.commit()
-
+
# Verify initial state: 4 records, all in same file, none flagged
- total_records = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.like('2023FlagTest%')
- ).count()
+ total_records = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.like("2023FlagTest%"))
+ .count()
+ )
self.assertEqual(total_records, 4, "Should have 4 records initially")
-
- flagged_count = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.like('2023FlagTest%'),
- SitemapInfo.update_flag == True
- ).count()
- self.assertEqual(flagged_count, 0, "Should have 0 flagged records initially")
-
+
+ flagged_count = (
+ session.query(SitemapInfo)
+ .filter(
+ SitemapInfo.bibcode.like("2023FlagTest%"),
+ SitemapInfo.update_flag == True,
+ )
+ .count()
+ )
+ self.assertEqual(
+ flagged_count, 0, "Should have 0 flagged records initially"
+ )
+
# Execute cleanup
- original_batch_size = self.app.conf.get('SITEMAP_BOOTSTRAP_BATCH_SIZE', 50000)
- self.app.conf['SITEMAP_BOOTSTRAP_BATCH_SIZE'] = 10 # Small batch
-
+ original_batch_size = self.app.conf.get("SITEMAP_BOOTSTRAP_BATCH_SIZE", 50000)
+ self.app.conf["SITEMAP_BOOTSTRAP_BATCH_SIZE"] = 10 # Small batch
+
try:
- with patch.object(self.app, 'delete_sitemap_files') as mock_delete_files:
+ with patch.object(self.app, "delete_sitemap_files") as mock_delete_files:
result = tasks.task_cleanup_invalid_sitemaps()
finally:
- self.app.conf['SITEMAP_BOOTSTRAP_BATCH_SIZE'] = original_batch_size
-
+ self.app.conf["SITEMAP_BOOTSTRAP_BATCH_SIZE"] = original_batch_size
+
# Verify cleanup results
- self.assertEqual(result['total_processed'], 4, "Should have processed 4 records")
- self.assertEqual(result['invalid_removed'], 2, "Should have removed 2 invalid records")
- self.assertTrue(result['files_regenerated'], "Should indicate files need regeneration")
- self.assertEqual(result['files_flagged'], 1, "Should have flagged exactly 1 file (mixed file)")
-
+ self.assertEqual(
+ result["total_processed"], 4, "Should have processed 4 records"
+ )
+ self.assertEqual(
+ result["invalid_removed"], 2, "Should have removed 2 invalid records"
+ )
+ self.assertTrue(
+ result["files_regenerated"], "Should indicate files need regeneration"
+ )
+ self.assertEqual(
+ result["files_flagged"],
+ 1,
+ "Should have flagged exactly 1 file (mixed file)",
+ )
+
# Verify delete_sitemap_files was called for the file that became empty
self.assertTrue(mock_delete_files.called, "Should have deleted the empty file")
# Check that the empty file was deleted
call_args = mock_delete_files.call_args[0]
files_to_delete = call_args[0]
- self.assertIn('sitemap_bib_invalid_only.xml', files_to_delete, "Should delete the file with only invalid records")
-
+ self.assertIn(
+ "sitemap_bib_invalid_only.xml",
+ files_to_delete,
+ "Should delete the file with only invalid records",
+ )
+
# Verify database state after cleanup
with self.app.session_scope() as session:
# Should have 2 valid records remaining
- remaining_records = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.like('2023FlagTest%')
- ).all()
- self.assertEqual(len(remaining_records), 2, "Should have 2 remaining records")
-
+ remaining_records = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.like("2023FlagTest%"))
+ .all()
+ )
+ self.assertEqual(
+ len(remaining_records), 2, "Should have 2 remaining records"
+ )
+
# At least one record should be flagged for update
flagged_records = [r for r in remaining_records if r.update_flag]
- self.assertGreaterEqual(len(flagged_records), 1, "At least one record should be flagged")
-
+ self.assertGreaterEqual(
+ len(flagged_records), 1, "At least one record should be flagged"
+ )
+
# All remaining records should be valid bibcodes
remaining_bibcodes = [r.bibcode for r in remaining_records]
- self.assertEqual(set(remaining_bibcodes), set(valid_bibcodes), "Only valid bibcodes should remain")
-
+ self.assertEqual(
+ set(remaining_bibcodes),
+ set(valid_bibcodes),
+ "Only valid bibcodes should remain",
+ )
+
# All remaining records should be in the mixed file (not the deleted one)
filenames = set(r.sitemap_filename for r in remaining_records)
- self.assertEqual(filenames, {'sitemap_bib_mixed.xml'}, "All remaining records should be in mixed file")
-
+ self.assertEqual(
+ filenames,
+ {"sitemap_bib_mixed.xml"},
+ "All remaining records should be in mixed file",
+ )
+
# Clean up test data
- session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023FlagTest%')).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode.like('2023FlagTest%')).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode.like("2023FlagTest%")
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(Records.bibcode.like("2023FlagTest%")).delete(
+ synchronize_session=False
+ )
session.commit()
def test_task_cleanup_invalid_sitemaps_orphaned_entries_cleanup(self):
"""Test cleanup of orphaned sitemap entries (part 2)"""
-
+
# Setup orphaned entries - create records where some will become orphaned
- test_bibcodes = ['2023OrphanCleanup1A', '2023OrphanCleanup2B', '2023ValidCleanup3C']
+ test_bibcodes = [
+ "2023OrphanCleanup1A",
+ "2023OrphanCleanup2B",
+ "2023ValidCleanup3C",
+ ]
orphaned_bibcodes = test_bibcodes[:2] # First 2 will become orphaned
- valid_bibcodes = test_bibcodes[2:] # Last 1 will remain valid
-
+ valid_bibcodes = test_bibcodes[2:] # Last 1 will remain valid
+
with self.app.session_scope() as session:
# Clean up any existing test data first
- session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023%Cleanup%')).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode.like('2023%Cleanup%')).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode.like("2023%Cleanup%")
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(Records.bibcode.like("2023%Cleanup%")).delete(
+ synchronize_session=False
+ )
session.commit()
-
+
# Create Records and SitemapInfo for all bibcodes
record_ids = {}
for i, bibcode in enumerate(test_bibcodes):
@@ -1123,126 +1225,166 @@ def test_task_cleanup_invalid_sitemaps_orphaned_entries_cleanup(self):
record.bibcode = bibcode
record.bib_data = '{"title": "Test Record"}'
record.bib_data_updated = get_date() - timedelta(days=1)
- record.status = 'success'
+ record.status = "success"
session.add(record)
session.flush()
record_ids[bibcode] = record.id
-
+
sitemap_record = SitemapInfo()
sitemap_record.bibcode = bibcode
sitemap_record.record_id = record.id
- sitemap_record.sitemap_filename = 'sitemap_bib_cleanup_test.xml'
+ sitemap_record.sitemap_filename = "sitemap_bib_cleanup_test.xml"
sitemap_record.update_flag = False
session.add(sitemap_record)
-
+
session.commit()
-
+
# Delete Records entries for orphaned bibcodes
- session.query(Records).filter(Records.bibcode.in_(orphaned_bibcodes)).delete(synchronize_session=False)
+ session.query(Records).filter(
+ Records.bibcode.in_(orphaned_bibcodes)
+ ).delete(synchronize_session=False)
session.commit()
-
+
# Execute cleanup with small batch size for testing
- original_batch_size = self.app.conf.get('SITEMAP_BOOTSTRAP_BATCH_SIZE', 50000)
- self.app.conf['SITEMAP_BOOTSTRAP_BATCH_SIZE'] = 2 # Small batch for testing
-
+ original_batch_size = self.app.conf.get("SITEMAP_BOOTSTRAP_BATCH_SIZE", 50000)
+ self.app.conf["SITEMAP_BOOTSTRAP_BATCH_SIZE"] = 2 # Small batch for testing
+
try:
# Mock delete_sitemap_files
- with patch.object(self.app, 'delete_sitemap_files') as mock_delete_files:
+ with patch.object(self.app, "delete_sitemap_files") as mock_delete_files:
result = tasks.task_cleanup_invalid_sitemaps()
finally:
# Restore original batch size
- self.app.conf['SITEMAP_BOOTSTRAP_BATCH_SIZE'] = original_batch_size
-
+ self.app.conf["SITEMAP_BOOTSTRAP_BATCH_SIZE"] = original_batch_size
+
# Verify cleanup results - should have processed exactly 3 records and removed 2 orphaned ones
- self.assertEqual(result['total_processed'], 3, "Should have processed exactly 3 records")
- self.assertEqual(result['invalid_removed'], 2, "Should have removed exactly 2 orphaned records")
- self.assertGreaterEqual(result['batches_processed'], 1, "Should have processed at least 1 batch")
- self.assertTrue(result['files_regenerated'], "Should indicate files need regeneration")
-
+ self.assertEqual(
+ result["total_processed"], 3, "Should have processed exactly 3 records"
+ )
+ self.assertEqual(
+ result["invalid_removed"],
+ 2,
+ "Should have removed exactly 2 orphaned records",
+ )
+ self.assertGreaterEqual(
+ result["batches_processed"], 1, "Should have processed at least 1 batch"
+ )
+ self.assertTrue(
+ result["files_regenerated"], "Should indicate files need regeneration"
+ )
+
# Verify database state after cleanup
with self.app.session_scope() as session:
# Valid record should remain
- valid_remaining = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(valid_bibcodes)
- ).all()
- self.assertEqual(len(valid_remaining), 1, "Valid record should remain in sitemap")
-
+ valid_remaining = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(valid_bibcodes))
+ .all()
+ )
+ self.assertEqual(
+ len(valid_remaining), 1, "Valid record should remain in sitemap"
+ )
+
# Orphaned records should be removed
- orphaned_remaining = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(orphaned_bibcodes)
- ).all()
- self.assertEqual(len(orphaned_remaining), 0, "Orphaned records should be removed from sitemap")
+ orphaned_remaining = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(orphaned_bibcodes))
+ .all()
+ )
+ self.assertEqual(
+ len(orphaned_remaining),
+ 0,
+ "Orphaned records should be removed from sitemap",
+ )
def test_task_cleanup_invalid_sitemaps_orphaned_entries_verification(self):
"""Test verification that remaining entries are valid after orphan cleanup (part 3)"""
-
- test_bibcode = '2023OrphanVerify1A'
-
+
+ test_bibcode = "2023OrphanVerify1A"
+
with self.app.session_scope() as session:
# Clean up any existing test data first
- session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode == test_bibcode).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode == test_bibcode
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(Records.bibcode == test_bibcode).delete(
+ synchronize_session=False
+ )
session.commit()
-
+
# Create a valid Records and SitemapInfo entry
record = Records()
record.bibcode = test_bibcode
record.bib_data = '{"title": "Valid Test Record"}'
record.bib_data_updated = get_date() - timedelta(days=1)
- record.status = 'success'
+ record.status = "success"
session.add(record)
session.flush()
-
+
sitemap_record = SitemapInfo()
sitemap_record.bibcode = test_bibcode
sitemap_record.record_id = record.id
- sitemap_record.sitemap_filename = 'sitemap_bib_verify_test.xml'
+ sitemap_record.sitemap_filename = "sitemap_bib_verify_test.xml"
sitemap_record.update_flag = False
session.add(sitemap_record)
session.commit()
-
+
# Execute cleanup - should not remove the valid entry
- with patch('adsmp.tasks.task_update_sitemap_files.apply_async'), \
- patch.object(self.app, 'delete_sitemap_files'):
+ with patch("adsmp.tasks.task_update_sitemap_files.apply_async"), patch.object(
+ self.app, "delete_sitemap_files"
+ ):
tasks.task_cleanup_invalid_sitemaps()
-
+
# Verify the valid entry still exists and has correct relationships
with self.app.session_scope() as session:
- remaining_sitemap = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode == test_bibcode
- ).first()
+ remaining_sitemap = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode == test_bibcode)
+ .first()
+ )
self.assertIsNotNone(remaining_sitemap, "Valid sitemap entry should remain")
-
+
# Verify the sitemap entry still has a valid Records entry
- corresponding_record = session.query(Records).filter(
- Records.id == remaining_sitemap.record_id
- ).first()
- self.assertIsNotNone(corresponding_record, "Sitemap entry should have valid Records entry")
- self.assertEqual(corresponding_record.bibcode, remaining_sitemap.bibcode, "Bibcodes should match")
+ corresponding_record = (
+ session.query(Records)
+ .filter(Records.id == remaining_sitemap.record_id)
+ .first()
+ )
+ self.assertIsNotNone(
+ corresponding_record, "Sitemap entry should have valid Records entry"
+ )
+ self.assertEqual(
+ corresponding_record.bibcode,
+ remaining_sitemap.bibcode,
+ "Bibcodes should match",
+ )
def test_task_cleanup_invalid_sitemaps_comprehensive_invalid_cases(self):
"""Test cleanup of all types of invalid records that should be removed from sitemaps"""
-
+
# Test various invalid scenarios
test_cases = [
# (bibcode, bib_data, status, description)
- ('2023NoData..1..1A', None, 'success', 'No bib_data'),
- ('2023EmptyData..1..1B', '', 'success', 'Empty bib_data'),
- ('2023EmptyData2..1..1C', ' ', 'success', 'Whitespace-only bib_data'),
- ('2023SolrFailed..1..1D', '{"title": "Test"}', 'solr-failed', 'SOLR failed status'),
- ('2023Retrying..1..1E', '{"title": "Test"}', 'retrying', 'Retrying status'),
+ ("2023NoData..1..1A", None, "success", "No bib_data"),
+ (
+ "2023SolrFailed..1..1D",
+ '{"title": "Test"}',
+ "solr-failed",
+ "SOLR failed status",
+ ),
+ ("2023Retrying..1..1E", '{"title": "Test"}', "retrying", "Retrying status"),
]
-
- valid_bibcode = '2023ValidRecord..1..1F'
+
+ valid_bibcode = "2023ValidRecord..1..1F"
# Add another valid bibcode in the same file as invalid ones to ensure file gets flagged not deleted
- valid_bibcode_in_mixed_file = '2023ValidMixed..1..1G'
-
+ valid_bibcode_in_mixed_file = "2023ValidMixed..1..1G"
+
with self.app.session_scope() as session:
# Clean up all test data
session.query(SitemapInfo).delete(synchronize_session=False)
session.query(Records).delete(synchronize_session=False)
session.commit()
-
+
# Create invalid records - all in the same file
for bibcode, bib_data, status, description in test_cases:
record = Records()
@@ -1252,292 +1394,411 @@ def test_task_cleanup_invalid_sitemaps_comprehensive_invalid_cases(self):
record.status = status
session.add(record)
session.flush()
-
+
# Create sitemap entry
sitemap_record = SitemapInfo()
sitemap_record.bibcode = bibcode
sitemap_record.record_id = record.id
- sitemap_record.sitemap_filename = 'sitemap_bib_invalid_comprehensive.xml'
+ sitemap_record.sitemap_filename = (
+ "sitemap_bib_invalid_comprehensive.xml"
+ )
sitemap_record.update_flag = False
session.add(sitemap_record)
-
+
# Create a valid record in the same file as invalid ones (mixed file)
mixed_record = Records()
mixed_record.bibcode = valid_bibcode_in_mixed_file
mixed_record.bib_data = '{"title": "Valid Mixed Record"}'
mixed_record.bib_data_updated = get_date() - timedelta(days=1)
- mixed_record.status = 'success'
+ mixed_record.status = "success"
session.add(mixed_record)
session.flush()
-
+
mixed_sitemap_record = SitemapInfo()
mixed_sitemap_record.bibcode = valid_bibcode_in_mixed_file
mixed_sitemap_record.record_id = mixed_record.id
- mixed_sitemap_record.sitemap_filename = 'sitemap_bib_invalid_comprehensive.xml' # Same file!
+ mixed_sitemap_record.sitemap_filename = (
+ "sitemap_bib_invalid_comprehensive.xml" # Same file!
+ )
mixed_sitemap_record.update_flag = False
session.add(mixed_sitemap_record)
-
+
# Create another valid record in a different file
valid_record = Records()
valid_record.bibcode = valid_bibcode
valid_record.bib_data = '{"title": "Valid Record"}'
valid_record.bib_data_updated = get_date() - timedelta(days=1)
- valid_record.status = 'success'
+ valid_record.status = "success"
session.add(valid_record)
session.flush()
-
+
valid_sitemap_record = SitemapInfo()
valid_sitemap_record.bibcode = valid_bibcode
valid_sitemap_record.record_id = valid_record.id
- valid_sitemap_record.sitemap_filename = 'sitemap_bib_valid_comprehensive.xml'
+ valid_sitemap_record.sitemap_filename = (
+ "sitemap_bib_valid_comprehensive.xml"
+ )
valid_sitemap_record.update_flag = False
session.add(valid_sitemap_record)
-
+
session.commit()
-
+
# Verify setup
total_records = session.query(SitemapInfo).count()
- self.assertEqual(total_records, 7, "Should have 7 records (5 invalid + 2 valid)")
-
+ self.assertEqual(
+ total_records, 5, "Should have 5 records (3 invalid + 2 valid)"
+ )
+
# Execute cleanup
- with patch.object(self.app, 'delete_sitemap_files') as mock_delete_files:
+ with patch.object(self.app, "delete_sitemap_files") as mock_delete_files:
result = tasks.task_cleanup_invalid_sitemaps()
-
- # Verify all invalid records were removed
- self.assertEqual(result['invalid_removed'], 5, "Should remove all 5 invalid records")
- self.assertEqual(result['total_processed'], 7, "Should process all 7 records")
- self.assertTrue(result['files_regenerated'], "Should indicate files need regeneration")
-
+
+ # Verify invalid records were removed
+ self.assertEqual(
+ result["invalid_removed"],
+ 3,
+ "Should remove 3 invalid records (None, solr-failed, retrying)",
+ )
+ self.assertEqual(result["total_processed"], 5, "Should process all 5 records")
+ self.assertTrue(
+ result["files_regenerated"], "Should indicate files need regeneration"
+ )
+
# Verify files were flagged for regeneration (1 file with mixed valid/invalid records)
- self.assertGreaterEqual(result['files_flagged'], 1, "Should have flagged at least 1 file")
-
+ self.assertGreaterEqual(
+ result["files_flagged"], 1, "Should have flagged at least 1 file"
+ )
+
# Verify database state
with self.app.session_scope() as session:
- # Two valid records should remain
+ # Only 2 valid records should remain
remaining_records = session.query(SitemapInfo).all()
- self.assertEqual(len(remaining_records), 2, "Should have 2 remaining valid records")
+ self.assertEqual(
+ len(remaining_records), 2, "Should have 2 remaining valid records"
+ )
remaining_bibcodes = {r.bibcode for r in remaining_records}
- self.assertEqual(remaining_bibcodes, {valid_bibcode, valid_bibcode_in_mixed_file}, "Both valid records should remain")
-
- # All invalid records should be gone
- for bibcode, _, _, description in test_cases:
- invalid_count = session.query(SitemapInfo).filter_by(bibcode=bibcode).count()
- self.assertEqual(invalid_count, 0, f"Invalid record should be removed: {description}")
+ expected_bibcodes = {valid_bibcode, valid_bibcode_in_mixed_file}
+ self.assertEqual(
+ remaining_bibcodes,
+ expected_bibcodes,
+ "Both valid records should remain",
+ )
+
+ # All invalid records (None bib_data, solr-failed, retrying) should be removed
+ removed_bibcodes = {
+ "2023NoData..1..1A",
+ "2023SolrFailed..1..1D",
+ "2023Retrying..1..1E",
+ }
+ for bibcode in removed_bibcodes:
+ invalid_count = (
+ session.query(SitemapInfo).filter_by(bibcode=bibcode).count()
+ )
+ self.assertEqual(
+ invalid_count, 0, f"Invalid record should be removed: {bibcode}"
+ )
def test_delete_by_bibcode_marks_sitemap_files_for_regeneration(self):
"""Test that delete_by_bibcode properly marks affected sitemap files for regeneration"""
-
+
# Create test records in the same sitemap file
- test_bibcodes = ['2023DeleteRegen1A', '2023DeleteRegen2B', '2023DeleteRegen3C']
+ test_bibcodes = ["2023DeleteRegen1A", "2023DeleteRegen2B", "2023DeleteRegen3C"]
bibcode_to_delete = test_bibcodes[0] # Will delete the first one
remaining_bibcodes = test_bibcodes[1:] # These should be marked for update
-
+
with self.app.session_scope() as session:
# Clean up any existing test data
- session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023DeleteRegen%')).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode.like('2023DeleteRegen%')).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode.like("2023DeleteRegen%")
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(
+ Records.bibcode.like("2023DeleteRegen%")
+ ).delete(synchronize_session=False)
session.commit()
-
+
# Create records and sitemap entries in the same file
for bibcode in test_bibcodes:
record = Records()
record.bibcode = bibcode
record.bib_data = '{"title": "Test Record"}'
record.bib_data_updated = get_date() - timedelta(days=1)
- record.status = 'success'
+ record.status = "success"
session.add(record)
session.flush()
-
+
sitemap_record = SitemapInfo()
sitemap_record.bibcode = bibcode
sitemap_record.record_id = record.id
- sitemap_record.sitemap_filename = 'sitemap_bib_delete_test.xml' # Same file
+ sitemap_record.sitemap_filename = (
+ "sitemap_bib_delete_test.xml" # Same file
+ )
sitemap_record.update_flag = False # Start with False
session.add(sitemap_record)
-
+
session.commit()
-
+
# Verify setup: all records exist with update_flag=False
- all_sitemap_records = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(test_bibcodes)
- ).all()
- self.assertEqual(len(all_sitemap_records), 3, "Should have 3 sitemap records")
+ all_sitemap_records = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(test_bibcodes))
+ .all()
+ )
+ self.assertEqual(
+ len(all_sitemap_records), 3, "Should have 3 sitemap records"
+ )
for record in all_sitemap_records:
- self.assertFalse(record.update_flag, f"Update flag should start False for {record.bibcode}")
-
+ self.assertFalse(
+ record.update_flag,
+ f"Update flag should start False for {record.bibcode}",
+ )
+
# Delete one bibcode using delete_by_bibcode
result = self.app.delete_by_bibcode(bibcode_to_delete)
self.assertTrue(result, "delete_by_bibcode should succeed")
-
+
# Verify the deletion and regeneration marking
with self.app.session_scope() as session:
# Deleted record should be gone from both tables
- deleted_record = session.query(Records).filter_by(bibcode=bibcode_to_delete).first()
+ deleted_record = (
+ session.query(Records).filter_by(bibcode=bibcode_to_delete).first()
+ )
self.assertIsNone(deleted_record, "Deleted Records entry should be gone")
-
- deleted_sitemap = session.query(SitemapInfo).filter_by(bibcode=bibcode_to_delete).first()
- self.assertIsNone(deleted_sitemap, "Deleted SitemapInfo entry should be gone")
-
+
+ deleted_sitemap = (
+ session.query(SitemapInfo).filter_by(bibcode=bibcode_to_delete).first()
+ )
+ self.assertIsNone(
+ deleted_sitemap, "Deleted SitemapInfo entry should be gone"
+ )
+
# Remaining records should exist and exactly one should be marked for update (one-row-per-file flagging)
- remaining_sitemap_records = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(remaining_bibcodes)
- ).all()
- self.assertEqual(len(remaining_sitemap_records), 2, "Should have 2 remaining sitemap records")
-
+ remaining_sitemap_records = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(remaining_bibcodes))
+ .all()
+ )
+ self.assertEqual(
+ len(remaining_sitemap_records),
+ 2,
+ "Should have 2 remaining sitemap records",
+ )
+
flagged_count = sum(1 for r in remaining_sitemap_records if r.update_flag)
- self.assertEqual(flagged_count, 1, "At least one remaining record should be marked for update")
+ self.assertEqual(
+ flagged_count,
+ 1,
+ "At least one remaining record should be marked for update",
+ )
for record in remaining_sitemap_records:
- self.assertEqual(record.sitemap_filename, 'sitemap_bib_delete_test.xml', "Should be in same sitemap file")
-
+ self.assertEqual(
+ record.sitemap_filename,
+ "sitemap_bib_delete_test.xml",
+ "Should be in same sitemap file",
+ )
+
# Verify ChangeLog entry was created
- changelog = session.query(ChangeLog).filter_by(key=f'bibcode:{bibcode_to_delete}').first()
+ changelog = (
+ session.query(ChangeLog)
+ .filter_by(key=f"bibcode:{bibcode_to_delete}")
+ .first()
+ )
self.assertIsNotNone(changelog, "ChangeLog entry should be created")
- self.assertEqual(changelog.type, 'deleted', "ChangeLog type should be 'deleted'")
+ self.assertEqual(
+ changelog.type, "deleted", "ChangeLog type should be 'deleted'"
+ )
def test_sitemap_file_regeneration_after_deletion_and_cleanup(self):
"""Test that sitemap files are correctly regenerated after deletion and cleanup operations"""
-
+
# Create temporary sitemap directory
temp_dir = tempfile.mkdtemp()
- original_sitemap_dir = self.app.conf.get('SITEMAP_DIR')
- self.app.conf['SITEMAP_DIR'] = temp_dir
-
+ original_sitemap_dir = self.app.conf.get("SITEMAP_DIR")
+ self.app.conf["SITEMAP_DIR"] = temp_dir
+
try:
# Test the core functionality we implemented: delete_by_bibcode marking files for regeneration
- test_bibcodes = ['2023FileRegen1A', '2023FileRegen2B', '2023FileRegen3C']
+ test_bibcodes = ["2023FileRegen1A", "2023FileRegen2B", "2023FileRegen3C"]
bibcode_to_delete = test_bibcodes[0]
remaining_bibcodes = test_bibcodes[1:]
-
+
with self.app.session_scope() as session:
# Clean up any existing test data
- session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023FileRegen%')).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode.like('2023FileRegen%')).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode.like("2023FileRegen%")
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(
+ Records.bibcode.like("2023FileRegen%")
+ ).delete(synchronize_session=False)
session.commit()
-
+
# Create test records and sitemap entries
for bibcode in test_bibcodes:
record = Records()
record.bibcode = bibcode
record.bib_data = '{"title": "File Regeneration Test"}'
record.bib_data_updated = get_date() - timedelta(days=1)
- record.status = 'success'
+ record.status = "success"
session.add(record)
session.flush()
-
+
sitemap_record = SitemapInfo()
sitemap_record.bibcode = bibcode
sitemap_record.record_id = record.id
- sitemap_record.sitemap_filename = 'sitemap_bib_file_regen.xml'
+ sitemap_record.sitemap_filename = "sitemap_bib_file_regen.xml"
sitemap_record.update_flag = False # Start with False
session.add(sitemap_record)
-
+
session.commit()
-
+
# Configure sites for testing
- sites_config = {'ads': {'name': 'ADS'}}
- original_sites = self.app.conf.get('SITES')
- self.app.conf['SITES'] = sites_config
-
+ sites_config = {"ads": {"name": "ADS"}}
+ original_sites = self.app.conf.get("SITES")
+ self.app.conf["SITES"] = sites_config
+
# Create site directory
- site_dir = os.path.join(temp_dir, 'ads')
+ site_dir = os.path.join(temp_dir, "ads")
os.makedirs(site_dir, exist_ok=True)
-
+
# STEP 1: Test delete_by_bibcode marks files for regeneration
result = self.app.delete_by_bibcode(bibcode_to_delete)
self.assertTrue(result, f"Should successfully delete {bibcode_to_delete}")
-
+
# Verify exactly one remaining record is marked for update (one-row-per-file flagging)
with self.app.session_scope() as session:
- remaining_records = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(remaining_bibcodes)
- ).all()
-
- self.assertEqual(len(remaining_records), 2, "Should have 2 remaining records")
+ remaining_records = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(remaining_bibcodes))
+ .all()
+ )
+
+ self.assertEqual(
+ len(remaining_records), 2, "Should have 2 remaining records"
+ )
flagged_count = sum(1 for r in remaining_records if r.update_flag)
- self.assertEqual(flagged_count, 1, "At least one record should be marked for update")
-
+ self.assertEqual(
+ flagged_count, 1, "At least one record should be marked for update"
+ )
+
# Get record IDs while still in session
record_ids = [r.id for r in remaining_records]
-
+
# STEP 2: Generate sitemap file to verify the deleted bibcode is excluded
- tasks.task_generate_single_sitemap('sitemap_bib_file_regen.xml', record_ids)
-
+ tasks.task_generate_single_sitemap("sitemap_bib_file_regen.xml", record_ids)
+
# STEP 3: Verify the generated file excludes the deleted bibcode
- sitemap_file = os.path.join(site_dir, 'sitemap_bib_file_regen.xml')
- self.assertTrue(os.path.exists(sitemap_file), "Sitemap file should be generated")
-
- with open(sitemap_file, 'r') as f:
+ sitemap_file = os.path.join(site_dir, "sitemap_bib_file_regen.xml")
+ self.assertTrue(
+ os.path.exists(sitemap_file), "Sitemap file should be generated"
+ )
+
+ with open(sitemap_file, "r") as f:
content = f.read()
-
+
# Should contain remaining records
for bibcode in remaining_bibcodes:
- self.assertIn(bibcode, content, f"Sitemap should contain remaining record {bibcode}")
-
+ self.assertIn(
+ bibcode,
+ content,
+ f"Sitemap should contain remaining record {bibcode}",
+ )
+
# Should NOT contain deleted record (this proves the bug fix works)
- self.assertNotIn(bibcode_to_delete, content, f"Sitemap should NOT contain deleted record {bibcode_to_delete}")
-
+ self.assertNotIn(
+ bibcode_to_delete,
+ content,
+ f"Sitemap should NOT contain deleted record {bibcode_to_delete}",
+ )
+
# Verify basic XML structure
- self.assertIn('', content, "Should have XML declaration")
- self.assertIn('',
+ content,
+ "Should have XML declaration",
+ )
+ self.assertIn("', content)
- self.assertIn('', content)
+ self.assertIn(
+ '',
+ content,
+ )
for bibcode in test_bibcodes:
- self.assertIn(bibcode, content, f"Bibcode {bibcode} should be in sitemap")
-
+ self.assertIn(
+ bibcode, content, f"Bibcode {bibcode} should be in sitemap"
+ )
+
# Verify update_flag was reset to False
with self.app.session_scope() as session:
- updated_records = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(test_bibcodes)
- ).all()
+ updated_records = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(test_bibcodes))
+ .all()
+ )
for record in updated_records:
- self.assertFalse(record.update_flag, f"Update flag should be False for {record.bibcode}")
- self.assertIsNotNone(record.filename_lastmoddate, f"Last mod date should be set for {record.bibcode}")
-
+ self.assertFalse(
+ record.update_flag,
+ f"Update flag should be False for {record.bibcode}",
+ )
+ self.assertIsNotNone(
+ record.filename_lastmoddate,
+ f"Last mod date should be set for {record.bibcode}",
+ )
+
finally:
# Cleanup
- self.app.conf['SITEMAP_DIR'] = original_sitemap_dir
+ self.app.conf["SITEMAP_DIR"] = original_sitemap_dir
if original_sites:
- self.app.conf['SITES'] = original_sites
-
+ self.app.conf["SITES"] = original_sites
+
try:
shutil.rmtree(temp_dir)
except OSError:
@@ -2000,98 +2393,124 @@ def test_task_update_sitemap_files_full_workflow(self):
def test_task_update_sitemap_files_after_record_deletion(self):
"""Test task_update_sitemap_files after records have been deleted (simulating cleanup scenario)"""
-
+
# Create temporary sitemap directory
temp_dir = tempfile.mkdtemp()
- original_sitemap_dir = self.app.conf.get('SITEMAP_DIR')
- self.app.conf['SITEMAP_DIR'] = temp_dir
-
+ original_sitemap_dir = self.app.conf.get("SITEMAP_DIR")
+ self.app.conf["SITEMAP_DIR"] = temp_dir
+
try:
# Setup: Create records, then delete some to simulate cleanup scenario
- test_bibcodes = ['2023DeleteTest1A', '2023DeleteTest2B', '2023DeleteTest3C']
+ test_bibcodes = ["2023DeleteTest1A", "2023DeleteTest2B", "2023DeleteTest3C"]
remaining_bibcodes = test_bibcodes[1:] # Keep last 2 records
-
+
with self.app.session_scope() as session:
# Clean up any existing test data
- session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023DeleteTest%')).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode.like('2023DeleteTest%')).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode.like("2023DeleteTest%")
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(
+ Records.bibcode.like("2023DeleteTest%")
+ ).delete(synchronize_session=False)
session.commit()
-
+
# Create Records and SitemapInfo for remaining bibcodes only (simulating post-cleanup state)
for bibcode in remaining_bibcodes:
record = Records()
record.bibcode = bibcode
record.bib_data = f'{{"title": "Remaining Record", "year": 2023}}'
record.bib_data_updated = get_date() - timedelta(days=1)
- record.status = 'success'
+ record.status = "success"
session.add(record)
session.flush()
-
+
sitemap_record = SitemapInfo()
sitemap_record.bibcode = bibcode
sitemap_record.record_id = record.id
- sitemap_record.sitemap_filename = 'sitemap_bib_after_delete.xml'
+ sitemap_record.sitemap_filename = "sitemap_bib_after_delete.xml"
sitemap_record.update_flag = True # Mark for regeneration
session.add(sitemap_record)
-
+
session.commit()
-
+
# Configure sites
- sites_config = {'ads': {'name': 'ADS'}}
- original_sites = self.app.conf.get('SITES')
- self.app.conf['SITES'] = sites_config
-
+ sites_config = {"ads": {"name": "ADS"}}
+ original_sites = self.app.conf.get("SITES")
+ self.app.conf["SITES"] = sites_config
+
# Create site directory
- site_dir = os.path.join(temp_dir, 'ads')
+ site_dir = os.path.join(temp_dir, "ads")
os.makedirs(site_dir, exist_ok=True)
-
+
# Execute the workflow synchronously
with self.app.session_scope() as session:
- files_to_generate = session.query(SitemapInfo.sitemap_filename).filter(
- SitemapInfo.update_flag == True
- ).distinct().all()
-
- # Generate each sitemap file
+ files_to_generate = (
+ session.query(SitemapInfo.sitemap_filename)
+ .filter(SitemapInfo.update_flag == True)
+ .distinct()
+ .all()
+ )
+
+ # Generate each sitemap file
for (filename,) in files_to_generate:
with self.app.session_scope() as session:
- record_ids = session.query(SitemapInfo.id).filter(
- SitemapInfo.sitemap_filename == filename,
- SitemapInfo.update_flag == True
- ).all()
+ record_ids = (
+ session.query(SitemapInfo.id)
+ .filter(
+ SitemapInfo.sitemap_filename == filename,
+ SitemapInfo.update_flag == True,
+ )
+ .all()
+ )
record_ids = [r[0] for r in record_ids]
-
+
tasks.task_generate_single_sitemap(filename, record_ids)
-
+
# Generate the index
tasks.task_generate_sitemap_index()
-
+
# Verify sitemap file contains only remaining records
- sitemap_file = os.path.join(temp_dir, 'ads', 'sitemap_bib_after_delete.xml')
+ sitemap_file = os.path.join(temp_dir, "ads", "sitemap_bib_after_delete.xml")
self.assertTrue(os.path.exists(sitemap_file), "Sitemap file should exist")
-
- with open(sitemap_file, 'r') as f:
+
+ with open(sitemap_file, "r") as f:
content = f.read()
# Should contain remaining bibcodes
for bibcode in remaining_bibcodes:
- self.assertIn(bibcode, content, f"Remaining bibcode {bibcode} should be in sitemap")
+ self.assertIn(
+ bibcode,
+ content,
+ f"Remaining bibcode {bibcode} should be in sitemap",
+ )
# Should NOT contain deleted bibcode
- self.assertNotIn(test_bibcodes[0], content, f"Deleted bibcode {test_bibcodes[0]} should not be in sitemap")
-
+ self.assertNotIn(
+ test_bibcodes[0],
+ content,
+ f"Deleted bibcode {test_bibcodes[0]} should not be in sitemap",
+ )
+
# Verify update flags were reset
with self.app.session_scope() as session:
- updated_records = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(remaining_bibcodes)
- ).all()
- self.assertEqual(len(updated_records), 2, "Should have 2 remaining records")
+ updated_records = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(remaining_bibcodes))
+ .all()
+ )
+ self.assertEqual(
+ len(updated_records), 2, "Should have 2 remaining records"
+ )
for record in updated_records:
- self.assertFalse(record.update_flag, f"Update flag should be False for {record.bibcode}")
-
+ self.assertFalse(
+ record.update_flag,
+ f"Update flag should be False for {record.bibcode}",
+ )
+
finally:
# Cleanup
- self.app.conf['SITEMAP_DIR'] = original_sitemap_dir
+ self.app.conf["SITEMAP_DIR"] = original_sitemap_dir
if original_sites:
- self.app.conf['SITES'] = original_sites
-
+ self.app.conf["SITES"] = original_sites
+
try:
shutil.rmtree(temp_dir)
except OSError:
@@ -2099,68 +2518,75 @@ def test_task_update_sitemap_files_after_record_deletion(self):
def test_task_update_sitemap_files_no_updates_needed(self):
"""Test task_update_sitemap_files when no files need updating"""
-
+
# Create temporary sitemap directory
temp_dir = tempfile.mkdtemp()
- original_sitemap_dir = self.app.conf.get('SITEMAP_DIR')
- self.app.conf['SITEMAP_DIR'] = temp_dir
-
+ original_sitemap_dir = self.app.conf.get("SITEMAP_DIR")
+ self.app.conf["SITEMAP_DIR"] = temp_dir
+
try:
# Setup test data with NO update flags set
- test_bibcodes = ['2023NoUpdateTest1A', '2023NoUpdateTest2B']
-
+ test_bibcodes = ["2023NoUpdateTest1A", "2023NoUpdateTest2B"]
+
with self.app.session_scope() as session:
# Clean up any existing test data
- session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023NoUpdateTest%')).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode.like('2023NoUpdateTest%')).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode.like("2023NoUpdateTest%")
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(
+ Records.bibcode.like("2023NoUpdateTest%")
+ ).delete(synchronize_session=False)
session.commit()
-
+
# Create Records and SitemapInfo entries WITHOUT update flag
for bibcode in test_bibcodes:
record = Records()
record.bibcode = bibcode
record.bib_data = f'{{"title": "No Update Record", "year": 2023}}'
record.bib_data_updated = get_date() - timedelta(days=1)
- record.status = 'success'
+ record.status = "success"
session.add(record)
session.flush()
-
+
sitemap_record = SitemapInfo()
sitemap_record.bibcode = bibcode
sitemap_record.record_id = record.id
- sitemap_record.sitemap_filename = 'sitemap_bib_no_update.xml'
+ sitemap_record.sitemap_filename = "sitemap_bib_no_update.xml"
sitemap_record.update_flag = False # No update needed
session.add(sitemap_record)
-
+
session.commit()
-
+
# Configure sites
- sites_config = {'ads': {'name': 'ADS'}}
- original_sites = self.app.conf.get('SITES')
- self.app.conf['SITES'] = sites_config
-
+ sites_config = {"ads": {"name": "ADS"}}
+ original_sites = self.app.conf.get("SITES")
+ self.app.conf["SITES"] = sites_config
+
# Create site directory
- site_dir = os.path.join(temp_dir, 'ads')
+ site_dir = os.path.join(temp_dir, "ads")
os.makedirs(site_dir, exist_ok=True)
-
+
# Execute the workflow - should only regenerate index
- with patch('adsmp.tasks.update_sitemap_index') as mock_update_index:
+ with patch("adsmp.tasks.update_sitemap_index") as mock_update_index:
mock_update_index.return_value = True
tasks.task_update_sitemap_files()
-
+
# Verify no individual sitemap files were created (since no updates needed)
- sitemap_file = os.path.join(temp_dir, 'ads', 'sitemap_bib_no_update.xml')
- self.assertFalse(os.path.exists(sitemap_file), "No sitemap files should be generated when no updates needed")
-
+ sitemap_file = os.path.join(temp_dir, "ads", "sitemap_bib_no_update.xml")
+ self.assertFalse(
+ os.path.exists(sitemap_file),
+ "No sitemap files should be generated when no updates needed",
+ )
+
# Verify the index update function was called (even when no files need updating)
mock_update_index.assert_called_once_with()
-
+
finally:
# Cleanup
- self.app.conf['SITEMAP_DIR'] = original_sitemap_dir
+ self.app.conf["SITEMAP_DIR"] = original_sitemap_dir
if original_sites:
- self.app.conf['SITES'] = original_sites
-
+ self.app.conf["SITES"] = original_sites
+
try:
shutil.rmtree(temp_dir)
except OSError:
@@ -2168,126 +2594,154 @@ def test_task_update_sitemap_files_no_updates_needed(self):
def test_task_update_sitemap_files_multiple_files(self):
"""Test task_update_sitemap_files with multiple sitemap files needing updates"""
-
+
# Create temporary sitemap directory
temp_dir = tempfile.mkdtemp()
- original_sitemap_dir = self.app.conf.get('SITEMAP_DIR')
- self.app.conf['SITEMAP_DIR'] = temp_dir
-
+ original_sitemap_dir = self.app.conf.get("SITEMAP_DIR")
+ self.app.conf["SITEMAP_DIR"] = temp_dir
+
try:
# Setup test data across multiple sitemap files
- file1_bibcodes = ['2023MultiFile1A', '2023MultiFile1B']
- file2_bibcodes = ['2023MultiFile2A', '2023MultiFile2B']
+ file1_bibcodes = ["2023MultiFile1A", "2023MultiFile1B"]
+ file2_bibcodes = ["2023MultiFile2A", "2023MultiFile2B"]
all_bibcodes = file1_bibcodes + file2_bibcodes
-
+
with self.app.session_scope() as session:
# Clean up any existing test data
- session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023MultiFile%')).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode.like('2023MultiFile%')).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode.like("2023MultiFile%")
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(
+ Records.bibcode.like("2023MultiFile%")
+ ).delete(synchronize_session=False)
session.commit()
-
+
# Create Records and SitemapInfo for file 1
for bibcode in file1_bibcodes:
record = Records()
record.bibcode = bibcode
- record.bib_data = f'{{"title": "Multi File Record 1", "year": 2023}}'
+ record.bib_data = (
+ f'{{"title": "Multi File Record 1", "year": 2023}}'
+ )
record.bib_data_updated = get_date() - timedelta(days=1)
- record.status = 'success'
+ record.status = "success"
session.add(record)
session.flush()
-
+
sitemap_record = SitemapInfo()
sitemap_record.bibcode = bibcode
sitemap_record.record_id = record.id
- sitemap_record.sitemap_filename = 'sitemap_bib_multi_1.xml'
+ sitemap_record.sitemap_filename = "sitemap_bib_multi_1.xml"
sitemap_record.update_flag = True
session.add(sitemap_record)
-
+
# Create Records and SitemapInfo for file 2
for bibcode in file2_bibcodes:
record = Records()
record.bibcode = bibcode
- record.bib_data = f'{{"title": "Multi File Record 2", "year": 2023}}'
+ record.bib_data = (
+ f'{{"title": "Multi File Record 2", "year": 2023}}'
+ )
record.bib_data_updated = get_date() - timedelta(days=1)
- record.status = 'success'
+ record.status = "success"
session.add(record)
session.flush()
-
+
sitemap_record = SitemapInfo()
sitemap_record.bibcode = bibcode
sitemap_record.record_id = record.id
- sitemap_record.sitemap_filename = 'sitemap_bib_multi_2.xml'
+ sitemap_record.sitemap_filename = "sitemap_bib_multi_2.xml"
sitemap_record.update_flag = True
session.add(sitemap_record)
-
+
session.commit()
-
+
# Configure sites
- sites_config = {'ads': {'name': 'ADS'}}
- original_sites = self.app.conf.get('SITES')
- self.app.conf['SITES'] = sites_config
-
+ sites_config = {"ads": {"name": "ADS"}}
+ original_sites = self.app.conf.get("SITES")
+ self.app.conf["SITES"] = sites_config
+
# Create site directory
- site_dir = os.path.join(temp_dir, 'ads')
+ site_dir = os.path.join(temp_dir, "ads")
os.makedirs(site_dir, exist_ok=True)
-
+
# Execute the workflow synchronously
with self.app.session_scope() as session:
- files_to_generate = session.query(SitemapInfo.sitemap_filename).filter(
- SitemapInfo.update_flag == True
- ).distinct().all()
-
+ files_to_generate = (
+ session.query(SitemapInfo.sitemap_filename)
+ .filter(SitemapInfo.update_flag == True)
+ .distinct()
+ .all()
+ )
+
# Generate each sitemap file synchronously
for (filename,) in files_to_generate:
with self.app.session_scope() as session:
- record_ids = session.query(SitemapInfo.id).filter(
- SitemapInfo.sitemap_filename == filename,
- SitemapInfo.update_flag == True
- ).all()
+ record_ids = (
+ session.query(SitemapInfo.id)
+ .filter(
+ SitemapInfo.sitemap_filename == filename,
+ SitemapInfo.update_flag == True,
+ )
+ .all()
+ )
record_ids = [r[0] for r in record_ids]
-
+
tasks.task_generate_single_sitemap(filename, record_ids)
-
+
# Generate the index
tasks.task_generate_sitemap_index()
-
+
# Verify both sitemap files were created
- file1_path = os.path.join(temp_dir, 'ads', 'sitemap_bib_multi_1.xml')
- file2_path = os.path.join(temp_dir, 'ads', 'sitemap_bib_multi_2.xml')
-
- self.assertTrue(os.path.exists(file1_path), "First sitemap file should exist")
- self.assertTrue(os.path.exists(file2_path), "Second sitemap file should exist")
-
+ file1_path = os.path.join(temp_dir, "ads", "sitemap_bib_multi_1.xml")
+ file2_path = os.path.join(temp_dir, "ads", "sitemap_bib_multi_2.xml")
+
+ self.assertTrue(
+ os.path.exists(file1_path), "First sitemap file should exist"
+ )
+ self.assertTrue(
+ os.path.exists(file2_path), "Second sitemap file should exist"
+ )
+
# Verify file contents
- with open(file1_path, 'r') as f:
+ with open(file1_path, "r") as f:
content1 = f.read()
for bibcode in file1_bibcodes:
self.assertIn(bibcode, content1, f"File 1 should contain {bibcode}")
for bibcode in file2_bibcodes:
- self.assertNotIn(bibcode, content1, f"File 1 should not contain {bibcode}")
-
- with open(file2_path, 'r') as f:
+ self.assertNotIn(
+ bibcode, content1, f"File 1 should not contain {bibcode}"
+ )
+
+ with open(file2_path, "r") as f:
content2 = f.read()
for bibcode in file2_bibcodes:
self.assertIn(bibcode, content2, f"File 2 should contain {bibcode}")
for bibcode in file1_bibcodes:
- self.assertNotIn(bibcode, content2, f"File 2 should not contain {bibcode}")
-
+ self.assertNotIn(
+ bibcode, content2, f"File 2 should not contain {bibcode}"
+ )
+
# Verify all update flags were reset
with self.app.session_scope() as session:
- updated_records = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(all_bibcodes)
- ).all()
+ updated_records = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(all_bibcodes))
+ .all()
+ )
self.assertEqual(len(updated_records), 4, "Should have 4 total records")
for record in updated_records:
- self.assertFalse(record.update_flag, f"Update flag should be False for {record.bibcode}")
-
+ self.assertFalse(
+ record.update_flag,
+ f"Update flag should be False for {record.bibcode}",
+ )
+
finally:
# Cleanup
- self.app.conf['SITEMAP_DIR'] = original_sitemap_dir
+ self.app.conf["SITEMAP_DIR"] = original_sitemap_dir
if original_sites:
- self.app.conf['SITES'] = original_sites
-
+ self.app.conf["SITES"] = original_sites
+
try:
shutil.rmtree(temp_dir)
except OSError:
@@ -2295,34 +2749,42 @@ def test_task_update_sitemap_files_multiple_files(self):
def test_task_generate_single_sitemap_multi_site(self):
"""Test generating sitemap files for multiple sites (ADS + SciX) with multiple records"""
-
+
# Setup test data with multiple bibcodes to create multiple files
test_bibcodes = [
- '2023Multi..1..1A', '2023Multi..1..2B', '2023Multi..1..3C',
- '2023Multi..2..1D', '2023Multi..2..2E', '2023Multi..2..3F',
- '2023Multi..3..1G', '2023Multi..3..2H', '2023Multi..3..3I'
+ "2023Multi..1..1A",
+ "2023Multi..1..2B",
+ "2023Multi..1..3C",
+ "2023Multi..2..1D",
+ "2023Multi..2..2E",
+ "2023Multi..2..3F",
+ "2023Multi..3..1G",
+ "2023Multi..3..2H",
+ "2023Multi..3..3I",
]
-
+
# Override MAX_RECORDS_PER_SITEMAP to force multiple files
- original_max_records = self.app.conf.get('MAX_RECORDS_PER_SITEMAP', 50000)
- self.app.conf['MAX_RECORDS_PER_SITEMAP'] = 3 # Force 3 files with 3 records each
-
+ original_max_records = self.app.conf.get("MAX_RECORDS_PER_SITEMAP", 50000)
+ self.app.conf[
+ "MAX_RECORDS_PER_SITEMAP"
+ ] = 3 # Force 3 files with 3 records each
+
try:
with self.app.session_scope() as session:
# Track record IDs by filename for efficient file generation
file_record_mapping = {}
-
+
for i, bibcode in enumerate(test_bibcodes):
record = Records()
record.bibcode = bibcode
record.bib_data = f'{{"title": "Multi-site Test {i+1}"}}'
record.bib_data_updated = get_date() - timedelta(days=1)
- record.status = 'success'
+ record.status = "success"
session.add(record)
session.flush()
-
+
# Assign records to different sitemap files
- filename = f'sitemap_bib_{(i // 3) + 1}.xml' # 3 records per file
+ filename = f"sitemap_bib_{(i // 3) + 1}.xml" # 3 records per file
sitemap_record = SitemapInfo()
sitemap_record.bibcode = bibcode
sitemap_record.record_id = record.id
@@ -2331,865 +2793,1274 @@ def test_task_generate_single_sitemap_multi_site(self):
sitemap_record.update_flag = True
session.add(sitemap_record)
session.flush()
-
+
# Group record IDs by filename
if filename not in file_record_mapping:
file_record_mapping[filename] = []
file_record_mapping[filename].append(sitemap_record.id)
-
+
session.commit()
-
+
with tempfile.TemporaryDirectory() as temp_dir:
- self.app.conf['SITEMAP_DIR'] = temp_dir
-
+ self.app.conf["SITEMAP_DIR"] = temp_dir
+
# Create site directories
- ads_dir = os.path.join(temp_dir, 'ads')
- scix_dir = os.path.join(temp_dir, 'scix')
+ ads_dir = os.path.join(temp_dir, "ads")
+ scix_dir = os.path.join(temp_dir, "scix")
os.makedirs(ads_dir, exist_ok=True)
os.makedirs(scix_dir, exist_ok=True)
-
+
# Execute sitemap generation for each file using pre-collected mapping
for filename, file_record_ids in file_record_mapping.items():
tasks.task_generate_single_sitemap(filename, file_record_ids)
-
+
# Verify all files were created for both sites
for filename in file_record_mapping.keys():
ads_file = os.path.join(ads_dir, filename)
scix_file = os.path.join(scix_dir, filename)
-
- self.assertTrue(os.path.exists(ads_file), f"Should create ADS {filename}")
- self.assertTrue(os.path.exists(scix_file), f"Should create SciX {filename}")
-
+
+ self.assertTrue(
+ os.path.exists(ads_file), f"Should create ADS {filename}"
+ )
+ self.assertTrue(
+ os.path.exists(scix_file), f"Should create SciX {filename}"
+ )
+
# Test file content validation for each file
- with open(ads_file, 'r', encoding='utf-8') as f:
+ with open(ads_file, "r", encoding="utf-8") as f:
ads_content = f.read()
- with open(scix_file, 'r', encoding='utf-8') as f:
+ with open(scix_file, "r", encoding="utf-8") as f:
scix_content = f.read()
-
+
# Verify XML structure for both sites
- self.assertIn('', ads_content, f"ADS {filename} should have XML declaration")
- self.assertIn('', ads_content, f"ADS {filename} should have urlset element")
- self.assertIn('', ads_content, f"ADS {filename} should close urlset element")
-
- self.assertIn('', scix_content, f"SciX {filename} should have XML declaration")
- self.assertIn('', scix_content, f"SciX {filename} should have urlset element")
- self.assertIn('', scix_content, f"SciX {filename} should close urlset element")
-
+ self.assertIn(
+ '',
+ ads_content,
+ f"ADS {filename} should have XML declaration",
+ )
+ self.assertIn(
+ '',
+ ads_content,
+ f"ADS {filename} should have urlset element",
+ )
+ self.assertIn(
+ "",
+ ads_content,
+ f"ADS {filename} should close urlset element",
+ )
+
+ self.assertIn(
+ '',
+ scix_content,
+ f"SciX {filename} should have XML declaration",
+ )
+ self.assertIn(
+ '',
+ scix_content,
+ f"SciX {filename} should have urlset element",
+ )
+ self.assertIn(
+ "",
+ scix_content,
+ f"SciX {filename} should close urlset element",
+ )
+
# Verify each file contains exactly 3 URL entries
- ads_url_count = ads_content.count('')
- scix_url_count = scix_content.count('')
- self.assertEqual(ads_url_count, 3, f"ADS {filename} should contain exactly 3 URL entries")
- self.assertEqual(scix_url_count, 3, f"SciX {filename} should contain exactly 3 URL entries")
-
+ ads_url_count = ads_content.count("")
+ scix_url_count = scix_content.count("")
+ self.assertEqual(
+ ads_url_count,
+ 3,
+ f"ADS {filename} should contain exactly 3 URL entries",
+ )
+ self.assertEqual(
+ scix_url_count,
+ 3,
+ f"SciX {filename} should contain exactly 3 URL entries",
+ )
+
# Verify lastmod elements are present
- self.assertIn('', ads_content, f"ADS {filename} should contain lastmod elements")
- self.assertIn('', scix_content, f"SciX {filename} should contain lastmod elements")
-
+ self.assertIn(
+ "",
+ ads_content,
+ f"ADS {filename} should contain lastmod elements",
+ )
+ self.assertIn(
+ "",
+ scix_content,
+ f"SciX {filename} should contain lastmod elements",
+ )
+
# Test specific bibcode content in files
file_bibcode_mapping = {
- 'sitemap_bib_1.xml': test_bibcodes[0:3], # First 3 bibcodes
- 'sitemap_bib_2.xml': test_bibcodes[3:6], # Next 3 bibcodes
- 'sitemap_bib_3.xml': test_bibcodes[6:9], # Last 3 bibcodes
+ "sitemap_bib_1.xml": test_bibcodes[0:3], # First 3 bibcodes
+ "sitemap_bib_2.xml": test_bibcodes[3:6], # Next 3 bibcodes
+ "sitemap_bib_3.xml": test_bibcodes[6:9], # Last 3 bibcodes
}
-
+
for filename, expected_bibcodes in file_bibcode_mapping.items():
ads_file = os.path.join(ads_dir, filename)
scix_file = os.path.join(scix_dir, filename)
-
- with open(ads_file, 'r', encoding='utf-8') as f:
+
+ with open(ads_file, "r", encoding="utf-8") as f:
ads_content = f.read()
- with open(scix_file, 'r', encoding='utf-8') as f:
+ with open(scix_file, "r", encoding="utf-8") as f:
scix_content = f.read()
-
+
# Verify each expected bibcode appears in the correct file
for bibcode in expected_bibcodes:
escaped_bibcode = html.escape(bibcode)
- ads_url = f'https://ui.adsabs.harvard.edu/abs/{escaped_bibcode}'
- scix_url = f'https://scixplorer.org/abs/{escaped_bibcode}'
-
- self.assertIn(f'{ads_url}', ads_content, f"ADS {filename} should contain URL for {bibcode}")
- self.assertIn(f'{scix_url}', scix_content, f"SciX {filename} should contain URL for {bibcode}")
-
+ ads_url = f"https://ui.adsabs.harvard.edu/abs/{escaped_bibcode}"
+ scix_url = f"https://scixplorer.org/abs/{escaped_bibcode}"
+
+ self.assertIn(
+ f"{ads_url}",
+ ads_content,
+ f"ADS {filename} should contain URL for {bibcode}",
+ )
+ self.assertIn(
+ f"{scix_url}",
+ scix_content,
+ f"SciX {filename} should contain URL for {bibcode}",
+ )
+
# Verify total record distribution across all files
total_ads_urls = 0
total_scix_urls = 0
for filename in file_record_mapping.keys():
- with open(os.path.join(ads_dir, filename), 'r') as f:
- total_ads_urls += f.read().count('')
- with open(os.path.join(scix_dir, filename), 'r') as f:
- total_scix_urls += f.read().count('')
-
- self.assertEqual(total_ads_urls, 9, "Total ADS URLs across all files should be 9")
- self.assertEqual(total_scix_urls, 9, "Total SciX URLs across all files should be 9")
-
+ with open(os.path.join(ads_dir, filename), "r") as f:
+ total_ads_urls += f.read().count("")
+ with open(os.path.join(scix_dir, filename), "r") as f:
+ total_scix_urls += f.read().count("")
+
+ self.assertEqual(
+ total_ads_urls, 9, "Total ADS URLs across all files should be 9"
+ )
+ self.assertEqual(
+ total_scix_urls, 9, "Total SciX URLs across all files should be 9"
+ )
+
finally:
# Restore original configuration
- self.app.conf['MAX_RECORDS_PER_SITEMAP'] = original_max_records
+ self.app.conf["MAX_RECORDS_PER_SITEMAP"] = original_max_records
def test_task_update_robots_files_creation(self):
"""Test robots.txt file creation for multiple sites with content validation"""
-
+
with tempfile.TemporaryDirectory() as temp_dir:
- self.app.conf['SITEMAP_DIR'] = temp_dir
-
+ self.app.conf["SITEMAP_DIR"] = temp_dir
+
# Create site directories
- ads_dir = os.path.join(temp_dir, 'ads')
- scix_dir = os.path.join(temp_dir, 'scix')
+ ads_dir = os.path.join(temp_dir, "ads")
+ scix_dir = os.path.join(temp_dir, "scix")
os.makedirs(ads_dir, exist_ok=True)
os.makedirs(scix_dir, exist_ok=True)
-
+
# Execute robots.txt update
-
+
result = update_robots_files(True)
-
+
# Verify function completed
self.assertTrue(isinstance(result, bool), "Should return boolean result")
-
+
# Verify robots.txt files were created for both sites
- ads_robots = os.path.join(ads_dir, 'robots.txt')
- scix_robots = os.path.join(scix_dir, 'robots.txt')
- self.assertTrue(os.path.exists(ads_robots), "Should create ADS robots.txt file")
- self.assertTrue(os.path.exists(scix_robots), "Should create SciX robots.txt file")
-
+ ads_robots = os.path.join(ads_dir, "robots.txt")
+ scix_robots = os.path.join(scix_dir, "robots.txt")
+ self.assertTrue(
+ os.path.exists(ads_robots), "Should create ADS robots.txt file"
+ )
+ self.assertTrue(
+ os.path.exists(scix_robots), "Should create SciX robots.txt file"
+ )
+
# Test ADS robots.txt content
- with open(ads_robots, 'r', encoding='utf-8') as f:
+ with open(ads_robots, "r", encoding="utf-8") as f:
ads_robots_content = f.read()
-
+
# Verify ADS robots.txt content
- self.assertIn('User-agent: *', ads_robots_content, "ADS robots.txt should contain User-agent directive")
- self.assertIn('Sitemap: https://ui.adsabs.harvard.edu/sitemap/sitemap_index.xml', ads_robots_content, "ADS robots.txt should contain sitemap URL")
- self.assertIn('Disallow: /abs/', ads_robots_content, "ADS robots.txt should contain disallow directives")
-
+ self.assertIn(
+ "User-agent: *",
+ ads_robots_content,
+ "ADS robots.txt should contain User-agent directive",
+ )
+ self.assertIn(
+ "Sitemap: https://ui.adsabs.harvard.edu/sitemap/sitemap_index.xml",
+ ads_robots_content,
+ "ADS robots.txt should contain sitemap URL",
+ )
+ self.assertIn(
+ "Disallow: /abs/",
+ ads_robots_content,
+ "ADS robots.txt should contain disallow directives",
+ )
+
# Test SciX robots.txt content
- with open(scix_robots, 'r', encoding='utf-8') as f:
+ with open(scix_robots, "r", encoding="utf-8") as f:
scix_robots_content = f.read()
-
# Verify SciX robots.txt content
- self.assertIn('User-agent: *', scix_robots_content, "SciX robots.txt should contain User-agent directive")
- self.assertIn('Sitemap: https://scixplorer.org/sitemap/sitemap_index.xml', scix_robots_content, "SciX robots.txt should contain sitemap URL")
- self.assertIn('Disallow: /abs/', scix_robots_content, "SciX robots.txt should contain disallow directives")
-
+ self.assertIn(
+ "User-agent: *",
+ scix_robots_content,
+ "SciX robots.txt should contain User-agent directive",
+ )
+ self.assertIn(
+ "Sitemap: https://scixplorer.org/sitemap/sitemap_index.xml",
+ scix_robots_content,
+ "SciX robots.txt should contain sitemap URL",
+ )
+ self.assertIn(
+ "Disallow: /abs/",
+ scix_robots_content,
+ "SciX robots.txt should contain disallow directives",
+ )
+
# Verify robots.txt files are not empty
- self.assertGreater(len(ads_robots_content.strip()), 0, "ADS robots.txt should not be empty")
- self.assertGreater(len(scix_robots_content.strip()), 0, "SciX robots.txt should not be empty")
-
+ self.assertGreater(
+ len(ads_robots_content.strip()), 0, "ADS robots.txt should not be empty"
+ )
+ self.assertGreater(
+ len(scix_robots_content.strip()),
+ 0,
+ "SciX robots.txt should not be empty",
+ )
+
# Verify proper line endings and format
- self.assertTrue(ads_robots_content.endswith('\n'), "ADS robots.txt should end with newline")
- self.assertTrue(scix_robots_content.endswith('\n'), "SciX robots.txt should end with newline")
-
+ self.assertTrue(
+ ads_robots_content.endswith("\n"),
+ "ADS robots.txt should end with newline",
+ )
+ self.assertTrue(
+ scix_robots_content.endswith("\n"),
+ "SciX robots.txt should end with newline",
+ )
+
# Verify correct sitemap URLs - should match production URLs
- self.assertIn('Sitemap: https://ui.adsabs.harvard.edu/sitemap/sitemap_index.xml', ads_robots_content,
- "ADS robots.txt should contain correct sitemap URL")
- self.assertIn('Sitemap: https://scixplorer.org/sitemap/sitemap_index.xml', scix_robots_content,
- "SciX robots.txt should contain correct sitemap URL")
-
+ self.assertIn(
+ "Sitemap: https://ui.adsabs.harvard.edu/sitemap/sitemap_index.xml",
+ ads_robots_content,
+ "ADS robots.txt should contain correct sitemap URL",
+ )
+ self.assertIn(
+ "Sitemap: https://scixplorer.org/sitemap/sitemap_index.xml",
+ scix_robots_content,
+ "SciX robots.txt should contain correct sitemap URL",
+ )
+
# Verify we have the expected user agents (robots.txt contains intentional duplicates for different agents)
- self.assertIn('User-agent: Googlebot', ads_robots_content, "Should have Googlebot directives")
- self.assertIn('User-agent: msnbot', ads_robots_content, "Should have msnbot directives")
- self.assertIn('User-agent: Slurp', ads_robots_content, "Should have Slurp directives")
- self.assertIn('User-agent: *', ads_robots_content, "Should have wildcard user-agent directives")
-
+ self.assertIn(
+ "User-agent: Googlebot",
+ ads_robots_content,
+ "Should have Googlebot directives",
+ )
+ self.assertIn(
+ "User-agent: msnbot",
+ ads_robots_content,
+ "Should have msnbot directives",
+ )
+ self.assertIn(
+ "User-agent: Slurp", ads_robots_content, "Should have Slurp directives"
+ )
+ self.assertIn(
+ "User-agent: *",
+ ads_robots_content,
+ "Should have wildcard user-agent directives",
+ )
+
# Same for SciX
- self.assertIn('User-agent: Googlebot', scix_robots_content, "Should have Googlebot directives")
- self.assertIn('User-agent: msnbot', scix_robots_content, "Should have msnbot directives")
- self.assertIn('User-agent: Slurp', scix_robots_content, "Should have Slurp directives")
- self.assertIn('User-agent: *', scix_robots_content, "Should have wildcard user-agent directives")
+ self.assertIn(
+ "User-agent: Googlebot",
+ scix_robots_content,
+ "Should have Googlebot directives",
+ )
+ self.assertIn(
+ "User-agent: msnbot",
+ scix_robots_content,
+ "Should have msnbot directives",
+ )
+ self.assertIn(
+ "User-agent: Slurp", scix_robots_content, "Should have Slurp directives"
+ )
+ self.assertIn(
+ "User-agent: *",
+ scix_robots_content,
+ "Should have wildcard user-agent directives",
+ )
def test_task_update_sitemap_index_generation(self):
"""Test comprehensive sitemap index generation with actual files and database records"""
-
+
with tempfile.TemporaryDirectory() as temp_dir:
- self.app.conf['SITEMAP_DIR'] = temp_dir
-
+ self.app.conf["SITEMAP_DIR"] = temp_dir
+
# Create test records and sitemap entries in database
- test_bibcodes = ['2023Index..1..1A', '2023Index..1..2B', '2023Index..1..3C']
-
+ test_bibcodes = ["2023Index..1..1A", "2023Index..1..2B", "2023Index..1..3C"]
+
with self.app.session_scope() as session:
# Clear any existing data
session.query(SitemapInfo).delete(synchronize_session=False)
session.query(Records).delete(synchronize_session=False)
-
+
# Create Records entries
for i, bibcode in enumerate(test_bibcodes):
record = Records(
bibcode=bibcode,
bib_data='{"title": "Test Title"}',
bib_data_updated=get_date() - timedelta(days=1),
- status='success'
+ status="success",
)
session.add(record)
-
+
session.commit()
-
+
# Get record IDs for sitemap entries
- records = session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all()
+ records = (
+ session.query(Records)
+ .filter(Records.bibcode.in_(test_bibcodes))
+ .all()
+ )
record_map = {r.bibcode: r.id for r in records}
-
+
# Create SitemapInfo entries with different filenames
sitemap_entries = [
- {'bibcode': test_bibcodes[0], 'filename': 'sitemap_bib_1.xml', 'record_id': record_map[test_bibcodes[0]]},
- {'bibcode': test_bibcodes[1], 'filename': 'sitemap_bib_1.xml', 'record_id': record_map[test_bibcodes[1]]},
- {'bibcode': test_bibcodes[2], 'filename': 'sitemap_bib_2.xml', 'record_id': record_map[test_bibcodes[2]]},
+ {
+ "bibcode": test_bibcodes[0],
+ "filename": "sitemap_bib_1.xml",
+ "record_id": record_map[test_bibcodes[0]],
+ },
+ {
+ "bibcode": test_bibcodes[1],
+ "filename": "sitemap_bib_1.xml",
+ "record_id": record_map[test_bibcodes[1]],
+ },
+ {
+ "bibcode": test_bibcodes[2],
+ "filename": "sitemap_bib_2.xml",
+ "record_id": record_map[test_bibcodes[2]],
+ },
]
-
+
for entry in sitemap_entries:
sitemap_info = SitemapInfo(
- bibcode=entry['bibcode'],
- record_id=entry['record_id'],
- sitemap_filename=entry['filename'],
+ bibcode=entry["bibcode"],
+ record_id=entry["record_id"],
+ sitemap_filename=entry["filename"],
bib_data_updated=get_date() - timedelta(days=1),
filename_lastmoddate=get_date() - timedelta(hours=1),
- update_flag=False
+ update_flag=False,
)
session.add(sitemap_info)
-
+
session.commit()
-
+
# Create site directories and actual sitemap files
- sites_config = self.app.conf.get('SITES', {})
- expected_filenames = ['sitemap_bib_1.xml', 'sitemap_bib_2.xml']
-
+ sites_config = self.app.conf.get("SITES", {})
+ expected_filenames = ["sitemap_bib_1.xml", "sitemap_bib_2.xml"]
+
for site_key in sites_config.keys():
site_dir = os.path.join(temp_dir, site_key)
os.makedirs(site_dir, exist_ok=True)
-
+
# Create actual sitemap files with content
for filename in expected_filenames:
sitemap_path = os.path.join(site_dir, filename)
- with open(sitemap_path, 'w', encoding='utf-8') as f:
- f.write(f'\n{filename}')
-
+ with open(sitemap_path, "w", encoding="utf-8") as f:
+ f.write(
+ f'\n{filename}'
+ )
+
# Execute sitemap index update
result = update_sitemap_index()
-
+
# Verify function completed successfully
- self.assertTrue(result, "update_sitemap_index should return True on success")
-
+ self.assertTrue(
+ result, "update_sitemap_index should return True on success"
+ )
+
# Verify sitemap_index.xml files were created for each site
for site_key, site_config in sites_config.items():
site_dir = os.path.join(temp_dir, site_key)
- index_path = os.path.join(site_dir, 'sitemap_index.xml')
-
- self.assertTrue(os.path.exists(index_path),
- f"sitemap_index.xml should be created for site {site_key}")
-
+ index_path = os.path.join(site_dir, "sitemap_index.xml")
+
+ self.assertTrue(
+ os.path.exists(index_path),
+ f"sitemap_index.xml should be created for site {site_key}",
+ )
+
# Verify index file content
- with open(index_path, 'r', encoding='utf-8') as f:
+ with open(index_path, "r", encoding="utf-8") as f:
index_content = f.read()
-
+
# Should contain XML structure
- self.assertIn('', index_content,
- "Index should contain XML declaration")
- self.assertIn('', index_content,
- "Index should close sitemapindex element")
-
+ self.assertIn(
+ '',
+ index_content,
+ "Index should contain XML declaration",
+ )
+ self.assertIn(
+ "",
+ index_content,
+ "Index should close sitemapindex element",
+ )
+
# Should contain entries for each sitemap file that exists (production URL structure)
- sitemap_base_url = site_config.get('sitemap_url', 'https://ui.adsabs.harvard.edu/sitemap')
+ sitemap_base_url = site_config.get(
+ "sitemap_url", "https://ui.adsabs.harvard.edu/sitemap"
+ )
for filename in expected_filenames:
expected_url = f"{sitemap_base_url}/{filename}"
- self.assertIn(f'{html.escape(expected_url)}', index_content,
- f"Index should reference {filename} with correct URL")
- self.assertIn('', index_content,
- "Index should contain lastmod elements")
-
+ self.assertIn(
+ f"{html.escape(expected_url)}",
+ index_content,
+ f"Index should reference {filename} with correct URL",
+ )
+ self.assertIn(
+ "",
+ index_content,
+ "Index should contain lastmod elements",
+ )
+
# Verify we have the expected number of sitemap entries (2 bib files + 1 static)
- sitemap_count = index_content.count('')
- self.assertEqual(sitemap_count, 3,
- f"Index should contain exactly 3 sitemap entries (2 bib + 1 static), found {sitemap_count}")
-
+ sitemap_count = index_content.count("")
+ self.assertEqual(
+ sitemap_count,
+ 3,
+ f"Index should contain exactly 3 sitemap entries (2 bib + 1 static), found {sitemap_count}",
+ )
+
# Test cleanup
with self.app.session_scope() as session:
- session.query(SitemapInfo).filter(SitemapInfo.bibcode.in_(test_bibcodes)).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode.in_(test_bibcodes)
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(
+ Records.bibcode.in_(test_bibcodes)
+ ).delete(synchronize_session=False)
session.commit()
-
+
def test_task_update_sitemap_index_empty_database(self):
"""Test sitemap index generation when no sitemap files exist in database"""
-
+
with tempfile.TemporaryDirectory() as temp_dir:
- self.app.conf['SITEMAP_DIR'] = temp_dir
-
+ self.app.conf["SITEMAP_DIR"] = temp_dir
+
# Ensure database has no sitemap entries
with self.app.session_scope() as session:
session.query(SitemapInfo).delete(synchronize_session=False)
session.commit()
-
+
# Execute sitemap index update
result = update_sitemap_index()
-
+
# Should still succeed (generates empty index files)
- self.assertTrue(result, "update_sitemap_index should return True even with empty database")
-
+ self.assertTrue(
+ result,
+ "update_sitemap_index should return True even with empty database",
+ )
+
# Verify empty sitemap_index.xml files were created
- sites_config = self.app.conf.get('SITES', {})
+ sites_config = self.app.conf.get("SITES", {})
for site_key in sites_config.keys():
site_dir = os.path.join(temp_dir, site_key)
- index_path = os.path.join(site_dir, 'sitemap_index.xml')
-
- self.assertTrue(os.path.exists(index_path),
- f"Empty sitemap_index.xml should be created for site {site_key}")
-
+ index_path = os.path.join(site_dir, "sitemap_index.xml")
+
+ self.assertTrue(
+ os.path.exists(index_path),
+ f"Empty sitemap_index.xml should be created for site {site_key}",
+ )
+
# Verify empty index file content
- with open(index_path, 'r', encoding='utf-8') as f:
+ with open(index_path, "r", encoding="utf-8") as f:
index_content = f.read()
-
+
# Should contain XML structure but no sitemap entries
- self.assertIn('', index_content,
- "Empty index should contain XML declaration")
- self.assertIn('', index_content,
- "Empty index should close sitemapindex element")
-
+ self.assertIn(
+ '',
+ index_content,
+ "Empty index should contain XML declaration",
+ )
+ self.assertIn(
+ "",
+ index_content,
+ "Empty index should close sitemapindex element",
+ )
+
# Should contain only static sitemap entry (1 entry)
- sitemap_count = index_content.count('')
- self.assertEqual(sitemap_count, 1,
- f"Empty index should contain only static sitemap entry, found {sitemap_count}")
-
+ sitemap_count = index_content.count("")
+ self.assertEqual(
+ sitemap_count,
+ 1,
+ f"Empty index should contain only static sitemap entry, found {sitemap_count}",
+ )
+
def test_task_update_sitemap_index_missing_files(self):
"""Test sitemap index generation when database has entries but physical files don't exist"""
-
+
with tempfile.TemporaryDirectory() as temp_dir:
- self.app.conf['SITEMAP_DIR'] = temp_dir
-
+ self.app.conf["SITEMAP_DIR"] = temp_dir
+
# Create database entries but no physical files
- test_bibcode = '2023Missing..1..1A'
-
+ test_bibcode = "2023Missing..1..1A"
+
with self.app.session_scope() as session:
# Clear existing data
session.query(SitemapInfo).delete(synchronize_session=False)
session.query(Records).delete(synchronize_session=False)
-
+
# Create a Record entry
record = Records(
bibcode=test_bibcode,
bib_data='{"title": "Test Title"}',
bib_data_updated=get_date() - timedelta(days=1),
- status='success'
+ status="success",
)
session.add(record)
session.commit()
-
+
# Create SitemapInfo entry
sitemap_info = SitemapInfo(
bibcode=test_bibcode,
record_id=record.id,
- sitemap_filename='sitemap_bib_missing.xml',
+ sitemap_filename="sitemap_bib_missing.xml",
bib_data_updated=get_date() - timedelta(days=1),
filename_lastmoddate=get_date() - timedelta(hours=1),
- update_flag=False
+ update_flag=False,
)
session.add(sitemap_info)
session.commit()
-
+
# Create site directories but NO sitemap files
- sites_config = self.app.conf.get('SITES', {})
+ sites_config = self.app.conf.get("SITES", {})
for site_key in sites_config.keys():
site_dir = os.path.join(temp_dir, site_key)
os.makedirs(site_dir, exist_ok=True)
# Deliberately NOT creating the sitemap_bib_missing.xml file
-
+
# Execute sitemap index update
result = update_sitemap_index()
-
+
# Should still succeed
- self.assertTrue(result, "update_sitemap_index should return True even when files are missing")
-
+ self.assertTrue(
+ result,
+ "update_sitemap_index should return True even when files are missing",
+ )
+
# Verify empty sitemap_index.xml files were created (no entries since files don't exist)
for site_key in sites_config.keys():
site_dir = os.path.join(temp_dir, site_key)
- index_path = os.path.join(site_dir, 'sitemap_index.xml')
-
- self.assertTrue(os.path.exists(index_path),
- f"sitemap_index.xml should be created for site {site_key}")
-
+ index_path = os.path.join(site_dir, "sitemap_index.xml")
+
+ self.assertTrue(
+ os.path.exists(index_path),
+ f"sitemap_index.xml should be created for site {site_key}",
+ )
+
# Verify index file has no entries (since physical files don't exist)
- with open(index_path, 'r', encoding='utf-8') as f:
+ with open(index_path, "r", encoding="utf-8") as f:
index_content = f.read()
-
- sitemap_count = index_content.count('')
- self.assertEqual(sitemap_count, 1,
- f"Index should contain only static sitemap when physical files missing, found {sitemap_count}")
-
+
+ sitemap_count = index_content.count("")
+ self.assertEqual(
+ sitemap_count,
+ 1,
+ f"Index should contain only static sitemap when physical files missing, found {sitemap_count}",
+ )
+
# Test cleanup
with self.app.session_scope() as session:
- session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode == test_bibcode).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode == test_bibcode
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(Records.bibcode == test_bibcode).delete(
+ synchronize_session=False
+ )
session.commit()
def test_task_generate_sitemap_index(self):
"""Test the Celery task wrapper for sitemap index generation"""
-
+
with tempfile.TemporaryDirectory() as temp_dir:
- self.app.conf['SITEMAP_DIR'] = temp_dir
-
+ self.app.conf["SITEMAP_DIR"] = temp_dir
+
# Create test data in database first (required for sitemap index generation)
- test_bibcodes = ['2023TaskIndex..1..1A', '2023TaskIndex..1..2B', '2023TaskIndex..1..3C']
+ test_bibcodes = [
+ "2023TaskIndex..1..1A",
+ "2023TaskIndex..1..2B",
+ "2023TaskIndex..1..3C",
+ ]
# Use production-like filenames (compressed format with zero-padded numbers)
- sample_sitemaps = ['sitemap_bib.0001.xml.gz', 'sitemap_bib.0002.xml.gz', 'sitemap_bib.0003.xml.gz']
-
+ sample_sitemaps = [
+ "sitemap_bib.0001.xml.gz",
+ "sitemap_bib.0002.xml.gz",
+ "sitemap_bib.0003.xml.gz",
+ ]
+
with self.app.session_scope() as session:
# Clear existing data
session.query(SitemapInfo).delete(synchronize_session=False)
session.query(Records).delete(synchronize_session=False)
-
+
# Create Records entries
for i, bibcode in enumerate(test_bibcodes):
record = Records(
bibcode=bibcode,
bib_data='{"title": "Test Title"}',
bib_data_updated=get_date() - timedelta(days=1),
- status='success'
+ status="success",
)
session.add(record)
-
+
session.commit()
-
+
# Get record IDs for sitemap entries
- records = session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all()
+ records = (
+ session.query(Records)
+ .filter(Records.bibcode.in_(test_bibcodes))
+ .all()
+ )
record_map = {r.bibcode: r.id for r in records}
-
+
# Create SitemapInfo entries for different filenames
sitemap_mappings = [
- {'bibcode': test_bibcodes[0], 'filename': sample_sitemaps[0]},
- {'bibcode': test_bibcodes[1], 'filename': sample_sitemaps[1]},
- {'bibcode': test_bibcodes[2], 'filename': sample_sitemaps[2]},
+ {"bibcode": test_bibcodes[0], "filename": sample_sitemaps[0]},
+ {"bibcode": test_bibcodes[1], "filename": sample_sitemaps[1]},
+ {"bibcode": test_bibcodes[2], "filename": sample_sitemaps[2]},
]
-
+
for mapping in sitemap_mappings:
sitemap_info = SitemapInfo(
- bibcode=mapping['bibcode'],
- record_id=record_map[mapping['bibcode']],
- sitemap_filename=mapping['filename'],
+ bibcode=mapping["bibcode"],
+ record_id=record_map[mapping["bibcode"]],
+ sitemap_filename=mapping["filename"],
bib_data_updated=get_date() - timedelta(days=1),
filename_lastmoddate=get_date() - timedelta(hours=1),
- update_flag=False
+ update_flag=False,
)
session.add(sitemap_info)
-
+
session.commit()
-
+
# Create site directories and physical sitemap files
- ads_dir = os.path.join(temp_dir, 'ads')
- scix_dir = os.path.join(temp_dir, 'scix')
+ ads_dir = os.path.join(temp_dir, "ads")
+ scix_dir = os.path.join(temp_dir, "scix")
os.makedirs(ads_dir, exist_ok=True)
os.makedirs(scix_dir, exist_ok=True)
-
+
for filename in sample_sitemaps:
# Create sample XML files for both sites
- with open(os.path.join(ads_dir, filename), 'w') as f:
+ with open(os.path.join(ads_dir, filename), "w") as f:
f.write('')
- with open(os.path.join(scix_dir, filename), 'w') as f:
+ with open(os.path.join(scix_dir, filename), "w") as f:
f.write('')
-
+
# Execute the Celery task
try:
tasks.task_generate_sitemap_index()
success = True
except Exception as e:
success = False
-
+
# Verify task executed without errors
self.assertTrue(success, "Task should execute without errors")
-
+
# Verify sitemap index files were created for both sites
- ads_index = os.path.join(ads_dir, 'sitemap_index.xml')
- scix_index = os.path.join(scix_dir, 'sitemap_index.xml')
- self.assertTrue(os.path.exists(ads_index), "Should create ADS sitemap index file")
- self.assertTrue(os.path.exists(scix_index), "Should create SciX sitemap index file")
-
+ ads_index = os.path.join(ads_dir, "sitemap_index.xml")
+ scix_index = os.path.join(scix_dir, "sitemap_index.xml")
+ self.assertTrue(
+ os.path.exists(ads_index), "Should create ADS sitemap index file"
+ )
+ self.assertTrue(
+ os.path.exists(scix_index), "Should create SciX sitemap index file"
+ )
+
# Test ADS sitemap index content
- with open(ads_index, 'r', encoding='utf-8') as f:
+ with open(ads_index, "r", encoding="utf-8") as f:
ads_index_content = f.read()
-
+
# Verify XML structure for ADS index
- self.assertIn('', ads_index_content, "ADS index should have XML declaration")
- self.assertIn('', ads_index_content, "ADS index should have sitemapindex element")
- self.assertIn('', ads_index_content, "ADS index should close sitemapindex element")
-
+ self.assertIn(
+ '',
+ ads_index_content,
+ "ADS index should have XML declaration",
+ )
+ self.assertIn(
+ '',
+ ads_index_content,
+ "ADS index should have sitemapindex element",
+ )
+ self.assertIn(
+ "",
+ ads_index_content,
+ "ADS index should close sitemapindex element",
+ )
+
# Verify all sample sitemaps are referenced in ADS index (production URL structure)
for filename in sample_sitemaps:
- sitemap_url = f'https://ui.adsabs.harvard.edu/sitemap/{filename}'
- self.assertIn(f'{html.escape(sitemap_url)}', ads_index_content, f"ADS index should reference {filename}")
- self.assertIn('', ads_index_content, "ADS index should contain lastmod elements")
-
+ sitemap_url = f"https://ui.adsabs.harvard.edu/sitemap/{filename}"
+ self.assertIn(
+ f"{html.escape(sitemap_url)}",
+ ads_index_content,
+ f"ADS index should reference {filename}",
+ )
+ self.assertIn(
+ "",
+ ads_index_content,
+ "ADS index should contain lastmod elements",
+ )
+
# Test SciX sitemap index content
- with open(scix_index, 'r', encoding='utf-8') as f:
+ with open(scix_index, "r", encoding="utf-8") as f:
scix_index_content = f.read()
-
+
# Verify XML structure for SciX index
- self.assertIn('', scix_index_content, "SciX index should have XML declaration")
- self.assertIn('', scix_index_content, "SciX index should have sitemapindex element")
- self.assertIn('', scix_index_content, "SciX index should close sitemapindex element")
-
+ self.assertIn(
+ '',
+ scix_index_content,
+ "SciX index should have XML declaration",
+ )
+ self.assertIn(
+ '',
+ scix_index_content,
+ "SciX index should have sitemapindex element",
+ )
+ self.assertIn(
+ "",
+ scix_index_content,
+ "SciX index should close sitemapindex element",
+ )
+
# Verify all sample sitemaps are referenced in SciX index (production URL structure)
for filename in sample_sitemaps:
- sitemap_url = f'https://scixplorer.org/sitemap/{filename}'
- self.assertIn(f'{html.escape(sitemap_url)}', scix_index_content, f"SciX index should reference {filename}")
- self.assertIn('', scix_index_content, "SciX index should contain lastmod elements")
-
+ sitemap_url = f"https://scixplorer.org/sitemap/{filename}"
+ self.assertIn(
+ f"{html.escape(sitemap_url)}",
+ scix_index_content,
+ f"SciX index should reference {filename}",
+ )
+ self.assertIn(
+ "",
+ scix_index_content,
+ "SciX index should contain lastmod elements",
+ )
+
# Verify sitemap count matches expected (3 bib files + 1 static file)
- ads_sitemap_count = ads_index_content.count('')
- scix_sitemap_count = scix_index_content.count('')
- self.assertEqual(ads_sitemap_count, 4, "ADS index should contain exactly 4 sitemap entries (3 bib + 1 static)")
- self.assertEqual(scix_sitemap_count, 4, "SciX index should contain exactly 4 sitemap entries (3 bib + 1 static)")
-
+ ads_sitemap_count = ads_index_content.count("")
+ scix_sitemap_count = scix_index_content.count("")
+ self.assertEqual(
+ ads_sitemap_count,
+ 4,
+ "ADS index should contain exactly 4 sitemap entries (3 bib + 1 static)",
+ )
+ self.assertEqual(
+ scix_sitemap_count,
+ 4,
+ "SciX index should contain exactly 4 sitemap entries (3 bib + 1 static)",
+ )
+
# Verify proper URL structure and no broken links (production structure)
- self.assertIn('https://ui.adsabs.harvard.edu/sitemap/', ads_index_content, "ADS index should contain ADS sitemap base URL")
- self.assertIn('https://scixplorer.org/sitemap/', scix_index_content, "SciX index should contain SciX sitemap base URL")
-
+ self.assertIn(
+ "https://ui.adsabs.harvard.edu/sitemap/",
+ ads_index_content,
+ "ADS index should contain ADS sitemap base URL",
+ )
+ self.assertIn(
+ "https://scixplorer.org/sitemap/",
+ scix_index_content,
+ "SciX index should contain SciX sitemap base URL",
+ )
+
# Verify index files are not empty and have reasonable content
- self.assertGreater(len(ads_index_content.strip()), 200, "ADS index should have substantial content")
- self.assertGreater(len(scix_index_content.strip()), 200, "SciX index should have substantial content")
-
+ self.assertGreater(
+ len(ads_index_content.strip()),
+ 200,
+ "ADS index should have substantial content",
+ )
+ self.assertGreater(
+ len(scix_index_content.strip()),
+ 200,
+ "SciX index should have substantial content",
+ )
+
# Verify no duplicate sitemap entries
- ads_locs = [line.strip() for line in ads_index_content.split('\n') if '' in line]
- scix_locs = [line.strip() for line in scix_index_content.split('\n') if '' in line]
- self.assertEqual(len(ads_locs), len(set(ads_locs)), "ADS index should not contain duplicate sitemap URLs")
- self.assertEqual(len(scix_locs), len(set(scix_locs)), "SciX index should not contain duplicate sitemap URLs")
-
+ ads_locs = [
+ line.strip()
+ for line in ads_index_content.split("\n")
+ if "" in line
+ ]
+ scix_locs = [
+ line.strip()
+ for line in scix_index_content.split("\n")
+ if "" in line
+ ]
+ self.assertEqual(
+ len(ads_locs),
+ len(set(ads_locs)),
+ "ADS index should not contain duplicate sitemap URLs",
+ )
+ self.assertEqual(
+ len(scix_locs),
+ len(set(scix_locs)),
+ "SciX index should not contain duplicate sitemap URLs",
+ )
+
# Test cleanup
with self.app.session_scope() as session:
- session.query(SitemapInfo).filter(SitemapInfo.bibcode.in_(test_bibcodes)).delete(synchronize_session=False)
- session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).delete(synchronize_session=False)
+ session.query(SitemapInfo).filter(
+ SitemapInfo.bibcode.in_(test_bibcodes)
+ ).delete(synchronize_session=False)
+ session.query(Records).filter(
+ Records.bibcode.in_(test_bibcodes)
+ ).delete(synchronize_session=False)
session.commit()
def test_force_update_workflow(self):
"""Test the complete force-update workflow with timestamp updates"""
- test_bibcode = '2023Test.....1....A'
-
+ test_bibcode = "2023Test.....1....A"
+
# Create initial record
with self.app.session_scope() as session:
record = Records(bibcode=test_bibcode, bib_data='{"title": "test"}')
session.add(record)
session.commit()
record_id = record.id
-
+
# Add to sitemap initially
- tasks.task_manage_sitemap([test_bibcode], 'add')
-
+ tasks.task_manage_sitemap([test_bibcode], "add")
+
# Verify initial creation
with self.app.session_scope() as session:
- sitemap_record = session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first()
+ sitemap_record = (
+ session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first()
+ )
self.assertIsNotNone(sitemap_record)
initial_timestamp = sitemap_record.bib_data_updated
-
+
# Wait a moment to ensure timestamp difference
time.sleep(0.01)
-
+
# Update the record's bib_data_updated timestamp
with self.app.session_scope() as session:
- session.query(Records).filter_by(id=record_id).update({
- 'bib_data_updated': datetime.now(timezone.utc)
- }, synchronize_session=False)
+ session.query(Records).filter_by(id=record_id).update(
+ {"bib_data_updated": datetime.now(timezone.utc)},
+ synchronize_session=False,
+ )
session.commit()
-
+
# Force update
- tasks.task_manage_sitemap([test_bibcode], 'force-update')
-
+ tasks.task_manage_sitemap([test_bibcode], "force-update")
+
# Verify timestamp was updated
with self.app.session_scope() as session:
- updated_record = session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first()
+ updated_record = (
+ session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first()
+ )
self.assertIsNotNone(updated_record)
self.assertNotEqual(updated_record.bib_data_updated, initial_timestamp)
def test_add_action_timestamp_logic(self):
"""Test that add action correctly handles timestamp comparisons"""
- test_bibcode = '2023Test.....2....A'
-
+ test_bibcode = "2023Test.....2....A"
+
# Create record
with self.app.session_scope() as session:
- record = Records(
- bibcode=test_bibcode,
- bib_data='{"title": "test"}'
- )
+ record = Records(bibcode=test_bibcode, bib_data='{"title": "test"}')
session.add(record)
session.commit()
initial_record_timestamp = record.bib_data_updated
-
+
# Add to sitemap
- tasks.task_manage_sitemap([test_bibcode], 'add')
-
+ tasks.task_manage_sitemap([test_bibcode], "add")
+
# Verify sitemap record was created
with self.app.session_scope() as session:
- sitemap_record = session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first()
+ sitemap_record = (
+ session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first()
+ )
self.assertIsNotNone(sitemap_record)
initial_sitemap_timestamp = sitemap_record.bib_data_updated
-
+
# Wait a moment to ensure timestamp difference
time.sleep(0.01)
-
+
# Update record timestamp
with self.app.session_scope() as session:
- session.query(Records).filter_by(bibcode=test_bibcode).update({
- 'bib_data_updated': datetime.now(timezone.utc)
- }, synchronize_session=False)
+ session.query(Records).filter_by(bibcode=test_bibcode).update(
+ {"bib_data_updated": datetime.now(timezone.utc)},
+ synchronize_session=False,
+ )
session.commit()
-
+
# Add again - should update timestamp
- tasks.task_manage_sitemap([test_bibcode], 'add')
-
+ tasks.task_manage_sitemap([test_bibcode], "add")
+
# Verify timestamp was updated
with self.app.session_scope() as session:
- updated_record = session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first()
+ updated_record = (
+ session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first()
+ )
self.assertIsNotNone(updated_record)
# Timestamp should be different from initial
- self.assertNotEqual(updated_record.bib_data_updated, initial_sitemap_timestamp)
+ self.assertNotEqual(
+ updated_record.bib_data_updated, initial_sitemap_timestamp
+ )
def test_max_records_per_sitemap_logic(self):
"""Test that sitemap files are created with proper record limits"""
# Use a small limit for testing
- original_limit = self.app.conf.get('MAX_RECORDS_PER_SITEMAP', 50000)
- self.app.conf['MAX_RECORDS_PER_SITEMAP'] = 3 # Small limit for testing
-
+ original_limit = self.app.conf.get("MAX_RECORDS_PER_SITEMAP", 50000)
+ self.app.conf["MAX_RECORDS_PER_SITEMAP"] = 3 # Small limit for testing
+
try:
# Create test records
- test_bibcodes = [f'2023Test.....{i}....A' for i in range(5)]
-
+ test_bibcodes = [f"2023Test.....{i}....A" for i in range(5)]
+
with self.app.session_scope() as session:
for bibcode in test_bibcodes:
record = Records(bibcode=bibcode, bib_data='{"title": "test"}')
session.add(record)
session.commit()
-
+
# Add all records
- tasks.task_manage_sitemap(test_bibcodes, 'add')
-
+ tasks.task_manage_sitemap(test_bibcodes, "add")
+
# Verify records are distributed across multiple files
with self.app.session_scope() as session:
- sitemap_records = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(test_bibcodes)
- ).all()
-
+ sitemap_records = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(test_bibcodes))
+ .all()
+ )
+
# Should have all 5 records
self.assertEqual(len(sitemap_records), 5)
-
+
# Should use at least 2 different filenames (3+2 distribution)
filenames = set(record.sitemap_filename for record in sitemap_records)
self.assertGreaterEqual(len(filenames), 2)
-
+
# Verify no file has more than 3 records
filename_counts = {}
for record in sitemap_records:
- filename_counts[record.sitemap_filename] = filename_counts.get(record.sitemap_filename, 0) + 1
-
+ filename_counts[record.sitemap_filename] = (
+ filename_counts.get(record.sitemap_filename, 0) + 1
+ )
+
for filename, count in filename_counts.items():
- self.assertLessEqual(count, 3, f"File {filename} has {count} records, exceeds limit of 3")
-
+ self.assertLessEqual(
+ count,
+ 3,
+ f"File {filename} has {count} records, exceeds limit of 3",
+ )
+
finally:
# Restore original limit
- self.app.conf['MAX_RECORDS_PER_SITEMAP'] = original_limit
+ self.app.conf["MAX_RECORDS_PER_SITEMAP"] = original_limit
def test_batch_processing_mixed_records(self):
"""Test batch processing with mix of new and existing records"""
# Create some existing records
- existing_bibcodes = ['2023Existing.1....A', '2023Existing.2....A']
- new_bibcodes = ['2023New......1....A', '2023New......2....A']
+ existing_bibcodes = ["2023Existing.1....A", "2023Existing.2....A"]
+ new_bibcodes = ["2023New......1....A", "2023New......2....A"]
all_bibcodes = existing_bibcodes + new_bibcodes
-
+
# Override MAX_RECORDS_PER_SITEMAP to force file distribution
- original_max_records = self.app.conf.get('MAX_RECORDS_PER_SITEMAP', 50000)
- self.app.conf['MAX_RECORDS_PER_SITEMAP'] = 2 # Force max 2 records per file
-
+ original_max_records = self.app.conf.get("MAX_RECORDS_PER_SITEMAP", 50000)
+ self.app.conf["MAX_RECORDS_PER_SITEMAP"] = 2 # Force max 2 records per file
+
try:
with self.app.session_scope() as session:
for bibcode in all_bibcodes:
record = Records(bibcode=bibcode, bib_data='{"title": "test"}')
session.add(record)
session.commit()
-
+
# Add existing records first
- tasks.task_manage_sitemap(existing_bibcodes, 'add')
-
+ tasks.task_manage_sitemap(existing_bibcodes, "add")
+
# Verify existing records are in sitemap
with self.app.session_scope() as session:
- existing_count = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(existing_bibcodes)
- ).count()
+ existing_count = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(existing_bibcodes))
+ .count()
+ )
self.assertEqual(existing_count, 2)
-
+
# Now add all records (mix of existing and new)
- tasks.task_manage_sitemap(all_bibcodes, 'add')
-
+ tasks.task_manage_sitemap(all_bibcodes, "add")
+
# Verify all records are now in sitemap
with self.app.session_scope() as session:
- total_count = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(all_bibcodes)
- ).count()
+ total_count = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(all_bibcodes))
+ .count()
+ )
self.assertEqual(total_count, 4)
-
+
# Verify no duplicates
- all_records = session.query(SitemapInfo).filter(
- SitemapInfo.bibcode.in_(all_bibcodes)
- ).all()
+ all_records = (
+ session.query(SitemapInfo)
+ .filter(SitemapInfo.bibcode.in_(all_bibcodes))
+ .all()
+ )
bibcodes_in_sitemap = [record.bibcode for record in all_records]
- self.assertEqual(len(bibcodes_in_sitemap), len(set(bibcodes_in_sitemap)))
-
+ self.assertEqual(
+ len(bibcodes_in_sitemap), len(set(bibcodes_in_sitemap))
+ )
+
# Verify records are distributed across multiple files with max 2 records per file
- filenames = session.query(SitemapInfo.sitemap_filename).filter(
- SitemapInfo.bibcode.in_(all_bibcodes)
- ).distinct().all()
+ filenames = (
+ session.query(SitemapInfo.sitemap_filename)
+ .filter(SitemapInfo.bibcode.in_(all_bibcodes))
+ .distinct()
+ .all()
+ )
filename_set = {f[0] for f in filenames}
-
+
# Should use at least 2 different filenames (2+2 distribution)
- self.assertGreaterEqual(len(filename_set), 2, "Should use at least 2 different sitemap files")
-
+ self.assertGreaterEqual(
+ len(filename_set),
+ 2,
+ "Should use at least 2 different sitemap files",
+ )
+
# Verify no file has more than 2 records
filename_counts = {}
for record in all_records:
- filename_counts[record.sitemap_filename] = filename_counts.get(record.sitemap_filename, 0) + 1
-
+ filename_counts[record.sitemap_filename] = (
+ filename_counts.get(record.sitemap_filename, 0) + 1
+ )
+
for filename, count in filename_counts.items():
- self.assertLessEqual(count, 2, f"File {filename} has {count} records, exceeds limit of 2")
-
+ self.assertLessEqual(
+ count,
+ 2,
+ f"File {filename} has {count} records, exceeds limit of 2",
+ )
+
# Verify we have exactly 2 files with 2 records each
- self.assertEqual(len(filename_counts), 2, "Should have exactly 2 sitemap files")
+ self.assertEqual(
+ len(filename_counts), 2, "Should have exactly 2 sitemap files"
+ )
for filename, count in filename_counts.items():
- self.assertEqual(count, 2, f"File {filename} should have exactly 2 records")
-
+ self.assertEqual(
+ count, 2, f"File {filename} should have exactly 2 records"
+ )
+
finally:
# Restore original limit
- self.app.conf['MAX_RECORDS_PER_SITEMAP'] = original_max_records
+ self.app.conf["MAX_RECORDS_PER_SITEMAP"] = original_max_records
def test_task_manage_sitemap_delete_table_action(self):
"""Test task_manage_sitemap delete-table action"""
-
+
# Create test data first
- test_bibcodes = ['2023DeleteTable..1..1A', '2023DeleteTable..1..2B', '2023DeleteTable..1..3C']
-
+ test_bibcodes = [
+ "2023DeleteTable..1..1A",
+ "2023DeleteTable..1..2B",
+ "2023DeleteTable..1..3C",
+ ]
+
with self.app.session_scope() as session:
for bibcode in test_bibcodes:
- record = Records(bibcode=bibcode, bib_data='{"title": "Delete Table Test"}')
+ record = Records(
+ bibcode=bibcode, bib_data='{"title": "Delete Table Test"}'
+ )
session.add(record)
session.flush()
-
+
# Create sitemap entry
sitemap_record = SitemapInfo()
sitemap_record.bibcode = bibcode
sitemap_record.record_id = record.id
- sitemap_record.sitemap_filename = 'sitemap_bib_delete_test.xml'
+ sitemap_record.sitemap_filename = "sitemap_bib_delete_test.xml"
sitemap_record.update_flag = False
session.add(sitemap_record)
-
+
session.commit()
-
+
# Verify records exist before deletion
initial_count = session.query(SitemapInfo).count()
- self.assertEqual(initial_count, 3, "Should have 3 sitemap records before delete-table")
-
+ self.assertEqual(
+ initial_count, 3, "Should have 3 sitemap records before delete-table"
+ )
+
with tempfile.TemporaryDirectory() as temp_dir:
- self.app.conf['SITEMAP_DIR'] = temp_dir
-
+ self.app.conf["SITEMAP_DIR"] = temp_dir
+
# Create some dummy sitemap files to test backup functionality
- ads_dir = os.path.join(temp_dir, 'ads')
- scix_dir = os.path.join(temp_dir, 'scix')
+ ads_dir = os.path.join(temp_dir, "ads")
+ scix_dir = os.path.join(temp_dir, "scix")
os.makedirs(ads_dir, exist_ok=True)
os.makedirs(scix_dir, exist_ok=True)
-
+
# Create test sitemap files
- test_files = ['sitemap_bib_1.xml', 'sitemap_bib_2.xml', 'sitemap_index.xml']
+ test_files = ["sitemap_bib_1.xml", "sitemap_bib_2.xml", "sitemap_index.xml"]
for filename in test_files:
- with open(os.path.join(ads_dir, filename), 'w') as f:
+ with open(os.path.join(ads_dir, filename), "w") as f:
f.write('')
- with open(os.path.join(scix_dir, filename), 'w') as f:
+ with open(os.path.join(scix_dir, filename), "w") as f:
f.write('')
-
+
# Mock the backup_sitemap_files method to verify it's called
- with patch.object(self.app, 'backup_sitemap_files') as mock_backup:
+ with patch.object(self.app, "backup_sitemap_files") as mock_backup:
# Execute delete-table action
- tasks.task_manage_sitemap(['dummy'], 'delete-table')
-
+ tasks.task_manage_sitemap(["dummy"], "delete-table")
+
# Verify backup_sitemap_files was called with correct directory
mock_backup.assert_called_once_with(temp_dir)
-
+
# Verify all sitemap records were deleted
with self.app.session_scope() as session:
final_count = session.query(SitemapInfo).count()
- self.assertEqual(final_count, 0, "All sitemap records should be deleted after delete-table action")
-
+ self.assertEqual(
+ final_count,
+ 0,
+ "All sitemap records should be deleted after delete-table action",
+ )
+
# Verify Records table is unchanged (delete-table should only affect SitemapInfo)
- records_count = session.query(Records).filter(
- Records.bibcode.in_(test_bibcodes)
- ).count()
- self.assertEqual(records_count, 3, "Records table should be unchanged by delete-table action")
+ records_count = (
+ session.query(Records)
+ .filter(Records.bibcode.in_(test_bibcodes))
+ .count()
+ )
+ self.assertEqual(
+ records_count,
+ 3,
+ "Records table should be unchanged by delete-table action",
+ )
def test_task_manage_sitemap_update_robots_action(self):
"""Test task_manage_sitemap update-robots action"""
-
+
with tempfile.TemporaryDirectory() as temp_dir:
- self.app.conf['SITEMAP_DIR'] = temp_dir
-
+ self.app.conf["SITEMAP_DIR"] = temp_dir
+
# Create site directories
- ads_dir = os.path.join(temp_dir, 'ads')
- scix_dir = os.path.join(temp_dir, 'scix')
+ ads_dir = os.path.join(temp_dir, "ads")
+ scix_dir = os.path.join(temp_dir, "scix")
os.makedirs(ads_dir, exist_ok=True)
os.makedirs(scix_dir, exist_ok=True)
-
+
# Verify no robots.txt files exist initially
- ads_robots = os.path.join(ads_dir, 'robots.txt')
- scix_robots = os.path.join(scix_dir, 'robots.txt')
- self.assertFalse(os.path.exists(ads_robots), "ADS robots.txt should not exist initially")
- self.assertFalse(os.path.exists(scix_robots), "SciX robots.txt should not exist initially")
-
+ ads_robots = os.path.join(ads_dir, "robots.txt")
+ scix_robots = os.path.join(scix_dir, "robots.txt")
+ self.assertFalse(
+ os.path.exists(ads_robots), "ADS robots.txt should not exist initially"
+ )
+ self.assertFalse(
+ os.path.exists(scix_robots),
+ "SciX robots.txt should not exist initially",
+ )
+
# Execute update-robots action
try:
- tasks.task_manage_sitemap(['dummy'], 'update-robots')
+ tasks.task_manage_sitemap(["dummy"], "update-robots")
success = True
except Exception as e:
success = False
error_msg = str(e)
-
+
# Verify action completed successfully
- self.assertTrue(success, "update-robots action should complete successfully")
-
+ self.assertTrue(
+ success, "update-robots action should complete successfully"
+ )
+
# Verify robots.txt files were created
- self.assertTrue(os.path.exists(ads_robots), "ADS robots.txt should be created")
- self.assertTrue(os.path.exists(scix_robots), "SciX robots.txt should be created")
-
+ self.assertTrue(
+ os.path.exists(ads_robots), "ADS robots.txt should be created"
+ )
+ self.assertTrue(
+ os.path.exists(scix_robots), "SciX robots.txt should be created"
+ )
+
# Verify robots.txt content is correct
- with open(ads_robots, 'r', encoding='utf-8') as f:
+ with open(ads_robots, "r", encoding="utf-8") as f:
ads_content = f.read()
- with open(scix_robots, 'r', encoding='utf-8') as f:
+ with open(scix_robots, "r", encoding="utf-8") as f:
scix_content = f.read()
-
+
# Check for expected content in ADS robots.txt
- self.assertIn('User-agent: *', ads_content, "ADS robots.txt should contain User-agent directive")
- self.assertIn('Sitemap: https://ui.adsabs.harvard.edu/sitemap/sitemap_index.xml', ads_content,
- "ADS robots.txt should contain correct sitemap URL")
- self.assertIn('Disallow:', ads_content, "ADS robots.txt should contain disallow directives")
-
+ self.assertIn(
+ "User-agent: *",
+ ads_content,
+ "ADS robots.txt should contain User-agent directive",
+ )
+ self.assertIn(
+ "Sitemap: https://ui.adsabs.harvard.edu/sitemap/sitemap_index.xml",
+ ads_content,
+ "ADS robots.txt should contain correct sitemap URL",
+ )
+ self.assertIn(
+ "Disallow:",
+ ads_content,
+ "ADS robots.txt should contain disallow directives",
+ )
+
# Check for expected content in SciX robots.txt
- self.assertIn('User-agent: *', scix_content, "SciX robots.txt should contain User-agent directive")
- self.assertIn('Sitemap: https://scixplorer.org/sitemap/sitemap_index.xml', scix_content,
- "SciX robots.txt should contain correct sitemap URL")
- self.assertIn('Disallow:', scix_content, "SciX robots.txt should contain disallow directives")
-
+ self.assertIn(
+ "User-agent: *",
+ scix_content,
+ "SciX robots.txt should contain User-agent directive",
+ )
+ self.assertIn(
+ "Sitemap: https://scixplorer.org/sitemap/sitemap_index.xml",
+ scix_content,
+ "SciX robots.txt should contain correct sitemap URL",
+ )
+ self.assertIn(
+ "Disallow:",
+ scix_content,
+ "SciX robots.txt should contain disallow directives",
+ )
+
# Verify files are not empty and properly formatted
- self.assertGreater(len(ads_content.strip()), 50, "ADS robots.txt should have substantial content")
- self.assertGreater(len(scix_content.strip()), 50, "SciX robots.txt should have substantial content")
- self.assertTrue(ads_content.endswith('\n'), "ADS robots.txt should end with newline")
- self.assertTrue(scix_content.endswith('\n'), "SciX robots.txt should end with newline")
+ self.assertGreater(
+ len(ads_content.strip()),
+ 50,
+ "ADS robots.txt should have substantial content",
+ )
+ self.assertGreater(
+ len(scix_content.strip()),
+ 50,
+ "SciX robots.txt should have substantial content",
+ )
+ self.assertTrue(
+ ads_content.endswith("\n"), "ADS robots.txt should end with newline"
+ )
+ self.assertTrue(
+ scix_content.endswith("\n"), "SciX robots.txt should end with newline"
+ )
def test_task_manage_sitemap_update_robots_action_error_handling(self):
"""Test task_manage_sitemap update-robots action error handling"""
-
+
# Test by mocking update_robots_files to return False (simulating failure)
- with patch('adsmp.tasks.update_robots_files') as mock_update_robots:
+ with patch("adsmp.tasks.update_robots_files") as mock_update_robots:
mock_update_robots.return_value = False # Simulate failure
-
+
# Execute update-robots action - should raise exception due to simulated failure
with self.assertRaises(Exception) as context:
- tasks.task_manage_sitemap(['dummy'], 'update-robots')
-
+ tasks.task_manage_sitemap(["dummy"], "update-robots")
+
# Verify the exception message indicates robots.txt update failure
- self.assertIn('Failed to update robots.txt files', str(context.exception))
-
+ self.assertIn("Failed to update robots.txt files", str(context.exception))
+
# Verify update_robots_files was called with force_update=True
mock_update_robots.assert_called_once_with(True)
-if __name__ == '__main__':
+if __name__ == "__main__":
unittest.main()
diff --git a/config.py b/config.py
index 3764fcc..b4934cb 100644
--- a/config.py
+++ b/config.py
@@ -31,7 +31,7 @@
# db connection to the Boost Pipeline database where boost factors are stored
# if not present, boost factors will not be included in SOLR documents
BOOST_SQLALCHEMY_URL = None #'postgresql://boost_user:boost_pass@localhost:5432/boost_db'
-
+IGNORED_BOOST_PAYLOAD_TYPES = ["boost"]
# Main Solr
# SOLR_URLS = ["http://localhost:9983/solr/collection1/update"]
@@ -142,3 +142,12 @@
"techreport": 3,
"misc": 8
}
+
+# Field names handed to scix_id.generate_scix_id() as `user_fields` by
+# app.generate_scix_id(); when this setting is empty/unset, None is passed.
+SCIX_ID_GENERATION_FIELDS = [
+ "author_norm",
+ "doi",
+ "abstract",
+ "title",
+ "doctype",
+ "pub_raw"
+]
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 58ae774..d5df902 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,8 @@
-adsputils==1.5.13
+adsputils==1.5.15
alembic==0.9.1
httplib2==0.19.0
portalocker==1.7.1
psycopg2==2.8.6
pyrabbit==1.1.0
-ScixPipelineUtils @ git+https://github.com/adsabs/SciXPipelineUtils.git@v0.5.2
+awscli==1.27.60
+ScixPipelineUtils @ git+https://github.com/adsabs/SciXPipelineUtils.git@v0.6.2
diff --git a/scripts/cleanup_sitemaps.py b/scripts/cleanup_sitemaps.py
new file mode 100644
index 0000000..5f7cf62
--- /dev/null
+++ b/scripts/cleanup_sitemaps.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+"""
+Wrapper script for sitemap cleanup operation with lockfile protection.
+
+This script prevents concurrent cleanup operations using a lockfile mechanism
+similar to reindex.py. The cleanup operation can take several hours scanning
+millions of sitemap records.
+"""
+
+import os
+import sys
+import pickle
+import time
+import re
+import json
+from subprocess import PIPE, Popen
+
+# Project root = parent of the directory that contains this script.
+proj_home = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+# Put the project root on sys.path before the imports below
+# (presumably so that `adsmp` resolves when run from scripts/ - confirm).
+if proj_home not in sys.path:
+ sys.path.append(proj_home)
+
+from adsputils import setup_logging, load_config
+from celery.result import AsyncResult
+from adsmp import tasks
+
+# Configuration and logger are both loaded relative to the project root.
+config = load_config(proj_home=proj_home)
+logger = setup_logging('sitemap_cleanup', proj_home=proj_home,
+ level=config.get('LOGGING_LEVEL', 'INFO'),
+ attach_stdout=config.get('LOG_STDOUT', False))
+
+# Absolute path of the lockfile; it lives directly in the project root.
+lockfile = os.path.abspath(proj_home + '/sitemap_cleanup.locked')
+
+
+def read_lockfile(lockfile):
+    """Load and return the pickled object stored at path `lockfile`.
+
+    NOTE: pickle.load can execute arbitrary code, so the lockfile must only
+    ever be written by trusted processes.
+    """
+    with open(lockfile, 'rb') as handle:
+        payload = pickle.load(handle)
+    return payload
+
+
+def write_lockfile(lockfile, data):
+    """Pickle `data` into `lockfile`, replacing any previous content atomically.
+
+    The payload is written to a temporary sibling file and then moved over
+    the lockfile with os.replace(). Opening the lockfile itself with 'wb'
+    would truncate it first, so another invocation reading it during the
+    lock check could see an empty or partial pickle.
+    """
+    tmp_path = '%s.%d.tmp' % (lockfile, os.getpid())
+    try:
+        with open(tmp_path, 'wb') as f:
+            pickle.dump(data, f)
+        os.replace(tmp_path, lockfile)
+    finally:
+        # Only present if the dump or the replace failed; never leave it behind
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
+
+
+def execute(command, **kwargs):
+    """Run `command` through the shell and wait for it to finish.
+
+    Extra keyword arguments are forwarded to Popen. Returns a tuple
+    (returncode, stdout_bytes, stderr_bytes).
+    """
+    proc = Popen(command, shell=True, stdout=PIPE, stderr=PIPE, **kwargs)
+    captured_out, captured_err = proc.communicate()
+    exit_status = proc.returncode
+    return (exit_status, captured_out, captured_err)
+
+
+def monitor_workflow(workflow_id, start_time):
+    """
+    Poll the Celery result for `workflow_id` until it is ready.
+
+    `start_time` is the epoch timestamp used to compute elapsed time for the
+    progress and "running too long" log messages.
+    Returns True if the workflow succeeded, raises Exception otherwise.
+    """
+    poll_seconds = 30  # pause between readiness checks
+    heartbeat_seconds = 300  # progress line every 5 minutes
+    overdue_seconds = 12 * 3600  # warn once after 12 hours
+
+    async_result = AsyncResult(workflow_id, app=tasks.app)
+    last_heartbeat = time.time()
+    overdue_reported = False
+
+    logger.info('Monitoring workflow %s...' % workflow_id)
+
+    while True:
+        if async_result.ready():
+            break
+        time.sleep(poll_seconds)
+        tick = time.time()
+        running_for = tick - start_time
+
+        if tick - last_heartbeat >= heartbeat_seconds:
+            logger.info('Workflow still running... (elapsed: %.1f minutes)' % (running_for / 60,))
+            last_heartbeat = tick
+
+        # The overdue warning is emitted only once; polling continues afterwards
+        if not overdue_reported and running_for > overdue_seconds:
+            logger.warning('Workflow has been running for over 12 hours (%.1f hours)!' % (running_for / 3600,))
+            logger.warning('This is unusually long - check for stuck tasks or performance issues')
+            overdue_reported = True
+
+    if not async_result.successful():
+        error_msg = 'Workflow failed: %s' % str(async_result.info)
+        logger.error(error_msg)
+        raise Exception(error_msg)
+
+    logger.info('Workflow completed successfully')
+    return True
+
+
+# Workflow ids are matched as lowercase hex UUIDs (8-4-4-4-12).
+_WORKFLOW_ID_RE = re.compile(
+    r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}')
+
+
+def _extract_workflow_id(stdout_str):
+    """Return the first workflow UUID announced in run.py's output, or None.
+
+    Only lines mentioning "cleanup workflow" (case-insensitive) are examined.
+    A line that parses as a JSON object is searched in its 'message' field;
+    anything else (plain text, non-object JSON, non-string message) is
+    searched as raw text.
+    """
+    for line in stdout_str.split('\n'):
+        # 'Sitemap cleanup workflow submitted:' also contains 'cleanup workflow',
+        # so this single case-insensitive test covers both markers.
+        if 'cleanup workflow' not in line.lower():
+            continue
+        try:
+            log_entry = json.loads(line)
+        except ValueError:  # json.JSONDecodeError is a ValueError subclass
+            text = line
+        else:
+            message = log_entry.get('message', '') if isinstance(log_entry, dict) else None
+            text = message if isinstance(message, str) else line
+        match = _WORKFLOW_ID_RE.search(text)
+        if match:
+            return match.group(0)
+    return None
+
+
+def run():
+    """Run the sitemap cleanup under lockfile protection.
+
+    Exits with status 1 if the lockfile already exists or if any step fails.
+    On failure the lockfile is kept, updated with the error details, so that
+    later invocations refuse to start until it is deleted manually. On
+    success (or when no workflow was started) the lockfile is removed.
+    """
+    # Check for existing lockfile
+    if os.path.exists(lockfile):
+        logger.error('Lockfile %s already exists; exiting! (if you want to proceed, delete the file with rm sitemap_cleanup.locked)', lockfile)
+        try:
+            for k, v in read_lockfile(lockfile).items():
+                logger.error('%s=%s', k, v)
+        except Exception:
+            # An empty or corrupt lockfile still counts as a lock
+            logger.exception('Could not read details from lockfile %s', lockfile)
+        sys.exit(1)
+
+    data = {}
+    try:
+        now = time.time()
+        data['start'] = now
+        data['operation'] = 'sitemap_cleanup'
+        write_lockfile(lockfile, data)
+        logger.info('Lockfile created')
+
+        logger.info('Starting sitemap cleanup operation')
+        logger.info('This may take several hours for large sitemap tables')
+
+        # Execute command and capture workflow ID from output
+        command = 'python3 run.py --cleanup-invalid-sitemaps'
+        retcode, stdout, stderr = execute(command, cwd=proj_home)
+
+        if retcode != 0:
+            # errors='replace': undecodable bytes must not mask the real failure
+            stderr_str = stderr.decode(errors='replace')
+            failure = '%s failed with retcode=%s\nstderr:\n%s' % (command, retcode, stderr_str)
+            data['error'] = failure
+            write_lockfile(lockfile, data)
+            logger.error('stderr=%s', stderr_str)
+            raise Exception(failure)
+
+        # Parse workflow ID from stdout (JSON log format, plain text fallback)
+        workflow_id = _extract_workflow_id(stdout.decode(errors='replace'))
+
+        if not workflow_id:
+            logger.info('No workflow was started')
+            logger.info('Operation completed in %s secs', time.time() - now)
+            os.remove(lockfile)
+            return
+
+        logger.info('Workflow ID: %s', workflow_id)
+        data['workflow_id'] = workflow_id
+        write_lockfile(lockfile, data)
+
+        # Monitor workflow until completion
+        monitor_workflow(workflow_id, now)
+
+        elapsed = time.time() - now
+        logger.info('Successfully finished sitemap cleanup in %s secs (%.1f minutes)',
+                    elapsed, elapsed / 60)
+
+        # Success - remove lockfile
+        logger.info('Deleting the lock; sitemap cleanup completed successfully!')
+        os.remove(lockfile)
+
+    except Exception as e:
+        logger.exception('Failed: we will keep the process permanently locked')
+        data['last-exception'] = str(e)
+        data['failed_at'] = time.time()
+        write_lockfile(lockfile, data)
+        sys.exit(1)
+
+
+# Script entry point: run the cleanup only when executed directly.
+if __name__ == '__main__':
+ run()
diff --git a/scripts/update_sitemaps_auto.py b/scripts/update_sitemaps_auto.py
new file mode 100644
index 0000000..10e0d0a
--- /dev/null
+++ b/scripts/update_sitemaps_auto.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python
+"""
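+Wrapper script for automatic sitemap updates with lockfile protection.
+
+Runs `run.py --update-sitemaps-auto --days-back N` and then monitors the
+submitted workflow until it finishes. A lockfile prevents concurrent runs;
+it is removed on success and kept (with error details) on failure.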
+"""
+
+import os
+import sys
+import pickle
+import time
+import argparse
+import re
+import json
+from subprocess import PIPE, Popen
+
+# Project root = parent of the directory that contains this script.
+proj_home = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+# Put the project root on sys.path before the imports below
+# (presumably so that `adsmp` resolves when run from scripts/ - confirm).
+if proj_home not in sys.path:
+ sys.path.append(proj_home)
+
+from adsputils import setup_logging, load_config
+from celery.result import AsyncResult
+from adsmp import tasks
+
+# Configuration and logger are both loaded relative to the project root.
+config = load_config(proj_home=proj_home)
+logger = setup_logging('sitemap_auto_update', proj_home=proj_home,
+ level=config.get('LOGGING_LEVEL', 'INFO'),
+ attach_stdout=config.get('LOG_STDOUT', False))
+
+# Absolute path of the lockfile; it lives directly in the project root.
+lockfile = os.path.abspath(proj_home + '/sitemap_auto_update.locked')
+
+
+def read_lockfile(lockfile):
+    """Load and return the pickled object stored at path `lockfile`.
+
+    NOTE: pickle.load can execute arbitrary code, so the lockfile must only
+    ever be written by trusted processes.
+    """
+    with open(lockfile, 'rb') as handle:
+        payload = pickle.load(handle)
+    return payload
+
+
+def write_lockfile(lockfile, data):
+    """Pickle `data` into `lockfile`, replacing any previous content atomically.
+
+    The payload is written to a temporary sibling file and then moved over
+    the lockfile with os.replace(). Opening the lockfile itself with 'wb'
+    would truncate it first, so another invocation reading it during the
+    lock check could see an empty or partial pickle.
+    """
+    tmp_path = '%s.%d.tmp' % (lockfile, os.getpid())
+    try:
+        with open(tmp_path, 'wb') as f:
+            pickle.dump(data, f)
+        os.replace(tmp_path, lockfile)
+    finally:
+        # Only present if the dump or the replace failed; never leave it behind
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
+
+
+def execute(command, **kwargs):
+    """Run `command` through the shell and wait for it to finish.
+
+    Extra keyword arguments are forwarded to Popen. Returns a tuple
+    (returncode, stdout_bytes, stderr_bytes).
+    """
+    proc = Popen(command, shell=True, stdout=PIPE, stderr=PIPE, **kwargs)
+    captured_out, captured_err = proc.communicate()
+    exit_status = proc.returncode
+    return (exit_status, captured_out, captured_err)
+
+
+def monitor_workflow(workflow_id, start_time):
+    """
+    Poll the Celery result for `workflow_id` until it is ready.
+
+    `start_time` is the epoch timestamp used to compute elapsed time for the
+    progress and "running too long" log messages.
+    Returns True if the workflow succeeded, raises Exception otherwise.
+    """
+    poll_seconds = 30  # pause between readiness checks
+    heartbeat_seconds = 300  # progress line every 5 minutes
+    overdue_seconds = 12 * 3600  # warn once after 12 hours
+
+    async_result = AsyncResult(workflow_id, app=tasks.app)
+    last_heartbeat = time.time()
+    overdue_reported = False
+
+    logger.info('Monitoring workflow %s...' % workflow_id)
+
+    while True:
+        if async_result.ready():
+            break
+        time.sleep(poll_seconds)
+        tick = time.time()
+        running_for = tick - start_time
+
+        if tick - last_heartbeat >= heartbeat_seconds:
+            logger.info('Workflow still running... (elapsed: %.1f minutes)' % (running_for / 60,))
+            last_heartbeat = tick
+
+        # The overdue warning is emitted only once; polling continues afterwards
+        if not overdue_reported and running_for > overdue_seconds:
+            logger.warning('Workflow has been running for over 12 hours (%.1f hours)!' % (running_for / 3600,))
+            logger.warning('This is unusually long - check for stuck tasks or performance issues')
+            overdue_reported = True
+
+    if not async_result.successful():
+        error_msg = 'Workflow failed: %s' % str(async_result.info)
+        logger.error(error_msg)
+        raise Exception(error_msg)
+
+    logger.info('Workflow completed successfully')
+    return True
+
+
+# Workflow ids are matched as lowercase hex UUIDs (8-4-4-4-12).
+_WORKFLOW_ID_RE = re.compile(
+    r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}')
+
+
+def _extract_workflow_id(stdout_str):
+    """Return the first workflow UUID announced in run.py's output, or None.
+
+    Only lines containing 'Submitted sitemap workflow:' are examined. A line
+    that parses as a JSON object is searched in its 'message' field; anything
+    else (plain text, non-object JSON, non-string message) is searched as
+    raw text.
+    """
+    for line in stdout_str.split('\n'):
+        if 'Submitted sitemap workflow:' not in line:
+            continue
+        try:
+            log_entry = json.loads(line)
+        except ValueError:  # json.JSONDecodeError is a ValueError subclass
+            text = line
+        else:
+            message = log_entry.get('message', '') if isinstance(log_entry, dict) else None
+            text = message if isinstance(message, str) else line
+        match = _WORKFLOW_ID_RE.search(text)
+        if match:
+            return match.group(0)
+    return None
+
+
+def run(days_back=1):
+    """Run the automatic sitemap update under lockfile protection.
+
+    Args:
+        days_back: number of days passed to run.py via --days-back.
+
+    Exits with status 1 if the lockfile already exists or if any step fails.
+    On failure the lockfile is kept, updated with the error details, so that
+    later invocations refuse to start until it is deleted manually. On
+    success (or when no workflow was started) the lockfile is removed.
+    """
+    # Check for existing lockfile
+    if os.path.exists(lockfile):
+        logger.error('Lockfile %s already exists; exiting! (if you want to proceed, delete the file)', lockfile)
+        try:
+            for k, v in read_lockfile(lockfile).items():
+                logger.error('%s=%s', k, v)
+        except Exception:
+            # An empty or corrupt lockfile still counts as a lock
+            logger.exception('Could not read details from lockfile %s', lockfile)
+        sys.exit(1)
+
+    data = {}
+    try:
+        now = time.time()
+        data['start'] = now
+        data['operation'] = 'sitemap_auto_update'
+        data['days_back'] = days_back
+        write_lockfile(lockfile, data)
+
+        logger.info('Starting automatic sitemap update (looking back %d days)', days_back)
+        logger.info('This may take several hours depending on the number of updated records')
+
+        # Execute command and capture workflow ID from output
+        command = 'python3 run.py --update-sitemaps-auto --days-back %d' % days_back
+        retcode, stdout, stderr = execute(command, cwd=proj_home)
+
+        if retcode != 0:
+            # errors='replace': undecodable bytes must not mask the real failure
+            stderr_str = stderr.decode(errors='replace')
+            failure = '%s failed with retcode=%s\nstderr:\n%s' % (command, retcode, stderr_str)
+            data['error'] = failure
+            write_lockfile(lockfile, data)
+            logger.error('stderr=%s', stderr_str)
+            raise Exception(failure)
+
+        # Parse workflow ID from stdout (JSON log format, plain text fallback)
+        workflow_id = _extract_workflow_id(stdout.decode(errors='replace'))
+
+        if not workflow_id:
+            logger.info('No workflow was started (no records to update)')
+            logger.info('Operation completed in %s secs', time.time() - now)
+            os.remove(lockfile)
+            return
+
+        logger.info('Workflow ID: %s', workflow_id)
+        data['workflow_id'] = workflow_id
+        write_lockfile(lockfile, data)
+
+        # Monitor workflow until completion
+        monitor_workflow(workflow_id, now)
+
+        elapsed = time.time() - now
+        logger.info('Successfully finished sitemap auto-update in %s secs (%.1f minutes)',
+                    elapsed, elapsed / 60)
+
+        # Success - remove lockfile
+        logger.info('Deleting the lock; sitemap auto-update completed successfully!')
+        os.remove(lockfile)
+
+    except Exception as e:
+        logger.exception('Failed: we will keep the process permanently locked')
+        data['last-exception'] = str(e)
+        data['failed_at'] = time.time()
+        write_lockfile(lockfile, data)
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    # Command-line entry point: --days-back is the only option and is
+    # forwarded unchanged to run().
+    cli = argparse.ArgumentParser(description='Automatic sitemap update with lockfile protection')
+    cli.add_argument(
+        '--days-back',
+        dest='days_back',
+        type=int,
+        default=1,
+        help='Number of days to look back for updated records (default: 1)',
+    )
+    options = cli.parse_args()
+
+    run(days_back=options.days_back)