diff --git a/adsmp/app.py b/adsmp/app.py index b3fbf9a..c2cd9fd 100644 --- a/adsmp/app.py +++ b/adsmp/app.py @@ -181,14 +181,6 @@ def update_storage(self, bibcode, type, payload): record.scix_id = "scix:" + str(self.generate_scix_id(record.bib_data)) out = record.toJSON() session.commit() - - # Send payload to Boost pipeline - if type != 'boost' and not self._config.get('TESTING_MODE', False): - try: - self.generate_boost_request_message(bibcode) - except Exception as e: - self.logger.exception('Error generating boost request message for bibcode %s: %s', bibcode, e) - return out except exc.IntegrityError: self.logger.exception('error in app.update_storage while updating database for bibcode {}, type {}'.format(bibcode, type)) @@ -196,7 +188,11 @@ def update_storage(self, bibcode, type, payload): raise def generate_scix_id(self, bib_data): - return scix_id.generate_scix_id(bib_data) + if self._config.get('SCIX_ID_GENERATION_FIELDS', None): + user_fields = self._config.get('SCIX_ID_GENERATION_FIELDS') + else: + user_fields = None + return scix_id.generate_scix_id(bib_data, user_fields = user_fields) def delete_by_bibcode(self, bibcode): with self.session_scope() as session: @@ -894,7 +890,7 @@ def should_include_in_sitemap(self, record): 3. 
If processed, processing isn't too stale Args: - record: Dictionary with record data including bib_data, status, timestamps + record: Dictionary with record data including has_bib_data, status, timestamps Returns: bool: True if record should be included in sitemap, False otherwise @@ -903,14 +899,14 @@ def should_include_in_sitemap(self, record): # Extract values from record dictionary bibcode = record.get('bibcode', None) - bib_data = record.get('bib_data', None) + has_bib_data = record.get('has_bib_data', None) bib_data_updated = record.get('bib_data_updated') solr_processed = record.get('solr_processed') status = record.get('status') # Must have bibliographic data - if not bib_data or not bibcode or (isinstance(bib_data, str) and not bib_data.strip()): - self.logger.debug('Excluding %s from sitemap: No bibcode or bib_data', bibcode) + if not has_bib_data or not bibcode: + self.logger.debug('Excluding %s from sitemap: No bibcode or has_bib_data is False', bibcode) return False # Exclude if SOLR failed or if record is being retried (previously failed) @@ -959,6 +955,8 @@ def get_records_bulk(self, bibcodes, session, load_only=None): record_data = {} for field in (load_only or ['id', 'bibcode', 'bib_data', 'bib_data_updated', 'solr_processed', 'status']): record_data[field] = getattr(record, field, None) + # Add has_bib_data boolean for sitemap checks + record_data['has_bib_data'] = bool(record_data.get('bib_data')) records_dict[record.bibcode] = record_data return records_dict diff --git a/adsmp/solr_updater.py b/adsmp/solr_updater.py index 808a961..eccfc5f 100644 --- a/adsmp/solr_updater.py +++ b/adsmp/solr_updater.py @@ -179,6 +179,7 @@ def extract_classifications_pipeline(db_classifications, solrdoc): """retrieve expected classifier collections classifications is a solr virtual field so it should never be set""" + db_classifications = [element for element in (db_classifications or []) if element] # remove empty strings; a None input becomes [] so the guard below still applies if db_classifications is None or
len(db_classifications) == 0: return {"database" : solrdoc.get("database", None)} diff --git a/adsmp/tasks.py b/adsmp/tasks.py index efdfc3e..53a6cef 100644 --- a/adsmp/tasks.py +++ b/adsmp/tasks.py @@ -42,10 +42,73 @@ Queue('update-sitemap-files', app.exchange, routing_key='update-sitemap-files'), Queue('update-scixid', app.exchange, routing_key='update-scixid'), Queue('boost-request', app.exchange, routing_key='boost-request'), + Queue('augment-record', app.exchange, routing_key='augment-record'), ) # ============================= TASKS ============================================= # +@app.task(queue='augment-record') +def task_augment_record(msg): + """Receives payload to augment the record. + + @param msg: protobuff that contains at minimum + - bibcode + - and specific payload + """ + # logger.debug('Updating record: %s', msg) + logger.debug('Updating record: %s', msg) + status = app.get_msg_status(msg) + logger.debug(f'Message status: {status}') + type = app.get_msg_type(msg) + logger.debug(f'Message type: {type}') + bibcodes = [] + + if status == 'active': + # save into a database + # passed msg may contain details on one bibcode or a list of bibcodes + if type == 'nonbib_records': + for m in msg.nonbib_records: + m = Msg(m, None, None) # m is a raw protobuf, TODO: return proper instance from .nonbib_records + bibcodes.append(m.bibcode) + record = app.update_storage(m.bibcode, 'nonbib_data', m.toJSON()) + if record: + logger.debug('Saved record from list: %s', record) + elif type == 'metrics_records': + for m in msg.metrics_records: + m = Msg(m, None, None) + bibcodes.append(m.bibcode) + record = app.update_storage(m.bibcode, 'metrics', m.toJSON(including_default_value_fields=True)) + if record: + logger.debug('Saved record from list: %s', record) + elif type == 'augment': + bibcodes.append(msg.bibcode) + record = app.update_storage(msg.bibcode, 'augment', + msg.toJSON(including_default_value_fields=True)) + if record: + logger.debug('Saved augment message: 
%s', msg) + elif type == 'classify': + bibcodes.append(msg.bibcode) + logger.debug(f'message to JSON: {msg.toJSON(including_default_value_fields=True)}') + payload = msg.toJSON(including_default_value_fields=True) + payload = payload['collections'] + record = app.update_storage(msg.bibcode, 'classify',payload) + if record: + logger.debug('Saved classify message: %s', msg) + else: + # here when record has a single bibcode + bibcodes.append(msg.bibcode) + record = app.update_storage(msg.bibcode, type, msg.toJSON()) + if record: + logger.debug('Saved record: %s', record) + if record: + # Send payload to Boost pipeline + if type != 'boost' and not app._config.get('TESTING_MODE', False): + try: + task_boost_request.apply_async(args=(msg.bibcode,)) + except Exception as e: + app.logger.exception('Error generating boost request message for bibcode %s: %s', msg.bibcode, e) + else: + logger.error('Received a message with unclear status: %s', msg) @app.task(queue='update-record') def task_update_record(msg): @@ -94,6 +157,7 @@ def task_update_record(msg): record = app.update_storage(m.bibcode, 'nonbib_data', m.toJSON()) if record: logger.debug('Saved record from list: %s', record) + _generate_boost_request(m, type) elif type == 'metrics_records': for m in msg.metrics_records: m = Msg(m, None, None) @@ -101,12 +165,14 @@ def task_update_record(msg): record = app.update_storage(m.bibcode, 'metrics', m.toJSON(including_default_value_fields=True)) if record: logger.debug('Saved record from list: %s', record) + _generate_boost_request(m, type) elif type == 'augment': bibcodes.append(msg.bibcode) record = app.update_storage(msg.bibcode, 'augment', msg.toJSON(including_default_value_fields=True)) if record: logger.debug('Saved augment message: %s', msg) + _generate_boost_request(msg, type) elif type == 'classify': bibcodes.append(msg.bibcode) logger.debug(f'message to JSON: {msg.toJSON(including_default_value_fields=True)}') @@ -115,21 +181,32 @@ def task_update_record(msg): record 
= app.update_storage(msg.bibcode, 'classify',payload) if record: logger.debug('Saved classify message: %s', msg) + _generate_boost_request(msg, type) else: # here when record has a single bibcode bibcodes.append(msg.bibcode) record = app.update_storage(msg.bibcode, type, msg.toJSON()) if record: logger.debug('Saved record: %s', record) + _generate_boost_request(msg, type) if type == 'metadata': # with new bib data we request to augment the affiliation # that pipeline will eventually respond with a msg to task_update_record logger.debug('requesting affilation augmentation for %s', msg.bibcode) app.request_aff_augment(msg.bibcode) - else: logger.error('Received a message with unclear status: %s', msg) +def _generate_boost_request(msg, msg_type): + # Send payload to Boost pipeline + if msg_type not in app._config.get('IGNORED_BOOST_PAYLOAD_TYPES', ['boost']) and not app._config.get('TESTING_MODE', False): + try: + task_boost_request.apply_async(args=(msg.bibcode,)) + except Exception as e: + app.logger.exception('Error generating boost request message for bibcode %s: %s', msg.bibcode, e) + else: + app.logger.debug("Message for bibcode %s has type: %s, Skipping.", msg.bibcode, msg_type) + @app.task(queue='update-scixid') def task_update_scixid(bibcodes, flag): """Receives bibcodes to add scix id to the record. 
@@ -428,7 +505,7 @@ def task_cleanup_invalid_sitemaps(): session.query( SitemapInfo.id, SitemapInfo.bibcode, - Records.bib_data, + (Records.bib_data.isnot(None)).label('has_bib_data'), Records.bib_data_updated, Records.solr_processed, Records.status @@ -457,7 +534,7 @@ def task_cleanup_invalid_sitemaps(): # Convert to dict for should_include_in_sitemap function record_dict = { 'bibcode': record_data.bibcode, - 'bib_data': record_data.bib_data, + 'has_bib_data': record_data.has_bib_data, 'bib_data_updated': record_data.bib_data_updated, 'solr_processed': record_data.solr_processed, 'status': record_data.status @@ -626,7 +703,7 @@ def task_manage_sitemap(bibcodes, action): # Apply SOLR filtering - convert record to dict for should_include_in_sitemap record_dict = { 'bibcode': record.bibcode, - 'bib_data': record.bib_data, + 'has_bib_data': bool(record.bib_data), 'bib_data_updated': record.bib_data_updated, 'solr_processed': record.solr_processed, 'status': record.status @@ -688,7 +765,6 @@ def task_manage_sitemap(bibcodes, action): logger.info('Bootstrap completed: %d successful, %d failed out of %d total records', successful_count, failed_count, processed) logger.info('All records marked with update_flag=True') - logger.info('Run --update-sitemap-files to generate sitemap XML files') return elif action in ['add', 'force-update']: diff --git a/adsmp/tests/test_app.py b/adsmp/tests/test_app.py index 9c4bb21..178982d 100644 --- a/adsmp/tests/test_app.py +++ b/adsmp/tests/test_app.py @@ -1,56 +1,59 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import mock -from mock import patch -import unittest -import os -import sys import json +import os import re +import sys import tempfile import time +import unittest +from datetime import timedelta import adsputils -from adsmp import app, models -from adsmp.models import Base, MetricsBase, Records, SitemapInfo, ChangeLog -from adsputils import get_date +import mock import testing.postgresql +from adsputils import get_date 
+from mock import patch from sqlalchemy.exc import IntegrityError, SQLAlchemyError -from datetime import timedelta + +from adsmp import app, models +from adsmp.models import Base, ChangeLog, MetricsBase, Records, SitemapInfo class TestAdsOrcidCelery(unittest.TestCase): """ Tests the appliction's methods """ - + @classmethod def setUpClass(cls): - cls.postgresql = \ - testing.postgresql.Postgresql(host='127.0.0.1', port=15678, user='postgres', - database='test') + cls.postgresql = testing.postgresql.Postgresql( + host="127.0.0.1", port=15678, user="postgres", database="test" + ) @classmethod def tearDownClass(cls): cls.postgresql.stop() - + def setUp(self): unittest.TestCase.setUp(self) - - proj_home = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')) - with mock.patch.dict('os.environ', {'ADS_API_TOKEN': 'fixme'}): - self.app = app.ADSMasterPipelineCelery('test', local_config=\ - { - 'SQLALCHEMY_URL': 'sqlite:///', - 'METRICS_SQLALCHEMY_URL': 'postgresql://postgres@127.0.0.1:15678/test', - 'SQLALCHEMY_ECHO': False, - 'PROJ_HOME' : proj_home, - 'TEST_DIR' : os.path.join(proj_home, 'adsmp/tests'), - }) + + proj_home = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) + with mock.patch.dict("os.environ", {"ADS_API_TOKEN": "fixme"}): + self.app = app.ADSMasterPipelineCelery( + "test", + local_config={ + "SQLALCHEMY_URL": "sqlite:///", + "METRICS_SQLALCHEMY_URL": "postgresql://postgres@127.0.0.1:15678/test", + "SQLALCHEMY_ECHO": False, + "PROJ_HOME": proj_home, + "TEST_DIR": os.path.join(proj_home, "adsmp/tests"), + }, + ) Base.metadata.bind = self.app._session.get_bind() Base.metadata.create_all() - + MetricsBase.metadata.bind = self.app._metrics_engine MetricsBase.metadata.create_all() @@ -61,228 +64,306 @@ def tearDown(self): self.app.close_app() def test_app(self): - assert self.app._config.get('SQLALCHEMY_URL') == 'sqlite:///' - assert self.app.conf.get('SQLALCHEMY_URL') == 'sqlite:///' + assert self.app._config.get("SQLALCHEMY_URL") 
== "sqlite:///" + assert self.app.conf.get("SQLALCHEMY_URL") == "sqlite:///" def test_mark_processed(self): - self.app.mark_processed(['abc'], 'solr', checksums=['jkl'], status='success') - r = self.app.get_record('abc') + self.app.mark_processed(["abc"], "solr", checksums=["jkl"], status="success") + r = self.app.get_record("abc") self.assertEqual(r, None) - - self.app.update_storage('abc', 'bib_data', {'bibcode': 'abc', 'hey': 1}) - self.app.mark_processed(['abc'], 'solr', checksums=['jkl'], status='success') - r = self.app.get_record('abc') - - self.assertTrue(r['solr_processed']) - self.assertTrue(r['status']) - - self.app.mark_processed(['abc'], 'solr', checksums=['jkl'], status='solr-failed') - r = self.app.get_record('abc') - self.assertTrue(r['solr_processed']) - self.assertTrue(r['processed']) - self.assertEqual(r['status'], 'solr-failed') + + self.app.update_storage("abc", "bib_data", {"bibcode": "abc", "hey": 1}) + self.app.mark_processed(["abc"], "solr", checksums=["jkl"], status="success") + r = self.app.get_record("abc") + + self.assertTrue(r["solr_processed"]) + self.assertTrue(r["status"]) + + self.app.mark_processed( + ["abc"], "solr", checksums=["jkl"], status="solr-failed" + ) + r = self.app.get_record("abc") + self.assertTrue(r["solr_processed"]) + self.assertTrue(r["processed"]) + self.assertEqual(r["status"], "solr-failed") def test_index_solr(self): - self.app.update_storage('abc', 'bib_data', {'bibcode': 'abc', 'hey': 1, 'test': 'test'}) - self.app.update_storage('foo', 'bib_data', {'bibcode': 'foo', 'hey': 1}) - - with mock.patch('adsmp.solr_updater.update_solr', return_value=[200]): - self.app.index_solr([{'bibcode': 'abc'}, - {'bibcode': 'foo'}], - ['checksum1', 'checksum2'], - ['http://solr1']) + self.app.update_storage( + "abc", + "bib_data", + { + "bibcode": "abc", + "hey": 1, + "test": "test", + "title": "Test record abc solr", + }, + ) + self.app.update_storage( + "foo", + "bib_data", + {"bibcode": "foo", "hey": 1, "title": "Test 
record foo solr"}, + ) + + with mock.patch("adsmp.solr_updater.update_solr", return_value=[200]): + self.app.index_solr( + [{"bibcode": "abc"}, {"bibcode": "foo"}], + ["checksum1", "checksum2"], + ["http://solr1"], + ) with self.app.session_scope() as session: - for x in ['abc', 'foo']: + for x in ["abc", "foo"]: r = session.query(models.Records).filter_by(bibcode=x).first() self.assertTrue(r.processed) self.assertFalse(r.metrics_processed) self.assertTrue(r.solr_processed) - + # pretend group failure and then success when records sent individually - with mock.patch('adsmp.solr_updater.update_solr') as us, \ - mock.patch.object(self.app, 'mark_processed') as mp: + with mock.patch("adsmp.solr_updater.update_solr") as us, mock.patch.object( + self.app, "mark_processed" + ) as mp: us.side_effect = [[503], [200], [200]] - self.app.index_solr([{'bibcode': 'abc'}, - {'bibcode': 'foo'}], - ['checksum1', 'checksum2'], - ['http://solr1']) + self.app.index_solr( + [{"bibcode": "abc"}, {"bibcode": "foo"}], + ["checksum1", "checksum2"], + ["http://solr1"], + ) # self.assertTrue(len(failed) == 0) x = str(mp.call_args_list[0]) - self.assertTrue('abc' in x) - self.assertTrue('success' in x) - self.assertTrue('solr' in x) + self.assertTrue("abc" in x) + self.assertTrue("success" in x) + self.assertTrue("solr" in x) self.assertEqual(us.call_count, 3) x = str(mp.call_args_list[1]) - self.assertTrue('foo' in x) - self.assertTrue('success' in x) - self.assertTrue('solr' in x) + self.assertTrue("foo" in x) + self.assertTrue("success" in x) + self.assertTrue("solr" in x) # pretend failure and success without body # update_solr should try to send two records together and then # each record by itself twice: once as is and once without fulltext - with mock.patch('adsmp.solr_updater.update_solr') as us, \ - mock.patch.object(self.app, 'mark_processed') as mp: - us.side_effect = [[503, 503], Exception('body failed'), 200, Exception('body failed'), 200] - self.app.index_solr([{'bibcode': 
'abc', 'body': 'BAD BODY'}, - {'bibcode': 'foo', 'body': 'BAD BODY'}], - ['checksum1', 'checksum2'], - ['http://solr1']) + with mock.patch("adsmp.solr_updater.update_solr") as us, mock.patch.object( + self.app, "mark_processed" + ) as mp: + us.side_effect = [ + [503, 503], + Exception("body failed"), + 200, + Exception("body failed"), + 200, + ] + self.app.index_solr( + [ + {"bibcode": "abc", "body": "BAD BODY"}, + {"bibcode": "foo", "body": "BAD BODY"}, + ], + ["checksum1", "checksum2"], + ["http://solr1"], + ) self.assertEqual(us.call_count, 5) # self.assertTrue(len(failed) == 0) self.assertEqual(mp.call_count, 2) x = str(us.call_args_list[-2]) - self.assertTrue('http://solr1' in x) - self.assertTrue('foo' in x) - self.assertTrue('body' in x) - self.assertTrue('BAD BODY' in x) + self.assertTrue("http://solr1" in x) + self.assertTrue("foo" in x) + self.assertTrue("body" in x) + self.assertTrue("BAD BODY" in x) x = str(us.call_args_list[-1]) - self.assertTrue('http://solr1' in x) - self.assertTrue('foo' in x) + self.assertTrue("http://solr1" in x) + self.assertTrue("foo" in x) # pretend failure and then lots more failure # update_solr should try to send two records together and then # each record by itself twice: once as is and once without fulltext - with mock.patch('adsmp.solr_updater.update_solr') as us: - us.side_effect = [[503, 503], - Exception('body failed'), Exception('body failed'), - Exception('body failed'), Exception('body failed')] - self.app.index_solr([{'bibcode': 'abc', 'body': 'bad body'}, - {'bibcode': 'foo', 'body': 'bad body'}], - ['checksum1', 'checksum2'], - ['http://solr1']) + with mock.patch("adsmp.solr_updater.update_solr") as us: + us.side_effect = [ + [503, 503], + Exception("body failed"), + Exception("body failed"), + Exception("body failed"), + Exception("body failed"), + ] + self.app.index_solr( + [ + {"bibcode": "abc", "body": "bad body"}, + {"bibcode": "foo", "body": "bad body"}, + ], + ["checksum1", "checksum2"], + 
["http://solr1"], + ) self.assertEqual(us.call_count, 5) # pretend failure and and then failure for a mix of reasons - with mock.patch('adsmp.solr_updater.update_solr') as us: - us.side_effect = [[503, 503], Exception('body failed'), Exception('failed'), Exception('failed')] - self.app.index_solr([{'bibcode': 'abc', 'body': 'bad body'}, - {'bibcode': 'foo', 'body': 'good body'}], - ['checksum1', 'checksum2'], - ['http://solr1']) + with mock.patch("adsmp.solr_updater.update_solr") as us: + us.side_effect = [ + [503, 503], + Exception("body failed"), + Exception("failed"), + Exception("failed"), + ] + self.app.index_solr( + [ + {"bibcode": "abc", "body": "bad body"}, + {"bibcode": "foo", "body": "good body"}, + ], + ["checksum1", "checksum2"], + ["http://solr1"], + ) self.assertEqual(us.call_count, 4) if sys.version_info > (3,): call_dict = "{'bibcode': 'foo', 'body': 'good body'}" else: call_dict = "{'body': 'good body', 'bibcode': 'foo'}" - self.assertEqual(str(us.call_args_list[-1]), "call([%s], ['http://solr1'], commit=False, ignore_errors=False)" % call_dict) + self.assertEqual( + str(us.call_args_list[-1]), + "call([%s], ['http://solr1'], commit=False, ignore_errors=False)" + % call_dict, + ) # pretend failure and and then a mix of failure and success - with mock.patch('adsmp.solr_updater.update_solr') as us, \ - mock.patch.object(self.app, 'mark_processed') as mp: - us.side_effect = [[503, 503], Exception('body failed'), [200]] - self.app.index_solr([{'bibcode': 'abc', 'body': 'bad body'}, - {'bibcode': 'foo', 'body': 'good body'}], - ['checksum1', 'checksum2'], - ['http://solr1']) + with mock.patch("adsmp.solr_updater.update_solr") as us, mock.patch.object( + self.app, "mark_processed" + ) as mp: + us.side_effect = [[503, 503], Exception("body failed"), [200]] + self.app.index_solr( + [ + {"bibcode": "abc", "body": "bad body"}, + {"bibcode": "foo", "body": "good body"}, + ], + ["checksum1", "checksum2"], + ["http://solr1"], + ) self.assertEqual(us.call_count, 
4) # self.assertTrue(len(failed) == 1) self.assertEqual(us.call_count, 4) self.assertEqual(mp.call_count, 2) x = str(us.call_args_list[-1]) - self.assertTrue('foo' in x) - self.assertTrue('good body' in x) - self.assertTrue('http://solr1' in x) + self.assertTrue("foo" in x) + self.assertTrue("good body" in x) + self.assertTrue("http://solr1" in x) def test_update_metrics(self): - self.app.update_storage('abc', 'metrics', { - 'author_num': 1, - 'bibcode': 'abc', - }) - self.app.update_storage('foo', 'metrics', { - 'bibcode': 'foo', - 'citation_num': 6, - 'author_num': 3, - }) - - batch_metrics = [self.app.get_record('abc')['metrics'], self.app.get_record('foo')['metrics']] - batch_checksum = ['checksum1', 'checksum2'] + self.app.update_storage( + "abc", + "metrics", + { + "author_num": 1, + "bibcode": "abc", + }, + ) + self.app.update_storage( + "foo", + "metrics", + { + "bibcode": "foo", + "citation_num": 6, + "author_num": 3, + }, + ) + + batch_metrics = [ + self.app.get_record("abc")["metrics"], + self.app.get_record("foo")["metrics"], + ] + batch_checksum = ["checksum1", "checksum2"] self.app.index_metrics(batch_metrics, batch_checksum) - - for x in ['abc', 'foo']: + + for x in ["abc", "foo"]: r = self.app.get_record(x) - self.assertTrue(r['processed']) - self.assertTrue(r['metrics_processed']) - self.assertFalse(r['solr_processed']) - + self.assertTrue(r["processed"]) + self.assertTrue(r["metrics_processed"]) + self.assertFalse(r["solr_processed"]) + def test_delete_metrics(self): """Makes sure we can delete a metrics record by bibcode""" - self.app.update_storage('abc', 'metrics', { - 'author_num': 1, - 'bibcode': 'abc', - }) - r = self.app.get_record('abc') - self.app.index_metrics([r], ['checksum']) - m = self.app.get_metrics('abc') - self.assertTrue(m, 'intialized metrics data') - self.app.metrics_delete_by_bibcode('abc') - m = self.app.get_metrics('abc') - self.assertFalse(m, 'deleted metrics data') - + self.app.update_storage( + "abc", + "metrics", + { + 
"author_num": 1, + "bibcode": "abc", + }, + ) + r = self.app.get_record("abc") + self.app.index_metrics([r], ["checksum"]) + m = self.app.get_metrics("abc") + self.assertTrue(m, "intialized metrics data") + self.app.metrics_delete_by_bibcode("abc") + m = self.app.get_metrics("abc") + self.assertFalse(m, "deleted metrics data") + def test_update_records(self): """Makes sure we can write recs into the storage.""" now = adsputils.get_date() last_time = adsputils.get_date() - for k in ['bib_data', 'nonbib_data', 'orcid_claims']: - self.app.update_storage('abc', k, {'foo': 'bar', 'hey': 1}) + for k in ["bib_data", "nonbib_data", "orcid_claims"]: + self.app.update_storage( + "abc", k, {"foo": "bar", "hey": 1, "title": "Test record abc"} + ) with self.app.session_scope() as session: - r = session.query(models.Records).filter_by(bibcode='abc').first() + r = session.query(models.Records).filter_by(bibcode="abc").first() self.assertTrue(r.id == 1) - self.assertTrue(r.scix_id == 'scix:0RW9-X19B-XHYY') + self.assertEqual(r.scix_id, "scix:50RZ-VNK5-03S7") j = r.toJSON() - self.assertEqual(j[k], {'foo': 'bar', 'hey': 1}) - t = j[k + '_updated'] + self.assertEqual( + j[k], {"foo": "bar", "hey": 1, "title": "Test record abc"} + ) + t = j[k + "_updated"] self.assertTrue(now < t) - self.assertTrue(last_time < j['updated']) - last_time = j['updated'] - - self.app.update_storage('abc', 'fulltext', {'body': 'foo bar'}) + self.assertTrue(last_time < j["updated"]) + last_time = j["updated"] + + self.app.update_storage("abc", "fulltext", {"body": "foo bar"}) with self.app.session_scope() as session: - r = session.query(models.Records).filter_by(bibcode='abc').first() + r = session.query(models.Records).filter_by(bibcode="abc").first() self.assertTrue(r.id == 1) - self.assertTrue(r.scix_id == 'scix:0RW9-X19B-XHYY') + self.assertEqual(r.scix_id, "scix:50RZ-VNK5-03S7") j = r.toJSON() - self.assertEqual(j['fulltext'], {'body': 'foo bar'}) - t = j['fulltext_updated'] + 
self.assertEqual(j["fulltext"], {"body": "foo bar"}) + t = j["fulltext_updated"] self.assertTrue(now < t) - - r = self.app.get_record('abc') - self.assertEqual(r['id'], 1) - self.assertEqual(r['scix_id'],'scix:0RW9-X19B-XHYY') - self.assertEqual(r['processed'], None) - - r = self.app.get_record(['abc']) - self.assertEqual(r[0]['id'], 1) - self.assertEqual(r[0]['scix_id'],'scix:0RW9-X19B-XHYY') - self.assertEqual(r[0]['processed'], None) - - r = self.app.get_record('abc', load_only=['id']) - self.assertEqual(r['id'], 1) - self.assertFalse('processed' in r) + + r = self.app.get_record("abc") + self.assertEqual(r["id"], 1) + self.assertEqual(r["scix_id"], "scix:50RZ-VNK5-03S7") + self.assertEqual(r["processed"], None) + + r = self.app.get_record(["abc"]) + self.assertEqual(r[0]["id"], 1) + self.assertEqual(r[0]["scix_id"], "scix:50RZ-VNK5-03S7") + self.assertEqual(r[0]["processed"], None) + + r = self.app.get_record("abc", load_only=["id"]) + self.assertEqual(r["id"], 1) + self.assertFalse("processed" in r) with self.assertRaises(ValueError) as e: - self.app.mark_processed(['abc'], 'foobar') - self.assertTrue('foobar' in e.exception) - + self.app.mark_processed(["abc"], "foobar") + self.assertTrue("foobar" in e.exception) + # now delete it - self.app.delete_by_bibcode('abc') - r = self.app.get_record('abc') + self.app.delete_by_bibcode("abc") + r = self.app.get_record("abc") self.assertTrue(r is None) with self.app.session_scope() as session: - r = session.query(models.ChangeLog).filter_by(key='bibcode:abc').first() - self.assertTrue(r.key, 'abc') + r = session.query(models.ChangeLog).filter_by(key="bibcode:abc").first() + self.assertTrue(r.key, "abc") def test_index_metrics_database_failure(self): """ - verify handles failure from database - send one bibcode, verify there are two commits + verify handles failure from database + send one bibcode, verify there are two commits """ - self.app.update_storage('abc', 'metrics', { - 'author_num': 1, - 'bibcode': 'abc', - }) 
+ self.app.update_storage( + "abc", + "metrics", + { + "author_num": 1, + "bibcode": "abc", + }, + ) trans = mock.Mock() - trans.commit.side_effect = SQLAlchemyError('test') + trans.commit.side_effect = SQLAlchemyError("test") m = mock.Mock() m.begin_nested.return_value = trans m.__exit__ = mock.Mock() @@ -290,559 +371,689 @@ def test_index_metrics_database_failure(self): m.__enter__.return_value = mock.Mock() m.__enter__.return_value.begin_nested.return_value = trans # init database so timestamps and checksum can be updated - with mock.patch('adsmp.app.ADSMasterPipelineCelery.metrics_session_scope', return_value=m) as p: - metrics_payload = {'bibcode': 'abc', 'author_num': 1} - checksum = 'checksum' + with mock.patch( + "adsmp.app.ADSMasterPipelineCelery.metrics_session_scope", return_value=m + ) as p: + metrics_payload = {"bibcode": "abc", "author_num": 1} + checksum = "checksum" self.app.index_metrics([metrics_payload], [checksum]) self.assertEqual(trans.commit.call_count, 2) def test_index_datalinks_success(self): """verify passed data sent to resolver service - verify handles success from service - verify records table updated with processed, status and checksum + verify handles success from service + verify records table updated with processed, status and checksum """ m = mock.Mock() m.status_code = 200 # init database so timestamps and checksum can be updated - nonbib_data = {'data_links_rows': [{'baz': 0}]} - self.app.update_storage('linkstest', 'nonbib_data', nonbib_data) - with mock.patch('requests.put', return_value=m) as p: - datalinks_payload = {u'bibcode': u'linkstest', u'data_links_rows': [{u'baz': 0}]} - checksum = 'thechecksum' + nonbib_data = {"data_links_rows": [{"baz": 0}]} + self.app.update_storage("linkstest", "nonbib_data", nonbib_data) + with mock.patch("requests.put", return_value=m) as p: + datalinks_payload = { + "bibcode": "linkstest", + "data_links_rows": [{"baz": 0}], + } + checksum = "thechecksum" 
self.app.index_datalinks([datalinks_payload], [checksum]) - p.assert_called_with('http://localhost:8080/update', - data=json.dumps([{'bibcode': 'linkstest', 'data_links_rows': [{'baz': 0}]}]), - headers={'Authorization': 'Bearer fixme'}) + p.assert_called_with( + "http://localhost:8080/update", + data=json.dumps( + [{"bibcode": "linkstest", "data_links_rows": [{"baz": 0}]}] + ), + headers={"Authorization": "Bearer fixme"}, + ) self.assertEqual(p.call_count, 1) # verify database updated - rec = self.app.get_record(bibcode='linkstest') - self.assertEqual(rec['datalinks_checksum'], 'thechecksum') - self.assertEqual(rec['solr_checksum'], None) - self.assertEqual(rec['metrics_checksum'], None) - self.assertEqual(rec['status'], 'success') - self.assertTrue(rec['datalinks_processed']) + rec = self.app.get_record(bibcode="linkstest") + self.assertEqual(rec["datalinks_checksum"], "thechecksum") + self.assertEqual(rec["solr_checksum"], None) + self.assertEqual(rec["metrics_checksum"], None) + self.assertEqual(rec["status"], "success") + self.assertTrue(rec["datalinks_processed"]) def test_index_datalinks_service_failure(self): """ - verify handles failure from service + verify handles failure from service """ m = mock.Mock() m.status_code = 500 # init database so timestamps and checksum can be updated - nonbib_data = {'data_links_rows': [{'baz': 0}]} - self.app.update_storage('linkstest', 'nonbib_data', nonbib_data) - with mock.patch('requests.put', return_value=m) as p: - datalinks_payload = {u'bibcode': u'linkstest', u'data_links_rows': [{u'baz': 0}]} - checksum = 'thechecksum' + nonbib_data = {"data_links_rows": [{"baz": 0}]} + self.app.update_storage("linkstest", "nonbib_data", nonbib_data) + with mock.patch("requests.put", return_value=m) as p: + datalinks_payload = { + "bibcode": "linkstest", + "data_links_rows": [{"baz": 0}], + } + checksum = "thechecksum" self.app.index_datalinks([datalinks_payload], [checksum]) - p.assert_called_with('http://localhost:8080/update', 
- data=json.dumps([{'bibcode': 'linkstest', 'data_links_rows': [{'baz': 0}]}]), - headers={'Authorization': 'Bearer fixme'}) + p.assert_called_with( + "http://localhost:8080/update", + data=json.dumps( + [{"bibcode": "linkstest", "data_links_rows": [{"baz": 0}]}] + ), + headers={"Authorization": "Bearer fixme"}, + ) - rec = self.app.get_record(bibcode='linkstest') + rec = self.app.get_record(bibcode="linkstest") self.assertEqual(p.call_count, 2) - self.assertEqual(rec['datalinks_checksum'], None) - self.assertEqual(rec['solr_checksum'], None) - self.assertEqual(rec['metrics_checksum'], None) - self.assertEqual(rec['status'], 'links-failed') - self.assertTrue(rec['datalinks_processed']) + self.assertEqual(rec["datalinks_checksum"], None) + self.assertEqual(rec["solr_checksum"], None) + self.assertEqual(rec["metrics_checksum"], None) + self.assertEqual(rec["status"], "links-failed") + self.assertTrue(rec["datalinks_processed"]) def test_index_datalinks_service_only_batch_failure(self): # init database so timestamps and checksum can be updated - nonbib_data = {'data_links_rows': [{'baz': 0}]} - self.app.update_storage('linkstest', 'nonbib_data', nonbib_data) - with mock.patch('requests.put') as p: + nonbib_data = {"data_links_rows": [{"baz": 0}]} + self.app.update_storage("linkstest", "nonbib_data", nonbib_data) + with mock.patch("requests.put") as p: bad = mock.Mock() bad.status_code = 500 good = mock.Mock() good.status_code = 200 p.side_effect = [bad, good] - datalinks_payload = {u'bibcode': u'linkstest', u'data_links_rows': [{u'baz': 0}]} - checksum = 'thechecksum' + datalinks_payload = { + "bibcode": "linkstest", + "data_links_rows": [{"baz": 0}], + } + checksum = "thechecksum" self.app.index_datalinks([datalinks_payload], [checksum]) - p.assert_called_with('http://localhost:8080/update', - data=json.dumps([{'bibcode': 'linkstest', 'data_links_rows': [{'baz': 0}]}]), - headers={'Authorization': 'Bearer fixme'}) + p.assert_called_with( + 
"http://localhost:8080/update", + data=json.dumps( + [{"bibcode": "linkstest", "data_links_rows": [{"baz": 0}]}] + ), + headers={"Authorization": "Bearer fixme"}, + ) self.assertEqual(p.call_count, 2) # verify database updated - rec = self.app.get_record(bibcode='linkstest') - self.assertEqual(rec['datalinks_checksum'], 'thechecksum') - self.assertEqual(rec['solr_checksum'], None) - self.assertEqual(rec['metrics_checksum'], None) - self.assertEqual(rec['status'], 'success') - self.assertTrue(rec['datalinks_processed']) + rec = self.app.get_record(bibcode="linkstest") + self.assertEqual(rec["datalinks_checksum"], "thechecksum") + self.assertEqual(rec["solr_checksum"], None) + self.assertEqual(rec["metrics_checksum"], None) + self.assertEqual(rec["status"], "success") + self.assertTrue(rec["datalinks_processed"]) def test_index_datalinks_update_processed_false(self): m = mock.Mock() m.status_code = 200 # init database so timestamps and checksum can be updated - nonbib_data = {'data_links_rows': [{'baz': 0}]} - self.app.update_storage('linkstest', 'nonbib_data', nonbib_data) - with mock.patch('requests.put', return_value=m) as p: - datalinks_payload = {u'bibcode': u'linkstest', u'data_links_rows': [{u'baz': 0}]} - checksum = 'thechecksum' - self.app.index_datalinks([datalinks_payload], [checksum], update_processed=False) - p.assert_called_with('http://localhost:8080/update', - data=json.dumps([{'bibcode': 'linkstest', 'data_links_rows': [{'baz': 0}]}]), - headers={'Authorization': 'Bearer fixme'}) + nonbib_data = {"data_links_rows": [{"baz": 0}]} + self.app.update_storage("linkstest", "nonbib_data", nonbib_data) + with mock.patch("requests.put", return_value=m) as p: + datalinks_payload = { + "bibcode": "linkstest", + "data_links_rows": [{"baz": 0}], + } + checksum = "thechecksum" + self.app.index_datalinks( + [datalinks_payload], [checksum], update_processed=False + ) + p.assert_called_with( + "http://localhost:8080/update", + data=json.dumps( + [{"bibcode": 
"linkstest", "data_links_rows": [{"baz": 0}]}] + ), + headers={"Authorization": "Bearer fixme"}, + ) # verify database updated - rec = self.app.get_record(bibcode='linkstest') - self.assertEqual(rec['datalinks_checksum'], None) - self.assertEqual(rec['solr_checksum'], None) - self.assertEqual(rec['metrics_checksum'], None) - self.assertEqual(rec['status'], None) - self.assertEqual(rec['datalinks_processed'], None) + rec = self.app.get_record(bibcode="linkstest") + self.assertEqual(rec["datalinks_checksum"], None) + self.assertEqual(rec["solr_checksum"], None) + self.assertEqual(rec["metrics_checksum"], None) + self.assertEqual(rec["status"], None) + self.assertEqual(rec["datalinks_processed"], None) def test_update_records_db_error(self): """test database exception IntegrityError is caught""" - with mock.patch('sqlalchemy.orm.session.Session.commit', side_effect=[IntegrityError('a', 'b', 'c', 'd'), None]): - self.assertRaises(IntegrityError, self.app.update_storage, 'abc', 'nonbib_data', '{}') - + with mock.patch( + "sqlalchemy.orm.session.Session.commit", + side_effect=[IntegrityError("a", "b", "c", "d"), None], + ): + self.assertRaises( + IntegrityError, self.app.update_storage, "abc", "nonbib_data", "{}" + ) + def test_rename_bibcode(self): - self.app.update_storage('abc', 'metadata', {'foo': 'bar', 'hey': 1}) - r = self.app.get_record('abc') - - self.app.rename_bibcode('abc', 'def') - + self.app.update_storage("abc", "metadata", {"foo": "bar", "hey": 1}) + r = self.app.get_record("abc") + + self.app.rename_bibcode("abc", "def") + with self.app.session_scope() as session: - ref = session.query(models.IdentifierMapping).filter_by(key='abc').first() - self.assertTrue(ref.target, 'def') - - self.assertTrue(self.app.get_changelog('abc'), [{'target': u'def', 'key': u'abc'}]) + ref = session.query(models.IdentifierMapping).filter_by(key="abc").first() + self.assertTrue(ref.target, "def") + + self.assertTrue( + self.app.get_changelog("abc"), [{"target": "def", "key": 
"abc"}] + ) def test_generate_links_for_resolver(self): - only_nonbib = {'bibcode': 'asdf', - 'nonbib_data': - {'data_links_rows': [{'url': ['http://arxiv.org/abs/1902.09522']}]}} + only_nonbib = { + "bibcode": "asdf", + "nonbib_data": { + "data_links_rows": [{"url": ["http://arxiv.org/abs/1902.09522"]}] + }, + } links = self.app.generate_links_for_resolver(only_nonbib) - self.assertEqual(only_nonbib['bibcode'], links['bibcode']) - self.assertEqual(only_nonbib['nonbib_data']['data_links_rows'], links['data_links_rows']) + self.assertEqual(only_nonbib["bibcode"], links["bibcode"]) + self.assertEqual( + only_nonbib["nonbib_data"]["data_links_rows"], links["data_links_rows"] + ) - only_bib = {'bibcode': 'asdf', - 'bib_data': - {'links_data': ['{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://arxiv.org/abs/1902.09522"}']}} + only_bib = { + "bibcode": "asdf", + "bib_data": { + "links_data": [ + '{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://arxiv.org/abs/1902.09522"}' + ] + }, + } links = self.app.generate_links_for_resolver(only_bib) - self.assertEqual(only_bib['bibcode'], links['bibcode']) - first = links['data_links_rows'][0] - self.assertEqual('http://arxiv.org/abs/1902.09522', first['url'][0]) - self.assertEqual('ESOURCE', first['link_type']) - self.assertEqual('EPRINT_HTML', first['link_sub_type']) - self.assertEqual([''], first['title']) - self.assertEqual(0, first['item_count']) - - bib_and_nonbib = {'bibcode': 'asdf', - 'bib_data': - {'links_data': ['{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://arxiv.org/abs/1902.09522zz"}']}, - 'nonbib_data': - {'data_links_rows': [{'url': ['http://arxiv.org/abs/1902.09522']}]}} + self.assertEqual(only_bib["bibcode"], links["bibcode"]) + first = links["data_links_rows"][0] + self.assertEqual("http://arxiv.org/abs/1902.09522", first["url"][0]) + self.assertEqual("ESOURCE", first["link_type"]) + 
self.assertEqual("EPRINT_HTML", first["link_sub_type"]) + self.assertEqual([""], first["title"]) + self.assertEqual(0, first["item_count"]) + + bib_and_nonbib = { + "bibcode": "asdf", + "bib_data": { + "links_data": [ + '{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://arxiv.org/abs/1902.09522zz"}' + ] + }, + "nonbib_data": { + "data_links_rows": [{"url": ["http://arxiv.org/abs/1902.09522"]}] + }, + } links = self.app.generate_links_for_resolver(bib_and_nonbib) - self.assertEqual(only_nonbib['bibcode'], links['bibcode']) - self.assertEqual(only_nonbib['nonbib_data']['data_links_rows'], links['data_links_rows']) + self.assertEqual(only_nonbib["bibcode"], links["bibcode"]) + self.assertEqual( + only_nonbib["nonbib_data"]["data_links_rows"], links["data_links_rows"] + ) # string in database - only_bib = {'bibcode': 'asdf', - 'bib_data': - {'links_data': [u'{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://arxiv.org/abs/1902.09522"}']}} + only_bib = { + "bibcode": "asdf", + "bib_data": { + "links_data": [ + '{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://arxiv.org/abs/1902.09522"}' + ] + }, + } links = self.app.generate_links_for_resolver(only_bib) - self.assertEqual(only_bib['bibcode'], links['bibcode']) - first = links['data_links_rows'][0] - self.assertEqual('http://arxiv.org/abs/1902.09522', first['url'][0]) - self.assertEqual('ESOURCE', first['link_type']) - self.assertEqual('EPRINT_HTML', first['link_sub_type']) - + self.assertEqual(only_bib["bibcode"], links["bibcode"]) + first = links["data_links_rows"][0] + self.assertEqual("http://arxiv.org/abs/1902.09522", first["url"][0]) + self.assertEqual("ESOURCE", first["link_type"]) + self.assertEqual("EPRINT_HTML", first["link_sub_type"]) + # bad string in database - with mock.patch.object(self.app.logger, 'error') as m: - only_bib = {'bibcode': 'testbib', - 'bib_data': - {'links_data': u'foobar[!)'}} + with 
mock.patch.object(self.app.logger, "error") as m: + only_bib = {"bibcode": "testbib", "bib_data": {"links_data": "foobar[!)"}} links = self.app.generate_links_for_resolver(only_bib) self.assertEqual(None, links) self.assertEqual(1, m.call_count) m_args = m.call_args_list - self.assertTrue('testbib' in str(m_args[0])) - self.assertTrue('foobar' in str(m_args[0])) + self.assertTrue("testbib" in str(m_args[0])) + self.assertTrue("foobar" in str(m_args[0])) def test_should_include_in_sitemap_comprehensive(self): """Test all code paths and scenarios in should_include_in_sitemap function""" - + base_time = adsputils.get_date() - - + # Test 1: Record with no bib_data (should be excluded) record_no_data = { - 'bibcode': '2023NoData..1..1A', - 'bib_data': None, - 'status': 'success' + "bibcode": "2023NoData..1..1A", + "has_bib_data": False, + "status": "success", } - self.assertFalse(self.app.should_include_in_sitemap(record_no_data), - "Record without bib_data should be excluded") - + self.assertFalse( + self.app.should_include_in_sitemap(record_no_data), + "Record without bib_data should be excluded", + ) + # Test 2: Record with empty bib_data string (should be excluded) record_empty_data = { - 'bibcode': '2023Empty..1..1A', - 'bib_data': '', - 'status': 'success' + "bibcode": "2023Empty..1..1A", + "has_bib_data": False, + "status": "success", } - self.assertFalse(self.app.should_include_in_sitemap(record_empty_data), - "Record with empty bib_data should be excluded") - + self.assertFalse( + self.app.should_include_in_sitemap(record_empty_data), + "Record with empty bib_data should be excluded", + ) + # Test 3: Record with solr-failed status (should be excluded) record_solr_failed = { - 'bibcode': '2023Failed..1..1A', - 'bib_data': '{"title": "Test"}', - 'status': 'solr-failed' + "bibcode": "2023Failed..1..1A", + "has_bib_data": True, + "status": "solr-failed", } - self.assertFalse(self.app.should_include_in_sitemap(record_solr_failed), - "Record with solr-failed status 
should be excluded") - + self.assertFalse( + self.app.should_include_in_sitemap(record_solr_failed), + "Record with solr-failed status should be excluded", + ) + # Test 4: Record with retrying status (should be excluded) record_retrying = { - 'bibcode': '2023Retrying..1..1A', - 'bib_data': {'title': 'Test'}, - 'status': 'retrying' + "bibcode": "2023Retrying..1..1A", + "has_bib_data": True, + "status": "retrying", } - self.assertFalse(self.app.should_include_in_sitemap(record_retrying), - "Record with retrying status should be excluded") - + self.assertFalse( + self.app.should_include_in_sitemap(record_retrying), + "Record with retrying status should be excluded", + ) + # Test 5: Record with None status (should be included) record_none_status = { - 'bibcode': '2023NoneStatus..1..1A', - 'bib_data': {'title': 'Test'}, - 'status': None + "bibcode": "2023NoneStatus..1..1A", + "has_bib_data": True, + "status": None, } - self.assertTrue(self.app.should_include_in_sitemap(record_none_status), - "Record with None status should be included") - + self.assertTrue( + self.app.should_include_in_sitemap(record_none_status), + "Record with None status should be included", + ) + # Test 6: Record with success status (should be included) record_success = { - 'bibcode': '2023Success..1..1A', - 'bib_data': '{"title": "Test"}', - 'status': 'success', - 'bib_data_updated': base_time - timedelta(days=1) + "bibcode": "2023Success..1..1A", + "has_bib_data": True, + "status": "success", + "bib_data_updated": base_time - timedelta(days=1), } - self.assertTrue(self.app.should_include_in_sitemap(record_success), - "Record with success status should be included") - + self.assertTrue( + self.app.should_include_in_sitemap(record_success), + "Record with success status should be included", + ) + # Test 7: Record with metrics-failed status (should be included - not SOLR-related) record_metrics_failed = { - 'bibcode': '2023MetricsFailed..1..1A', - 'bib_data': {'title': 'Test'}, - 'status': 
'metrics-failed' + "bibcode": "2023MetricsFailed..1..1A", + "has_bib_data": True, + "status": "metrics-failed", } - self.assertTrue(self.app.should_include_in_sitemap(record_metrics_failed), - "Record with metrics-failed status should be included (not SOLR-related)") - + self.assertTrue( + self.app.should_include_in_sitemap(record_metrics_failed), + "Record with metrics-failed status should be included (not SOLR-related)", + ) + # Test 8: Record with links-failed status (should be included - not SOLR-related) record_links_failed = { - 'bibcode': '2023LinksFailed..1..1A', - 'bib_data': {'title': 'Test'}, - 'status': 'links-failed' + "bibcode": "2023LinksFailed..1..1A", + "has_bib_data": True, + "status": "links-failed", } - self.assertTrue(self.app.should_include_in_sitemap(record_links_failed), - "Record with links-failed status should be included (not SOLR-related)") - + self.assertTrue( + self.app.should_include_in_sitemap(record_links_failed), + "Record with links-failed status should be included (not SOLR-related)", + ) + # Test 9: Record with None status and no solr_processed (should be included - not yet processed) record_not_processed = { - 'bibcode': '2023NotProcessed..1..1A', - 'bib_data': {'title': 'Test'}, - 'status': None, - 'solr_processed': None + "bibcode": "2023NotProcessed..1..1A", + "has_bib_data": True, + "status": None, + "solr_processed": None, } - self.assertTrue(self.app.should_include_in_sitemap(record_not_processed), - "Record not yet processed by SOLR should be included") - + self.assertTrue( + self.app.should_include_in_sitemap(record_not_processed), + "Record not yet processed by SOLR should be included", + ) + # Test 10: Record with recent solr_processed (should be included) record_recent_solr = { - 'bibcode': '2023Recent..1..1A', - 'bib_data': {'title': 'Test'}, - 'status': 'success', - 'bib_data_updated': base_time - timedelta(days=1), - 'solr_processed': base_time # More recent than bib_data_updated + "bibcode": "2023Recent..1..1A", 
+ "has_bib_data": True, + "status": "success", + "bib_data_updated": base_time - timedelta(days=1), + "solr_processed": base_time, # More recent than bib_data_updated } - self.assertTrue(self.app.should_include_in_sitemap(record_recent_solr), - "Record with recent SOLR processing should be included") - + self.assertTrue( + self.app.should_include_in_sitemap(record_recent_solr), + "Record with recent SOLR processing should be included", + ) + # Test 11: Record with stale solr_processed (should be included with warning) record_stale_solr = { - 'bibcode': '2023Stale..1..1A', - 'bib_data': {'title': 'Test'}, - 'status': 'success', - 'bib_data_updated': base_time, - 'solr_processed': base_time - timedelta(days=6) # 6 days stale (> 5 day threshold) + "bibcode": "2023Stale..1..1A", + "has_bib_data": True, + "status": "success", + "bib_data_updated": base_time, + "solr_processed": base_time + - timedelta(days=6), # 6 days stale (> 5 day threshold) } - self.assertTrue(self.app.should_include_in_sitemap(record_stale_solr), - "Record with stale SOLR processing should still be included (with warning)") - + self.assertTrue( + self.app.should_include_in_sitemap(record_stale_solr), + "Record with stale SOLR processing should still be included (with warning)", + ) + # Test 12: Record with exactly 5+ days staleness (boundary condition) record_boundary = { - 'bibcode': '2023Boundary..1..1A', - 'bib_data': {'title': 'Test'}, - 'status': 'success', - 'bib_data_updated': base_time, - 'solr_processed': base_time - timedelta(days=5, seconds=1) # Just over 5 days + "bibcode": "2023Boundary..1..1A", + "has_bib_data": True, + "status": "success", + "bib_data_updated": base_time, + "solr_processed": base_time + - timedelta(days=5, seconds=1), # Just over 5 days } - self.assertTrue(self.app.should_include_in_sitemap(record_boundary), - "Record with exactly 5+ days staleness should be included with warning") - + self.assertTrue( + self.app.should_include_in_sitemap(record_boundary), + "Record 
with exactly 5+ days staleness should be included with warning", + ) + # Test 13: Record with no timestamps (should be included) record_no_timestamps = { - 'bibcode': '2023NoTimestamps..1..1A', - 'bib_data': {'title': 'Test'}, - 'status': 'success', - 'bib_data_updated': None, - 'solr_processed': None + "bibcode": "2023NoTimestamps..1..1A", + "has_bib_data": True, + "status": "success", + "bib_data_updated": None, + "solr_processed": None, } - self.assertTrue(self.app.should_include_in_sitemap(record_no_timestamps), - "Record with no timestamps should be included") - + self.assertTrue( + self.app.should_include_in_sitemap(record_no_timestamps), + "Record with no timestamps should be included", + ) + # Test 14: Record with bib_data_updated but no solr_processed (should be included) record_no_solr_time = { - 'bibcode': '2023NoSolrTime..1..1A', - 'bib_data': {'title': 'Test'}, - 'status': 'success', - 'bib_data_updated': base_time, - 'solr_processed': None + "bibcode": "2023NoSolrTime..1..1A", + "has_bib_data": True, + "status": "success", + "bib_data_updated": base_time, + "solr_processed": None, } - self.assertTrue(self.app.should_include_in_sitemap(record_no_solr_time), - "Record with bib_data_updated but no solr_processed should be included") - + self.assertTrue( + self.app.should_include_in_sitemap(record_no_solr_time), + "Record with bib_data_updated but no solr_processed should be included", + ) + # Test 15: Record with solr_processed but no bib_data_updated (should be included) record_no_bib_time = { - 'bibcode': '2023NoBibTime..1..1A', - 'bib_data': {'title': 'Test'}, - 'status': 'success', - 'bib_data_updated': None, - 'solr_processed': base_time + "bibcode": "2023NoBibTime..1..1A", + "has_bib_data": True, + "status": "success", + "bib_data_updated": None, + "solr_processed": base_time, } - self.assertTrue(self.app.should_include_in_sitemap(record_no_bib_time), - "Record with solr_processed but no bib_data_updated should be included") - + self.assertTrue( + 
self.app.should_include_in_sitemap(record_no_bib_time), + "Record with solr_processed but no bib_data_updated should be included", + ) + # Test 16: Record with very fresh processing (should be included) record_fresh = { - 'bibcode': '2023Fresh..1..1A', - 'bib_data': {'title': 'Test'}, - 'status': 'success', - 'bib_data_updated': base_time - timedelta(minutes=30), - 'solr_processed': base_time + "bibcode": "2023Fresh..1..1A", + "has_bib_data": True, + "status": "success", + "bib_data_updated": base_time - timedelta(minutes=30), + "solr_processed": base_time, } - self.assertTrue(self.app.should_include_in_sitemap(record_fresh), - "Record with very fresh processing should be included") - + self.assertTrue( + self.app.should_include_in_sitemap(record_fresh), + "Record with very fresh processing should be included", + ) + # Test 17: Record with moderate lag (2 days, should be included without warning) record_moderate_lag = { - 'bibcode': '2023Moderate..1..1A', - 'bib_data': {'title': 'Test'}, - 'status': 'success', - 'bib_data_updated': base_time - timedelta(days=2), - 'solr_processed': base_time + "bibcode": "2023Moderate..1..1A", + "has_bib_data": True, + "status": "success", + "bib_data_updated": base_time - timedelta(days=2), + "solr_processed": base_time, } - self.assertTrue(self.app.should_include_in_sitemap(record_moderate_lag), - "Record with moderate processing lag should be included") + self.assertTrue( + self.app.should_include_in_sitemap(record_moderate_lag), + "Record with moderate processing lag should be included", + ) def test_get_records_bulk_performance(self): """Test get_records_bulk with a considerable number of records""" - + # Create 1000 test records test_bibcodes = [] - + for i in range(1000): - bibcode = f'2023Bulk..{i:04d}..{i:04d}A' + bibcode = f"2023Bulk..{i:04d}..{i:04d}A" test_bibcodes.append(bibcode) - + # Simple test data - bib_data = { - 'title': f'Test Paper {i}', - 'year': 2023 - } - + bib_data = {"title": f"Test Paper {i}", "year": 
2023} + # Store record in database - self.app.update_storage(bibcode, 'bib_data', bib_data) - + self.app.update_storage(bibcode, "bib_data", bib_data) + # Test 1: Get all records with default fields with self.app.session_scope() as session: start_time = adsputils.get_date() - + result = self.app.get_records_bulk(test_bibcodes, session) - + end_time = adsputils.get_date() query_time = (end_time - start_time).total_seconds() - + # Performance assertion - should complete within reasonable time - self.assertLess(query_time, 10.0, f"Bulk query took {query_time:.2f}s, should be under 10s") - + self.assertLess( + query_time, + 10.0, + f"Bulk query took {query_time:.2f}s, should be under 10s", + ) + # Verify all records returned self.assertEqual(len(result), 1000, "Should return all 1000 records") - + # Verify basic structure for bibcode in test_bibcodes[:5]: # Check first 5 records self.assertIn(bibcode, result, f"Should contain record {bibcode}") record = result[bibcode] - + # Check required fields are present - self.assertIn('id', record, "Should contain id field") - self.assertIn('bibcode', record, "Should contain bibcode field") - self.assertIn('bib_data', record, "Should contain bib_data field") - + self.assertIn("id", record, "Should contain id field") + self.assertIn("bibcode", record, "Should contain bibcode field") + self.assertIn("bib_data", record, "Should contain bib_data field") + # Verify bibcode matches - self.assertEqual(record['bibcode'], bibcode, "Bibcode should match") - - print(f" get_records_bulk performance: 1000 records retrieved in {query_time:.2f}s") - + self.assertEqual(record["bibcode"], bibcode, "Bibcode should match") + + print( + f" get_records_bulk performance: 1000 records retrieved in {query_time:.2f}s" + ) + # Test 2: Test load_only functionality with self.app.session_scope() as session: result_limited = self.app.get_records_bulk( - test_bibcodes[:10], - session, - load_only=['bibcode', 'bib_data_updated'] + test_bibcodes[:10], session, 
load_only=["bibcode", "bib_data_updated"] ) - + # Verify correct fields returned for bibcode in test_bibcodes[:5]: record = result_limited[bibcode] - + # Should have requested fields - self.assertIn('bibcode', record, "Should contain bibcode field") - self.assertIn('bib_data_updated', record, "Should contain bib_data_updated field") - + self.assertIn("bibcode", record, "Should contain bibcode field") + self.assertIn( + "bib_data_updated", record, "Should contain bib_data_updated field" + ) + # Should not have other fields (they should be None) - self.assertIsNone(record.get('bib_data'), "bib_data should be None when not requested") - + self.assertIsNone( + record.get("bib_data"), "bib_data should be None when not requested" + ) + # Test 3: Empty bibcode list with self.app.session_scope() as session: empty_result = self.app.get_records_bulk([], session) - self.assertEqual(empty_result, {}, "Empty bibcode list should return empty dict") - + self.assertEqual( + empty_result, {}, "Empty bibcode list should return empty dict" + ) + # Test 4: Non-existent bibcodes - fake_bibcodes = ['2023Fake..1..1A', '2023Fake..1..2B'] + fake_bibcodes = ["2023Fake..1..1A", "2023Fake..1..2B"] with self.app.session_scope() as session: fake_result = self.app.get_records_bulk(fake_bibcodes, session) - self.assertEqual(fake_result, {}, "Non-existent bibcodes should return empty dict") + self.assertEqual( + fake_result, {}, "Non-existent bibcodes should return empty dict" + ) def test_get_sitemap_info_bulk_performance(self): """Test get_sitemap_info_bulk with a considerable number of sitemaps""" - + # Create 1000 test records and sitemap entries test_bibcodes = [] - + for i in range(1000): - bibcode = f'2023Sitemap..{i:04d}..{i:04d}A' + bibcode = f"2023Sitemap..{i:04d}..{i:04d}A" test_bibcodes.append(bibcode) - + # Simple test data - bib_data = { - 'title': f'Test Sitemap Paper {i}', - 'year': 2023 - } - + bib_data = {"title": f"Test Sitemap Paper {i}", "year": 2023} + # Store record in 
database - self.app.update_storage(bibcode, 'bib_data', bib_data) - + self.app.update_storage(bibcode, "bib_data", bib_data) + # Create sitemap entries for these records with self.app.session_scope() as session: # Get record IDs - records = session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all() + records = ( + session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all() + ) record_map = {r.bibcode: r.id for r in records} - + # Create sitemap info entries for i, bibcode in enumerate(test_bibcodes): sitemap_info = SitemapInfo( record_id=record_map[bibcode], bibcode=bibcode, - sitemap_filename=f'sitemap_bib_{(i // 50) + 1}.xml', # 50 records per file + sitemap_filename=f"sitemap_bib_{(i // 50) + 1}.xml", # 50 records per file filename_lastmoddate=adsputils.get_date(), - update_flag=False + update_flag=False, ) session.add(sitemap_info) session.commit() - + # Test 1: Get all sitemap infos with performance timing with self.app.session_scope() as session: start_time = adsputils.get_date() - + result = self.app.get_sitemap_info_bulk(test_bibcodes, session) - + end_time = adsputils.get_date() query_time = (end_time - start_time).total_seconds() - + # Performance assertion - should complete within reasonable time - self.assertLess(query_time, 10.0, f"Bulk sitemap query took {query_time:.2f}s, should be under 10s") - + self.assertLess( + query_time, + 10.0, + f"Bulk sitemap query took {query_time:.2f}s, should be under 10s", + ) + # Verify all sitemap infos returned self.assertEqual(len(result), 1000, "Should return all 1000 sitemap infos") - + # Verify basic structure for bibcode in test_bibcodes[:5]: # Check first 5 records - self.assertIn(bibcode, result, f"Should contain sitemap info for {bibcode}") + self.assertIn( + bibcode, result, f"Should contain sitemap info for {bibcode}" + ) sitemap_data = result[bibcode] - + # Check required fields are present (toJSON() format) - self.assertIn('bibcode', sitemap_data, "Should contain bibcode field") - 
self.assertIn('sitemap_filename', sitemap_data, "Should contain sitemap_filename field") - self.assertIn('update_flag', sitemap_data, "Should contain update_flag field") - + self.assertIn("bibcode", sitemap_data, "Should contain bibcode field") + self.assertIn( + "sitemap_filename", + sitemap_data, + "Should contain sitemap_filename field", + ) + self.assertIn( + "update_flag", sitemap_data, "Should contain update_flag field" + ) + # Verify bibcode matches - self.assertEqual(sitemap_data['bibcode'], bibcode, "Bibcode should match") - + self.assertEqual( + sitemap_data["bibcode"], bibcode, "Bibcode should match" + ) + # Verify filename format - self.assertTrue(sitemap_data['sitemap_filename'].startswith('sitemap_bib_'), - "Filename should have correct format") - - print(f"get_sitemap_info_bulk performance: 1000 sitemap infos retrieved in {query_time:.2f}s") - + self.assertTrue( + sitemap_data["sitemap_filename"].startswith("sitemap_bib_"), + "Filename should have correct format", + ) + + print( + f"get_sitemap_info_bulk performance: 1000 sitemap infos retrieved in {query_time:.2f}s" + ) + # Test 2: Empty bibcode list with self.app.session_scope() as session: empty_result = self.app.get_sitemap_info_bulk([], session) - self.assertEqual(empty_result, {}, "Empty bibcode list should return empty dict") - + self.assertEqual( + empty_result, {}, "Empty bibcode list should return empty dict" + ) + # Test 3: Non-existent bibcodes - fake_bibcodes = ['2023FakeSitemap..1..1A', '2023FakeSitemap..1..2B'] + fake_bibcodes = ["2023FakeSitemap..1..1A", "2023FakeSitemap..1..2B"] with self.app.session_scope() as session: fake_result = self.app.get_sitemap_info_bulk(fake_bibcodes, session) - self.assertEqual(fake_result, {}, "Non-existent bibcodes should return empty dict") + self.assertEqual( + fake_result, {}, "Non-existent bibcodes should return empty dict" + ) def test_get_current_sitemap_state_performance(self): """Test get_current_sitemap_state with multiple sitemaps and 
records""" - + # Create test records across multiple sitemap files test_bibcodes = [] - - for i in range(500): - bibcode = f'2023State..{i:04d}..{i:04d}A' + + for i in range(500): + bibcode = f"2023State..{i:04d}..{i:04d}A" test_bibcodes.append(bibcode) - + # Create highly unique bib_data to ensure different scix_ids - + bib_data = { - 'title': f'Test State Paper {i} - Unique Content {i*17} - {bibcode}', - 'year': 2023 + (i % 10), # Vary the year - 'bibcode': bibcode, # Include bibcode for uniqueness - 'abstract': f'This is a unique abstract for paper {i} with specific content {i*23} and bibcode {bibcode}', - 'authors': [f'Author{i}_{bibcode}', f'CoAuthor{i*2}_{bibcode}'], - 'unique_field': f'unique_value_{i}_{i*37}_{bibcode}_{int(time.time()*1000000) % 1000000}', - 'doi': f'10.1000/test.{i}.{i*41}', - 'page': f'{i*100}-{i*100+10}', - 'volume': str(i % 100 + 1), - 'issue': str(i % 12 + 1) + "title": f"Test State Paper {i} - Unique Content {i*17} - {bibcode}", + "year": 2023 + (i % 10), # Vary the year + "bibcode": bibcode, # Include bibcode for uniqueness + "abstract": f"This is a unique abstract for paper {i} with specific content {i*23} and bibcode {bibcode}", + "authors": [f"Author{i}_{bibcode}", f"CoAuthor{i*2}_{bibcode}"], + "unique_field": f"unique_value_{i}_{i*37}_{bibcode}_{int(time.time()*1000000) % 1000000}", + "doi": f"10.1000/test.{i}.{i*41}", + "page": f"{i*100}-{i*100+10}", + "volume": str(i % 100 + 1), + "issue": str(i % 12 + 1), } - + # Store record in database - self.app.update_storage(bibcode, 'bib_data', bib_data) - + self.app.update_storage(bibcode, "bib_data", bib_data) + # Test Scenario 1: Last file has EQUAL records (100 each) with self.app.session_scope() as session: # Get record IDs - records = session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all() + records = ( + session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all() + ) record_map = {r.bibcode: r.id for r in records} - + # Create sitemap info entries - 
all files have 100 records each sitemap_distributions_equal = [ - ('sitemap_bib_1.xml', 100), # 100 records - ('sitemap_bib_2.xml', 100), # 100 records - ('sitemap_bib_3.xml', 100), # 100 records - ('sitemap_bib_4.xml', 100), # 100 records - ('sitemap_bib_5.xml', 100), # 100 records (equal - should be returned as highest) + ("sitemap_bib_1.xml", 100), # 100 records + ("sitemap_bib_2.xml", 100), # 100 records + ("sitemap_bib_3.xml", 100), # 100 records + ("sitemap_bib_4.xml", 100), # 100 records + ( + "sitemap_bib_5.xml", + 100, + ), # 100 records (equal - should be returned as highest) ] - + bibcode_index = 0 for filename, record_count in sitemap_distributions_equal: for _ in range(record_count): @@ -853,49 +1064,65 @@ def test_get_current_sitemap_state_performance(self): bibcode=bibcode, sitemap_filename=filename, filename_lastmoddate=adsputils.get_date(), - update_flag=False + update_flag=False, ) session.add(sitemap_info) bibcode_index += 1 session.commit() - + # Test 1: Get current sitemap state with performance timing (EQUAL scenario) with self.app.session_scope() as session: start_time = adsputils.get_date() - + result = self.app.get_current_sitemap_state(session) - + end_time = adsputils.get_date() query_time = (end_time - start_time).total_seconds() - + # Performance assertion - should complete quickly - self.assertLess(query_time, 2.0, f"Sitemap state query took {query_time:.3f}s, should be under 2s") - + self.assertLess( + query_time, + 2.0, + f"Sitemap state query took {query_time:.3f}s, should be under 2s", + ) + # Verify it returns the latest filename (highest index) when all have equal records - self.assertEqual(result['filename'], 'sitemap_bib_5.xml', - "Should return the highest numbered sitemap file when all have equal records") - self.assertEqual(result['count'], 100, - "Should return 100 records for the latest file (equal scenario)") - self.assertEqual(result['index'], 5, - "Should return index 5 for the latest file") - - 
print(f"get_current_sitemap_state performance (EQUAL): query completed in {query_time:.3f}s") - + self.assertEqual( + result["filename"], + "sitemap_bib_5.xml", + "Should return the highest numbered sitemap file when all have equal records", + ) + self.assertEqual( + result["count"], + 100, + "Should return 100 records for the latest file (equal scenario)", + ) + self.assertEqual( + result["index"], 5, "Should return index 5 for the latest file" + ) + + print( + f"get_current_sitemap_state performance (EQUAL): query completed in {query_time:.3f}s" + ) + # Test Scenario 2: Last file has FEWER records (100, 100, 100, 100, 80) with self.app.session_scope() as session: # Clear existing sitemap info session.query(SitemapInfo).delete(synchronize_session=False) session.commit() - + # Create new distribution where last file has fewer records sitemap_distributions_fewer = [ - ('sitemap_bib_1.xml', 100), # 100 records - ('sitemap_bib_2.xml', 100), # 100 records - ('sitemap_bib_3.xml', 100), # 100 records - ('sitemap_bib_4.xml', 100), # 100 records - ('sitemap_bib_5.xml', 80), # 80 records (fewer - should still be returned as highest) + ("sitemap_bib_1.xml", 100), # 100 records + ("sitemap_bib_2.xml", 100), # 100 records + ("sitemap_bib_3.xml", 100), # 100 records + ("sitemap_bib_4.xml", 100), # 100 records + ( + "sitemap_bib_5.xml", + 80, + ), # 80 records (fewer - should still be returned as highest) ] - + bibcode_index = 0 for filename, record_count in sitemap_distributions_fewer: for _ in range(record_count): @@ -906,62 +1133,83 @@ def test_get_current_sitemap_state_performance(self): bibcode=bibcode, sitemap_filename=filename, filename_lastmoddate=adsputils.get_date(), - update_flag=False + update_flag=False, ) session.add(sitemap_info) bibcode_index += 1 session.commit() - + # Test with fewer records in last file start_time = adsputils.get_date() - + result = self.app.get_current_sitemap_state(session) - + end_time = adsputils.get_date() query_time_fewer = (end_time - 
start_time).total_seconds() - + # Performance assertion - self.assertLess(query_time_fewer, 2.0, f"Sitemap state query took {query_time_fewer:.3f}s, should be under 2s") - + self.assertLess( + query_time_fewer, + 2.0, + f"Sitemap state query took {query_time_fewer:.3f}s, should be under 2s", + ) + # Verify it still returns the latest filename even with fewer records - self.assertEqual(result['filename'], 'sitemap_bib_5.xml', - "Should return the highest numbered sitemap file even when it has fewer records") - self.assertEqual(result['count'], 80, - "Should return 80 records for the latest file (fewer scenario)") - self.assertEqual(result['index'], 5, - "Should return index 5 for the latest file") - + self.assertEqual( + result["filename"], + "sitemap_bib_5.xml", + "Should return the highest numbered sitemap file even when it has fewer records", + ) + self.assertEqual( + result["count"], + 80, + "Should return 80 records for the latest file (fewer scenario)", + ) + self.assertEqual( + result["index"], 5, "Should return index 5 for the latest file" + ) + # Test 3: Verify state reflects the actual database content (using fewer scenario data) with self.app.session_scope() as session: # Verify the count matches actual database records - actual_count = session.query(SitemapInfo).filter( - SitemapInfo.sitemap_filename == 'sitemap_bib_5.xml' - ).count() - + actual_count = ( + session.query(SitemapInfo) + .filter(SitemapInfo.sitemap_filename == "sitemap_bib_5.xml") + .count() + ) + result = self.app.get_current_sitemap_state(session) - self.assertEqual(result['count'], actual_count, - "State count should match actual database count") - self.assertEqual(result['count'], 80, - "Should reflect the fewer records scenario (80 records)") - + self.assertEqual( + result["count"], + actual_count, + "State count should match actual database count", + ) + self.assertEqual( + result["count"], + 80, + "Should reflect the fewer records scenario (80 records)", + ) + # Test 4: Test with 
files that have None filenames (should be filtered out) with self.app.session_scope() as session: # Add some records with None filenames - none_bibcodes = ['2023None..1..1A', '2023None..2..2A'] + none_bibcodes = ["2023None..1..1A", "2023None..2..2A"] for i, bibcode in enumerate(none_bibcodes): bib_data = { - 'title': f'Test None {i} - {bibcode}', - 'year': 2024 + i, - 'bibcode': bibcode, - 'unique_field': f'none_test_{i}_{bibcode}_{int(time.time()*1000000) % 1000000}', - 'abstract': f'Unique abstract for none test {i} with bibcode {bibcode}', - 'authors': [f'NoneAuthor{i}_{bibcode}'] + "title": f"Test None {i} - {bibcode}", + "year": 2024 + i, + "bibcode": bibcode, + "unique_field": f"none_test_{i}_{bibcode}_{int(time.time()*1000000) % 1000000}", + "abstract": f"Unique abstract for none test {i} with bibcode {bibcode}", + "authors": [f"NoneAuthor{i}_{bibcode}"], } - self.app.update_storage(bibcode, 'bib_data', bib_data) - + self.app.update_storage(bibcode, "bib_data", bib_data) + # Get the record IDs - none_records = session.query(Records).filter(Records.bibcode.in_(none_bibcodes)).all() - + none_records = ( + session.query(Records).filter(Records.bibcode.in_(none_bibcodes)).all() + ) + # Add SitemapInfo entries with None filenames for record in none_records: sitemap_info = SitemapInfo( @@ -969,1415 +1217,2167 @@ def test_get_current_sitemap_state_performance(self): bibcode=record.bibcode, sitemap_filename=None, # None filename should be filtered out filename_lastmoddate=adsputils.get_date(), - update_flag=False + update_flag=False, ) session.add(sitemap_info) session.commit() - + # Should still return sitemap_bib_5.xml, ignoring None filenames result = self.app.get_current_sitemap_state(session) - self.assertEqual(result['filename'], 'sitemap_bib_5.xml', - "Should ignore None filenames and return highest valid filename") - self.assertEqual(result['count'], 80, - "Should still return 80 records from the valid highest file") - + self.assertEqual( + 
result["filename"], + "sitemap_bib_5.xml", + "Should ignore None filenames and return highest valid filename", + ) + self.assertEqual( + result["count"], + 80, + "Should still return 80 records from the valid highest file", + ) + # Test 5: Empty database state (edge case) with self.app.session_scope() as session: # Clear all sitemap info session.query(SitemapInfo).delete(synchronize_session=False) session.commit() - + result = self.app.get_current_sitemap_state(session) - + # Should return default state - self.assertEqual(result['filename'], 'sitemap_bib_1.xml', - "Should return default filename when no records exist") - self.assertEqual(result['count'], 0, - "Should return 0 count when no records exist") - self.assertEqual(result['index'], 1, - "Should return default index 1 when no records exist") + self.assertEqual( + result["filename"], + "sitemap_bib_1.xml", + "Should return default filename when no records exist", + ) + self.assertEqual( + result["count"], 0, "Should return 0 count when no records exist" + ) + self.assertEqual( + result["index"], + 1, + "Should return default index 1 when no records exist", + ) def test_process_sitemap_batch_session_persistence(self): """Test _process_sitemap_batch with session management and persistence""" - + # Create test records for batch processing test_bibcodes = [] - + for i in range(100): - bibcode = f'2023Batch..{i:04d}..{i:04d}A' + bibcode = f"2023Batch..{i:04d}..{i:04d}A" test_bibcodes.append(bibcode) - + # Simple test data - bib_data = { - 'title': f'Test Batch Paper {i}', - 'year': 2023 - } - + bib_data = {"title": f"Test Batch Paper {i}", "year": 2023} + # Store record in database - self.app.update_storage(bibcode, 'bib_data', bib_data) - + self.app.update_storage(bibcode, "bib_data", bib_data) + # Test session persistence with self.app.session_scope() as session: start_time = adsputils.get_date() - + # Get initial sitemap state sitemap_state = self.app.get_current_sitemap_state(session) - + # Test 1: Process 
first batch of 50 bibcodes batch_bibcodes_1 = test_bibcodes[:50] batch_stats, updated_state_1 = self.app._process_sitemap_batch( - batch_bibcodes_1, 'add', session, sitemap_state + batch_bibcodes_1, "add", session, sitemap_state ) - + end_time = adsputils.get_date() query_time = (end_time - start_time).total_seconds() - + # Performance assertion - self.assertLess(query_time, 5.0, f"Batch processing took {query_time:.3f}s, should be under 5s") - + self.assertLess( + query_time, + 5.0, + f"Batch processing took {query_time:.3f}s, should be under 5s", + ) + # Verify first batch results - self.assertEqual(batch_stats['successful'], 50, "Should successfully process all 50 bibcodes") - self.assertEqual(batch_stats['failed'], 0, "Should have no failed bibcodes") - self.assertEqual(len(batch_stats['sitemap_records']), 50, "Should return 50 sitemap records") - - print(f"process_sitemap_batch performance (ADD): processed 50 records in {query_time:.3f}s") - + self.assertEqual( + batch_stats["successful"], + 50, + "Should successfully process all 50 bibcodes", + ) + self.assertEqual(batch_stats["failed"], 0, "Should have no failed bibcodes") + self.assertEqual( + len(batch_stats["sitemap_records"]), + 50, + "Should return 50 sitemap records", + ) + + print( + f"process_sitemap_batch performance (ADD): processed 50 records in {query_time:.3f}s" + ) + # Test 2: Verify session persistence - created_records_1 = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(batch_bibcodes_1) - ).all() - - self.assertEqual(len(created_records_1), 50, - "All 50 sitemap records should be visible in same session") - + created_records_1 = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(batch_bibcodes_1)) + .all() + ) + + self.assertEqual( + len(created_records_1), + 50, + "All 50 sitemap records should be visible in same session", + ) + # Test 3: Process second batch using updated_state from first batch batch_bibcodes_2 = test_bibcodes[50:80] batch_stats, updated_state_2 
= self.app._process_sitemap_batch( - batch_bibcodes_2, 'force-update', session, updated_state_1 + batch_bibcodes_2, "force-update", session, updated_state_1 ) - + # Verify second batch results - self.assertEqual(batch_stats['successful'], 30, "Should successfully process all 30 bibcodes") - self.assertEqual(batch_stats['failed'], 0, "Should have no failed bibcodes") - self.assertEqual(len(batch_stats['sitemap_records']), 30, "Should return 30 sitemap records") - + self.assertEqual( + batch_stats["successful"], + 30, + "Should successfully process all 30 bibcodes", + ) + self.assertEqual(batch_stats["failed"], 0, "Should have no failed bibcodes") + self.assertEqual( + len(batch_stats["sitemap_records"]), + 30, + "Should return 30 sitemap records", + ) + # Test 4: Verify session consistency - state should be cumulative - initial_count = sitemap_state['count'] - if initial_count + 80 <= self.app.conf.get('MAX_RECORDS_PER_SITEMAP', 50000): + initial_count = sitemap_state["count"] + if initial_count + 80 <= self.app.conf.get( + "MAX_RECORDS_PER_SITEMAP", 50000 + ): # Should be same file with cumulative records - self.assertEqual(updated_state_2['filename'], sitemap_state['filename'], - "Should use same filename when under limit") - self.assertEqual(updated_state_2['count'], initial_count + 80, - "Count should be cumulative across batches") - + self.assertEqual( + updated_state_2["filename"], + sitemap_state["filename"], + "Should use same filename when under limit", + ) + self.assertEqual( + updated_state_2["count"], + initial_count + 80, + "Count should be cumulative across batches", + ) + # Test 5: Verify all records are visible in same session (no commits yet!) 
- all_records_in_session = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(test_bibcodes[:80]) - ).all() - - self.assertEqual(len(all_records_in_session), 80, - "All 80 records should be visible in same session before commit") - + all_records_in_session = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(test_bibcodes[:80])) + .all() + ) + + self.assertEqual( + len(all_records_in_session), + 80, + "All 80 records should be visible in same session before commit", + ) + # Test 6: Verify state consistency within session current_state_in_session = self.app.get_current_sitemap_state(session) - self.assertEqual(current_state_in_session['count'], updated_state_2['count'], - "Current state should match updated state within same session") - - + self.assertEqual( + current_state_in_session["count"], + updated_state_2["count"], + "Current state should match updated state within same session", + ) + # Now commit everything at once session.commit() - + # Test 7: Verify data persisted after session ends with self.app.session_scope() as new_session: - verification_records = new_session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(test_bibcodes[:80]) - ).count() - - self.assertEqual(verification_records, 80, - "New session should see all committed records") - - + verification_records = ( + new_session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(test_bibcodes[:80])) + .count() + ) + + self.assertEqual( + verification_records, 80, "New session should see all committed records" + ) + # Test 6: Test empty batch edge case with self.app.session_scope() as session: empty_state = self.app.get_current_sitemap_state(session) batch_stats, empty_updated_state = self.app._process_sitemap_batch( - [], 'add', session, empty_state + [], "add", session, empty_state + ) + self.assertEqual( + batch_stats["successful"], 0, "Empty batch should return 0 successful" + ) + self.assertEqual( + batch_stats["failed"], 0, "Empty batch should return 0 failed" + ) + 
self.assertEqual( + len(batch_stats["sitemap_records"]), + 0, + "Empty batch should return empty records list", + ) + self.assertEqual( + empty_updated_state, + empty_state, + "Empty batch should return unchanged state", ) - self.assertEqual(batch_stats['successful'], 0, "Empty batch should return 0 successful") - self.assertEqual(batch_stats['failed'], 0, "Empty batch should return 0 failed") - self.assertEqual(len(batch_stats['sitemap_records']), 0, "Empty batch should return empty records list") - self.assertEqual(empty_updated_state, empty_state, "Empty batch should return unchanged state") def test_process_sitemap_batch_solr_filtering(self): """Test SOLR status filtering logic in _process_sitemap_batch""" - + # Create records with different statuses to test all should_include_in_sitemap logic test_bibcodes = [ - '2023Success..1..1A', # success - should be included - '2023SolrFailed..1..1A', # solr-failed - should be excluded - '2023Retrying..1..1A', # retrying - should be excluded - '2023MetricsFailed..1..1A', # metrics-failed - should be included (not SOLR-related) - '2023LinksFailed..1..1A', # links-failed - should be included (not SOLR-related) - '2023NoBibData..1..1A' # will have no bib_data - should be excluded + "2023Success..1..1A", # success - should be included + "2023SolrFailed..1..1A", # solr-failed - should be excluded + "2023Retrying..1..1A", # retrying - should be excluded + "2023MetricsFailed..1..1A", # metrics-failed - should be included (not SOLR-related) + "2023LinksFailed..1..1A", # links-failed - should be included (not SOLR-related) + "2023NoBibData..1..1A", # will have no bib_data - should be excluded ] - + for i, bibcode in enumerate(test_bibcodes): - if bibcode != '2023NoBibData..1..1A': # Skip creating bib_data for this one - bib_data = {'title': f'Test Paper {i}', 'year': 2023} - self.app.update_storage(bibcode, 'bib_data', bib_data) - + if bibcode != "2023NoBibData..1..1A": # Skip creating bib_data for this one + bib_data = {"title": 
f"Test Paper {i}", "year": 2023} + self.app.update_storage(bibcode, "bib_data", bib_data) + # Set different statuses - self.app.mark_processed(['2023SolrFailed..1..1A'], 'solr', checksums=['checksum_failed'], status='solr-failed') - self.app.mark_processed(['2023Retrying..1..1A'], 'solr', checksums=['checksum_retrying'], status='retrying') - self.app.mark_processed(['2023MetricsFailed..1..1A'], 'solr', checksums=['checksum_metrics'], status='metrics-failed') - self.app.mark_processed(['2023LinksFailed..1..1A'], 'solr', checksums=['checksum_links'], status='links-failed') + self.app.mark_processed( + ["2023SolrFailed..1..1A"], + "solr", + checksums=["checksum_failed"], + status="solr-failed", + ) + self.app.mark_processed( + ["2023Retrying..1..1A"], + "solr", + checksums=["checksum_retrying"], + status="retrying", + ) + self.app.mark_processed( + ["2023MetricsFailed..1..1A"], + "solr", + checksums=["checksum_metrics"], + status="metrics-failed", + ) + self.app.mark_processed( + ["2023LinksFailed..1..1A"], + "solr", + checksums=["checksum_links"], + status="links-failed", + ) # 2023Success..1..1A gets default 'success' status # 2023NoBibData..1..1A will have no bib_data at all - + # Test 'add' action with self.app.session_scope() as session: - initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 0, 'index': 1} - + initial_state = {"filename": "sitemap_bib_1.xml", "count": 0, "index": 1} + batch_stats, updated_state_add = self.app._process_sitemap_batch( - test_bibcodes, 'add', session, initial_state + test_bibcodes, "add", session, initial_state ) - + # Should include: success, metrics-failed, links-failed = 3 successful # Should exclude: solr-failed, retrying, no-bib-data = 3 failed - self.assertEqual(batch_stats['successful'], 3, "Add: Should include success, metrics-failed, links-failed statuses") - self.assertEqual(batch_stats['failed'], 3, "Add: Should exclude solr-failed, retrying, and no-bib-data records") - 
self.assertEqual(len(batch_stats['sitemap_records']), 3, "Add: Should return 3 sitemap records") - self.assertEqual(updated_state_add['count'], 3, "Add: State should reflect only successful records") - + self.assertEqual( + batch_stats["successful"], + 3, + "Add: Should include success, metrics-failed, links-failed statuses", + ) + self.assertEqual( + batch_stats["failed"], + 3, + "Add: Should exclude solr-failed, retrying, and no-bib-data records", + ) + self.assertEqual( + len(batch_stats["sitemap_records"]), + 3, + "Add: Should return 3 sitemap records", + ) + self.assertEqual( + updated_state_add["count"], + 3, + "Add: State should reflect only successful records", + ) + # Test 'force-update' action - should have same filtering results with self.app.session_scope() as session: - initial_state = {'filename': 'sitemap_bib_2.xml', 'count': 0, 'index': 2} - + initial_state = {"filename": "sitemap_bib_2.xml", "count": 0, "index": 2} + batch_stats, updated_state_force = self.app._process_sitemap_batch( - test_bibcodes, 'force-update', session, initial_state + test_bibcodes, "force-update", session, initial_state ) - + # Force-update should have same filtering results as add - self.assertEqual(batch_stats['successful'], 3, "Force-update: Should include success, metrics-failed, links-failed statuses") - self.assertEqual(batch_stats['failed'], 3, "Force-update: Should exclude solr-failed, retrying, and no-bib-data records") - self.assertEqual(len(batch_stats['sitemap_records']), 3, "Force-update: Should return updated sitemap records for reporting") - self.assertEqual(updated_state_force['count'], 0, "Force-update: State count should remain 0 (updating existing, not adding new)") - - # Results should be identical for filtering - self.assertEqual(batch_stats['successful'], batch_stats['successful'], "Both actions should have same successful count") - self.assertEqual(batch_stats['failed'], batch_stats['failed'], "Both actions should have same failed count") + 
self.assertEqual( + batch_stats["successful"], + 3, + "Force-update: Should include success, metrics-failed, links-failed statuses", + ) + self.assertEqual( + batch_stats["failed"], + 3, + "Force-update: Should exclude solr-failed, retrying, and no-bib-data records", + ) + self.assertEqual( + len(batch_stats["sitemap_records"]), + 3, + "Force-update: Should return updated sitemap records for reporting", + ) + self.assertEqual( + updated_state_force["count"], + 0, + "Force-update: State count should remain 0 (updating existing, not adding new)", + ) + # Results should be identical for filtering + self.assertEqual( + batch_stats["successful"], + batch_stats["successful"], + "Both actions should have same successful count", + ) + self.assertEqual( + batch_stats["failed"], + batch_stats["failed"], + "Both actions should have same failed count", + ) def test_process_sitemap_batch_new_vs_existing_records(self): """Test handling of new records vs existing sitemap entries""" - + # Create test records with specific timestamps base_time = adsputils.get_date() - new_bibcode = '2023New..1..1A' - existing_recent_bibcode = '2023ExistingRecent..1..1A' - existing_stale_bibcode = '2023ExistingStale..1..1A' - + new_bibcode = "2023New..1..1A" + existing_recent_bibcode = "2023ExistingRecent..1..1A" + existing_stale_bibcode = "2023ExistingStale..1..1A" + test_bibcodes = [new_bibcode, existing_recent_bibcode, existing_stale_bibcode] - + # Create records with specific bib_data_updated timestamps for i, bibcode in enumerate(test_bibcodes): - bib_data = {'title': f'Test Paper {bibcode}', 'year': 2023} - self.app.update_storage(bibcode, 'bib_data', bib_data) - + bib_data = {"title": f"Test Paper {bibcode}", "year": 2023} + self.app.update_storage(bibcode, "bib_data", bib_data) + # Update bib_data_updated timestamps with self.app.session_scope() as session: - session.query(Records).filter(Records.bibcode == bibcode).update({ - 'bib_data_updated': base_time - timedelta(hours=i) # Different 
timestamps - }, synchronize_session=False) + session.query(Records).filter(Records.bibcode == bibcode).update( + { + "bib_data_updated": base_time + - timedelta(hours=i) # Different timestamps + }, + synchronize_session=False, + ) session.commit() - + # Create existing sitemap entries with self.app.session_scope() as session: - records = session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all() + records = ( + session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all() + ) record_map = {r.bibcode: r.id for r in records} - + # Recent sitemap entry - filename_lastmoddate is NEWER than bib_data_updated recent_sitemap = SitemapInfo( record_id=record_map[existing_recent_bibcode], bibcode=existing_recent_bibcode, - sitemap_filename='sitemap_bib_1.xml', - filename_lastmoddate=base_time + timedelta(hours=1), # NEWER than bib_data_updated - update_flag=False + sitemap_filename="sitemap_bib_1.xml", + filename_lastmoddate=base_time + + timedelta(hours=1), # NEWER than bib_data_updated + update_flag=False, ) - + # Stale sitemap entry - filename_lastmoddate is OLDER than bib_data_updated stale_sitemap = SitemapInfo( record_id=record_map[existing_stale_bibcode], bibcode=existing_stale_bibcode, - sitemap_filename='sitemap_bib_1.xml', - filename_lastmoddate=base_time - timedelta(days=10), # OLDER than bib_data_updated - update_flag=False + sitemap_filename="sitemap_bib_1.xml", + filename_lastmoddate=base_time + - timedelta(days=10), # OLDER than bib_data_updated + update_flag=False, ) - + session.add(recent_sitemap) session.add(stale_sitemap) session.commit() - + with self.app.session_scope() as session: - initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 5, 'index': 1} - + initial_state = {"filename": "sitemap_bib_1.xml", "count": 5, "index": 1} + batch_stats, updated_state = self.app._process_sitemap_batch( - test_bibcodes, 'add', session, initial_state + test_bibcodes, "add", session, initial_state ) - + # All 3 should be successful - 
self.assertEqual(batch_stats['successful'], 3, "All records should be processed successfully") - self.assertEqual(batch_stats['failed'], 0, "No records should fail") - + self.assertEqual( + batch_stats["successful"], + 3, + "All records should be processed successfully", + ) + self.assertEqual(batch_stats["failed"], 0, "No records should fail") + # Only NEW record increments count (1 new record) - self.assertEqual(updated_state['count'], 6, "Only new record should increment count (5 + 1 = 6)") - + self.assertEqual( + updated_state["count"], + 6, + "Only new record should increment count (5 + 1 = 6)", + ) + # Check that update_flags are set correctly with self.app.session_scope() as session: - batch_stats['sitemap_records'] = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(test_bibcodes) - ).all() - - for record in batch_stats['sitemap_records']: + batch_stats["sitemap_records"] = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(test_bibcodes)) + .all() + ) + + for record in batch_stats["sitemap_records"]: if record.bibcode == new_bibcode: # New record should have update_flag = True - self.assertTrue(record.update_flag, f"New record {record.bibcode} should have update_flag=True") + self.assertTrue( + record.update_flag, + f"New record {record.bibcode} should have update_flag=True", + ) elif record.bibcode == existing_recent_bibcode: # Recent record should NOT be updated (filename_lastmoddate > bib_data_updated) - self.assertFalse(record.update_flag, f"Recent record {record.bibcode} should have update_flag=False") + self.assertFalse( + record.update_flag, + f"Recent record {record.bibcode} should have update_flag=False", + ) elif record.bibcode == existing_stale_bibcode: # Stale record should be updated (filename_lastmoddate < bib_data_updated) - self.assertTrue(record.update_flag, f"Stale record {record.bibcode} should have update_flag=True") + self.assertTrue( + record.update_flag, + f"Stale record {record.bibcode} should have 
update_flag=True", + ) def test_process_sitemap_batch_add_action_with_recent_file(self): """Test 'add' action when file is newer than data (should NOT update)""" - + base_time = adsputils.get_date() - test_bibcode = '2023AddRecent..1..1A' - bib_data = {'title': 'Test Add Recent Paper', 'year': 2023} - self.app.update_storage(test_bibcode, 'bib_data', bib_data) - + test_bibcode = "2023AddRecent..1..1A" + bib_data = {"title": "Test Add Recent Paper", "year": 2023} + self.app.update_storage(test_bibcode, "bib_data", bib_data) + # Set bib_data_updated to be OLDER than filename_lastmoddate with self.app.session_scope() as session: - session.query(Records).filter(Records.bibcode == test_bibcode).update({ - 'bib_data_updated': base_time - timedelta(hours=2) # 2 hours ago (OLDER) - }, synchronize_session=False) + session.query(Records).filter(Records.bibcode == test_bibcode).update( + { + "bib_data_updated": base_time + - timedelta(hours=2) # 2 hours ago (OLDER) + }, + synchronize_session=False, + ) session.commit() - + # Create existing sitemap entry with NEWER timestamp with self.app.session_scope() as session: - record = session.query(Records).filter(Records.bibcode == test_bibcode).first() - + record = ( + session.query(Records).filter(Records.bibcode == test_bibcode).first() + ) + sitemap_info = SitemapInfo( record_id=record.id, bibcode=test_bibcode, - sitemap_filename='sitemap_bib_1.xml', + sitemap_filename="sitemap_bib_1.xml", filename_lastmoddate=base_time, # NEWER than bib_data_updated - update_flag=False + update_flag=False, ) session.add(sitemap_info) session.commit() - + # Store original sitemap_info values for comparison with self.app.session_scope() as session: - original_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first() + original_record = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode == test_bibcode) + .first() + ) original_filename_lastmoddate = original_record.filename_lastmoddate 
original_sitemap_filename = original_record.sitemap_filename original_update_flag = original_record.update_flag - + # Test 'add' action with self.app.session_scope() as session: - initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 0, 'index': 1} - + initial_state = {"filename": "sitemap_bib_1.xml", "count": 0, "index": 1} + batch_stats, _ = self.app._process_sitemap_batch( - [test_bibcode], 'add', session, initial_state + [test_bibcode], "add", session, initial_state ) - + # Check that sitemap_info record remains unchanged - sitemap_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first() - - self.assertEqual(batch_stats['successful'], 1, "Record should be processed successfully") - self.assertEqual(batch_stats['failed'], 0, "No records should fail") - + sitemap_record = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode == test_bibcode) + .first() + ) + + self.assertEqual( + batch_stats["successful"], 1, "Record should be processed successfully" + ) + self.assertEqual(batch_stats["failed"], 0, "No records should fail") + # Verify the record was not modified - self.assertFalse(sitemap_record.update_flag, "'add' should NOT set update_flag when file is newer than data") - self.assertEqual(sitemap_record.filename_lastmoddate, original_filename_lastmoddate, "filename_lastmoddate should remain unchanged") - self.assertEqual(sitemap_record.sitemap_filename, original_sitemap_filename, "sitemap_filename should remain unchanged") - self.assertEqual(sitemap_record.update_flag, original_update_flag, "update_flag should remain unchanged (False)") + self.assertFalse( + sitemap_record.update_flag, + "'add' should NOT set update_flag when file is newer than data", + ) + self.assertEqual( + sitemap_record.filename_lastmoddate, + original_filename_lastmoddate, + "filename_lastmoddate should remain unchanged", + ) + self.assertEqual( + sitemap_record.sitemap_filename, + original_sitemap_filename, + "sitemap_filename should remain 
unchanged", + ) + self.assertEqual( + sitemap_record.update_flag, + original_update_flag, + "update_flag should remain unchanged (False)", + ) def test_process_sitemap_batch_add_action_with_stale_file(self): """Test 'add' action when data is newer than file (should update)""" - + base_time = adsputils.get_date() - test_bibcode = '2023AddStale..1..1A' - bib_data = {'title': 'Test Add Stale Paper', 'year': 2023} - self.app.update_storage(test_bibcode, 'bib_data', bib_data) - + test_bibcode = "2023AddStale..1..1A" + bib_data = {"title": "Test Add Stale Paper", "year": 2023} + self.app.update_storage(test_bibcode, "bib_data", bib_data) + # Set bib_data_updated to be NEWER than filename_lastmoddate with self.app.session_scope() as session: - session.query(Records).filter(Records.bibcode == test_bibcode).update({ - 'bib_data_updated': base_time # Current time (NEWER) - }, synchronize_session=False) + session.query(Records).filter(Records.bibcode == test_bibcode).update( + {"bib_data_updated": base_time}, # Current time (NEWER) + synchronize_session=False, + ) session.commit() - + # Create existing sitemap entry with OLDER timestamp with self.app.session_scope() as session: - record = session.query(Records).filter(Records.bibcode == test_bibcode).first() - + record = ( + session.query(Records).filter(Records.bibcode == test_bibcode).first() + ) + sitemap_info = SitemapInfo( record_id=record.id, bibcode=test_bibcode, - sitemap_filename='sitemap_bib_1.xml', - filename_lastmoddate=base_time - timedelta(hours=3), # OLDER than bib_data_updated - update_flag=False + sitemap_filename="sitemap_bib_1.xml", + filename_lastmoddate=base_time + - timedelta(hours=3), # OLDER than bib_data_updated + update_flag=False, ) session.add(sitemap_info) session.commit() - + # Store original sitemap_info values for comparison with self.app.session_scope() as session: - original_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first() + original_record = ( + 
session.query(SitemapInfo) + .filter(SitemapInfo.bibcode == test_bibcode) + .first() + ) original_filename_lastmoddate = original_record.filename_lastmoddate original_sitemap_filename = original_record.sitemap_filename original_update_flag = original_record.update_flag - + # Test 'add' action with self.app.session_scope() as session: - initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 0, 'index': 1} - + initial_state = {"filename": "sitemap_bib_1.xml", "count": 0, "index": 1} + batch_stats, _ = self.app._process_sitemap_batch( - [test_bibcode], 'add', session, initial_state + [test_bibcode], "add", session, initial_state ) - + # Check that sitemap_info record was updated appropriately - sitemap_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first() - - self.assertEqual(batch_stats['successful'], 1, "Record should be processed successfully") - self.assertEqual(batch_stats['failed'], 0, "No records should fail") - + sitemap_record = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode == test_bibcode) + .first() + ) + + self.assertEqual( + batch_stats["successful"], 1, "Record should be processed successfully" + ) + self.assertEqual(batch_stats["failed"], 0, "No records should fail") + # Verify the record was updated correctly - self.assertTrue(sitemap_record.update_flag, "'add' should set update_flag when data is newer than file") - self.assertEqual(sitemap_record.filename_lastmoddate, base_time, "filename_lastmoddate should be updated to bib_data_updated") - self.assertEqual(sitemap_record.sitemap_filename, original_sitemap_filename, "sitemap_filename should remain unchanged") - self.assertNotEqual(sitemap_record.update_flag, original_update_flag, "update_flag should have changed from False to True") - self.assertNotEqual(sitemap_record.filename_lastmoddate, original_filename_lastmoddate, "filename_lastmoddate should have been updated") + self.assertTrue( + sitemap_record.update_flag, + "'add' should set update_flag 
when data is newer than file", + ) + self.assertEqual( + sitemap_record.filename_lastmoddate, + base_time, + "filename_lastmoddate should be updated to bib_data_updated", + ) + self.assertEqual( + sitemap_record.sitemap_filename, + original_sitemap_filename, + "sitemap_filename should remain unchanged", + ) + self.assertNotEqual( + sitemap_record.update_flag, + original_update_flag, + "update_flag should have changed from False to True", + ) + self.assertNotEqual( + sitemap_record.filename_lastmoddate, + original_filename_lastmoddate, + "filename_lastmoddate should have been updated", + ) def test_process_sitemap_batch_add_action_with_never_generated_file(self): """Test 'add' action when file has never been generated (filename_lastmoddate is None)""" - + base_time = adsputils.get_date() - test_bibcode = '2023AddNeverGenerated..1..1A' - bib_data = {'title': 'Test Never Generated Paper', 'year': 2023} - self.app.update_storage(test_bibcode, 'bib_data', bib_data) - + test_bibcode = "2023AddNeverGenerated..1..1A" + bib_data = {"title": "Test Never Generated Paper", "year": 2023} + self.app.update_storage(test_bibcode, "bib_data", bib_data) + # Set bib_data_updated to any time (doesn't matter since filename_lastmoddate is None) with self.app.session_scope() as session: - session.query(Records).filter(Records.bibcode == test_bibcode).update({ - 'bib_data_updated': base_time - timedelta(hours=1) # 1 hour ago - }, synchronize_session=False) + session.query(Records).filter(Records.bibcode == test_bibcode).update( + {"bib_data_updated": base_time - timedelta(hours=1)}, # 1 hour ago + synchronize_session=False, + ) session.commit() - + # Create existing sitemap entry with None filename_lastmoddate (never generated) with self.app.session_scope() as session: - record = session.query(Records).filter(Records.bibcode == test_bibcode).first() - + record = ( + session.query(Records).filter(Records.bibcode == test_bibcode).first() + ) + sitemap_info = SitemapInfo( 
record_id=record.id, bibcode=test_bibcode, - sitemap_filename='sitemap_bib_1.xml', + sitemap_filename="sitemap_bib_1.xml", filename_lastmoddate=None, # Never been generated - update_flag=False + update_flag=False, ) session.add(sitemap_info) session.commit() - + # Store original sitemap_info values for comparison with self.app.session_scope() as session: - original_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first() + original_record = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode == test_bibcode) + .first() + ) original_filename_lastmoddate = original_record.filename_lastmoddate original_sitemap_filename = original_record.sitemap_filename original_update_flag = original_record.update_flag - + # Test 'add' action with self.app.session_scope() as session: - initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 0, 'index': 1} - + initial_state = {"filename": "sitemap_bib_1.xml", "count": 0, "index": 1} + batch_stats, _ = self.app._process_sitemap_batch( - [test_bibcode], 'add', session, initial_state + [test_bibcode], "add", session, initial_state ) - + # Check that sitemap_info record was updated appropriately - sitemap_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first() - - self.assertEqual(batch_stats['successful'], 1, "Record should be processed successfully") - self.assertEqual(batch_stats['failed'], 0, "No records should fail") - + sitemap_record = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode == test_bibcode) + .first() + ) + + self.assertEqual( + batch_stats["successful"], 1, "Record should be processed successfully" + ) + self.assertEqual(batch_stats["failed"], 0, "No records should fail") + # Verify the record was updated correctly - self.assertTrue(sitemap_record.update_flag, "'add' should set update_flag when file has never been generated") - self.assertEqual(sitemap_record.filename_lastmoddate, base_time - timedelta(hours=1), "filename_lastmoddate 
should be updated to bib_data_updated") - self.assertEqual(sitemap_record.sitemap_filename, original_sitemap_filename, "sitemap_filename should remain unchanged") - self.assertNotEqual(sitemap_record.update_flag, original_update_flag, "update_flag should have changed from False to True") - self.assertIsNone(original_filename_lastmoddate, "Original filename_lastmoddate should have been None") - self.assertIsNotNone(sitemap_record.filename_lastmoddate, "filename_lastmoddate should now be set") + self.assertTrue( + sitemap_record.update_flag, + "'add' should set update_flag when file has never been generated", + ) + self.assertEqual( + sitemap_record.filename_lastmoddate, + base_time - timedelta(hours=1), + "filename_lastmoddate should be updated to bib_data_updated", + ) + self.assertEqual( + sitemap_record.sitemap_filename, + original_sitemap_filename, + "sitemap_filename should remain unchanged", + ) + self.assertNotEqual( + sitemap_record.update_flag, + original_update_flag, + "update_flag should have changed from False to True", + ) + self.assertIsNone( + original_filename_lastmoddate, + "Original filename_lastmoddate should have been None", + ) + self.assertIsNotNone( + sitemap_record.filename_lastmoddate, + "filename_lastmoddate should now be set", + ) def test_process_sitemap_batch_force_update_with_recent_file(self): """Test 'force-update' action when file is newer than data (should still update)""" - + base_time = adsputils.get_date() - test_bibcode = '2023ForceRecent..1..1A' - bib_data = {'title': 'Test Force Recent Paper', 'year': 2023} - self.app.update_storage(test_bibcode, 'bib_data', bib_data) - + test_bibcode = "2023ForceRecent..1..1A" + bib_data = {"title": "Test Force Recent Paper", "year": 2023} + self.app.update_storage(test_bibcode, "bib_data", bib_data) + # Set bib_data_updated to be OLDER than filename_lastmoddate with self.app.session_scope() as session: - session.query(Records).filter(Records.bibcode == test_bibcode).update({ - 
'bib_data_updated': base_time - timedelta(hours=4) # 4 hours ago (OLDER) - }, synchronize_session=False) + session.query(Records).filter(Records.bibcode == test_bibcode).update( + { + "bib_data_updated": base_time + - timedelta(hours=4) # 4 hours ago (OLDER) + }, + synchronize_session=False, + ) session.commit() - + # Create existing sitemap entry with NEWER timestamp with self.app.session_scope() as session: - record = session.query(Records).filter(Records.bibcode == test_bibcode).first() - + record = ( + session.query(Records).filter(Records.bibcode == test_bibcode).first() + ) + sitemap_info = SitemapInfo( record_id=record.id, bibcode=test_bibcode, - sitemap_filename='sitemap_bib_1.xml', + sitemap_filename="sitemap_bib_1.xml", filename_lastmoddate=base_time, # NEWER than bib_data_updated - update_flag=False + update_flag=False, ) session.add(sitemap_info) session.commit() - + # Store original sitemap_info values for comparison with self.app.session_scope() as session: - original_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first() + original_record = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode == test_bibcode) + .first() + ) original_filename_lastmoddate = original_record.filename_lastmoddate original_sitemap_filename = original_record.sitemap_filename original_update_flag = original_record.update_flag - + # Test 'force-update' action with self.app.session_scope() as session: - initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 0, 'index': 1} - + initial_state = {"filename": "sitemap_bib_1.xml", "count": 0, "index": 1} + batch_stats, _ = self.app._process_sitemap_batch( - [test_bibcode], 'force-update', session, initial_state + [test_bibcode], "force-update", session, initial_state ) - + # Check that sitemap_info record was updated appropriately - sitemap_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first() - - self.assertEqual(batch_stats['successful'], 1, "Record 
should be processed successfully") - self.assertEqual(batch_stats['failed'], 0, "No records should fail") - + sitemap_record = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode == test_bibcode) + .first() + ) + + self.assertEqual( + batch_stats["successful"], 1, "Record should be processed successfully" + ) + self.assertEqual(batch_stats["failed"], 0, "No records should fail") + # Verify the record was updated correctly - self.assertTrue(sitemap_record.update_flag, "'force-update' should ALWAYS set update_flag, even when file is newer") - self.assertEqual(sitemap_record.filename_lastmoddate, base_time - timedelta(hours=4), "filename_lastmoddate should be updated to bib_data_updated") - self.assertEqual(sitemap_record.sitemap_filename, original_sitemap_filename, "sitemap_filename should remain unchanged") - self.assertNotEqual(sitemap_record.update_flag, original_update_flag, "update_flag should have changed from False to True") - self.assertNotEqual(sitemap_record.filename_lastmoddate, original_filename_lastmoddate, "filename_lastmoddate should have been updated") + self.assertTrue( + sitemap_record.update_flag, + "'force-update' should ALWAYS set update_flag, even when file is newer", + ) + self.assertEqual( + sitemap_record.filename_lastmoddate, + base_time - timedelta(hours=4), + "filename_lastmoddate should be updated to bib_data_updated", + ) + self.assertEqual( + sitemap_record.sitemap_filename, + original_sitemap_filename, + "sitemap_filename should remain unchanged", + ) + self.assertNotEqual( + sitemap_record.update_flag, + original_update_flag, + "update_flag should have changed from False to True", + ) + self.assertNotEqual( + sitemap_record.filename_lastmoddate, + original_filename_lastmoddate, + "filename_lastmoddate should have been updated", + ) def test_process_sitemap_batch_force_update_with_stale_file(self): """Test 'force-update' action when data is newer than file (should still update)""" - + base_time = adsputils.get_date() - 
test_bibcode = '2023ForceStale..1..1A' - bib_data = {'title': 'Test Force Stale Paper', 'year': 2023} - self.app.update_storage(test_bibcode, 'bib_data', bib_data) - + test_bibcode = "2023ForceStale..1..1A" + bib_data = {"title": "Test Force Stale Paper", "year": 2023} + self.app.update_storage(test_bibcode, "bib_data", bib_data) + # Set bib_data_updated to be NEWER than filename_lastmoddate with self.app.session_scope() as session: - session.query(Records).filter(Records.bibcode == test_bibcode).update({ - 'bib_data_updated': base_time # Current time (NEWER) - }, synchronize_session=False) + session.query(Records).filter(Records.bibcode == test_bibcode).update( + {"bib_data_updated": base_time}, # Current time (NEWER) + synchronize_session=False, + ) session.commit() - + # Create existing sitemap entry with OLDER timestamp with self.app.session_scope() as session: - record = session.query(Records).filter(Records.bibcode == test_bibcode).first() - + record = ( + session.query(Records).filter(Records.bibcode == test_bibcode).first() + ) + sitemap_info = SitemapInfo( record_id=record.id, bibcode=test_bibcode, - sitemap_filename='sitemap_bib_1.xml', - filename_lastmoddate=base_time - timedelta(hours=2), # OLDER than bib_data_updated - update_flag=False + sitemap_filename="sitemap_bib_1.xml", + filename_lastmoddate=base_time + - timedelta(hours=2), # OLDER than bib_data_updated + update_flag=False, ) session.add(sitemap_info) session.commit() - + # Store original sitemap_info values for comparison with self.app.session_scope() as session: - original_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first() + original_record = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode == test_bibcode) + .first() + ) original_filename_lastmoddate = original_record.filename_lastmoddate original_sitemap_filename = original_record.sitemap_filename original_update_flag = original_record.update_flag - + # Test 'force-update' action with 
self.app.session_scope() as session: - initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 0, 'index': 1} - + initial_state = {"filename": "sitemap_bib_1.xml", "count": 0, "index": 1} + batch_stats, _ = self.app._process_sitemap_batch( - [test_bibcode], 'force-update', session, initial_state + [test_bibcode], "force-update", session, initial_state ) - + # Check that sitemap_info record was updated appropriately - sitemap_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).first() - - self.assertEqual(batch_stats['successful'], 1, "Record should be processed successfully") - self.assertEqual(batch_stats['failed'], 0, "No records should fail") - - # Verify the record was updated correctly - self.assertTrue(sitemap_record.update_flag, "'force-update' should ALWAYS set update_flag, regardless of timestamps") - self.assertEqual(sitemap_record.filename_lastmoddate, base_time, "filename_lastmoddate should be updated to bib_data_updated") - self.assertEqual(sitemap_record.sitemap_filename, original_sitemap_filename, "sitemap_filename should remain unchanged") - self.assertNotEqual(sitemap_record.update_flag, original_update_flag, "update_flag should have changed from False to True") - self.assertNotEqual(sitemap_record.filename_lastmoddate, original_filename_lastmoddate, "filename_lastmoddate should have been updated") + sitemap_record = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode == test_bibcode) + .first() + ) - + self.assertEqual( + batch_stats["successful"], 1, "Record should be processed successfully" + ) + self.assertEqual(batch_stats["failed"], 0, "No records should fail") + + # Verify the record was updated correctly + self.assertTrue( + sitemap_record.update_flag, + "'force-update' should ALWAYS set update_flag, regardless of timestamps", + ) + self.assertEqual( + sitemap_record.filename_lastmoddate, + base_time, + "filename_lastmoddate should be updated to bib_data_updated", + ) + self.assertEqual( + 
sitemap_record.sitemap_filename, + original_sitemap_filename, + "sitemap_filename should remain unchanged", + ) + self.assertNotEqual( + sitemap_record.update_flag, + original_update_flag, + "update_flag should have changed from False to True", + ) + self.assertNotEqual( + sitemap_record.filename_lastmoddate, + original_filename_lastmoddate, + "filename_lastmoddate should have been updated", + ) def test_process_sitemap_batch_file_rollover(self): """Test sitemap file rollover when MAX_RECORDS_PER_SITEMAP is exceeded""" - + # Create new records for rollover test - rollover_bibcodes = ['2023Rollover..1..1A', '2023Rollover..2..2A'] + rollover_bibcodes = ["2023Rollover..1..1A", "2023Rollover..2..2A"] for bibcode in rollover_bibcodes: - bib_data = {'title': f'Rollover Paper {bibcode}', 'year': 2023} - self.app.update_storage(bibcode, 'bib_data', bib_data) - + bib_data = {"title": f"Rollover Paper {bibcode}", "year": 2023} + self.app.update_storage(bibcode, "bib_data", bib_data) + # Set low limit to trigger rollover - original_max = self.app.conf.get('MAX_RECORDS_PER_SITEMAP', 50000) - self.app.conf['MAX_RECORDS_PER_SITEMAP'] = 1 # Very low limit - + original_max = self.app.conf.get("MAX_RECORDS_PER_SITEMAP", 50000) + self.app.conf["MAX_RECORDS_PER_SITEMAP"] = 1 # Very low limit + try: with self.app.session_scope() as session: initial_state = { - 'filename': 'sitemap_bib_3.xml', - 'count': 1, # At limit - 'index': 3 + "filename": "sitemap_bib_3.xml", + "count": 1, # At limit + "index": 3, } - + batch_stats, updated_state = self.app._process_sitemap_batch( - rollover_bibcodes, 'add', session, initial_state + rollover_bibcodes, "add", session, initial_state ) - + # Should roll over to next file (final state after processing both records) - self.assertEqual(updated_state['filename'], 'sitemap_bib_5.xml', - "Final filename should be sitemap_bib_5.xml after both rollovers") - self.assertEqual(updated_state['index'], 5, "Final index should be 5 after both rollovers") - 
self.assertEqual(updated_state['count'], 1, "Final count should be 1 (second record in sitemap_bib_5.xml)") - self.assertEqual(batch_stats['successful'], 2, "Both records should be processed successfully") - + self.assertEqual( + updated_state["filename"], + "sitemap_bib_5.xml", + "Final filename should be sitemap_bib_5.xml after both rollovers", + ) + self.assertEqual( + updated_state["index"], + 5, + "Final index should be 5 after both rollovers", + ) + self.assertEqual( + updated_state["count"], + 1, + "Final count should be 1 (second record in sitemap_bib_5.xml)", + ) + self.assertEqual( + batch_stats["successful"], + 2, + "Both records should be processed successfully", + ) + # Verify database was updated correctly - sitemap_records_db = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(rollover_bibcodes) - ).order_by(SitemapInfo.bibcode).all() - - self.assertEqual(len(sitemap_records_db), 2, "Should have 2 records in database") - + sitemap_records_db = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(rollover_bibcodes)) + .order_by(SitemapInfo.bibcode) + .all() + ) + + self.assertEqual( + len(sitemap_records_db), 2, "Should have 2 records in database" + ) + # Check first record (should be in sitemap_bib_4.xml after first rollover) first_record = sitemap_records_db[0] # 2023Rollover..1..1A - self.assertEqual(first_record.bibcode, '2023Rollover..1..1A', "First record bibcode should match") - self.assertEqual(first_record.sitemap_filename, 'sitemap_bib_4.xml', "First record should be in sitemap_bib_4.xml") - self.assertTrue(first_record.update_flag, "First record should have update_flag=True") - self.assertIsNone(first_record.filename_lastmoddate, "First record should have filename_lastmoddate=None (new record)") - + self.assertEqual( + first_record.bibcode, + "2023Rollover..1..1A", + "First record bibcode should match", + ) + self.assertEqual( + first_record.sitemap_filename, + "sitemap_bib_4.xml", + "First record should be in 
sitemap_bib_4.xml", + ) + self.assertTrue( + first_record.update_flag, + "First record should have update_flag=True", + ) + self.assertIsNone( + first_record.filename_lastmoddate, + "First record should have filename_lastmoddate=None (new record)", + ) + # Check second record (should be in sitemap_bib_5.xml after second rollover) second_record = sitemap_records_db[1] # 2023Rollover..2..2A - self.assertEqual(second_record.bibcode, '2023Rollover..2..2A', "Second record bibcode should match") - self.assertEqual(second_record.sitemap_filename, 'sitemap_bib_5.xml', "Second record should be in sitemap_bib_5.xml") - self.assertTrue(second_record.update_flag, "Second record should have update_flag=True") - self.assertIsNone(second_record.filename_lastmoddate, "Second record should have filename_lastmoddate=None (new record)") - + self.assertEqual( + second_record.bibcode, + "2023Rollover..2..2A", + "Second record bibcode should match", + ) + self.assertEqual( + second_record.sitemap_filename, + "sitemap_bib_5.xml", + "Second record should be in sitemap_bib_5.xml", + ) + self.assertTrue( + second_record.update_flag, + "Second record should have update_flag=True", + ) + self.assertIsNone( + second_record.filename_lastmoddate, + "Second record should have filename_lastmoddate=None (new record)", + ) + # Verify both records have valid record_id links - self.assertIsNotNone(first_record.record_id, "First record should have valid record_id") - self.assertIsNotNone(second_record.record_id, "Second record should have valid record_id") - + self.assertIsNotNone( + first_record.record_id, "First record should have valid record_id" + ) + self.assertIsNotNone( + second_record.record_id, "Second record should have valid record_id" + ) + # Verify the Records table entries exist - records_db = session.query(Records).filter(Records.bibcode.in_(rollover_bibcodes)).all() - self.assertEqual(len(records_db), 2, "Should have 2 records in Records table") - + records_db = ( + 
session.query(Records) + .filter(Records.bibcode.in_(rollover_bibcodes)) + .all() + ) + self.assertEqual( + len(records_db), 2, "Should have 2 records in Records table" + ) + # Verify record_id relationships are correct record_ids = {r.bibcode: r.id for r in records_db} - self.assertEqual(first_record.record_id, record_ids['2023Rollover..1..1A'], "First sitemap record_id should match Records table") - self.assertEqual(second_record.record_id, record_ids['2023Rollover..2..2A'], "Second sitemap record_id should match Records table") - + self.assertEqual( + first_record.record_id, + record_ids["2023Rollover..1..1A"], + "First sitemap record_id should match Records table", + ) + self.assertEqual( + second_record.record_id, + record_ids["2023Rollover..2..2A"], + "Second sitemap record_id should match Records table", + ) + finally: # Restore original limit - self.app.conf['MAX_RECORDS_PER_SITEMAP'] = original_max + self.app.conf["MAX_RECORDS_PER_SITEMAP"] = original_max def test_process_sitemap_batch_error_handling(self): """Test error handling for non-existent records and exceptions""" - + # Test 1: Non-existent record - non_existent_bibcode = '2023Missing..1..1A' - + non_existent_bibcode = "2023Missing..1..1A" + with self.app.session_scope() as session: - initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 0, 'index': 1} - + initial_state = {"filename": "sitemap_bib_1.xml", "count": 0, "index": 1} + batch_stats, updated_state = self.app._process_sitemap_batch( - [non_existent_bibcode], 'add', session, initial_state + [non_existent_bibcode], "add", session, initial_state ) - - self.assertEqual(batch_stats['successful'], 0, "Non-existent record should not be processed") - self.assertEqual(batch_stats['failed'], 1, "Non-existent record should be counted as failed") - self.assertEqual(updated_state, initial_state, "State should not change for failed records") - + + self.assertEqual( + batch_stats["successful"], + 0, + "Non-existent record should not be processed", + 
) + self.assertEqual( + batch_stats["failed"], + 1, + "Non-existent record should be counted as failed", + ) + self.assertEqual( + updated_state, + initial_state, + "State should not change for failed records", + ) + # Test 2: Exception during processing - problematic_bibcode = '2023Problem..1..1A' - bib_data = {'title': 'Problematic Paper'} - self.app.update_storage(problematic_bibcode, 'bib_data', bib_data) - + problematic_bibcode = "2023Problem..1..1A" + bib_data = {"title": "Problematic Paper"} + self.app.update_storage(problematic_bibcode, "bib_data", bib_data) + # Mock should_include_in_sitemap to raise an exception original_method = self.app.should_include_in_sitemap + def mock_should_include(record): - if record.get('bibcode') == problematic_bibcode: + if record.get("bibcode") == problematic_bibcode: raise Exception("Test exception") return original_method(record) - + self.app.should_include_in_sitemap = mock_should_include - + try: with self.app.session_scope() as session: batch_stats, updated_state = self.app._process_sitemap_batch( - [problematic_bibcode], 'add', session, initial_state + [problematic_bibcode], "add", session, initial_state + ) + + self.assertEqual( + batch_stats["successful"], + 0, + "Exception should result in 0 successful", ) - - self.assertEqual(batch_stats['successful'], 0, "Exception should result in 0 successful") - self.assertEqual(batch_stats['failed'], 1, "Exception should result in 1 failed") - + self.assertEqual( + batch_stats["failed"], 1, "Exception should result in 1 failed" + ) + finally: # Restore original method self.app.should_include_in_sitemap = original_method def test_process_sitemap_batch_empty_input(self): """Test handling of empty bibcode list""" - + with self.app.session_scope() as session: - initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 5, 'index': 1} - + initial_state = {"filename": "sitemap_bib_1.xml", "count": 5, "index": 1} + batch_stats, updated_state = self.app._process_sitemap_batch( - [], 
'add', session, initial_state + [], "add", session, initial_state + ) + + self.assertEqual( + batch_stats["successful"], 0, "Empty batch should have 0 successful" + ) + self.assertEqual( + batch_stats["failed"], 0, "Empty batch should have 0 failed" + ) + self.assertEqual( + len(batch_stats["sitemap_records"]), + 0, + "Empty batch should return empty records", + ) + self.assertEqual( + updated_state, initial_state, "Empty batch should not change state" ) - - self.assertEqual(batch_stats['successful'], 0, "Empty batch should have 0 successful") - self.assertEqual(batch_stats['failed'], 0, "Empty batch should have 0 failed") - self.assertEqual(len(batch_stats['sitemap_records']), 0, "Empty batch should return empty records") - self.assertEqual(updated_state, initial_state, "Empty batch should not change state") def test_process_sitemap_batch_integration(self): """Integration test combining multiple scenarios in realistic workflow""" - + # Create a mix of different record types (realistic scenario) test_data = [ - ('2023Integration..1..1A', 'success', 'new'), # New valid record - ('2023Integration..2..2A', 'success', 'existing'), # Existing valid record - ('2023Integration..3..3A', 'solr-failed', 'new'), # New but SOLR failed + ("2023Integration..1..1A", "success", "new"), # New valid record + ("2023Integration..2..2A", "success", "existing"), # Existing valid record + ("2023Integration..3..3A", "solr-failed", "new"), # New but SOLR failed ] - + # Setup records for bibcode, status, record_type in test_data: - bib_data = {'title': f'Integration Test {bibcode}', 'year': 2023} - self.app.update_storage(bibcode, 'bib_data', bib_data) - - if status != 'success': - self.app.mark_processed([bibcode], 'solr', checksums=[f'checksum_{bibcode}'], status=status) - + bib_data = {"title": f"Integration Test {bibcode}", "year": 2023} + self.app.update_storage(bibcode, "bib_data", bib_data) + + if status != "success": + self.app.mark_processed( + [bibcode], "solr", 
checksums=[f"checksum_{bibcode}"], status=status + ) + # Create existing sitemap entry for one record with self.app.session_scope() as session: - records = session.query(Records).filter(Records.bibcode.like('2023Integration%')).all() + records = ( + session.query(Records) + .filter(Records.bibcode.like("2023Integration%")) + .all() + ) record_map = {r.bibcode: r.id for r in records} - + existing_sitemap = SitemapInfo( - record_id=record_map['2023Integration..2..2A'], - bibcode='2023Integration..2..2A', - sitemap_filename='sitemap_bib_1.xml', + record_id=record_map["2023Integration..2..2A"], + bibcode="2023Integration..2..2A", + sitemap_filename="sitemap_bib_1.xml", filename_lastmoddate=adsputils.get_date() - timedelta(days=5), # Stale - update_flag=False + update_flag=False, ) session.add(existing_sitemap) session.commit() - + # Run the integration test test_bibcodes = [item[0] for item in test_data] - + with self.app.session_scope() as session: - initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 10, 'index': 1} - + initial_state = {"filename": "sitemap_bib_1.xml", "count": 10, "index": 1} + batch_stats, updated_state = self.app._process_sitemap_batch( - test_bibcodes, 'add', session, initial_state + test_bibcodes, "add", session, initial_state ) - + # Expected: 2 successful (1 new valid + 1 existing valid), 1 failed (solr-failed) - self.assertEqual(batch_stats['successful'], 2, "Should process 1 new + 1 existing valid record") - self.assertEqual(batch_stats['failed'], 1, "Should fail 1 solr-failed record") + self.assertEqual( + batch_stats["successful"], + 2, + "Should process 1 new + 1 existing valid record", + ) + self.assertEqual( + batch_stats["failed"], 1, "Should fail 1 solr-failed record" + ) # Only 1 new record should increment count - self.assertEqual(updated_state['count'], 11, "Only new record should increment count") - self.assertEqual(updated_state['filename'], 'sitemap_bib_1.xml', "Should stay in same file") - + self.assertEqual( + 
updated_state["count"], 11, "Only new record should increment count" + ) + self.assertEqual( + updated_state["filename"], + "sitemap_bib_1.xml", + "Should stay in same file", + ) + # Verify database state - sitemap_records_db = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.like('2023Integration%') - ).order_by(SitemapInfo.bibcode).all() - + sitemap_records_db = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.like("2023Integration%")) + .order_by(SitemapInfo.bibcode) + .all() + ) + # Should have 2 records in database (1 new + 1 existing, solr-failed was not added) - self.assertEqual(len(sitemap_records_db), 2, "Should have 2 sitemap records in database") - + self.assertEqual( + len(sitemap_records_db), 2, "Should have 2 sitemap records in database" + ) + # Check new record (2023Integration..1..1A) - new_record = next((r for r in sitemap_records_db if r.bibcode == '2023Integration..1..1A'), None) + new_record = next( + ( + r + for r in sitemap_records_db + if r.bibcode == "2023Integration..1..1A" + ), + None, + ) self.assertIsNotNone(new_record, "New record should exist in database") - self.assertEqual(new_record.sitemap_filename, 'sitemap_bib_1.xml', "New record should be in sitemap_bib_1.xml") - self.assertTrue(new_record.update_flag, "New record should have update_flag=True") - self.assertIsNone(new_record.filename_lastmoddate, "New record should have filename_lastmoddate=None") - self.assertIsNotNone(new_record.record_id, "New record should have valid record_id") - - # Check existing record (2023Integration..2..2A) - existing_record = next((r for r in sitemap_records_db if r.bibcode == '2023Integration..2..2A'), None) - self.assertIsNotNone(existing_record, "Existing record should still exist in database") - self.assertEqual(existing_record.sitemap_filename, 'sitemap_bib_1.xml', "Existing record should stay in sitemap_bib_1.xml") - self.assertTrue(existing_record.update_flag, "Existing record should have update_flag=True (was updated)") + 
self.assertEqual( + new_record.sitemap_filename, + "sitemap_bib_1.xml", + "New record should be in sitemap_bib_1.xml", + ) + self.assertTrue( + new_record.update_flag, "New record should have update_flag=True" + ) + self.assertIsNone( + new_record.filename_lastmoddate, + "New record should have filename_lastmoddate=None", + ) + self.assertIsNotNone( + new_record.record_id, "New record should have valid record_id" + ) + + # Check existing record (2023Integration..2..2A) + existing_record = next( + ( + r + for r in sitemap_records_db + if r.bibcode == "2023Integration..2..2A" + ), + None, + ) + self.assertIsNotNone( + existing_record, "Existing record should still exist in database" + ) + self.assertEqual( + existing_record.sitemap_filename, + "sitemap_bib_1.xml", + "Existing record should stay in sitemap_bib_1.xml", + ) + self.assertTrue( + existing_record.update_flag, + "Existing record should have update_flag=True (was updated)", + ) # filename_lastmoddate should be updated to bib_data_updated for existing record - self.assertIsNotNone(existing_record.filename_lastmoddate, "Existing record should have filename_lastmoddate updated") - self.assertIsNotNone(existing_record.record_id, "Existing record should have valid record_id") - + self.assertIsNotNone( + existing_record.filename_lastmoddate, + "Existing record should have filename_lastmoddate updated", + ) + self.assertIsNotNone( + existing_record.record_id, "Existing record should have valid record_id" + ) + # Verify solr-failed record is NOT in sitemap database - failed_record = next((r for r in sitemap_records_db if r.bibcode == '2023Integration..3..3A'), None) - self.assertIsNone(failed_record, "SOLR-failed record should NOT be in sitemap database") - + failed_record = next( + ( + r + for r in sitemap_records_db + if r.bibcode == "2023Integration..3..3A" + ), + None, + ) + self.assertIsNone( + failed_record, "SOLR-failed record should NOT be in sitemap database" + ) + # Verify Records table has all 3 records 
(including the failed one) - records_db = session.query(Records).filter(Records.bibcode.like('2023Integration%')).all() - self.assertEqual(len(records_db), 3, "Should have 3 records in Records table (including failed one)") - + records_db = ( + session.query(Records) + .filter(Records.bibcode.like("2023Integration%")) + .all() + ) + self.assertEqual( + len(records_db), + 3, + "Should have 3 records in Records table (including failed one)", + ) + # Verify record_id relationships are correct record_ids = {r.bibcode: r.id for r in records_db} - self.assertEqual(new_record.record_id, record_ids['2023Integration..1..1A'], "New record record_id should match") - self.assertEqual(existing_record.record_id, record_ids['2023Integration..2..2A'], "Existing record record_id should match") - - + self.assertEqual( + new_record.record_id, + record_ids["2023Integration..1..1A"], + "New record record_id should match", + ) + self.assertEqual( + existing_record.record_id, + record_ids["2023Integration..2..2A"], + "Existing record record_id should match", + ) def test_bulk_insert_and_update_operations(self): """Test both bulk_insert_sitemap_records and bulk_update_sitemap_records in single batch""" - + # Create test data - mix of new records and records that will need updates test_bibcodes = [ - '2023BulkOps..1..1A', # Will be new (insert) - '2023BulkOps..2..2A', # Will be new (insert) - '2023BulkOps..3..3A', # Will be existing (update) - '2023BulkOps..4..4A', # Will be existing (update) + "2023BulkOps..1..1A", # Will be new (insert) + "2023BulkOps..2..2A", # Will be new (insert) + "2023BulkOps..3..3A", # Will be existing (update) + "2023BulkOps..4..4A", # Will be existing (update) ] - + # Create Records entries for bibcode in test_bibcodes: - bib_data = {'title': f'Bulk Operations Test {bibcode}', 'year': 2023} - self.app.update_storage(bibcode, 'bib_data', bib_data) - + bib_data = {"title": f"Bulk Operations Test {bibcode}", "year": 2023} + self.app.update_storage(bibcode, 
"bib_data", bib_data) + # Create existing SitemapInfo entries for records 3 and 4 (these will be updates) with self.app.session_scope() as session: - records = session.query(Records).filter(Records.bibcode.like('2023BulkOps%')).all() + records = ( + session.query(Records) + .filter(Records.bibcode.like("2023BulkOps%")) + .all() + ) record_map = {r.bibcode: r.id for r in records} - + existing_entries = [ SitemapInfo( - record_id=record_map['2023BulkOps..3..3A'], - bibcode='2023BulkOps..3..3A', - sitemap_filename='sitemap_bib_1.xml', - filename_lastmoddate=adsputils.get_date() - timedelta(days=5), # Stale - update_flag=False + record_id=record_map["2023BulkOps..3..3A"], + bibcode="2023BulkOps..3..3A", + sitemap_filename="sitemap_bib_1.xml", + filename_lastmoddate=adsputils.get_date() + - timedelta(days=5), # Stale + update_flag=False, ), SitemapInfo( - record_id=record_map['2023BulkOps..4..4A'], - bibcode='2023BulkOps..4..4A', - sitemap_filename='sitemap_bib_1.xml', - filename_lastmoddate=adsputils.get_date() - timedelta(days=3), # Stale - update_flag=False - ) + record_id=record_map["2023BulkOps..4..4A"], + bibcode="2023BulkOps..4..4A", + sitemap_filename="sitemap_bib_1.xml", + filename_lastmoddate=adsputils.get_date() + - timedelta(days=3), # Stale + update_flag=False, + ), ] - + for entry in existing_entries: session.add(entry) session.commit() - + # Mock the bulk operations to verify they're called correctly - with patch.object(self.app, 'bulk_insert_sitemap_records') as mock_insert, \ - patch.object(self.app, 'bulk_update_sitemap_records') as mock_update: - + with patch.object( + self.app, "bulk_insert_sitemap_records" + ) as mock_insert, patch.object( + self.app, "bulk_update_sitemap_records" + ) as mock_update: # Run the batch processing with self.app.session_scope() as session: - initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 10, 'index': 1} - + initial_state = { + "filename": "sitemap_bib_1.xml", + "count": 10, + "index": 1, + } + batch_stats, 
updated_state = self.app._process_sitemap_batch( - test_bibcodes, 'add', session, initial_state + test_bibcodes, "add", session, initial_state ) - + # Verify results - self.assertEqual(batch_stats['successful'], 4, "Should process all 4 records successfully") - self.assertEqual(batch_stats['failed'], 0, "Should have no failures") - self.assertEqual(updated_state['count'], 12, "Should increment count by 2 (new records only)") - + self.assertEqual( + batch_stats["successful"], + 4, + "Should process all 4 records successfully", + ) + self.assertEqual(batch_stats["failed"], 0, "Should have no failures") + self.assertEqual( + updated_state["count"], + 12, + "Should increment count by 2 (new records only)", + ) + # Verify bulk_insert_sitemap_records was called with new records - self.assertTrue(mock_insert.called, "bulk_insert_sitemap_records should be called") + self.assertTrue( + mock_insert.called, "bulk_insert_sitemap_records should be called" + ) insert_call_args = mock_insert.call_args[0] insert_records = insert_call_args[0] # First argument: new_records list insert_session = insert_call_args[1] # Second argument: session - + # Should have 2 new records (records 1 and 2) self.assertEqual(len(insert_records), 2, "Should insert 2 new records") - insert_bibcodes = {r['bibcode'] for r in insert_records} - expected_new = {'2023BulkOps..1..1A', '2023BulkOps..2..2A'} - self.assertEqual(insert_bibcodes, expected_new, "Should insert correct new records") - + insert_bibcodes = {r["bibcode"] for r in insert_records} + expected_new = {"2023BulkOps..1..1A", "2023BulkOps..2..2A"} + self.assertEqual( + insert_bibcodes, expected_new, "Should insert correct new records" + ) + # Verify session parameter - self.assertIs(insert_session, session, "Should pass correct session to bulk_insert") - + self.assertIs( + insert_session, + session, + "Should pass correct session to bulk_insert", + ) + # Verify bulk_update_sitemap_records was called with existing records - 
self.assertTrue(mock_update.called, "bulk_update_sitemap_records should be called") + self.assertTrue( + mock_update.called, "bulk_update_sitemap_records should be called" + ) update_call_args = mock_update.call_args[0] - update_records = update_call_args[0] # First argument: update_records list + update_records = update_call_args[ + 0 + ] # First argument: update_records list update_session = update_call_args[1] # Second argument: session - + # Should have 2 update records (records 3 and 4) - self.assertEqual(len(update_records), 2, "Should update 2 existing records") - update_bibcodes = {r[0]['bibcode'] for r in update_records} # r[0] is sitemap_record - expected_updates = {'2023BulkOps..3..3A', '2023BulkOps..4..4A'} - self.assertEqual(update_bibcodes, expected_updates, "Should update correct existing records") - + self.assertEqual( + len(update_records), 2, "Should update 2 existing records" + ) + update_bibcodes = { + r[0]["bibcode"] for r in update_records + } # r[0] is sitemap_record + expected_updates = {"2023BulkOps..3..3A", "2023BulkOps..4..4A"} + self.assertEqual( + update_bibcodes, + expected_updates, + "Should update correct existing records", + ) + # Verify session parameter - self.assertIs(update_session, session, "Should pass correct session to bulk_update") - + self.assertIs( + update_session, + session, + "Should pass correct session to bulk_update", + ) + # Verify update records have correct properties for sitemap_record, sitemap_info in update_records: # Unpack tuple - self.assertTrue(sitemap_record['update_flag'], f"Update record {sitemap_record['bibcode']} should have update_flag=True") - self.assertIsNotNone(sitemap_record['filename_lastmoddate'], f"Update record {sitemap_record['bibcode']} should have filename_lastmoddate updated") - + self.assertTrue( + sitemap_record["update_flag"], + f"Update record {sitemap_record['bibcode']} should have update_flag=True", + ) + self.assertIsNotNone( + sitemap_record["filename_lastmoddate"], + f"Update 
record {sitemap_record['bibcode']} should have filename_lastmoddate updated", + ) + # Verify insert records have correct properties for record in insert_records: - self.assertTrue(record['update_flag'], f"Insert record {record['bibcode']} should have update_flag=True") - self.assertIsNone(record['filename_lastmoddate'], f"Insert record {record['bibcode']} should have filename_lastmoddate=None") - self.assertEqual(record['sitemap_filename'], 'sitemap_bib_1.xml', f"Insert record {record['bibcode']} should be in correct file") - - + self.assertTrue( + record["update_flag"], + f"Insert record {record['bibcode']} should have update_flag=True", + ) + self.assertIsNone( + record["filename_lastmoddate"], + f"Insert record {record['bibcode']} should have filename_lastmoddate=None", + ) + self.assertEqual( + record["sitemap_filename"], + "sitemap_bib_1.xml", + f"Insert record {record['bibcode']} should be in correct file", + ) def test_bulk_operations_error_handling(self): """Test error handling in bulk database operations during _process_sitemap_batch""" - + # Create test data - test_bibcodes = ['2023BulkError..1..1A', '2023BulkError..2..2A'] - + test_bibcodes = ["2023BulkError..1..1A", "2023BulkError..2..2A"] + for bibcode in test_bibcodes: - bib_data = {'title': f'Bulk Error Test {bibcode}', 'year': 2023} - self.app.update_storage(bibcode, 'bib_data', bib_data) - + bib_data = {"title": f"Bulk Error Test {bibcode}", "year": 2023} + self.app.update_storage(bibcode, "bib_data", bib_data) + # Mock bulk_insert to raise an exception - with patch.object(self.app, 'bulk_insert_sitemap_records', side_effect=Exception("Database insert failed")): - + with patch.object( + self.app, + "bulk_insert_sitemap_records", + side_effect=Exception("Database insert failed"), + ): with self.app.session_scope() as session: - initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 10, 'index': 1} - + initial_state = { + "filename": "sitemap_bib_1.xml", + "count": 10, + "index": 1, + } + # 
Should raise the exception from bulk operations with self.assertRaises(Exception) as context: - self.app._process_sitemap_batch(test_bibcodes, 'add', session, initial_state) - + self.app._process_sitemap_batch( + test_bibcodes, "add", session, initial_state + ) + self.assertIn("Database insert failed", str(context.exception)) def test_bulk_operations_empty_scenarios(self): """Test bulk operations when there are no records to insert or update""" - + # Create test records that will all be filtered out by SOLR status - test_bibcodes = ['2023BulkEmpty..1..1A', '2023BulkEmpty..2..2A'] - + test_bibcodes = ["2023BulkEmpty..1..1A", "2023BulkEmpty..2..2A"] + for bibcode in test_bibcodes: - bib_data = {'title': f'Bulk Empty Test {bibcode}', 'year': 2023} - self.app.update_storage(bibcode, 'bib_data', bib_data) + bib_data = {"title": f"Bulk Empty Test {bibcode}", "year": 2023} + self.app.update_storage(bibcode, "bib_data", bib_data) # Mark as solr-failed so they get filtered out - self.app.mark_processed([bibcode], 'solr', checksums=[f'checksum_{bibcode}'], status='solr-failed') - + self.app.mark_processed( + [bibcode], + "solr", + checksums=[f"checksum_{bibcode}"], + status="solr-failed", + ) + # Mock the bulk operations to verify they're not called - with patch.object(self.app, 'bulk_insert_sitemap_records') as mock_insert, \ - patch.object(self.app, 'bulk_update_sitemap_records') as mock_update: - + with patch.object( + self.app, "bulk_insert_sitemap_records" + ) as mock_insert, patch.object( + self.app, "bulk_update_sitemap_records" + ) as mock_update: with self.app.session_scope() as session: - initial_state = {'filename': 'sitemap_bib_1.xml', 'count': 10, 'index': 1} - + initial_state = { + "filename": "sitemap_bib_1.xml", + "count": 10, + "index": 1, + } + batch_stats, updated_state = self.app._process_sitemap_batch( - test_bibcodes, 'add', session, initial_state + test_bibcodes, "add", session, initial_state ) - + # Verify results - 
self.assertEqual(batch_stats['successful'], 0, "Should have no successful records") - self.assertEqual(batch_stats['failed'], 2, "Should have 2 failed records (filtered out)") - self.assertEqual(updated_state['count'], 10, "Count should not change") - + self.assertEqual( + batch_stats["successful"], 0, "Should have no successful records" + ) + self.assertEqual( + batch_stats["failed"], + 2, + "Should have 2 failed records (filtered out)", + ) + self.assertEqual(updated_state["count"], 10, "Count should not change") + # Verify bulk operations were not called (no valid records to process) - self.assertFalse(mock_insert.called, "bulk_insert_sitemap_records should not be called") - self.assertFalse(mock_update.called, "bulk_update_sitemap_records should not be called") - + self.assertFalse( + mock_insert.called, + "bulk_insert_sitemap_records should not be called", + ) + self.assertFalse( + mock_update.called, + "bulk_update_sitemap_records should not be called", + ) def test_bulk_update_sitemap_records(self): """Test bulk_update_sitemap_records method with performance timing""" - + # Create test records test_bibcodes = [] for i in range(100): - bibcode = f'2023BulkUpdate..{i:04d}..{i:04d}A' + bibcode = f"2023BulkUpdate..{i:04d}..{i:04d}A" test_bibcodes.append(bibcode) - bib_data = {'title': f'Bulk Update Test {i}', 'year': 2023} - self.app.update_storage(bibcode, 'bib_data', bib_data) - + bib_data = {"title": f"Bulk Update Test {i}", "year": 2023} + self.app.update_storage(bibcode, "bib_data", bib_data) + # Create initial sitemap entries with self.app.session_scope() as session: - records = session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all() + records = ( + session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all() + ) record_map = {r.bibcode: r.id for r in records} - + for i, bibcode in enumerate(test_bibcodes): sitemap_info = SitemapInfo( record_id=record_map[bibcode], bibcode=bibcode, - sitemap_filename=f'sitemap_bib_{(i // 50) + 
1}.xml', # 50 per file - filename_lastmoddate=adsputils.get_date() - timedelta(hours=i), # Different timestamps - update_flag=False + sitemap_filename=f"sitemap_bib_{(i // 50) + 1}.xml", # 50 per file + filename_lastmoddate=adsputils.get_date() + - timedelta(hours=i), # Different timestamps + update_flag=False, ) session.add(sitemap_info) session.commit() - + # Prepare update records (tuples of sitemap_record, sitemap_info) update_records = [] new_timestamp = adsputils.get_date() - + with self.app.session_scope() as session: - sitemap_infos = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(test_bibcodes) - ).all() - + sitemap_infos = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(test_bibcodes)) + .all() + ) + for sitemap_info in sitemap_infos: # Create sitemap_record dict with updated values sitemap_record = { - 'bibcode': sitemap_info.bibcode, - 'bib_data_updated': new_timestamp, - 'sitemap_filename': sitemap_info.sitemap_filename, - 'filename_lastmoddate': new_timestamp, # Updated timestamp - 'update_flag': True # Mark for regeneration + "bibcode": sitemap_info.bibcode, + "bib_data_updated": new_timestamp, + "sitemap_filename": sitemap_info.sitemap_filename, + "filename_lastmoddate": new_timestamp, # Updated timestamp + "update_flag": True, # Mark for regeneration } - + # Create sitemap_info dict with id for bulk update sitemap_info_dict = { - 'id': sitemap_info.id, - 'bibcode': sitemap_info.bibcode, - 'sitemap_filename': sitemap_info.sitemap_filename + "id": sitemap_info.id, + "bibcode": sitemap_info.bibcode, + "sitemap_filename": sitemap_info.sitemap_filename, } - + update_records.append((sitemap_record, sitemap_info_dict)) - + # Test bulk update with performance timing with self.app.session_scope() as session: start_time = adsputils.get_date() - + self.app.bulk_update_sitemap_records(update_records, session) session.commit() - + end_time = adsputils.get_date() update_time = (end_time - start_time).total_seconds() - + # 
Performance assertion - self.assertLess(update_time, 5.0, f"Bulk update took {update_time:.3f}s, should be under 5s") - - print(f"bulk_update_sitemap_records performance: 100 records updated in {update_time:.3f}s") - + self.assertLess( + update_time, + 5.0, + f"Bulk update took {update_time:.3f}s, should be under 5s", + ) + + print( + f"bulk_update_sitemap_records performance: 100 records updated in {update_time:.3f}s" + ) + # Verify all records were updated correctly with self.app.session_scope() as session: - updated_records = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(test_bibcodes) - ).all() - - self.assertEqual(len(updated_records), 100, "All 100 records should still exist") - + updated_records = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(test_bibcodes)) + .all() + ) + + self.assertEqual( + len(updated_records), 100, "All 100 records should still exist" + ) + for record in updated_records: # Verify update_flag was set to True - self.assertTrue(record.update_flag, f"Record {record.bibcode} should have update_flag=True") - + self.assertTrue( + record.update_flag, + f"Record {record.bibcode} should have update_flag=True", + ) + # Verify filename_lastmoddate was updated (should be close to new_timestamp) - time_diff = abs((record.filename_lastmoddate - new_timestamp).total_seconds()) - self.assertLess(time_diff, 60, f"Record {record.bibcode} filename_lastmoddate should be updated") - + time_diff = abs( + (record.filename_lastmoddate - new_timestamp).total_seconds() + ) + self.assertLess( + time_diff, + 60, + f"Record {record.bibcode} filename_lastmoddate should be updated", + ) + # Verify bib_data_updated was updated - bib_time_diff = abs((record.bib_data_updated - new_timestamp).total_seconds()) - self.assertLess(bib_time_diff, 60, f"Record {record.bibcode} bib_data_updated should be updated") - + bib_time_diff = abs( + (record.bib_data_updated - new_timestamp).total_seconds() + ) + self.assertLess( + bib_time_diff, + 60, + 
f"Record {record.bibcode} bib_data_updated should be updated", + ) + # Verify sitemap_filename remains unchanged - expected_filename = f'sitemap_bib_{(test_bibcodes.index(record.bibcode) // 50) + 1}.xml' - self.assertEqual(record.sitemap_filename, expected_filename, - f"Record {record.bibcode} sitemap_filename should remain unchanged") - + expected_filename = ( + f"sitemap_bib_{(test_bibcodes.index(record.bibcode) // 50) + 1}.xml" + ) + self.assertEqual( + record.sitemap_filename, + expected_filename, + f"Record {record.bibcode} sitemap_filename should remain unchanged", + ) + # Test edge cases - + # Test 1: Empty update_records list with self.app.session_scope() as session: # Should not raise an exception self.app.bulk_update_sitemap_records([], session) - + # Test 2: Single record update single_update = [(update_records[0][0], update_records[0][1])] # First record - + with self.app.session_scope() as session: # Change update_flag back to False for testing session.query(SitemapInfo).filter( SitemapInfo.bibcode == test_bibcodes[0] - ).update({'update_flag': False}, synchronize_session=False) + ).update({"update_flag": False}, synchronize_session=False) session.commit() - + # Update with new values - single_update[0][0]['update_flag'] = True - single_update[0][0]['filename_lastmoddate'] = adsputils.get_date() - + single_update[0][0]["update_flag"] = True + single_update[0][0]["filename_lastmoddate"] = adsputils.get_date() + self.app.bulk_update_sitemap_records(single_update, session) session.commit() - + # Verify single record was updated - updated_record = session.query(SitemapInfo).filter( - SitemapInfo.bibcode == test_bibcodes[0] - ).first() - - self.assertTrue(updated_record.update_flag, "Single record should have update_flag=True") - + updated_record = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode == test_bibcodes[0]) + .first() + ) + + self.assertTrue( + updated_record.update_flag, "Single record should have update_flag=True" + ) + # Test 3: 
Partial field updates (only some fields provided in sitemap_record) with self.app.session_scope() as session: # Get the record ID within the active session - second_record = session.query(SitemapInfo).filter( - SitemapInfo.bibcode == test_bibcodes[1] - ).first() - + second_record = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode == test_bibcodes[1]) + .first() + ) + partial_update_record = { - 'bibcode': test_bibcodes[1], - 'update_flag': False # Only updating this field - } - - partial_sitemap_info = { - 'id': second_record.id, - 'bibcode': test_bibcodes[1] + "bibcode": test_bibcodes[1], + "update_flag": False, # Only updating this field } - + + partial_sitemap_info = {"id": second_record.id, "bibcode": test_bibcodes[1]} + partial_updates = [(partial_update_record, partial_sitemap_info)] - + self.app.bulk_update_sitemap_records(partial_updates, session) session.commit() - + # Verify only update_flag was changed - partially_updated = session.query(SitemapInfo).filter( - SitemapInfo.bibcode == test_bibcodes[1] - ).first() - - self.assertFalse(partially_updated.update_flag, "Partial update should set update_flag=False") + partially_updated = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode == test_bibcodes[1]) + .first() + ) + + self.assertFalse( + partially_updated.update_flag, + "Partial update should set update_flag=False", + ) # Other fields should remain as they were (not None) - self.assertIsNotNone(partially_updated.sitemap_filename, "sitemap_filename should not be cleared") - self.assertIsNotNone(partially_updated.filename_lastmoddate, "filename_lastmoddate should not be cleared") - + self.assertIsNotNone( + partially_updated.sitemap_filename, + "sitemap_filename should not be cleared", + ) + self.assertIsNotNone( + partially_updated.filename_lastmoddate, + "filename_lastmoddate should not be cleared", + ) def test_bulk_insert_sitemap_records(self): """Test bulk_insert_sitemap_records method with performance timing""" - + # Create 
test records in Records table first test_bibcodes = [] for i in range(200): - bibcode = f'2023BulkInsert..{i:04d}..{i:04d}A' + bibcode = f"2023BulkInsert..{i:04d}..{i:04d}A" test_bibcodes.append(bibcode) - bib_data = {'title': f'Bulk Insert Test {i}', 'year': 2023} - self.app.update_storage(bibcode, 'bib_data', bib_data) - + bib_data = {"title": f"Bulk Insert Test {i}", "year": 2023} + self.app.update_storage(bibcode, "bib_data", bib_data) + # Get record IDs for foreign key relationships with self.app.session_scope() as session: - records = session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all() + records = ( + session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all() + ) record_map = {r.bibcode: r.id for r in records} - + # Prepare sitemap records for bulk insert sitemap_records = [] base_timestamp = adsputils.get_date() - + for i, bibcode in enumerate(test_bibcodes): sitemap_record = { - 'record_id': record_map[bibcode], - 'bibcode': bibcode, - 'sitemap_filename': f'sitemap_bib_{(i // 100) + 1}.xml', # 100 per file - 'bib_data_updated': base_timestamp - timedelta(minutes=i), # Different timestamps - 'filename_lastmoddate': None, # New records start with None - 'update_flag': True # New records need file generation + "record_id": record_map[bibcode], + "bibcode": bibcode, + "sitemap_filename": f"sitemap_bib_{(i // 100) + 1}.xml", # 100 per file + "bib_data_updated": base_timestamp + - timedelta(minutes=i), # Different timestamps + "filename_lastmoddate": None, # New records start with None + "update_flag": True, # New records need file generation } sitemap_records.append(sitemap_record) - + # Test bulk insert with performance timing with self.app.session_scope() as session: # Verify no sitemap records exist initially - initial_count = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(test_bibcodes) - ).count() + initial_count = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(test_bibcodes)) + .count() + ) 
self.assertEqual(initial_count, 0, "Should start with no sitemap records") - + start_time = adsputils.get_date() - + self.app.bulk_insert_sitemap_records(sitemap_records, session) session.commit() - + end_time = adsputils.get_date() insert_time = (end_time - start_time).total_seconds() - + # Performance assertion - self.assertLess(insert_time, 5.0, f"Bulk insert took {insert_time:.3f}s, should be under 5s") - - print(f"bulk_insert_sitemap_records performance: 200 records inserted in {insert_time:.3f}s") - + self.assertLess( + insert_time, + 5.0, + f"Bulk insert took {insert_time:.3f}s, should be under 5s", + ) + + print( + f"bulk_insert_sitemap_records performance: 200 records inserted in {insert_time:.3f}s" + ) + # Verify all records were inserted correctly with self.app.session_scope() as session: - inserted_records = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(test_bibcodes) - ).order_by(SitemapInfo.bibcode).all() - - self.assertEqual(len(inserted_records), 200, "All 200 records should be inserted") - + inserted_records = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(test_bibcodes)) + .order_by(SitemapInfo.bibcode) + .all() + ) + + self.assertEqual( + len(inserted_records), 200, "All 200 records should be inserted" + ) + # Verify record distribution across files file_counts = {} for record in inserted_records: filename = record.sitemap_filename file_counts[filename] = file_counts.get(filename, 0) + 1 - + # Should have 2 files with 100 records each self.assertEqual(len(file_counts), 2, "Should have exactly 2 sitemap files") - self.assertEqual(file_counts.get('sitemap_bib_1.xml', 0), 100, "First file should have 100 records") - self.assertEqual(file_counts.get('sitemap_bib_2.xml', 0), 100, "Second file should have 100 records") - + self.assertEqual( + file_counts.get("sitemap_bib_1.xml", 0), + 100, + "First file should have 100 records", + ) + self.assertEqual( + file_counts.get("sitemap_bib_2.xml", 0), + 100, + "Second file 
should have 100 records", + ) + # Verify individual record properties for i, record in enumerate(inserted_records): expected_bibcode = test_bibcodes[i] - self.assertEqual(record.bibcode, expected_bibcode, f"Record {i} bibcode should match") - + self.assertEqual( + record.bibcode, expected_bibcode, f"Record {i} bibcode should match" + ) + # Verify foreign key relationship - self.assertEqual(record.record_id, record_map[expected_bibcode], - f"Record {expected_bibcode} should have correct record_id") - + self.assertEqual( + record.record_id, + record_map[expected_bibcode], + f"Record {expected_bibcode} should have correct record_id", + ) + # Verify initial values for new records - self.assertTrue(record.update_flag, f"Record {expected_bibcode} should have update_flag=True") - self.assertIsNone(record.filename_lastmoddate, f"Record {expected_bibcode} should have filename_lastmoddate=None") - + self.assertTrue( + record.update_flag, + f"Record {expected_bibcode} should have update_flag=True", + ) + self.assertIsNone( + record.filename_lastmoddate, + f"Record {expected_bibcode} should have filename_lastmoddate=None", + ) + # Verify sitemap filename assignment - expected_filename = f'sitemap_bib_{(i // 100) + 1}.xml' - self.assertEqual(record.sitemap_filename, expected_filename, - f"Record {expected_bibcode} should be in {expected_filename}") - + expected_filename = f"sitemap_bib_{(i // 100) + 1}.xml" + self.assertEqual( + record.sitemap_filename, + expected_filename, + f"Record {expected_bibcode} should be in {expected_filename}", + ) + # Verify timestamp was set - self.assertIsNotNone(record.bib_data_updated, f"Record {expected_bibcode} should have bib_data_updated") - + self.assertIsNotNone( + record.bib_data_updated, + f"Record {expected_bibcode} should have bib_data_updated", + ) + # Verify timestamp precision (should be within expected range) expected_time = base_timestamp - timedelta(minutes=i) - time_diff = abs((record.bib_data_updated - 
expected_time).total_seconds()) - self.assertLess(time_diff, 60, f"Record {expected_bibcode} timestamp should be accurate") - + time_diff = abs( + (record.bib_data_updated - expected_time).total_seconds() + ) + self.assertLess( + time_diff, + 60, + f"Record {expected_bibcode} timestamp should be accurate", + ) + # Test edge cases - + # Test 1: Empty batch_stats['sitemap_records'] list with self.app.session_scope() as session: # Should not raise an exception self.app.bulk_insert_sitemap_records([], session) session.commit() - + # Test 2: Single record insert - single_bibcode = '2023SingleInsert..1..1A' - single_bib_data = {'title': 'Single Insert Test', 'year': 2023} - self.app.update_storage(single_bibcode, 'bib_data', single_bib_data) - + single_bibcode = "2023SingleInsert..1..1A" + single_bib_data = {"title": "Single Insert Test", "year": 2023} + self.app.update_storage(single_bibcode, "bib_data", single_bib_data) + with self.app.session_scope() as session: - single_record = session.query(Records).filter(Records.bibcode == single_bibcode).first() - + single_record = ( + session.query(Records).filter(Records.bibcode == single_bibcode).first() + ) + single_sitemap_record = { - 'record_id': single_record.id, - 'bibcode': single_bibcode, - 'sitemap_filename': 'sitemap_bib_single.xml', - 'bib_data_updated': adsputils.get_date(), - 'filename_lastmoddate': None, - 'update_flag': True + "record_id": single_record.id, + "bibcode": single_bibcode, + "sitemap_filename": "sitemap_bib_single.xml", + "bib_data_updated": adsputils.get_date(), + "filename_lastmoddate": None, + "update_flag": True, } - + self.app.bulk_insert_sitemap_records([single_sitemap_record], session) session.commit() - + # Verify single record was inserted - inserted_single = session.query(SitemapInfo).filter( - SitemapInfo.bibcode == single_bibcode - ).first() - + inserted_single = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode == single_bibcode) + .first() + ) + 
self.assertIsNotNone(inserted_single, "Single record should be inserted") - self.assertEqual(inserted_single.bibcode, single_bibcode, "Single record bibcode should match") - self.assertEqual(inserted_single.sitemap_filename, 'sitemap_bib_single.xml', "Single record filename should match") - self.assertTrue(inserted_single.update_flag, "Single record should have update_flag=True") - + self.assertEqual( + inserted_single.bibcode, + single_bibcode, + "Single record bibcode should match", + ) + self.assertEqual( + inserted_single.sitemap_filename, + "sitemap_bib_single.xml", + "Single record filename should match", + ) + self.assertTrue( + inserted_single.update_flag, + "Single record should have update_flag=True", + ) + # Test 3: Minimal required fields (test with only required fields) - minimal_bibcode = '2023MinimalInsert..1..1A' - minimal_bib_data = {'title': 'Minimal Insert Test', 'year': 2023} - self.app.update_storage(minimal_bibcode, 'bib_data', minimal_bib_data) - + minimal_bibcode = "2023MinimalInsert..1..1A" + minimal_bib_data = {"title": "Minimal Insert Test", "year": 2023} + self.app.update_storage(minimal_bibcode, "bib_data", minimal_bib_data) + with self.app.session_scope() as session: - minimal_record = session.query(Records).filter(Records.bibcode == minimal_bibcode).first() - + minimal_record = ( + session.query(Records) + .filter(Records.bibcode == minimal_bibcode) + .first() + ) + minimal_sitemap_record = { - 'record_id': minimal_record.id, - 'bibcode': minimal_bibcode, + "record_id": minimal_record.id, + "bibcode": minimal_bibcode, # Only required fields, test defaults } - + self.app.bulk_insert_sitemap_records([minimal_sitemap_record], session) session.commit() - + # Verify minimal record was inserted with defaults - inserted_minimal = session.query(SitemapInfo).filter( - SitemapInfo.bibcode == minimal_bibcode - ).first() - + inserted_minimal = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode == minimal_bibcode) + .first() + ) + 
self.assertIsNotNone(inserted_minimal, "Minimal record should be inserted") - self.assertEqual(inserted_minimal.bibcode, minimal_bibcode, "Minimal record bibcode should match") - self.assertEqual(inserted_minimal.record_id, minimal_record.id, "Minimal record should have correct record_id") + self.assertEqual( + inserted_minimal.bibcode, + minimal_bibcode, + "Minimal record bibcode should match", + ) + self.assertEqual( + inserted_minimal.record_id, + minimal_record.id, + "Minimal record should have correct record_id", + ) # Other fields should have their database defaults - self.assertIsNone(inserted_minimal.sitemap_filename, "Minimal record should have default sitemap_filename") - self.assertFalse(inserted_minimal.update_flag, "Minimal record should have default update_flag=False") - + self.assertIsNone( + inserted_minimal.sitemap_filename, + "Minimal record should have default sitemap_filename", + ) + self.assertFalse( + inserted_minimal.update_flag, + "Minimal record should have default update_flag=False", + ) + # Test 4: Verify no duplicate inserts (attempt to insert same bibcode twice should fail) - duplicate_bibcode = '2023DuplicateTest..1..1A' - duplicate_bib_data = {'title': 'Duplicate Test', 'year': 2023} - self.app.update_storage(duplicate_bibcode, 'bib_data', duplicate_bib_data) - + duplicate_bibcode = "2023DuplicateTest..1..1A" + duplicate_bib_data = {"title": "Duplicate Test", "year": 2023} + self.app.update_storage(duplicate_bibcode, "bib_data", duplicate_bib_data) + with self.app.session_scope() as session: - duplicate_record = session.query(Records).filter(Records.bibcode == duplicate_bibcode).first() - + duplicate_record = ( + session.query(Records) + .filter(Records.bibcode == duplicate_bibcode) + .first() + ) + duplicate_sitemap_record = { - 'record_id': duplicate_record.id, - 'bibcode': duplicate_bibcode, - 'sitemap_filename': 'sitemap_bib_duplicate.xml', - 'update_flag': True + "record_id": duplicate_record.id, + "bibcode": duplicate_bibcode, + 
"sitemap_filename": "sitemap_bib_duplicate.xml", + "update_flag": True, } - + # First insert should succeed self.app.bulk_insert_sitemap_records([duplicate_sitemap_record], session) session.commit() - + # Second insert of same bibcode should fail due to UNIQUE constraint with self.assertRaises(Exception): # Should raise IntegrityError or similar - with self.app.session_scope() as new_session: - self.app.bulk_insert_sitemap_records([duplicate_sitemap_record], new_session) + with self.app.session_scope() as new_session: + self.app.bulk_insert_sitemap_records( + [duplicate_sitemap_record], new_session + ) new_session.commit() - def test_delete_contents(self): """Test delete_contents method""" - + # Create test records in SitemapInfo table - test_bibcodes = ['2023DeleteTest..1..1A', '2023DeleteTest..2..2A', '2023DeleteTest..3..3A'] - + test_bibcodes = [ + "2023DeleteTest..1..1A", + "2023DeleteTest..2..2A", + "2023DeleteTest..3..3A", + ] + for bibcode in test_bibcodes: - bib_data = {'title': f'Delete Test {bibcode}', 'year': 2023} - self.app.update_storage(bibcode, 'bib_data', bib_data) - + bib_data = {"title": f"Delete Test {bibcode}", "year": 2023} + self.app.update_storage(bibcode, "bib_data", bib_data) + # Create sitemap entries with self.app.session_scope() as session: - records = session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all() - + records = ( + session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all() + ) + for record in records: sitemap_info = SitemapInfo( record_id=record.id, bibcode=record.bibcode, - sitemap_filename='sitemap_bib_test.xml', - update_flag=True + sitemap_filename="sitemap_bib_test.xml", + update_flag=True, ) session.add(sitemap_info) session.commit() - + # Verify records exist before deletion with self.app.session_scope() as session: - initial_count = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(test_bibcodes) - ).count() - self.assertEqual(initial_count, 3, "Should have 3 sitemap records 
before deletion") - + initial_count = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(test_bibcodes)) + .count() + ) + self.assertEqual( + initial_count, 3, "Should have 3 sitemap records before deletion" + ) + # Test delete_contents self.app.delete_contents(SitemapInfo) - + # Verify all records were deleted with self.app.session_scope() as session: final_count = session.query(SitemapInfo).count() self.assertEqual(final_count, 0, "All sitemap records should be deleted") - + # Verify Records table is unaffected - records_count = session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).count() + records_count = ( + session.query(Records) + .filter(Records.bibcode.in_(test_bibcodes)) + .count() + ) self.assertEqual(records_count, 3, "Records table should be unaffected") - def test_backup_sitemap_files(self): """Test backup_sitemap_files method""" - # Create temporary directory for test with tempfile.TemporaryDirectory() as temp_dir: # Create test sitemap files - test_files = ['sitemap_bib_1.xml', 'sitemap_bib_2.xml', 'sitemap_index.xml'] - + test_files = ["sitemap_bib_1.xml", "sitemap_bib_2.xml", "sitemap_index.xml"] + for filename in test_files: file_path = os.path.join(temp_dir, filename) - with open(file_path, 'w') as f: - f.write(f'Test content for {filename}') - + with open(file_path, "w") as f: + f.write(f"Test content for {filename}") + # Verify files exist before backup initial_files = os.listdir(temp_dir) - self.assertEqual(len(initial_files), 3, "Should have 3 test files before backup") + self.assertEqual( + len(initial_files), 3, "Should have 3 test files before backup" + ) for filename in test_files: - self.assertIn(filename, initial_files, f"File {filename} should exist before backup") - + self.assertIn( + filename, + initial_files, + f"File {filename} should exist before backup", + ) + # Mock os.system to capture the backup commands backup_commands = [] original_system = os.system - + def mock_system(command): 
backup_commands.append(command) # Execute mkdir command but skip mv command for testing - if command.startswith('mkdir'): + if command.startswith("mkdir"): return original_system(command) return 0 # Success for mv command - + # Test backup_sitemap_files with mocked os.system - with patch('os.system', side_effect=mock_system): + with patch("os.system", side_effect=mock_system): self.app.backup_sitemap_files(temp_dir) - + # Verify backup commands were called - self.assertEqual(len(backup_commands), 2, "Should execute 2 commands (mkdir + mv)") - + self.assertEqual( + len(backup_commands), 2, "Should execute 2 commands (mkdir + mv)" + ) + # Check mkdir command mkdir_command = backup_commands[0] - self.assertTrue(mkdir_command.startswith('mkdir -p /app/logs/tmp/sitemap_'), - "First command should create backup directory") - + self.assertTrue( + mkdir_command.startswith("mkdir -p /app/logs/tmp/sitemap_"), + "First command should create backup directory", + ) + # Check mv command mv_command = backup_commands[1] - self.assertTrue(mv_command.startswith(f'mv {temp_dir}/*'), - "Second command should move files from source directory") - self.assertIn('/app/logs/tmp/sitemap_', mv_command, - "Move command should target backup directory") - + self.assertTrue( + mv_command.startswith(f"mv {temp_dir}/*"), + "Second command should move files from source directory", + ) + self.assertIn( + "/app/logs/tmp/sitemap_", + mv_command, + "Move command should target backup directory", + ) + # Verify backup directory path format (contains date components) - - date_pattern = r'/app/logs/tmp/sitemap_\d{4}_\d{1,2}_\d{1,2}-' - self.assertTrue(re.search(date_pattern, mkdir_command), - "Backup directory should contain date components") - + + date_pattern = r"/app/logs/tmp/sitemap_\d{4}_\d{1,2}_\d{1,2}-" + self.assertTrue( + re.search(date_pattern, mkdir_command), + "Backup directory should contain date components", + ) def test_execute_remove_action_basic_functionality(self): """Test basic 
functionality of _execute_remove_action method""" - + # Create test records and sitemap entries test_bibcodes = [ - '2023RemoveTest..1..1A', - '2023RemoveTest..1..2A', - '2023RemoveTest..1..3A' + "2023RemoveTest..1..1A", + "2023RemoveTest..1..2A", + "2023RemoveTest..1..3A", ] - + with self.app.session_scope() as session: # Clean up any existing test data - session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023RemoveTest%')).delete(synchronize_session=False) - session.query(Records).filter(Records.bibcode.like('2023RemoveTest%')).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode.like("2023RemoveTest%") + ).delete(synchronize_session=False) + session.query(Records).filter( + Records.bibcode.like("2023RemoveTest%") + ).delete(synchronize_session=False) session.commit() - + # Create Records entries records = [] for i, bibcode in enumerate(test_bibcodes): @@ -2385,72 +3385,107 @@ def test_execute_remove_action_basic_functionality(self): bibcode=bibcode, bib_data='{"title": "Test Record"}', bib_data_updated=get_date(), - status='success' + status="success", ) session.add(record) records.append(record) - + session.flush() # Get record IDs - + # Create SitemapInfo entries sitemap_records = [] for i, (bibcode, record) in enumerate(zip(test_bibcodes, records)): sitemap_record = SitemapInfo( record_id=record.id, bibcode=bibcode, - sitemap_filename=f'sitemap_bib_{i+1}.xml', + sitemap_filename=f"sitemap_bib_{i+1}.xml", bib_data_updated=get_date(), filename_lastmoddate=get_date(), - update_flag=False + update_flag=False, ) session.add(sitemap_record) sitemap_records.append(sitemap_record) - + session.commit() - + # Verify initial state - initial_count = session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023RemoveTest%')).count() - self.assertEqual(initial_count, 3, "Should have 3 sitemap records initially") - + initial_count = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.like("2023RemoveTest%")) + 
.count() + ) + self.assertEqual( + initial_count, 3, "Should have 3 sitemap records initially" + ) + # Test removing 2 bibcodes bibcodes_to_remove = test_bibcodes[:2] # Remove first 2 - removed_count, files_to_delete, _ = self.app._execute_remove_action(session, bibcodes_to_remove) - + removed_count, files_to_delete, _ = self.app._execute_remove_action( + session, bibcodes_to_remove + ) + # Verify results self.assertEqual(removed_count, 2, "Should remove exactly 2 bibcodes") - self.assertEqual(files_to_delete, {'sitemap_bib_1.xml', 'sitemap_bib_2.xml'}, - "Should identify 2 files for deletion") - + self.assertEqual( + files_to_delete, + {"sitemap_bib_1.xml", "sitemap_bib_2.xml"}, + "Should identify 2 files for deletion", + ) + # Verify database state - remaining_count = session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023RemoveTest%')).count() - self.assertEqual(remaining_count, 1, "Should have 1 sitemap record remaining") - - remaining_record = session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023RemoveTest%')).first() - self.assertEqual(remaining_record.bibcode, test_bibcodes[2], "Should keep the third bibcode") - self.assertFalse(remaining_record.update_flag, "Remaining record should have update_flag=False") - + remaining_count = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.like("2023RemoveTest%")) + .count() + ) + self.assertEqual( + remaining_count, 1, "Should have 1 sitemap record remaining" + ) + + remaining_record = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.like("2023RemoveTest%")) + .first() + ) + self.assertEqual( + remaining_record.bibcode, + test_bibcodes[2], + "Should keep the third bibcode", + ) + self.assertFalse( + remaining_record.update_flag, + "Remaining record should have update_flag=False", + ) + # Clean up - session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023RemoveTest%')).delete(synchronize_session=False) - 
session.query(Records).filter(Records.bibcode.like('2023RemoveTest%')).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode.like("2023RemoveTest%") + ).delete(synchronize_session=False) + session.query(Records).filter( + Records.bibcode.like("2023RemoveTest%") + ).delete(synchronize_session=False) session.commit() - def test_execute_remove_action_empty_files_detection(self): """Test that _execute_remove_action correctly identifies empty files""" - + test_bibcodes = [ - '2023EmptyTest..1..1A', - '2023EmptyTest..1..2A', - '2023EmptyTest..1..3A', - '2023EmptyTest..1..4A' + "2023EmptyTest..1..1A", + "2023EmptyTest..1..2A", + "2023EmptyTest..1..3A", + "2023EmptyTest..1..4A", ] - + with self.app.session_scope() as session: # Clean up any existing test data - session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023EmptyTest%')).delete(synchronize_session=False) - session.query(Records).filter(Records.bibcode.like('2023EmptyTest%')).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode.like("2023EmptyTest%") + ).delete(synchronize_session=False) + session.query(Records).filter( + Records.bibcode.like("2023EmptyTest%") + ).delete(synchronize_session=False) session.commit() - + # Create Records entries records = [] for bibcode in test_bibcodes: @@ -2458,21 +3493,21 @@ def test_execute_remove_action_empty_files_detection(self): bibcode=bibcode, bib_data='{"title": "Test Record"}', bib_data_updated=get_date(), - status='success' + status="success", ) session.add(record) records.append(record) - + session.flush() - + # Create SitemapInfo entries - 2 records in file1, 1 record in file2, 1 record in file3 sitemap_assignments = [ - ('sitemap_bib_1.xml', test_bibcodes[0]), # File 1: 2 records - ('sitemap_bib_1.xml', test_bibcodes[1]), - ('sitemap_bib_2.xml', test_bibcodes[2]), # File 2: 1 record - ('sitemap_bib_3.xml', test_bibcodes[3]) # File 3: 1 record + ("sitemap_bib_1.xml", 
test_bibcodes[0]), # File 1: 2 records + ("sitemap_bib_1.xml", test_bibcodes[1]), + ("sitemap_bib_2.xml", test_bibcodes[2]), # File 2: 1 record + ("sitemap_bib_3.xml", test_bibcodes[3]), # File 3: 1 record ] - + for i, (filename, bibcode) in enumerate(sitemap_assignments): sitemap_record = SitemapInfo( record_id=records[i].id, @@ -2480,73 +3515,113 @@ def test_execute_remove_action_empty_files_detection(self): sitemap_filename=filename, bib_data_updated=get_date(), filename_lastmoddate=get_date(), - update_flag=False + update_flag=False, ) session.add(sitemap_record) - + session.commit() - + # Remove records that will make file2 and file3 empty, but leave file1 with 1 record - bibcodes_to_remove = [test_bibcodes[1], test_bibcodes[2], test_bibcodes[3]] # Remove from file1, all of file2, all of file3 - removed_count, files_to_delete, files_to_update = self.app._execute_remove_action(session, bibcodes_to_remove) - + bibcodes_to_remove = [ + test_bibcodes[1], + test_bibcodes[2], + test_bibcodes[3], + ] # Remove from file1, all of file2, all of file3 + ( + removed_count, + files_to_delete, + files_to_update, + ) = self.app._execute_remove_action(session, bibcodes_to_remove) + # Verify results self.assertEqual(removed_count, 3, "Should remove exactly 3 bibcodes") - self.assertEqual(files_to_delete, {'sitemap_bib_2.xml', 'sitemap_bib_3.xml'}, - "Should identify files 2 and 3 as empty") - + self.assertEqual( + files_to_delete, + {"sitemap_bib_2.xml", "sitemap_bib_3.xml"}, + "Should identify files 2 and 3 as empty", + ) + # Verify file1 is in files_to_update (needs regeneration but not deletion) - self.assertIn('sitemap_bib_1.xml', files_to_update, "File 1 should be marked for update") - + self.assertIn( + "sitemap_bib_1.xml", + files_to_update, + "File 1 should be marked for update", + ) + # Verify file1 still has records - file1_records = session.query(SitemapInfo).filter( - SitemapInfo.sitemap_filename == 'sitemap_bib_1.xml' - ).all() - 
self.assertEqual(len(file1_records), 1, "File 1 should have 1 remaining record") - + file1_records = ( + session.query(SitemapInfo) + .filter(SitemapInfo.sitemap_filename == "sitemap_bib_1.xml") + .all() + ) + self.assertEqual( + len(file1_records), 1, "File 1 should have 1 remaining record" + ) + # Clean up - session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023EmptyTest%')).delete(synchronize_session=False) - session.query(Records).filter(Records.bibcode.like('2023EmptyTest%')).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode.like("2023EmptyTest%") + ).delete(synchronize_session=False) + session.query(Records).filter( + Records.bibcode.like("2023EmptyTest%") + ).delete(synchronize_session=False) session.commit() def test_execute_remove_action_no_matching_records(self): """Test _execute_remove_action with bibcodes that don't exist""" - + with self.app.session_scope() as session: # Test with non-existent bibcodes - non_existent_bibcodes = ['2023NonExistent..1..1A', '2023NonExistent..1..2A'] - removed_count, files_to_delete, files_to_update = self.app._execute_remove_action(session, non_existent_bibcodes) - + non_existent_bibcodes = ["2023NonExistent..1..1A", "2023NonExistent..1..2A"] + ( + removed_count, + files_to_delete, + files_to_update, + ) = self.app._execute_remove_action(session, non_existent_bibcodes) + # Should return zero results - self.assertEqual(removed_count, 0, "Should remove 0 bibcodes when none exist") - self.assertEqual(files_to_delete, set(), "Should return empty set for files to delete") - + self.assertEqual( + removed_count, 0, "Should remove 0 bibcodes when none exist" + ) + self.assertEqual( + files_to_delete, set(), "Should return empty set for files to delete" + ) + def test_execute_remove_action_empty_input(self): """Test _execute_remove_action with empty input""" - + with self.app.session_scope() as session: # Test with empty list - removed_count, files_to_delete, files_to_update = 
self.app._execute_remove_action(session, []) - + ( + removed_count, + files_to_delete, + files_to_update, + ) = self.app._execute_remove_action(session, []) + # Should return zero results immediately - self.assertEqual(removed_count, 0, "Should remove 0 bibcodes with empty input") - self.assertEqual(files_to_delete, set(), "Should return empty set for files to delete") - + self.assertEqual( + removed_count, 0, "Should remove 0 bibcodes with empty input" + ) + self.assertEqual( + files_to_delete, set(), "Should return empty set for files to delete" + ) + def test_execute_remove_action_mixed_scenarios(self): """Test _execute_remove_action with mixed existing/non-existing bibcodes""" - - test_bibcodes = [ - '2023MixedTest..1..1A', - '2023MixedTest..1..2A' - ] - non_existent_bibcodes = ['2023NonExist..1..1A', '2023NonExist..1..2A'] - + + test_bibcodes = ["2023MixedTest..1..1A", "2023MixedTest..1..2A"] + non_existent_bibcodes = ["2023NonExist..1..1A", "2023NonExist..1..2A"] + with self.app.session_scope() as session: # Clean up any existing test data - session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023MixedTest%')).delete(synchronize_session=False) - session.query(Records).filter(Records.bibcode.like('2023MixedTest%')).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode.like("2023MixedTest%") + ).delete(synchronize_session=False) + session.query(Records).filter( + Records.bibcode.like("2023MixedTest%") + ).delete(synchronize_session=False) session.commit() - + # Create Records entries records = [] for bibcode in test_bibcodes: @@ -2554,60 +3629,82 @@ def test_execute_remove_action_mixed_scenarios(self): bibcode=bibcode, bib_data='{"title": "Test Record"}', bib_data_updated=get_date(), - status='success' + status="success", ) session.add(record) records.append(record) - + session.flush() - + # Create SitemapInfo entries for i, (bibcode, record) in enumerate(zip(test_bibcodes, records)): sitemap_record = SitemapInfo( 
record_id=record.id, bibcode=bibcode, - sitemap_filename='sitemap_bib_1.xml', + sitemap_filename="sitemap_bib_1.xml", bib_data_updated=get_date(), filename_lastmoddate=get_date(), - update_flag=False + update_flag=False, ) session.add(sitemap_record) - + session.commit() - + # Test removing mix of existing and non-existing bibcodes mixed_bibcodes = test_bibcodes + non_existent_bibcodes - removed_count, files_to_delete, files_to_update = self.app._execute_remove_action(session, mixed_bibcodes) - + ( + removed_count, + files_to_delete, + files_to_update, + ) = self.app._execute_remove_action(session, mixed_bibcodes) + # Should only remove the existing ones - self.assertEqual(removed_count, 2, "Should remove only the 2 existing bibcodes") - self.assertEqual(files_to_delete, {'sitemap_bib_1.xml'}, "Should identify 1 file for deletion") - + self.assertEqual( + removed_count, 2, "Should remove only the 2 existing bibcodes" + ) + self.assertEqual( + files_to_delete, + {"sitemap_bib_1.xml"}, + "Should identify 1 file for deletion", + ) + # Verify database state - remaining_count = session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023MixedTest%')).count() - self.assertEqual(remaining_count, 0, "Should have no sitemap records remaining") - + remaining_count = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.like("2023MixedTest%")) + .count() + ) + self.assertEqual( + remaining_count, 0, "Should have no sitemap records remaining" + ) + # Clean up - session.query(Records).filter(Records.bibcode.like('2023MixedTest%')).delete(synchronize_session=False) + session.query(Records).filter( + Records.bibcode.like("2023MixedTest%") + ).delete(synchronize_session=False) session.commit() - + def test_execute_remove_action_partial_file_removal(self): """Test _execute_remove_action when only some records are removed from files""" - + test_bibcodes = [ - '2023PartialTest..1..1A', - '2023PartialTest..1..2A', - '2023PartialTest..1..3A', - '2023PartialTest..1..4A', - 
'2023PartialTest..1..5A' + "2023PartialTest..1..1A", + "2023PartialTest..1..2A", + "2023PartialTest..1..3A", + "2023PartialTest..1..4A", + "2023PartialTest..1..5A", ] - + with self.app.session_scope() as session: # Clean up any existing test data - session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023PartialTest%')).delete(synchronize_session=False) - session.query(Records).filter(Records.bibcode.like('2023PartialTest%')).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode.like("2023PartialTest%") + ).delete(synchronize_session=False) + session.query(Records).filter( + Records.bibcode.like("2023PartialTest%") + ).delete(synchronize_session=False) session.commit() - + # Create Records entries records = [] for bibcode in test_bibcodes: @@ -2615,22 +3712,22 @@ def test_execute_remove_action_partial_file_removal(self): bibcode=bibcode, bib_data='{"title": "Test Record"}', bib_data_updated=get_date(), - status='success' + status="success", ) session.add(record) records.append(record) - + session.flush() - + # Create SitemapInfo entries - distribute across 2 files sitemap_assignments = [ - ('sitemap_bib_1.xml', test_bibcodes[0]), # File 1: 3 records - ('sitemap_bib_1.xml', test_bibcodes[1]), - ('sitemap_bib_1.xml', test_bibcodes[2]), - ('sitemap_bib_2.xml', test_bibcodes[3]), # File 2: 2 records - ('sitemap_bib_2.xml', test_bibcodes[4]) + ("sitemap_bib_1.xml", test_bibcodes[0]), # File 1: 3 records + ("sitemap_bib_1.xml", test_bibcodes[1]), + ("sitemap_bib_1.xml", test_bibcodes[2]), + ("sitemap_bib_2.xml", test_bibcodes[3]), # File 2: 2 records + ("sitemap_bib_2.xml", test_bibcodes[4]), ] - + for i, (filename, bibcode) in enumerate(sitemap_assignments): sitemap_record = SitemapInfo( record_id=records[i].id, @@ -2638,53 +3735,88 @@ def test_execute_remove_action_partial_file_removal(self): sitemap_filename=filename, bib_data_updated=get_date(), filename_lastmoddate=get_date(), - update_flag=False + update_flag=False, ) 
session.add(sitemap_record) - + session.commit() - + # Remove 1 record from file1 and 1 record from file2 (partial removal) - bibcodes_to_remove = [test_bibcodes[1], test_bibcodes[3]] # 1 from each file - removed_count, files_to_delete, files_to_update = self.app._execute_remove_action(session, bibcodes_to_remove) - + bibcodes_to_remove = [ + test_bibcodes[1], + test_bibcodes[3], + ] # 1 from each file + ( + removed_count, + files_to_delete, + files_to_update, + ) = self.app._execute_remove_action(session, bibcodes_to_remove) + # Verify results self.assertEqual(removed_count, 2, "Should remove exactly 2 bibcodes") - self.assertEqual(files_to_delete, set(), "Should not delete any files (both still have records)") - + self.assertEqual( + files_to_delete, + set(), + "Should not delete any files (both still have records)", + ) + # Verify both files are in files_to_update - self.assertIn('sitemap_bib_1.xml', files_to_update, "File 1 should be marked for update") - self.assertIn('sitemap_bib_2.xml', files_to_update, "File 2 should be marked for update") - + self.assertIn( + "sitemap_bib_1.xml", + files_to_update, + "File 1 should be marked for update", + ) + self.assertIn( + "sitemap_bib_2.xml", + files_to_update, + "File 2 should be marked for update", + ) + # Verify both files still have records - file1_records = session.query(SitemapInfo).filter( - SitemapInfo.sitemap_filename == 'sitemap_bib_1.xml' - ).all() - file2_records = session.query(SitemapInfo).filter( - SitemapInfo.sitemap_filename == 'sitemap_bib_2.xml' - ).all() - - self.assertEqual(len(file1_records), 2, "File 1 should have 2 remaining records") - self.assertEqual(len(file2_records), 1, "File 2 should have 1 remaining record") - + file1_records = ( + session.query(SitemapInfo) + .filter(SitemapInfo.sitemap_filename == "sitemap_bib_1.xml") + .all() + ) + file2_records = ( + session.query(SitemapInfo) + .filter(SitemapInfo.sitemap_filename == "sitemap_bib_2.xml") + .all() + ) + + self.assertEqual( + 
len(file1_records), 2, "File 1 should have 2 remaining records" + ) + self.assertEqual( + len(file2_records), 1, "File 2 should have 1 remaining record" + ) + # Clean up - session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023PartialTest%')).delete(synchronize_session=False) - session.query(Records).filter(Records.bibcode.like('2023PartialTest%')).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode.like("2023PartialTest%") + ).delete(synchronize_session=False) + session.query(Records).filter( + Records.bibcode.like("2023PartialTest%") + ).delete(synchronize_session=False) session.commit() - + def test_execute_remove_action_performance_with_large_batch(self): """Test _execute_remove_action performance with larger batch sizes""" - + # Create a larger batch for performance testing batch_size = 1000 - test_bibcodes = [f'2023PerfTest..{i:03d}..{i:03d}A' for i in range(batch_size)] - + test_bibcodes = [f"2023PerfTest..{i:03d}..{i:03d}A" for i in range(batch_size)] + with self.app.session_scope() as session: # Clean up any existing test data - session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023PerfTest%')).delete(synchronize_session=False) - session.query(Records).filter(Records.bibcode.like('2023PerfTest%')).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode.like("2023PerfTest%") + ).delete(synchronize_session=False) + session.query(Records).filter(Records.bibcode.like("2023PerfTest%")).delete( + synchronize_session=False + ) session.commit() - + # Create Records entries records = [] for bibcode in test_bibcodes: @@ -2692,289 +3824,383 @@ def test_execute_remove_action_performance_with_large_batch(self): bibcode=bibcode, bib_data='{"title": "Performance Test Record"}', bib_data_updated=get_date(), - status='success' + status="success", ) session.add(record) records.append(record) - + session.flush() - + # Create SitemapInfo entries - distribute across multiple files 
for i, (bibcode, record) in enumerate(zip(test_bibcodes, records)): file_index = (i // 10) + 1 # 10 records per file sitemap_record = SitemapInfo( record_id=record.id, bibcode=bibcode, - sitemap_filename=f'sitemap_bib_{file_index}.xml', + sitemap_filename=f"sitemap_bib_{file_index}.xml", bib_data_updated=get_date(), filename_lastmoddate=get_date(), - update_flag=False + update_flag=False, ) session.add(sitemap_record) - + session.commit() - + # Time the removal operation start_time = time.time() - removed_count, files_to_delete, files_to_update = self.app._execute_remove_action(session, test_bibcodes) + ( + removed_count, + files_to_delete, + files_to_update, + ) = self.app._execute_remove_action(session, test_bibcodes) end_time = time.time() - + execution_time = end_time - start_time - + # Verify results - self.assertEqual(removed_count, batch_size, f"Should remove all {batch_size} bibcodes") - self.assertEqual(len(files_to_delete), 100, "Should identify 100 files for deletion") - + self.assertEqual( + removed_count, batch_size, f"Should remove all {batch_size} bibcodes" + ) + self.assertEqual( + len(files_to_delete), 100, "Should identify 100 files for deletion" + ) + # Performance assertion - should complete reasonably quickly - self.assertLess(execution_time, 5.0, f"Removal of {batch_size} records should complete in under 5 seconds") - + self.assertLess( + execution_time, + 5.0, + f"Removal of {batch_size} records should complete in under 5 seconds", + ) + # Verify database state - remaining_count = session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023PerfTest%')).count() - self.assertEqual(remaining_count, 0, "Should have no sitemap records remaining") - + remaining_count = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.like("2023PerfTest%")) + .count() + ) + self.assertEqual( + remaining_count, 0, "Should have no sitemap records remaining" + ) + # Clean up - 
session.query(Records).filter(Records.bibcode.like('2023PerfTest%')).delete(synchronize_session=False) + session.query(Records).filter(Records.bibcode.like("2023PerfTest%")).delete( + synchronize_session=False + ) session.commit() - - print(f"_execute_remove_action performance test completed in {execution_time:.3f} seconds for {batch_size} records") - + + print( + f"_execute_remove_action performance test completed in {execution_time:.3f} seconds for {batch_size} records" + ) + def test_delete_sitemap_files(self): """Test delete_sitemap_files method""" - + # Create temporary directory structure for test with tempfile.TemporaryDirectory() as temp_dir: # Mock SITES configuration sites_config = { - 'ads': {'base_url': 'https://ui.adsabs.harvard.edu/'}, - 'scix': {'base_url': 'https://scixplorer.org/'} + "ads": {"base_url": "https://ui.adsabs.harvard.edu/"}, + "scix": {"base_url": "https://scixplorer.org/"}, } - + # Create site directories and test files - test_files = ['sitemap_bib_1.xml', 'sitemap_bib_2.xml', 'sitemap_index.xml'] + test_files = ["sitemap_bib_1.xml", "sitemap_bib_2.xml", "sitemap_index.xml"] created_files = [] - + for site_key in sites_config.keys(): site_dir = os.path.join(temp_dir, site_key) os.makedirs(site_dir) - + for filename in test_files: file_path = os.path.join(site_dir, filename) - with open(file_path, 'w') as f: - f.write(f'Test content for {filename} in {site_key}') + with open(file_path, "w") as f: + f.write( + f"Test content for {filename} in {site_key}" + ) created_files.append(file_path) - + # Verify all files exist before deletion for file_path in created_files: - self.assertTrue(os.path.exists(file_path), f"File {file_path} should exist before deletion") - + self.assertTrue( + os.path.exists(file_path), + f"File {file_path} should exist before deletion", + ) + # Mock the SITES configuration - original_sites = self.app.conf.get('SITES', {}) - self.app.conf['SITES'] = sites_config - + original_sites = self.app.conf.get("SITES", {}) + 
self.app.conf["SITES"] = sites_config + try: # Test delete_sitemap_files - delete first 2 files - files_to_delete = {'sitemap_bib_1.xml', 'sitemap_bib_2.xml'} - + files_to_delete = {"sitemap_bib_1.xml", "sitemap_bib_2.xml"} + self.app.delete_sitemap_files(files_to_delete, temp_dir) - + # Verify deleted files are gone for site_key in sites_config.keys(): for filename in files_to_delete: file_path = os.path.join(temp_dir, site_key, filename) - self.assertFalse(os.path.exists(file_path), - f"File {file_path} should be deleted") - + self.assertFalse( + os.path.exists(file_path), + f"File {file_path} should be deleted", + ) + # Verify remaining files still exist for site_key in sites_config.keys(): - remaining_file = os.path.join(temp_dir, site_key, 'sitemap_index.xml') - self.assertTrue(os.path.exists(remaining_file), - f"File {remaining_file} should still exist") - + remaining_file = os.path.join( + temp_dir, site_key, "sitemap_index.xml" + ) + self.assertTrue( + os.path.exists(remaining_file), + f"File {remaining_file} should still exist", + ) + # Test empty files_to_delete set (should do nothing) - remaining_count_before = sum(len(os.listdir(os.path.join(temp_dir, site))) - for site in sites_config.keys()) - + remaining_count_before = sum( + len(os.listdir(os.path.join(temp_dir, site))) + for site in sites_config.keys() + ) + self.app.delete_sitemap_files(set(), temp_dir) - - remaining_count_after = sum(len(os.listdir(os.path.join(temp_dir, site))) - for site in sites_config.keys()) - - self.assertEqual(remaining_count_before, remaining_count_after, - "Empty files_to_delete should not change file count") - + + remaining_count_after = sum( + len(os.listdir(os.path.join(temp_dir, site))) + for site in sites_config.keys() + ) + + self.assertEqual( + remaining_count_before, + remaining_count_after, + "Empty files_to_delete should not change file count", + ) + # Test non-existent files (should not raise error) - non_existent_files = {'non_existent_1.xml', 
'non_existent_2.xml'} - + non_existent_files = {"non_existent_1.xml", "non_existent_2.xml"} + # Should not raise an exception self.app.delete_sitemap_files(non_existent_files, temp_dir) - + # Remaining files should still exist - final_count = sum(len(os.listdir(os.path.join(temp_dir, site))) - for site in sites_config.keys()) - self.assertEqual(final_count, 2, "Should still have 2 files (1 per site)") - + final_count = sum( + len(os.listdir(os.path.join(temp_dir, site))) + for site in sites_config.keys() + ) + self.assertEqual( + final_count, 2, "Should still have 2 files (1 per site)" + ) + finally: # Restore original SITES configuration - self.app.conf['SITES'] = original_sites - + self.app.conf["SITES"] = original_sites def test_chunked(self): """Test chunked method""" - + # Test 1: Normal chunking with exact division data = list(range(10)) # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] chunks = list(self.app.chunked(data, 5)) - + self.assertEqual(len(chunks), 2, "Should create 2 chunks") - self.assertEqual(chunks[0], [0, 1, 2, 3, 4], "First chunk should contain first 5 elements") - self.assertEqual(chunks[1], [5, 6, 7, 8, 9], "Second chunk should contain last 5 elements") - + self.assertEqual( + chunks[0], [0, 1, 2, 3, 4], "First chunk should contain first 5 elements" + ) + self.assertEqual( + chunks[1], [5, 6, 7, 8, 9], "Second chunk should contain last 5 elements" + ) + # Test 2: Chunking with remainder data = list(range(7)) # [0, 1, 2, 3, 4, 5, 6] chunks = list(self.app.chunked(data, 3)) - + self.assertEqual(len(chunks), 3, "Should create 3 chunks") self.assertEqual(chunks[0], [0, 1, 2], "First chunk should have 3 elements") self.assertEqual(chunks[1], [3, 4, 5], "Second chunk should have 3 elements") - self.assertEqual(chunks[2], [6], "Third chunk should have 1 element (remainder)") - + self.assertEqual( + chunks[2], [6], "Third chunk should have 1 element (remainder)" + ) + # Test 3: Single chunk (chunk_size larger than data) data = [1, 2, 3] chunks = 
list(self.app.chunked(data, 10)) - + self.assertEqual(len(chunks), 1, "Should create 1 chunk") - self.assertEqual(chunks[0], [1, 2, 3], "Single chunk should contain all elements") - + self.assertEqual( + chunks[0], [1, 2, 3], "Single chunk should contain all elements" + ) + # Test 4: Empty iterable data = [] chunks = list(self.app.chunked(data, 5)) - + self.assertEqual(len(chunks), 0, "Empty iterable should produce no chunks") - + # Test 5: Chunk size of 1 - data = ['a', 'b', 'c'] + data = ["a", "b", "c"] chunks = list(self.app.chunked(data, 1)) - + self.assertEqual(len(chunks), 3, "Should create 3 chunks with size 1") - self.assertEqual(chunks[0], ['a'], "First chunk should contain 'a'") - self.assertEqual(chunks[1], ['b'], "Second chunk should contain 'b'") - self.assertEqual(chunks[2], ['c'], "Third chunk should contain 'c'") - + self.assertEqual(chunks[0], ["a"], "First chunk should contain 'a'") + self.assertEqual(chunks[1], ["b"], "Second chunk should contain 'b'") + self.assertEqual(chunks[2], ["c"], "Third chunk should contain 'c'") + # Test 6: Memory efficiency test with generator (doesn't copy data) def large_generator(): for i in range(1000): yield f"item_{i}" - + chunks = list(self.app.chunked(large_generator(), 100)) - + self.assertEqual(len(chunks), 10, "Should create 10 chunks from 1000 items") self.assertEqual(len(chunks[0]), 100, "Each chunk should have 100 items") self.assertEqual(len(chunks[-1]), 100, "Last chunk should also have 100 items") self.assertEqual(chunks[0][0], "item_0", "First item should be 'item_0'") self.assertEqual(chunks[-1][-1], "item_999", "Last item should be 'item_999'") - + # Test 7: String chunking data = "abcdefghij" chunks = list(self.app.chunked(data, 4)) - + self.assertEqual(len(chunks), 3, "Should create 3 chunks from string") - self.assertEqual(chunks[0], ['a', 'b', 'c', 'd'], "First chunk should contain first 4 chars") - self.assertEqual(chunks[1], ['e', 'f', 'g', 'h'], "Second chunk should contain next 4 chars") - 
self.assertEqual(chunks[2], ['i', 'j'], "Third chunk should contain remaining 2 chars") - + self.assertEqual( + chunks[0], ["a", "b", "c", "d"], "First chunk should contain first 4 chars" + ) + self.assertEqual( + chunks[1], ["e", "f", "g", "h"], "Second chunk should contain next 4 chars" + ) + self.assertEqual( + chunks[2], ["i", "j"], "Third chunk should contain remaining 2 chars" + ) + # Test 8: Different data types - data = [1, 'two', 3.0, [4, 5], {'six': 6}] + data = [1, "two", 3.0, [4, 5], {"six": 6}] chunks = list(self.app.chunked(data, 2)) - + self.assertEqual(len(chunks), 3, "Should create 3 chunks from mixed data types") - self.assertEqual(chunks[0], [1, 'two'], "First chunk should contain first 2 items") - self.assertEqual(chunks[1], [3.0, [4, 5]], "Second chunk should contain next 2 items") - self.assertEqual(chunks[2], [{'six': 6}], "Third chunk should contain last item") - + self.assertEqual( + chunks[0], [1, "two"], "First chunk should contain first 2 items" + ) + self.assertEqual( + chunks[1], [3.0, [4, 5]], "Second chunk should contain next 2 items" + ) + self.assertEqual( + chunks[2], [{"six": 6}], "Third chunk should contain last item" + ) def test_delete_by_bibcode_with_sitemap(self): """Test delete_by_bibcode function with sitemap records (database deletion only)""" # TEST CASE 1: Delete record with both Records and SitemapInfo entries - test_bibcode = '2023DeleteSitemapTest..1..1A' - bib_data = {'title': 'Test Record for Sitemap Deletion', 'year': 2023} - + test_bibcode = "2023DeleteSitemapTest..1..1A" + bib_data = {"title": "Test Record for Sitemap Deletion", "year": 2023} + # Create test record - self.app.update_storage(test_bibcode, 'bib_data', bib_data) - + self.app.update_storage(test_bibcode, "bib_data", bib_data) + # Create sitemap entry with self.app.session_scope() as session: record = session.query(Records).filter_by(bibcode=test_bibcode).first() self.assertIsNotNone(record, "Test record should exist") - + sitemap_info = SitemapInfo( 
record_id=record.id, bibcode=test_bibcode, - sitemap_filename='sitemap_bib_delete_test.xml', + sitemap_filename="sitemap_bib_delete_test.xml", update_flag=False, - bib_data_updated=record.bib_data_updated + bib_data_updated=record.bib_data_updated, ) session.add(sitemap_info) session.commit() - + # Verify setup: record and sitemap entry exist with self.app.session_scope() as session: - record_count = session.query(Records).filter_by(bibcode=test_bibcode).count() - sitemap_count = session.query(SitemapInfo).filter_by(bibcode=test_bibcode).count() - self.assertEqual(record_count, 1, "Should have 1 Records entry before deletion") - self.assertEqual(sitemap_count, 1, "Should have 1 SitemapInfo entry before deletion") - + record_count = ( + session.query(Records).filter_by(bibcode=test_bibcode).count() + ) + sitemap_count = ( + session.query(SitemapInfo).filter_by(bibcode=test_bibcode).count() + ) + self.assertEqual( + record_count, 1, "Should have 1 Records entry before deletion" + ) + self.assertEqual( + sitemap_count, 1, "Should have 1 SitemapInfo entry before deletion" + ) + # Delete the record result = self.app.delete_by_bibcode(test_bibcode) - self.assertTrue(result, "delete_by_bibcode should return True for successful deletion") - + self.assertTrue( + result, "delete_by_bibcode should return True for successful deletion" + ) + # Verify both Records and SitemapInfo entries are deleted with self.app.session_scope() as session: record = session.query(Records).filter_by(bibcode=test_bibcode).first() self.assertIsNone(record, "Records entry should be deleted") - + # Verify ChangeLog entry was created - changelog = session.query(ChangeLog).filter_by(key=f'bibcode:{test_bibcode}').first() + changelog = ( + session.query(ChangeLog) + .filter_by(key=f"bibcode:{test_bibcode}") + .first() + ) self.assertIsNotNone(changelog, "ChangeLog entry should be created") - self.assertEqual(changelog.type, 'deleted', "ChangeLog type should be 'deleted'") - + self.assertEqual( + 
changelog.type, "deleted", "ChangeLog type should be 'deleted'" + ) + # With application-level cascade, SitemapInfo should be deleted - sitemap_info = session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first() - self.assertIsNone(sitemap_info, "SitemapInfo entry should be deleted by application logic") - + sitemap_info = ( + session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first() + ) + self.assertIsNone( + sitemap_info, "SitemapInfo entry should be deleted by application logic" + ) + # TEST CASE 2: Delete when only SitemapInfo exists (Records already deleted) - test_bibcode_2 = '2023DeleteSitemapTest..2..2A' - + test_bibcode_2 = "2023DeleteSitemapTest..2..2A" + # Create only SitemapInfo entry (no Records entry) with self.app.session_scope() as session: sitemap_info_2 = SitemapInfo( record_id=999999, # Non-existent record_id bibcode=test_bibcode_2, - sitemap_filename='sitemap_bib_orphan.xml', - update_flag=False + sitemap_filename="sitemap_bib_orphan.xml", + update_flag=False, ) session.add(sitemap_info_2) session.commit() - + # Verify setup: no Records entry, but SitemapInfo exists with self.app.session_scope() as session: - record_count = session.query(Records).filter_by(bibcode=test_bibcode_2).count() - sitemap_count = session.query(SitemapInfo).filter_by(bibcode=test_bibcode_2).count() + record_count = ( + session.query(Records).filter_by(bibcode=test_bibcode_2).count() + ) + sitemap_count = ( + session.query(SitemapInfo).filter_by(bibcode=test_bibcode_2).count() + ) self.assertEqual(record_count, 0, "Should have 0 Records entries") self.assertEqual(sitemap_count, 1, "Should have 1 SitemapInfo entry") - + # Delete orphaned sitemap entry result_2 = self.app.delete_by_bibcode(test_bibcode_2) - self.assertTrue(result_2, "delete_by_bibcode should return True for SitemapInfo deletion") - + self.assertTrue( + result_2, "delete_by_bibcode should return True for SitemapInfo deletion" + ) + # Verify SitemapInfo entry is deleted with 
self.app.session_scope() as session: - sitemap_info = session.query(SitemapInfo).filter_by(bibcode=test_bibcode_2).first() - self.assertIsNone(sitemap_info, "Orphaned SitemapInfo entry should be deleted") - + sitemap_info = ( + session.query(SitemapInfo).filter_by(bibcode=test_bibcode_2).first() + ) + self.assertIsNone( + sitemap_info, "Orphaned SitemapInfo entry should be deleted" + ) + # TEST CASE 3: Delete non-existent bibcode - result_3 = self.app.delete_by_bibcode('2023NonExistent..1..1A') - self.assertIsNone(result_3, "delete_by_bibcode should return None for non-existent bibcode") + result_3 = self.app.delete_by_bibcode("2023NonExistent..1..1A") + self.assertIsNone( + result_3, "delete_by_bibcode should return None for non-existent bibcode" + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/adsmp/tests/test_solr_updater.py b/adsmp/tests/test_solr_updater.py index 114917d..7dc8d85 100644 --- a/adsmp/tests/test_solr_updater.py +++ b/adsmp/tests/test_solr_updater.py @@ -109,6 +109,7 @@ def test_solr_transformer(self): "page": ["283"], # u'property': [u'OPENACCESS', u'ADS_OPENACCESS', u'ARTICLE', u'NOT REFEREED'], "pub": "Astronomical Data Analysis Software and Systems XII", + "pub_abbrev": "ADASS XII", "pub_raw": "Astronomical Data Analysis Software and Systems XII ASP Conference Series, Vol. 295, 2003 H. E. Payne, R. I. Jedrzejewski, and R. N. 
Hook, eds., p.283", "pubdate": "2003-00-00", "title": ["Chandra Data Archive Download and Usage Database"], @@ -128,7 +129,7 @@ def test_solr_transformer(self): "boost_factor": 0.5142857142857143, "astronomy_final_boost": 0.5142857142857143, "physics_final_boost": 0.5142857142857143, - } + }, ) self.app.update_storage( "bibcode", @@ -367,8 +368,8 @@ def test_solr_transformer(self): "volume", ], ) - self.assertEqual(x["scix_id"], "scix:42MM-89VE-90A0") - self.assertEqual(round(x["doctype_boost"],3),0.857) + self.assertEqual(x["scix_id"], "scix:2VD6-M93T-HEGP") + self.assertEqual(round(x["doctype_boost"], 3), 0.857) self.app.update_storage( "bibcode", @@ -382,13 +383,13 @@ def test_solr_transformer(self): "boost_factor": 0.5142857142857143, "astronomy_final_boost": 0.5142857142857143, "physics_final_boost": 0.5142857142857143, - } + }, ) rec = self.app.get_record("bibcode") x = solr_updater.transform_json_record(rec) - self.assertEqual(x["scix_id"], "scix:42MM-89VE-90A0") - self.assertEqual(round(x["doctype_boost"],3),0.857) - self.assertEqual(round(x["astronomy_final_boost"],3), 0.514) + self.assertEqual(x["scix_id"], "scix:2VD6-M93T-HEGP") + self.assertEqual(round(x["doctype_boost"], 3), 0.857) + self.assertEqual(round(x["astronomy_final_boost"], 3), 0.514) self.app.update_storage( "bibcode", @@ -503,6 +504,7 @@ def test_solr_transformer(self): "property": ["OPENACCESS", "ADS_OPENACCESS", "ARTICLE", "NOT REFEREED"], "pub": "Astronomical Data Analysis Software and Systems XII", "pub_raw": "Astronomical Data Analysis Software and Systems XII ASP Conference Series, Vol. 295, 2003 H. E. Payne, R. I. Jedrzejewski, and R. N. 
Hook, eds., p.283", + "pub_abbrev": "ADASS XII", "pubdate": "2003-00-00", "read_count": 0, "reference": [ @@ -609,8 +611,7 @@ def test_solr_transformer(self): "volume", ], ) - self.assertEqual(round(x["doctype_boost"],3),0.857) - + self.assertEqual(round(x["doctype_boost"], 3), 0.857) def test_links_data_merge(self): # links_data only from bib diff --git a/adsmp/tests/test_tasks.py b/adsmp/tests/test_tasks.py index bf1072f..6ace026 100644 --- a/adsmp/tests/test_tasks.py +++ b/adsmp/tests/test_tasks.py @@ -1,13 +1,14 @@ import copy import html import json +import logging import os import shutil -import unittest -from datetime import datetime, timedelta, timezone import tempfile import time -from unittest.mock import patch, MagicMock +import unittest +from datetime import datetime, timedelta, timezone +from unittest.mock import MagicMock, patch import mock from adsmsg import ( @@ -21,15 +22,15 @@ ) from adsmsg.orcid_claims import OrcidClaims from adsputils import get_date -from mock import Mock, patch, MagicMock +from mock import MagicMock, Mock, patch from adsmp import app, tasks -from adsmp.models import Base, Records, SitemapInfo, ChangeLog -from adsmp.tasks import update_sitemap_index, update_robots_files +from adsmp.models import Base, ChangeLog, Records, SitemapInfo +from adsmp.tasks import update_robots_files, update_sitemap_index -import logging logger = logging.getLogger(__name__) + def unwind_task_index_solr_apply_async(args=None, kwargs=None, priority=None): tasks.task_index_solr(args[0], args[1], kwargs) @@ -706,18 +707,23 @@ def test_index_metrics_no_data(self): x.assert_not_called() def test_task_update_scixid(self): - self.app.update_storage("bibcode", "bib_data", {"title":"abc test 123"}) - self.assertEqual(self.app.get_record("bibcode")["scix_id"], "scix:5RNB-CG0M-EQYN") + self.app.update_storage("bibcode", "bib_data", {"title": "abc test 123"}) + self.assertEqual( + self.app.get_record("bibcode")["scix_id"], "scix:8KM7-38V2-N637" + ) 
tasks.task_update_scixid(bibcodes=["bibcode"], flag="force") # scixid should not change since bib_data has not changed - self.assertEqual(self.app.get_record("bibcode")["scix_id"], "scix:5RNB-CG0M-EQYN") + self.assertEqual( + self.app.get_record("bibcode")["scix_id"], "scix:8KM7-38V2-N637" + ) - self.app.update_storage("bibcode", "bib_data", {"title":"abc test 456"}) + self.app.update_storage("bibcode", "bib_data", {"title": "abc test 456"}) tasks.task_update_scixid(bibcodes=["bibcode"], flag="force") # scix_id should change since bib_data has changed and we used the force flag to create a new scix_id - self.assertEqual(self.app.get_record("bibcode")["scix_id"], "scix:3BPZ-TQ3C-HFMU") - + self.assertEqual( + self.app.get_record("bibcode")["scix_id"], "scix:6Z3P-MJ87-67A1" + ) with self.app.session_scope() as session: r = session.query(Records).filter_by(bibcode="bibcode").first() @@ -726,18 +732,16 @@ def test_task_update_scixid(self): tasks.task_update_scixid(bibcodes=["bibcode"], flag="update") # bibcode should still be the same as above since bib_data has not changed - self.assertEqual(self.app.get_record("bibcode")["scix_id"], "scix:3BPZ-TQ3C-HFMU") - - - - + self.assertEqual( + self.app.get_record("bibcode")["scix_id"], "scix:6Z3P-MJ87-67A1" + ) class TestSitemapWorkflow(unittest.TestCase): """ Comprehensive tests for the complete sitemap workflow """ - + def setUp(self): unittest.TestCase.setUp(self) self.proj_home = os.path.join(os.path.dirname(__file__), "../..") @@ -756,78 +760,80 @@ def setUp(self): tasks.app = self.app # monkey-patch the app object Base.metadata.bind = self.app._session.get_bind() Base.metadata.create_all() - + # Drop and recreate tables to ensure they have proper schema with indexes try: SitemapInfo.__table__.drop(self.app._session.get_bind(), checkfirst=True) Records.__table__.drop(self.app._session.get_bind(), checkfirst=True) except: pass # Tables might not exist - + # Recreate tables with current schema (including indexes) 
Records.__table__.create(self.app._session.get_bind()) SitemapInfo.__table__.create(self.app._session.get_bind()) - + # Configure app for sitemap testing - self.app.conf.update({ - 'SITEMAP_DIR': '/tmp/test_sitemap/', - 'SITES': { - 'ads': { - 'name': 'ADS', - 'base_url': 'https://ui.adsabs.harvard.edu/', - 'sitemap_url': 'https://ui.adsabs.harvard.edu/sitemap', - 'abs_url_pattern': 'https://ui.adsabs.harvard.edu/abs/{bibcode}' + self.app.conf.update( + { + "SITEMAP_DIR": "/tmp/test_sitemap/", + "SITES": { + "ads": { + "name": "ADS", + "base_url": "https://ui.adsabs.harvard.edu/", + "sitemap_url": "https://ui.adsabs.harvard.edu/sitemap", + "abs_url_pattern": "https://ui.adsabs.harvard.edu/abs/{bibcode}", + }, + "scix": { + "name": "SciX", + "base_url": "https://scixplorer.org/", + "sitemap_url": "https://scixplorer.org/sitemap", + "abs_url_pattern": "https://scixplorer.org/abs/{bibcode}", + }, }, - 'scix': { - 'name': 'SciX', - 'base_url': 'https://scixplorer.org/', - 'sitemap_url': 'https://scixplorer.org/sitemap', - 'abs_url_pattern': 'https://scixplorer.org/abs/{bibcode}' - } } - }) - + ) + # Set up test data self.test_records = [ { - 'bibcode': '2023ApJ...123..456A', - 'id': 1, - 'bib_data': '{"title": "Test Paper A"}', - 'bib_data_updated': get_date() - timedelta(days=1) + "bibcode": "2023ApJ...123..456A", + "id": 1, + "bib_data": '{"title": "Test Paper A"}', + "bib_data_updated": get_date() - timedelta(days=1), }, { - 'bibcode': '2023ApJ...123..457B', - 'id': 2, - 'bib_data': '{"title": "Test Paper B"}', - 'bib_data_updated': get_date() - timedelta(days=2) + "bibcode": "2023ApJ...123..457B", + "id": 2, + "bib_data": '{"title": "Test Paper B"}', + "bib_data_updated": get_date() - timedelta(days=2), }, { - 'bibcode': '2023ApJ...123..458C', - 'id': 3, - 'bib_data': '{"title": "Test Paper C"}', - 'bib_data_updated': get_date() - timedelta(days=3) + "bibcode": "2023ApJ...123..458C", + "id": 3, + "bib_data": '{"title": "Test Paper C"}', + "bib_data_updated": 
get_date() - timedelta(days=3), }, { - 'bibcode': '2023ApJ...123..459D', - 'id': 4, - 'bib_data': '{"title": "Test Paper D"}', - 'bib_data_updated': get_date() - } + "bibcode": "2023ApJ...123..459D", + "id": 4, + "bib_data": '{"title": "Test Paper D"}', + "bib_data_updated": get_date(), + }, ] - + # Clean database and insert test records with self.app.session_scope() as session: # Clear existing records session.query(Records).delete() session.commit() - + # Insert test records with specified IDs for record_data in self.test_records: record = Records( - id=record_data['id'], - bibcode=record_data['bibcode'], - bib_data=record_data['bib_data'], - bib_data_updated=record_data['bib_data_updated'] + id=record_data["id"], + bibcode=record_data["bibcode"], + bib_data=record_data["bib_data"], + bib_data_updated=record_data["bib_data_updated"], ) session.add(record) session.commit() @@ -847,42 +853,48 @@ def tearDown(self): self.app.close_app() tasks.app = self._app - - def test_task_cleanup_invalid_sitemaps(self): """Test the task_cleanup_invalid_sitemaps function thoroughly""" - + # Setup test data - create records with different statuses - valid_bibcodes = ['2023CleanValid1A', '2023CleanValid2B'] - invalid_bibcodes = ['2023CleanInvalid1C', '2023CleanInvalid2D', '2023CleanInvalid3E'] + valid_bibcodes = ["2023CleanValid1A", "2023CleanValid2B"] + invalid_bibcodes = [ + "2023CleanInvalid1C", + "2023CleanInvalid2D", + "2023CleanInvalid3E", + ] all_bibcodes = valid_bibcodes + invalid_bibcodes - + with self.app.session_scope() as session: # Verify clean state total_records_before = session.query(SitemapInfo).count() - self.assertEqual(total_records_before, 0, "Should start with empty sitemap table") - + self.assertEqual( + total_records_before, 0, "Should start with empty sitemap table" + ) + # Create valid records (should remain in sitemap) for bibcode in valid_bibcodes: record = Records() record.bibcode = bibcode record.bib_data = '{"title": "Valid Test Record"}' 
record.bib_data_updated = get_date() - timedelta(days=1) - record.solr_processed = get_date() - timedelta(hours=12) # Recently processed - record.status = 'success' + record.solr_processed = get_date() - timedelta( + hours=12 + ) # Recently processed + record.status = "success" session.add(record) session.flush() - + # Create sitemap entry sitemap_record = SitemapInfo() sitemap_record.bibcode = bibcode sitemap_record.record_id = record.id - sitemap_record.sitemap_filename = 'sitemap_bib_valid.xml' + sitemap_record.sitemap_filename = "sitemap_bib_valid.xml" sitemap_record.update_flag = False session.add(sitemap_record) - + # Create invalid records (should be removed from sitemap) - statuses = ['solr-failed', 'retrying', 'solr-failed'] + statuses = ["solr-failed", "retrying", "solr-failed"] for i, bibcode in enumerate(invalid_bibcodes): record = Records() record.bibcode = bibcode @@ -892,117 +904,155 @@ def test_task_cleanup_invalid_sitemaps(self): record.status = statuses[i] session.add(record) session.flush() - + # Create sitemap entry sitemap_record = SitemapInfo() sitemap_record.bibcode = bibcode sitemap_record.record_id = record.id - sitemap_record.sitemap_filename = 'sitemap_bib_invalid.xml' + sitemap_record.sitemap_filename = "sitemap_bib_invalid.xml" sitemap_record.update_flag = False session.add(sitemap_record) - + session.commit() - + # Verify we have exactly 5 records final_count = session.query(SitemapInfo).count() - self.assertEqual(final_count, 5, "Should have exactly 5 sitemap records after setup") - + self.assertEqual( + final_count, 5, "Should have exactly 5 sitemap records after setup" + ) + # Execute cleanup with small batch size for testing - original_batch_size = self.app.conf.get('SITEMAP_BOOTSTRAP_BATCH_SIZE', 50000) - self.app.conf['SITEMAP_BOOTSTRAP_BATCH_SIZE'] = 2 # Small batch for testing - + original_batch_size = self.app.conf.get("SITEMAP_BOOTSTRAP_BATCH_SIZE", 50000) + self.app.conf["SITEMAP_BOOTSTRAP_BATCH_SIZE"] = 2 # Small batch for 
testing + try: - # Mock delete_sitemap_files - with patch.object(self.app, 'delete_sitemap_files') as mock_delete_files: + # Mock delete_sitemap_files + with patch.object(self.app, "delete_sitemap_files") as mock_delete_files: result = tasks.task_cleanup_invalid_sitemaps() finally: # Restore original batch size - self.app.conf['SITEMAP_BOOTSTRAP_BATCH_SIZE'] = original_batch_size - + self.app.conf["SITEMAP_BOOTSTRAP_BATCH_SIZE"] = original_batch_size + # Verify result structure and content self.assertIsInstance(result, dict, "Should return result dictionary") - self.assertIn('total_processed', result, "Should include total_processed count") - self.assertIn('invalid_removed', result, "Should include invalid_removed count") - self.assertIn('batches_processed', result, "Should include batches_processed count") - self.assertIn('files_regenerated', result, "Should include files_regenerated flag") - self.assertIn('files_flagged', result, "Should include files_flagged count") - + self.assertIn("total_processed", result, "Should include total_processed count") + self.assertIn("invalid_removed", result, "Should include invalid_removed count") + self.assertIn( + "batches_processed", result, "Should include batches_processed count" + ) + self.assertIn( + "files_regenerated", result, "Should include files_regenerated flag" + ) + self.assertIn("files_flagged", result, "Should include files_flagged count") + # Verify cleanup results - should have processed exactly our 5 records - self.assertEqual(result['total_processed'], 5, "Should have processed exactly 5 records") - self.assertEqual(result['invalid_removed'], 3, "Should have removed exactly 3 invalid records") - self.assertGreaterEqual(result['batches_processed'], 1, "Should have processed at least 1 batch") - self.assertTrue(result['files_regenerated'], "Should indicate files need regeneration") + self.assertEqual( + result["total_processed"], 5, "Should have processed exactly 5 records" + ) + self.assertEqual( + 
result["invalid_removed"], + 3, + "Should have removed exactly 3 invalid records", + ) + self.assertGreaterEqual( + result["batches_processed"], 1, "Should have processed at least 1 batch" + ) + self.assertTrue( + result["files_regenerated"], "Should indicate files need regeneration" + ) # files_flagged may be 0 if all invalid records were in files that became completely empty - + # Verify delete_sitemap_files was called to clean up empty files - self.assertTrue(mock_delete_files.called, "delete_sitemap_files should have been called") + self.assertTrue( + mock_delete_files.called, "delete_sitemap_files should have been called" + ) # Verify it was called with a non-empty set of files to delete call_args = mock_delete_files.call_args[0] files_to_delete = call_args[0] # First argument is the files_to_delete set - self.assertIsInstance(files_to_delete, set, "Should pass a set of files to delete") + self.assertIsInstance( + files_to_delete, set, "Should pass a set of files to delete" + ) self.assertEqual(len(files_to_delete), 1, "Should have files to delete") - + # Verify database state after cleanup with self.app.session_scope() as session: # Should have exactly 2 records remaining total_remaining = session.query(SitemapInfo).count() - self.assertEqual(total_remaining, 2, "Should have exactly 2 records remaining") - + self.assertEqual( + total_remaining, 2, "Should have exactly 2 records remaining" + ) + # Valid records should remain - valid_remaining = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(valid_bibcodes) - ).all() - self.assertEqual(len(valid_remaining), 2, "Valid records should remain in sitemap") - + valid_remaining = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(valid_bibcodes)) + .all() + ) + self.assertEqual( + len(valid_remaining), 2, "Valid records should remain in sitemap" + ) + # Invalid records should be removed - invalid_remaining = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(invalid_bibcodes) - 
).all() - self.assertEqual(len(invalid_remaining), 0, "Invalid records should be removed from sitemap") - + invalid_remaining = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(invalid_bibcodes)) + .all() + ) + self.assertEqual( + len(invalid_remaining), + 0, + "Invalid records should be removed from sitemap", + ) + # Verify remaining records have correct properties for sitemap_record in valid_remaining: self.assertIn(sitemap_record.bibcode, valid_bibcodes) - self.assertEqual(sitemap_record.sitemap_filename, 'sitemap_bib_valid.xml') - + self.assertEqual( + sitemap_record.sitemap_filename, "sitemap_bib_valid.xml" + ) + # Verify the Records table is unchanged (cleanup should only affect SitemapInfo) with self.app.session_scope() as session: - all_records = session.query(Records).filter( - Records.bibcode.in_(all_bibcodes) - ).all() + all_records = ( + session.query(Records).filter(Records.bibcode.in_(all_bibcodes)).all() + ) self.assertEqual(len(all_records), 5, "All Records should still exist") - + # Verify record statuses are unchanged valid_records = [r for r in all_records if r.bibcode in valid_bibcodes] invalid_records = [r for r in all_records if r.bibcode in invalid_bibcodes] - + for record in valid_records: - self.assertEqual(record.status, 'success') - + self.assertEqual(record.status, "success") + for record in invalid_records: - self.assertIn(record.status, ['solr-failed', 'retrying']) + self.assertIn(record.status, ["solr-failed", "retrying"]) def test_task_cleanup_invalid_sitemaps_with_file_flagging(self): """Test that cleanup correctly flags files for regeneration when some records remain""" - + # Setup: Create TWO files: # File 1 (mixed): Has both valid and invalid records - should be flagged when invalid ones removed # File 2 (invalid only): Has only invalid records - should be deleted entirely test_bibcodes = [ - '2023FlagTest1A', # Valid - will remain in file1 - '2023FlagTest2B', # Valid - will remain in file1 - '2023FlagTest3C', # 
Invalid - will be removed from file1 - '2023FlagTest4D', # Invalid - will be removed from file2 + "2023FlagTest1A", # Valid - will remain in file1 + "2023FlagTest2B", # Valid - will remain in file1 + "2023FlagTest3C", # Invalid - will be removed from file1 + "2023FlagTest4D", # Invalid - will be removed from file2 ] valid_bibcodes = test_bibcodes[:2] invalid_bibcodes = test_bibcodes[2:] - + with self.app.session_scope() as session: # Clean up any existing test data - session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023FlagTest%')).delete(synchronize_session=False) - session.query(Records).filter(Records.bibcode.like('2023FlagTest%')).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode.like("2023FlagTest%") + ).delete(synchronize_session=False) + session.query(Records).filter(Records.bibcode.like("2023FlagTest%")).delete( + synchronize_session=False + ) session.commit() - + # Create valid records (should remain in sitemap) for bibcode in valid_bibcodes: record = Records() @@ -1010,17 +1060,17 @@ def test_task_cleanup_invalid_sitemaps_with_file_flagging(self): record.bib_data = '{"title": "Valid Test Record"}' record.bib_data_updated = get_date() - timedelta(days=1) record.solr_processed = get_date() - timedelta(hours=12) - record.status = 'success' + record.status = "success" session.add(record) session.flush() - + sitemap_record = SitemapInfo() sitemap_record.bibcode = bibcode sitemap_record.record_id = record.id - sitemap_record.sitemap_filename = 'sitemap_bib_mixed.xml' # File 1 + sitemap_record.sitemap_filename = "sitemap_bib_mixed.xml" # File 1 sitemap_record.update_flag = False session.add(sitemap_record) - + # Create invalid records (should be removed from sitemap) for i, bibcode in enumerate(invalid_bibcodes): record = Records() @@ -1028,94 +1078,146 @@ def test_task_cleanup_invalid_sitemaps_with_file_flagging(self): record.bib_data = '{"title": "Invalid Test Record"}' record.bib_data_updated = get_date() 
- timedelta(days=1) record.solr_processed = get_date() - timedelta(days=2) - record.status = 'solr-failed' + record.status = "solr-failed" session.add(record) session.flush() - + sitemap_record = SitemapInfo() sitemap_record.bibcode = bibcode sitemap_record.record_id = record.id # First invalid goes to file1 (mixed), second to file2 (will be deleted) - sitemap_record.sitemap_filename = 'sitemap_bib_mixed.xml' if i == 0 else 'sitemap_bib_invalid_only.xml' + sitemap_record.sitemap_filename = ( + "sitemap_bib_mixed.xml" + if i == 0 + else "sitemap_bib_invalid_only.xml" + ) sitemap_record.update_flag = False session.add(sitemap_record) - + session.commit() - + # Verify initial state: 4 records, all in same file, none flagged - total_records = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.like('2023FlagTest%') - ).count() + total_records = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.like("2023FlagTest%")) + .count() + ) self.assertEqual(total_records, 4, "Should have 4 records initially") - - flagged_count = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.like('2023FlagTest%'), - SitemapInfo.update_flag == True - ).count() - self.assertEqual(flagged_count, 0, "Should have 0 flagged records initially") - + + flagged_count = ( + session.query(SitemapInfo) + .filter( + SitemapInfo.bibcode.like("2023FlagTest%"), + SitemapInfo.update_flag == True, + ) + .count() + ) + self.assertEqual( + flagged_count, 0, "Should have 0 flagged records initially" + ) + # Execute cleanup - original_batch_size = self.app.conf.get('SITEMAP_BOOTSTRAP_BATCH_SIZE', 50000) - self.app.conf['SITEMAP_BOOTSTRAP_BATCH_SIZE'] = 10 # Small batch - + original_batch_size = self.app.conf.get("SITEMAP_BOOTSTRAP_BATCH_SIZE", 50000) + self.app.conf["SITEMAP_BOOTSTRAP_BATCH_SIZE"] = 10 # Small batch + try: - with patch.object(self.app, 'delete_sitemap_files') as mock_delete_files: + with patch.object(self.app, "delete_sitemap_files") as mock_delete_files: result = 
tasks.task_cleanup_invalid_sitemaps() finally: - self.app.conf['SITEMAP_BOOTSTRAP_BATCH_SIZE'] = original_batch_size - + self.app.conf["SITEMAP_BOOTSTRAP_BATCH_SIZE"] = original_batch_size + # Verify cleanup results - self.assertEqual(result['total_processed'], 4, "Should have processed 4 records") - self.assertEqual(result['invalid_removed'], 2, "Should have removed 2 invalid records") - self.assertTrue(result['files_regenerated'], "Should indicate files need regeneration") - self.assertEqual(result['files_flagged'], 1, "Should have flagged exactly 1 file (mixed file)") - + self.assertEqual( + result["total_processed"], 4, "Should have processed 4 records" + ) + self.assertEqual( + result["invalid_removed"], 2, "Should have removed 2 invalid records" + ) + self.assertTrue( + result["files_regenerated"], "Should indicate files need regeneration" + ) + self.assertEqual( + result["files_flagged"], + 1, + "Should have flagged exactly 1 file (mixed file)", + ) + # Verify delete_sitemap_files was called for the file that became empty self.assertTrue(mock_delete_files.called, "Should have deleted the empty file") # Check that the empty file was deleted call_args = mock_delete_files.call_args[0] files_to_delete = call_args[0] - self.assertIn('sitemap_bib_invalid_only.xml', files_to_delete, "Should delete the file with only invalid records") - + self.assertIn( + "sitemap_bib_invalid_only.xml", + files_to_delete, + "Should delete the file with only invalid records", + ) + # Verify database state after cleanup with self.app.session_scope() as session: # Should have 2 valid records remaining - remaining_records = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.like('2023FlagTest%') - ).all() - self.assertEqual(len(remaining_records), 2, "Should have 2 remaining records") - + remaining_records = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.like("2023FlagTest%")) + .all() + ) + self.assertEqual( + len(remaining_records), 2, "Should have 2 remaining 
records" + ) + # At least one record should be flagged for update flagged_records = [r for r in remaining_records if r.update_flag] - self.assertGreaterEqual(len(flagged_records), 1, "At least one record should be flagged") - + self.assertGreaterEqual( + len(flagged_records), 1, "At least one record should be flagged" + ) + # All remaining records should be valid bibcodes remaining_bibcodes = [r.bibcode for r in remaining_records] - self.assertEqual(set(remaining_bibcodes), set(valid_bibcodes), "Only valid bibcodes should remain") - + self.assertEqual( + set(remaining_bibcodes), + set(valid_bibcodes), + "Only valid bibcodes should remain", + ) + # All remaining records should be in the mixed file (not the deleted one) filenames = set(r.sitemap_filename for r in remaining_records) - self.assertEqual(filenames, {'sitemap_bib_mixed.xml'}, "All remaining records should be in mixed file") - + self.assertEqual( + filenames, + {"sitemap_bib_mixed.xml"}, + "All remaining records should be in mixed file", + ) + # Clean up test data - session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023FlagTest%')).delete(synchronize_session=False) - session.query(Records).filter(Records.bibcode.like('2023FlagTest%')).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode.like("2023FlagTest%") + ).delete(synchronize_session=False) + session.query(Records).filter(Records.bibcode.like("2023FlagTest%")).delete( + synchronize_session=False + ) session.commit() def test_task_cleanup_invalid_sitemaps_orphaned_entries_cleanup(self): """Test cleanup of orphaned sitemap entries (part 2)""" - + # Setup orphaned entries - create records where some will become orphaned - test_bibcodes = ['2023OrphanCleanup1A', '2023OrphanCleanup2B', '2023ValidCleanup3C'] + test_bibcodes = [ + "2023OrphanCleanup1A", + "2023OrphanCleanup2B", + "2023ValidCleanup3C", + ] orphaned_bibcodes = test_bibcodes[:2] # First 2 will become orphaned - valid_bibcodes = test_bibcodes[2:] 
# Last 1 will remain valid - + valid_bibcodes = test_bibcodes[2:] # Last 1 will remain valid + with self.app.session_scope() as session: # Clean up any existing test data first - session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023%Cleanup%')).delete(synchronize_session=False) - session.query(Records).filter(Records.bibcode.like('2023%Cleanup%')).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode.like("2023%Cleanup%") + ).delete(synchronize_session=False) + session.query(Records).filter(Records.bibcode.like("2023%Cleanup%")).delete( + synchronize_session=False + ) session.commit() - + # Create Records and SitemapInfo for all bibcodes record_ids = {} for i, bibcode in enumerate(test_bibcodes): @@ -1123,126 +1225,166 @@ def test_task_cleanup_invalid_sitemaps_orphaned_entries_cleanup(self): record.bibcode = bibcode record.bib_data = '{"title": "Test Record"}' record.bib_data_updated = get_date() - timedelta(days=1) - record.status = 'success' + record.status = "success" session.add(record) session.flush() record_ids[bibcode] = record.id - + sitemap_record = SitemapInfo() sitemap_record.bibcode = bibcode sitemap_record.record_id = record.id - sitemap_record.sitemap_filename = 'sitemap_bib_cleanup_test.xml' + sitemap_record.sitemap_filename = "sitemap_bib_cleanup_test.xml" sitemap_record.update_flag = False session.add(sitemap_record) - + session.commit() - + # Delete Records entries for orphaned bibcodes - session.query(Records).filter(Records.bibcode.in_(orphaned_bibcodes)).delete(synchronize_session=False) + session.query(Records).filter( + Records.bibcode.in_(orphaned_bibcodes) + ).delete(synchronize_session=False) session.commit() - + # Execute cleanup with small batch size for testing - original_batch_size = self.app.conf.get('SITEMAP_BOOTSTRAP_BATCH_SIZE', 50000) - self.app.conf['SITEMAP_BOOTSTRAP_BATCH_SIZE'] = 2 # Small batch for testing - + original_batch_size = 
self.app.conf.get("SITEMAP_BOOTSTRAP_BATCH_SIZE", 50000) + self.app.conf["SITEMAP_BOOTSTRAP_BATCH_SIZE"] = 2 # Small batch for testing + try: # Mock delete_sitemap_files - with patch.object(self.app, 'delete_sitemap_files') as mock_delete_files: + with patch.object(self.app, "delete_sitemap_files") as mock_delete_files: result = tasks.task_cleanup_invalid_sitemaps() finally: # Restore original batch size - self.app.conf['SITEMAP_BOOTSTRAP_BATCH_SIZE'] = original_batch_size - + self.app.conf["SITEMAP_BOOTSTRAP_BATCH_SIZE"] = original_batch_size + # Verify cleanup results - should have processed exactly 3 records and removed 2 orphaned ones - self.assertEqual(result['total_processed'], 3, "Should have processed exactly 3 records") - self.assertEqual(result['invalid_removed'], 2, "Should have removed exactly 2 orphaned records") - self.assertGreaterEqual(result['batches_processed'], 1, "Should have processed at least 1 batch") - self.assertTrue(result['files_regenerated'], "Should indicate files need regeneration") - + self.assertEqual( + result["total_processed"], 3, "Should have processed exactly 3 records" + ) + self.assertEqual( + result["invalid_removed"], + 2, + "Should have removed exactly 2 orphaned records", + ) + self.assertGreaterEqual( + result["batches_processed"], 1, "Should have processed at least 1 batch" + ) + self.assertTrue( + result["files_regenerated"], "Should indicate files need regeneration" + ) + # Verify database state after cleanup with self.app.session_scope() as session: # Valid record should remain - valid_remaining = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(valid_bibcodes) - ).all() - self.assertEqual(len(valid_remaining), 1, "Valid record should remain in sitemap") - + valid_remaining = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(valid_bibcodes)) + .all() + ) + self.assertEqual( + len(valid_remaining), 1, "Valid record should remain in sitemap" + ) + # Orphaned records should be removed - 
orphaned_remaining = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(orphaned_bibcodes) - ).all() - self.assertEqual(len(orphaned_remaining), 0, "Orphaned records should be removed from sitemap") + orphaned_remaining = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(orphaned_bibcodes)) + .all() + ) + self.assertEqual( + len(orphaned_remaining), + 0, + "Orphaned records should be removed from sitemap", + ) def test_task_cleanup_invalid_sitemaps_orphaned_entries_verification(self): """Test verification that remaining entries are valid after orphan cleanup (part 3)""" - - test_bibcode = '2023OrphanVerify1A' - + + test_bibcode = "2023OrphanVerify1A" + with self.app.session_scope() as session: # Clean up any existing test data first - session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).delete(synchronize_session=False) - session.query(Records).filter(Records.bibcode == test_bibcode).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode == test_bibcode + ).delete(synchronize_session=False) + session.query(Records).filter(Records.bibcode == test_bibcode).delete( + synchronize_session=False + ) session.commit() - + # Create a valid Records and SitemapInfo entry record = Records() record.bibcode = test_bibcode record.bib_data = '{"title": "Valid Test Record"}' record.bib_data_updated = get_date() - timedelta(days=1) - record.status = 'success' + record.status = "success" session.add(record) session.flush() - + sitemap_record = SitemapInfo() sitemap_record.bibcode = test_bibcode sitemap_record.record_id = record.id - sitemap_record.sitemap_filename = 'sitemap_bib_verify_test.xml' + sitemap_record.sitemap_filename = "sitemap_bib_verify_test.xml" sitemap_record.update_flag = False session.add(sitemap_record) session.commit() - + # Execute cleanup - should not remove the valid entry - with patch('adsmp.tasks.task_update_sitemap_files.apply_async'), \ - patch.object(self.app, 
'delete_sitemap_files'): + with patch("adsmp.tasks.task_update_sitemap_files.apply_async"), patch.object( + self.app, "delete_sitemap_files" + ): tasks.task_cleanup_invalid_sitemaps() - + # Verify the valid entry still exists and has correct relationships with self.app.session_scope() as session: - remaining_sitemap = session.query(SitemapInfo).filter( - SitemapInfo.bibcode == test_bibcode - ).first() + remaining_sitemap = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode == test_bibcode) + .first() + ) self.assertIsNotNone(remaining_sitemap, "Valid sitemap entry should remain") - + # Verify the sitemap entry still has a valid Records entry - corresponding_record = session.query(Records).filter( - Records.id == remaining_sitemap.record_id - ).first() - self.assertIsNotNone(corresponding_record, "Sitemap entry should have valid Records entry") - self.assertEqual(corresponding_record.bibcode, remaining_sitemap.bibcode, "Bibcodes should match") + corresponding_record = ( + session.query(Records) + .filter(Records.id == remaining_sitemap.record_id) + .first() + ) + self.assertIsNotNone( + corresponding_record, "Sitemap entry should have valid Records entry" + ) + self.assertEqual( + corresponding_record.bibcode, + remaining_sitemap.bibcode, + "Bibcodes should match", + ) def test_task_cleanup_invalid_sitemaps_comprehensive_invalid_cases(self): """Test cleanup of all types of invalid records that should be removed from sitemaps""" - + # Test various invalid scenarios test_cases = [ # (bibcode, bib_data, status, description) - ('2023NoData..1..1A', None, 'success', 'No bib_data'), - ('2023EmptyData..1..1B', '', 'success', 'Empty bib_data'), - ('2023EmptyData2..1..1C', ' ', 'success', 'Whitespace-only bib_data'), - ('2023SolrFailed..1..1D', '{"title": "Test"}', 'solr-failed', 'SOLR failed status'), - ('2023Retrying..1..1E', '{"title": "Test"}', 'retrying', 'Retrying status'), + ("2023NoData..1..1A", None, "success", "No bib_data"), + ( + 
"2023SolrFailed..1..1D", + '{"title": "Test"}', + "solr-failed", + "SOLR failed status", + ), + ("2023Retrying..1..1E", '{"title": "Test"}', "retrying", "Retrying status"), ] - - valid_bibcode = '2023ValidRecord..1..1F' + + valid_bibcode = "2023ValidRecord..1..1F" # Add another valid bibcode in the same file as invalid ones to ensure file gets flagged not deleted - valid_bibcode_in_mixed_file = '2023ValidMixed..1..1G' - + valid_bibcode_in_mixed_file = "2023ValidMixed..1..1G" + with self.app.session_scope() as session: # Clean up all test data session.query(SitemapInfo).delete(synchronize_session=False) session.query(Records).delete(synchronize_session=False) session.commit() - + # Create invalid records - all in the same file for bibcode, bib_data, status, description in test_cases: record = Records() @@ -1252,292 +1394,411 @@ def test_task_cleanup_invalid_sitemaps_comprehensive_invalid_cases(self): record.status = status session.add(record) session.flush() - + # Create sitemap entry sitemap_record = SitemapInfo() sitemap_record.bibcode = bibcode sitemap_record.record_id = record.id - sitemap_record.sitemap_filename = 'sitemap_bib_invalid_comprehensive.xml' + sitemap_record.sitemap_filename = ( + "sitemap_bib_invalid_comprehensive.xml" + ) sitemap_record.update_flag = False session.add(sitemap_record) - + # Create a valid record in the same file as invalid ones (mixed file) mixed_record = Records() mixed_record.bibcode = valid_bibcode_in_mixed_file mixed_record.bib_data = '{"title": "Valid Mixed Record"}' mixed_record.bib_data_updated = get_date() - timedelta(days=1) - mixed_record.status = 'success' + mixed_record.status = "success" session.add(mixed_record) session.flush() - + mixed_sitemap_record = SitemapInfo() mixed_sitemap_record.bibcode = valid_bibcode_in_mixed_file mixed_sitemap_record.record_id = mixed_record.id - mixed_sitemap_record.sitemap_filename = 'sitemap_bib_invalid_comprehensive.xml' # Same file! 
+ mixed_sitemap_record.sitemap_filename = ( + "sitemap_bib_invalid_comprehensive.xml" # Same file! + ) mixed_sitemap_record.update_flag = False session.add(mixed_sitemap_record) - + # Create another valid record in a different file valid_record = Records() valid_record.bibcode = valid_bibcode valid_record.bib_data = '{"title": "Valid Record"}' valid_record.bib_data_updated = get_date() - timedelta(days=1) - valid_record.status = 'success' + valid_record.status = "success" session.add(valid_record) session.flush() - + valid_sitemap_record = SitemapInfo() valid_sitemap_record.bibcode = valid_bibcode valid_sitemap_record.record_id = valid_record.id - valid_sitemap_record.sitemap_filename = 'sitemap_bib_valid_comprehensive.xml' + valid_sitemap_record.sitemap_filename = ( + "sitemap_bib_valid_comprehensive.xml" + ) valid_sitemap_record.update_flag = False session.add(valid_sitemap_record) - + session.commit() - + # Verify setup total_records = session.query(SitemapInfo).count() - self.assertEqual(total_records, 7, "Should have 7 records (5 invalid + 2 valid)") - + self.assertEqual( + total_records, 5, "Should have 5 records (3 invalid + 2 valid)" + ) + # Execute cleanup - with patch.object(self.app, 'delete_sitemap_files') as mock_delete_files: + with patch.object(self.app, "delete_sitemap_files") as mock_delete_files: result = tasks.task_cleanup_invalid_sitemaps() - - # Verify all invalid records were removed - self.assertEqual(result['invalid_removed'], 5, "Should remove all 5 invalid records") - self.assertEqual(result['total_processed'], 7, "Should process all 7 records") - self.assertTrue(result['files_regenerated'], "Should indicate files need regeneration") - + + # Verify invalid records were removed + self.assertEqual( + result["invalid_removed"], + 3, + "Should remove 3 invalid records (None, solr-failed, retrying)", + ) + self.assertEqual(result["total_processed"], 5, "Should process all 5 records") + self.assertTrue( + result["files_regenerated"], "Should 
indicate files need regeneration" + ) + # Verify files were flagged for regeneration (1 file with mixed valid/invalid records) - self.assertGreaterEqual(result['files_flagged'], 1, "Should have flagged at least 1 file") - + self.assertGreaterEqual( + result["files_flagged"], 1, "Should have flagged at least 1 file" + ) + # Verify database state with self.app.session_scope() as session: - # Two valid records should remain + # Only 2 valid records should remain remaining_records = session.query(SitemapInfo).all() - self.assertEqual(len(remaining_records), 2, "Should have 2 remaining valid records") + self.assertEqual( + len(remaining_records), 2, "Should have 2 remaining valid records" + ) remaining_bibcodes = {r.bibcode for r in remaining_records} - self.assertEqual(remaining_bibcodes, {valid_bibcode, valid_bibcode_in_mixed_file}, "Both valid records should remain") - - # All invalid records should be gone - for bibcode, _, _, description in test_cases: - invalid_count = session.query(SitemapInfo).filter_by(bibcode=bibcode).count() - self.assertEqual(invalid_count, 0, f"Invalid record should be removed: {description}") + expected_bibcodes = {valid_bibcode, valid_bibcode_in_mixed_file} + self.assertEqual( + remaining_bibcodes, + expected_bibcodes, + "Both valid records should remain", + ) + + # All invalid records (None bib_data, solr-failed, retrying) should be removed + removed_bibcodes = { + "2023NoData..1..1A", + "2023SolrFailed..1..1D", + "2023Retrying..1..1E", + } + for bibcode in removed_bibcodes: + invalid_count = ( + session.query(SitemapInfo).filter_by(bibcode=bibcode).count() + ) + self.assertEqual( + invalid_count, 0, f"Invalid record should be removed: {bibcode}" + ) def test_delete_by_bibcode_marks_sitemap_files_for_regeneration(self): """Test that delete_by_bibcode properly marks affected sitemap files for regeneration""" - + # Create test records in the same sitemap file - test_bibcodes = ['2023DeleteRegen1A', '2023DeleteRegen2B', '2023DeleteRegen3C'] 
+ test_bibcodes = ["2023DeleteRegen1A", "2023DeleteRegen2B", "2023DeleteRegen3C"] bibcode_to_delete = test_bibcodes[0] # Will delete the first one remaining_bibcodes = test_bibcodes[1:] # These should be marked for update - + with self.app.session_scope() as session: # Clean up any existing test data - session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023DeleteRegen%')).delete(synchronize_session=False) - session.query(Records).filter(Records.bibcode.like('2023DeleteRegen%')).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode.like("2023DeleteRegen%") + ).delete(synchronize_session=False) + session.query(Records).filter( + Records.bibcode.like("2023DeleteRegen%") + ).delete(synchronize_session=False) session.commit() - + # Create records and sitemap entries in the same file for bibcode in test_bibcodes: record = Records() record.bibcode = bibcode record.bib_data = '{"title": "Test Record"}' record.bib_data_updated = get_date() - timedelta(days=1) - record.status = 'success' + record.status = "success" session.add(record) session.flush() - + sitemap_record = SitemapInfo() sitemap_record.bibcode = bibcode sitemap_record.record_id = record.id - sitemap_record.sitemap_filename = 'sitemap_bib_delete_test.xml' # Same file + sitemap_record.sitemap_filename = ( + "sitemap_bib_delete_test.xml" # Same file + ) sitemap_record.update_flag = False # Start with False session.add(sitemap_record) - + session.commit() - + # Verify setup: all records exist with update_flag=False - all_sitemap_records = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(test_bibcodes) - ).all() - self.assertEqual(len(all_sitemap_records), 3, "Should have 3 sitemap records") + all_sitemap_records = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(test_bibcodes)) + .all() + ) + self.assertEqual( + len(all_sitemap_records), 3, "Should have 3 sitemap records" + ) for record in all_sitemap_records: - 
self.assertFalse(record.update_flag, f"Update flag should start False for {record.bibcode}") - + self.assertFalse( + record.update_flag, + f"Update flag should start False for {record.bibcode}", + ) + # Delete one bibcode using delete_by_bibcode result = self.app.delete_by_bibcode(bibcode_to_delete) self.assertTrue(result, "delete_by_bibcode should succeed") - + # Verify the deletion and regeneration marking with self.app.session_scope() as session: # Deleted record should be gone from both tables - deleted_record = session.query(Records).filter_by(bibcode=bibcode_to_delete).first() + deleted_record = ( + session.query(Records).filter_by(bibcode=bibcode_to_delete).first() + ) self.assertIsNone(deleted_record, "Deleted Records entry should be gone") - - deleted_sitemap = session.query(SitemapInfo).filter_by(bibcode=bibcode_to_delete).first() - self.assertIsNone(deleted_sitemap, "Deleted SitemapInfo entry should be gone") - + + deleted_sitemap = ( + session.query(SitemapInfo).filter_by(bibcode=bibcode_to_delete).first() + ) + self.assertIsNone( + deleted_sitemap, "Deleted SitemapInfo entry should be gone" + ) + # Remaining records should exist and exactly one should be marked for update (one-row-per-file flagging) - remaining_sitemap_records = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(remaining_bibcodes) - ).all() - self.assertEqual(len(remaining_sitemap_records), 2, "Should have 2 remaining sitemap records") - + remaining_sitemap_records = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(remaining_bibcodes)) + .all() + ) + self.assertEqual( + len(remaining_sitemap_records), + 2, + "Should have 2 remaining sitemap records", + ) + flagged_count = sum(1 for r in remaining_sitemap_records if r.update_flag) - self.assertEqual(flagged_count, 1, "At least one remaining record should be marked for update") + self.assertEqual( + flagged_count, + 1, + "At least one remaining record should be marked for update", + ) for record in 
remaining_sitemap_records: - self.assertEqual(record.sitemap_filename, 'sitemap_bib_delete_test.xml', "Should be in same sitemap file") - + self.assertEqual( + record.sitemap_filename, + "sitemap_bib_delete_test.xml", + "Should be in same sitemap file", + ) + # Verify ChangeLog entry was created - changelog = session.query(ChangeLog).filter_by(key=f'bibcode:{bibcode_to_delete}').first() + changelog = ( + session.query(ChangeLog) + .filter_by(key=f"bibcode:{bibcode_to_delete}") + .first() + ) self.assertIsNotNone(changelog, "ChangeLog entry should be created") - self.assertEqual(changelog.type, 'deleted', "ChangeLog type should be 'deleted'") + self.assertEqual( + changelog.type, "deleted", "ChangeLog type should be 'deleted'" + ) def test_sitemap_file_regeneration_after_deletion_and_cleanup(self): """Test that sitemap files are correctly regenerated after deletion and cleanup operations""" - + # Create temporary sitemap directory temp_dir = tempfile.mkdtemp() - original_sitemap_dir = self.app.conf.get('SITEMAP_DIR') - self.app.conf['SITEMAP_DIR'] = temp_dir - + original_sitemap_dir = self.app.conf.get("SITEMAP_DIR") + self.app.conf["SITEMAP_DIR"] = temp_dir + try: # Test the core functionality we implemented: delete_by_bibcode marking files for regeneration - test_bibcodes = ['2023FileRegen1A', '2023FileRegen2B', '2023FileRegen3C'] + test_bibcodes = ["2023FileRegen1A", "2023FileRegen2B", "2023FileRegen3C"] bibcode_to_delete = test_bibcodes[0] remaining_bibcodes = test_bibcodes[1:] - + with self.app.session_scope() as session: # Clean up any existing test data - session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023FileRegen%')).delete(synchronize_session=False) - session.query(Records).filter(Records.bibcode.like('2023FileRegen%')).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode.like("2023FileRegen%") + ).delete(synchronize_session=False) + session.query(Records).filter( + Records.bibcode.like("2023FileRegen%") 
+ ).delete(synchronize_session=False) session.commit() - + # Create test records and sitemap entries for bibcode in test_bibcodes: record = Records() record.bibcode = bibcode record.bib_data = '{"title": "File Regeneration Test"}' record.bib_data_updated = get_date() - timedelta(days=1) - record.status = 'success' + record.status = "success" session.add(record) session.flush() - + sitemap_record = SitemapInfo() sitemap_record.bibcode = bibcode sitemap_record.record_id = record.id - sitemap_record.sitemap_filename = 'sitemap_bib_file_regen.xml' + sitemap_record.sitemap_filename = "sitemap_bib_file_regen.xml" sitemap_record.update_flag = False # Start with False session.add(sitemap_record) - + session.commit() - + # Configure sites for testing - sites_config = {'ads': {'name': 'ADS'}} - original_sites = self.app.conf.get('SITES') - self.app.conf['SITES'] = sites_config - + sites_config = {"ads": {"name": "ADS"}} + original_sites = self.app.conf.get("SITES") + self.app.conf["SITES"] = sites_config + # Create site directory - site_dir = os.path.join(temp_dir, 'ads') + site_dir = os.path.join(temp_dir, "ads") os.makedirs(site_dir, exist_ok=True) - + # STEP 1: Test delete_by_bibcode marks files for regeneration result = self.app.delete_by_bibcode(bibcode_to_delete) self.assertTrue(result, f"Should successfully delete {bibcode_to_delete}") - + # Verify exactly one remaining record is marked for update (one-row-per-file flagging) with self.app.session_scope() as session: - remaining_records = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(remaining_bibcodes) - ).all() - - self.assertEqual(len(remaining_records), 2, "Should have 2 remaining records") + remaining_records = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(remaining_bibcodes)) + .all() + ) + + self.assertEqual( + len(remaining_records), 2, "Should have 2 remaining records" + ) flagged_count = sum(1 for r in remaining_records if r.update_flag) - self.assertEqual(flagged_count, 1, 
"At least one record should be marked for update") - + self.assertEqual( + flagged_count, 1, "At least one record should be marked for update" + ) + # Get record IDs while still in session record_ids = [r.id for r in remaining_records] - + # STEP 2: Generate sitemap file to verify the deleted bibcode is excluded - tasks.task_generate_single_sitemap('sitemap_bib_file_regen.xml', record_ids) - + tasks.task_generate_single_sitemap("sitemap_bib_file_regen.xml", record_ids) + # STEP 3: Verify the generated file excludes the deleted bibcode - sitemap_file = os.path.join(site_dir, 'sitemap_bib_file_regen.xml') - self.assertTrue(os.path.exists(sitemap_file), "Sitemap file should be generated") - - with open(sitemap_file, 'r') as f: + sitemap_file = os.path.join(site_dir, "sitemap_bib_file_regen.xml") + self.assertTrue( + os.path.exists(sitemap_file), "Sitemap file should be generated" + ) + + with open(sitemap_file, "r") as f: content = f.read() - + # Should contain remaining records for bibcode in remaining_bibcodes: - self.assertIn(bibcode, content, f"Sitemap should contain remaining record {bibcode}") - + self.assertIn( + bibcode, + content, + f"Sitemap should contain remaining record {bibcode}", + ) + # Should NOT contain deleted record (this proves the bug fix works) - self.assertNotIn(bibcode_to_delete, content, f"Sitemap should NOT contain deleted record {bibcode_to_delete}") - + self.assertNotIn( + bibcode_to_delete, + content, + f"Sitemap should NOT contain deleted record {bibcode_to_delete}", + ) + # Verify basic XML structure - self.assertIn('', content, "Should have XML declaration") - self.assertIn('', + content, + "Should have XML declaration", + ) + self.assertIn("', content) - self.assertIn('', content) + self.assertIn( + '', + content, + ) for bibcode in test_bibcodes: - self.assertIn(bibcode, content, f"Bibcode {bibcode} should be in sitemap") - + self.assertIn( + bibcode, content, f"Bibcode {bibcode} should be in sitemap" + ) + # Verify update_flag was 
reset to False with self.app.session_scope() as session: - updated_records = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(test_bibcodes) - ).all() + updated_records = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(test_bibcodes)) + .all() + ) for record in updated_records: - self.assertFalse(record.update_flag, f"Update flag should be False for {record.bibcode}") - self.assertIsNotNone(record.filename_lastmoddate, f"Last mod date should be set for {record.bibcode}") - + self.assertFalse( + record.update_flag, + f"Update flag should be False for {record.bibcode}", + ) + self.assertIsNotNone( + record.filename_lastmoddate, + f"Last mod date should be set for {record.bibcode}", + ) + finally: # Cleanup - self.app.conf['SITEMAP_DIR'] = original_sitemap_dir + self.app.conf["SITEMAP_DIR"] = original_sitemap_dir if original_sites: - self.app.conf['SITES'] = original_sites - + self.app.conf["SITES"] = original_sites + try: shutil.rmtree(temp_dir) except OSError: @@ -2000,98 +2393,124 @@ def test_task_update_sitemap_files_full_workflow(self): def test_task_update_sitemap_files_after_record_deletion(self): """Test task_update_sitemap_files after records have been deleted (simulating cleanup scenario)""" - + # Create temporary sitemap directory temp_dir = tempfile.mkdtemp() - original_sitemap_dir = self.app.conf.get('SITEMAP_DIR') - self.app.conf['SITEMAP_DIR'] = temp_dir - + original_sitemap_dir = self.app.conf.get("SITEMAP_DIR") + self.app.conf["SITEMAP_DIR"] = temp_dir + try: # Setup: Create records, then delete some to simulate cleanup scenario - test_bibcodes = ['2023DeleteTest1A', '2023DeleteTest2B', '2023DeleteTest3C'] + test_bibcodes = ["2023DeleteTest1A", "2023DeleteTest2B", "2023DeleteTest3C"] remaining_bibcodes = test_bibcodes[1:] # Keep last 2 records - + with self.app.session_scope() as session: # Clean up any existing test data - 
session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023DeleteTest%')).delete(synchronize_session=False) - session.query(Records).filter(Records.bibcode.like('2023DeleteTest%')).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode.like("2023DeleteTest%") + ).delete(synchronize_session=False) + session.query(Records).filter( + Records.bibcode.like("2023DeleteTest%") + ).delete(synchronize_session=False) session.commit() - + # Create Records and SitemapInfo for remaining bibcodes only (simulating post-cleanup state) for bibcode in remaining_bibcodes: record = Records() record.bibcode = bibcode record.bib_data = f'{{"title": "Remaining Record", "year": 2023}}' record.bib_data_updated = get_date() - timedelta(days=1) - record.status = 'success' + record.status = "success" session.add(record) session.flush() - + sitemap_record = SitemapInfo() sitemap_record.bibcode = bibcode sitemap_record.record_id = record.id - sitemap_record.sitemap_filename = 'sitemap_bib_after_delete.xml' + sitemap_record.sitemap_filename = "sitemap_bib_after_delete.xml" sitemap_record.update_flag = True # Mark for regeneration session.add(sitemap_record) - + session.commit() - + # Configure sites - sites_config = {'ads': {'name': 'ADS'}} - original_sites = self.app.conf.get('SITES') - self.app.conf['SITES'] = sites_config - + sites_config = {"ads": {"name": "ADS"}} + original_sites = self.app.conf.get("SITES") + self.app.conf["SITES"] = sites_config + # Create site directory - site_dir = os.path.join(temp_dir, 'ads') + site_dir = os.path.join(temp_dir, "ads") os.makedirs(site_dir, exist_ok=True) - + # Execute the workflow synchronously with self.app.session_scope() as session: - files_to_generate = session.query(SitemapInfo.sitemap_filename).filter( - SitemapInfo.update_flag == True - ).distinct().all() - - # Generate each sitemap file + files_to_generate = ( + session.query(SitemapInfo.sitemap_filename) + .filter(SitemapInfo.update_flag == True) + 
.distinct() + .all() + ) + + # Generate each sitemap file for (filename,) in files_to_generate: with self.app.session_scope() as session: - record_ids = session.query(SitemapInfo.id).filter( - SitemapInfo.sitemap_filename == filename, - SitemapInfo.update_flag == True - ).all() + record_ids = ( + session.query(SitemapInfo.id) + .filter( + SitemapInfo.sitemap_filename == filename, + SitemapInfo.update_flag == True, + ) + .all() + ) record_ids = [r[0] for r in record_ids] - + tasks.task_generate_single_sitemap(filename, record_ids) - + # Generate the index tasks.task_generate_sitemap_index() - + # Verify sitemap file contains only remaining records - sitemap_file = os.path.join(temp_dir, 'ads', 'sitemap_bib_after_delete.xml') + sitemap_file = os.path.join(temp_dir, "ads", "sitemap_bib_after_delete.xml") self.assertTrue(os.path.exists(sitemap_file), "Sitemap file should exist") - - with open(sitemap_file, 'r') as f: + + with open(sitemap_file, "r") as f: content = f.read() # Should contain remaining bibcodes for bibcode in remaining_bibcodes: - self.assertIn(bibcode, content, f"Remaining bibcode {bibcode} should be in sitemap") + self.assertIn( + bibcode, + content, + f"Remaining bibcode {bibcode} should be in sitemap", + ) # Should NOT contain deleted bibcode - self.assertNotIn(test_bibcodes[0], content, f"Deleted bibcode {test_bibcodes[0]} should not be in sitemap") - + self.assertNotIn( + test_bibcodes[0], + content, + f"Deleted bibcode {test_bibcodes[0]} should not be in sitemap", + ) + # Verify update flags were reset with self.app.session_scope() as session: - updated_records = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(remaining_bibcodes) - ).all() - self.assertEqual(len(updated_records), 2, "Should have 2 remaining records") + updated_records = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(remaining_bibcodes)) + .all() + ) + self.assertEqual( + len(updated_records), 2, "Should have 2 remaining records" + ) for record in 
updated_records: - self.assertFalse(record.update_flag, f"Update flag should be False for {record.bibcode}") - + self.assertFalse( + record.update_flag, + f"Update flag should be False for {record.bibcode}", + ) + finally: # Cleanup - self.app.conf['SITEMAP_DIR'] = original_sitemap_dir + self.app.conf["SITEMAP_DIR"] = original_sitemap_dir if original_sites: - self.app.conf['SITES'] = original_sites - + self.app.conf["SITES"] = original_sites + try: shutil.rmtree(temp_dir) except OSError: @@ -2099,68 +2518,75 @@ def test_task_update_sitemap_files_after_record_deletion(self): def test_task_update_sitemap_files_no_updates_needed(self): """Test task_update_sitemap_files when no files need updating""" - + # Create temporary sitemap directory temp_dir = tempfile.mkdtemp() - original_sitemap_dir = self.app.conf.get('SITEMAP_DIR') - self.app.conf['SITEMAP_DIR'] = temp_dir - + original_sitemap_dir = self.app.conf.get("SITEMAP_DIR") + self.app.conf["SITEMAP_DIR"] = temp_dir + try: # Setup test data with NO update flags set - test_bibcodes = ['2023NoUpdateTest1A', '2023NoUpdateTest2B'] - + test_bibcodes = ["2023NoUpdateTest1A", "2023NoUpdateTest2B"] + with self.app.session_scope() as session: # Clean up any existing test data - session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023NoUpdateTest%')).delete(synchronize_session=False) - session.query(Records).filter(Records.bibcode.like('2023NoUpdateTest%')).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode.like("2023NoUpdateTest%") + ).delete(synchronize_session=False) + session.query(Records).filter( + Records.bibcode.like("2023NoUpdateTest%") + ).delete(synchronize_session=False) session.commit() - + # Create Records and SitemapInfo entries WITHOUT update flag for bibcode in test_bibcodes: record = Records() record.bibcode = bibcode record.bib_data = f'{{"title": "No Update Record", "year": 2023}}' record.bib_data_updated = get_date() - timedelta(days=1) - record.status = 
'success' + record.status = "success" session.add(record) session.flush() - + sitemap_record = SitemapInfo() sitemap_record.bibcode = bibcode sitemap_record.record_id = record.id - sitemap_record.sitemap_filename = 'sitemap_bib_no_update.xml' + sitemap_record.sitemap_filename = "sitemap_bib_no_update.xml" sitemap_record.update_flag = False # No update needed session.add(sitemap_record) - + session.commit() - + # Configure sites - sites_config = {'ads': {'name': 'ADS'}} - original_sites = self.app.conf.get('SITES') - self.app.conf['SITES'] = sites_config - + sites_config = {"ads": {"name": "ADS"}} + original_sites = self.app.conf.get("SITES") + self.app.conf["SITES"] = sites_config + # Create site directory - site_dir = os.path.join(temp_dir, 'ads') + site_dir = os.path.join(temp_dir, "ads") os.makedirs(site_dir, exist_ok=True) - + # Execute the workflow - should only regenerate index - with patch('adsmp.tasks.update_sitemap_index') as mock_update_index: + with patch("adsmp.tasks.update_sitemap_index") as mock_update_index: mock_update_index.return_value = True tasks.task_update_sitemap_files() - + # Verify no individual sitemap files were created (since no updates needed) - sitemap_file = os.path.join(temp_dir, 'ads', 'sitemap_bib_no_update.xml') - self.assertFalse(os.path.exists(sitemap_file), "No sitemap files should be generated when no updates needed") - + sitemap_file = os.path.join(temp_dir, "ads", "sitemap_bib_no_update.xml") + self.assertFalse( + os.path.exists(sitemap_file), + "No sitemap files should be generated when no updates needed", + ) + # Verify the index update function was called (even when no files need updating) mock_update_index.assert_called_once_with() - + finally: # Cleanup - self.app.conf['SITEMAP_DIR'] = original_sitemap_dir + self.app.conf["SITEMAP_DIR"] = original_sitemap_dir if original_sites: - self.app.conf['SITES'] = original_sites - + self.app.conf["SITES"] = original_sites + try: shutil.rmtree(temp_dir) except OSError: @@ 
-2168,126 +2594,154 @@ def test_task_update_sitemap_files_no_updates_needed(self): def test_task_update_sitemap_files_multiple_files(self): """Test task_update_sitemap_files with multiple sitemap files needing updates""" - + # Create temporary sitemap directory temp_dir = tempfile.mkdtemp() - original_sitemap_dir = self.app.conf.get('SITEMAP_DIR') - self.app.conf['SITEMAP_DIR'] = temp_dir - + original_sitemap_dir = self.app.conf.get("SITEMAP_DIR") + self.app.conf["SITEMAP_DIR"] = temp_dir + try: # Setup test data across multiple sitemap files - file1_bibcodes = ['2023MultiFile1A', '2023MultiFile1B'] - file2_bibcodes = ['2023MultiFile2A', '2023MultiFile2B'] + file1_bibcodes = ["2023MultiFile1A", "2023MultiFile1B"] + file2_bibcodes = ["2023MultiFile2A", "2023MultiFile2B"] all_bibcodes = file1_bibcodes + file2_bibcodes - + with self.app.session_scope() as session: # Clean up any existing test data - session.query(SitemapInfo).filter(SitemapInfo.bibcode.like('2023MultiFile%')).delete(synchronize_session=False) - session.query(Records).filter(Records.bibcode.like('2023MultiFile%')).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode.like("2023MultiFile%") + ).delete(synchronize_session=False) + session.query(Records).filter( + Records.bibcode.like("2023MultiFile%") + ).delete(synchronize_session=False) session.commit() - + # Create Records and SitemapInfo for file 1 for bibcode in file1_bibcodes: record = Records() record.bibcode = bibcode - record.bib_data = f'{{"title": "Multi File Record 1", "year": 2023}}' + record.bib_data = ( + f'{{"title": "Multi File Record 1", "year": 2023}}' + ) record.bib_data_updated = get_date() - timedelta(days=1) - record.status = 'success' + record.status = "success" session.add(record) session.flush() - + sitemap_record = SitemapInfo() sitemap_record.bibcode = bibcode sitemap_record.record_id = record.id - sitemap_record.sitemap_filename = 'sitemap_bib_multi_1.xml' + 
sitemap_record.sitemap_filename = "sitemap_bib_multi_1.xml" sitemap_record.update_flag = True session.add(sitemap_record) - + # Create Records and SitemapInfo for file 2 for bibcode in file2_bibcodes: record = Records() record.bibcode = bibcode - record.bib_data = f'{{"title": "Multi File Record 2", "year": 2023}}' + record.bib_data = ( + f'{{"title": "Multi File Record 2", "year": 2023}}' + ) record.bib_data_updated = get_date() - timedelta(days=1) - record.status = 'success' + record.status = "success" session.add(record) session.flush() - + sitemap_record = SitemapInfo() sitemap_record.bibcode = bibcode sitemap_record.record_id = record.id - sitemap_record.sitemap_filename = 'sitemap_bib_multi_2.xml' + sitemap_record.sitemap_filename = "sitemap_bib_multi_2.xml" sitemap_record.update_flag = True session.add(sitemap_record) - + session.commit() - + # Configure sites - sites_config = {'ads': {'name': 'ADS'}} - original_sites = self.app.conf.get('SITES') - self.app.conf['SITES'] = sites_config - + sites_config = {"ads": {"name": "ADS"}} + original_sites = self.app.conf.get("SITES") + self.app.conf["SITES"] = sites_config + # Create site directory - site_dir = os.path.join(temp_dir, 'ads') + site_dir = os.path.join(temp_dir, "ads") os.makedirs(site_dir, exist_ok=True) - + # Execute the workflow synchronously with self.app.session_scope() as session: - files_to_generate = session.query(SitemapInfo.sitemap_filename).filter( - SitemapInfo.update_flag == True - ).distinct().all() - + files_to_generate = ( + session.query(SitemapInfo.sitemap_filename) + .filter(SitemapInfo.update_flag == True) + .distinct() + .all() + ) + # Generate each sitemap file synchronously for (filename,) in files_to_generate: with self.app.session_scope() as session: - record_ids = session.query(SitemapInfo.id).filter( - SitemapInfo.sitemap_filename == filename, - SitemapInfo.update_flag == True - ).all() + record_ids = ( + session.query(SitemapInfo.id) + .filter( + SitemapInfo.sitemap_filename 
== filename, + SitemapInfo.update_flag == True, + ) + .all() + ) record_ids = [r[0] for r in record_ids] - + tasks.task_generate_single_sitemap(filename, record_ids) - + # Generate the index tasks.task_generate_sitemap_index() - + # Verify both sitemap files were created - file1_path = os.path.join(temp_dir, 'ads', 'sitemap_bib_multi_1.xml') - file2_path = os.path.join(temp_dir, 'ads', 'sitemap_bib_multi_2.xml') - - self.assertTrue(os.path.exists(file1_path), "First sitemap file should exist") - self.assertTrue(os.path.exists(file2_path), "Second sitemap file should exist") - + file1_path = os.path.join(temp_dir, "ads", "sitemap_bib_multi_1.xml") + file2_path = os.path.join(temp_dir, "ads", "sitemap_bib_multi_2.xml") + + self.assertTrue( + os.path.exists(file1_path), "First sitemap file should exist" + ) + self.assertTrue( + os.path.exists(file2_path), "Second sitemap file should exist" + ) + # Verify file contents - with open(file1_path, 'r') as f: + with open(file1_path, "r") as f: content1 = f.read() for bibcode in file1_bibcodes: self.assertIn(bibcode, content1, f"File 1 should contain {bibcode}") for bibcode in file2_bibcodes: - self.assertNotIn(bibcode, content1, f"File 1 should not contain {bibcode}") - - with open(file2_path, 'r') as f: + self.assertNotIn( + bibcode, content1, f"File 1 should not contain {bibcode}" + ) + + with open(file2_path, "r") as f: content2 = f.read() for bibcode in file2_bibcodes: self.assertIn(bibcode, content2, f"File 2 should contain {bibcode}") for bibcode in file1_bibcodes: - self.assertNotIn(bibcode, content2, f"File 2 should not contain {bibcode}") - + self.assertNotIn( + bibcode, content2, f"File 2 should not contain {bibcode}" + ) + # Verify all update flags were reset with self.app.session_scope() as session: - updated_records = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(all_bibcodes) - ).all() + updated_records = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(all_bibcodes)) + .all() + ) 
self.assertEqual(len(updated_records), 4, "Should have 4 total records") for record in updated_records: - self.assertFalse(record.update_flag, f"Update flag should be False for {record.bibcode}") - + self.assertFalse( + record.update_flag, + f"Update flag should be False for {record.bibcode}", + ) + finally: # Cleanup - self.app.conf['SITEMAP_DIR'] = original_sitemap_dir + self.app.conf["SITEMAP_DIR"] = original_sitemap_dir if original_sites: - self.app.conf['SITES'] = original_sites - + self.app.conf["SITES"] = original_sites + try: shutil.rmtree(temp_dir) except OSError: @@ -2295,34 +2749,42 @@ def test_task_update_sitemap_files_multiple_files(self): def test_task_generate_single_sitemap_multi_site(self): """Test generating sitemap files for multiple sites (ADS + SciX) with multiple records""" - + # Setup test data with multiple bibcodes to create multiple files test_bibcodes = [ - '2023Multi..1..1A', '2023Multi..1..2B', '2023Multi..1..3C', - '2023Multi..2..1D', '2023Multi..2..2E', '2023Multi..2..3F', - '2023Multi..3..1G', '2023Multi..3..2H', '2023Multi..3..3I' + "2023Multi..1..1A", + "2023Multi..1..2B", + "2023Multi..1..3C", + "2023Multi..2..1D", + "2023Multi..2..2E", + "2023Multi..2..3F", + "2023Multi..3..1G", + "2023Multi..3..2H", + "2023Multi..3..3I", ] - + # Override MAX_RECORDS_PER_SITEMAP to force multiple files - original_max_records = self.app.conf.get('MAX_RECORDS_PER_SITEMAP', 50000) - self.app.conf['MAX_RECORDS_PER_SITEMAP'] = 3 # Force 3 files with 3 records each - + original_max_records = self.app.conf.get("MAX_RECORDS_PER_SITEMAP", 50000) + self.app.conf[ + "MAX_RECORDS_PER_SITEMAP" + ] = 3 # Force 3 files with 3 records each + try: with self.app.session_scope() as session: # Track record IDs by filename for efficient file generation file_record_mapping = {} - + for i, bibcode in enumerate(test_bibcodes): record = Records() record.bibcode = bibcode record.bib_data = f'{{"title": "Multi-site Test {i+1}"}}' record.bib_data_updated = get_date() - 
timedelta(days=1) - record.status = 'success' + record.status = "success" session.add(record) session.flush() - + # Assign records to different sitemap files - filename = f'sitemap_bib_{(i // 3) + 1}.xml' # 3 records per file + filename = f"sitemap_bib_{(i // 3) + 1}.xml" # 3 records per file sitemap_record = SitemapInfo() sitemap_record.bibcode = bibcode sitemap_record.record_id = record.id @@ -2331,865 +2793,1274 @@ def test_task_generate_single_sitemap_multi_site(self): sitemap_record.update_flag = True session.add(sitemap_record) session.flush() - + # Group record IDs by filename if filename not in file_record_mapping: file_record_mapping[filename] = [] file_record_mapping[filename].append(sitemap_record.id) - + session.commit() - + with tempfile.TemporaryDirectory() as temp_dir: - self.app.conf['SITEMAP_DIR'] = temp_dir - + self.app.conf["SITEMAP_DIR"] = temp_dir + # Create site directories - ads_dir = os.path.join(temp_dir, 'ads') - scix_dir = os.path.join(temp_dir, 'scix') + ads_dir = os.path.join(temp_dir, "ads") + scix_dir = os.path.join(temp_dir, "scix") os.makedirs(ads_dir, exist_ok=True) os.makedirs(scix_dir, exist_ok=True) - + # Execute sitemap generation for each file using pre-collected mapping for filename, file_record_ids in file_record_mapping.items(): tasks.task_generate_single_sitemap(filename, file_record_ids) - + # Verify all files were created for both sites for filename in file_record_mapping.keys(): ads_file = os.path.join(ads_dir, filename) scix_file = os.path.join(scix_dir, filename) - - self.assertTrue(os.path.exists(ads_file), f"Should create ADS {filename}") - self.assertTrue(os.path.exists(scix_file), f"Should create SciX {filename}") - + + self.assertTrue( + os.path.exists(ads_file), f"Should create ADS {filename}" + ) + self.assertTrue( + os.path.exists(scix_file), f"Should create SciX {filename}" + ) + # Test file content validation for each file - with open(ads_file, 'r', encoding='utf-8') as f: + with open(ads_file, "r", 
encoding="utf-8") as f: ads_content = f.read() - with open(scix_file, 'r', encoding='utf-8') as f: + with open(scix_file, "r", encoding="utf-8") as f: scix_content = f.read() - + # Verify XML structure for both sites - self.assertIn('', ads_content, f"ADS {filename} should have XML declaration") - self.assertIn('', ads_content, f"ADS {filename} should have urlset element") - self.assertIn('', ads_content, f"ADS {filename} should close urlset element") - - self.assertIn('', scix_content, f"SciX {filename} should have XML declaration") - self.assertIn('', scix_content, f"SciX {filename} should have urlset element") - self.assertIn('', scix_content, f"SciX {filename} should close urlset element") - + self.assertIn( + '', + ads_content, + f"ADS {filename} should have XML declaration", + ) + self.assertIn( + '', + ads_content, + f"ADS {filename} should have urlset element", + ) + self.assertIn( + "", + ads_content, + f"ADS {filename} should close urlset element", + ) + + self.assertIn( + '', + scix_content, + f"SciX {filename} should have XML declaration", + ) + self.assertIn( + '', + scix_content, + f"SciX {filename} should have urlset element", + ) + self.assertIn( + "", + scix_content, + f"SciX {filename} should close urlset element", + ) + # Verify each file contains exactly 3 URL entries - ads_url_count = ads_content.count('') - scix_url_count = scix_content.count('') - self.assertEqual(ads_url_count, 3, f"ADS {filename} should contain exactly 3 URL entries") - self.assertEqual(scix_url_count, 3, f"SciX {filename} should contain exactly 3 URL entries") - + ads_url_count = ads_content.count("") + scix_url_count = scix_content.count("") + self.assertEqual( + ads_url_count, + 3, + f"ADS {filename} should contain exactly 3 URL entries", + ) + self.assertEqual( + scix_url_count, + 3, + f"SciX {filename} should contain exactly 3 URL entries", + ) + # Verify lastmod elements are present - self.assertIn('', ads_content, f"ADS {filename} should contain lastmod elements") - 
self.assertIn('', scix_content, f"SciX {filename} should contain lastmod elements") - + self.assertIn( + "", + ads_content, + f"ADS {filename} should contain lastmod elements", + ) + self.assertIn( + "", + scix_content, + f"SciX {filename} should contain lastmod elements", + ) + # Test specific bibcode content in files file_bibcode_mapping = { - 'sitemap_bib_1.xml': test_bibcodes[0:3], # First 3 bibcodes - 'sitemap_bib_2.xml': test_bibcodes[3:6], # Next 3 bibcodes - 'sitemap_bib_3.xml': test_bibcodes[6:9], # Last 3 bibcodes + "sitemap_bib_1.xml": test_bibcodes[0:3], # First 3 bibcodes + "sitemap_bib_2.xml": test_bibcodes[3:6], # Next 3 bibcodes + "sitemap_bib_3.xml": test_bibcodes[6:9], # Last 3 bibcodes } - + for filename, expected_bibcodes in file_bibcode_mapping.items(): ads_file = os.path.join(ads_dir, filename) scix_file = os.path.join(scix_dir, filename) - - with open(ads_file, 'r', encoding='utf-8') as f: + + with open(ads_file, "r", encoding="utf-8") as f: ads_content = f.read() - with open(scix_file, 'r', encoding='utf-8') as f: + with open(scix_file, "r", encoding="utf-8") as f: scix_content = f.read() - + # Verify each expected bibcode appears in the correct file for bibcode in expected_bibcodes: escaped_bibcode = html.escape(bibcode) - ads_url = f'https://ui.adsabs.harvard.edu/abs/{escaped_bibcode}' - scix_url = f'https://scixplorer.org/abs/{escaped_bibcode}' - - self.assertIn(f'{ads_url}', ads_content, f"ADS {filename} should contain URL for {bibcode}") - self.assertIn(f'{scix_url}', scix_content, f"SciX {filename} should contain URL for {bibcode}") - + ads_url = f"https://ui.adsabs.harvard.edu/abs/{escaped_bibcode}" + scix_url = f"https://scixplorer.org/abs/{escaped_bibcode}" + + self.assertIn( + f"{ads_url}", + ads_content, + f"ADS {filename} should contain URL for {bibcode}", + ) + self.assertIn( + f"{scix_url}", + scix_content, + f"SciX {filename} should contain URL for {bibcode}", + ) + # Verify total record distribution across all files 
total_ads_urls = 0 total_scix_urls = 0 for filename in file_record_mapping.keys(): - with open(os.path.join(ads_dir, filename), 'r') as f: - total_ads_urls += f.read().count('') - with open(os.path.join(scix_dir, filename), 'r') as f: - total_scix_urls += f.read().count('') - - self.assertEqual(total_ads_urls, 9, "Total ADS URLs across all files should be 9") - self.assertEqual(total_scix_urls, 9, "Total SciX URLs across all files should be 9") - + with open(os.path.join(ads_dir, filename), "r") as f: + total_ads_urls += f.read().count("") + with open(os.path.join(scix_dir, filename), "r") as f: + total_scix_urls += f.read().count("") + + self.assertEqual( + total_ads_urls, 9, "Total ADS URLs across all files should be 9" + ) + self.assertEqual( + total_scix_urls, 9, "Total SciX URLs across all files should be 9" + ) + finally: # Restore original configuration - self.app.conf['MAX_RECORDS_PER_SITEMAP'] = original_max_records + self.app.conf["MAX_RECORDS_PER_SITEMAP"] = original_max_records def test_task_update_robots_files_creation(self): """Test robots.txt file creation for multiple sites with content validation""" - + with tempfile.TemporaryDirectory() as temp_dir: - self.app.conf['SITEMAP_DIR'] = temp_dir - + self.app.conf["SITEMAP_DIR"] = temp_dir + # Create site directories - ads_dir = os.path.join(temp_dir, 'ads') - scix_dir = os.path.join(temp_dir, 'scix') + ads_dir = os.path.join(temp_dir, "ads") + scix_dir = os.path.join(temp_dir, "scix") os.makedirs(ads_dir, exist_ok=True) os.makedirs(scix_dir, exist_ok=True) - + # Execute robots.txt update - + result = update_robots_files(True) - + # Verify function completed self.assertTrue(isinstance(result, bool), "Should return boolean result") - + # Verify robots.txt files were created for both sites - ads_robots = os.path.join(ads_dir, 'robots.txt') - scix_robots = os.path.join(scix_dir, 'robots.txt') - self.assertTrue(os.path.exists(ads_robots), "Should create ADS robots.txt file") - 
self.assertTrue(os.path.exists(scix_robots), "Should create SciX robots.txt file") - + ads_robots = os.path.join(ads_dir, "robots.txt") + scix_robots = os.path.join(scix_dir, "robots.txt") + self.assertTrue( + os.path.exists(ads_robots), "Should create ADS robots.txt file" + ) + self.assertTrue( + os.path.exists(scix_robots), "Should create SciX robots.txt file" + ) + # Test ADS robots.txt content - with open(ads_robots, 'r', encoding='utf-8') as f: + with open(ads_robots, "r", encoding="utf-8") as f: ads_robots_content = f.read() - + # Verify ADS robots.txt content - self.assertIn('User-agent: *', ads_robots_content, "ADS robots.txt should contain User-agent directive") - self.assertIn('Sitemap: https://ui.adsabs.harvard.edu/sitemap/sitemap_index.xml', ads_robots_content, "ADS robots.txt should contain sitemap URL") - self.assertIn('Disallow: /abs/', ads_robots_content, "ADS robots.txt should contain disallow directives") - + self.assertIn( + "User-agent: *", + ads_robots_content, + "ADS robots.txt should contain User-agent directive", + ) + self.assertIn( + "Sitemap: https://ui.adsabs.harvard.edu/sitemap/sitemap_index.xml", + ads_robots_content, + "ADS robots.txt should contain sitemap URL", + ) + self.assertIn( + "Disallow: /abs/", + ads_robots_content, + "ADS robots.txt should contain disallow directives", + ) + # Test SciX robots.txt content - with open(scix_robots, 'r', encoding='utf-8') as f: + with open(scix_robots, "r", encoding="utf-8") as f: scix_robots_content = f.read() - # Verify SciX robots.txt content - self.assertIn('User-agent: *', scix_robots_content, "SciX robots.txt should contain User-agent directive") - self.assertIn('Sitemap: https://scixplorer.org/sitemap/sitemap_index.xml', scix_robots_content, "SciX robots.txt should contain sitemap URL") - self.assertIn('Disallow: /abs/', scix_robots_content, "SciX robots.txt should contain disallow directives") - + self.assertIn( + "User-agent: *", + scix_robots_content, + "SciX robots.txt should 
contain User-agent directive", + ) + self.assertIn( + "Sitemap: https://scixplorer.org/sitemap/sitemap_index.xml", + scix_robots_content, + "SciX robots.txt should contain sitemap URL", + ) + self.assertIn( + "Disallow: /abs/", + scix_robots_content, + "SciX robots.txt should contain disallow directives", + ) + # Verify robots.txt files are not empty - self.assertGreater(len(ads_robots_content.strip()), 0, "ADS robots.txt should not be empty") - self.assertGreater(len(scix_robots_content.strip()), 0, "SciX robots.txt should not be empty") - + self.assertGreater( + len(ads_robots_content.strip()), 0, "ADS robots.txt should not be empty" + ) + self.assertGreater( + len(scix_robots_content.strip()), + 0, + "SciX robots.txt should not be empty", + ) + # Verify proper line endings and format - self.assertTrue(ads_robots_content.endswith('\n'), "ADS robots.txt should end with newline") - self.assertTrue(scix_robots_content.endswith('\n'), "SciX robots.txt should end with newline") - + self.assertTrue( + ads_robots_content.endswith("\n"), + "ADS robots.txt should end with newline", + ) + self.assertTrue( + scix_robots_content.endswith("\n"), + "SciX robots.txt should end with newline", + ) + # Verify correct sitemap URLs - should match production URLs - self.assertIn('Sitemap: https://ui.adsabs.harvard.edu/sitemap/sitemap_index.xml', ads_robots_content, - "ADS robots.txt should contain correct sitemap URL") - self.assertIn('Sitemap: https://scixplorer.org/sitemap/sitemap_index.xml', scix_robots_content, - "SciX robots.txt should contain correct sitemap URL") - + self.assertIn( + "Sitemap: https://ui.adsabs.harvard.edu/sitemap/sitemap_index.xml", + ads_robots_content, + "ADS robots.txt should contain correct sitemap URL", + ) + self.assertIn( + "Sitemap: https://scixplorer.org/sitemap/sitemap_index.xml", + scix_robots_content, + "SciX robots.txt should contain correct sitemap URL", + ) + # Verify we have the expected user agents (robots.txt contains intentional duplicates 
for different agents) - self.assertIn('User-agent: Googlebot', ads_robots_content, "Should have Googlebot directives") - self.assertIn('User-agent: msnbot', ads_robots_content, "Should have msnbot directives") - self.assertIn('User-agent: Slurp', ads_robots_content, "Should have Slurp directives") - self.assertIn('User-agent: *', ads_robots_content, "Should have wildcard user-agent directives") - + self.assertIn( + "User-agent: Googlebot", + ads_robots_content, + "Should have Googlebot directives", + ) + self.assertIn( + "User-agent: msnbot", + ads_robots_content, + "Should have msnbot directives", + ) + self.assertIn( + "User-agent: Slurp", ads_robots_content, "Should have Slurp directives" + ) + self.assertIn( + "User-agent: *", + ads_robots_content, + "Should have wildcard user-agent directives", + ) + # Same for SciX - self.assertIn('User-agent: Googlebot', scix_robots_content, "Should have Googlebot directives") - self.assertIn('User-agent: msnbot', scix_robots_content, "Should have msnbot directives") - self.assertIn('User-agent: Slurp', scix_robots_content, "Should have Slurp directives") - self.assertIn('User-agent: *', scix_robots_content, "Should have wildcard user-agent directives") + self.assertIn( + "User-agent: Googlebot", + scix_robots_content, + "Should have Googlebot directives", + ) + self.assertIn( + "User-agent: msnbot", + scix_robots_content, + "Should have msnbot directives", + ) + self.assertIn( + "User-agent: Slurp", scix_robots_content, "Should have Slurp directives" + ) + self.assertIn( + "User-agent: *", + scix_robots_content, + "Should have wildcard user-agent directives", + ) def test_task_update_sitemap_index_generation(self): """Test comprehensive sitemap index generation with actual files and database records""" - + with tempfile.TemporaryDirectory() as temp_dir: - self.app.conf['SITEMAP_DIR'] = temp_dir - + self.app.conf["SITEMAP_DIR"] = temp_dir + # Create test records and sitemap entries in database - test_bibcodes = 
['2023Index..1..1A', '2023Index..1..2B', '2023Index..1..3C'] - + test_bibcodes = ["2023Index..1..1A", "2023Index..1..2B", "2023Index..1..3C"] + with self.app.session_scope() as session: # Clear any existing data session.query(SitemapInfo).delete(synchronize_session=False) session.query(Records).delete(synchronize_session=False) - + # Create Records entries for i, bibcode in enumerate(test_bibcodes): record = Records( bibcode=bibcode, bib_data='{"title": "Test Title"}', bib_data_updated=get_date() - timedelta(days=1), - status='success' + status="success", ) session.add(record) - + session.commit() - + # Get record IDs for sitemap entries - records = session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all() + records = ( + session.query(Records) + .filter(Records.bibcode.in_(test_bibcodes)) + .all() + ) record_map = {r.bibcode: r.id for r in records} - + # Create SitemapInfo entries with different filenames sitemap_entries = [ - {'bibcode': test_bibcodes[0], 'filename': 'sitemap_bib_1.xml', 'record_id': record_map[test_bibcodes[0]]}, - {'bibcode': test_bibcodes[1], 'filename': 'sitemap_bib_1.xml', 'record_id': record_map[test_bibcodes[1]]}, - {'bibcode': test_bibcodes[2], 'filename': 'sitemap_bib_2.xml', 'record_id': record_map[test_bibcodes[2]]}, + { + "bibcode": test_bibcodes[0], + "filename": "sitemap_bib_1.xml", + "record_id": record_map[test_bibcodes[0]], + }, + { + "bibcode": test_bibcodes[1], + "filename": "sitemap_bib_1.xml", + "record_id": record_map[test_bibcodes[1]], + }, + { + "bibcode": test_bibcodes[2], + "filename": "sitemap_bib_2.xml", + "record_id": record_map[test_bibcodes[2]], + }, ] - + for entry in sitemap_entries: sitemap_info = SitemapInfo( - bibcode=entry['bibcode'], - record_id=entry['record_id'], - sitemap_filename=entry['filename'], + bibcode=entry["bibcode"], + record_id=entry["record_id"], + sitemap_filename=entry["filename"], bib_data_updated=get_date() - timedelta(days=1), filename_lastmoddate=get_date() - 
timedelta(hours=1), - update_flag=False + update_flag=False, ) session.add(sitemap_info) - + session.commit() - + # Create site directories and actual sitemap files - sites_config = self.app.conf.get('SITES', {}) - expected_filenames = ['sitemap_bib_1.xml', 'sitemap_bib_2.xml'] - + sites_config = self.app.conf.get("SITES", {}) + expected_filenames = ["sitemap_bib_1.xml", "sitemap_bib_2.xml"] + for site_key in sites_config.keys(): site_dir = os.path.join(temp_dir, site_key) os.makedirs(site_dir, exist_ok=True) - + # Create actual sitemap files with content for filename in expected_filenames: sitemap_path = os.path.join(site_dir, filename) - with open(sitemap_path, 'w', encoding='utf-8') as f: - f.write(f'\n{filename}') - + with open(sitemap_path, "w", encoding="utf-8") as f: + f.write( + f'\n{filename}' + ) + # Execute sitemap index update result = update_sitemap_index() - + # Verify function completed successfully - self.assertTrue(result, "update_sitemap_index should return True on success") - + self.assertTrue( + result, "update_sitemap_index should return True on success" + ) + # Verify sitemap_index.xml files were created for each site for site_key, site_config in sites_config.items(): site_dir = os.path.join(temp_dir, site_key) - index_path = os.path.join(site_dir, 'sitemap_index.xml') - - self.assertTrue(os.path.exists(index_path), - f"sitemap_index.xml should be created for site {site_key}") - + index_path = os.path.join(site_dir, "sitemap_index.xml") + + self.assertTrue( + os.path.exists(index_path), + f"sitemap_index.xml should be created for site {site_key}", + ) + # Verify index file content - with open(index_path, 'r', encoding='utf-8') as f: + with open(index_path, "r", encoding="utf-8") as f: index_content = f.read() - + # Should contain XML structure - self.assertIn('', index_content, - "Index should contain XML declaration") - self.assertIn('', index_content, - "Index should close sitemapindex element") - + self.assertIn( + '', + index_content, + 
"Index should contain XML declaration", + ) + self.assertIn( + "", + index_content, + "Index should close sitemapindex element", + ) + # Should contain entries for each sitemap file that exists (production URL structure) - sitemap_base_url = site_config.get('sitemap_url', 'https://ui.adsabs.harvard.edu/sitemap') + sitemap_base_url = site_config.get( + "sitemap_url", "https://ui.adsabs.harvard.edu/sitemap" + ) for filename in expected_filenames: expected_url = f"{sitemap_base_url}/{filename}" - self.assertIn(f'{html.escape(expected_url)}', index_content, - f"Index should reference {filename} with correct URL") - self.assertIn('', index_content, - "Index should contain lastmod elements") - + self.assertIn( + f"{html.escape(expected_url)}", + index_content, + f"Index should reference {filename} with correct URL", + ) + self.assertIn( + "", + index_content, + "Index should contain lastmod elements", + ) + # Verify we have the expected number of sitemap entries (2 bib files + 1 static) - sitemap_count = index_content.count('') - self.assertEqual(sitemap_count, 3, - f"Index should contain exactly 3 sitemap entries (2 bib + 1 static), found {sitemap_count}") - + sitemap_count = index_content.count("") + self.assertEqual( + sitemap_count, + 3, + f"Index should contain exactly 3 sitemap entries (2 bib + 1 static), found {sitemap_count}", + ) + # Test cleanup with self.app.session_scope() as session: - session.query(SitemapInfo).filter(SitemapInfo.bibcode.in_(test_bibcodes)).delete(synchronize_session=False) - session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode.in_(test_bibcodes) + ).delete(synchronize_session=False) + session.query(Records).filter( + Records.bibcode.in_(test_bibcodes) + ).delete(synchronize_session=False) session.commit() - + def test_task_update_sitemap_index_empty_database(self): """Test sitemap index generation when no sitemap files exist in 
database""" - + with tempfile.TemporaryDirectory() as temp_dir: - self.app.conf['SITEMAP_DIR'] = temp_dir - + self.app.conf["SITEMAP_DIR"] = temp_dir + # Ensure database has no sitemap entries with self.app.session_scope() as session: session.query(SitemapInfo).delete(synchronize_session=False) session.commit() - + # Execute sitemap index update result = update_sitemap_index() - + # Should still succeed (generates empty index files) - self.assertTrue(result, "update_sitemap_index should return True even with empty database") - + self.assertTrue( + result, + "update_sitemap_index should return True even with empty database", + ) + # Verify empty sitemap_index.xml files were created - sites_config = self.app.conf.get('SITES', {}) + sites_config = self.app.conf.get("SITES", {}) for site_key in sites_config.keys(): site_dir = os.path.join(temp_dir, site_key) - index_path = os.path.join(site_dir, 'sitemap_index.xml') - - self.assertTrue(os.path.exists(index_path), - f"Empty sitemap_index.xml should be created for site {site_key}") - + index_path = os.path.join(site_dir, "sitemap_index.xml") + + self.assertTrue( + os.path.exists(index_path), + f"Empty sitemap_index.xml should be created for site {site_key}", + ) + # Verify empty index file content - with open(index_path, 'r', encoding='utf-8') as f: + with open(index_path, "r", encoding="utf-8") as f: index_content = f.read() - + # Should contain XML structure but no sitemap entries - self.assertIn('', index_content, - "Empty index should contain XML declaration") - self.assertIn('', index_content, - "Empty index should close sitemapindex element") - + self.assertIn( + '', + index_content, + "Empty index should contain XML declaration", + ) + self.assertIn( + "", + index_content, + "Empty index should close sitemapindex element", + ) + # Should contain only static sitemap entry (1 entry) - sitemap_count = index_content.count('') - self.assertEqual(sitemap_count, 1, - f"Empty index should contain only static sitemap 
entry, found {sitemap_count}") - + sitemap_count = index_content.count("") + self.assertEqual( + sitemap_count, + 1, + f"Empty index should contain only static sitemap entry, found {sitemap_count}", + ) + def test_task_update_sitemap_index_missing_files(self): """Test sitemap index generation when database has entries but physical files don't exist""" - + with tempfile.TemporaryDirectory() as temp_dir: - self.app.conf['SITEMAP_DIR'] = temp_dir - + self.app.conf["SITEMAP_DIR"] = temp_dir + # Create database entries but no physical files - test_bibcode = '2023Missing..1..1A' - + test_bibcode = "2023Missing..1..1A" + with self.app.session_scope() as session: # Clear existing data session.query(SitemapInfo).delete(synchronize_session=False) session.query(Records).delete(synchronize_session=False) - + # Create a Record entry record = Records( bibcode=test_bibcode, bib_data='{"title": "Test Title"}', bib_data_updated=get_date() - timedelta(days=1), - status='success' + status="success", ) session.add(record) session.commit() - + # Create SitemapInfo entry sitemap_info = SitemapInfo( bibcode=test_bibcode, record_id=record.id, - sitemap_filename='sitemap_bib_missing.xml', + sitemap_filename="sitemap_bib_missing.xml", bib_data_updated=get_date() - timedelta(days=1), filename_lastmoddate=get_date() - timedelta(hours=1), - update_flag=False + update_flag=False, ) session.add(sitemap_info) session.commit() - + # Create site directories but NO sitemap files - sites_config = self.app.conf.get('SITES', {}) + sites_config = self.app.conf.get("SITES", {}) for site_key in sites_config.keys(): site_dir = os.path.join(temp_dir, site_key) os.makedirs(site_dir, exist_ok=True) # Deliberately NOT creating the sitemap_bib_missing.xml file - + # Execute sitemap index update result = update_sitemap_index() - + # Should still succeed - self.assertTrue(result, "update_sitemap_index should return True even when files are missing") - + self.assertTrue( + result, + "update_sitemap_index should 
return True even when files are missing", + ) + # Verify empty sitemap_index.xml files were created (no entries since files don't exist) for site_key in sites_config.keys(): site_dir = os.path.join(temp_dir, site_key) - index_path = os.path.join(site_dir, 'sitemap_index.xml') - - self.assertTrue(os.path.exists(index_path), - f"sitemap_index.xml should be created for site {site_key}") - + index_path = os.path.join(site_dir, "sitemap_index.xml") + + self.assertTrue( + os.path.exists(index_path), + f"sitemap_index.xml should be created for site {site_key}", + ) + # Verify index file has no entries (since physical files don't exist) - with open(index_path, 'r', encoding='utf-8') as f: + with open(index_path, "r", encoding="utf-8") as f: index_content = f.read() - - sitemap_count = index_content.count('') - self.assertEqual(sitemap_count, 1, - f"Index should contain only static sitemap when physical files missing, found {sitemap_count}") - + + sitemap_count = index_content.count("") + self.assertEqual( + sitemap_count, + 1, + f"Index should contain only static sitemap when physical files missing, found {sitemap_count}", + ) + # Test cleanup with self.app.session_scope() as session: - session.query(SitemapInfo).filter(SitemapInfo.bibcode == test_bibcode).delete(synchronize_session=False) - session.query(Records).filter(Records.bibcode == test_bibcode).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode == test_bibcode + ).delete(synchronize_session=False) + session.query(Records).filter(Records.bibcode == test_bibcode).delete( + synchronize_session=False + ) session.commit() def test_task_generate_sitemap_index(self): """Test the Celery task wrapper for sitemap index generation""" - + with tempfile.TemporaryDirectory() as temp_dir: - self.app.conf['SITEMAP_DIR'] = temp_dir - + self.app.conf["SITEMAP_DIR"] = temp_dir + # Create test data in database first (required for sitemap index generation) - test_bibcodes = 
['2023TaskIndex..1..1A', '2023TaskIndex..1..2B', '2023TaskIndex..1..3C'] + test_bibcodes = [ + "2023TaskIndex..1..1A", + "2023TaskIndex..1..2B", + "2023TaskIndex..1..3C", + ] # Use production-like filenames (compressed format with zero-padded numbers) - sample_sitemaps = ['sitemap_bib.0001.xml.gz', 'sitemap_bib.0002.xml.gz', 'sitemap_bib.0003.xml.gz'] - + sample_sitemaps = [ + "sitemap_bib.0001.xml.gz", + "sitemap_bib.0002.xml.gz", + "sitemap_bib.0003.xml.gz", + ] + with self.app.session_scope() as session: # Clear existing data session.query(SitemapInfo).delete(synchronize_session=False) session.query(Records).delete(synchronize_session=False) - + # Create Records entries for i, bibcode in enumerate(test_bibcodes): record = Records( bibcode=bibcode, bib_data='{"title": "Test Title"}', bib_data_updated=get_date() - timedelta(days=1), - status='success' + status="success", ) session.add(record) - + session.commit() - + # Get record IDs for sitemap entries - records = session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).all() + records = ( + session.query(Records) + .filter(Records.bibcode.in_(test_bibcodes)) + .all() + ) record_map = {r.bibcode: r.id for r in records} - + # Create SitemapInfo entries for different filenames sitemap_mappings = [ - {'bibcode': test_bibcodes[0], 'filename': sample_sitemaps[0]}, - {'bibcode': test_bibcodes[1], 'filename': sample_sitemaps[1]}, - {'bibcode': test_bibcodes[2], 'filename': sample_sitemaps[2]}, + {"bibcode": test_bibcodes[0], "filename": sample_sitemaps[0]}, + {"bibcode": test_bibcodes[1], "filename": sample_sitemaps[1]}, + {"bibcode": test_bibcodes[2], "filename": sample_sitemaps[2]}, ] - + for mapping in sitemap_mappings: sitemap_info = SitemapInfo( - bibcode=mapping['bibcode'], - record_id=record_map[mapping['bibcode']], - sitemap_filename=mapping['filename'], + bibcode=mapping["bibcode"], + record_id=record_map[mapping["bibcode"]], + sitemap_filename=mapping["filename"], bib_data_updated=get_date() - 
timedelta(days=1), filename_lastmoddate=get_date() - timedelta(hours=1), - update_flag=False + update_flag=False, ) session.add(sitemap_info) - + session.commit() - + # Create site directories and physical sitemap files - ads_dir = os.path.join(temp_dir, 'ads') - scix_dir = os.path.join(temp_dir, 'scix') + ads_dir = os.path.join(temp_dir, "ads") + scix_dir = os.path.join(temp_dir, "scix") os.makedirs(ads_dir, exist_ok=True) os.makedirs(scix_dir, exist_ok=True) - + for filename in sample_sitemaps: # Create sample XML files for both sites - with open(os.path.join(ads_dir, filename), 'w') as f: + with open(os.path.join(ads_dir, filename), "w") as f: f.write('') - with open(os.path.join(scix_dir, filename), 'w') as f: + with open(os.path.join(scix_dir, filename), "w") as f: f.write('') - + # Execute the Celery task try: tasks.task_generate_sitemap_index() success = True except Exception as e: success = False - + # Verify task executed without errors self.assertTrue(success, "Task should execute without errors") - + # Verify sitemap index files were created for both sites - ads_index = os.path.join(ads_dir, 'sitemap_index.xml') - scix_index = os.path.join(scix_dir, 'sitemap_index.xml') - self.assertTrue(os.path.exists(ads_index), "Should create ADS sitemap index file") - self.assertTrue(os.path.exists(scix_index), "Should create SciX sitemap index file") - + ads_index = os.path.join(ads_dir, "sitemap_index.xml") + scix_index = os.path.join(scix_dir, "sitemap_index.xml") + self.assertTrue( + os.path.exists(ads_index), "Should create ADS sitemap index file" + ) + self.assertTrue( + os.path.exists(scix_index), "Should create SciX sitemap index file" + ) + # Test ADS sitemap index content - with open(ads_index, 'r', encoding='utf-8') as f: + with open(ads_index, "r", encoding="utf-8") as f: ads_index_content = f.read() - + # Verify XML structure for ADS index - self.assertIn('', ads_index_content, "ADS index should have XML declaration") - self.assertIn('', 
ads_index_content, "ADS index should have sitemapindex element") - self.assertIn('', ads_index_content, "ADS index should close sitemapindex element") - + self.assertIn( + '', + ads_index_content, + "ADS index should have XML declaration", + ) + self.assertIn( + '', + ads_index_content, + "ADS index should have sitemapindex element", + ) + self.assertIn( + "", + ads_index_content, + "ADS index should close sitemapindex element", + ) + # Verify all sample sitemaps are referenced in ADS index (production URL structure) for filename in sample_sitemaps: - sitemap_url = f'https://ui.adsabs.harvard.edu/sitemap/{filename}' - self.assertIn(f'{html.escape(sitemap_url)}', ads_index_content, f"ADS index should reference {filename}") - self.assertIn('', ads_index_content, "ADS index should contain lastmod elements") - + sitemap_url = f"https://ui.adsabs.harvard.edu/sitemap/{filename}" + self.assertIn( + f"{html.escape(sitemap_url)}", + ads_index_content, + f"ADS index should reference {filename}", + ) + self.assertIn( + "", + ads_index_content, + "ADS index should contain lastmod elements", + ) + # Test SciX sitemap index content - with open(scix_index, 'r', encoding='utf-8') as f: + with open(scix_index, "r", encoding="utf-8") as f: scix_index_content = f.read() - + # Verify XML structure for SciX index - self.assertIn('', scix_index_content, "SciX index should have XML declaration") - self.assertIn('', scix_index_content, "SciX index should have sitemapindex element") - self.assertIn('', scix_index_content, "SciX index should close sitemapindex element") - + self.assertIn( + '', + scix_index_content, + "SciX index should have XML declaration", + ) + self.assertIn( + '', + scix_index_content, + "SciX index should have sitemapindex element", + ) + self.assertIn( + "", + scix_index_content, + "SciX index should close sitemapindex element", + ) + # Verify all sample sitemaps are referenced in SciX index (production URL structure) for filename in sample_sitemaps: - sitemap_url = 
f'https://scixplorer.org/sitemap/{filename}' - self.assertIn(f'{html.escape(sitemap_url)}', scix_index_content, f"SciX index should reference {filename}") - self.assertIn('', scix_index_content, "SciX index should contain lastmod elements") - + sitemap_url = f"https://scixplorer.org/sitemap/{filename}" + self.assertIn( + f"{html.escape(sitemap_url)}", + scix_index_content, + f"SciX index should reference {filename}", + ) + self.assertIn( + "", + scix_index_content, + "SciX index should contain lastmod elements", + ) + # Verify sitemap count matches expected (3 bib files + 1 static file) - ads_sitemap_count = ads_index_content.count('') - scix_sitemap_count = scix_index_content.count('') - self.assertEqual(ads_sitemap_count, 4, "ADS index should contain exactly 4 sitemap entries (3 bib + 1 static)") - self.assertEqual(scix_sitemap_count, 4, "SciX index should contain exactly 4 sitemap entries (3 bib + 1 static)") - + ads_sitemap_count = ads_index_content.count("") + scix_sitemap_count = scix_index_content.count("") + self.assertEqual( + ads_sitemap_count, + 4, + "ADS index should contain exactly 4 sitemap entries (3 bib + 1 static)", + ) + self.assertEqual( + scix_sitemap_count, + 4, + "SciX index should contain exactly 4 sitemap entries (3 bib + 1 static)", + ) + # Verify proper URL structure and no broken links (production structure) - self.assertIn('https://ui.adsabs.harvard.edu/sitemap/', ads_index_content, "ADS index should contain ADS sitemap base URL") - self.assertIn('https://scixplorer.org/sitemap/', scix_index_content, "SciX index should contain SciX sitemap base URL") - + self.assertIn( + "https://ui.adsabs.harvard.edu/sitemap/", + ads_index_content, + "ADS index should contain ADS sitemap base URL", + ) + self.assertIn( + "https://scixplorer.org/sitemap/", + scix_index_content, + "SciX index should contain SciX sitemap base URL", + ) + # Verify index files are not empty and have reasonable content - self.assertGreater(len(ads_index_content.strip()), 200, 
"ADS index should have substantial content") - self.assertGreater(len(scix_index_content.strip()), 200, "SciX index should have substantial content") - + self.assertGreater( + len(ads_index_content.strip()), + 200, + "ADS index should have substantial content", + ) + self.assertGreater( + len(scix_index_content.strip()), + 200, + "SciX index should have substantial content", + ) + # Verify no duplicate sitemap entries - ads_locs = [line.strip() for line in ads_index_content.split('\n') if '' in line] - scix_locs = [line.strip() for line in scix_index_content.split('\n') if '' in line] - self.assertEqual(len(ads_locs), len(set(ads_locs)), "ADS index should not contain duplicate sitemap URLs") - self.assertEqual(len(scix_locs), len(set(scix_locs)), "SciX index should not contain duplicate sitemap URLs") - + ads_locs = [ + line.strip() + for line in ads_index_content.split("\n") + if "" in line + ] + scix_locs = [ + line.strip() + for line in scix_index_content.split("\n") + if "" in line + ] + self.assertEqual( + len(ads_locs), + len(set(ads_locs)), + "ADS index should not contain duplicate sitemap URLs", + ) + self.assertEqual( + len(scix_locs), + len(set(scix_locs)), + "SciX index should not contain duplicate sitemap URLs", + ) + # Test cleanup with self.app.session_scope() as session: - session.query(SitemapInfo).filter(SitemapInfo.bibcode.in_(test_bibcodes)).delete(synchronize_session=False) - session.query(Records).filter(Records.bibcode.in_(test_bibcodes)).delete(synchronize_session=False) + session.query(SitemapInfo).filter( + SitemapInfo.bibcode.in_(test_bibcodes) + ).delete(synchronize_session=False) + session.query(Records).filter( + Records.bibcode.in_(test_bibcodes) + ).delete(synchronize_session=False) session.commit() def test_force_update_workflow(self): """Test the complete force-update workflow with timestamp updates""" - test_bibcode = '2023Test.....1....A' - + test_bibcode = "2023Test.....1....A" + # Create initial record with 
self.app.session_scope() as session: record = Records(bibcode=test_bibcode, bib_data='{"title": "test"}') session.add(record) session.commit() record_id = record.id - + # Add to sitemap initially - tasks.task_manage_sitemap([test_bibcode], 'add') - + tasks.task_manage_sitemap([test_bibcode], "add") + # Verify initial creation with self.app.session_scope() as session: - sitemap_record = session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first() + sitemap_record = ( + session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first() + ) self.assertIsNotNone(sitemap_record) initial_timestamp = sitemap_record.bib_data_updated - + # Wait a moment to ensure timestamp difference time.sleep(0.01) - + # Update the record's bib_data_updated timestamp with self.app.session_scope() as session: - session.query(Records).filter_by(id=record_id).update({ - 'bib_data_updated': datetime.now(timezone.utc) - }, synchronize_session=False) + session.query(Records).filter_by(id=record_id).update( + {"bib_data_updated": datetime.now(timezone.utc)}, + synchronize_session=False, + ) session.commit() - + # Force update - tasks.task_manage_sitemap([test_bibcode], 'force-update') - + tasks.task_manage_sitemap([test_bibcode], "force-update") + # Verify timestamp was updated with self.app.session_scope() as session: - updated_record = session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first() + updated_record = ( + session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first() + ) self.assertIsNotNone(updated_record) self.assertNotEqual(updated_record.bib_data_updated, initial_timestamp) def test_add_action_timestamp_logic(self): """Test that add action correctly handles timestamp comparisons""" - test_bibcode = '2023Test.....2....A' - + test_bibcode = "2023Test.....2....A" + # Create record with self.app.session_scope() as session: - record = Records( - bibcode=test_bibcode, - bib_data='{"title": "test"}' - ) + record = Records(bibcode=test_bibcode, bib_data='{"title": 
"test"}') session.add(record) session.commit() initial_record_timestamp = record.bib_data_updated - + # Add to sitemap - tasks.task_manage_sitemap([test_bibcode], 'add') - + tasks.task_manage_sitemap([test_bibcode], "add") + # Verify sitemap record was created with self.app.session_scope() as session: - sitemap_record = session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first() + sitemap_record = ( + session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first() + ) self.assertIsNotNone(sitemap_record) initial_sitemap_timestamp = sitemap_record.bib_data_updated - + # Wait a moment to ensure timestamp difference time.sleep(0.01) - + # Update record timestamp with self.app.session_scope() as session: - session.query(Records).filter_by(bibcode=test_bibcode).update({ - 'bib_data_updated': datetime.now(timezone.utc) - }, synchronize_session=False) + session.query(Records).filter_by(bibcode=test_bibcode).update( + {"bib_data_updated": datetime.now(timezone.utc)}, + synchronize_session=False, + ) session.commit() - + # Add again - should update timestamp - tasks.task_manage_sitemap([test_bibcode], 'add') - + tasks.task_manage_sitemap([test_bibcode], "add") + # Verify timestamp was updated with self.app.session_scope() as session: - updated_record = session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first() + updated_record = ( + session.query(SitemapInfo).filter_by(bibcode=test_bibcode).first() + ) self.assertIsNotNone(updated_record) # Timestamp should be different from initial - self.assertNotEqual(updated_record.bib_data_updated, initial_sitemap_timestamp) + self.assertNotEqual( + updated_record.bib_data_updated, initial_sitemap_timestamp + ) def test_max_records_per_sitemap_logic(self): """Test that sitemap files are created with proper record limits""" # Use a small limit for testing - original_limit = self.app.conf.get('MAX_RECORDS_PER_SITEMAP', 50000) - self.app.conf['MAX_RECORDS_PER_SITEMAP'] = 3 # Small limit for testing - + original_limit = 
self.app.conf.get("MAX_RECORDS_PER_SITEMAP", 50000) + self.app.conf["MAX_RECORDS_PER_SITEMAP"] = 3 # Small limit for testing + try: # Create test records - test_bibcodes = [f'2023Test.....{i}....A' for i in range(5)] - + test_bibcodes = [f"2023Test.....{i}....A" for i in range(5)] + with self.app.session_scope() as session: for bibcode in test_bibcodes: record = Records(bibcode=bibcode, bib_data='{"title": "test"}') session.add(record) session.commit() - + # Add all records - tasks.task_manage_sitemap(test_bibcodes, 'add') - + tasks.task_manage_sitemap(test_bibcodes, "add") + # Verify records are distributed across multiple files with self.app.session_scope() as session: - sitemap_records = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(test_bibcodes) - ).all() - + sitemap_records = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(test_bibcodes)) + .all() + ) + # Should have all 5 records self.assertEqual(len(sitemap_records), 5) - + # Should use at least 2 different filenames (3+2 distribution) filenames = set(record.sitemap_filename for record in sitemap_records) self.assertGreaterEqual(len(filenames), 2) - + # Verify no file has more than 3 records filename_counts = {} for record in sitemap_records: - filename_counts[record.sitemap_filename] = filename_counts.get(record.sitemap_filename, 0) + 1 - + filename_counts[record.sitemap_filename] = ( + filename_counts.get(record.sitemap_filename, 0) + 1 + ) + for filename, count in filename_counts.items(): - self.assertLessEqual(count, 3, f"File {filename} has {count} records, exceeds limit of 3") - + self.assertLessEqual( + count, + 3, + f"File {filename} has {count} records, exceeds limit of 3", + ) + finally: # Restore original limit - self.app.conf['MAX_RECORDS_PER_SITEMAP'] = original_limit + self.app.conf["MAX_RECORDS_PER_SITEMAP"] = original_limit def test_batch_processing_mixed_records(self): """Test batch processing with mix of new and existing records""" # Create some existing 
records - existing_bibcodes = ['2023Existing.1....A', '2023Existing.2....A'] - new_bibcodes = ['2023New......1....A', '2023New......2....A'] + existing_bibcodes = ["2023Existing.1....A", "2023Existing.2....A"] + new_bibcodes = ["2023New......1....A", "2023New......2....A"] all_bibcodes = existing_bibcodes + new_bibcodes - + # Override MAX_RECORDS_PER_SITEMAP to force file distribution - original_max_records = self.app.conf.get('MAX_RECORDS_PER_SITEMAP', 50000) - self.app.conf['MAX_RECORDS_PER_SITEMAP'] = 2 # Force max 2 records per file - + original_max_records = self.app.conf.get("MAX_RECORDS_PER_SITEMAP", 50000) + self.app.conf["MAX_RECORDS_PER_SITEMAP"] = 2 # Force max 2 records per file + try: with self.app.session_scope() as session: for bibcode in all_bibcodes: record = Records(bibcode=bibcode, bib_data='{"title": "test"}') session.add(record) session.commit() - + # Add existing records first - tasks.task_manage_sitemap(existing_bibcodes, 'add') - + tasks.task_manage_sitemap(existing_bibcodes, "add") + # Verify existing records are in sitemap with self.app.session_scope() as session: - existing_count = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(existing_bibcodes) - ).count() + existing_count = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(existing_bibcodes)) + .count() + ) self.assertEqual(existing_count, 2) - + # Now add all records (mix of existing and new) - tasks.task_manage_sitemap(all_bibcodes, 'add') - + tasks.task_manage_sitemap(all_bibcodes, "add") + # Verify all records are now in sitemap with self.app.session_scope() as session: - total_count = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(all_bibcodes) - ).count() + total_count = ( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(all_bibcodes)) + .count() + ) self.assertEqual(total_count, 4) - + # Verify no duplicates - all_records = session.query(SitemapInfo).filter( - SitemapInfo.bibcode.in_(all_bibcodes) - ).all() + all_records = 
( + session.query(SitemapInfo) + .filter(SitemapInfo.bibcode.in_(all_bibcodes)) + .all() + ) bibcodes_in_sitemap = [record.bibcode for record in all_records] - self.assertEqual(len(bibcodes_in_sitemap), len(set(bibcodes_in_sitemap))) - + self.assertEqual( + len(bibcodes_in_sitemap), len(set(bibcodes_in_sitemap)) + ) + # Verify records are distributed across multiple files with max 2 records per file - filenames = session.query(SitemapInfo.sitemap_filename).filter( - SitemapInfo.bibcode.in_(all_bibcodes) - ).distinct().all() + filenames = ( + session.query(SitemapInfo.sitemap_filename) + .filter(SitemapInfo.bibcode.in_(all_bibcodes)) + .distinct() + .all() + ) filename_set = {f[0] for f in filenames} - + # Should use at least 2 different filenames (2+2 distribution) - self.assertGreaterEqual(len(filename_set), 2, "Should use at least 2 different sitemap files") - + self.assertGreaterEqual( + len(filename_set), + 2, + "Should use at least 2 different sitemap files", + ) + # Verify no file has more than 2 records filename_counts = {} for record in all_records: - filename_counts[record.sitemap_filename] = filename_counts.get(record.sitemap_filename, 0) + 1 - + filename_counts[record.sitemap_filename] = ( + filename_counts.get(record.sitemap_filename, 0) + 1 + ) + for filename, count in filename_counts.items(): - self.assertLessEqual(count, 2, f"File {filename} has {count} records, exceeds limit of 2") - + self.assertLessEqual( + count, + 2, + f"File {filename} has {count} records, exceeds limit of 2", + ) + # Verify we have exactly 2 files with 2 records each - self.assertEqual(len(filename_counts), 2, "Should have exactly 2 sitemap files") + self.assertEqual( + len(filename_counts), 2, "Should have exactly 2 sitemap files" + ) for filename, count in filename_counts.items(): - self.assertEqual(count, 2, f"File {filename} should have exactly 2 records") - + self.assertEqual( + count, 2, f"File {filename} should have exactly 2 records" + ) + finally: # Restore original 
limit - self.app.conf['MAX_RECORDS_PER_SITEMAP'] = original_max_records + self.app.conf["MAX_RECORDS_PER_SITEMAP"] = original_max_records def test_task_manage_sitemap_delete_table_action(self): """Test task_manage_sitemap delete-table action""" - + # Create test data first - test_bibcodes = ['2023DeleteTable..1..1A', '2023DeleteTable..1..2B', '2023DeleteTable..1..3C'] - + test_bibcodes = [ + "2023DeleteTable..1..1A", + "2023DeleteTable..1..2B", + "2023DeleteTable..1..3C", + ] + with self.app.session_scope() as session: for bibcode in test_bibcodes: - record = Records(bibcode=bibcode, bib_data='{"title": "Delete Table Test"}') + record = Records( + bibcode=bibcode, bib_data='{"title": "Delete Table Test"}' + ) session.add(record) session.flush() - + # Create sitemap entry sitemap_record = SitemapInfo() sitemap_record.bibcode = bibcode sitemap_record.record_id = record.id - sitemap_record.sitemap_filename = 'sitemap_bib_delete_test.xml' + sitemap_record.sitemap_filename = "sitemap_bib_delete_test.xml" sitemap_record.update_flag = False session.add(sitemap_record) - + session.commit() - + # Verify records exist before deletion initial_count = session.query(SitemapInfo).count() - self.assertEqual(initial_count, 3, "Should have 3 sitemap records before delete-table") - + self.assertEqual( + initial_count, 3, "Should have 3 sitemap records before delete-table" + ) + with tempfile.TemporaryDirectory() as temp_dir: - self.app.conf['SITEMAP_DIR'] = temp_dir - + self.app.conf["SITEMAP_DIR"] = temp_dir + # Create some dummy sitemap files to test backup functionality - ads_dir = os.path.join(temp_dir, 'ads') - scix_dir = os.path.join(temp_dir, 'scix') + ads_dir = os.path.join(temp_dir, "ads") + scix_dir = os.path.join(temp_dir, "scix") os.makedirs(ads_dir, exist_ok=True) os.makedirs(scix_dir, exist_ok=True) - + # Create test sitemap files - test_files = ['sitemap_bib_1.xml', 'sitemap_bib_2.xml', 'sitemap_index.xml'] + test_files = ["sitemap_bib_1.xml", "sitemap_bib_2.xml", 
"sitemap_index.xml"] for filename in test_files: - with open(os.path.join(ads_dir, filename), 'w') as f: + with open(os.path.join(ads_dir, filename), "w") as f: f.write('') - with open(os.path.join(scix_dir, filename), 'w') as f: + with open(os.path.join(scix_dir, filename), "w") as f: f.write('') - + # Mock the backup_sitemap_files method to verify it's called - with patch.object(self.app, 'backup_sitemap_files') as mock_backup: + with patch.object(self.app, "backup_sitemap_files") as mock_backup: # Execute delete-table action - tasks.task_manage_sitemap(['dummy'], 'delete-table') - + tasks.task_manage_sitemap(["dummy"], "delete-table") + # Verify backup_sitemap_files was called with correct directory mock_backup.assert_called_once_with(temp_dir) - + # Verify all sitemap records were deleted with self.app.session_scope() as session: final_count = session.query(SitemapInfo).count() - self.assertEqual(final_count, 0, "All sitemap records should be deleted after delete-table action") - + self.assertEqual( + final_count, + 0, + "All sitemap records should be deleted after delete-table action", + ) + # Verify Records table is unchanged (delete-table should only affect SitemapInfo) - records_count = session.query(Records).filter( - Records.bibcode.in_(test_bibcodes) - ).count() - self.assertEqual(records_count, 3, "Records table should be unchanged by delete-table action") + records_count = ( + session.query(Records) + .filter(Records.bibcode.in_(test_bibcodes)) + .count() + ) + self.assertEqual( + records_count, + 3, + "Records table should be unchanged by delete-table action", + ) def test_task_manage_sitemap_update_robots_action(self): """Test task_manage_sitemap update-robots action""" - + with tempfile.TemporaryDirectory() as temp_dir: - self.app.conf['SITEMAP_DIR'] = temp_dir - + self.app.conf["SITEMAP_DIR"] = temp_dir + # Create site directories - ads_dir = os.path.join(temp_dir, 'ads') - scix_dir = os.path.join(temp_dir, 'scix') + ads_dir = 
os.path.join(temp_dir, "ads") + scix_dir = os.path.join(temp_dir, "scix") os.makedirs(ads_dir, exist_ok=True) os.makedirs(scix_dir, exist_ok=True) - + # Verify no robots.txt files exist initially - ads_robots = os.path.join(ads_dir, 'robots.txt') - scix_robots = os.path.join(scix_dir, 'robots.txt') - self.assertFalse(os.path.exists(ads_robots), "ADS robots.txt should not exist initially") - self.assertFalse(os.path.exists(scix_robots), "SciX robots.txt should not exist initially") - + ads_robots = os.path.join(ads_dir, "robots.txt") + scix_robots = os.path.join(scix_dir, "robots.txt") + self.assertFalse( + os.path.exists(ads_robots), "ADS robots.txt should not exist initially" + ) + self.assertFalse( + os.path.exists(scix_robots), + "SciX robots.txt should not exist initially", + ) + # Execute update-robots action try: - tasks.task_manage_sitemap(['dummy'], 'update-robots') + tasks.task_manage_sitemap(["dummy"], "update-robots") success = True except Exception as e: success = False error_msg = str(e) - + # Verify action completed successfully - self.assertTrue(success, "update-robots action should complete successfully") - + self.assertTrue( + success, "update-robots action should complete successfully" + ) + # Verify robots.txt files were created - self.assertTrue(os.path.exists(ads_robots), "ADS robots.txt should be created") - self.assertTrue(os.path.exists(scix_robots), "SciX robots.txt should be created") - + self.assertTrue( + os.path.exists(ads_robots), "ADS robots.txt should be created" + ) + self.assertTrue( + os.path.exists(scix_robots), "SciX robots.txt should be created" + ) + # Verify robots.txt content is correct - with open(ads_robots, 'r', encoding='utf-8') as f: + with open(ads_robots, "r", encoding="utf-8") as f: ads_content = f.read() - with open(scix_robots, 'r', encoding='utf-8') as f: + with open(scix_robots, "r", encoding="utf-8") as f: scix_content = f.read() - + # Check for expected content in ADS robots.txt - self.assertIn('User-agent: *', 
ads_content, "ADS robots.txt should contain User-agent directive") - self.assertIn('Sitemap: https://ui.adsabs.harvard.edu/sitemap/sitemap_index.xml', ads_content, - "ADS robots.txt should contain correct sitemap URL") - self.assertIn('Disallow:', ads_content, "ADS robots.txt should contain disallow directives") - + self.assertIn( + "User-agent: *", + ads_content, + "ADS robots.txt should contain User-agent directive", + ) + self.assertIn( + "Sitemap: https://ui.adsabs.harvard.edu/sitemap/sitemap_index.xml", + ads_content, + "ADS robots.txt should contain correct sitemap URL", + ) + self.assertIn( + "Disallow:", + ads_content, + "ADS robots.txt should contain disallow directives", + ) + # Check for expected content in SciX robots.txt - self.assertIn('User-agent: *', scix_content, "SciX robots.txt should contain User-agent directive") - self.assertIn('Sitemap: https://scixplorer.org/sitemap/sitemap_index.xml', scix_content, - "SciX robots.txt should contain correct sitemap URL") - self.assertIn('Disallow:', scix_content, "SciX robots.txt should contain disallow directives") - + self.assertIn( + "User-agent: *", + scix_content, + "SciX robots.txt should contain User-agent directive", + ) + self.assertIn( + "Sitemap: https://scixplorer.org/sitemap/sitemap_index.xml", + scix_content, + "SciX robots.txt should contain correct sitemap URL", + ) + self.assertIn( + "Disallow:", + scix_content, + "SciX robots.txt should contain disallow directives", + ) + # Verify files are not empty and properly formatted - self.assertGreater(len(ads_content.strip()), 50, "ADS robots.txt should have substantial content") - self.assertGreater(len(scix_content.strip()), 50, "SciX robots.txt should have substantial content") - self.assertTrue(ads_content.endswith('\n'), "ADS robots.txt should end with newline") - self.assertTrue(scix_content.endswith('\n'), "SciX robots.txt should end with newline") + self.assertGreater( + len(ads_content.strip()), + 50, + "ADS robots.txt should have 
substantial content", + ) + self.assertGreater( + len(scix_content.strip()), + 50, + "SciX robots.txt should have substantial content", + ) + self.assertTrue( + ads_content.endswith("\n"), "ADS robots.txt should end with newline" + ) + self.assertTrue( + scix_content.endswith("\n"), "SciX robots.txt should end with newline" + ) def test_task_manage_sitemap_update_robots_action_error_handling(self): """Test task_manage_sitemap update-robots action error handling""" - + # Test by mocking update_robots_files to return False (simulating failure) - with patch('adsmp.tasks.update_robots_files') as mock_update_robots: + with patch("adsmp.tasks.update_robots_files") as mock_update_robots: mock_update_robots.return_value = False # Simulate failure - + # Execute update-robots action - should raise exception due to simulated failure with self.assertRaises(Exception) as context: - tasks.task_manage_sitemap(['dummy'], 'update-robots') - + tasks.task_manage_sitemap(["dummy"], "update-robots") + # Verify the exception message indicates robots.txt update failure - self.assertIn('Failed to update robots.txt files', str(context.exception)) - + self.assertIn("Failed to update robots.txt files", str(context.exception)) + # Verify update_robots_files was called with force_update=True mock_update_robots.assert_called_once_with(True) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/config.py b/config.py index 3764fcc..b4934cb 100644 --- a/config.py +++ b/config.py @@ -31,7 +31,7 @@ # db connection to the Boost Pipeline database where boost factors are stored # if not present, boost factors will not be included in SOLR documents BOOST_SQLALCHEMY_URL = None #'postgresql://boost_user:boost_pass@localhost:5432/boost_db' - +IGNORED_BOOST_PAYLOAD_TYPES = ["boost"] # Main Solr # SOLR_URLS = ["http://localhost:9983/solr/collection1/update"] @@ -142,3 +142,12 @@ "techreport": 3, "misc": 8 } + +SCIX_ID_GENERATION_FIELDS = [ + "author_norm", + "doi", + 
"abstract", + "title", + "doctype", + "pub_raw" +] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 58ae774..d5df902 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ -adsputils==1.5.13 +adsputils==1.5.15 alembic==0.9.1 httplib2==0.19.0 portalocker==1.7.1 psycopg2==2.8.6 pyrabbit==1.1.0 -ScixPipelineUtils @ git+https://github.com/adsabs/SciXPipelineUtils.git@v0.5.2 +awscli==1.27.60 +ScixPipelineUtils @ git+https://github.com/adsabs/SciXPipelineUtils.git@v0.6.2 diff --git a/scripts/cleanup_sitemaps.py b/scripts/cleanup_sitemaps.py new file mode 100644 index 0000000..5f7cf62 --- /dev/null +++ b/scripts/cleanup_sitemaps.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python +""" +Wrapper script for sitemap cleanup operation with lockfile protection. + +This script prevents concurrent cleanup operations using a lockfile mechanism +similar to reindex.py. The cleanup operation can take several hours scanning +millions of sitemap records. +""" + +import os +import sys +import pickle +import time +import re +import json +from subprocess import PIPE, Popen + +proj_home = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if proj_home not in sys.path: + sys.path.append(proj_home) + +from adsputils import setup_logging, load_config +from celery.result import AsyncResult +from adsmp import tasks + +config = load_config(proj_home=proj_home) +logger = setup_logging('sitemap_cleanup', proj_home=proj_home, + level=config.get('LOGGING_LEVEL', 'INFO'), + attach_stdout=config.get('LOG_STDOUT', False)) + +lockfile = os.path.abspath(proj_home + '/sitemap_cleanup.locked') + + +def read_lockfile(lockfile): + with open(lockfile, 'rb') as f: + return pickle.load(f) + + +def write_lockfile(lockfile, data): + with open(lockfile, 'wb') as f: + pickle.dump(data, f) + + +def execute(command, **kwargs): + p = Popen(command, shell=True, stdout=PIPE, stderr=PIPE, **kwargs) + out, err = p.communicate() + return (p.returncode, out, err) + + +def 
monitor_workflow(workflow_id, start_time): + """ + Monitor Celery workflow until completion. + Returns True if successful, raises Exception if failed. + """ + result = AsyncResult(workflow_id, app=tasks.app) + + check_interval = 30 # seconds + last_log_time = time.time() + log_interval = 300 # Log every 5 minutes + max_duration = 12 * 3600 # 12 hours in seconds + warning_logged = False + + logger.info('Monitoring workflow %s...' % workflow_id) + + while not result.ready(): + time.sleep(check_interval) + current_time = time.time() + elapsed = current_time - start_time + + if current_time - last_log_time >= log_interval: + logger.info('Workflow still running... (elapsed: %.1f minutes)' % (elapsed / 60,)) + last_log_time = current_time + + # Warn if taking too long + if elapsed > max_duration and not warning_logged: + logger.warning('Workflow has been running for over 12 hours (%.1f hours)!' % (elapsed / 3600,)) + logger.warning('This is unusually long - check for stuck tasks or performance issues') + warning_logged = True + + # Check if successful + if result.successful(): + logger.info('Workflow completed successfully') + return True + else: + error_msg = 'Workflow failed: %s' % str(result.info) + logger.error(error_msg) + raise Exception(error_msg) + + +def run(): + # Check for existing lockfile + if os.path.exists(lockfile): + logger.error('Lockfile %s already exists; exiting! 
(if you want to proceed, delete the file with rm sitemap_cleanup.locked)' % (lockfile,)) + data = read_lockfile(lockfile) + for k, v in data.items(): + logger.error('%s=%s' % (k, v,)) + sys.exit(1) + else: + data = {} + + try: + now = time.time() + data['start'] = now + data['operation'] = 'sitemap_cleanup' + write_lockfile(lockfile, data) + logger.info('Lockfile created') + + logger.info('Starting sitemap cleanup operation') + logger.info('This may take several hours for large sitemap tables') + + # Execute command and capture workflow ID from output + command = 'python3 run.py --cleanup-invalid-sitemaps' + retcode, stdout, stderr = execute(command, cwd=proj_home) + + if retcode != 0: + data['error'] = '%s failed with retcode=%s\nstderr:\n%s' % (command, retcode, stderr.decode()) + write_lockfile(lockfile, data) + logger.error('stderr=%s' % (stderr.decode(),)) + raise Exception('%s failed with retcode=%s\nstderr:\n%s' % (command, retcode, stderr.decode())) + + # Parse workflow ID from stdout (JSON log format) + stdout_str = stdout.decode() + workflow_id = None + + # Try to parse as JSON first + for line in stdout_str.split('\n'): + if 'Sitemap cleanup workflow submitted:' in line or 'cleanup workflow' in line.lower(): + try: + log_entry = json.loads(line) + message = log_entry.get('message', '') + # Extract UUID from message + uuid_pattern = r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}' + match = re.search(uuid_pattern, message) + if match: + workflow_id = match.group(0) + break + except (json.JSONDecodeError, ValueError): + # Fall back to simple parsing + uuid_pattern = r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}' + match = re.search(uuid_pattern, line) + if match: + workflow_id = match.group(0) + break + + if not workflow_id: + logger.info('No workflow was started') + logger.info('Operation completed in %s secs' % (time.time() - now,)) + os.remove(lockfile) + return + + logger.info('Workflow ID: %s' % workflow_id) + 
def read_lockfile(lockfile):
    """Load and return the object pickled in the file at ``lockfile``.

    NOTE(review): ``pickle.load`` can execute arbitrary code; this is only
    acceptable as long as the lockfile is written exclusively by this script.
    """
    with open(lockfile, 'rb') as f:
        return pickle.load(f)


def write_lockfile(lockfile, data):
    """Pickle ``data`` into ``lockfile``, replacing any previous content.

    The data is written to a temporary file next to the lockfile and then
    moved into place with ``os.replace``, so a failed or interrupted dump
    never leaves a truncated lockfile behind.
    """
    tmp_path = '%s.%d.tmp' % (lockfile, os.getpid())
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(data, f)
        os.replace(tmp_path, lockfile)
    except BaseException:
        # Best-effort cleanup of the partial temp file; the original error
        # is what the caller needs to see.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise


def execute(command, **kwargs):
    """Run ``command`` and return ``(returncode, stdout, stderr)``.

    A string command is run through the shell, as before. A list/tuple
    command is run directly without a shell unless ``shell`` is passed
    explicitly. stdout and stderr are captured and returned as bytes; other
    keyword arguments are forwarded to ``Popen``.
    """
    kwargs.setdefault('shell', isinstance(command, str))
    p = Popen(command, stdout=PIPE, stderr=PIPE, **kwargs)
    out, err = p.communicate()
    return (p.returncode, out, err)
def monitor_workflow(workflow_id, start_time, check_interval=30, log_interval=300,
                     max_duration=12 * 3600):
    """
    Monitor Celery workflow until completion.

    Polls the AsyncResult for ``workflow_id`` until ``ready()`` reports True,
    logging progress periodically and warning once if the run exceeds
    ``max_duration`` (the warning does not abort the wait).

    Args:
        workflow_id: id of the workflow result to watch.
        start_time: epoch seconds the operation started at; elapsed time is
            measured from this value.
        check_interval: seconds to sleep between polls (default 30).
        log_interval: minimum seconds between progress log lines (default 300).
        max_duration: seconds after which the one-time warning is logged
            (default 12 hours).

    Returns True if successful, raises Exception if failed.
    """
    result = AsyncResult(workflow_id, app=tasks.app)

    last_log_time = time.time()
    warning_logged = False

    logger.info('Monitoring workflow %s...', workflow_id)

    while not result.ready():
        time.sleep(check_interval)
        current_time = time.time()
        elapsed = current_time - start_time

        if current_time - last_log_time >= log_interval:
            logger.info('Workflow still running... (elapsed: %.1f minutes)', elapsed / 60)
            last_log_time = current_time

        # Warn (once) if taking too long
        if elapsed > max_duration and not warning_logged:
            logger.warning('Workflow has been running for over %g hours (%.1f hours)!',
                           max_duration / 3600, elapsed / 3600)
            logger.warning('This is unusually long - check for stuck tasks or performance issues')
            warning_logged = True

    # Check if successful
    if result.successful():
        logger.info('Workflow completed successfully')
        return True

    error_msg = 'Workflow failed: %s' % str(result.info)
    logger.error(error_msg)
    raise Exception(error_msg)
# Pattern for the UUID-formatted workflow id that is searched for in the
# output of run.py; compiled once instead of being rebuilt per log line.
_WORKFLOW_ID_RE = re.compile(
    r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}')


def _extract_workflow_id(stdout_str):
    """Return the first workflow UUID found in the update output, or None.

    Only lines containing 'Submitted sitemap workflow:' are inspected. A line
    that parses as a JSON object is searched in its 'message' field only; any
    other line is searched as raw text.
    """
    for line in stdout_str.split('\n'):
        if 'Submitted sitemap workflow:' not in line:
            continue
        try:
            log_entry = json.loads(line)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            haystack = line
        else:
            # Valid JSON that is not an object (or whose message is not text)
            # used to raise here; treat it like a plain-text line instead.
            message = log_entry.get('message', '') if isinstance(log_entry, dict) else None
            haystack = message if isinstance(message, str) else line
        match = _WORKFLOW_ID_RE.search(haystack)
        if match:
            return match.group(0)
    return None


def run(days_back=1):
    """Run the automatic sitemap update under lockfile protection.

    Exits with status 1 if the lockfile already exists. Otherwise writes the
    lockfile, runs ``python3 run.py --update-sitemaps-auto --days-back N`` in
    ``proj_home``, extracts the workflow id from its stdout and monitors that
    workflow until it finishes. The lockfile is removed on success (or when no
    workflow was started); on any failure it is kept, updated with the error,
    and the process exits with status 1.

    Args:
        days_back: integer number of days passed to ``--days-back``.
    """
    if os.path.exists(lockfile):
        logger.error('Lockfile %s already exists; exiting! (if you want to proceed, '
                     'delete the file)', lockfile)
        try:
            data = read_lockfile(lockfile)
        except Exception:
            # A truncated/corrupt lockfile must still block the run, but
            # without hiding the "already locked" message behind a traceback.
            logger.exception('Lockfile %s could not be read', lockfile)
        else:
            for k, v in data.items():
                logger.error('%s=%s', k, v)
        sys.exit(1)

    data = {}

    try:
        now = time.time()
        data['start'] = now
        data['operation'] = 'sitemap_auto_update'
        data['days_back'] = days_back
        write_lockfile(lockfile, data)

        logger.info('Starting automatic sitemap update (looking back %d days)', days_back)
        logger.info('This may take several hours depending on the number of updated records')

        # Execute command and capture workflow ID from output; %d guarantees
        # only an integer is interpolated into the shell command.
        command = 'python3 run.py --update-sitemaps-auto --days-back %d' % days_back
        retcode, stdout, stderr = execute(command, cwd=proj_home)

        if retcode != 0:
            # 'replace' keeps undecodable bytes from masking the real failure
            # with a UnicodeDecodeError.
            stderr_str = stderr.decode('utf-8', 'replace')
            error_msg = '%s failed with retcode=%s\nstderr:\n%s' % (command, retcode, stderr_str)
            data['error'] = error_msg
            write_lockfile(lockfile, data)
            logger.error('stderr=%s', stderr_str)
            raise Exception(error_msg)

        # Parse workflow ID from stdout (JSON log format, plain text fallback)
        workflow_id = _extract_workflow_id(stdout.decode('utf-8', 'replace'))

        if not workflow_id:
            logger.info('No workflow was started (no records to update)')
            logger.info('Operation completed in %s secs', time.time() - now)
            os.remove(lockfile)
            return

        logger.info('Workflow ID: %s', workflow_id)
        data['workflow_id'] = workflow_id
        write_lockfile(lockfile, data)

        # Monitor workflow until completion
        monitor_workflow(workflow_id, now)

        elapsed = time.time() - now
        logger.info('Successfully finished sitemap auto-update in %s secs (%.1f minutes)',
                    elapsed, elapsed / 60)

        # Success - remove lockfile
        logger.info('Deleting the lock; sitemap auto-update completed successfully!')
        os.remove(lockfile)

    except Exception as e:
        logger.exception('Failed: we will keep the process permanently locked')
        data['last-exception'] = str(e)
        data['failed_at'] = time.time()
        write_lockfile(lockfile, data)
        sys.exit(1)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Automatic sitemap update with lockfile protection')
    parser.add_argument('--days-back', dest='days_back', type=int, default=1,
                        help='Number of days to look back for updated records (default: 1)')
    args = parser.parse_args()

    run(days_back=args.days_back)