Skip to content

Commit 1b045c9

Browse files
tjacovichfemalvesmugdhapolimerakelockhartThomas-S-Allen
authored
Boost sitemaps prune (#222)
* adding lockfile scripts to long tasks (#213) * adding aws to requirements (#214) * adding aws to requirements * changing aws version * changing aws version 2 * changing aws version 3 * Added user-defined fields for scix id generation (#212) * Added user-defined fields for scix id generation * updated test cases * new version of scixpipelineutils * updating test scixids * updating test scixids * updating test scixids #3 * modified test cases to have at least title in bib_data * Update requirements.txt Co-authored-by: Taylor Jacovich <tjacovich@cfa.harvard.edu> --------- Co-authored-by: Taylor Jacovich <tjacovich@cfa.harvard.edu> * Update update_record to call async task for boost_message. Add queue for receiving augment processes. (#215) * Update update_record to call async task for boost_message. Add queue for receiving augment processes. * Broke boost call out of update storage to remove circular import. * removed circular import. * Fix args in task_boost_request. * Add awscli back to requirements.txt Added awscli version 1.27.60 to requirements. * Added support for pub_abbrev from import (#217) * Fix extract_classifications_pipeline empty string bug (#219) * Adding has_bib_data label (#216) * Adding has_bib_data label * Adding has_bib_data label * removing comment * removing unnecessary check * Rewrite task_update_record so that calls to task_boost_request is cal… (#220) * Rewrite task_update_record so that calls to task_boost_request is called for individual records when the message is a list type. * Add IGNORED_BOOST_PAYLOAD_TYPES to config. * upgrading scixpipelineutils requirement * Cleared out all classifier code. * update scix ids (#223) --------- Co-authored-by: Fernanda <femalves@users.noreply.github.com> Co-authored-by: mugdhapolimera <35502000+mugdhapolimera@users.noreply.github.com> Co-authored-by: Kelly Lockhart <2926089+kelockhart@users.noreply.github.com> Co-authored-by: Thomas S. Allen <tom.sco@gmail.com> Co-authored-by: Mugdha Polimera <mugdhapolimera@gmail.com>
1 parent c47f09a commit 1b045c9

File tree

7 files changed

+4963
-2859
lines changed

7 files changed

+4963
-2859
lines changed

adsmp/app.py

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -763,7 +763,7 @@ def should_include_in_sitemap(self, record):
763763
3. If processed, processing isn't too stale
764764
765765
Args:
766-
record: Dictionary with record data including bib_data, status, timestamps
766+
record: Dictionary with record data including has_bib_data, status, timestamps
767767
768768
Returns:
769769
bool: True if record should be included in sitemap, False otherwise
@@ -772,14 +772,14 @@ def should_include_in_sitemap(self, record):
772772

773773
# Extract values from record dictionary
774774
bibcode = record.get('bibcode', None)
775-
bib_data = record.get('bib_data', None)
775+
has_bib_data = record.get('has_bib_data', None)
776776
bib_data_updated = record.get('bib_data_updated')
777777
solr_processed = record.get('solr_processed')
778778
status = record.get('status')
779779

780780
# Must have bibliographic data
781-
if not bib_data or not bibcode or (isinstance(bib_data, str) and not bib_data.strip()):
782-
self.logger.debug('Excluding %s from sitemap: No bibcode or bib_data', bibcode)
781+
if not has_bib_data or not bibcode:
782+
self.logger.debug('Excluding %s from sitemap: No bibcode or has_bib_data is False', bibcode)
783783
return False
784784

785785
# Exclude if SOLR failed or if record is being retried (previously failed)
@@ -828,6 +828,8 @@ def get_records_bulk(self, bibcodes, session, load_only=None):
828828
record_data = {}
829829
for field in (load_only or ['id', 'bibcode', 'bib_data', 'bib_data_updated', 'solr_processed', 'status']):
830830
record_data[field] = getattr(record, field, None)
831+
# Add has_bib_data boolean for sitemap checks
832+
record_data['has_bib_data'] = bool(record_data.get('bib_data'))
831833
records_dict[record.bibcode] = record_data
832834

833835
return records_dict

adsmp/tasks.py

Lines changed: 16 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -157,13 +157,15 @@ def task_update_record(msg):
157157
record = app.update_storage(m.bibcode, 'nonbib_data', m.toJSON())
158158
if record:
159159
logger.debug('Saved record from list: %s', record)
160+
_generate_boost_request(m, type)
160161
elif type == 'metrics_records':
161162
for m in msg.metrics_records:
162163
m = Msg(m, None, None)
163164
bibcodes.append(m.bibcode)
164165
record = app.update_storage(m.bibcode, 'metrics', m.toJSON(including_default_value_fields=True))
165166
if record:
166167
logger.debug('Saved record from list: %s', record)
168+
_generate_boost_request(m, type)
167169
elif type == 'augment':
168170
bibcodes.append(msg.bibcode)
169171
record = app.update_storage(msg.bibcode, 'augment',
@@ -176,22 +178,25 @@ def task_update_record(msg):
176178
record = app.update_storage(msg.bibcode, type, msg.toJSON())
177179
if record:
178180
logger.debug('Saved record: %s', record)
181+
_generate_boost_request(msg, type)
179182
if type == 'metadata':
180183
# with new bib data we request to augment the affiliation
181184
# that pipeline will eventually respond with a msg to task_update_record
182185
logger.debug('requesting affilation augmentation for %s', msg.bibcode)
183186
app.request_aff_augment(msg.bibcode)
184-
if record:
185-
# Send payload to Boost pipeline
186-
if type != 'boost' and not app._config.get('TESTING_MODE', False):
187-
try:
188-
task_boost_request.apply_async(args=(msg.bibcode,))
189-
except Exception as e:
190-
app.logger.exception('Error generating boost request message for bibcode %s: %s', msg.bibcode, e)
191-
192187
else:
193188
logger.error('Received a message with unclear status: %s', msg)
194189

190+
def _generate_boost_request(msg, msg_type):
    """Queue an asynchronous Boost pipeline request for a message's bibcode.

    Args:
        msg: Message object; only its ``bibcode`` attribute is read here.
        msg_type: Payload type of the message being processed.

    The request is skipped, and a debug line is logged instead, when
    ``msg_type`` is listed in the ``IGNORED_BOOST_PAYLOAD_TYPES`` config
    entry (default ``['boost']``) or when ``TESTING_MODE`` is truthy.
    Errors raised while queueing the task are logged and not re-raised.
    """
    ignored_types = app._config.get('IGNORED_BOOST_PAYLOAD_TYPES', ['boost'])
    if msg_type not in ignored_types and not app._config.get('TESTING_MODE', False):
        try:
            # Send payload to Boost pipeline
            task_boost_request.apply_async(args=(msg.bibcode,))
        except Exception as e:
            app.logger.exception('Error generating boost request message for bibcode %s: %s', msg.bibcode, e)
    else:
        # Pass the values as logging arguments: the message uses %-style
        # placeholders, which str.format() would leave unsubstituted.
        app.logger.debug("Message for bibcode %s has type: %s, Skipping.", msg.bibcode, msg_type)
199+
195200
@app.task(queue='update-scixid')
196201
def task_update_scixid(bibcodes, flag):
197202
"""Receives bibcodes to add scix id to the record.
@@ -490,7 +495,7 @@ def task_cleanup_invalid_sitemaps():
490495
session.query(
491496
SitemapInfo.id,
492497
SitemapInfo.bibcode,
493-
Records.bib_data,
498+
(Records.bib_data.isnot(None)).label('has_bib_data'),
494499
Records.bib_data_updated,
495500
Records.solr_processed,
496501
Records.status
@@ -519,7 +524,7 @@ def task_cleanup_invalid_sitemaps():
519524
# Convert to dict for should_include_in_sitemap function
520525
record_dict = {
521526
'bibcode': record_data.bibcode,
522-
'bib_data': record_data.bib_data,
527+
'has_bib_data': record_data.has_bib_data,
523528
'bib_data_updated': record_data.bib_data_updated,
524529
'solr_processed': record_data.solr_processed,
525530
'status': record_data.status
@@ -688,7 +693,7 @@ def task_manage_sitemap(bibcodes, action):
688693
# Apply SOLR filtering - convert record to dict for should_include_in_sitemap
689694
record_dict = {
690695
'bibcode': record.bibcode,
691-
'bib_data': record.bib_data,
696+
'has_bib_data': bool(record.bib_data),
692697
'bib_data_updated': record.bib_data_updated,
693698
'solr_processed': record.solr_processed,
694699
'status': record.status

0 commit comments

Comments
 (0)