Skip to content
377 changes: 355 additions & 22 deletions adsmp/app.py

Large diffs are not rendered by default.

82 changes: 37 additions & 45 deletions adsmp/solr_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,62 +446,42 @@ def transform_json_record(db_record):
timestamps = []
for k, v in DB_COLUMN_DESTINATIONS:
ts = db_record.get(k + "_updated", None)
if ts:
ts = time.mktime(ts.timetuple())
else:
ts = sys.maxsize # default to use option without timestamp
ts = time.mktime(ts.timetuple()) if ts else sys.maxsize # default to use option without timestamp
timestamps.append((k, v, ts))
timestamps.sort(key=lambda x: x[2])

# merge data based on timestamps
for field, target, _ in timestamps:
for field, target, ts in timestamps: # fields = {bib_data, nonbib_data, orcid_claims, metrics ..}
if db_record.get(field, None):
if target:
if not target: # bib_data
out.update(db_record.get(field))
else:
if callable(target):
x = target(
enriched_data = target(
db_record.get(field), out
) # in the interest of speed, don't create copy of out
if x:
out.update(x)
else:
out[target] = db_record.get(field)
else:
if target is None:
continue

out.update(db_record.get(field))

elif field.startswith("#"):
if enriched_data:
out.update(enriched_data)
else: # id
out[target] = db_record.get(field)
elif field.startswith("#"): # timestamps
if callable(target):
x = target(
enriched_data = target(
db_record, out
) # in the interest of speed, don't create copy of out
if x:
out.update(x)

# override temporal priority for links data
if (
db_record.get("bib_data", None)
and db_record.get("nonbib_data", None)
and db_record["bib_data"].get("links_data", None)
and db_record["nonbib_data"].get("links_data", None)
if enriched_data:
out.update(enriched_data)

# If both bib and nonbib pipeline provided links data
# use nonbib data even if it is older
if all(
db_record.get(key, {}).get("links_data")
for key in ("bib_data", "nonbib_data")
):
# here if both bib and nonbib pipeline provided links data
# use nonbib data even if it is older
logger.debug('Both bib and nonbib data provided links data. Using nonbib data: {}'.format(db_record["nonbib_data"]["links_data"]))
out["links_data"] = db_record["nonbib_data"]["links_data"]

# override temporal priority for bibgroup and bibgroup_facet, prefer nonbib
if db_record.get("nonbib_data", None) and db_record["nonbib_data"].get(
"bibgroup", None
):
out["bibgroup"] = db_record["nonbib_data"]["bibgroup"]
if db_record.get("nonbib_data", None) and db_record["nonbib_data"].get(
"bibgroup_facet", None
):
out["bibgroup_facet"] = db_record["nonbib_data"]["bibgroup_facet"]

# if only bib data is available, use it to compute property
if db_record.get("nonbib_data", None) is None and db_record.get("bib_data", None):
elif db_record.get("bib_data", {}).get("links_data"):
logger.debug('Only bib data provided links data. Using bib data: {}'.format(db_record["bib_data"]["links_data"]))
links_data = db_record["bib_data"].get("links_data", None)
if links_data:
try:
Expand All @@ -528,6 +508,18 @@ def transform_json_record(db_record):
db_record["bibcode"], type(links_data), links_data
)
)

# override temporal priority for bibgroup and bibgroup_facet, prefer nonbib
if db_record.get("nonbib_data", None) and db_record["nonbib_data"].get(
"bibgroup", None
):
out["bibgroup"] = db_record["nonbib_data"]["bibgroup"]
if db_record.get("nonbib_data", None) and db_record["nonbib_data"].get(
"bibgroup_facet", None
):
out["bibgroup_facet"] = db_record["nonbib_data"]["bibgroup_facet"]


boost_columns = ['doctype_boost', 'recency_boost', 'boost_factor', 'astronomy_final_boost', 'physics_final_boost', \
'earth_science_final_boost', 'planetary_science_final_boost', 'heliophysics_final_boost', 'general_final_boost']

Expand Down Expand Up @@ -562,5 +554,5 @@ def transform_json_record(db_record):
if any([char.isalnum() for char in out_field]):
has.append(field)
out["has"] = has

return out
logger.debug('Out: {}'.format(out))
return out
31 changes: 15 additions & 16 deletions adsmp/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,21 +279,21 @@ def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True

# check if we have complete record
for bibcode in bibcodes:
r = app.get_record(bibcode, load_only=fields)
record = app.get_record(bibcode, load_only=fields)

if r is None:
if record is None:
logger.error('The bibcode %s doesn\'t exist!', bibcode)
continue

augments_updated = r.get('augments_updated', None)
bib_data_updated = r.get('bib_data_updated', None)
fulltext_updated = r.get('fulltext_updated', None)
metrics_updated = r.get('metrics_updated', None)
nonbib_data_updated = r.get('nonbib_data_updated', None)
orcid_claims_updated = r.get('orcid_claims_updated', None)
augments_updated = record.get('augments_updated', None)
bib_data_updated = record.get('bib_data_updated', None)
fulltext_updated = record.get('fulltext_updated', None)
metrics_updated = record.get('metrics_updated', None)
nonbib_data_updated = record.get('nonbib_data_updated', None)
orcid_claims_updated = record.get('orcid_claims_updated', None)

year_zero = '1972'
processed = r.get('processed', adsputils.get_date(year_zero))
processed = record.get('processed', adsputils.get_date(year_zero))
if processed is None:
processed = adsputils.get_date(year_zero)

Expand All @@ -314,26 +314,25 @@ def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True
metrics_updated, augments_updated))
# build the solr record
if update_solr:
solr_payload = solr_updater.transform_json_record(r)

solr_payload = solr_updater.transform_json_record(record)
# ADS microservices assume the identifier field exists and contains the canonical bibcode:
if 'identifier' not in solr_payload:
solr_payload['identifier'] = []
if 'bibcode' in solr_payload and solr_payload['bibcode'] not in solr_payload['identifier']:
solr_payload['identifier'].append(solr_payload['bibcode'])
logger.debug('Built SOLR record for %s', solr_payload['bibcode'])
solr_checksum = app.checksum(solr_payload)
if ignore_checksums or r.get('solr_checksum', None) != solr_checksum:
if ignore_checksums or record.get('solr_checksum', None) != solr_checksum:
solr_records.append(solr_payload)
solr_records_checksum.append(solr_checksum)
else:
logger.debug('Checksum identical, skipping solr update for: %s', bibcode)

# get data for metrics
if update_metrics:
metrics_payload = r.get('metrics', None)
metrics_payload = record.get('metrics', None)
metrics_checksum = app.checksum(metrics_payload or '')
if (metrics_payload and ignore_checksums) or (metrics_payload and r.get('metrics_checksum', None) != metrics_checksum):
if (metrics_payload and ignore_checksums) or (metrics_payload and record.get('metrics_checksum', None) != metrics_checksum):
metrics_payload['bibcode'] = bibcode
logger.debug('Got metrics: %s', metrics_payload)
metrics_records.append(metrics_payload)
Expand All @@ -342,10 +341,10 @@ def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True
logger.debug('Checksum identical or no metrics data available, skipping metrics update for: %s', bibcode)

if update_links and links_url:
datalinks_payload = app.generate_links_for_resolver(r)
datalinks_payload = app.generate_links_for_resolver(record)
if datalinks_payload:
datalinks_checksum = app.checksum(datalinks_payload)
if ignore_checksums or r.get('datalinks_checksum', None) != datalinks_checksum:
if ignore_checksums or record.get('datalinks_checksum', None) != datalinks_checksum:
links_data_records.append(datalinks_payload)
links_data_records_checksum.append(datalinks_checksum)
else:
Expand Down
Loading