Skip to content

Commit 2d376ef

Browse files
committed
Adds OpenAlex integration for publications, closes #171
Enables matching publications against OpenAlex data to enrich metadata.
- Adds new fields to the Publication model to store OpenAlex data (ID, match info, fulltext origin, retracted status, IDs, keywords, open access status, topics).
- Implements a management command to backfill OpenAlex data for existing publications.
- Creates an OpenAlexMatcher class to handle API requests and matching logic, including DOI- and title-based matching strategies.
- Updates the admin interface to display OpenAlex information and links.
- Modifies harvesting tasks to incorporate OpenAlex matching during publication creation.
- Enriches the API and UI with OpenAlex data and links.
1 parent 751aaf3 commit 2d376ef

File tree

10 files changed

+651
-11
lines changed

10 files changed

+651
-11
lines changed

.claude/settings.local.json

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,11 @@
99
"Bash(pip search:*)",
1010
"Bash(psql:*)",
1111
"Bash(OPTIMAP_LOGGING_LEVEL=WARNING python manage.py test tests.test_work_landing_page.PublicationStatusVisibilityTest)",
12-
"Bash(curl:*)"
12+
"Bash(curl:*)",
13+
"Bash(python manage.py makemigrations:*)",
14+
"Bash(source:*)",
15+
"Bash(python manage.py shell:*)",
16+
"Bash(python manage.py:*)"
1317
],
1418
"deny": [],
1519
"ask": []

publications/admin.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -143,13 +143,17 @@ def regenerate_all_exports(modeladmin, request, queryset):
143143
class PublicationAdmin(LeafletGeoAdmin, ImportExportModelAdmin):
144144
list_display = ("title", "doi", "has_permalink", "permalink_link",
145145
"creationDate", "lastUpdate", "created_by", "updated_by",
146-
"status", "provenance", "source")
147-
search_fields = ("title", "doi", "abstract", "source__name")
148-
list_filter = ("status", "creationDate")
146+
"status", "provenance", "source", "openalex_id")
147+
search_fields = ("title", "doi", "abstract", "source__name", "openalex_id")
148+
list_filter = ("status", "creationDate", "openalex_is_retracted", "openalex_open_access_status")
149149
fields = ("title", "doi", "status", "source", "abstract",
150150
"geometry", "timeperiod_startdate", "timeperiod_enddate",
151-
"created_by", "updated_by", "provenance")
152-
readonly_fields = ("created_by", "updated_by")
151+
"created_by", "updated_by", "provenance",
152+
"openalex_id", "openalex_link", "openalex_match_info",
153+
"openalex_fulltext_origin", "openalex_is_retracted",
154+
"openalex_ids", "openalex_keywords", "openalex_open_access_status",
155+
"openalex_topics")
156+
readonly_fields = ("created_by", "updated_by", "openalex_link")
153157
actions = ["make_public", "make_draft", "regenerate_all_exports",
154158
"export_permalinks_csv", "email_permalinks_preview"]
155159

@@ -162,6 +166,12 @@ def permalink_link(self, obj):
162166
url = obj.permalink()
163167
return format_html('<a href="{}" target="_blank">{}</a>', url, url) if url else "—"
164168

169+
@admin.display(description="OpenAlex Link")
def openalex_link(self, obj):
    """Render a read-only link to the publication's OpenAlex record.

    Returns an em dash placeholder when the publication has no
    ``openalex_id``; otherwise an HTML anchor that opens in a new tab.
    """
    if not obj.openalex_id:
        return "—"
    # NOTE(review): the stored ID is used verbatim as the href, so it
    # presumably holds the full OpenAlex URL — confirm against the matcher.
    # rel="noopener noreferrer" keeps the externally hosted page from
    # reaching back into the admin tab through window.opener.
    return format_html(
        '<a href="{}" target="_blank" rel="noopener noreferrer">'
        '<i class="fas fa-external-link-alt"></i> View in OpenAlex</a>',
        obj.openalex_id,
    )
174+
165175
def export_permalinks_csv(self, request, queryset):
166176
rows = [("title", "doi", "permalink")]
167177
rows += [(p.title or "", p.doi, p.permalink() or "")
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
"""
2+
Management command to backfill OpenAlex data for existing publications.
3+
4+
Usage:
5+
python manage.py backfill_openalex --all
6+
python manage.py backfill_openalex --limit 100
7+
python manage.py backfill_openalex --only-missing
8+
"""
9+
10+
import logging
11+
from django.core.management.base import BaseCommand
12+
from publications.models import Publication
13+
from publications.openalex_matcher import get_openalex_matcher
14+
15+
logger = logging.getLogger(__name__)
16+
17+
18+
class Command(BaseCommand):
    """Backfill OpenAlex data onto existing ``Publication`` records.

    Every selected publication is handed to the OpenAlex matcher:

    * a full match copies each returned field onto the publication,
    * partial matches are stored in ``openalex_match_info`` and the
      publication's ``openalex_id`` is reset to ``None``,
    * publications without any candidate are only logged.

    With ``--dry-run`` the matcher is still queried, but nothing is saved.
    """

    help = 'Backfill OpenAlex data for existing publications'

    # Number of title characters echoed in console and log lines.
    TITLE_PREVIEW_LENGTH = 50

    def add_arguments(self, parser):
        """Register the command-line options."""
        # The two scope flags contradict each other; let argparse reject the
        # combination instead of silently letting --all win.
        scope = parser.add_mutually_exclusive_group()
        scope.add_argument(
            '--all',
            action='store_true',
            help='Process all publications (re-match even if OpenAlex ID exists)',
        )
        scope.add_argument(
            '--only-missing',
            action='store_true',
            help='Only process publications without OpenAlex ID (default)',
        )
        parser.add_argument(
            '--limit',
            type=int,
            default=None,
            help='Maximum number of publications to process',
        )
        parser.add_argument(
            '--dry-run',
            action='store_true',
            help='Show what would be done without making changes',
        )

    def handle(self, *args, **options):
        """Run the backfill and print per-publication results plus a summary."""
        dry_run = options['dry_run']

        query = self._select_publications(options['all'], options['limit'])
        total = query.count()
        self.stdout.write(self.style.SUCCESS(f'\nProcessing {total} publications...\n'))

        if dry_run:
            self.stdout.write(self.style.WARNING('DRY RUN MODE - No changes will be saved\n'))

        matcher = get_openalex_matcher()

        # Errors are tracked separately from "no match" so the summary does
        # not report a crashed lookup as a publication unknown to OpenAlex.
        counts = {'matched': 0, 'partial': 0, 'no_match': 0, 'errors': 0}
        processed = 0

        for pub in query:
            try:
                outcome = self._process_publication(pub, matcher, dry_run)
            except Exception as e:
                # Per-item boundary: one failing publication must not abort
                # the whole backfill, so log it (with traceback) and go on.
                outcome = 'errors'
                logger.exception('Error processing publication %s', pub.id)
                self.stdout.write(
                    self.style.ERROR(
                        f'  ✗ [{pub.id}] Error: {self._title_preview(pub)}... - {e}'
                    )
                )

            counts[outcome] += 1
            processed += 1

            # Report after the item is counted so the numbers add up.
            if processed % 10 == 0:
                self.stdout.write(
                    f'Progress: {processed}/{total} '
                    f'({counts["matched"]} matched, {counts["partial"]} partial, '
                    f'{counts["no_match"]} unmatched, {counts["errors"]} errors)'
                )

        # Print summary
        self.stdout.write(self.style.SUCCESS(f'\n{"="*70}'))
        self.stdout.write(self.style.SUCCESS('Backfill Complete'))
        self.stdout.write(self.style.SUCCESS(f'{"="*70}\n'))
        self.stdout.write(f'Total processed: {processed}')
        self.stdout.write(self.style.SUCCESS(f'Perfect matches: {counts["matched"]}'))
        self.stdout.write(self.style.WARNING(f'Partial matches: {counts["partial"]}'))
        self.stdout.write(self.style.ERROR(f'No match: {counts["no_match"]}'))
        self.stdout.write(self.style.ERROR(f'Errors: {counts["errors"]}'))

        if dry_run:
            self.stdout.write(self.style.WARNING('\n(DRY RUN - No changes were saved)'))

    def _select_publications(self, process_all, limit):
        """Return the queryset of publications to process."""
        query = Publication.objects.all()

        if not process_all:
            # Default scope: publications without an OpenAlex ID. The field
            # allows blank values, so an empty string counts as missing too.
            query = query.filter(openalex_id__isnull=True) | query.filter(openalex_id='')

        if limit:
            query = query[:limit]

        return query

    def _title_preview(self, pub):
        """Return a shortened title for output; tolerates a missing title."""
        return (pub.title or '')[:self.TITLE_PREVIEW_LENGTH]

    def _process_publication(self, pub, matcher, dry_run):
        """Match one publication and persist the result unless ``dry_run``.

        Returns the outcome key: ``'matched'``, ``'partial'`` or
        ``'no_match'``. Exceptions raised by the matcher or by saving
        propagate to the caller.
        """
        title = self._title_preview(pub)

        # No author is derived from the publication yet, so matching relies
        # on title and DOI only.
        openalex_data, partial_matches = matcher.match_publication(
            title=pub.title,
            doi=pub.doi,
            author=None,
        )

        if openalex_data:
            # Perfect match found
            self.stdout.write(
                self.style.SUCCESS(
                    f'  ✓ [{pub.id}] Matched: {title}... -> {openalex_data.get("openalex_id", "N/A")}'
                )
            )
            if not dry_run:
                # Keys of the matcher result are used as model field names.
                for field, value in openalex_data.items():
                    setattr(pub, field, value)
                pub.save()
            return 'matched'

        if partial_matches:
            # Only candidates found: keep them for manual review.
            self.stdout.write(
                self.style.WARNING(
                    f'  ~ [{pub.id}] Partial matches: {title}... ({len(partial_matches)} candidates)'
                )
            )
            if not dry_run:
                pub.openalex_id = None
                pub.openalex_match_info = partial_matches
                pub.save()
            return 'partial'

        # No match found
        if pub.doi:
            # Log as warning if DOI exists - OpenAlex should have it
            logger.warning(
                'No OpenAlex match for publication %s with DOI %s: %s',
                pub.id, pub.doi, title,
            )
        else:
            logger.debug('No OpenAlex match for publication %s: %s', pub.id, title)
        return 'no_match'
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# Generated by Django 5.1.9 on 2025-10-15 10:25
2+
3+
import django.contrib.postgres.fields
4+
from django.db import migrations, models
5+
6+
7+
class Migration(migrations.Migration):
    """Add the OpenAlex columns to ``publication`` and re-declare ``status``.

    Operations run in the order listed: eight ``AddField`` operations for the
    OpenAlex data, followed by ``AlterField`` on ``emaillog.status`` and
    ``publication.status``.
    """

    dependencies = [("publications", "0001_initial")]

    # Status choices used by both ``status`` AlterField operations below.
    _STATUS_CHOICES = (
        ("d", "Draft"),
        ("p", "Published"),
        ("t", "Testing"),
        ("w", "Withdrawn"),
        ("h", "Harvested"),
        ("c", "Contributed"),
    )

    operations = [
        migrations.AddField(
            model_name="publication",
            name="openalex_fulltext_origin",
            field=models.CharField(max_length=255, blank=True, null=True),
        ),
        migrations.AddField(
            model_name="publication",
            name="openalex_id",
            field=models.CharField(
                max_length=255, blank=True, null=True, db_index=True
            ),
        ),
        migrations.AddField(
            model_name="publication",
            name="openalex_ids",
            field=models.JSONField(
                help_text="OpenAlex IDs object (doi, pmid, etc)",
                blank=True,
                null=True,
            ),
        ),
        migrations.AddField(
            model_name="publication",
            name="openalex_is_retracted",
            field=models.BooleanField(default=False),
        ),
        migrations.AddField(
            model_name="publication",
            name="openalex_keywords",
            field=django.contrib.postgres.fields.ArrayField(
                base_field=models.CharField(max_length=255),
                size=None,
                blank=True,
                null=True,
            ),
        ),
        migrations.AddField(
            model_name="publication",
            name="openalex_match_info",
            field=models.JSONField(
                help_text="Information about partial matches found",
                blank=True,
                null=True,
            ),
        ),
        migrations.AddField(
            model_name="publication",
            name="openalex_open_access_status",
            field=models.CharField(max_length=50, blank=True, null=True),
        ),
        migrations.AddField(
            model_name="publication",
            name="openalex_topics",
            field=django.contrib.postgres.fields.ArrayField(
                base_field=models.CharField(max_length=255),
                size=None,
                blank=True,
                null=True,
            ),
        ),
        # NOTE(review): the default "success" is not one of the choices given
        # here; presumably the EmailLog model's ``status`` declaration needs a
        # look — confirm before relying on this field's validation.
        migrations.AlterField(
            model_name="emaillog",
            name="status",
            field=models.CharField(
                choices=list(_STATUS_CHOICES),
                default="success",
                max_length=10,
            ),
        ),
        migrations.AlterField(
            model_name="publication",
            name="status",
            field=models.CharField(
                choices=list(_STATUS_CHOICES),
                default="d",
                max_length=1,
            ),
        ),
    ]

publications/models.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,16 @@ class Publication(models.Model):
6565
'HarvestingEvent', on_delete=models.CASCADE, related_name='publications', null=True, blank=True
6666
)
6767

68+
# OpenAlex integration fields
69+
openalex_id = models.CharField(max_length=255, blank=True, null=True, db_index=True)
70+
openalex_match_info = models.JSONField(blank=True, null=True, help_text="Information about partial matches found")
71+
openalex_fulltext_origin = models.CharField(max_length=255, blank=True, null=True)
72+
openalex_is_retracted = models.BooleanField(default=False)
73+
openalex_ids = models.JSONField(blank=True, null=True, help_text="OpenAlex IDs object (doi, pmid, etc)")
74+
openalex_keywords = ArrayField(models.CharField(max_length=255), blank=True, null=True)
75+
openalex_open_access_status = models.CharField(max_length=50, blank=True, null=True)
76+
openalex_topics = ArrayField(models.CharField(max_length=255), blank=True, null=True)
77+
6878
class Meta:
6979
ordering = ['-id']
7080
constraints = [

0 commit comments

Comments
 (0)