Skip to content

Commit a0a9800

Browse files
authored
Merge pull request #11584 from sh-andriy/feature/ENG-10093_Write-script-to-migrate-CRFIDs-to-RORIDs
[ENG-10093] | feat(osf): script to migrate Crossref Funder IDs to ROR IDs
2 parents 216d72d + 2d25029 commit a0a9800

File tree

2 files changed

+766
-0
lines changed

2 files changed

+766
-0
lines changed
Lines changed: 347 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,347 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Management command to migrate Crossref Funder IDs to ROR IDs.
4+
5+
This script reads a CSV mapping file and updates all GuidMetadataRecord entries
6+
that have funding_info with Crossref Funder IDs, converting them to ROR IDs.
7+
8+
Usage:
9+
# Dry run (recommended first)
10+
python manage.py migrate_funder_ids_to_ror --csv-file /path/to/mapping.csv --dry-run
11+
12+
# Actual migration
13+
python manage.py migrate_funder_ids_to_ror --csv-file /path/to/mapping.csv
14+
15+
CSV Format Expected (tab or comma separated):
16+
Funder Name, ROR ID, ROR Name, Crossref DOI, Funder ID
17+
Example:
18+
National Science Foundation, https://ror.org/021nxhr62, National Science Foundation, http://dx.doi.org/10.13039/100000001, 100000001
19+
"""
20+
import csv
21+
import logging
22+
import re
23+
24+
from django.core.management.base import BaseCommand
25+
from django.db import transaction
26+
27+
from osf.models import GuidMetadataRecord
28+
29+
30+
logger = logging.getLogger(__name__)
31+
32+
33+
class Command(BaseCommand):
    help = 'Migrate Crossref Funder IDs to ROR IDs in GuidMetadataRecord.funding_info'

    def add_arguments(self, parser):
        """Register the command-line options for the migration."""
        # Table-driven registration keeps flags, defaults, and help text in one place.
        option_specs = (
            (('--csv-file',), {
                'type': str,
                'required': True,
                'help': 'Path to the CSV file containing the Crossref to ROR mapping.',
            }),
            (('--dry-run',), {
                'action': 'store_true',
                'dest': 'dry_run',
                'help': 'Run without making any changes to the database.',
            }),
            (('--batch-size',), {
                'type': int,
                'default': 1000,
                'help': 'Number of records to process in each batch (default: 1000).',
            }),
            (('--update-funder-name',), {
                'action': 'store_true',
                'dest': 'update_funder_name',
                'help': 'Also update funder_name to the ROR name from the mapping.',
            }),
            (('--skip-reindex',), {
                'action': 'store_true',
                'dest': 'skip_reindex',
                'help': 'Skip triggering SHARE/DataCite re-indexing after migration. '
                        'Use this if you plan to run recatalog_metadata separately.',
            }),
        )
        for flags, kwargs in option_specs:
            parser.add_argument(*flags, **kwargs)
def handle(self, *args, **options):
70+
csv_file = options['csv_file']
71+
dry_run = options['dry_run']
72+
batch_size = options['batch_size']
73+
update_funder_name = options['update_funder_name']
74+
reindex = not options['skip_reindex']
75+
76+
if dry_run:
77+
self.stdout.write(self.style.WARNING('[DRY RUN] No changes will be made to the database.'))
78+
79+
if not reindex:
80+
self.stdout.write(self.style.WARNING('Re-indexing is disabled. Run recatalog_metadata after migration.'))
81+
82+
# Load the mapping
83+
mapping = self.load_mapping(csv_file)
84+
if not mapping:
85+
self.stdout.write(self.style.ERROR('No valid mappings found in CSV file.'))
86+
return
87+
88+
self.stdout.write(f'Loaded {len(mapping)} Crossref to ROR mappings.')
89+
90+
# Find and update records
91+
stats = self.migrate_records(mapping, dry_run, batch_size, update_funder_name, reindex)
92+
93+
# Print summary
94+
self.stdout.write('\n' + '=' * 60)
95+
self.stdout.write(self.style.SUCCESS('Migration Summary:'))
96+
self.stdout.write(f" Records scanned: {stats['scanned']}")
97+
self.stdout.write(f" Records updated: {stats['updated']}")
98+
self.stdout.write(f" Records re-indexed: {stats['reindexed']}")
99+
self.stdout.write(f" Funders migrated: {stats['funders_migrated']}")
100+
self.stdout.write(f" Funders not in mapping: {stats['not_in_mapping']}")
101+
if stats['errors']:
102+
self.stdout.write(self.style.ERROR(f" Errors: {stats['errors']}"))
103+
104+
if stats['unmapped_ids']:
105+
self.stdout.write('\nUnmapped Crossref Funder IDs (not in CSV):')
106+
for funder_id in sorted(stats['unmapped_ids'])[:50]: # Show first 50
107+
self.stdout.write(f' - {funder_id}')
108+
if len(stats['unmapped_ids']) > 50:
109+
self.stdout.write(f' ... and {len(stats["unmapped_ids"]) - 50} more')
110+
111+
def load_mapping(self, csv_file):
112+
"""Load the Crossref to ROR mapping from CSV file.
113+
114+
Returns a dict mapping various forms of Crossref ID to ROR info:
115+
{
116+
'100000001': {'ror_id': 'https://ror.org/021nxhr62', 'ror_name': 'National Science Foundation'},
117+
'http://dx.doi.org/10.13039/100000001': {...},
118+
'https://doi.org/10.13039/100000001': {...},
119+
...
120+
}
121+
"""
122+
mapping = {}
123+
124+
try:
125+
with open(csv_file, 'r', encoding='utf-8-sig') as f:
126+
# Try to detect delimiter
127+
sample = f.read(2048)
128+
f.seek(0)
129+
if '\t' in sample:
130+
delimiter = '\t'
131+
else:
132+
delimiter = ','
133+
134+
reader = csv.DictReader(f, delimiter=delimiter)
135+
136+
# Normalize column names (handle various formats)
137+
for row in reader:
138+
# Try to find the relevant columns
139+
ror_id = None
140+
ror_name = None
141+
crossref_doi = None
142+
funder_id = None
143+
144+
for key, value in row.items():
145+
if not key:
146+
continue
147+
key_lower = key.lower().strip()
148+
149+
if 'ror' in key_lower and 'id' in key_lower and 'ror_name' not in key_lower:
150+
ror_id = value.strip() if value else None
151+
elif 'ror' in key_lower and 'name' in key_lower:
152+
ror_name = value.strip() if value else None
153+
elif 'crossref' in key_lower and 'doi' in key_lower:
154+
crossref_doi = value.strip() if value else None
155+
elif key_lower == 'funder id' or key_lower == 'funder_id':
156+
funder_id = value.strip() if value else None
157+
158+
if not ror_id:
159+
continue
160+
161+
ror_info = {
162+
'ror_id': ror_id,
163+
'ror_name': ror_name,
164+
}
165+
166+
# Add mappings for various ID formats
167+
if funder_id:
168+
mapping[funder_id] = ror_info
169+
# Also add with various DOI prefixes
170+
mapping[f'http://dx.doi.org/10.13039/{funder_id}'] = ror_info
171+
mapping[f'https://doi.org/10.13039/{funder_id}'] = ror_info
172+
mapping[f'10.13039/{funder_id}'] = ror_info
173+
174+
if crossref_doi:
175+
mapping[crossref_doi] = ror_info
176+
# Normalize the DOI URL
177+
if crossref_doi.startswith('http://'):
178+
mapping[crossref_doi.replace('http://', 'https://')] = ror_info
179+
elif crossref_doi.startswith('https://'):
180+
mapping[crossref_doi.replace('https://', 'http://')] = ror_info
181+
182+
except FileNotFoundError:
183+
self.stdout.write(self.style.ERROR(f'CSV file not found: {csv_file}'))
184+
return None
185+
except Exception as e:
186+
self.stdout.write(self.style.ERROR(f'Error reading CSV file: {e}'))
187+
return None
188+
189+
return mapping
190+
191+
def extract_funder_id(self, identifier):
192+
"""Extract the numeric funder ID from various identifier formats."""
193+
if not identifier:
194+
return None
195+
196+
# Already just a number
197+
if re.match(r'^\d+$', identifier):
198+
return identifier
199+
200+
# Extract from DOI URL (e.g., http://dx.doi.org/10.13039/100000001)
201+
match = re.search(r'10\.13039/(\d+)', identifier)
202+
if match:
203+
return match.group(1)
204+
205+
return identifier
206+
207+
def migrate_records(self, mapping, dry_run, batch_size, update_funder_name, reindex):
208+
"""Find and migrate all GuidMetadataRecord entries with Crossref Funder IDs."""
209+
stats = {
210+
'scanned': 0,
211+
'updated': 0,
212+
'reindexed': 0,
213+
'funders_migrated': 0,
214+
'not_in_mapping': 0,
215+
'errors': 0,
216+
'unmapped_ids': set(),
217+
}
218+
219+
# Query records that have non-empty funding_info
220+
# We need to check if any funder has 'Crossref Funder ID' type
221+
queryset = GuidMetadataRecord.objects.exclude(funding_info=[]).exclude(funding_info__isnull=True)
222+
223+
total_count = queryset.count()
224+
self.stdout.write(f'Found {total_count} records with funding_info to scan.')
225+
226+
processed = 0
227+
for record in queryset.iterator(chunk_size=batch_size):
228+
stats['scanned'] += 1
229+
processed += 1
230+
231+
if processed % 500 == 0:
232+
self.stdout.write(f' Processed {processed}/{total_count} records...')
233+
234+
try:
235+
updated, funder_stats = self.migrate_record(record, mapping, dry_run, update_funder_name)
236+
if updated:
237+
stats['updated'] += 1
238+
if reindex and not dry_run:
239+
try:
240+
self.reindex_record(record)
241+
stats['reindexed'] += 1
242+
except Exception as e:
243+
logger.error(f'Error re-indexing record {record.guid._id}: {e}')
244+
stats['funders_migrated'] += funder_stats['migrated']
245+
stats['not_in_mapping'] += funder_stats['not_found']
246+
stats['unmapped_ids'].update(funder_stats['unmapped_ids'])
247+
except Exception as e:
248+
stats['errors'] += 1
249+
logger.error(f'Error migrating record {record.guid._id}: {e}')
250+
251+
return stats
252+
253+
def migrate_record(self, record, mapping, dry_run, update_funder_name):
254+
"""Migrate a single GuidMetadataRecord's funding_info.
255+
256+
Returns (was_updated, funder_stats)
257+
"""
258+
funder_stats = {
259+
'migrated': 0,
260+
'not_found': 0,
261+
'unmapped_ids': set(),
262+
}
263+
264+
if not record.funding_info:
265+
return False, funder_stats
266+
267+
updated_funding_info = []
268+
record_modified = False
269+
270+
for funder in record.funding_info:
271+
funder_type = funder.get('funder_identifier_type', '')
272+
funder_identifier = funder.get('funder_identifier', '')
273+
274+
# Only migrate Crossref Funder IDs (includes legacy 'Crossref Funder URI' type)
275+
if funder_type not in ('Crossref Funder ID', 'Crossref Funder URI'):
276+
updated_funding_info.append(funder)
277+
continue
278+
279+
# Try to find in mapping
280+
ror_info = None
281+
282+
# Try exact match first
283+
if funder_identifier in mapping:
284+
ror_info = mapping[funder_identifier]
285+
else:
286+
# Try to extract numeric ID and look up
287+
numeric_id = self.extract_funder_id(funder_identifier)
288+
if numeric_id and numeric_id in mapping:
289+
ror_info = mapping[numeric_id]
290+
291+
if ror_info:
292+
# Create updated funder entry
293+
updated_funder = funder.copy()
294+
updated_funder['funder_identifier'] = ror_info['ror_id']
295+
updated_funder['funder_identifier_type'] = 'ROR'
296+
297+
if update_funder_name and ror_info.get('ror_name'):
298+
updated_funder['funder_name'] = ror_info['ror_name']
299+
300+
updated_funding_info.append(updated_funder)
301+
record_modified = True
302+
funder_stats['migrated'] += 1
303+
304+
logger.info(
305+
f'{"[DRY RUN] " if dry_run else ""}'
306+
f'Migrating funder in {record.guid._id}: '
307+
f'{funder_identifier} -> {ror_info["ror_id"]}'
308+
)
309+
else:
310+
# No mapping found, keep original
311+
updated_funding_info.append(funder)
312+
funder_stats['not_found'] += 1
313+
funder_stats['unmapped_ids'].add(funder_identifier)
314+
315+
logger.warning(
316+
f'No ROR mapping found for Crossref Funder ID: {funder_identifier} '
317+
f'in record {record.guid._id}'
318+
)
319+
320+
# Warn about duplicate ROR IDs that would result from migration
321+
if record_modified:
322+
ror_identifiers = [
323+
f['funder_identifier']
324+
for f in updated_funding_info
325+
if f.get('funder_identifier_type') == 'ROR'
326+
]
327+
seen = set()
328+
duplicates = {rid for rid in ror_identifiers if rid in seen or seen.add(rid)}
329+
if duplicates:
330+
logger.warning(
331+
f'Record {record.guid._id} has duplicate ROR IDs after migration: {duplicates}'
332+
)
333+
334+
if record_modified and not dry_run:
335+
with transaction.atomic():
336+
record.funding_info = updated_funding_info
337+
record.save(update_fields=['funding_info'])
338+
339+
return record_modified, funder_stats
340+
341+
def reindex_record(self, record):
342+
"""Trigger SHARE/ElasticSearch and DataCite re-indexing for the record's referent."""
343+
referent = record.guid.referent
344+
if hasattr(referent, 'update_search'):
345+
referent.update_search()
346+
if hasattr(referent, 'request_identifier_update'):
347+
referent.request_identifier_update('doi')

0 commit comments

Comments
 (0)