Skip to content

Commit 235889e

Browse files
authored
Merge pull request #1318 from ORCID/PD-3780_Script-to-add-the-missing-ORCID-iD-from-affiliations_Daniel-Palafox
feat: Add scripts to fix data in MongoDB
2 parents 241a238 + 357c293 commit 235889e

File tree

9 files changed

+1720
-0
lines changed

9 files changed

+1720
-0
lines changed
Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Script to add any missing ORCID iDs from affiliations.
4+
5+
This script adds any missing ORCID iDs from affiliations that have status 'IN_ORCID' and a valid access token.
6+
7+
Related to: https://app.clickup.com/t/9014437828/PD-3780
8+
9+
Usage:
10+
fix_short_sf_ids.py should be executed first
11+
python add_missing_ORCID_iD_from_affiliations.py
12+
"""
13+
14+
import argparse
15+
import sys
16+
from typing import List, Dict, Any
17+
from pymongo.errors import OperationFailure
18+
19+
# Import shared modules
20+
from logger_config import setup_logger
21+
from db_connection import MongoDBConnection
22+
from config import Config
23+
24+
# Set up logging
25+
# Module-level logger used by every function and method in this script.
logger = setup_logger(
    __name__,
    log_file='add-missing-ORCID-iD-from-affiliations.log',
)
26+
27+
28+
class AddMissingORCIDiDFROMAffiliations:
    """Back-fill the ``orcid_id`` field on assertion documents that lack it.

    An assertion is matched to an ORCID record when both carry the same
    ``email`` and at least one of the record's ``tokens`` has the assertion's
    ``salesforce_id``. The record's ``orcid`` value is then written to the
    assertion's ``orcid_id`` field.
    """

    def __init__(self, connection: MongoDBConnection, collection_assertion: str, collection_orcid_record: str):
        """Resolve both collections through ``connection.get_collection``.

        Args:
            connection: Connection object used to look up the collections.
            collection_assertion: Name of the collection holding assertions.
            collection_orcid_record: Name of the collection holding ORCID records.
        """
        self.connection = connection
        self.collection_assertion = connection.get_collection(collection_assertion)
        self.collection_orcid_record = connection.get_collection(collection_orcid_record)

    def _query_assertions_without_orcid(self) -> List[Dict[str, Any]]:
        """Return assertions with a non-empty ``put_code`` but no ``orcid_id`` field.

        Database errors are NOT caught here so that callers can tell
        "nothing found" apart from "query failed".
        """
        query = {
            'orcid_id': {'$exists': False},
            'put_code': {'$exists': True, '$ne': ''},
        }
        return list(self.collection_assertion.find(query))

    def find_problematic_assertions(self) -> List[Dict[str, Any]]:
        """
        Find assertions without ORCID iD.

        Returns:
            List of problematic assertion documents; an empty list when none
            are found or when the query fails (the failure is logged).
        """
        try:
            logger.info("Searching for problematic assertions...")
            assertions = self._query_assertions_without_orcid()
        except OperationFailure as e:
            logger.error(f"Failed to query affiliations: {e}")
            return []
        except Exception as e:
            logger.error(f"Unexpected error during query: {e}")
            return []

        logger.info(f"Found {len(assertions)} assertions to fix")
        return assertions

    def print_report(self, assertions: List[Dict[str, Any]]):
        """Log the ``_id`` and ``email`` of every assertion in ``assertions``."""
        if not assertions:
            logger.info("No problematic assertions found")
            return

        logger.info("\n" + "="*80)
        logger.info("PROBLEMATIC ASSERTIONS REPORT")
        logger.info("="*80)

        for rec in assertions:
            logger.info(f" _id: {rec.get('_id')}, Email: {rec.get('email')}")

        logger.info("\n" + "="*80)

    def _index_orcid_records_by_email(self) -> Dict[Any, List[Dict[str, Any]]]:
        """Group all ORCID records by ``email`` so each assertion needs one lookup.

        Records without an email are left out: matching "no email" against
        "no email" would pair unrelated documents.
        """
        by_email: Dict[Any, List[Dict[str, Any]]] = {}
        # Only the fields read while matching are fetched.
        projection = {'email': 1, 'tokens': 1, 'orcid': 1}
        for record in self.collection_orcid_record.find({}, projection):
            email = record.get('email')
            if email is not None:
                by_email.setdefault(email, []).append(record)
        return by_email

    def _apply_orcid_to_assertion(
        self,
        assertion: Dict[str, Any],
        records_by_email: Dict[Any, List[Dict[str, Any]]],
    ) -> int:
        """Write the matching record's ORCID iD to one assertion.

        Returns:
            Sum of ``modified_count`` over the updates issued for this assertion.
        """
        assertion_id = assertion["_id"]
        assertion_email = assertion.get('email')
        assertion_salesforce_id = assertion.get('salesforce_id')

        if assertion_email is None:
            logger.warning(f"Assertion id={assertion_id} has no email; skipped")
            return 0

        modified = 0
        for orcid_record in records_by_email.get(assertion_email, []):
            orcid_record_email = orcid_record.get('email')
            tokens = orcid_record.get("tokens") or []
            same_salesforce_id = any(
                t.get("salesforce_id") == assertion_salesforce_id for t in tokens
            )

            if not same_salesforce_id:
                logger.info(
                    f"Same assertion_email={assertion_email} and orcid_record_email={orcid_record_email} but not assertion_salesforce_id={assertion_salesforce_id}"
                )
                continue

            orcid = orcid_record.get("orcid")
            if not orcid:
                # Writing an empty value would create the field and hide the
                # assertion from the "orcid_id does not exist" query.
                logger.warning(
                    f"ORCID record for email={assertion_email} has no orcid value; assertion id={assertion_id} skipped"
                )
                continue

            # One update per matching record, however many tokens matched.
            result = self.collection_assertion.update_one(
                {"_id": assertion_id},
                {"$set": {"orcid_id": orcid}},
            )
            modified += result.modified_count
            logger.info(f"Assertion updated id:={assertion_id}, orcid={orcid}")

        return modified

    def find_assertions(self, assertions: List[Dict[str, Any]]) -> int:
        """
        Fix the assertions without ORCID iD.

        Despite its name, this method performs the updates.

        Args:
            assertions: Assertion documents to fix, as returned by
                ``find_problematic_assertions``.

        Returns:
            Number of assertions successfully updated. If an error interrupts
            the run, the count of updates already applied is returned.
        """
        if not assertions:
            logger.info("No assertions to fix")
            return 0

        logger.info(f"\n Applying fixes to {len(assertions)} assertions...")

        modified_count = 0
        # Loaded outside the try block so a failure here propagates to the caller.
        records_by_email = self._index_orcid_records_by_email()

        try:
            for assertion in assertions:
                modified_count += self._apply_orcid_to_assertion(assertion, records_by_email)

            logger.info(f" Successfully updated {modified_count} assertions")
        except OperationFailure as e:
            logger.error(
                f" Failed to update assertions: {e} ({modified_count} updated before the failure)"
            )
        except Exception as e:
            logger.error(
                f" Unexpected error during update: {e} ({modified_count} updated before the failure)"
            )

        return modified_count

    def verify_fixes(self) -> bool:
        """Re-run the search and report whether any assertion still lacks an ORCID iD.

        Returns:
            True when no such assertion remains; False when some remain or
            when the verification query itself fails.
        """
        logger.info("\n Verifying fixes...")
        try:
            remaining = self._query_assertions_without_orcid()
        except Exception as e:
            # A failed query must not be mistaken for "nothing left to fix".
            logger.error(f" Verification could not be completed: {e}")
            return False

        if remaining:
            logger.warning(
                f" Verification failed: {len(remaining)} assertions without ORCID iD still exist"
            )
            return False

        logger.info(" Verification passed: No assertions without ORCID iD found")
        return True
153+
154+
def parse_arguments():
    """Parse the command-line options of this script.

    Returns:
        argparse.Namespace with ``mongo_uri``, ``database`` and ``collection``
        attributes; each is None when the option is not given.
    """
    parser = argparse.ArgumentParser(
        description='Add missing ORCID iDs to assertions (affiliations)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Interactive mode
  # fix_short_sf_ids.py should be executed first
  python add_missing_ORCID_iD_from_affiliations.py

Environment Variables:
  MONGO_URI         - MongoDB connection string (default: mongodb://localhost:27017)
  MONGO_DATABASE    - Database name (default: assertionservice)
  MONGO_COLLECTION  - Collection name (default: assertion)
"""
    )

    parser.add_argument('--mongo-uri', help='MongoDB URI (overrides env)')
    parser.add_argument('--database', help='Database name (overrides env)')
    # Kept so existing invocations keep working; main() does not read it.
    parser.add_argument(
        '--collection',
        help='Collection name (accepted for compatibility; this script always '
             'uses the assertion and orcid_record collections)',
    )

    return parser.parse_args()
176+
177+
178+
def _redact_mongo_uri(uri: str) -> str:
    """Return ``scheme://[***@]hosts`` for logging, never the credentials."""
    scheme, sep, rest = uri.partition('://')
    if not sep:
        # Not in scheme://... form; show nothing rather than risk a secret.
        return '***'
    # Everything before the last '@' is user info (user:password).
    _, at, location = rest.rpartition('@')
    hosts = location.split('/', 1)[0].split('?', 1)[0]
    return f"{scheme}://{'***@' if at else ''}{hosts}"


def main():
    """Find assertions without an ORCID iD, ask for confirmation, then fix them.

    Returns:
        Process exit status: 0 when the run completes, nothing needs fixing or
        the user declines; 1 when the connection fails, the prompt is
        interrupted, verification fails or an unexpected error occurs.
    """
    args = parse_arguments()

    config = Config()

    mongo_uri = args.mongo_uri or config.mongo_uri
    database = args.database or config.mongo_database

    separator = "=" * 80

    logger.info(separator)
    logger.info("Add missing ORCID iD and correct SF iD")
    logger.info(separator)
    logger.info(f"Database: {database}")
    logger.info("Collections: assertion, orcid_record")
    # The first characters of a connection string can hold user:password,
    # so the URI is redacted rather than truncated.
    logger.info(f"MongoDB URI: {_redact_mongo_uri(mongo_uri)}")
    logger.info(separator + "\n")

    connection = MongoDBConnection(mongo_uri, database)
    collection_assertion = 'assertion'
    collection_orcid_record = 'orcid_record'

    try:
        if not connection.connect():
            logger.error("Failed to connect to MongoDB. Exiting.")
            return 1

        fixer = AddMissingORCIDiDFROMAffiliations(connection, collection_assertion, collection_orcid_record)

        assertions = fixer.find_problematic_assertions()

        fixer.print_report(assertions)

        if not assertions:
            logger.info("\n No fixes needed. No assertions are missing an ORCID iD.")
            return 0

        logger.info("\n" + separator)
        logger.info(" WARNING: This will modify the database!")
        logger.info(f" Up to {len(assertions)} assertions will be updated")
        logger.info(separator)

        try:
            response = input("\nDo you want to proceed? (yes/no): ").strip().lower()
            if response not in ['yes', 'y']:
                logger.info("\n Operation cancelled by user")
                return 0
        except (KeyboardInterrupt, EOFError):
            logger.info("\n\n Operation cancelled by user")
            return 1

        updated_count = fixer.find_assertions(assertions)

        if updated_count > 0:
            if not fixer.verify_fixes():
                logger.warning("\n Some assertions may still need attention")
                return 1
        else:
            logger.warning("\n No assertions could be updated; see the log above for details")

        logger.info("\n" + separator)
        logger.info("Script completed successfully")
        logger.info(separator)
        return 0

    except KeyboardInterrupt:
        logger.info("\n\n Operation cancelled by user (Ctrl+C)")
        return 1
    except Exception as e:
        logger.error(f"\n Unexpected error: {e}", exc_info=True)
        return 1
    finally:
        connection.disconnect()
247+
248+
249+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Handles loading configuration from environment variables with sensible defaults.
4+
"""
5+
6+
import os
7+
from typing import Dict, Optional
8+
9+
10+
class Config:
    """
    Configuration manager for scripts.

    Reads the MongoDB settings from environment variables once, at
    construction time, falling back to built-in defaults when a variable is
    unset or empty:

    * ``MONGO_URI``        -> ``mongo_uri``        (default ``mongodb://localhost:27017``)
    * ``MONGO_DATABASE``   -> ``mongo_database``   (default ``assertionservice``)
    * ``MONGO_COLLECTION`` -> ``mongo_collection`` (default ``assertion``)
    """

    def __init__(self):
        # MongoDB connection settings
        self.mongo_uri = self._get_mongo_uri()
        self.mongo_database = self._get_env('MONGO_DATABASE', 'assertionservice')
        self.mongo_collection = self._get_env('MONGO_COLLECTION', 'assertion')

    def _get_mongo_uri(self) -> str:
        """Return ``MONGO_URI`` or the local default connection string."""
        return self._get_env('MONGO_URI', 'mongodb://localhost:27017')

    def _get_env(self, primary_key: str, default: str) -> str:
        """Return the environment value for ``primary_key``, or ``default`` if unset or empty."""
        return os.getenv(primary_key) or default

    def to_dict(self) -> Dict[str, str]:
        """Return the settings as a plain dict; ``mongo_uri`` is NOT redacted here."""
        return {
            'mongo_uri': self.mongo_uri,
            'mongo_database': self.mongo_database,
            'mongo_collection': self.mongo_collection,
        }

    def _masked_uri(self) -> str:
        """Return ``scheme://[***@]hosts`` so that credentials never reach logs."""
        scheme, sep, rest = self.mongo_uri.partition('://')
        if not sep:
            # Not in scheme://... form; show nothing rather than risk a secret.
            return '***'
        # Everything before the last '@' is user info (user:password).
        _, at, location = rest.rpartition('@')
        hosts = location.split('/', 1)[0].split('?', 1)[0]
        return f"{scheme}://{'***@' if at else ''}{hosts}"

    def __repr__(self) -> str:
        return (
            f"Config(database={self.mongo_database}, "
            f"collection={self.mongo_collection}, "
            f"uri={self._masked_uri()})"
        )
47+
48+
49+
def load_config() -> Config:
    """Build and return a new ``Config`` from the current environment."""
    config = Config()
    return config
51+
52+
53+
def get_mongo_uri(default: str = 'mongodb://localhost:27017') -> str:
    """Return ``MONGO_URI`` from the environment, or ``default`` if unset or empty."""
    configured = os.getenv('MONGO_URI')
    if configured:
        return configured
    return default
55+
56+
57+
def get_database_name(default: str = 'assertionservice') -> str:
    """Return ``MONGO_DATABASE`` from the environment, or ``default`` if unset or empty.

    The built-in default matches the database name used by ``Config``.
    """
    return os.getenv('MONGO_DATABASE') or default
59+
60+
61+
def get_collection_name(default: str = 'assertion') -> str:
    """Return ``MONGO_COLLECTION`` from the environment, or ``default`` if unset or empty."""
    configured = os.getenv('MONGO_COLLECTION')
    return configured if configured else default

0 commit comments

Comments
 (0)