|
| 1 | +import logging |
| 2 | + |
| 3 | +import requests |
| 4 | +from django.conf import settings |
| 5 | + |
| 6 | +from direct_indexing.document_summarisation.const import ( |
| 7 | + ALL_FIELDS, DATASET_FIELDS, DOC_LINK_FIELDS, EXTRA_FIELDS, HASH, IDENTIFIER |
| 8 | +) |
| 9 | + |
| 10 | + |
def retrieve_document_links():
    """
    Fetch every activity that carries a document link and flatten the result.

    The activity Solr core is queried for all records that have a
    `document-link.url` value, requesting only the fields in ALL_FIELDS.
    Per the IATI standard, @url is mandatory on every document link:
    https://iatistandard.org/en/iati-standard/203/activity-standard/iati-activities/iati-activity/document-link

    :return: the output of _format_document_links for the returned Solr docs
    """
    logging.info('_retrieve_document_links:: Retrieving document links from Solr')
    # The field names are joined with a URL-encoded comma for the `fl` parameter.
    requested_fields = '%2C'.join(ALL_FIELDS)
    solr_query = 'document-link.url:*'
    select_url = (
        'http://localhost:8983/solr/activity/select'
        f'?fl={requested_fields}&q.op=OR&q={solr_query}&rows=10000000'
    )
    response = requests.get(select_url)
    activities = response.json()['response']['docs']
    return _format_document_links(activities)
| 23 | + |
| 24 | + |
| 25 | +def _format_document_links(data): |
| 26 | + document_list = [] |
| 27 | + # loop over the activities |
| 28 | + for activity in data: |
| 29 | + for index in range(len(activity['document-link.url'])): |
| 30 | + document_list.append(_extract_doc(activity, index)) |
| 31 | + return document_list |
| 32 | + |
| 33 | + |
def _extract_doc(activity, index):
    """
    Build the metadata record for one document link of an activity.

    :param activity: activity dict to read the values from
    :param index: position used to pick the value out of each DOC_LINK_FIELDS entry
    :return: dict with the activity's EXTRA_FIELDS and DATASET_FIELDS values copied
        as-is and, for every DOC_LINK_FIELDS entry, the value at `index`.
        Fields that are absent from the activity are left out.
    """
    # EXTRA_FIELDS go in first so the key order stays EXTRA, DOC_LINK, DATASET.
    record = {name: activity[name] for name in EXTRA_FIELDS if name in activity}
    # Document-link fields are indexed, so only the value at `index` is taken.
    record.update(
        (name, activity[name][index]) for name in DOC_LINK_FIELDS if name in activity
    )
    record.update((name, activity[name]) for name in DATASET_FIELDS if name in activity)
    return record
| 46 | + |
| 47 | + |
def list_existing_documents():
    """
    Get the identifier and hash of the documents in the solr document core,
    unique per identifier.

    Every returned dict contains both IDENTIFIER and HASH; a value that is missing
    from the Solr response is replaced by the string 'NA'. When several documents
    share an identifier, only the first one in the Solr response is kept.

    :return: list of dicts, one per distinct identifier
    """
    logging.info('_list_existing_documents:: Retrieving existing documents from Solr')
    fields = [IDENTIFIER, HASH]
    doc_url = (
        f'{settings.SOLR_DOCUMENT}/select'
        f'?fl={",".join(fields)}&q.op=OR&q=*:*&rows=10000000'
    )
    existing_data = requests.get(doc_url).json()['response']['docs']

    unique_documents = []
    # A set keeps the duplicate check O(1); identifiers are assumed to be
    # hashable values (strings) - TODO confirm against the Solr schema.
    seen_identifiers = set()
    for document in existing_data:
        # Ensure data is available, add NA if not.
        for field in fields:
            document.setdefault(field, 'NA')
        identifier = document[IDENTIFIER]
        if identifier in seen_identifiers:
            continue
        # Without recording the identifier the duplicate check above could never
        # trigger and the returned list would not be unique.
        seen_identifiers.add(identifier)
        unique_documents.append(document)
    return unique_documents
| 69 | + |
| 70 | + |
def preprocess_documents(data, existing_documents, solr):
    """
    Select the documents from data that must be (re)indexed in the solr core.

    For every document in data:
    - identifier not present in the solr core: the document is new and is kept;
    - identifier present with a different hash: the entries for that identifier
      are deleted from the solr core and the document is kept so it can be
      added again;
    - identifier present with the same hash: the document is skipped.

    When existing_documents holds several entries for one identifier, the hash of
    the first entry is the one compared against.

    :param data: list of documents
    :param existing_documents: list of existing documents in solr core
    :param solr: solr client for the document core; only `delete(q=...)` is called
    :return: list of the documents from data that are new or have a changed hash
    """
    logging.info('_preprocess_documents:: Preprocessing documents')
    # Map each existing identifier to its hash so the lookups below are O(1)
    # instead of scanning a list for every document.
    existing_hash_by_identifier = {}
    for doc in existing_documents:
        # setdefault keeps the first hash seen for an identifier.
        existing_hash_by_identifier.setdefault(doc[IDENTIFIER], doc[HASH])

    filtered_data = []
    for doc in data:
        identifier = doc[IDENTIFIER]
        if identifier in existing_hash_by_identifier:
            if existing_hash_by_identifier[identifier] == doc[HASH]:
                # Unchanged document, nothing to do.
                continue
            # The hash changed: remove the stale entries before re-adding.
            # NOTE(review): the identifier is interpolated unescaped; a value
            # containing a double quote would alter the query - confirm that
            # identifiers are sanitised upstream.
            solr.delete(q=f'{IDENTIFIER}:"{identifier}"')
        filtered_data.append(doc)
    return filtered_data
0 commit comments