2 changes: 2 additions & 0 deletions .gitignore
@@ -5,6 +5,8 @@ solr_mount_dir
# # General gitignore
# Environments
.env
.env.docker
.env.local
.venv
env/
venv/
3 changes: 2 additions & 1 deletion README.MD
@@ -27,11 +27,12 @@ IATI is a global aid transparency standard and it makes information about aid sp
We have recently moved towards a Solr Only version of the IATI.cloud. If you are looking for the hybrid IATI.cloud with Django API and Solr API, you can find this under the branch `archive/iati-cloud-hybrid-django-solr`

## Setting up, running and using IATI cloud
Running and setting up is split into two parts: docker and manual. Because of the extensiveness of these sections they are contained in their own files. We've also included a usage guide, as well as a guide to how the IATI.cloud processes data. Find them here:
Running and setting up is split into two parts: docker and manual. Because these sections are extensive, they are contained in their own files. We've also included a usage guide, a guide to how the IATI.cloud processes data, and a guide to our document summarisation feature. Find them here:
- [Docker guide](./docs/DOCKER.md)
- [Local guide](./docs/LOCAL.md)
- [Usage guide](./docs/USAGE.md)
- [Data processing guide](./docs/PROCESSING.md)
- [Document summarisation guide](./docs/DOCUMENT_SUMMARISATION.md)

## Requirements
### Software
9 changes: 8 additions & 1 deletion direct_indexing/direct_indexing.py
@@ -5,6 +5,7 @@
import requests
from django.conf import settings

from direct_indexing.document_summarisation.document_summarisation import document_summarisation
from direct_indexing.metadata.dataset import index_datasets_and_dataset_metadata
from direct_indexing.metadata.publisher import index_publisher_metadata

@@ -28,7 +29,7 @@ def clear_indices():
Clear all indices as indicated by the 'cores' variable.
"""
try:
cores = ['dataset', 'publisher', 'activity', 'transaction', 'budget', 'result', 'organisation']
cores = ['dataset', 'publisher', 'activity', 'transaction', 'budget', 'result', 'document', 'organisation']
for core in cores:
logging.info(f'clear_indices:: Clearing {core} core')
solr = pysolr.Solr(f'{settings.SOLR_URL}/{core}', always_commit=True)
@@ -55,6 +56,12 @@ def run_dataset_metadata(update, force_update=False):
return result


def run_document_summarisation():
result = document_summarisation()
logging.info(f"run_document_summarisation:: result: {result}")
return result


def drop_removed_data():
logging.info('drop_removed_data:: Removing all data not found in the latest dataset list')
dropped_list = []
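A minimal sketch of invoking the new step by hand, assuming Django settings (including `SOLR_DOCUMENT`) are already configured; the actual entry point (e.g. a Celery task or management command) is not shown in this diff:

from direct_indexing.direct_indexing import run_document_summarisation

result = run_document_summarisation()  # returns 'Success', or raises DocumentException on failure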
Empty file.
15 changes: 15 additions & 0 deletions direct_indexing/document_summarisation/const.py
@@ -0,0 +1,15 @@
HASH = 'dataset.resources.hash'
IDENTIFIER = 'iati-identifier'
FORMAT = 'document-link.format'
DOC_URL = 'document-link.url'
DOC_LINK_FIELDS = [
DOC_URL, FORMAT, 'document-link.title.narrative',
'document-link.title.narrative.lang', 'document-link.description.narrative',
'document-link.description.narrative.lang', 'document-link.category.code',
'document-link.language.code', 'document-link.document-date.iso-date'
]
DATASET_FIELDS = [
'dataset.id', 'dataset.metadata_modified', 'dataset.name',
'dataset.extras.iati_version', HASH, 'dataset.resources.url']
EXTRA_FIELDS = [IDENTIFIER]
ALL_FIELDS = DATASET_FIELDS + DOC_LINK_FIELDS + EXTRA_FIELDS
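A sketch of how these constants are meant to compose into a Solr query, mirroring `retrieve_document_links()` later in this PR (the Solr host is read from Django settings):

from django.conf import settings

from direct_indexing.document_summarisation.const import ALL_FIELDS

# Solr's fl parameter takes a comma-separated field list; '%2C' is the URL-encoded comma
field_list = '%2C'.join(ALL_FIELDS)
query_url = f'{settings.SOLR_URL}/activity/select?fl={field_list}&q.op=OR&q=document-link.url:*'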
60 changes: 60 additions & 0 deletions direct_indexing/document_summarisation/document_summarisation.py
@@ -0,0 +1,60 @@
import logging

import pysolr
from django.conf import settings
from summarizer import Summarizer

from direct_indexing.document_summarisation.const import DOC_URL, FORMAT
from direct_indexing.document_summarisation.preprocess import (
list_existing_documents, preprocess_documents, retrieve_document_links
)
from direct_indexing.document_summarisation.summarise import summarise_document_content, supported_doctype


def document_summarisation():
"""
    Kick-start document summarisation:
    retrieve the document links, preprocess them and index the summaries.
"""
logging.info('document_summarisation:: Starting document summarisation')
# Set up solr
solr = pysolr.Solr(settings.SOLR_DOCUMENT, always_commit=True)
# Create a separate object for each document link in the activity
data = retrieve_document_links()
existing_documents = list_existing_documents()
data = preprocess_documents(data, existing_documents, solr)

result = index_summaries(data, solr)
logging.info(f'document_summarisation:: {result}')
if result == 'Success':
logging.info('document_summarisation:: Done document summarisation')
return result
else:
logging.error(f'document_summarisation:: Error in document summarisation:\n{result}')
raise DocumentException(result)


class DocumentException(Exception):
    """Raised when document summarisation fails."""


def index_summaries(data, solr):
try:
logging.info('index_summaries:: Indexing summaries')
model = Summarizer()
for document in data:
if not supported_doctype(document[FORMAT]):
continue
summarised_text = summarise_document_content(document[DOC_URL], document[FORMAT], model)
if summarised_text == 'Not extractable':
document['content-extraction-status'] = 'Not extractable'
else:
document['content-extraction-status'] = 'Extracted'
document['summary'] = summarised_text
solr.add(document)
logging.info('index_summaries:: Done indexing summaries')
return "Success"
    # summarise_document_content raises TypeError when a document cannot be summarised
    except TypeError as e:
        logging.error(f'index_summaries:: Error indexing summaries:\n{e}')
        return f'error: {e}'
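For reference, a document indexed by `index_summaries` carries the fields selected in `const.py` plus the two fields set above; a sketch with invented values:

example_document = {
    'iati-identifier': 'XM-EXAMPLE-001',               # hypothetical identifier
    'document-link.url': 'https://example.org/annual-report.pdf',
    'document-link.format': 'application/pdf',
    'dataset.resources.hash': '0123abcd',              # hypothetical hash
    'content-extraction-status': 'Extracted',          # or 'Not extractable'
    'summary': 'An extractive summary of the document text...',
}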
100 changes: 100 additions & 0 deletions direct_indexing/document_summarisation/preprocess.py
@@ -0,0 +1,100 @@
import logging

import requests
from django.conf import settings

from direct_indexing.document_summarisation.const import (
ALL_FIELDS, DATASET_FIELDS, DOC_LINK_FIELDS, EXTRA_FIELDS, HASH, IDENTIFIER
)


def retrieve_document_links():
"""
# Retrieve metadata from all activities that have a document link
# https://iatistandard.org/en/iati-standard/203/activity-standard/iati-activities/iati-activity/document-link
# @url is a must have for every document link.
"""
logging.info('_retrieve_document_links:: Retrieving document links from Solr')
data_metadata_fields = '%2C'.join(ALL_FIELDS)
query = 'document-link.url:*'
    doc_url = f'{settings.SOLR_URL}/activity/select?fl={data_metadata_fields}&q.op=OR&q={query}&rows=10000000'
data = requests.get(doc_url).json()['response']['docs']
return _format_document_links(data)


def _format_document_links(data):
document_list = []
# loop over the activities
for activity in data:
for index in range(len(activity['document-link.url'])):
document_list.append(_extract_doc(activity, index))
return document_list


def _extract_doc(activity, index):
doc = {}
for field in EXTRA_FIELDS:
if field in activity:
doc[field] = activity[field]
for field in DOC_LINK_FIELDS:
if field in activity:
doc[field] = activity[field][index]
for field in DATASET_FIELDS:
if field in activity:
doc[field] = activity[field]
return doc
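# Illustration with hypothetical data: the document-link fields on an activity are
# parallel lists, so _extract_doc flattens one link per index, while identifier and
# dataset-level fields are copied whole. Given:
#   activity = {
#       'iati-identifier': 'XM-EXAMPLE-001',
#       'document-link.url': ['https://a.example/x.pdf', 'https://b.example/y.doc'],
#       'document-link.format': ['application/pdf', 'application/msword'],
#       'dataset.id': 'ds-1',
#   }
# _extract_doc(activity, 0) yields one flat document:
#   {'iati-identifier': 'XM-EXAMPLE-001', 'document-link.url': 'https://a.example/x.pdf',
#    'document-link.format': 'application/pdf', 'dataset.id': 'ds-1'}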


def list_existing_documents():
"""
Get a unique list of the identifier and hash of the existing documents in the solr core
"""
    logging.info('list_existing_documents:: Retrieving existing documents from Solr')
fields = [IDENTIFIER, HASH]
doc_url = f'{settings.SOLR_DOCUMENT}/select?fl={",".join(fields)}&q.op=OR&q=*:*&rows=10000000'
existing_data = requests.get(doc_url).json()['response']['docs']
# Ensure data is available, add NA if not.
for d in existing_data:
for field in fields:
if field not in d:
d[field] = 'NA'

    ret = []
    ids = []
    for d in existing_data:
        if d[IDENTIFIER] in ids:
            continue
        ids.append(d[IDENTIFIER])
        ret.append(d)
    return ret


def preprocess_documents(data, existing_documents, solr):
    """
    Filter the incoming documents against the solr core. New documents
    (iati-identifier not yet in the core) are kept. Documents whose identifier
    exists but whose hash has changed are re-indexed: the old version is deleted
    from the core and the new one is kept. Documents whose identifier and hash
    both match an existing document are skipped.

    :param data: list of documents
    :param existing_documents: list of existing documents in solr core
    :param solr: pysolr client for the document core
    """
    logging.info('preprocess_documents:: Preprocessing documents')
    # Loop over the existing documents and build two parallel lists:
    # one with the iati-identifiers and one with the hashes
existing_iati_identifiers = []
existing_hashes = []
for doc in existing_documents:
existing_iati_identifiers.append(doc[IDENTIFIER])
existing_hashes.append(doc[HASH])

# Filter data and remove documents that are already in the solr core and have been updated
filtered_data = []
for doc in data:
if doc[IDENTIFIER] not in existing_iati_identifiers:
filtered_data.append(doc)
else:
# if the iati-identifier is already in the solr core, check if the hash is the same
iati_identifier_index = existing_iati_identifiers.index(doc[IDENTIFIER])
if existing_hashes[iati_identifier_index] != doc[HASH]:
solr.delete(q=f'{IDENTIFIER}:"{doc[IDENTIFIER]}"')
filtered_data.append(doc)
return filtered_data
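A toy walk-through of the filtering rules, with hypothetical identifiers and hashes and a stub standing in for the Solr client:

from direct_indexing.document_summarisation.preprocess import preprocess_documents


class FakeSolr:
    """Stub that records deletes instead of talking to Solr."""
    def delete(self, q):
        print(f'would delete: {q}')


existing = [{'iati-identifier': 'A', 'dataset.resources.hash': 'h1'}]
incoming = [
    {'iati-identifier': 'A', 'dataset.resources.hash': 'h1'},  # unchanged -> skipped
    {'iati-identifier': 'A', 'dataset.resources.hash': 'h2'},  # hash changed -> old doc deleted, kept
    {'iati-identifier': 'B', 'dataset.resources.hash': 'h3'},  # new -> kept
]
print(preprocess_documents(incoming, existing, FakeSolr()))  # keeps the 'h2' and 'h3' documents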
93 changes: 93 additions & 0 deletions direct_indexing/document_summarisation/summarise.py
@@ -0,0 +1,93 @@

import logging
from io import BytesIO
from urllib.request import urlopen

import docx2txt
import requests
from PyPDF2 import PdfReader

NOT_EXTRACTED = 'Not extractable'


def _download_files(url, doc_format):
    """
    Retrieve a PDF or MS Word document from the internet.

    :param url: url of the file
    :param doc_format: mime type of the file
    """
    if 'pdf' in doc_format:
        # PDFs are fetched with requests and read from the response body
        res = requests.get(url)
        return res.content
    if 'msword' in doc_format:
        # Word documents are fetched with urlopen and read as raw bytes
        res = urlopen(url)
        return res.read()
    raise KeyError('Wrong file format!')


def _extract_text(doc_bytes, doc_format):
    """
    Extract text from either a PDF or an MS Word document.
    Returns NOT_EXTRACTED if the text cannot be extracted.
    """
    all_extracted_text = ""

    try:
        if 'pdf' in doc_format:
            # Extract the text of every page in the PDF
            reader = PdfReader(BytesIO(doc_bytes))
            for page in reader.pages:
                all_extracted_text += page.extract_text()
            return all_extracted_text

        if 'msword' in doc_format:
            # Extract text from the MS Word document
            return docx2txt.process(BytesIO(doc_bytes))
        # Unsupported formats are filtered out upstream by supported_doctype
        return NOT_EXTRACTED
    except Exception:
        return NOT_EXTRACTED


def _extractive_summary(text, model):
result = model(text, min_length=60)
summarised_text = "".join(result)
return summarised_text


def supported_doctype(doc_format):
    """
    Check if the document format is supported
    """
    return doc_format in ("application/pdf", "application/msword")


def summarise_document_content(doc_link, doc_format, model):
"""
Download the content of the provided file
Extract the text
Summarise the text
return the summary

    :param doc_link: the document link URL
    :param doc_format: the mime type of the document
    :param model: the summarisation model
"""
try:
byte_data = _download_files(doc_link, doc_format)
text = _extract_text(byte_data, doc_format)
if text == NOT_EXTRACTED:
return NOT_EXTRACTED
summary = _extractive_summary(text, model)
return summary
    except KeyError as e:
        logging.error(f'summarise_document_content:: Error summarising document:\n{e}')
        raise TypeError(f'Error summarising document:\n{e}')
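A minimal usage sketch; the URL is hypothetical and network access is assumed. `Summarizer` is the extractive model from the `bert-extractive-summarizer` package, as used in `document_summarisation.py`:

from summarizer import Summarizer

model = Summarizer()  # loads a pretrained BERT model on first use, which can be slow
summary = summarise_document_content(
    'https://example.org/annual-report.pdf',  # hypothetical document URL
    'application/pdf',
    model,
)
print(summary)  # the extractive summary, or 'Not extractable' if extraction failed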