2 changes: 2 additions & 0 deletions .gitignore
@@ -5,6 +5,8 @@ solr_mount_dir
# # General gitignore
# Environments
.env
.env.docker
.env.local
.venv
env/
venv/
3 changes: 2 additions & 1 deletion README.MD
@@ -27,11 +27,12 @@ IATI is a global aid transparency standard and it makes information about aid sp
We have recently moved towards a Solr Only version of the IATI.cloud. If you are looking for the hybrid IATI.cloud with Django API and Solr API, you can find this under the branch `archive/iati-cloud-hybrid-django-solr`

## Setting up, running and using IATI cloud
Running and setting up is split into two parts: docker and manual. Because of the extensiveness of these sections they are contained in their own files. We've also included a usage guide, as well as a guide to how the IATI.cloud processes data. Find them here:
Running and setting up is split into two parts: docker and manual. Because these sections are extensive, they are contained in their own files. We've also included a usage guide, a guide to how the IATI.cloud processes data, and a guide to our document summarisation feature. Find them here:
- [Docker guide](./docs/DOCKER.md)
- [Local guide](./docs/LOCAL.md)
- [Usage guide](./docs/USAGE.md)
- [Data processing guide](./docs/PROCESSING.md)
- [Document summarisation guide](./docs/DOCUMENT_SUMMARISATION.md)

## Requirements
### Software
9 changes: 8 additions & 1 deletion direct_indexing/direct_indexing.py
@@ -5,6 +5,7 @@
import requests
from django.conf import settings

from direct_indexing.document_summarisation.document_summarisation import document_summarisation
from direct_indexing.metadata.dataset import index_datasets_and_dataset_metadata
from direct_indexing.metadata.publisher import index_publisher_metadata

@@ -28,7 +29,7 @@ def clear_indices():
Clear all indices as indicated by the 'cores' variable.
"""
try:
cores = ['dataset', 'publisher', 'activity', 'transaction', 'budget', 'result', 'organisation']
cores = ['dataset', 'publisher', 'activity', 'transaction', 'budget', 'result', 'document', 'organisation']
for core in cores:
logging.info(f'clear_indices:: Clearing {core} core')
solr = pysolr.Solr(f'{settings.SOLR_URL}/{core}', always_commit=True)
@@ -55,6 +56,12 @@ def run_dataset_metadata(update, force_update=False):
return result


def run_document_summarisation():
result = document_summarisation()
logging.info(f"run_document_summarisation:: result: {result}")
return result


def drop_removed_data():
logging.info('drop_removed_data:: Removing all data not found in the latest dataset list')
dropped_list = []
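A minimal sketch of invoking the new step by hand, assuming Django settings (including `SOLR_DOCUMENT`) are already configured; the actual entry point (e.g. a Celery task or management command) is not shown in this diff:

from direct_indexing.direct_indexing import run_document_summarisation

result = run_document_summarisation()  # returns 'Success', or raises DocumentException on failure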
Empty file.
15 changes: 15 additions & 0 deletions direct_indexing/document_summarisation/const.py
@@ -0,0 +1,15 @@
HASH = 'dataset.resources.hash'
IDENTIFIER = 'iati-identifier'
FORMAT = 'document-link.format'
DOC_URL = 'document-link.url'
DOC_LINK_FIELDS = [
DOC_URL, FORMAT, 'document-link.title.narrative',
'document-link.title.narrative.lang', 'document-link.description.narrative',
'document-link.description.narrative.lang', 'document-link.category.code',
'document-link.language.code', 'document-link.document-date.iso-date'
]
DATASET_FIELDS = [
'dataset.id', 'dataset.metadata_modified', 'dataset.name',
'dataset.extras.iati_version', HASH, 'dataset.resources.url']
EXTRA_FIELDS = [IDENTIFIER]
ALL_FIELDS = DATASET_FIELDS + DOC_LINK_FIELDS + EXTRA_FIELDS
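A sketch of how these constants are meant to compose into a Solr query, mirroring `retrieve_document_links()` later in this PR (the Solr host is read from Django settings):

from django.conf import settings

from direct_indexing.document_summarisation.const import ALL_FIELDS

# Solr's fl parameter takes a comma-separated field list; '%2C' is the URL-encoded comma
field_list = '%2C'.join(ALL_FIELDS)
query_url = f'{settings.SOLR_URL}/activity/select?fl={field_list}&q.op=OR&q=document-link.url:*'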
60 changes: 60 additions & 0 deletions direct_indexing/document_summarisation/document_summarisation.py
@@ -0,0 +1,60 @@
import logging

import pysolr
from django.conf import settings
from summarizer import Summarizer

from direct_indexing.document_summarisation.const import DOC_URL, FORMAT
from direct_indexing.document_summarisation.preprocess import (
list_existing_documents, preprocess_documents, retrieve_document_links
)
from direct_indexing.document_summarisation.summarise import summarise_document_content, supported_doctype


def document_summarisation():
"""
    Kick-start document summarisation:
    retrieve the document links, preprocess them and index the summaries.
"""
logging.info('document_summarisation:: Starting document summarisation')
# Set up solr
solr = pysolr.Solr(settings.SOLR_DOCUMENT, always_commit=True)
# Create a separate object for each document link in the activity
data = retrieve_document_links()
existing_documents = list_existing_documents()
data = preprocess_documents(data, existing_documents, solr)

result = index_summaries(data, solr)
logging.info(f'document_summarisation:: {result}')
if result == 'Success':
logging.info('document_summarisation:: Done document summarisation')
return result
else:
logging.error(f'document_summarisation:: Error in document summarisation:\n{result}')
raise DocumentException(result)


class DocumentException(Exception):
    """Raised when document summarisation fails."""


def index_summaries(data, solr):
try:
logging.info('index_summaries:: Indexing summaries')
model = Summarizer()
for document in data:
if not supported_doctype(document[FORMAT]):
continue
summarised_text = summarise_document_content(document[DOC_URL], document[FORMAT], model)
if summarised_text == 'Not extractable':
document['content-extraction-status'] = 'Not extractable'
else:
document['content-extraction-status'] = 'Extracted'
document['summary'] = summarised_text
solr.add(document)
logging.info('index_summaries:: Done indexing summaries')
return "Success"
    # summarise_document_content raises TypeError when a document cannot be summarised
    except TypeError as e:
        logging.error(f'index_summaries:: Error indexing summaries:\n{e}')
        return f'error: {e}'
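For reference, a document indexed by `index_summaries` carries the fields selected in `const.py` plus the two fields set above; a sketch with invented values:

example_document = {
    'iati-identifier': 'XM-EXAMPLE-001',               # hypothetical identifier
    'document-link.url': 'https://example.org/annual-report.pdf',
    'document-link.format': 'application/pdf',
    'dataset.resources.hash': '0123abcd',              # hypothetical hash
    'content-extraction-status': 'Extracted',          # or 'Not extractable'
    'summary': 'An extractive summary of the document text...',
}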
100 changes: 100 additions & 0 deletions direct_indexing/document_summarisation/preprocess.py
@@ -0,0 +1,100 @@
import logging

import requests
from django.conf import settings

from direct_indexing.document_summarisation.const import (
ALL_FIELDS, DATASET_FIELDS, DOC_LINK_FIELDS, EXTRA_FIELDS, HASH, IDENTIFIER
)


def retrieve_document_links():
"""
# Retrieve metadata from all activities that have a document link
# https://iatistandard.org/en/iati-standard/203/activity-standard/iati-activities/iati-activity/document-link
# @url is a must have for every document link.
"""
logging.info('_retrieve_document_links:: Retrieving document links from Solr')
data_metadata_fields = '%2C'.join(ALL_FIELDS)
query = 'document-link.url:*'
    doc_url = f'{settings.SOLR_URL}/activity/select?fl={data_metadata_fields}&q.op=OR&q={query}&rows=10000000'
data = requests.get(doc_url).json()['response']['docs']
return _format_document_links(data)


def _format_document_links(data):
document_list = []
# loop over the activities
for activity in data:
for index in range(len(activity['document-link.url'])):
document_list.append(_extract_doc(activity, index))
return document_list


def _extract_doc(activity, index):
doc = {}
for field in EXTRA_FIELDS:
if field in activity:
doc[field] = activity[field]
for field in DOC_LINK_FIELDS:
if field in activity:
doc[field] = activity[field][index]
for field in DATASET_FIELDS:
if field in activity:
doc[field] = activity[field]
return doc
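# Illustration with hypothetical data: the document-link fields on an activity are
# parallel lists, so _extract_doc flattens one link per index, while identifier and
# dataset-level fields are copied whole. Given:
#   activity = {
#       'iati-identifier': 'XM-EXAMPLE-001',
#       'document-link.url': ['https://a.example/x.pdf', 'https://b.example/y.doc'],
#       'document-link.format': ['application/pdf', 'application/msword'],
#       'dataset.id': 'ds-1',
#   }
# _extract_doc(activity, 0) yields one flat document:
#   {'iati-identifier': 'XM-EXAMPLE-001', 'document-link.url': 'https://a.example/x.pdf',
#    'document-link.format': 'application/pdf', 'dataset.id': 'ds-1'}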


def list_existing_documents():
"""
Get a unique list of the identifier and hash of the existing documents in the solr core
"""
    logging.info('list_existing_documents:: Retrieving existing documents from Solr')
fields = [IDENTIFIER, HASH]
doc_url = f'{settings.SOLR_DOCUMENT}/select?fl={",".join(fields)}&q.op=OR&q=*:*&rows=10000000'
existing_data = requests.get(doc_url).json()['response']['docs']
# Ensure data is available, add NA if not.
for d in existing_data:
for field in fields:
if field not in d:
d[field] = 'NA'

    ret = []
    ids = []
    for d in existing_data:
        if d[IDENTIFIER] in ids:
            continue
        ids.append(d[IDENTIFIER])
        ret.append(d)
    return ret


def preprocess_documents(data, existing_documents, solr):
    """
    Filter the incoming documents against the solr core. New documents
    (iati-identifier not yet in the core) are kept. Documents whose identifier
    exists but whose hash has changed are re-indexed: the old version is deleted
    from the core and the new one is kept. Documents whose identifier and hash
    both match an existing document are skipped.

    :param data: list of documents
    :param existing_documents: list of existing documents in solr core
    :param solr: pysolr client for the document core
    """
    logging.info('preprocess_documents:: Preprocessing documents')
    # Loop over the existing documents and build two parallel lists:
    # one with the iati-identifiers and one with the hashes
existing_iati_identifiers = []
existing_hashes = []
for doc in existing_documents:
existing_iati_identifiers.append(doc[IDENTIFIER])
existing_hashes.append(doc[HASH])

# Filter data and remove documents that are already in the solr core and have been updated
filtered_data = []
for doc in data:
if doc[IDENTIFIER] not in existing_iati_identifiers:
filtered_data.append(doc)
else:
# if the iati-identifier is already in the solr core, check if the hash is the same
iati_identifier_index = existing_iati_identifiers.index(doc[IDENTIFIER])
if existing_hashes[iati_identifier_index] != doc[HASH]:
solr.delete(q=f'{IDENTIFIER}:"{doc[IDENTIFIER]}"')
filtered_data.append(doc)
return filtered_data
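A toy walk-through of the filtering rules, with hypothetical identifiers and hashes and a stub standing in for the Solr client:

from direct_indexing.document_summarisation.preprocess import preprocess_documents


class FakeSolr:
    """Stub that records deletes instead of talking to Solr."""
    def delete(self, q):
        print(f'would delete: {q}')


existing = [{'iati-identifier': 'A', 'dataset.resources.hash': 'h1'}]
incoming = [
    {'iati-identifier': 'A', 'dataset.resources.hash': 'h1'},  # unchanged -> skipped
    {'iati-identifier': 'A', 'dataset.resources.hash': 'h2'},  # hash changed -> old doc deleted, kept
    {'iati-identifier': 'B', 'dataset.resources.hash': 'h3'},  # new -> kept
]
print(preprocess_documents(incoming, existing, FakeSolr()))  # keeps the 'h2' and 'h3' documents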
93 changes: 93 additions & 0 deletions direct_indexing/document_summarisation/summarise.py
@@ -0,0 +1,93 @@

import logging
from io import BytesIO
from urllib.request import urlopen

import docx2txt
import requests
from PyPDF2 import PdfReader

NOT_EXTRACTED = 'Not extractable'


def _download_files(url, doc_format):
    """
    Retrieve a PDF or MS Word document from the internet.

    :param url: url of the file
    :param doc_format: mime type of the file
    """
    if 'pdf' in doc_format:
        # PDFs are fetched with requests and read from the response body
        res = requests.get(url)
        return res.content
    if 'msword' in doc_format:
        # Word documents are fetched with urlopen and read as raw bytes
        res = urlopen(url)
        return res.read()
    raise KeyError('Wrong file format!')


def _extract_text(doc_bytes, doc_format):
    """
    Extract text from either a PDF or an MS Word document.
    Returns NOT_EXTRACTED if the text cannot be extracted.
    """
    all_extracted_text = ""

    try:
        if 'pdf' in doc_format:
            # Extract the text of every page in the PDF
            reader = PdfReader(BytesIO(doc_bytes))
            for page in reader.pages:
                all_extracted_text += page.extract_text()
            return all_extracted_text

        if 'msword' in doc_format:
            # Extract text from the MS Word document
            return docx2txt.process(BytesIO(doc_bytes))
        # Unsupported formats are filtered out upstream by supported_doctype
        return NOT_EXTRACTED
    except Exception:
        return NOT_EXTRACTED


def _extractive_summary(text, model):
result = model(text, min_length=60)
summarised_text = "".join(result)
return summarised_text


def supported_doctype(doc_format):
    """
    Check if the document format is supported
    """
    return doc_format in ("application/pdf", "application/msword")


def summarise_document_content(doc_link, doc_format, model):
"""
Download the content of the provided file
Extract the text
Summarise the text
return the summary

    :param doc_link: the document link URL
    :param doc_format: the mime type of the document
    :param model: the summarisation model
"""
try:
byte_data = _download_files(doc_link, doc_format)
text = _extract_text(byte_data, doc_format)
if text == NOT_EXTRACTED:
return NOT_EXTRACTED
summary = _extractive_summary(text, model)
return summary
    except KeyError as e:
        logging.error(f'summarise_document_content:: Error summarising document:\n{e}')
        raise TypeError(f'Error summarising document:\n{e}')
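A minimal usage sketch; the URL is hypothetical and network access is assumed. `Summarizer` is the extractive model from the `bert-extractive-summarizer` package, as used in `document_summarisation.py`:

from summarizer import Summarizer

model = Summarizer()  # loads a pretrained BERT model on first use, which can be slow
summary = summarise_document_content(
    'https://example.org/annual-report.pdf',  # hypothetical document URL
    'application/pdf',
    model,
)
print(summary)  # the extractive summary, or 'Not extractable' if extraction failed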