Commit d660cf9

feat: add document summarisation as a dockerised feature
1 parent 824dc55

File tree

16 files changed: +905 -3 lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,8 @@ solr_mount_dir
 # # General gitignore
 # Environments
 .env
+.env.docker
+.env.local
 .venv
 env/
 venv/

README.MD

Lines changed: 2 additions & 1 deletion
@@ -27,11 +27,12 @@ IATI is a global aid transparency standard and it makes information about aid sp
 We have recently moved towards a Solr Only version of the IATI.cloud. If you are looking for the hybrid IATI.cloud with Django API and Solr API, you can find this under the branch `archive/iati-cloud-hybrid-django-solr`

 ## Setting up, running and using IATI cloud
-Running and setting up is split into two parts: docker and manual. Because of the extensiveness of these sections they are contained in their own files. We've also included a usage guide, as well as a guide to how the IATI.cloud processes data. Find them here:
+Running and setting up is split into two parts: docker and manual. Because these sections are extensive, they are contained in their own files. We've also included a usage guide, a guide to how the IATI.cloud processes data, and a guide to our document summarisation feature. Find them here:
 - [Docker guide](./docs/DOCKER.md)
 - [Local guide](./docs/LOCAL.md)
 - [Usage guide](./docs/USAGE.md)
 - [Data processing guide](./docs/PROCESSING.md)
+- [Document summarisation](docs/DOCUMENT_SUMMARISATION.md)

 ## Requirements
 ### Software

direct_indexing/direct_indexing.py

Lines changed: 8 additions & 1 deletion
@@ -5,6 +5,7 @@
 import requests
 from django.conf import settings

+from direct_indexing.document_summarisation.document_summarisation import document_summarisation
 from direct_indexing.metadata.dataset import index_datasets_and_dataset_metadata
 from direct_indexing.metadata.publisher import index_publisher_metadata

@@ -28,7 +29,7 @@ def clear_indices():
     Clear all indices as indicated by the 'cores' variable.
     """
     try:
-        cores = ['dataset', 'publisher', 'activity', 'transaction', 'budget', 'result', 'organisation']
+        cores = ['dataset', 'publisher', 'activity', 'transaction', 'budget', 'result', 'document', 'organisation']
         for core in cores:
             logging.info(f'clear_indices:: Clearing {core} core')
             solr = pysolr.Solr(f'{settings.SOLR_URL}/{core}', always_commit=True)
@@ -55,6 +56,12 @@ def run_dataset_metadata(update, force_update=False):
     return result


+def run_document_summarisation():
+    result = document_summarisation()
+    logging.info(f"run_document_summarisation:: result: {result}")
+    return result
+
+
 def drop_removed_data():
     logging.info('drop_removed_data:: Removing all data not found in the latest dataset list')
     dropped_list = []
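
For context, a minimal sketch (not part of the commit) of driving the new entrypoint by hand; the settings module name `iaticloud.settings` is a placeholder for the project's real Django settings:

import os

import django

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'iaticloud.settings')  # placeholder module name
django.setup()

from direct_indexing.direct_indexing import run_document_summarisation

# Returns 'Success' when indexing completes; document_summarisation
# raises DocumentException on failure.
print(run_document_summarisation())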

direct_indexing/document_summarisation/__init__.py

Whitespace-only changes.
direct_indexing/document_summarisation/const.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+HASH = 'dataset.resources.hash'
+IDENTIFIER = 'iati-identifier'
+FORMAT = 'document-link.format'
+DOC_URL = 'document-link.url'
+DOC_LINK_FIELDS = [
+    DOC_URL, FORMAT, 'document-link.title.narrative',
+    'document-link.title.narrative.lang', 'document-link.description.narrative',
+    'document-link.description.narrative.lang', 'document-link.category.code',
+    'document-link.language.code', 'document-link.document-date.iso-date'
+]
+DATASET_FIELDS = [
+    'dataset.id', 'dataset.metadata_modified', 'dataset.name',
+    'dataset.extras.iati_version', HASH, 'dataset.resources.url']
+EXTRA_FIELDS = [IDENTIFIER]
+ALL_FIELDS = DATASET_FIELDS + DOC_LINK_FIELDS + EXTRA_FIELDS
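
As a quick illustration (not part of the commit), these constants compose into the Solr `fl` parameter that `retrieve_document_links` in preprocess.py builds with a URL-encoded comma:

from direct_indexing.document_summarisation.const import ALL_FIELDS, DOC_LINK_FIELDS

fl = '%2C'.join(ALL_FIELDS)  # %2C is a URL-encoded comma, as used in the select URL
print(len(ALL_FIELDS), 'fields requested, starting with', DOC_LINK_FIELDS[0])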
direct_indexing/document_summarisation/document_summarisation.py

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+import logging
+
+import pysolr
+from django.conf import settings
+from summarizer import Summarizer
+
+from direct_indexing.document_summarisation.const import DOC_URL, FORMAT
+from direct_indexing.document_summarisation.preprocess import (
+    list_existing_documents, preprocess_documents, retrieve_document_links
+)
+from direct_indexing.document_summarisation.summarise import summarise_document_content, supported_doctype
+
+
+def document_summarisation():
+    """
+    Kickstart document summarisation:
+    retrieve the document links, preprocess them and index the summaries.
+    """
+    logging.info('document_summarisation:: Starting document summarisation')
+    # Set up solr
+    solr = pysolr.Solr(settings.SOLR_DOCUMENT, always_commit=True)
+    # Create a separate object for each document link in the activity
+    data = retrieve_document_links()
+    existing_documents = list_existing_documents()
+    data = preprocess_documents(data, existing_documents, solr)
+
+    result = index_summaries(data, solr)
+    logging.info(f'document_summarisation:: {result}')
+    if result == 'Success':
+        logging.info('document_summarisation:: Done document summarisation')
+        return result
+    else:
+        logging.error(f'document_summarisation:: Error in document summarisation:\n{result}')
+        raise DocumentException(result)
+
+
+class DocumentException(Exception):
+    def __init__(self, message):
+        super().__init__(message)
+
+
+def index_summaries(data, solr):
+    try:
+        logging.info('index_summaries:: Indexing summaries')
+        model = Summarizer()
+        for document in data:
+            if not supported_doctype(document[FORMAT]):
+                continue
+            summarised_text = summarise_document_content(document[DOC_URL], document[FORMAT], model)
+            if summarised_text == 'Not extractable':
+                document['content-extraction-status'] = 'Not extractable'
+            else:
+                document['content-extraction-status'] = 'Extracted'
+                document['summary'] = summarised_text
+            solr.add(document)
+        logging.info('index_summaries:: Done indexing summaries')
+        return "Success"
+    except TypeError as e:
+        logging.error(f'index_summaries:: Error indexing summaries:\n{e}')
+        return f'error: {e}'
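
The `Summarizer` instantiated in `index_summaries` comes from the bert-extractive-summarizer package. A standalone sketch of the call pattern, with sample text standing in for extracted document content:

from summarizer import Summarizer

model = Summarizer()
text = (
    'IATI is a global aid transparency standard. It makes information about '
    'aid spending easier to access and use. Publishers attach document links '
    'to their activities, pointing to PDF and Word reports about the work.'
)
# min_length=60 mirrors the call in index_summaries; it filters out short
# candidate sentences, and the model returns the most representative
# sentences of the input as a single string.
print(model(text, min_length=60))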
direct_indexing/document_summarisation/preprocess.py

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
+import logging
+
+import requests
+from django.conf import settings
+
+from direct_indexing.document_summarisation.const import (
+    ALL_FIELDS, DATASET_FIELDS, DOC_LINK_FIELDS, EXTRA_FIELDS, HASH, IDENTIFIER
+)
+
+
+def retrieve_document_links():
+    """
+    Retrieve metadata from all activities that have a document link.
+    https://iatistandard.org/en/iati-standard/203/activity-standard/iati-activities/iati-activity/document-link
+    @url is a must-have for every document link.
+    """
+    logging.info('_retrieve_document_links:: Retrieving document links from Solr')
+    data_metadata_fields = '%2C'.join(ALL_FIELDS)
+    query = 'document-link.url:*'
+    doc_url = f'http://localhost:8983/solr/activity/select?fl={data_metadata_fields}&q.op=OR&q={query}&rows=10000000'
+    data = requests.get(doc_url).json()['response']['docs']
+    return _format_document_links(data)
+
+
+def _format_document_links(data):
+    document_list = []
+    # loop over the activities and create one document per document link
+    for activity in data:
+        for index in range(len(activity['document-link.url'])):
+            document_list.append(_extract_doc(activity, index))
+    return document_list
+
+
+def _extract_doc(activity, index):
+    doc = {}
+    for field in EXTRA_FIELDS:
+        if field in activity:
+            doc[field] = activity[field]
+    for field in DOC_LINK_FIELDS:
+        if field in activity:
+            doc[field] = activity[field][index]
+    for field in DATASET_FIELDS:
+        if field in activity:
+            doc[field] = activity[field]
+    return doc
+
+
+def list_existing_documents():
+    """
+    Get a unique list of the identifier and hash of the existing documents in the solr core
+    """
+    logging.info('_list_existing_documents:: Retrieving existing documents from Solr')
+    fields = [IDENTIFIER, HASH]
+    doc_url = f'{settings.SOLR_DOCUMENT}/select?fl={",".join(fields)}&q.op=OR&q=*:*&rows=10000000'
+    existing_data = requests.get(doc_url).json()['response']['docs']
+    # Ensure data is available, add NA if not.
+    for d in existing_data:
+        for field in fields:
+            if field not in d:
+                d[field] = 'NA'
+
+    ret = []
+    ids = []
+    for d in existing_data:
+        if d[IDENTIFIER] in ids:
+            continue
+        ids.append(d[IDENTIFIER])  # track seen identifiers so duplicates are skipped
+        ret.append(d)
+    return ret
+
+
+def preprocess_documents(data, existing_documents, solr):
+    """
+    Filter the retrieved documents against the solr core. If a document's
+    iati-identifier and hash both match an existing document, it is skipped.
+    If the identifier exists but the hash has changed, the old document is
+    deleted from the core and the new one is kept for re-indexing. Documents
+    with an unseen identifier are new and are kept for indexing.
+
+    :param data: list of documents
+    :param existing_documents: list of existing documents in solr core
+    """
+    logging.info('_preprocess_documents:: Preprocessing documents')
+    # loop over the existing data in form of a list of dicts, and create two lists,
+    # one with the iati-identifiers and one with the hashes
+    existing_iati_identifiers = []
+    existing_hashes = []
+    for doc in existing_documents:
+        existing_iati_identifiers.append(doc[IDENTIFIER])
+        existing_hashes.append(doc[HASH])
+
+    # Filter data and remove documents that are already in the solr core and have been updated
+    filtered_data = []
+    for doc in data:
+        if doc[IDENTIFIER] not in existing_iati_identifiers:
+            filtered_data.append(doc)
+        else:
+            # if the iati-identifier is already in the solr core, check if the hash is the same
+            iati_identifier_index = existing_iati_identifiers.index(doc[IDENTIFIER])
+            if existing_hashes[iati_identifier_index] != doc[HASH]:
+                solr.delete(q=f'{IDENTIFIER}:"{doc[IDENTIFIER]}"')
+                filtered_data.append(doc)
+    return filtered_data
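
The skip/re-index decision in `preprocess_documents` can be exercised in isolation. A toy run with hypothetical identifiers and a mocked Solr client:

from unittest.mock import MagicMock

from direct_indexing.document_summarisation.const import HASH, IDENTIFIER
from direct_indexing.document_summarisation.preprocess import preprocess_documents

existing = [{IDENTIFIER: 'XM-EX-1', HASH: 'aaa'}]  # hypothetical identifier and hash
incoming = [
    {IDENTIFIER: 'XM-EX-1', HASH: 'aaa'},  # unchanged hash: skipped
    {IDENTIFIER: 'XM-EX-1', HASH: 'bbb'},  # changed hash: old doc deleted, kept
    {IDENTIFIER: 'XM-EX-2', HASH: 'ccc'},  # unseen identifier: kept as new
]
solr = MagicMock()  # stand-in for pysolr.Solr; records the delete() call
kept = preprocess_documents(incoming, existing, solr)
assert len(kept) == 2 and solr.delete.call_count == 1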
direct_indexing/document_summarisation/summarise.py

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+
+import logging
+from io import BytesIO
+from urllib.request import urlopen
+
+import docx2txt
+import requests
+from PyPDF2 import PdfReader
+
+NOT_EXTRACTED = 'Not extractable'
+
+
+def _download_files(url, format):
+    """
+    Retrieve pdf files or word documents from the internet
+    :param url: url of the file
+    :param format: format of the file
+    """
+    try:
+        if 'pdf' in format:
+            res = requests.get(url)
+            byte_data = res.content
+            return byte_data
+
+        if 'msword' in format:
+            res = urlopen(url)
+            byte_data = res.read()
+            return byte_data
+        else:
+            raise KeyError('Wrong file format!')
+    except KeyError:
+        raise KeyError('Wrong file format!')
+
+
+def _extract_text(doc_bytes, doc_format):
+    """
+    Extract text from either pdf or word documents
+    """
+    all_extracted_text = ""
+
+    try:
+        if 'pdf' in doc_format:
+            # Extract text from pdf
+            reader = PdfReader(BytesIO(doc_bytes))
+            number_of_pages = len(reader.pages)
+            for n in range(number_of_pages):
+                page = reader.pages[n]
+                all_extracted_text += page.extract_text()
+            return all_extracted_text
+
+        if 'msword' in doc_format:
+            # extract text from ms word format
+            return docx2txt.process(BytesIO(doc_bytes))
+    except Exception:
+        return NOT_EXTRACTED
+
+
+def _extractive_summary(text, model):
+    result = model(text, min_length=60)
+    summarised_text = "".join(result)
+    return summarised_text
+
+
+def supported_doctype(doc_format):
+    """
+    Check if the document format is supported
+    """
+    if doc_format in ["application/pdf", "application/msword"]:
+        return True
+    return False
+
+
+def summarise_document_content(doc_link, doc_format, model):
+    """
+    Download the content of the provided file,
+    extract the text, summarise the text and
+    return the summary.
+
+    :param doc_link: the document link URL
+    :param doc_format: the document format
+    :param model: the summarisation model
+    """
+    try:
+        byte_data = _download_files(doc_link, doc_format)
+        text = _extract_text(byte_data, doc_format)
+        if text == NOT_EXTRACTED:
+            return NOT_EXTRACTED
+        summary = _extractive_summary(text, model)
+        return summary
+    except KeyError as e:
+        logging.error(f'_summarise_document_content:: Error summarising document:\n{e}')
+        raise TypeError(f'Error summarising document:\n{e}')
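
To sanity-check the whole path against a single link, `summarise_document_content` can be driven directly; the URL below is a placeholder, not taken from a real activity:

from summarizer import Summarizer

from direct_indexing.document_summarisation.summarise import summarise_document_content

url = 'https://example.org/annual-report.pdf'  # placeholder document-link.url
summary = summarise_document_content(url, 'application/pdf', Summarizer())
print(summary if summary != 'Not extractable' else 'Could not extract text')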
