Commit 2adf926

Extracting tags for OpsLearning

committed · 1 parent 4820e44 · commit 2adf926
File tree: 3 files changed, +899 -131 lines changed
Lines changed: 397 additions & 0 deletions
@@ -0,0 +1,397 @@
import requests
from django.core.management.base import BaseCommand
from api.logger import logger
import pandas as pd
import numpy as np
from retrying import retry
import time
import nltk
from nltk.tokenize import sent_tokenize
from api.models import CronJob, CronJobStatus
from rest_framework.authtoken.models import Token
from django.contrib.auth.models import User


CRON_NAME = "extract_tags_for_ops_learnings"
CLASSIFY_URL = "https://dreftagging.azurewebsites.net/classify"
GO_API_URL = "https://goadmin.ifrc.org/api/v2/"
OPS_LEARNING_URL = GO_API_URL + "ops-learning/"
LIMIT_200 = "/?limit=200/"


class Command(BaseCommand):
    help = "Extracting tags for Operational Learnings"

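    # Build the Authorization header for the GO API from the DRF token of the first superuser.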
    def set_auth_token(self):
        user = User.objects.filter(is_superuser=True).first()
        api_key = Token.objects.filter(user=user)[0].key
        self.go_authorization_token = {"Authorization": "Token " + api_key}

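    # Fetch DREF final reports, appeals and existing ops-learning records from the GO API,
    # following the paginated responses (200 records per page) until the last page.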
    def fetch_data(self, dref_final_report, appeal, ops_learning):

        def fetchUrl(field):
            return requests.get(field, headers=self.go_authorization_token).json()

        def fetchField(field):
            dict_field = []
            temp_dict = requests.get(GO_API_URL + field + LIMIT_200, headers=self.go_authorization_token).json()
            while temp_dict['next']:
                dict_field.extend(temp_dict['results'])
                temp_dict = fetchUrl(temp_dict['next'])
            dict_field.extend(temp_dict['results'])
            return pd.DataFrame.from_dict(dict_field)

        # read dref final reports, to extract learnings in planned interventions
        logger.info('Fetching DREF Final Reports from GO')
        dref_final_report = fetchField(dref_final_report)

        # read appeals to verify which drefs (appeals) are public and which drefs (appeals) are silent
        logger.info('Fetching Appeals from GO')
        appeals = fetchField(appeal)

        # read ops learning to verify which drefs have already been processed
        logger.info('Fetching Operational Learnings from GO')
        ops_learning = fetchField(ops_learning)
        ops_learning['appeal_code'] = [x['code'] for x in ops_learning['appeal']]

        return dref_final_report, appeals, ops_learning

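    # Keep only final reports that are published, whose appeal code appears among the public
    # appeals, and that have not yet been ingested into the ops-learning table; returns None
    # when nothing is left after filtering.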
    def filter_final_report(self, final_report, appeal, ops_learning, final_report_is_published=True, appeal_is_published=True, in_ops_learning=False):

        if final_report_is_published:
            logger.info('Filtering only DREF Final Reports that have been closed')
            mask = [x for x in final_report['is_published']]
            final_report = final_report[mask]

        if appeal_is_published:
            logger.info('Filtering only DREF Final Reports that are public')
            mask = [x in list(appeal['code']) for x in final_report['appeal_code']]
            final_report = final_report[mask]

        if not in_ops_learning:
            logger.info('Filtering only DREF Final Reports that have not been processed yet for operational learning')
            list_new_reports = np.setdiff1d(final_report['appeal_code'].unique(), ops_learning['appeal_code'].unique())

            # only reports that are not processed yet
            mask = [x in list_new_reports for x in final_report['appeal_code']]
            final_report = final_report[mask]

        if final_report.empty:
            logger.warning('No DREF Final Reports were found after the filtering process')
            return None

        else:
            filtered_final_report = final_report[['appeal_code', 'planned_interventions']]
            logger.info('Found %s reports after the filtering process', str(len(filtered_final_report)))
            return filtered_final_report

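    # Explode each final report into one row per planned intervention, then into one row per
    # sentence of "Lessons Learnt" / "Challenges", stripping bullet characters and very short strings.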
    def split_rows(self, filtered_final_report):

        def split_planned_interventions(df):
            logger.info('Splitting DREF Final Reports per planned intervention')
            df = df.explode(column='planned_interventions', ignore_index=True)

            df['Sector'] = [x['title_display'] for x in df['planned_interventions']]
            df['Lessons Learnt'] = [x['lessons_learnt'] for x in df['planned_interventions']]
            df['Challenges'] = [x['challenges'] for x in df['planned_interventions']]

            mask_1 = [pd.notna(x) for x in df['Lessons Learnt']]
            mask_2 = [pd.notna(x) for x in df['Challenges']]
            mask = [x or y for x, y in zip(mask_1, mask_2)]
            df = df[mask]
            df.drop(columns='planned_interventions', inplace=True)

            df = df.melt(id_vars=['appeal_code', 'Sector'], value_vars=['Lessons Learnt', 'Challenges'], var_name='Finding', value_name='Excerpts')
            df = df[pd.notna(df['Excerpts'])]
            return df

        def split_excerpts(df):
            logger.info('Splitting unique learnings in each planned intervention')
            df['Excerpts_ind'] = [sent_tokenize(x) for x in df['Excerpts']]
            df = df.explode(column='Excerpts_ind', ignore_index=True)
            df.drop(columns='Excerpts', inplace=True)

            # remove strings that have less than 5 characters
            df['Excerpts_ind'] = [np.nan if pd.notna(x) and len(x) < 5 else x for x in df['Excerpts_ind']]
            df = df[pd.notna(df['Excerpts_ind'])]

            # catching go format (bullet point)
            df['Excerpts_ind'] = [x[2:] if x.startswith('•\t') else x for x in df['Excerpts_ind']]

            # catching other formats
            df['Excerpts_ind'] = [x[1:] if x.startswith(tuple(['-', '•', '▪', ' '])) else x for x in df['Excerpts_ind']]

            df['Excerpts'] = [x.strip() for x in df['Excerpts_ind']]

            df.drop(columns='Excerpts_ind', inplace=True)

            return df

        final_report_interventions = split_planned_interventions(filtered_final_report)

        if final_report_interventions.empty:
            logger.warning('No learnings were found in the planned interventions of the DREF Final Reports')
            return None
        else:
            final_report_learnings = split_excerpts(final_report_interventions)
            return final_report_learnings

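    # Send each excerpt to the DREF tagging service and keep the first returned tag as the
    # PER component; learnings about regional/international support are attributed to the
    # Secretariat, everything else to the National Society.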
    def tag_data(self, df, tagging, tagging_api_endpoint):
        logger.info('Tagging learnings with PER framework')

        headers = {
            'accept': 'application/json',
            'Content-Type': 'application/json',
        }

        url = tagging_api_endpoint

        df[tagging] = None
        df.reset_index(inplace=True, drop=True)

        for i in range(0, len(df)):
            data = "\"" + df['Excerpts'].iloc[i] + "\""
            data = data.encode('utf-8')
            response = requests.post(url, headers=headers, data=data)
            if (response.status_code == 201) and len(response.json()[0]['tags']) > 0:
                df.loc[i, tagging] = response.json()[0]['tags'][0]

        df['Institution'] = None

        for i in range(0, len(df)):
            if df['PER - Component'].iloc[i] == 'Activation of Regional and International Support':
                df.loc[i, 'Institution'] = 'Secretariat'
            else:
                df.loc[i, 'Institution'] = 'National Society'

        tagged_data = df
        return tagged_data

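    # Fetch PER form components and primary sectors from the GO API and build the lookup tables
    # (title -> id) plus the hard-coded mappings from classifier labels to GO labels.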
    def fetch_complementary_data(self, per_formcomponent, primary_sector):
        logger.info('Fetching complementary data on PER components ids, sectors ids, finding ids, organisations ids')

        def fetchUrl(field):
            return requests.get(field).json()

        def fetchField(field):
            dict_field = []
            temp_dict = requests.get(GO_API_URL + field + LIMIT_200).json()
            while temp_dict['next']:
                dict_field.extend(temp_dict['results'])
                temp_dict = fetchUrl(temp_dict['next'])
            dict_field.extend(temp_dict['results'])
            return pd.DataFrame.from_dict(dict_field)

        per_formcomponent = fetchField(per_formcomponent)

        go_sectors = fetchUrl(GO_API_URL + primary_sector)

        dict_per = dict(zip(per_formcomponent['title'], per_formcomponent['id']))

        dict_sector = {item['label']: item['key'] for item in go_sectors}

        dict_finding = {
            'Lessons Learnt': 1,
            'Challenges': 2
        }

        dict_org = {
            'Secretariat': 1,
            'National Society': 2
        }

        mapping_per = {
            "Activation of Regional and International Support": "Activation of regional and international support",
            "Affected Population Selection": "Affected population selection",
            "Business Continuity": "Business continuity",
            "Cash and Voucher Assistance": "Cash Based Intervention (CBI)",
            "Communications in Emergencies": "Communication in emergencies",
            "Coordination with Authorities": "Coordination with authorities",
            "Coordination with External Agencies and NGOs": "Coordination with External Agencies and NGOs",
            "Coordination with Local Community Level Responders": "Coordination with local community level responders",
            "Coordination with Movement": "Coordination with Movement",
            "DRM Laws, Advocacy and Dissemination": "DRM Laws, Advocacy and Dissemination",
            "Early Action Mechanisms": "Early Action Mechanisms",
            "Emergency Needs Assessment and Planning": "Emergency Needs Assessment",
            "Emergency Operations Centre (EOC)": "Emergency Operations Centre (EOC)",
            "Emergency Response Procedures (SOP)": "Emergency Response Procedures (SOPs)",
            "Finance and Admin. Policy and Emergency Procedures": "Finance and Admin policy and emergency procedures",
            "Hazard, Context and Risk Analysis, Monitoring and Early Warning": "Hazard, Context and Risk Analysis, Monitoring and Early Warning",
            "Information and Communication Technology (ICT)": "Information and Communication Technology (ICT)",
            "Information Management": "Information Management (IM)",
            "Logistics - Logistics Management": "LOGISTICS MANAGEMENT",
            "Logistics - Procurement": "PROCUREMENT",
            "Logistics - Warehouse and Stock Management": "WAREHOUSE AND STOCK MANAGEMENT",
            "Mapping of NS Capacities": "Mapping of NS capacities",
            "NS Specific Areas of Intervention": "NS-specific areas of intervention",
            "Operations Monitoring, Evaluation, Reporting and Learning": "Operations Monitoring, Evaluation, Reporting and Learning",
            "Pre-Disaster Meetings and Agreements": "Pre-disaster meetings and agreements",
            "Preparedness Plans and Budgets": "Preparedness plans and budgets",
            "Quality and Accountability": "Quality and accountability",
            "RC Auxiliary Role, Mandate and Law": "RC auxiliary role, Mandate and Law",
            "Resources Mobilisation": "Resource Mobilisation",
            "Response and Recovery Planning": "Response and recovery planning",
            "Risk Management": "Risk management",
            "Safety and Security Management": "Safety and security management",
            "Staff and Volunteer Management": "Staff and volunteer management",
            "Testing and Learning": "Testing and Learning",
            "Cooperation with Private Sector": "Cooperation with private sector",
            "Disaster Risk Management Strategy": "DRM Strategy",
            "Logistics - Supply Chain Management": "SUPPLY CHAIN MANAGEMENT",
            "Logistics - Transportation Management": "FLEET AND TRANSPORTATION MANAGEMENT",
            "Scenario Planning": "Scenario planning",
            "Civil Military Relations": "Civil Military Relations",
            "Disaster Risk Management Policy": "DRM Policy",
            "information and Communication Technology (ICT)": "Information and Communication Technology (ICT)",
            "Coordination with local community level responders": "Coordination with local community level responders",
            "Emergency Response Procedures (SOPs)": "Emergency Response Procedures (SOPs)",
            "Logistics - Transport": "FLEET AND TRANSPORTATION MANAGEMENT",
            "Unknown": None,
            "Business continuity": "Business continuity",
            "emergency Response Procedures (SOP)": "Emergency Response Procedures (SOPs)",
            "National Society Specific Areas of intervention": "NS-specific areas of intervention"
        }

        mapping_sector = {
            "Strategies for implementation": None,  # No direct match found # no sector(?)
            "Disaster Risk Reduction and Climate Action": "DRR",
            "Health": "Health (public)",
            "Livelihoods and Basic Needs": "Livelihoods and basic needs",
            "Migration and Displacement": "Migration",
            "Protection, Gender and Inclusion": "PGI",
            "Shelter and Settlements": "Shelter",
            "Water Sanitation and Hygiene": "WASH",
            "Secretariat Services": None,  # No direct match found # need to bring it out as IFRC learning
            "National Society Strengthening": "NS Strengthening",
            "Water, Sanitation And Hygiene": "WASH",
            "Protection, Gender And Inclusion": "PGI",
            "Shelter Housing And Settlements": "Shelter",
            "Livelihoods And Basic Needs": "Livelihoods and basic needs",
            "Community Engagement And Accountability": "CEA",
            "Multi-purpose Cash": "Livelihoods and basic needs",  # No direct match found
            "Risk Reduction, Climate Adaptation And Recovery": "DRR",
            "Migration": "Migration",
            "Education": "Education",
            "Shelter and Basic Household Items": "Shelter",
            "Multi Purpose Cash": "Livelihoods and basic needs",
            "Environmental Sustainability": None,
            "Migration And Displacement": "Migration",
            "Coordination And Partnerships": "NS Strengthening",
        }

        return mapping_per, dict_per, mapping_sector, dict_sector, dict_org, dict_finding

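    # Map classifier labels to GO ids and keep only the columns that the ops-learning endpoint expects.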
    def format_data(self, df, mapping_per, dict_per, mapping_sector, dict_sector, dict_org, dict_finding):
        logger.info('Formatting data to upload to GO Operational Learning Table')

        df.loc[:, 'mapped_per'] = [mapping_per[x] if pd.notna(x) else None for x in df['PER - Component']]
        df.loc[:, 'id_per'] = [dict_per[x] if pd.notna(x) else None for x in df['mapped_per']]
        df.loc[:, 'mapped_sector'] = [mapping_sector[x] if pd.notna(x) else None for x in df['Sector']]
        df.loc[:, 'id_sector'] = [dict_sector[x] if pd.notna(x) else None for x in df['mapped_sector']]
        df.loc[:, 'id_institution'] = [dict_org[x] for x in df['Institution']]
        df.loc[:, 'id_finding'] = [dict_finding[x] for x in df['Finding']]

        formatted_data = df[['appeal_code', 'Excerpts', 'id_per', 'id_sector', 'id_institution', 'id_finding']]

        return formatted_data

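    # Collapse rows that share the same appeal, excerpt and finding type, aggregating their
    # PER component, sector and institution ids into de-duplicated lists.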
    def manage_duplicates(self, df):

        df = df.groupby(['appeal_code', 'Excerpts', 'id_finding'], as_index=False).agg(list).reset_index()
        df.drop(columns=['index'], inplace=True)

        df['id_per'] = [list(set([y for y in x if pd.notna(y)])) for x in df['id_per']]
        df['id_sector'] = [list(set([y for y in x if pd.notna(y)])) for x in df['id_sector']]
        df['id_institution'] = [list(set([y for y in x if pd.notna(y)])) for x in df['id_institution']]

        deduplicated_data = df

        return deduplicated_data

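    # Post each learning to the GO ops-learning endpoint, retrying with exponential backoff.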
    def post_to_api(self, df, api_post_endpoint):
        logger.info('Posting data to GO Operational Learning API')

        url = api_post_endpoint

        myobj = {}
        for i in range(0, len(df)):
            myobj[i] = {"learning": df['Excerpts'].iloc[i],
                        "learning_validated": df['Excerpts'].iloc[i],
                        "appeal_code": df['appeal_code'].iloc[i],
                        "type": int(df['id_finding'].iloc[i]),
                        "type_validated": int(df['id_finding'].iloc[i]),
                        "sector": df['id_sector'].iloc[i],
                        "sector_validated": df['id_sector'].iloc[i],
                        "per_component": df['id_per'].iloc[i],
                        "per_component_validated": df['id_per'].iloc[i],
                        "organization": df['id_institution'].iloc[i],
                        "organization_validated": df['id_institution'].iloc[i],
                        "is_validated": False}

        # Define a retry decorator
        @retry(wait_exponential_multiplier=1000, wait_exponential_max=10000, stop_max_attempt_number=5)
        def post_request(x):
            response = requests.post(url, json=myobj[x], headers=self.go_authorization_token)
            response.raise_for_status()  # Raise HTTPError for bad responses
            return response

        for x in range(0, len(myobj)):
            try:
                response = post_request(x)
                logger.info("Response status: %s" % response.status_code)
                time.sleep(1)
            except requests.exceptions.HTTPError as errh:
                logger.error("HTTP Error: %s" % errh)
                time.sleep(5)  # Wait before retrying
            except requests.exceptions.RequestException as err:
                logger.error("Request Exception: %s" % err)
                time.sleep(5)  # Wait before retrying

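    # Entry point: fetch, filter, split, tag, format, de-duplicate and post the learnings,
    # then report the outcome to the CronJob table.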
    def handle(self, *args, **options):
        logger.info("Starting extracting tags for ops learnings")
        self.set_auth_token()

        # Step 0: Setup
        nltk.download('punkt')

        # Step 1: Fetch Data
        final_report, appeal, ops_learning = self.fetch_data('dref-final-report', 'appeal', 'ops-learning')
        filtered_data = self.filter_final_report(final_report, appeal, ops_learning, final_report_is_published=True, appeal_is_published=True, in_ops_learning=False)

        rows = 0
        if filtered_data is not None:
            # Step 2: Data Preprocessing
            split_learnings = self.split_rows(filtered_data)

            if split_learnings is not None:
                # Step 3: Tagging
                tagged_data = self.tag_data(split_learnings, 'PER - Component', CLASSIFY_URL)

                # Step 4: Post Processing
                mapping_per, dict_per, mapping_sector, dict_sector, dict_org, dict_finding = self.fetch_complementary_data('per-formcomponent', 'primarysector')
                formatted_data = self.format_data(tagged_data, mapping_per, dict_per, mapping_sector, dict_sector, dict_org, dict_finding)
                deduplicated_data = self.manage_duplicates(formatted_data)

                # Step 5: Post to API Endpoint
                self.post_to_api(deduplicated_data, OPS_LEARNING_URL)

                rows = len(deduplicated_data)
                logger.info("%s new Operational Learnings ingested to GO API" % rows)

        if rows == 0:
            body = {
                "name": CRON_NAME,
                "message": ("No new learnings added. Done processing ops learnings.",),
                "num_result": rows,
                "status": CronJobStatus.WARNED
            }
        else:
            body = {
                "name": CRON_NAME,
                "message": ("Done processing ops learnings",),
                "num_result": rows,
                "status": CronJobStatus.SUCCESSFUL,
            }

        CronJob.sync_cron(body)

0 commit comments
