import json
import time

import nltk
import numpy as np
import pandas as pd
import requests
from django.contrib.auth.models import User
from django.core.management.base import BaseCommand
from nltk.tokenize import sent_tokenize
from rest_framework.authtoken.models import Token
from retrying import retry

from api.logger import logger
from api.models import CronJob, CronJobStatus
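
# Management command that ingests learnings from DREF Final Reports into the
# GO Operational Learning table: fetch reports, filter to published, public,
# unprocessed ones, split "lessons learnt" and "challenges" into individual
# excerpts, tag each excerpt with a PER framework component via an external
# classifier, and post the results back to the GO API.
#
# Assuming the module lives at api/management/commands/extract_tags_for_ops_learnings.py
# (matching CRON_NAME below), it would be run as:
#
#     python manage.py extract_tags_for_ops_learnings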

CRON_NAME = "extract_tags_for_ops_learnings"
CLASSIFY_URL = "https://dreftagging.azurewebsites.net/classify"
GO_API_URL = "https://goadmin.ifrc.org/api/v2/"
OPS_LEARNING_URL = GO_API_URL + "ops-learning/"
LIMIT_200 = "/?limit=200"  # no trailing slash: "?limit=200/" would make DRF fall back to its default page size


class Command(BaseCommand):
    help = "Extract tags for Operational Learnings"

    def set_auth_token(self):
        user = User.objects.filter(is_superuser=True).first()
        api_key = Token.objects.filter(user=user)[0].key
        self.go_authorization_token = {"Authorization": "Token " + api_key}

    def fetch_data(self, dref_final_report, appeal, ops_learning):

        def fetchUrl(field):
            return requests.get(field, headers=self.go_authorization_token).json()

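        # Follow DRF-style pagination: keep requesting the 'next' URL until it
        # is null, accumulating each page's 'results' into a single DataFrame.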
        def fetchField(field):
            dict_field = []
            temp_dict = requests.get(GO_API_URL + field + LIMIT_200, headers=self.go_authorization_token).json()
            while temp_dict['next']:
                dict_field.extend(temp_dict['results'])
                temp_dict = fetchUrl(temp_dict['next'])
            dict_field.extend(temp_dict['results'])
            return pd.DataFrame.from_dict(dict_field)

        # Read DREF final reports, to extract learnings from planned interventions
        logger.info('Fetching DREF Final Reports from GO')
        dref_final_report = fetchField(dref_final_report)

        # Read appeals, to verify which DREFs (appeals) are public and which are silent
        logger.info('Fetching Appeals from GO')
        appeals = fetchField(appeal)

        # Read ops learning, to verify which DREFs have already been processed
        logger.info('Fetching Operational Learnings from GO')
        ops_learning = fetchField(ops_learning)
        ops_learning['appeal_code'] = [x['code'] for x in ops_learning['appeal']]

        return dref_final_report, appeals, ops_learning

    def filter_final_report(self, final_report, appeal, ops_learning, final_report_is_published=True, appeal_is_published=True, in_ops_learning=False):

        if final_report_is_published:
            logger.info('Filtering only DREF Final Reports that have been closed')
            mask = list(final_report['is_published'])
            final_report = final_report[mask]

        if appeal_is_published:
            logger.info('Filtering only DREF Final Reports that are public')
            mask = [x in list(appeal['code']) for x in final_report['appeal_code']]
            final_report = final_report[mask]

        if not in_ops_learning:
            logger.info('Filtering only DREF Final Reports that have not yet been processed for operational learning')
            list_new_reports = np.setdiff1d(final_report['appeal_code'].unique(), ops_learning['appeal_code'].unique())

            # Keep only reports that have not been processed yet
            mask = [x in list_new_reports for x in final_report['appeal_code']]
            final_report = final_report[mask]

        if final_report.empty:
            logger.warning('No DREF Final Reports were found after the filtering process')
            return None

        else:
            filtered_final_report = final_report[['appeal_code', 'planned_interventions']]
            logger.info('Found %s reports after the filtering process', len(filtered_final_report))
            return filtered_final_report

    def split_rows(self, filtered_final_report):

        def split_planned_interventions(df):
            logger.info('Splitting DREF Final Reports per planned intervention')
            df = df.explode(column='planned_interventions', ignore_index=True)

            df['Sector'] = [x['title_display'] for x in df['planned_interventions']]
            df['Lessons Learnt'] = [x['lessons_learnt'] for x in df['planned_interventions']]
            df['Challenges'] = [x['challenges'] for x in df['planned_interventions']]

            # Keep rows that carry at least one of the two findings
            mask_1 = [pd.notna(x) for x in df['Lessons Learnt']]
            mask_2 = [pd.notna(x) for x in df['Challenges']]
            mask = [x or y for x, y in zip(mask_1, mask_2)]
            df = df[mask]
            df.drop(columns='planned_interventions', inplace=True)

            df = df.melt(id_vars=['appeal_code', 'Sector'], value_vars=['Lessons Learnt', 'Challenges'], var_name='Finding', value_name='Excerpts')
            df = df[pd.notna(df['Excerpts'])]
            return df

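        # sent_tokenize (NLTK punkt) splits each excerpt block into individual
        # sentences so every learning can be tagged separately.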
        def split_excerpts(df):
            logger.info('Splitting unique learnings in each planned intervention')
            df['Excerpts_ind'] = [sent_tokenize(x) for x in df['Excerpts']]
            df = df.explode(column='Excerpts_ind', ignore_index=True)
            df.drop(columns='Excerpts', inplace=True)

            # Remove strings that have fewer than 5 characters
            df['Excerpts_ind'] = [np.nan if pd.notna(x) and len(x) < 5 else x for x in df['Excerpts_ind']]
            df = df[pd.notna(df['Excerpts_ind'])]

            # Catch the GO format (bullet point)
            df['Excerpts_ind'] = [x[2:] if x.startswith('•\t') else x for x in df['Excerpts_ind']]

            # Catch other leading bullet formats
            df['Excerpts_ind'] = [x[1:] if x.startswith(('-', '•', '▪', ' ')) else x for x in df['Excerpts_ind']]

            df['Excerpts'] = [x.strip() for x in df['Excerpts_ind']]

            df.drop(columns='Excerpts_ind', inplace=True)

            return df

        final_report_interventions = split_planned_interventions(filtered_final_report)

        if final_report_interventions.empty:
            logger.warning('No learnings were found in the DREF Final Reports planned interventions')
            return None
        else:
            final_report_learnings = split_excerpts(final_report_interventions)
            return final_report_learnings

    def tag_data(self, df, tagging, tagging_api_endpoint):
        logger.info('Tagging learnings with PER framework')

        headers = {
            'accept': 'application/json',
            'Content-Type': 'application/json',
        }

        url = tagging_api_endpoint

        df[tagging] = None
        df.reset_index(inplace=True, drop=True)

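        # POST each excerpt to the classification service; a 201 response with
        # a non-empty tag list yields the top predicted PER component.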
        for i in range(0, len(df)):
            # json.dumps escapes embedded quotes, unlike manual '"' + ... + '"' wrapping
            data = json.dumps(df['Excerpts'].iloc[i]).encode('utf-8')
            response = requests.post(url, headers=headers, data=data)
            if (response.status_code == 201) and len(response.json()[0]['tags']) > 0:
                df.loc[i, tagging] = response.json()[0]['tags'][0]

        df['Institution'] = None

        for i in range(0, len(df)):
            if df['PER - Component'].iloc[i] == 'Activation of Regional and International Support':
                df.loc[i, 'Institution'] = 'Secretariat'
            else:
                df.loc[i, 'Institution'] = 'National Society'

        tagged_data = df
        return tagged_data

    def fetch_complementary_data(self, per_formcomponent, primary_sector):
        logger.info('Fetching complementary data on PER component ids, sector ids, finding ids and organisation ids')

        def fetchUrl(field):
            return requests.get(field).json()

        def fetchField(field):
            dict_field = []
            temp_dict = requests.get(GO_API_URL + field + LIMIT_200).json()
            while temp_dict['next']:
                dict_field.extend(temp_dict['results'])
                temp_dict = fetchUrl(temp_dict['next'])
            dict_field.extend(temp_dict['results'])
            return pd.DataFrame.from_dict(dict_field)

        per_formcomponent = fetchField(per_formcomponent)

        go_sectors = fetchUrl(GO_API_URL + primary_sector)

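        # Lookup tables from display names to the ids used by the GO database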
        dict_per = dict(zip(per_formcomponent['title'], per_formcomponent['id']))

        dict_sector = {item['label']: item['key'] for item in go_sectors}

        dict_finding = {
            'Lessons Learnt': 1,
            'Challenges': 2
        }

        dict_org = {
            'Secretariat': 1,
            'National Society': 2
        }

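        # Map PER component labels as returned by the classifier (keys) to the
        # canonical titles used by the GO per-formcomponent endpoint (values);
        # spelling variants observed in classifier output are listed explicitly.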
        mapping_per = {
            "Activation of Regional and International Support": "Activation of regional and international support",
            "Affected Population Selection": "Affected population selection",
            "Business Continuity": "Business continuity",
            "Cash and Voucher Assistance": "Cash Based Intervention (CBI)",
            "Communications in Emergencies": "Communication in emergencies",
            "Coordination with Authorities": "Coordination with authorities",
            "Coordination with External Agencies and NGOs": "Coordination with External Agencies and NGOs",
            "Coordination with Local Community Level Responders": "Coordination with local community level responders",
            "Coordination with Movement": "Coordination with Movement",
            "DRM Laws, Advocacy and Dissemination": "DRM Laws, Advocacy and Dissemination",
            "Early Action Mechanisms": "Early Action Mechanisms",
            "Emergency Needs Assessment and Planning": "Emergency Needs Assessment",
            "Emergency Operations Centre (EOC)": "Emergency Operations Centre (EOC)",
            "Emergency Response Procedures (SOP)": "Emergency Response Procedures (SOPs)",
            "Finance and Admin. Policy and Emergency Procedures": "Finance and Admin policy and emergency procedures",
            "Hazard, Context and Risk Analysis, Monitoring and Early Warning": "Hazard, Context and Risk Analysis, Monitoring and Early Warning",
            "Information and Communication Technology (ICT)": "Information and Communication Technology (ICT)",
            "Information Management": "Information Management (IM)",
            "Logistics - Logistics Management": "LOGISTICS MANAGEMENT",
            "Logistics - Procurement": "PROCUREMENT",
            "Logistics - Warehouse and Stock Management": "WAREHOUSE AND STOCK MANAGEMENT",
            "Mapping of NS Capacities": "Mapping of NS capacities",
            "NS Specific Areas of Intervention": "NS-specific areas of intervention",
            "Operations Monitoring, Evaluation, Reporting and Learning": "Operations Monitoring, Evaluation, Reporting and Learning",
            "Pre-Disaster Meetings and Agreements": "Pre-disaster meetings and agreements",
            "Preparedness Plans and Budgets": "Preparedness plans and budgets",
            "Quality and Accountability": "Quality and accountability",
            "RC Auxiliary Role, Mandate and Law": "RC auxiliary role, Mandate and Law",
            "Resources Mobilisation": "Resource Mobilisation",
            "Response and Recovery Planning": "Response and recovery planning",
            "Risk Management": "Risk management",
            "Safety and Security Management": "Safety and security management",
            "Staff and Volunteer Management": "Staff and volunteer management",
            "Testing and Learning": "Testing and Learning",
            "Cooperation with Private Sector": "Cooperation with private sector",
            "Disaster Risk Management Strategy": "DRM Strategy",
            "Logistics - Supply Chain Management": "SUPPLY CHAIN MANAGEMENT",
            "Logistics - Transportation Management": "FLEET AND TRANSPORTATION MANAGEMENT",
            "Scenario Planning": "Scenario planning",
            "Civil Military Relations": "Civil Military Relations",
            "Disaster Risk Management Policy": "DRM Policy",
            "information and Communication Technology (ICT)": "Information and Communication Technology (ICT)",
            "Coordination with local community level responders": "Coordination with local community level responders",
            "Emergency Response Procedures (SOPs)": "Emergency Response Procedures (SOPs)",
            "Logistics - Transport": "FLEET AND TRANSPORTATION MANAGEMENT",
            "Unknown": None,
            "Business continuity": "Business continuity",
            "emergency Response Procedures (SOP)": "Emergency Response Procedures (SOPs)",
            "National Society Specific Areas of intervention": "NS-specific areas of intervention"
        }

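        # Map DREF planned-intervention sector titles (keys) to the GO primary
        # sector labels (values); None marks sectors with no direct match.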
        mapping_sector = {
            "Strategies for implementation": None,  # No direct match found (no sector?)
            "Disaster Risk Reduction and Climate Action": "DRR",
            "Health": "Health (public)",
            "Livelihoods and Basic Needs": "Livelihoods and basic needs",
            "Migration and Displacement": "Migration",
            "Protection, Gender and Inclusion": "PGI",
            "Shelter and Settlements": "Shelter",
            "Water Sanitation and Hygiene": "WASH",
            "Secretariat Services": None,  # No direct match found; should surface as IFRC learning
            "National Society Strengthening": "NS Strengthening",
            "Water, Sanitation And Hygiene": "WASH",
            "Protection, Gender And Inclusion": "PGI",
            "Shelter Housing And Settlements": "Shelter",
            "Livelihoods And Basic Needs": "Livelihoods and basic needs",
            "Community Engagement And Accountability": "CEA",
            "Multi-purpose Cash": "Livelihoods and basic needs",  # No direct match found
            "Risk Reduction, Climate Adaptation And Recovery": "DRR",
            "Migration": "Migration",
            "Education": "Education",
            "Shelter and Basic Household Items": "Shelter",
            "Multi Purpose Cash": "Livelihoods and basic needs",
            "Environmental Sustainability": None,
            "Migration And Displacement": "Migration",
            "Coordination And Partnerships": "NS Strengthening",
        }

        return mapping_per, dict_per, mapping_sector, dict_sector, dict_org, dict_finding

    def format_data(self, df, mapping_per, dict_per, mapping_sector, dict_sector, dict_org, dict_finding):
        logger.info('Formatting data to upload to the GO Operational Learning table')

        df.loc[:, 'mapped_per'] = [mapping_per[x] if pd.notna(x) else None for x in df['PER - Component']]
        df.loc[:, 'id_per'] = [dict_per[x] if pd.notna(x) else None for x in df['mapped_per']]
        df.loc[:, 'mapped_sector'] = [mapping_sector[x] if pd.notna(x) else None for x in df['Sector']]
        df.loc[:, 'id_sector'] = [dict_sector[x] if pd.notna(x) else None for x in df['mapped_sector']]
        df.loc[:, 'id_institution'] = [dict_org[x] for x in df['Institution']]
        df.loc[:, 'id_finding'] = [dict_finding[x] for x in df['Finding']]

        formatted_data = df[['appeal_code', 'Excerpts', 'id_per', 'id_sector', 'id_institution', 'id_finding']]

        return formatted_data

    def manage_duplicates(self, df):
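        # Collapse rows that share the same excerpt, appeal and finding type,
        # aggregating their tags into deduplicated lists.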
        df = df.groupby(['appeal_code', 'Excerpts', 'id_finding'], as_index=False).agg(list)

        # Cast ids to plain int so the payload stays JSON-serializable
        # (pandas may yield numpy integer types here)
        df['id_per'] = [list(set(int(y) for y in x if pd.notna(y))) for x in df['id_per']]
        df['id_sector'] = [list(set(int(y) for y in x if pd.notna(y))) for x in df['id_sector']]
        df['id_institution'] = [list(set(int(y) for y in x if pd.notna(y))) for x in df['id_institution']]

        deduplicated_data = df

        return deduplicated_data

    def post_to_api(self, df, api_post_endpoint):
        logger.info('Posting data to the GO Operational Learning API')

        url = api_post_endpoint

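        # Build one payload per row, mirroring the ops-learning serializer
        # fields; the *_validated fields start as copies of the raw values and
        # is_validated stays False pending human review.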
        myobj = {}
        for i in range(0, len(df)):
            myobj[i] = {"learning": df['Excerpts'].iloc[i],
                        "learning_validated": df['Excerpts'].iloc[i],
                        "appeal_code": df['appeal_code'].iloc[i],
                        "type": int(df['id_finding'].iloc[i]),
                        "type_validated": int(df['id_finding'].iloc[i]),
                        "sector": df['id_sector'].iloc[i],
                        "sector_validated": df['id_sector'].iloc[i],
                        "per_component": df['id_per'].iloc[i],
                        "per_component_validated": df['id_per'].iloc[i],
                        "organization": df['id_institution'].iloc[i],
                        "organization_validated": df['id_institution'].iloc[i],
                        "is_validated": False}

        # Retry with exponential backoff (capped at 10 s), at most 5 attempts
        @retry(wait_exponential_multiplier=1000, wait_exponential_max=10000, stop_max_attempt_number=5)
        def post_request(x):
            response = requests.post(url, json=myobj[x], headers=self.go_authorization_token)
            response.raise_for_status()  # Raise HTTPError for bad responses
            return response

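        # post_request already retries transparently, so the except blocks
        # below only log failures that exhausted all retry attempts.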
        for x in range(0, len(myobj)):
            try:
                response = post_request(x)
                logger.info("Response status: %s", response.status_code)
                time.sleep(1)
            except requests.exceptions.HTTPError as errh:
                logger.error(f"HTTP Error: {errh}")
                time.sleep(5)  # Back off before moving to the next record
            except requests.exceptions.RequestException as err:
                logger.error(f"Request Exception: {err}")
                time.sleep(5)  # Back off before moving to the next record

    def handle(self, *args, **options):
        logger.info("Starting extracting tags for ops learnings")
        self.set_auth_token()

        # Step 0: Setup (sent_tokenize needs the NLTK punkt model)
        nltk.download('punkt')

        # Step 1: Fetch data
        final_report, appeal, ops_learning = self.fetch_data('dref-final-report', 'appeal', 'ops-learning')
        filtered_data = self.filter_final_report(final_report, appeal, ops_learning, final_report_is_published=True, appeal_is_published=True, in_ops_learning=False)

        rows = 0
        if filtered_data is not None:
            # Step 2: Data preprocessing
            split_learnings = self.split_rows(filtered_data)

            if split_learnings is not None:
                # Step 3: Tagging
                tagged_data = self.tag_data(split_learnings, 'PER - Component', CLASSIFY_URL)

                # Step 4: Post-processing
                mapping_per, dict_per, mapping_sector, dict_sector, dict_org, dict_finding = self.fetch_complementary_data('per-formcomponent', 'primarysector')
                formatted_data = self.format_data(tagged_data, mapping_per, dict_per, mapping_sector, dict_sector, dict_org, dict_finding)
                deduplicated_data = self.manage_duplicates(formatted_data)

                # Step 5: Post to the API endpoint
                self.post_to_api(deduplicated_data, OPS_LEARNING_URL)

                # A DataFrame has no unambiguous truth value, so take len() directly
                rows = len(deduplicated_data)
                logger.info("%s new Operational Learnings ingested to GO API", rows)

        if rows == 0:
            body = {
                "name": CRON_NAME,
                "message": "No new learnings added. Done processing ops learnings.",
                "num_result": rows,
                "status": CronJobStatus.WARNED
            }
        else:
            body = {
                "name": CRON_NAME,
                "message": "Done processing ops learnings",
                "num_result": rows,
                "status": CronJobStatus.SUCCESSFUL,
            }

        CronJob.sync_cron(body)