Update EMDAT VARNAMES for version 2023 (#701)

simonameiler · emanuel-schmid · web-flow · commit dfb1a9267a14 · 2023-04-27T13:36:55.000+02:00
* Update EMDAT VARNAMES for version 2023

Update the EM-DAT variable names to match the column structure of EM-DAT files downloaded in 2023

* impact_data: default to the newest known EM-DAT file version instead of hard-coding version 2020

* clean_emdat_df: add inline comments for 2018 case

---------

Co-authored-by: emanuel-schmid <schmide@ethz.ch>
diff --git a/climada/engine/calibration_opt.py b/climada/engine/calibration_opt.py
@@ -31,7 +31,7 @@
 
 from climada.engine import ImpactCalc
 from climada.entity import ImpactFuncSet, ImpfTropCyclone, impact_funcs
-from climada.engine.impact_data import emdat_impact_yearlysum, emdat_impact_event
+from climada.engine.impact_data import emdat_impact_yearlysum  #, emdat_impact_event
 
 LOGGER = logging.getLogger(__name__)
 
@@ -261,7 +261,7 @@ def init_impact_data(hazard_type,
                                              reference_year=reference_year)
         else:
             raise ValueError('init_impact_data not yet implemented for yearly_impact = False.')
-            em_data = emdat_impact_event(source_file)
+            #em_data = emdat_impact_event(source_file)
     else:
         raise ValueError('init_impact_data not yet implemented for other impact_data_sources '
                          'than emdat.')
diff --git a/climada/engine/impact_data.py b/climada/engine/impact_data.py
@@ -114,7 +114,57 @@
             "Reconstruction Costs ('000 US$)": "Reconstruction Costs ('000 US$)",
             "Insured Damages ('000 US$)": "Insured Damages ('000 US$)",
             "Total Damages ('000 US$)": "Total Damages ('000 US$)",
-            'CPI': 'CPI'}}
+            'CPI': 'CPI'},
+     2023: {'Dis No': 'Dis No',
+            'Year': 'Year',
+            'Seq': 'Seq',
+            'Glide': 'Glide',
+            'Disaster Group': 'Disaster Group',
+            'Disaster Subgroup': 'Disaster Subgroup',
+            'Disaster Type': 'Disaster Type',
+            'Disaster Subtype': 'Disaster Subtype',
+            'Disaster Subsubtype': 'Disaster Subsubtype',
+            'Event Name': 'Event Name',
+            'Country': 'Country',
+            'ISO': 'ISO',
+            'Region': 'Region',
+            'Continent': 'Continent',
+            'Location': 'Location',
+            'Origin': 'Origin',
+            'Associated Dis': 'Associated Dis',
+            'Associated Dis2': 'Associated Dis2',
+            'OFDA Response': 'OFDA Response',
+            'Appeal': 'Appeal',
+            'Declaration': 'Declaration',
+            "AID Contribution ('000 US$)": "AID Contribution ('000 US$)",
+            'Dis Mag Value': 'Dis Mag Value',
+            'Dis Mag Scale': 'Dis Mag Scale',
+            'Latitude': 'Latitude',
+            'Longitude': 'Longitude',
+            'Local Time': 'Local Time',
+            'River Basin': 'River Basin',
+            'Start Year': 'Start Year',
+            'Start Month': 'Start Month',
+            'Start Day': 'Start Day',
+            'End Year': 'End Year',
+            'End Month': 'End Month',
+            'End Day': 'End Day',
+            'Total Deaths': 'Total Deaths',
+            'No Injured': 'No Injured',
+            'No Affected': 'No Affected',
+            'No Homeless': 'No Homeless',
+            'Total Affected': 'Total Affected',
+            "Reconstruction Costs ('000 US$)": "Reconstruction Costs ('000 US$)",
+            "Reconstruction Costs, Adjusted ('000 US$)": "Reconstruction Costs, Adjusted ('000 US$)",
+            "Insured Damages ('000 US$)": "Insured Damages ('000 US$)",
+            "Insured Damages, Adjusted ('000 US$)": "Insured Damages, Adjusted ('000 US$)",
+            "Total Damages ('000 US$)": "Total Damages ('000 US$)",
+            "Total Damages, Adjusted ('000 US$)": "Total Damages, Adjusted ('000 US$)",
+            'CPI': 'CPI',
+            'Adm Level': 'Adm Level',
+            'Admin1 Code': 'Admin1 Code',
+            'Admin2 Code': 'Admin2 Code',
+            'Geo Locations': 'Geo Locations'}}
 
 
 def assign_hazard_to_emdat(certainty_level, intensity_path_haz, names_path_haz,
@@ -473,7 +523,7 @@ def check_assigned_track(lookup, checkset):
 
 
 def clean_emdat_df(emdat_file, countries=None, hazard=None, year_range=None,
-                   target_version=2020):
+                   target_version=None):
     """
     Get a clean and standardized DataFrame from EM-DAT-CSV-file
     (1) load EM-DAT data from CSV to DataFrame and remove header/footer,
@@ -501,7 +551,8 @@ def clean_emdat_df(emdat_file, countries=None, hazard=None, year_range=None,
         (only min and max are considered)
     target_version : int
         required EM-DAT data format version (i.e. year of download),
-        changes naming of columns/variables (default: 2020)
+        changes naming of columns/variables,
+        default: newest available version in ``VARNAMES_EMDAT`` that matches the given emdat_file
 
     Returns
     -------
@@ -527,12 +578,16 @@ def clean_emdat_df(emdat_file, countries=None, hazard=None, year_range=None,
 
     # (2)  handle version, clean up, and add columns:
     # (2.1) identify underlying EMDAT version of csv:
-    version = 2020
-    for vers in list(VARNAMES_EMDAT.keys()):
+    version = None
+    for vers in sorted(VARNAMES_EMDAT.keys()):
         if len(df_emdat.columns) >= len(VARNAMES_EMDAT[vers]) and \
            all(item in list(df_emdat.columns) for item in VARNAMES_EMDAT[vers].values()):
             version = vers
+    if not version:
+        raise ValueError("the given emdat_file contains unexpected columns and cannot be"
+                         " associated with any known EM-DAT file structure")
     # (2.2) create new DataFrame df_data with column names as target version
+    target_version = target_version or version
     df_data = pd.DataFrame(index=df_emdat.index.values,
                            columns=VARNAMES_EMDAT[target_version].values())
     if 'Year' not in df_data.columns:  # make sure column "Year" exists
@@ -551,6 +606,9 @@ def clean_emdat_df(emdat_file, countries=None, hazard=None, year_range=None,
                     years_list.append(np.nan)
             df_data[col] = years_list
     if version <= 2018 and target_version >= 2020:
+        # create 'Start Year', 'Start Month' and 'Start Day' from 'Start date'
+        # ignore 'End date'
+        # replace NaN with None in 'Disaster Subtype', 'Disaster Type' and 'Country'
         date_list = list()
         year_list = list()
         month_list = list()
@@ -705,7 +763,7 @@ def scale_impact2refyear(impact_values, year_values, iso3a_values, reference_yea
 
 def emdat_impact_yearlysum(emdat_file_csv, countries=None, hazard=None, year_range=None,
                            reference_year=None, imp_str="Total Damages ('000 US$)",
-                           version=2020):
+                           version=None):
     """function to load EM-DAT data and sum impact per year
 
     Parameters
@@ -727,16 +785,18 @@ def emdat_impact_yearlysum(emdat_file_csv, countries=None, hazard=None, year_ran
     year_range : list or tuple
         Year range to be extracted, e.g. (2000, 2015);
         (only min and max are considered)
-    version : int
+    version : int, optional
         required EM-DAT data format version (i.e. year of download),
-        changes naming of columns/variables (default: 2020)
+        changes naming of columns/variables,
+        default: newest available version in ``VARNAMES_EMDAT``
 
     Returns
     -------
     out : pd.DataFrame
         DataFrame with summed impact and scaled impact per
         year and country.
     """
+    version = version or max(VARNAMES_EMDAT.keys())
     imp_str = VARNAMES_EMDAT[version][imp_str]
     df_data = clean_emdat_df(emdat_file_csv, countries=countries, hazard=hazard,
                              year_range=year_range, target_version=version)
@@ -773,7 +833,7 @@ def emdat_impact_yearlysum(emdat_file_csv, countries=None, hazard=None, year_ran
 
 def emdat_impact_event(emdat_file_csv, countries=None, hazard=None, year_range=None,
                        reference_year=None, imp_str="Total Damages ('000 US$)",
-                       version=2020):
+                       version=None):
     """function to load EM-DAT data return impact per event
 
     Parameters
@@ -801,8 +861,9 @@ def emdat_impact_event(emdat_file_csv, countries=None, hazard=None, year_range=N
     imp_str : str
         Column name of impact metric in EMDAT CSV,
         default = "Total Damages ('000 US$)"
-    version : int
-        EM-DAT version to take variable/column names from (defaul: 2020)
+    version : int, optional
+        EM-DAT version to take variable/column names from,
+        default: newest available version in ``VARNAMES_EMDAT``
 
     Returns
     -------
@@ -812,6 +873,7 @@ def emdat_impact_event(emdat_file_csv, countries=None, hazard=None, year_range=N
         same unit as chosen impact, but multiplied by 1000 if impact is given
         as 1000 US$ (e.g. imp_str="Total Damages ('000 US$) scaled").
     """
+    version = version or max(VARNAMES_EMDAT.keys())
     imp_str = VARNAMES_EMDAT[version][imp_str]
     df_data = clean_emdat_df(emdat_file_csv, hazard=hazard, year_range=year_range,
                              countries=countries, target_version=version)
@@ -883,7 +945,11 @@ def emdat_to_impact(emdat_file_csv, hazard_type_climada, year_range=None, countr
         imp_str = "Insured Damages ('000 US$)"
     elif "Reconstruction Costs" in imp_str:
         imp_str = "Reconstruction Costs ('000 US$)"
-    imp_str = VARNAMES_EMDAT[max(VARNAMES_EMDAT.keys())][imp_str]
+
+    # use the newest version of EMDAT varnames
+    version = max(VARNAMES_EMDAT.keys())
+
+    imp_str = VARNAMES_EMDAT[version][imp_str]
     if not hazard_type_emdat:
         hazard_type_emdat = [hazard_type_climada]
     if reference_year == 0:
@@ -903,7 +969,7 @@ def emdat_to_impact(emdat_file_csv, hazard_type_climada, year_range=None, countr
     # Load EM-DAT impact data by event:
     em_data = emdat_impact_event(emdat_file_csv, countries=countries, hazard=hazard_type_emdat,
                                  year_range=year_range, reference_year=reference_year,
-                                 imp_str=imp_str, version=max(VARNAMES_EMDAT.keys()))
+                                 imp_str=imp_str, version=version)
 
     if isinstance(countries, str):
         countries = [countries]
@@ -915,7 +981,7 @@ def emdat_to_impact(emdat_file_csv, hazard_type_climada, year_range=None, countr
         return impact_instance, countries
     impact_instance.event_id = np.array(em_data.index, int)
     impact_instance.event_name = list(
-        em_data[VARNAMES_EMDAT[max(VARNAMES_EMDAT.keys())]['Dis No']])
+        em_data[VARNAMES_EMDAT[version]['Dis No']])
 
     date_list = list()
     for year in list(em_data['Year']):
@@ -982,8 +1048,7 @@ def emdat_to_impact(emdat_file_csv, hazard_type_climada, year_range=None, countr
             countries_reg_id.append(u_coord.country_to_iso(cntry, "numeric"))
         except LookupError:
             countries_reg_id.append(0)
-        df_tmp = em_data[em_data[VARNAMES_EMDAT[
-            max(VARNAMES_EMDAT.keys())]['ISO']].str.contains(cntry)]
+        df_tmp = em_data[em_data[VARNAMES_EMDAT[version]['ISO']].str.contains(cntry)]
         if not reference_year:
             impact_instance.eai_exp[idx] = sum(np.array(df_tmp["impact"]) *
                                                impact_instance.frequency[0])