
Commit f95b267

Improve performance of LitPop (#720)
* Improve handling of region_id in LitPop
* Assume data from LitPop is gridded when determining region IDs.
* Add option to pass 'region_id' parameter to LitPop.from_shape.
* Streamline gridpoints_core_calc
* Avoid copying data by using np.asarray.
* Streamline computations by relying on numpy broadcasting.
* Avoid rounding the same data twice
* Fix linter issue and typo
* Normalize string delimiters
* Fix typo
* Add suggestions from GitHub discussions
* Rename return value in local function.
* Remove named argument call in `LitPop.from_shape`.
* Improve `region_id` parameter documentation.
* Fix return variable name in gridpoints_core_calc
* Update CHANGELOG.md
1 parent 54dd0fc commit f95b267
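
The user-facing change is the new `region_id` argument of `LitPop.from_shape` (see the litpop.py diff below). A minimal usage sketch, assuming a local CLIMADA installation with the LitPop input data available; the polygon, the total value, and the numeric ISO code 756 (Switzerland) are illustrative placeholders, not taken from this commit:

    from shapely.geometry import Polygon
    from climada.entity import LitPop

    # Illustrative rectangle in lon/lat degrees (not from the commit)
    shape = Polygon([(8.0, 46.8), (8.6, 46.8), (8.6, 47.2), (8.0, 47.2)])

    # With region_id given, this single code is written to every point of the
    # resulting GeoDataFrame and the per-point country lookup is skipped.
    exp = LitPop.from_shape(shape, total_value=1e9, region_id=756)

    # With region_id=None (the default), the region ID of every coordinate is
    # determined automatically, at a slight computational cost.
    exp_auto = LitPop.from_shape(shape, total_value=1e9)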

File tree: 3 files changed (+85, −57 lines)

- CHANGELOG.md
- climada/entity/exposures/litpop/litpop.py
- climada/entity/exposures/test/test_litpop.py


CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -39,6 +39,8 @@ Removed:
 - Use `myst-nb` for parsing Jupyter Notebooks for the documentation instead of `nbsphinx` [#712](https://github.com/CLIMADA-project/climada_python/pull/712)
 - Installation guide now recommends installing CLIMADA directly via `conda install` [#714](https://github.com/CLIMADA-project/climada_python/pull/714)
 - `Exposures.affected_total_value` now takes a hazard intensity threshold as argument. Affected values are only those for which at least one event exceeds the threshold. (previously, all exposures points with an assigned centroid were considered affected) [#702](https://github.com/CLIMADA-project/climada_python/pull/702)
+- Add option to pass region ID to `LitPop.from_shape` [#720](https://github.com/CLIMADA-project/climada_python/pull/720)
+- Slightly improved performance on `LitPop`-internal computations [#720](https://github.com/CLIMADA-project/climada_python/pull/720)
 
 ### Fixed
 
climada/entity/exposures/litpop/litpop.py

Lines changed: 81 additions & 53 deletions
@@ -460,8 +460,18 @@ def set_custom_shape(self, *args, **kwargs):
         self.__dict__ = LitPop.from_shape(*args, **kwargs).__dict__
 
     @classmethod
-    def from_shape(cls, shape, total_value, res_arcsec=30, exponents=(1,1), value_unit='USD',
-                   reference_year=DEF_REF_YEAR, gpw_version=GPW_VERSION, data_dir=SYSTEM_DIR):
+    def from_shape(
+        cls,
+        shape,
+        total_value,
+        res_arcsec=30,
+        exponents=(1,1),
+        value_unit='USD',
+        region_id=None,
+        reference_year=DEF_REF_YEAR,
+        gpw_version=GPW_VERSION,
+        data_dir=SYSTEM_DIR,
+    ):
         """init LitPop exposure object for a custom shape.
         Requires user input regarding the total value to be disaggregated.
@@ -488,6 +498,12 @@ def from_shape(cls, shape, total_value, res_arcsec=30, exponents=(1,1), value_un
             Defining power with which lit (nightlights) and pop (gpw) go into LitPop.
         value_unit : str
             Unit of exposure values. The default is USD.
+        region_id : int, optional
+            The numeric ISO 3166 region associated with the shape. If set to a value,
+            this single value will be set for every coordinate in the ``GeoDataFrame``
+            of the resulting ``LitPop`` instance. If ``None`` (default), the region ID
+            for every coordinate will be determined automatically (at a slight
+            computational cost).
         reference_year : int, optional
             Reference year for data sources. Default: CONFIG.exposures.def_ref_year
         gpw_version : int, optional
@@ -512,10 +528,15 @@ def from_shape(cls, shape, total_value, res_arcsec=30, exponents=(1,1), value_un
                                       'GeoSeries. Loop over elements of series outside method.')
 
         tag = Tag()
-        litpop_gdf, _ = _get_litpop_single_polygon(shape, reference_year,
-                                                   res_arcsec, data_dir,
-                                                   gpw_version, exponents,
-                                                   )
+        litpop_gdf, _ = _get_litpop_single_polygon(
+            shape,
+            reference_year,
+            res_arcsec,
+            data_dir,
+            gpw_version,
+            exponents,
+            region_id
+        )
 
         # disaggregate total value proportional to LitPop values:
         if isinstance(total_value, (float, int)):
@@ -673,8 +694,9 @@ def _get_litpop_single_polygon(polygon, reference_year, res_arcsec, data_dir,
         Defining power with which lit (nightlights) and pop (gpw) go into LitPop. To get
         nightlights^3 without population count: (3, 0). To use population count alone: (0, 1).
     region_id : int, optional
-        if provided, region_id of gdf is set to value.
-        The default is None, this implies that region_id is not set.
+        if provided, region_id of gdf is set to value. The default is ``None``, in which
+        case the region is determined automatically for every location of the resulting
+        GeoDataFrame.
     verbose : bool, optional
         Enable verbose logging about the used GPW version and reference year as well as about the
         boundary case where no grid points from the GPW grid are contained in the specified
@@ -755,18 +777,19 @@ def _get_litpop_single_polygon(polygon, reference_year, res_arcsec, data_dir,
                                            meta_out['width'],
                                            meta_out['height'])
     # init GeoDataFrame from data and coordinates:
-    gdf = geopandas.GeoDataFrame({'value': litpop_array.flatten()}, crs=meta_out['crs'],
-                                 geometry=geopandas.points_from_xy(
-                                     np.round_(lon.flatten(), decimals = 8, out = None),
-                                     np.round_(lat.flatten(), decimals = 8, out = None)
-                                     )
-                                 )
-    gdf['latitude'] = np.round_(lat.flatten(), decimals = 8, out = None)
-    gdf['longitude'] = np.round_(lon.flatten(), decimals = 8, out = None)
+    latitude = np.round(lat.flatten(), decimals=8)
+    longitude = np.round(lon.flatten(), decimals=8)
+    gdf = geopandas.GeoDataFrame(
+        {"value": litpop_array.flatten(), "latitude": latitude, "longitude": longitude},
+        crs=meta_out['crs'],
+        geometry=geopandas.points_from_xy(longitude, latitude),
+    )
     if region_id is not None: # set region_id
         gdf['region_id'] = region_id
     else:
-        gdf['region_id'] = u_coord.get_country_code(gdf.latitude, gdf.longitude)
+        gdf['region_id'] = u_coord.get_country_code(
+            gdf.latitude, gdf.longitude, gridded=True
+        )
     # remove entries outside polygon with `dropna` and return GeoDataFrame:
     return gdf.dropna(), meta_out
 
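
The `else` branch above is where the speed-up announced in the commit message ("Assume data from LitPop is gridded when determining region IDs") comes in: with `region_id` given, a single constant is assigned; otherwise `get_country_code` is called with `gridded=True`. A standalone sketch of the same branch, in which only the sample 2x2 grid is invented, while the calls mirror the diff:

    import numpy as np
    import geopandas
    import climada.util.coordinates as u_coord

    # Illustrative 2x2 grid of points (lat/lon in degrees)
    latitude = np.round(np.array([47.0, 47.0, 47.1, 47.1]), decimals=8)
    longitude = np.round(np.array([8.0, 8.1, 8.0, 8.1]), decimals=8)
    gdf = geopandas.GeoDataFrame(
        {"value": np.ones(4), "latitude": latitude, "longitude": longitude},
        crs="EPSG:4326",
        geometry=geopandas.points_from_xy(longitude, latitude),
    )

    region_id = None  # or e.g. 756 to skip the lookup entirely
    if region_id is not None:
        gdf["region_id"] = region_id  # one constant value, no lookup cost
    else:
        # gridded=True lets the lookup exploit the regular grid of LitPop points
        gdf["region_id"] = u_coord.get_country_code(
            gdf.latitude, gdf.longitude, gridded=True
        )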
@@ -953,8 +976,10 @@ def reproject_input_data(data_array_list, meta_list,
     meta_out['width'] = data_out_list[-1].shape[1]
     return data_out_list, meta_out
 
-def gridpoints_core_calc(data_arrays, offsets=None, exponents=None,
-                         total_val_rescale=None):
+
+def gridpoints_core_calc(
+    data_arrays, offsets=None, exponents=None, total_val_rescale=None
+):
     """
     Combines N dense numerical arrays by point-wise multipilcation and
     optionally rescales to new total value:
@@ -992,47 +1017,50 @@ def gridpoints_core_calc(data_arrays, offsets=None, exponents=None,
 
     Returns
     -------
-    result_array : np.array of same shape as arrays in data_arrays
+    np.array of same shape as arrays in data_arrays
         Results from calculation described above.
     """
-    # convert input data to arrays
-    # check integrity of data_array input (length and type):
-    try:
-        data_arrays = [np.array(data) for data in data_arrays]
-        if len(list({data.shape for data in data_arrays})) > 1:
-            raise ValueError("Elements in data_arrays don't agree in shape.")
-    except AttributeError as err:
-        raise TypeError("data_arrays or contained elements have wrong type.") from err
-
-    # if None, defaults for offsets and exponents are set:
-    if offsets is None:
-        offsets = np.zeros(len(data_arrays))
-    if exponents is None:
-        exponents = np.ones(len(data_arrays))
-    if np.min(offsets) < 0:
-        raise ValueError("offset values < 0 are not allowed, because negative offset "
-                         "values produce undesired negative disaggregation values.")
-    if np.min(exponents) < 0:
-        raise ValueError("exponents < 0 are not allowed, because negative exponents "
-                         "produce 'inf' values where the initial value is 0.")
+    # Convert input data to arrays
+    # NOTE: Setting dtype will cause an error if shapes are not compatible
+    data = np.asarray(data_arrays, dtype=float)
+
+    def prepare_input(input_data, default_value, name):
+        """Prepare the offset and exposure inputs
+
+        If they are ``None``, use the default values. If not, cast them to an array and
+        ensure that the array has only one dimension. Finally, check if the values are
+        positive.
+        """
+        if input_data is None:
+            return default_value
+
+        array = np.asarray(input_data, dtype=float)
+        if array.ndim > 1:
+            raise ValueError(f"Provide {name} as 1D array or list of scalars")
+        if np.any(array < 0):
+            raise ValueError(
+                f"{name} values < 0 are not allowed, because negative {name} "
+                "values produce undesired negative disaggregation values."
+            )
+        return array
+
+    offsets = prepare_input(offsets, 0, "offsets")
+    exponents = prepare_input(exponents, 1, "exponents")
+
     # Steps 1-3: arrays are multiplied after application of offets and exponents:
-    # (arrays are converted to float to prevent ValueError:
-    # "Integers to negative integer powers are not allowed.")
-    result_array = np.power(np.array(data_arrays[0]+offsets[0], dtype=float),
-                            exponents[0])
-    for i in np.arange(1, len(data_arrays)):
-        result_array = np.multiply(result_array,
-                                   np.power(np.array(data_arrays[i]+offsets[i], dtype=float),
-                                            exponents[i])
-                                   )
-
-    # Steps 4+5: if total value for rescaling is provided, result_array is normalized and
+    # NOTE: Transpose so that broadcasting over last dimension works. Computing the
+    #   product over the last axis is also much faster. Then transpose back.
+    data = (data.T + offsets) ** exponents
+    data = data.prod(axis=-1).T
+
+    # Steps 4+5: if total value for rescaling is provided, result is normalized and
     # scaled with this total value (total_val_rescale):
     if isinstance(total_val_rescale, (float, int)):
-        return np.divide(result_array, result_array.sum()) * total_val_rescale
+        return data / data.sum() * total_val_rescale
     if total_val_rescale is not None:
         raise TypeError("total_val_rescale must be int or float.")
-    return result_array
+    return data
+
 
 # The following functions are only required if calc_admin1 is True,
 # not for core LitPop. They are maintained here mainly for backward compatibility
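
The loop-free core above is the main performance change of this commit. A small self-contained sketch showing that the broadcasted expression reproduces the old point-wise loop; the two sample layers, offsets, and exponents are invented for illustration:

    import numpy as np

    # Two illustrative 2x3 layers, e.g. "lit" and "pop"
    data_arrays = [np.array([[1., 2., 3.], [4., 5., 6.]]),
                   np.array([[2., 2., 2.], [3., 3., 3.]])]
    offsets = np.array([0.0, 1.0])
    exponents = np.array([3.0, 1.0])

    # Old approach: explicit Python loop over the N arrays
    result_loop = np.power(data_arrays[0] + offsets[0], exponents[0])
    for i in range(1, len(data_arrays)):
        result_loop = result_loop * np.power(data_arrays[i] + offsets[i], exponents[i])

    # New approach: stack once, broadcast offsets/exponents over the last axis
    data = np.asarray(data_arrays, dtype=float)  # shape (N, 2, 3)
    data = (data.T + offsets) ** exponents       # transposed shape (3, 2, N)
    result_bcast = data.prod(axis=-1).T          # product over N, transpose back

    assert np.allclose(result_loop, result_bcast)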
climada/entity/exposures/test/test_litpop.py

Lines changed: 2 additions & 4 deletions
@@ -205,13 +205,11 @@ def test_gridpoints_core_calc_input_errors(self):
             lp.gridpoints_core_calc(data_arrays_demo(4))
 
         # wrong format:
-        with self.assertRaises(TypeError):
+        with self.assertRaises(ValueError):
             lp.gridpoints_core_calc(data, exponents=['a', 'b'])
         data.append('hello i am a string')
         with self.assertRaises(ValueError):
             lp.gridpoints_core_calc(data)
-        with self.assertRaises(TypeError):
-            lp.gridpoints_core_calc(777)
 
     def test_gridpoints_core_calc_default_1(self):
         """test function gridpoints_core_calc, i.e. core data combination
@@ -374,4 +372,4 @@ def test_get_value_unit_pass(self):
 if __name__ == "__main__":
     TESTS = unittest.TestLoader().loadTestsFromTestCase(TestLitPop)
     # TESTS.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUncertainty))
-    unittest.TextTestRunner(verbosity=2).run(TESTS)
+    unittest.TextTestRunner(verbosity=2).run(TESTS)
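
The expected exception in the first test changes because `gridpoints_core_calc` now casts its inputs with `np.asarray(..., dtype=float)`, so non-numeric exponents and mismatched array shapes both fail as `ValueError` during that cast. A numpy-only sketch of the two failure modes exercised by the updated test:

    import numpy as np

    # Non-numeric exponents fail while casting to float ...
    try:
        np.asarray(['a', 'b'], dtype=float)
    except ValueError as err:
        print("exponents:", err)  # e.g. could not convert string to float: 'a'

    # ... and so does a data list whose elements do not share shape and type
    try:
        np.asarray([np.ones((2, 2)), 'hello i am a string'], dtype=float)
    except ValueError as err:
        print("data_arrays:", err)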