Commit ab18675

change of plan: just pickle geometries in wkb format
1 parent 66e1d47 commit ab18675

3 files changed: 71 additions & 78 deletions
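For context on the commit message: WKB (well-known binary) is a portable binary encoding for geometries, so storing WKB bytes avoids pickling shapely objects directly. A minimal round trip with plain geopandas (illustrative only, not CLIMADA API; the sample coordinates are made up):

    import geopandas as gpd
    from shapely.geometry import Point

    # Encode a geometry column as WKB bytes, then decode it again.
    geoms = gpd.GeoSeries([Point(8.55, 47.37), Point(7.45, 46.95)], crs="EPSG:4326")
    wkb_bytes = geoms.to_wkb()  # pandas Series of bytes, safe to store or pickle
    restored = gpd.GeoSeries.from_wkb(wkb_bytes, crs="EPSG:4326")

    assert restored.geom_equals(geoms).all()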

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -15,18 +15,23 @@ Removed:
 - `pandas-datareader`

 ### Added
+
 - Added instructions to install Climada petals on Euler cluster in `doc.guide.Guide_Euler.ipynb` [#1029](https://github.com/CLIMADA-project/climada_python/pull/1029)

 ### Changed
+
 - `Hazard.local_exceedance_intensity`, `Hazard.local_return_period` and `Impact.local_exceedance_impact`, `Impact.local_return_period`, using the `climada.util.interpolation` module: New default (no binning), binning on decimals, and faster implementation [#1012](https://github.com/CLIMADA-project/climada_python/pull/1012)
 - World Bank indicator data is now downloaded directly from their API via the function `download_world_bank_indicator`, instead of relying on the `pandas-datareader` package [#1033](https://github.com/CLIMADA-project/climada_python/pull/1033)
+- `Exposures.write_hdf5` pickles geometry data in WKB format by default, instead of as `shapely` objects. A new flag keeps the previous behavior.

 ### Fixed
+
 - NaN plotting issues in `geo_im_from_array`[#1038](https://github.com/CLIMADA-project/climada_python/pull/1038)

 ### Deprecated

 ### Removed
+
 - `climada.util.interpolation.round_to_sig_digits` [#1012](https://github.com/CLIMADA-project/climada_python/pull/1012)

 ## 6.0.1
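To illustrate the new `Exposures.write_hdf5` entry above, roughly how the default and the opt-out flag are used (a sketch only; the toy DataFrame mirrors the pattern of the I/O test further down, and the file names are placeholders):

    import pandas as pd
    from climada.entity import Exposures

    # Toy exposures table with the usual latitude/longitude/value columns.
    exp = Exposures(
        pd.DataFrame(
            {"latitude": [47.37, 46.95], "longitude": [8.55, 7.45], "value": [1.0, 2.0]}
        ),
        crs="EPSG:4326",
    )

    # New default: geometry columns are written as WKB bytes.
    exp.write_hdf5("exposures_wkb.h5")

    # Previous behavior (CLIMADA versions up to 6.0): pickled shapely objects.
    exp.write_hdf5("exposures_shapely.h5", pickle_geometry_as_shapely=True)

    # Reading is unchanged; from_hdf5 restores geometry from either layout.
    exp_read = Exposures.from_hdf5("exposures_wkb.h5")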

climada/entity/exposures/base.py

Lines changed: 20 additions & 35 deletions
@@ -29,6 +29,7 @@

 import cartopy.crs as ccrs
 import contextily as ctx
+import geopandas as gpd
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
@@ -37,7 +38,6 @@
 from geopandas import GeoDataFrame, GeoSeries, points_from_xy
 from mpl_toolkits.axes_grid1 import make_axes_locatable
 from rasterio.warp import Resampling
-from xarray import DataArray

 import climada.util.coordinates as u_coord
 import climada.util.hdf5_handler as u_hdf5
@@ -1122,44 +1122,43 @@ def plot_basemap(
         self.to_crs(crs_ori, inplace=True)
         return axis

-    def write_hdf5(self, file_name, pickle_geometry=False):
+    def write_hdf5(self, file_name, pickle_geometry_as_shapely=False):
         """Write data frame and metadata in hdf5 format

         Parameters
         ----------
         file_name : str
             (path and) file name to write to.
-        pickle_geometry : bool
+        pickle_geometry_as_shapely : bool
             flag, indicating whether the "geometry" of the Exposures` `data` will be stored as
-            pickled shapely objects instead of wkb bytes. This is faster but less durable, because
-            pickled data may get unreadable for future shapely versions.
+            pickled shapely objects instead of WKB bytes. This was the behavior of CLIMADA
+            versions up to 6.0; it may be faster but is less durable, because pickled data
+            may eventually become unreadable with future shapely versions.
             Default: False
         """
         LOGGER.info("Writing %s", file_name)
         store = pd.HDFStore(file_name, mode="w")
-        pandas_df = pd.DataFrame(self.data)
-        wkb_data = {}
+        pandas_df = pd.DataFrame(self.gdf)
+        wkb_columns = []
         for col in pandas_df.columns:
             if str(pandas_df[col].dtype) == "geometry":
-                if pickle_geometry:
-                    pandas_df[col] = np.asarray(self.data[col])
+                if pickle_geometry_as_shapely:
+                    pandas_df[col] = np.asarray(self.gdf[col])
                 else:
-                    wkb_data[col] = to_wkb_store(self.geometry)
-                    pandas_df.drop(columns=[col], inplace=True)
+                    pandas_df[col] = gpd.GeoSeries.to_wkb(pandas_df[col])
+                    wkb_columns.append(col)

         # Avoid pandas PerformanceWarning when writing HDF5 data
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning)
             # Write dataframe
             store.put("exposures", pandas_df)

-        if wkb_data:
-            store.put("wkb_data", wkb_data)
-
         var_meta = {}
         for var in type(self)._metadata:
             var_meta[var] = getattr(self, var)
         var_meta["crs"] = self.crs
+        var_meta["wkb_columns"] = wkb_columns
         store.get_storer("exposures").attrs.metadata = var_meta

         store.close()
@@ -1199,12 +1198,13 @@ def from_hdf5(cls, file_name):
             if crs is None and metadata.get("meta"):
                 crs = metadata["meta"].get("crs")
             data = pd.DataFrame(store["exposures"])
-            try:
-                wkb_data = store.get("wkb_data")
-            except KeyError:
-                wkb_data = {}
-            for col, val in wkb_data.items():
-                data[col] = from_wkb_store(val)
+
+            wkb_columns = (
+                metadata.pop("wkb_columns") if "wkb_columns" in metadata else []
+            )
+            for col in wkb_columns:
+                data[col] = gpd.GeoSeries.from_wkb(data[col])
+
             exp = cls(data, crs=crs)
             for key, val in metadata.items():
                 if key in type(exp)._metadata:  # pylint: disable=protected-access
@@ -1574,21 +1574,6 @@ def _read_mat_optional(exposures, data, var_names):
         pass


-def to_wkb_store(geometry: np.array, store):
-    wkb_data = geometry.to_wkb().to_numpy()
-    import h5py
-
-    wkb_dataset = h5py.Dataset(store)
-
-    # Store WKB as variable-length byte arrays
-    dt = h5py.vlen_dtype(np.dtype("uint8"))
-    wkb_dataset.dtype = dt
-    for i, geom_bytes in enumerate(wkb_data):
-        wkb_dataset[i] = np.frombuffer(geom_bytes, dtype="uint8")
-
-    return wkb_data
-
-
 def _read_mat_metadata(exposures, data, file_name, var_names):
     """Fill metadata in DataFrame object"""
     try:
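In short, `write_hdf5` now converts every geometry column to WKB bytes via `gpd.GeoSeries.to_wkb` before handing the frame to `pandas.HDFStore`, records the affected column names under `wkb_columns` in the stored metadata, and `from_hdf5` uses that list to convert the bytes back with `gpd.GeoSeries.from_wkb`. A standalone sketch of the same round trip outside CLIMADA (file and data are placeholders; CLIMADA additionally suppresses a pandas PerformanceWarning around the write):

    import geopandas as gpd
    import pandas as pd

    gdf = gpd.GeoDataFrame(
        {"value": [1.0, 2.0]},
        geometry=gpd.points_from_xy([8.55, 7.45], [47.37, 46.95]),
        crs="EPSG:4326",
    )

    # Write: geometry columns become plain bytes; their names go into the metadata.
    df = pd.DataFrame(gdf)
    wkb_columns = []
    for col in df.columns:
        if str(df[col].dtype) == "geometry":
            df[col] = gpd.GeoSeries.to_wkb(df[col])
            wkb_columns.append(col)
    with pd.HDFStore("demo_exposures.h5", mode="w") as store:
        store.put("exposures", df)
        store.get_storer("exposures").attrs.metadata = {"wkb_columns": wkb_columns}

    # Read: the recorded columns are decoded back into shapely geometries.
    with pd.HDFStore("demo_exposures.h5", mode="r") as store:
        metadata = store.get_storer("exposures").attrs.metadata
        data = pd.DataFrame(store["exposures"])
    for col in metadata.get("wkb_columns", []):
        data[col] = gpd.GeoSeries.from_wkb(data[col])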

climada/entity/exposures/test/test_base.py

Lines changed: 46 additions & 43 deletions
@@ -378,60 +378,63 @@ def test_read_template_pass(self):

     def test_io_hdf5_pass(self):
         """write and read hdf5"""
-        exp_df = Exposures(pd.read_excel(ENT_TEMPLATE_XLS), crs="epsg:32632")
-        exp_df.check()
+        exp = Exposures(pd.read_excel(ENT_TEMPLATE_XLS), crs="epsg:32632")
+
         # set metadata
-        exp_df.ref_year = 2020
-        exp_df.value_unit = "XSD"
+        exp.ref_year = 2020
+        exp.value_unit = "XSD"

         file_name = DATA_DIR.joinpath("test_hdf5_exp.h5")

         # pd.errors.PerformanceWarning should be suppressed. Therefore, make sure that
         # PerformanceWarning would result in test failure here
         import warnings

-        with warnings.catch_warnings():
-            warnings.simplefilter("error", category=pd.errors.PerformanceWarning)
-            exp_df.write_hdf5(file_name)
+        for pickle_geometry_as_shapely in [False, True]:
+            with warnings.catch_warnings():
+                warnings.simplefilter("error", category=pd.errors.PerformanceWarning)
+                exp.write_hdf5(
+                    file_name, pickle_geometry_as_shapely=pickle_geometry_as_shapely
+                )

-        exp_read = Exposures.from_hdf5(file_name)
+            exp_read = Exposures.from_hdf5(file_name)

-        self.assertEqual(exp_df.ref_year, exp_read.ref_year)
-        self.assertEqual(exp_df.value_unit, exp_read.value_unit)
-        self.assertEqual(exp_df.description, exp_read.description)
-        np.testing.assert_array_equal(exp_df.latitude, exp_read.latitude)
-        np.testing.assert_array_equal(exp_df.longitude, exp_read.longitude)
-        np.testing.assert_array_equal(exp_df.value, exp_read.value)
-        np.testing.assert_array_equal(
-            exp_df.data["deductible"].values, exp_read.data["deductible"].values
-        )
-        np.testing.assert_array_equal(
-            exp_df.data["cover"].values, exp_read.data["cover"].values
-        )
-        np.testing.assert_array_equal(
-            exp_df.data["region_id"].values, exp_read.data["region_id"].values
-        )
-        np.testing.assert_array_equal(
-            exp_df.data["category_id"].values, exp_read.data["category_id"].values
-        )
-        np.testing.assert_array_equal(
-            exp_df.data["impf_TC"].values, exp_read.data["impf_TC"].values
-        )
-        np.testing.assert_array_equal(
-            exp_df.data["centr_TC"].values, exp_read.data["centr_TC"].values
-        )
-        np.testing.assert_array_equal(
-            exp_df.data["impf_FL"].values, exp_read.data["impf_FL"].values
-        )
-        np.testing.assert_array_equal(
-            exp_df.data["centr_FL"].values, exp_read.data["centr_FL"].values
-        )
+            self.assertEqual(exp.ref_year, exp_read.ref_year)
+            self.assertEqual(exp.value_unit, exp_read.value_unit)
+            self.assertEqual(exp.description, exp_read.description)
+            np.testing.assert_array_equal(exp.latitude, exp_read.latitude)
+            np.testing.assert_array_equal(exp.longitude, exp_read.longitude)
+            np.testing.assert_array_equal(exp.value, exp_read.value)
+            np.testing.assert_array_equal(
+                exp.data["deductible"].values, exp_read.data["deductible"].values
+            )
+            np.testing.assert_array_equal(
+                exp.data["cover"].values, exp_read.data["cover"].values
+            )
+            np.testing.assert_array_equal(
+                exp.data["region_id"].values, exp_read.data["region_id"].values
+            )
+            np.testing.assert_array_equal(
+                exp.data["category_id"].values, exp_read.data["category_id"].values
+            )
+            np.testing.assert_array_equal(
+                exp.data["impf_TC"].values, exp_read.data["impf_TC"].values
+            )
+            np.testing.assert_array_equal(
+                exp.data["centr_TC"].values, exp_read.data["centr_TC"].values
+            )
+            np.testing.assert_array_equal(
+                exp.data["impf_FL"].values, exp_read.data["impf_FL"].values
+            )
+            np.testing.assert_array_equal(
+                exp.data["centr_FL"].values, exp_read.data["centr_FL"].values
+            )

-        self.assertTrue(
-            u_coord.equal_crs(exp_df.crs, exp_read.crs),
-            f"{exp_df.crs} and {exp_read.crs} are different",
-        )
-        self.assertTrue(u_coord.equal_crs(exp_df.gdf.crs, exp_read.gdf.crs))
+            self.assertTrue(
+                u_coord.equal_crs(exp.crs, exp_read.crs),
+                f"{exp.crs} and {exp_read.crs} are different",
+            )
+            self.assertTrue(u_coord.equal_crs(exp.gdf.crs, exp_read.gdf.crs))


 class TestAddSea(unittest.TestCase):
