Skip to content

Commit 297326e

Browse files
refactor Exposures.write_hdf5 and .from_hdf5: use wkb instead of pickle for geometry serialization
1 parent 4d2d690 commit 297326e

File tree

1 file changed

+40
-4
lines changed

1 file changed

+40
-4
lines changed

climada/entity/exposures/base.py

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
from geopandas import GeoDataFrame, GeoSeries, points_from_xy
3838
from mpl_toolkits.axes_grid1 import make_axes_locatable
3939
from rasterio.warp import Resampling
40+
from xarray import DataArray
4041

4142
import climada.util.coordinates as u_coord
4243
import climada.util.hdf5_handler as u_hdf5
@@ -1121,27 +1122,40 @@ def plot_basemap(
11211122
self.to_crs(crs_ori, inplace=True)
11221123
return axis
11231124

1124-
def write_hdf5(self, file_name):
1125+
def write_hdf5(self, file_name, pickle_geometry=False):
11251126
"""Write data frame and metadata in hdf5 format
11261127
11271128
Parameters
11281129
----------
11291130
file_name : str
11301131
(path and) file name to write to.
1132+
pickle_geometry : bool
1133+
flag, indicating whether the "geometry" of the Exposures `data` will be stored as
1134+
pickled shapely objects instead of wkb bytes. This is faster but less durable, because
1135+
pickled data may become unreadable with future shapely versions.
1136+
Default: False
11311137
"""
11321138
LOGGER.info("Writing %s", file_name)
11331139
store = pd.HDFStore(file_name, mode="w")
1134-
pandas_df = pd.DataFrame(self.gdf)
1140+
pandas_df = pd.DataFrame(self.data)
1141+
wkb_data = {}
11351142
for col in pandas_df.columns:
11361143
if str(pandas_df[col].dtype) == "geometry":
1137-
pandas_df[col] = np.asarray(self.gdf[col])
1144+
if pickle_geometry:
1145+
pandas_df[col] = np.asarray(self.data[col])
1146+
else:
1147+
wkb_data[col] = to_wkb_store(self.geometry)
1148+
pandas_df.drop(columns=["geometry"])
11381149

11391150
# Avoid pandas PerformanceWarning when writing HDF5 data
11401151
with warnings.catch_warnings():
11411152
warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning)
11421153
# Write dataframe
11431154
store.put("exposures", pandas_df)
11441155

1156+
if wkb_data:
1157+
store.put("wkb_data", wkb_data)
1158+
11451159
var_meta = {}
11461160
for var in type(self)._metadata:
11471161
var_meta[var] = getattr(self, var)
@@ -1184,7 +1198,14 @@ def from_hdf5(cls, file_name):
11841198
crs = metadata.get("crs", metadata.get("_crs"))
11851199
if crs is None and metadata.get("meta"):
11861200
crs = metadata["meta"].get("crs")
1187-
exp = cls(store["exposures"], crs=crs)
1201+
data = pd.DataFrame(store["exposures"])
1202+
try:
1203+
wkb_data = store.get("wkb_data")
1204+
except KeyError:
1205+
wkb_data = {}
1206+
for col, val in wkb_data.items():
1207+
data[col] = from_wkb_store(val)
1208+
exp = cls(data, crs=crs)
11881209
for key, val in metadata.items():
11891210
if key in type(exp)._metadata: # pylint: disable=protected-access
11901211
setattr(exp, key, val)
@@ -1553,6 +1574,21 @@ def _read_mat_optional(exposures, data, var_names):
15531574
pass
15541575

15551576

1577+
def to_wkb_store(geometry, store=None, name="wkb_data"):
    """Convert geometries to WKB bytes and optionally write them to an HDF5 group.

    Parameters
    ----------
    geometry : geopandas.GeoSeries
        Geometries to serialize. Only ``geometry.to_wkb().to_numpy()`` is used.
    store : h5py.Group or h5py.File, optional
        If given, the WKB bytes are additionally written to a new one-dimensional
        dataset of variable-length ``uint8`` arrays inside this group.
        NOTE(review): assumed to be an h5py object exposing ``create_dataset`` --
        confirm against the intended callers.
        Default: None (nothing is written, the WKB array is only returned)
    name : str, optional
        Name of the dataset created in `store`. Ignored if `store` is None.
        Default: "wkb_data"

    Returns
    -------
    numpy.ndarray
        Array holding one WKB ``bytes`` object per geometry.
    """
    wkb_data = geometry.to_wkb().to_numpy()
    if store is None:
        return wkb_data

    # Imported lazily so that h5py is only required when actually writing.
    import h5py  # pylint: disable=import-outside-toplevel

    # Store WKB as variable-length byte arrays. A dataset has to be created
    # through its parent group; its dtype is fixed at creation time.
    vlen_uint8 = h5py.vlen_dtype(np.dtype("uint8"))
    wkb_dataset = store.create_dataset(
        name, shape=(len(wkb_data),), dtype=vlen_uint8
    )
    for i, geom_bytes in enumerate(wkb_data):
        wkb_dataset[i] = np.frombuffer(geom_bytes, dtype="uint8")

    return wkb_data
1590+
1591+
15561592
def _read_mat_metadata(exposures, data, file_name, var_names):
15571593
"""Fill metadata in DataFrame object"""
15581594
try:

0 commit comments

Comments
 (0)