Commit ab18675

change of plan: just pickle geometries in wkb format
1 parent 66e1d47 commit ab18675

3 files changed: 71 additions & 78 deletions
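For context on the commit message: WKB (well-known binary) is a portable binary encoding for geometries, so storing WKB bytes avoids pickling shapely objects directly. A minimal round trip with plain geopandas (illustrative only, not CLIMADA API; the sample coordinates are made up):

    import geopandas as gpd
    from shapely.geometry import Point

    # Encode a geometry column as WKB bytes, then decode it again.
    geoms = gpd.GeoSeries([Point(8.55, 47.37), Point(7.45, 46.95)], crs="EPSG:4326")
    wkb_bytes = geoms.to_wkb()  # pandas Series of bytes, safe to store or pickle
    restored = gpd.GeoSeries.from_wkb(wkb_bytes, crs="EPSG:4326")

    assert restored.geom_equals(geoms).all()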

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -15,18 +15,23 @@ Removed:
 - `pandas-datareader`

 ### Added
+
 - Added instructions to install Climada petals on Euler cluster in `doc.guide.Guide_Euler.ipynb` [#1029](https://github.com/CLIMADA-project/climada_python/pull/1029)

 ### Changed
+
 - `Hazard.local_exceedance_intensity`, `Hazard.local_return_period` and `Impact.local_exceedance_impact`, `Impact.local_return_period`, using the `climada.util.interpolation` module: New default (no binning), binning on decimals, and faster implementation [#1012](https://github.com/CLIMADA-project/climada_python/pull/1012)
 - World Bank indicator data is now downloaded directly from their API via the function `download_world_bank_indicator`, instead of relying on the `pandas-datareader` package [#1033](https://github.com/CLIMADA-project/climada_python/pull/1033)
+- `Exposures.write_hdf5` pickles geometry data in WKB format by default, instead of as `shapely` objects. A new flag keeps the previous behavior.

 ### Fixed
+
 - NaN plotting issues in `geo_im_from_array`[#1038](https://github.com/CLIMADA-project/climada_python/pull/1038)

 ### Deprecated

 ### Removed
+
 - `climada.util.interpolation.round_to_sig_digits` [#1012](https://github.com/CLIMADA-project/climada_python/pull/1012)

 ## 6.0.1
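To illustrate the new `Exposures.write_hdf5` entry above, roughly how the default and the opt-out flag are used (a sketch only; the toy DataFrame mirrors the pattern of the I/O test further down, and the file names are placeholders):

    import pandas as pd
    from climada.entity import Exposures

    # Toy exposures table with the usual latitude/longitude/value columns.
    exp = Exposures(
        pd.DataFrame(
            {"latitude": [47.37, 46.95], "longitude": [8.55, 7.45], "value": [1.0, 2.0]}
        ),
        crs="EPSG:4326",
    )

    # New default: geometry columns are written as WKB bytes.
    exp.write_hdf5("exposures_wkb.h5")

    # Previous behavior (CLIMADA versions up to 6.0): pickled shapely objects.
    exp.write_hdf5("exposures_shapely.h5", pickle_geometry_as_shapely=True)

    # Reading is unchanged; from_hdf5 restores geometry from either layout.
    exp_read = Exposures.from_hdf5("exposures_wkb.h5")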

climada/entity/exposures/base.py

Lines changed: 20 additions & 35 deletions
@@ -29,6 +29,7 @@

 import cartopy.crs as ccrs
 import contextily as ctx
+import geopandas as gpd
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
@@ -37,7 +38,6 @@
 from geopandas import GeoDataFrame, GeoSeries, points_from_xy
 from mpl_toolkits.axes_grid1 import make_axes_locatable
 from rasterio.warp import Resampling
-from xarray import DataArray

 import climada.util.coordinates as u_coord
 import climada.util.hdf5_handler as u_hdf5
@@ -1122,44 +1122,43 @@ def plot_basemap(
         self.to_crs(crs_ori, inplace=True)
         return axis

-    def write_hdf5(self, file_name, pickle_geometry=False):
+    def write_hdf5(self, file_name, pickle_geometry_as_shapely=False):
         """Write data frame and metadata in hdf5 format

         Parameters
         ----------
         file_name : str
             (path and) file name to write to.
-        pickle_geometry : bool
+        pickle_geometry_as_shapely : bool
             flag, indicating whether the "geometry" of the Exposures` `data` will be stored as
-            pickled shapely objects instead of wkb bytes. This is faster but less durable, because
-            pickled data may get unreadable for future shapely versions.
+            pickled shapely objects instead of WKB bytes. This was the behavior of CLIMADA
+            versions up to 6.0; it may be faster but is less durable, because pickled data
+            may eventually become unreadable with future shapely versions.
             Default: False
         """
         LOGGER.info("Writing %s", file_name)
         store = pd.HDFStore(file_name, mode="w")
-        pandas_df = pd.DataFrame(self.data)
-        wkb_data = {}
+        pandas_df = pd.DataFrame(self.gdf)
+        wkb_columns = []
         for col in pandas_df.columns:
             if str(pandas_df[col].dtype) == "geometry":
-                if pickle_geometry:
-                    pandas_df[col] = np.asarray(self.data[col])
+                if pickle_geometry_as_shapely:
+                    pandas_df[col] = np.asarray(self.gdf[col])
                 else:
-                    wkb_data[col] = to_wkb_store(self.geometry)
-                    pandas_df.drop(columns=[col], inplace=True)
+                    pandas_df[col] = gpd.GeoSeries.to_wkb(pandas_df[col])
+                    wkb_columns.append(col)

         # Avoid pandas PerformanceWarning when writing HDF5 data
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning)
             # Write dataframe
             store.put("exposures", pandas_df)

-        if wkb_data:
-            store.put("wkb_data", wkb_data)
-
         var_meta = {}
         for var in type(self)._metadata:
             var_meta[var] = getattr(self, var)
         var_meta["crs"] = self.crs
+        var_meta["wkb_columns"] = wkb_columns
         store.get_storer("exposures").attrs.metadata = var_meta

         store.close()
@@ -1199,12 +1198,13 @@ def from_hdf5(cls, file_name):
             if crs is None and metadata.get("meta"):
                 crs = metadata["meta"].get("crs")
             data = pd.DataFrame(store["exposures"])
-            try:
-                wkb_data = store.get("wkb_data")
-            except KeyError:
-                wkb_data = {}
-            for col, val in wkb_data.items():
-                data[col] = from_wkb_store(val)
+
+            wkb_columns = (
+                metadata.pop("wkb_columns") if "wkb_columns" in metadata else []
+            )
+            for col in wkb_columns:
+                data[col] = gpd.GeoSeries.from_wkb(data[col])
+
             exp = cls(data, crs=crs)
             for key, val in metadata.items():
                 if key in type(exp)._metadata:  # pylint: disable=protected-access
@@ -1574,21 +1574,6 @@ def _read_mat_optional(exposures, data, var_names):
         pass


-def to_wkb_store(geometry: np.array, store):
-    wkb_data = geometry.to_wkb().to_numpy()
-    import h5py
-
-    wkb_dataset = h5py.Dataset(store)
-
-    # Store WKB as variable-length byte arrays
-    dt = h5py.vlen_dtype(np.dtype("uint8"))
-    wkb_dataset.dtype = dt
-    for i, geom_bytes in enumerate(wkb_data):
-        wkb_dataset[i] = np.frombuffer(geom_bytes, dtype="uint8")
-
-    return wkb_data
-
-
 def _read_mat_metadata(exposures, data, file_name, var_names):
     """Fill metadata in DataFrame object"""
     try:
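In short, `write_hdf5` now converts every geometry column to WKB bytes via `gpd.GeoSeries.to_wkb` before handing the frame to `pandas.HDFStore`, records the affected column names under `wkb_columns` in the stored metadata, and `from_hdf5` uses that list to convert the bytes back with `gpd.GeoSeries.from_wkb`. A standalone sketch of the same round trip outside CLIMADA (file and data are placeholders; CLIMADA additionally suppresses a pandas PerformanceWarning around the write):

    import geopandas as gpd
    import pandas as pd

    gdf = gpd.GeoDataFrame(
        {"value": [1.0, 2.0]},
        geometry=gpd.points_from_xy([8.55, 7.45], [47.37, 46.95]),
        crs="EPSG:4326",
    )

    # Write: geometry columns become plain bytes; their names go into the metadata.
    df = pd.DataFrame(gdf)
    wkb_columns = []
    for col in df.columns:
        if str(df[col].dtype) == "geometry":
            df[col] = gpd.GeoSeries.to_wkb(df[col])
            wkb_columns.append(col)
    with pd.HDFStore("demo_exposures.h5", mode="w") as store:
        store.put("exposures", df)
        store.get_storer("exposures").attrs.metadata = {"wkb_columns": wkb_columns}

    # Read: the recorded columns are decoded back into shapely geometries.
    with pd.HDFStore("demo_exposures.h5", mode="r") as store:
        metadata = store.get_storer("exposures").attrs.metadata
        data = pd.DataFrame(store["exposures"])
    for col in metadata.get("wkb_columns", []):
        data[col] = gpd.GeoSeries.from_wkb(data[col])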

climada/entity/exposures/test/test_base.py

Lines changed: 46 additions & 43 deletions
@@ -378,60 +378,63 @@ def test_read_template_pass(self):

     def test_io_hdf5_pass(self):
         """write and read hdf5"""
-        exp_df = Exposures(pd.read_excel(ENT_TEMPLATE_XLS), crs="epsg:32632")
-        exp_df.check()
+        exp = Exposures(pd.read_excel(ENT_TEMPLATE_XLS), crs="epsg:32632")
+
         # set metadata
-        exp_df.ref_year = 2020
-        exp_df.value_unit = "XSD"
+        exp.ref_year = 2020
+        exp.value_unit = "XSD"

         file_name = DATA_DIR.joinpath("test_hdf5_exp.h5")

         # pd.errors.PerformanceWarning should be suppressed. Therefore, make sure that
         # PerformanceWarning would result in test failure here
         import warnings

-        with warnings.catch_warnings():
-            warnings.simplefilter("error", category=pd.errors.PerformanceWarning)
-            exp_df.write_hdf5(file_name)
+        for pickle_geometry_as_shapely in [False, True]:
+            with warnings.catch_warnings():
+                warnings.simplefilter("error", category=pd.errors.PerformanceWarning)
+                exp.write_hdf5(
+                    file_name, pickle_geometry_as_shapely=pickle_geometry_as_shapely
+                )

-        exp_read = Exposures.from_hdf5(file_name)
+            exp_read = Exposures.from_hdf5(file_name)

-        self.assertEqual(exp_df.ref_year, exp_read.ref_year)
-        self.assertEqual(exp_df.value_unit, exp_read.value_unit)
-        self.assertEqual(exp_df.description, exp_read.description)
-        np.testing.assert_array_equal(exp_df.latitude, exp_read.latitude)
-        np.testing.assert_array_equal(exp_df.longitude, exp_read.longitude)
-        np.testing.assert_array_equal(exp_df.value, exp_read.value)
-        np.testing.assert_array_equal(
-            exp_df.data["deductible"].values, exp_read.data["deductible"].values
-        )
-        np.testing.assert_array_equal(
-            exp_df.data["cover"].values, exp_read.data["cover"].values
-        )
-        np.testing.assert_array_equal(
-            exp_df.data["region_id"].values, exp_read.data["region_id"].values
-        )
-        np.testing.assert_array_equal(
-            exp_df.data["category_id"].values, exp_read.data["category_id"].values
-        )
-        np.testing.assert_array_equal(
-            exp_df.data["impf_TC"].values, exp_read.data["impf_TC"].values
-        )
-        np.testing.assert_array_equal(
-            exp_df.data["centr_TC"].values, exp_read.data["centr_TC"].values
-        )
-        np.testing.assert_array_equal(
-            exp_df.data["impf_FL"].values, exp_read.data["impf_FL"].values
-        )
-        np.testing.assert_array_equal(
-            exp_df.data["centr_FL"].values, exp_read.data["centr_FL"].values
-        )
+            self.assertEqual(exp.ref_year, exp_read.ref_year)
+            self.assertEqual(exp.value_unit, exp_read.value_unit)
+            self.assertEqual(exp.description, exp_read.description)
+            np.testing.assert_array_equal(exp.latitude, exp_read.latitude)
+            np.testing.assert_array_equal(exp.longitude, exp_read.longitude)
+            np.testing.assert_array_equal(exp.value, exp_read.value)
+            np.testing.assert_array_equal(
+                exp.data["deductible"].values, exp_read.data["deductible"].values
+            )
+            np.testing.assert_array_equal(
+                exp.data["cover"].values, exp_read.data["cover"].values
+            )
+            np.testing.assert_array_equal(
+                exp.data["region_id"].values, exp_read.data["region_id"].values
+            )
+            np.testing.assert_array_equal(
+                exp.data["category_id"].values, exp_read.data["category_id"].values
+            )
+            np.testing.assert_array_equal(
+                exp.data["impf_TC"].values, exp_read.data["impf_TC"].values
+            )
+            np.testing.assert_array_equal(
+                exp.data["centr_TC"].values, exp_read.data["centr_TC"].values
+            )
+            np.testing.assert_array_equal(
+                exp.data["impf_FL"].values, exp_read.data["impf_FL"].values
+            )
+            np.testing.assert_array_equal(
+                exp.data["centr_FL"].values, exp_read.data["centr_FL"].values
+            )

-        self.assertTrue(
-            u_coord.equal_crs(exp_df.crs, exp_read.crs),
-            f"{exp_df.crs} and {exp_read.crs} are different",
-        )
-        self.assertTrue(u_coord.equal_crs(exp_df.gdf.crs, exp_read.gdf.crs))
+            self.assertTrue(
+                u_coord.equal_crs(exp.crs, exp_read.crs),
+                f"{exp.crs} and {exp_read.crs} are different",
+            )
+            self.assertTrue(u_coord.equal_crs(exp.gdf.crs, exp_read.gdf.crs))


 class TestAddSea(unittest.TestCase):
