Skip to content

Commit e5ac209

Browse files
COMPAT: prepare for pandas 3.0 string dtype (#493)
1 parent 98bb7cd commit e5ac209

File tree

5 files changed

+79
-13
lines changed

5 files changed

+79
-13
lines changed

.github/workflows/tests-conda.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ jobs:
4444
- os: "ubuntu-latest"
4545
python: "3.11"
4646
env: "nightly-deps"
47+
pandas_future_infer_string: "1"
4748

4849
steps:
4950
- name: Checkout repo
@@ -68,5 +69,7 @@ jobs:
6869
run: pip install -e .
6970

7071
- name: Test
72+
env:
73+
PANDAS_FUTURE_INFER_STRING: ${{ matrix.pandas_future_infer_string || '0' }}
7174
run: |
7275
pytest -v --color=yes -r s pyogrio/tests

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
- Capture all errors logged by gdal when opening a file fails (#495).
88
- Add support to read and write ".gpkg.zip" (GDAL >= 3.7), ".shp.zip", and ".shz"
99
files (#527).
10+
- Compatibility with the string dtype in the upcoming pandas 3.0 release (#493).
1011

1112
### Bug fixes
1213

pyogrio/_compat.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,16 +33,21 @@
3333
HAS_ARROW_WRITE_API = __gdal_version__ >= (3, 8, 0)
3434
HAS_PYARROW = pyarrow is not None
3535
HAS_PYPROJ = pyproj is not None
36+
PYARROW_GE_19 = pyarrow is not None and Version(pyarrow.__version__) >= Version(
37+
"19.0.0"
38+
)
3639

3740
HAS_GEOPANDAS = geopandas is not None
3841

3942
PANDAS_GE_15 = pandas is not None and Version(pandas.__version__) >= Version("1.5.0")
4043
PANDAS_GE_20 = pandas is not None and Version(pandas.__version__) >= Version("2.0.0")
4144
PANDAS_GE_22 = pandas is not None and Version(pandas.__version__) >= Version("2.2.0")
45+
PANDAS_GE_30 = pandas is not None and Version(pandas.__version__) >= Version("3.0.0dev")
4246

4347
GDAL_GE_352 = __gdal_version__ >= (3, 5, 2)
4448
GDAL_GE_37 = __gdal_version__ >= (3, 7, 0)
4549
GDAL_GE_38 = __gdal_version__ >= (3, 8, 0)
50+
GDAL_GE_311 = __gdal_version__ >= (3, 11, 0)
4651

4752
HAS_GDAL_GEOS = __gdal_geos_version__ is not None
4853

pyogrio/geopandas.py

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,14 @@
55

66
import numpy as np
77

8-
from pyogrio._compat import HAS_GEOPANDAS, PANDAS_GE_15, PANDAS_GE_20, PANDAS_GE_22
8+
from pyogrio._compat import (
9+
HAS_GEOPANDAS,
10+
PANDAS_GE_15,
11+
PANDAS_GE_20,
12+
PANDAS_GE_22,
13+
PANDAS_GE_30,
14+
PYARROW_GE_19,
15+
)
916
from pyogrio.errors import DataSourceError
1017
from pyogrio.raw import (
1118
DRIVERS_NO_MIXED_DIMENSIONS,
@@ -52,13 +59,13 @@ def _try_parse_datetime(ser):
5259
except Exception:
5360
res = ser
5461
# if object dtype, try parse as utc instead
55-
if res.dtype == "object":
62+
if res.dtype in ("object", "string"):
5663
try:
5764
res = pd.to_datetime(ser, utc=True, **datetime_kwargs)
5865
except Exception:
5966
pass
6067

61-
if res.dtype != "object":
68+
if res.dtype.kind == "M": # any datetime64
6269
# GDAL only supports ms precision, convert outputs to match.
6370
# Pandas 2.0 supports datetime[ms] directly, prior versions only support [ns],
6471
# Instead, round the values to [ms] precision.
@@ -285,11 +292,31 @@ def read_dataframe(
285292
)
286293

287294
if use_arrow:
295+
import pyarrow as pa
296+
288297
meta, table = result
289298

290299
# split_blocks and self_destruct decrease memory usage, but have as side effect
291300
# that accessing table afterwards causes crash, so del table to avoid.
292301
kwargs = {"self_destruct": True}
302+
if PANDAS_GE_30:
303+
            # starting with pyarrow 19.0, pyarrow correctly handles this itself,
304+
# so only use types_mapper as workaround for older versions
305+
if not PYARROW_GE_19:
306+
kwargs["types_mapper"] = {
307+
pa.string(): pd.StringDtype(na_value=np.nan),
308+
pa.large_string(): pd.StringDtype(na_value=np.nan),
309+
pa.json_(): pd.StringDtype(na_value=np.nan),
310+
}.get
311+
# TODO enable the below block when upstream issue to accept extension types
312+
# is fixed
313+
# else:
314+
# # for newer pyarrow, still include mapping for json
315+
# # GDAL 3.11 started to emit this extension type, but pyarrow does not
316+
# # yet support it properly in the conversion to pandas
317+
# kwargs["types_mapper"] = {
318+
# pa.json_(): pd.StringDtype(na_value=np.nan),
319+
# }.get
293320
if arrow_to_pandas_kwargs is not None:
294321
kwargs.update(arrow_to_pandas_kwargs)
295322
df = table.to_pandas(**kwargs)

pyogrio/tests/test_geopandas_io.py

Lines changed: 40 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@
1818
)
1919
from pyogrio._compat import (
2020
GDAL_GE_37,
21+
GDAL_GE_311,
2122
GDAL_GE_352,
2223
HAS_ARROW_WRITE_API,
2324
HAS_PYPROJ,
2425
PANDAS_GE_15,
26+
PANDAS_GE_30,
2527
SHAPELY_GE_21,
2628
)
2729
from pyogrio.errors import DataLayerError, DataSourceError, FeatureError, GeometryError
@@ -256,13 +258,20 @@ def test_read_layer(tmp_path, use_arrow):
256258

257259
# create a multilayer GPKG
258260
expected1 = gp.GeoDataFrame(geometry=[Point(0, 0)], crs="EPSG:4326")
261+
if use_arrow:
262+
# TODO this needs to be fixed on the geopandas side (to ensure the
263+
    # GeoDataFrame() constructor does this); when use_arrow is enabled we already
264+
# get columns Index with string dtype
265+
expected1.columns = expected1.columns.astype("str")
259266
write_dataframe(
260267
expected1,
261268
filename,
262269
layer="layer1",
263270
)
264271

265272
expected2 = gp.GeoDataFrame(geometry=[Point(1, 1)], crs="EPSG:4326")
273+
if use_arrow:
274+
expected2.columns = expected2.columns.astype("str")
266275
write_dataframe(expected2, filename, layer="layer2", append=True)
267276

268277
assert np.array_equal(
@@ -385,7 +394,7 @@ def test_read_null_values(tmp_path, use_arrow):
385394
df = read_dataframe(filename, use_arrow=use_arrow, read_geometry=False)
386395

387396
# make sure that Null values are preserved
388-
assert np.array_equal(df.col.values, expected.col.values)
397+
assert df["col"].isna().all()
389398

390399

391400
def test_read_fid_as_index(naturalearth_lowres_all_ext, use_arrow):
@@ -699,6 +708,13 @@ def test_read_skip_features(naturalearth_lowres_all_ext, use_arrow, skip_feature
699708
# In .geojsonl the vertices are reordered, so normalize
700709
is_jsons = ext == ".geojsonl"
701710

711+
if skip_features == 200 and not use_arrow:
712+
# result is an empty dataframe, so no proper dtype inference happens
713+
# for the numpy object dtype arrays
714+
df[["continent", "name", "iso_a3"]] = df[
715+
["continent", "name", "iso_a3"]
716+
].astype("str")
717+
702718
assert_geodataframe_equal(
703719
df,
704720
expected,
@@ -1180,6 +1196,10 @@ def test_write_empty_dataframe(tmp_path, ext, columns, dtype, use_arrow):
11801196
# For older pandas versions, the index is created as Object dtype but read as
11811197
# RangeIndex, so don't check the index dtype in that case.
11821198
check_index_type = True if PANDAS_GE_20 else False
1199+
# with pandas 3+ and reading through arrow, we preserve the string dtype
1200+
# (no proper dtype inference happens for the empty numpy object dtype arrays)
1201+
if use_arrow and dtype is object:
1202+
expected["col_object"] = expected["col_object"].astype("str")
11831203
assert_geodataframe_equal(df, expected, check_index_type=check_index_type)
11841204

11851205

@@ -1214,7 +1234,11 @@ def test_write_None_string_column(tmp_path, use_arrow):
12141234
assert filename.exists()
12151235

12161236
result_gdf = read_dataframe(filename, use_arrow=use_arrow)
1217-
assert result_gdf.object_col.dtype == object
1237+
if PANDAS_GE_30 and use_arrow:
1238+
assert result_gdf.object_col.dtype == "str"
1239+
gdf["object_col"] = gdf["object_col"].astype("str")
1240+
else:
1241+
assert result_gdf.object_col.dtype == object
12181242
assert_geodataframe_equal(result_gdf, gdf)
12191243

12201244

@@ -1658,11 +1682,13 @@ def test_write_read_mixed_column_values(tmp_path):
16581682
write_dataframe(test_gdf, output_path)
16591683
output_gdf = read_dataframe(output_path)
16601684
assert len(test_gdf) == len(output_gdf)
1661-
for idx, value in enumerate(mixed_values):
1662-
if value in (None, np.nan):
1663-
assert output_gdf["mixed"][idx] is None
1664-
else:
1665-
assert output_gdf["mixed"][idx] == str(value)
1685+
# mixed values as object dtype are currently written as strings
1686+
# (but preserving nulls)
1687+
expected = pd.Series(
1688+
[str(value) if value not in (None, np.nan) else None for value in mixed_values],
1689+
name="mixed",
1690+
)
1691+
assert_series_equal(output_gdf["mixed"], expected)
16661692

16671693

16681694
@requires_arrow_write_api
@@ -1695,8 +1721,8 @@ def test_write_read_null(tmp_path, use_arrow):
16951721
assert pd.isna(result_gdf["float64"][1])
16961722
assert pd.isna(result_gdf["float64"][2])
16971723
assert result_gdf["object_str"][0] == "test"
1698-
assert result_gdf["object_str"][1] is None
1699-
assert result_gdf["object_str"][2] is None
1724+
assert pd.isna(result_gdf["object_str"][1])
1725+
assert pd.isna(result_gdf["object_str"][2])
17001726

17011727

17021728
@pytest.mark.requires_arrow_write_api
@@ -1927,6 +1953,10 @@ def test_read_dataset_kwargs(nested_geojson_file, use_arrow):
19271953
geometry=[shapely.Point(0, 0)],
19281954
crs="EPSG:4326",
19291955
)
1956+
if GDAL_GE_311 and use_arrow:
1957+
# GDAL 3.11 started to use json extension type, which is not yet handled
1958+
        # correctly in the arrow->pandas conversion (yielding object instead of str dtype)
1959+
expected["intermediate_level"] = expected["intermediate_level"].astype(object)
19301960

19311961
assert_geodataframe_equal(df, expected)
19321962

@@ -1972,7 +2002,7 @@ def test_write_nullable_dtypes(tmp_path, use_arrow):
19722002
expected["col2"] = expected["col2"].astype("float64")
19732003
expected["col3"] = expected["col3"].astype("float32")
19742004
expected["col4"] = expected["col4"].astype("float64")
1975-
expected["col5"] = expected["col5"].astype(object)
2005+
expected["col5"] = expected["col5"].astype("str")
19762006
expected.loc[1, "col5"] = None # pandas converts to pd.NA on line above
19772007
assert_geodataframe_equal(output_gdf, expected)
19782008

0 commit comments

Comments
 (0)