|
18 | 18 | )
|
19 | 19 | from pyogrio._compat import (
|
20 | 20 | GDAL_GE_37,
|
| 21 | + GDAL_GE_311, |
21 | 22 | GDAL_GE_352,
|
22 | 23 | HAS_ARROW_WRITE_API,
|
23 | 24 | HAS_PYPROJ,
|
24 | 25 | PANDAS_GE_15,
|
| 26 | + PANDAS_GE_30, |
25 | 27 | SHAPELY_GE_21,
|
26 | 28 | )
|
27 | 29 | from pyogrio.errors import DataLayerError, DataSourceError, FeatureError, GeometryError
|
@@ -256,13 +258,20 @@ def test_read_layer(tmp_path, use_arrow):
|
256 | 258 |
|
257 | 259 | # create a multilayer GPKG
|
258 | 260 | expected1 = gp.GeoDataFrame(geometry=[Point(0, 0)], crs="EPSG:4326")
|
| 261 | + if use_arrow: |
| 262 | + # TODO: this needs to be fixed on the geopandas side (so that the |
| 263 | + # GeoDataFrame() constructor does this); with use_arrow=True we already |
| 264 | + # get a columns Index with string dtype |
| 265 | + expected1.columns = expected1.columns.astype("str") |
259 | 266 | write_dataframe(
|
260 | 267 | expected1,
|
261 | 268 | filename,
|
262 | 269 | layer="layer1",
|
263 | 270 | )
|
264 | 271 |
|
265 | 272 | expected2 = gp.GeoDataFrame(geometry=[Point(1, 1)], crs="EPSG:4326")
|
| 273 | + if use_arrow: |
| 274 | + expected2.columns = expected2.columns.astype("str") |
266 | 275 | write_dataframe(expected2, filename, layer="layer2", append=True)
|
267 | 276 |
|
268 | 277 | assert np.array_equal(
|
@@ -385,7 +394,7 @@ def test_read_null_values(tmp_path, use_arrow):
|
385 | 394 | df = read_dataframe(filename, use_arrow=use_arrow, read_geometry=False)
|
386 | 395 |
|
387 | 396 | # make sure that Null values are preserved
|
388 |
| - assert np.array_equal(df.col.values, expected.col.values) |
| 397 | + assert df["col"].isna().all() |
389 | 398 |
|
390 | 399 |
|
391 | 400 | def test_read_fid_as_index(naturalearth_lowres_all_ext, use_arrow):
|
@@ -699,6 +708,13 @@ def test_read_skip_features(naturalearth_lowres_all_ext, use_arrow, skip_feature
|
699 | 708 | # In .geojsonl the vertices are reordered, so normalize
|
700 | 709 | is_jsons = ext == ".geojsonl"
|
701 | 710 |
|
| 711 | + if skip_features == 200 and not use_arrow: |
| 712 | + # result is an empty dataframe, so no proper dtype inference happens |
| 713 | + # for the numpy object dtype arrays |
| 714 | + df[["continent", "name", "iso_a3"]] = df[ |
| 715 | + ["continent", "name", "iso_a3"] |
| 716 | + ].astype("str") |
| 717 | + |
702 | 718 | assert_geodataframe_equal(
|
703 | 719 | df,
|
704 | 720 | expected,
|
@@ -1180,6 +1196,10 @@ def test_write_empty_dataframe(tmp_path, ext, columns, dtype, use_arrow):
|
1180 | 1196 | # For older pandas versions, the index is created as Object dtype but read as
|
1181 | 1197 | # RangeIndex, so don't check the index dtype in that case.
|
1182 | 1198 | check_index_type = True if PANDAS_GE_20 else False
|
| 1199 | + # with pandas 3+ and reading through arrow, we preserve the string dtype |
| 1200 | + # (no proper dtype inference happens for the empty numpy object dtype arrays) |
| 1201 | + if use_arrow and dtype is object: |
| 1202 | + expected["col_object"] = expected["col_object"].astype("str") |
1183 | 1203 | assert_geodataframe_equal(df, expected, check_index_type=check_index_type)
|
1184 | 1204 |
|
1185 | 1205 |
|
@@ -1214,7 +1234,11 @@ def test_write_None_string_column(tmp_path, use_arrow):
|
1214 | 1234 | assert filename.exists()
|
1215 | 1235 |
|
1216 | 1236 | result_gdf = read_dataframe(filename, use_arrow=use_arrow)
|
1217 |
| - assert result_gdf.object_col.dtype == object |
| 1237 | + if PANDAS_GE_30 and use_arrow: |
| 1238 | + assert result_gdf.object_col.dtype == "str" |
| 1239 | + gdf["object_col"] = gdf["object_col"].astype("str") |
| 1240 | + else: |
| 1241 | + assert result_gdf.object_col.dtype == object |
1218 | 1242 | assert_geodataframe_equal(result_gdf, gdf)
|
1219 | 1243 |
|
1220 | 1244 |
|
@@ -1658,11 +1682,13 @@ def test_write_read_mixed_column_values(tmp_path):
|
1658 | 1682 | write_dataframe(test_gdf, output_path)
|
1659 | 1683 | output_gdf = read_dataframe(output_path)
|
1660 | 1684 | assert len(test_gdf) == len(output_gdf)
|
1661 |
| - for idx, value in enumerate(mixed_values): |
1662 |
| - if value in (None, np.nan): |
1663 |
| - assert output_gdf["mixed"][idx] is None |
1664 |
| - else: |
1665 |
| - assert output_gdf["mixed"][idx] == str(value) |
| 1685 | + # mixed values as object dtype are currently written as strings |
| 1686 | + # (but preserving nulls) |
| 1687 | + expected = pd.Series( |
| 1688 | + [str(value) if value not in (None, np.nan) else None for value in mixed_values], |
| 1689 | + name="mixed", |
| 1690 | + ) |
| 1691 | + assert_series_equal(output_gdf["mixed"], expected) |
1666 | 1692 |
|
1667 | 1693 |
|
1668 | 1694 | @requires_arrow_write_api
|
@@ -1695,8 +1721,8 @@ def test_write_read_null(tmp_path, use_arrow):
|
1695 | 1721 | assert pd.isna(result_gdf["float64"][1])
|
1696 | 1722 | assert pd.isna(result_gdf["float64"][2])
|
1697 | 1723 | assert result_gdf["object_str"][0] == "test"
|
1698 |
| - assert result_gdf["object_str"][1] is None |
1699 |
| - assert result_gdf["object_str"][2] is None |
| 1724 | + assert pd.isna(result_gdf["object_str"][1]) |
| 1725 | + assert pd.isna(result_gdf["object_str"][2]) |
1700 | 1726 |
|
1701 | 1727 |
|
1702 | 1728 | @pytest.mark.requires_arrow_write_api
|
@@ -1927,6 +1953,10 @@ def test_read_dataset_kwargs(nested_geojson_file, use_arrow):
|
1927 | 1953 | geometry=[shapely.Point(0, 0)],
|
1928 | 1954 | crs="EPSG:4326",
|
1929 | 1955 | )
|
| 1956 | + if GDAL_GE_311 and use_arrow: |
| 1957 | + # GDAL 3.11 started to use the JSON extension type, which is not yet handled |
| 1958 | + # correctly in the arrow->pandas conversion (using object instead of str dtype) |
| 1959 | + expected["intermediate_level"] = expected["intermediate_level"].astype(object) |
1930 | 1960 |
|
1931 | 1961 | assert_geodataframe_equal(df, expected)
|
1932 | 1962 |
|
@@ -1972,7 +2002,7 @@ def test_write_nullable_dtypes(tmp_path, use_arrow):
|
1972 | 2002 | expected["col2"] = expected["col2"].astype("float64")
|
1973 | 2003 | expected["col3"] = expected["col3"].astype("float32")
|
1974 | 2004 | expected["col4"] = expected["col4"].astype("float64")
|
1975 |
| - expected["col5"] = expected["col5"].astype(object) |
| 2005 | + expected["col5"] = expected["col5"].astype("str") |
1976 | 2006 | expected.loc[1, "col5"] = None # pandas converts to pd.NA on line above
|
1977 | 2007 | assert_geodataframe_equal(output_gdf, expected)
|
1978 | 2008 |
|
|
0 commit comments