Skip to content

Commit bcb1634

Browse files
BUG: fix error in `read_dataframe` when `use_arrow=True` and a `columns` filter is used (#612)
Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent d5983c5 commit bcb1634

File tree

7 files changed

+112
-32
lines changed

7 files changed

+112
-32
lines changed

CHANGES.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
### Bug fixes
66

77
- Fix regression in reading date columns (#616)
8+
- Fix error in `read_dataframe` when `use_arrow=True` and `columns` is used to filter
9+
out columns of some specific types (#611)
810

911
## 0.12.0 (2025-11-26)
1012

pyogrio/_io.pyx

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1449,7 +1449,7 @@ def ogr_read(
14491449

14501450
# Fields are matched exactly by name, duplicates are dropped.
14511451
# Find index of each field into fields
1452-
idx = np.intersect1d(fields[:, 2], columns, return_indices=True)[1]
1452+
idx = np.sort(np.intersect1d(fields[:, 2], columns, return_indices=True)[1])
14531453
fields = fields[idx, :]
14541454

14551455
if not read_geometry and bbox is None and mask is None:
@@ -1722,6 +1722,11 @@ def ogr_open_arrow(
17221722
if columns is not None:
17231723
# Fields are matched exactly by name, duplicates are dropped.
17241724
ignored_fields = list(set(fields[:, 2]) - set(columns))
1725+
1726+
# Find index of each field in columns, and only keep those
1727+
idx = np.sort(np.intersect1d(fields[:, 2], columns, return_indices=True)[1])
1728+
fields = fields[idx, :]
1729+
17251730
if not read_geometry:
17261731
ignored_fields.append("OGR_GEOMETRY")
17271732

@@ -1731,9 +1736,8 @@ def ogr_open_arrow(
17311736

17321737
driver = get_driver(ogr_dataset)
17331738
if driver in {"FlatGeobuf", "GPKG"}:
1734-
ignored = set(ignored_fields)
1735-
for f in fields:
1736-
if f[2] not in ignored and f[3] == "bool":
1739+
for field in fields:
1740+
if field[3] == "bool": # numpy type is bool
17371741
raise RuntimeError(
17381742
"GDAL < 3.8.3 does not correctly read boolean data values "
17391743
"using the Arrow API. Do not use read_arrow() / "

pyogrio/geopandas.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -439,17 +439,17 @@ def read_dataframe(
439439
del table
440440

441441
# convert datetime columns that were read as string to datetime
442-
for dtype, column in zip(meta["dtypes"], meta["fields"]):
443-
# With arrow, date columns are returned as datetime.date objects.
442+
for dtype, column in zip(meta["dtypes"], meta["fields"], strict=True):
444443
if (
445444
dtype is not None
446445
and dtype.startswith("datetime")
446+
# With arrow, date columns are returned as datetime.date objects
447447
and dtype != "datetime64[D]"
448448
):
449449
df[column] = _try_parse_datetime(
450450
df[column], datetime_as_string, mixed_offsets_as_utc
451451
)
452-
for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"]):
452+
for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"], strict=True):
453453
if ogr_subtype == "OFSTJSON":
454454
# When reading .parquet files with arrow, JSON fields are already
455455
# parsed, so only parse if strings.
@@ -502,10 +502,10 @@ def read_dataframe(
502502
else:
503503
index = None
504504
df = pd.DataFrame(data, columns=columns, index=index)
505-
for dtype, c in zip(meta["dtypes"], df.columns):
505+
for dtype, c in zip(meta["dtypes"], meta["fields"], strict=True):
506506
if dtype.startswith("datetime"):
507507
df[c] = _try_parse_datetime(df[c], datetime_as_string, mixed_offsets_as_utc)
508-
for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"]):
508+
for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"], strict=True):
509509
if ogr_subtype == "OFSTJSON":
510510
dtype = pd.api.types.infer_dtype(df[c])
511511
if dtype == "string":

pyogrio/raw.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ def read_arrow(
246246
-------
247247
(dict, pyarrow.Table)
248248
249-
Returns a tuple of meta information about the data source in a dict,
249+
Returns a tuple of meta information about the returned data in a dict,
250250
and a pyarrow Table with data.
251251
252252
Meta is: {

pyogrio/tests/test_arrow.py

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,17 @@ def test_read_arrow(naturalearth_lowres_all_ext):
5656
assert_geodataframe_equal(result, expected, check_less_precise=check_less_precise)
5757

5858

59+
@pytest.mark.parametrize("columns", [None, [], ["continent"], ["iso_a3", "pop_est"]])
60+
def test_read_arrow_columns(naturalearth_lowres, columns):
61+
meta, _table = read_arrow(naturalearth_lowres, columns=columns)
62+
assert meta["fields"] is not None
63+
if columns is None:
64+
expected_fields = ["pop_est", "continent", "name", "iso_a3", "gdp_md_est"]
65+
else:
66+
expected_fields = columns
67+
assert sorted(meta["fields"]) == sorted(expected_fields)
68+
69+
5970
def test_read_arrow_unspecified_layer_warning(data_dir):
6071
"""Reading a multi-layer file without specifying a layer gives a warning."""
6172
with pytest.warns(UserWarning, match="More than one layer found "):
@@ -107,7 +118,7 @@ def test_read_arrow_skip_features_max_features(
107118
assert len(table) == expected
108119

109120

110-
def test_read_arrow_fid(naturalearth_lowres_all_ext):
121+
def test_read_df_arrow_fid(naturalearth_lowres_all_ext):
111122
kwargs = {"use_arrow": True, "where": "fid >= 2 AND fid <= 3"}
112123

113124
df = read_dataframe(naturalearth_lowres_all_ext, fid_as_index=False, **kwargs)
@@ -117,12 +128,12 @@ def test_read_arrow_fid(naturalearth_lowres_all_ext):
117128
assert_index_equal(df.index, pd.Index([2, 3], name="fid"))
118129

119130

120-
def test_read_arrow_columns(naturalearth_lowres):
131+
def test_read_df_arrow_columns(naturalearth_lowres):
121132
result = read_dataframe(naturalearth_lowres, use_arrow=True, columns=["continent"])
122133
assert result.columns.tolist() == ["continent", "geometry"]
123134

124135

125-
def test_read_arrow_ignore_geometry(naturalearth_lowres):
136+
def test_read_df_arrow_ignore_geometry(naturalearth_lowres):
126137
result = read_dataframe(naturalearth_lowres, use_arrow=True, read_geometry=False)
127138
assert type(result) is pd.DataFrame
128139

@@ -132,7 +143,7 @@ def test_read_arrow_ignore_geometry(naturalearth_lowres):
132143
assert_frame_equal(result, expected)
133144

134145

135-
def test_read_arrow_to_pandas_kwargs(no_geometry_file):
146+
def test_read_df_arrow_to_pandas_kwargs(no_geometry_file):
136147
# with arrow, list types are supported
137148
arrow_to_pandas_kwargs = {"strings_to_categorical": True}
138149
df = read_dataframe(
@@ -216,6 +227,30 @@ def test_open_arrow_batch_size(naturalearth_lowres):
216227
assert len(tables[0]) == batch_size, "First table should match the batch size"
217228

218229

230+
@pytest.mark.parametrize(
231+
"descr, columns, exp_columns",
232+
[
233+
("all", None, ["pop_est", "continent", "name", "iso_a3", "gdp_md_est"]),
234+
("case_sensitive", ["NAME"], []),
235+
("repeats_dropped", ["continent", "continent", "name"], ["continent", "name"]),
236+
("keep_original_order", ["continent", "pop_est"], ["pop_est", "continent"]),
237+
],
238+
)
239+
def test_open_arrow_columns(naturalearth_lowres, descr, columns, exp_columns):
240+
with open_arrow(naturalearth_lowres, columns=columns) as (meta, reader):
241+
assert isinstance(meta, dict)
242+
assert isinstance(reader, pyogrio._io._ArrowStream)
243+
244+
result = pyarrow.table(reader)
245+
246+
# Check metadata
247+
assert np.array_equal(meta["fields"], exp_columns), f"Failed for {descr}"
248+
249+
# Check columns in table
250+
exp_columns_with_geom = exp_columns + ["wkb_geometry"]
251+
assert result.column_names == exp_columns_with_geom, f"Failed for {descr}"
252+
253+
219254
@pytest.mark.skipif(
220255
__gdal_version__ >= (3, 8, 0),
221256
reason="skip_features supported by Arrow stream API for GDAL>=3.8.0",

pyogrio/tests/test_geopandas_io.py

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -355,13 +355,21 @@ def test_read_layer_invalid(naturalearth_lowres_all_ext, use_arrow):
355355
read_dataframe(naturalearth_lowres_all_ext, layer="wrong", use_arrow=use_arrow)
356356

357357

358-
def test_read_datetime(datetime_file, use_arrow):
359-
df = read_dataframe(datetime_file, use_arrow=use_arrow)
360-
if PANDAS_GE_20:
361-
# starting with pandas 2.0, it preserves the passed datetime resolution
362-
assert df.col.dtype.name == "datetime64[ms]"
358+
@pytest.mark.parametrize("columns", [None, [], ["col"]])
359+
def test_read_datetime_columns(datetime_file, columns, use_arrow):
360+
df = read_dataframe(datetime_file, columns=columns, use_arrow=use_arrow)
361+
362+
# Check result
363+
if columns is None or "col" in columns:
364+
assert "col" in df.columns
365+
assert is_datetime64_dtype(df.col.dtype)
366+
if PANDAS_GE_20:
367+
# starting with pandas 2.0, it preserves the passed datetime resolution
368+
assert df.col.dtype.name == "datetime64[ms]"
369+
else:
370+
assert df.col.dtype.name == "datetime64[ns]"
363371
else:
364-
assert df.col.dtype.name == "datetime64[ns]"
372+
assert len(df.columns) == 1 # only geometry
365373

366374

367375
def test_read_list_types(list_field_values_files, use_arrow):
@@ -484,6 +492,36 @@ def test_read_list_types(list_field_values_files, use_arrow):
484492
assert result["list_string_with_null"][4] == [""]
485493

486494

495+
@pytest.mark.parametrize("columns", [None, [], ["list_int", "list_string"]])
496+
def test_read_list_types_columns(request, list_field_values_files, use_arrow, columns):
497+
"""Test reading a geojson file containing fields with lists."""
498+
if list_field_values_files.suffix == ".parquet" and not GDAL_HAS_PARQUET_DRIVER:
499+
pytest.skip(
500+
"Skipping test for parquet as the GDAL Parquet driver is not available"
501+
)
502+
if (
503+
use_arrow
504+
and columns
505+
and len(columns) == 2
506+
and list_field_values_files.suffix == ".parquet"
507+
):
508+
# This gives following error, not sure why. Opened an issue for followup:
509+
# https://github.com/geopandas/pyogrio/issues/XXX
510+
error_msg = (
511+
"This fails with 'pyarrow.lib.ArrowInvalid: ArrowArray struct has "
512+
"1 children, expected 0 for type extension<geoarrow.wkb>'"
513+
)
514+
request.node.add_marker(pytest.mark.xfail(reason=error_msg))
515+
516+
result = read_dataframe(
517+
list_field_values_files, use_arrow=use_arrow, columns=columns
518+
)
519+
520+
# Check result
521+
exp_columns = 7 if columns is None else len(columns) + 1 # +1 for geometry
522+
assert len(result.columns) == exp_columns
523+
524+
487525
@pytest.mark.requires_arrow_write_api
488526
@pytest.mark.skipif(
489527
not GDAL_HAS_PARQUET_DRIVER, reason="Parquet driver is not available"
@@ -3294,7 +3332,7 @@ def test_write_geojson_rfc7946_coordinates(tmp_path, use_arrow):
32943332
assert np.array_equal(gdf_in_appended.geometry.values, points + points_append)
32953333

32963334

3297-
@pytest.mark.requires_arrow_api
3335+
@pytest.mark.requires_arrow_write_api
32983336
@pytest.mark.skipif(
32993337
not GDAL_HAS_PARQUET_DRIVER, reason="Parquet driver is not available"
33003338
)

pyogrio/tests/test_raw_io.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -155,19 +155,20 @@ def test_read_no_geometry_no_columns_no_fids(naturalearth_lowres):
155155
)
156156

157157

158-
def test_read_columns(naturalearth_lowres):
159-
columns = ["NAME", "NAME_LONG"]
160-
meta, _, geometry, fields = read(
161-
naturalearth_lowres, columns=columns, read_geometry=False
162-
)
163-
array_equal(meta["fields"], columns)
164-
165-
# Repeats should be dropped
166-
columns = ["NAME", "NAME_LONG", "NAME"]
167-
meta, _, geometry, fields = read(
158+
@pytest.mark.parametrize(
159+
"descr, columns, exp_columns",
160+
[
161+
("all", None, ["pop_est", "continent", "name", "iso_a3", "gdp_md_est"]),
162+
("case_sensitive", ["NAME"], []),
163+
("repeats_dropped", ["continent", "continent", "name"], ["continent", "name"]),
164+
("keep_original_order", ["continent", "pop_est"], ["pop_est", "continent"]),
165+
],
166+
)
167+
def test_read_columns(naturalearth_lowres, descr, columns, exp_columns):
168+
meta, _fids, _geometry, _fields = read(
168169
naturalearth_lowres, columns=columns, read_geometry=False
169170
)
170-
array_equal(meta["fields"], columns[:2])
171+
assert array_equal(meta["fields"], exp_columns), f"Failed for {descr}"
171172

172173

173174
@pytest.mark.parametrize("skip_features", [10, 200])

0 commit comments

Comments
 (0)