Skip to content

Commit bcb1634

Browse files
BUG: fix error in `read_dataframe` when `use_arrow=True` and a `columns` filter is used (#612)
Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent d5983c5 commit bcb1634

File tree

7 files changed

+112
-32
lines changed

7 files changed

+112
-32
lines changed

CHANGES.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
### Bug fixes
66

77
- Fix regression in reading date columns (#616)
8+
- Fix error in `read_dataframe` when `use_arrow=True` and `columns` is used to filter
9+
out columns of some specific types (#611)
810

911
## 0.12.0 (2025-11-26)
1012

pyogrio/_io.pyx

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1449,7 +1449,7 @@ def ogr_read(
14491449

14501450
# Fields are matched exactly by name, duplicates are dropped.
14511451
# Find index of each field into fields
1452-
idx = np.intersect1d(fields[:, 2], columns, return_indices=True)[1]
1452+
idx = np.sort(np.intersect1d(fields[:, 2], columns, return_indices=True)[1])
14531453
fields = fields[idx, :]
14541454

14551455
if not read_geometry and bbox is None and mask is None:
@@ -1722,6 +1722,11 @@ def ogr_open_arrow(
17221722
if columns is not None:
17231723
# Fields are matched exactly by name, duplicates are dropped.
17241724
ignored_fields = list(set(fields[:, 2]) - set(columns))
1725+
1726+
# Find index of each field in columns, and only keep those
1727+
idx = np.sort(np.intersect1d(fields[:, 2], columns, return_indices=True)[1])
1728+
fields = fields[idx, :]
1729+
17251730
if not read_geometry:
17261731
ignored_fields.append("OGR_GEOMETRY")
17271732

@@ -1731,9 +1736,8 @@ def ogr_open_arrow(
17311736

17321737
driver = get_driver(ogr_dataset)
17331738
if driver in {"FlatGeobuf", "GPKG"}:
1734-
ignored = set(ignored_fields)
1735-
for f in fields:
1736-
if f[2] not in ignored and f[3] == "bool":
1739+
for field in fields:
1740+
if field[3] == "bool": # numpy type is bool
17371741
raise RuntimeError(
17381742
"GDAL < 3.8.3 does not correctly read boolean data values "
17391743
"using the Arrow API. Do not use read_arrow() / "

pyogrio/geopandas.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -439,17 +439,17 @@ def read_dataframe(
439439
del table
440440

441441
# convert datetime columns that were read as string to datetime
442-
for dtype, column in zip(meta["dtypes"], meta["fields"]):
443-
# With arrow, date columns are returned as datetime.date objects.
442+
for dtype, column in zip(meta["dtypes"], meta["fields"], strict=True):
444443
if (
445444
dtype is not None
446445
and dtype.startswith("datetime")
446+
# With arrow, date columns are returned as datetime.date objects
447447
and dtype != "datetime64[D]"
448448
):
449449
df[column] = _try_parse_datetime(
450450
df[column], datetime_as_string, mixed_offsets_as_utc
451451
)
452-
for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"]):
452+
for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"], strict=True):
453453
if ogr_subtype == "OFSTJSON":
454454
# When reading .parquet files with arrow, JSON fields are already
455455
# parsed, so only parse if strings.
@@ -502,10 +502,10 @@ def read_dataframe(
502502
else:
503503
index = None
504504
df = pd.DataFrame(data, columns=columns, index=index)
505-
for dtype, c in zip(meta["dtypes"], df.columns):
505+
for dtype, c in zip(meta["dtypes"], meta["fields"], strict=True):
506506
if dtype.startswith("datetime"):
507507
df[c] = _try_parse_datetime(df[c], datetime_as_string, mixed_offsets_as_utc)
508-
for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"]):
508+
for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"], strict=True):
509509
if ogr_subtype == "OFSTJSON":
510510
dtype = pd.api.types.infer_dtype(df[c])
511511
if dtype == "string":

pyogrio/raw.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ def read_arrow(
246246
-------
247247
(dict, pyarrow.Table)
248248
249-
Returns a tuple of meta information about the data source in a dict,
249+
Returns a tuple of meta information about the returned data in a dict,
250250
and a pyarrow Table with data.
251251
252252
Meta is: {

pyogrio/tests/test_arrow.py

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,17 @@ def test_read_arrow(naturalearth_lowres_all_ext):
5656
assert_geodataframe_equal(result, expected, check_less_precise=check_less_precise)
5757

5858

59+
@pytest.mark.parametrize("columns", [None, [], ["continent"], ["iso_a3", "pop_est"]])
60+
def test_read_arrow_columns(naturalearth_lowres, columns):
61+
meta, _table = read_arrow(naturalearth_lowres, columns=columns)
62+
assert meta["fields"] is not None
63+
if columns is None:
64+
expected_fields = ["pop_est", "continent", "name", "iso_a3", "gdp_md_est"]
65+
else:
66+
expected_fields = columns
67+
assert sorted(meta["fields"]) == sorted(expected_fields)
68+
69+
5970
def test_read_arrow_unspecified_layer_warning(data_dir):
6071
"""Reading a multi-layer file without specifying a layer gives a warning."""
6172
with pytest.warns(UserWarning, match="More than one layer found "):
@@ -107,7 +118,7 @@ def test_read_arrow_skip_features_max_features(
107118
assert len(table) == expected
108119

109120

110-
def test_read_arrow_fid(naturalearth_lowres_all_ext):
121+
def test_read_df_arrow_fid(naturalearth_lowres_all_ext):
111122
kwargs = {"use_arrow": True, "where": "fid >= 2 AND fid <= 3"}
112123

113124
df = read_dataframe(naturalearth_lowres_all_ext, fid_as_index=False, **kwargs)
@@ -117,12 +128,12 @@ def test_read_arrow_fid(naturalearth_lowres_all_ext):
117128
assert_index_equal(df.index, pd.Index([2, 3], name="fid"))
118129

119130

120-
def test_read_arrow_columns(naturalearth_lowres):
131+
def test_read_df_arrow_columns(naturalearth_lowres):
121132
result = read_dataframe(naturalearth_lowres, use_arrow=True, columns=["continent"])
122133
assert result.columns.tolist() == ["continent", "geometry"]
123134

124135

125-
def test_read_arrow_ignore_geometry(naturalearth_lowres):
136+
def test_read_df_arrow_ignore_geometry(naturalearth_lowres):
126137
result = read_dataframe(naturalearth_lowres, use_arrow=True, read_geometry=False)
127138
assert type(result) is pd.DataFrame
128139

@@ -132,7 +143,7 @@ def test_read_arrow_ignore_geometry(naturalearth_lowres):
132143
assert_frame_equal(result, expected)
133144

134145

135-
def test_read_arrow_to_pandas_kwargs(no_geometry_file):
146+
def test_read_df_arrow_to_pandas_kwargs(no_geometry_file):
136147
# with arrow, list types are supported
137148
arrow_to_pandas_kwargs = {"strings_to_categorical": True}
138149
df = read_dataframe(
@@ -216,6 +227,30 @@ def test_open_arrow_batch_size(naturalearth_lowres):
216227
assert len(tables[0]) == batch_size, "First table should match the batch size"
217228

218229

230+
@pytest.mark.parametrize(
231+
"descr, columns, exp_columns",
232+
[
233+
("all", None, ["pop_est", "continent", "name", "iso_a3", "gdp_md_est"]),
234+
("case_sensitive", ["NAME"], []),
235+
("repeats_dropped", ["continent", "continent", "name"], ["continent", "name"]),
236+
("keep_original_order", ["continent", "pop_est"], ["pop_est", "continent"]),
237+
],
238+
)
239+
def test_open_arrow_columns(naturalearth_lowres, descr, columns, exp_columns):
240+
with open_arrow(naturalearth_lowres, columns=columns) as (meta, reader):
241+
assert isinstance(meta, dict)
242+
assert isinstance(reader, pyogrio._io._ArrowStream)
243+
244+
result = pyarrow.table(reader)
245+
246+
# Check metadata
247+
assert np.array_equal(meta["fields"], exp_columns), f"Failed for {descr}"
248+
249+
# Check columns in table
250+
exp_columns_with_geom = exp_columns + ["wkb_geometry"]
251+
assert result.column_names == exp_columns_with_geom, f"Failed for {descr}"
252+
253+
219254
@pytest.mark.skipif(
220255
__gdal_version__ >= (3, 8, 0),
221256
reason="skip_features supported by Arrow stream API for GDAL>=3.8.0",

pyogrio/tests/test_geopandas_io.py

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -355,13 +355,21 @@ def test_read_layer_invalid(naturalearth_lowres_all_ext, use_arrow):
355355
read_dataframe(naturalearth_lowres_all_ext, layer="wrong", use_arrow=use_arrow)
356356

357357

358-
def test_read_datetime(datetime_file, use_arrow):
359-
df = read_dataframe(datetime_file, use_arrow=use_arrow)
360-
if PANDAS_GE_20:
361-
# starting with pandas 2.0, it preserves the passed datetime resolution
362-
assert df.col.dtype.name == "datetime64[ms]"
358+
@pytest.mark.parametrize("columns", [None, [], ["col"]])
359+
def test_read_datetime_columns(datetime_file, columns, use_arrow):
360+
df = read_dataframe(datetime_file, columns=columns, use_arrow=use_arrow)
361+
362+
# Check result
363+
if columns is None or "col" in columns:
364+
assert "col" in df.columns
365+
assert is_datetime64_dtype(df.col.dtype)
366+
if PANDAS_GE_20:
367+
# starting with pandas 2.0, it preserves the passed datetime resolution
368+
assert df.col.dtype.name == "datetime64[ms]"
369+
else:
370+
assert df.col.dtype.name == "datetime64[ns]"
363371
else:
364-
assert df.col.dtype.name == "datetime64[ns]"
372+
assert len(df.columns) == 1 # only geometry
365373

366374

367375
def test_read_list_types(list_field_values_files, use_arrow):
@@ -484,6 +492,36 @@ def test_read_list_types(list_field_values_files, use_arrow):
484492
assert result["list_string_with_null"][4] == [""]
485493

486494

495+
@pytest.mark.parametrize("columns", [None, [], ["list_int", "list_string"]])
496+
def test_read_list_types_columns(request, list_field_values_files, use_arrow, columns):
497+
"""Test reading a geojson file containing fields with lists."""
498+
if list_field_values_files.suffix == ".parquet" and not GDAL_HAS_PARQUET_DRIVER:
499+
pytest.skip(
500+
"Skipping test for parquet as the GDAL Parquet driver is not available"
501+
)
502+
if (
503+
use_arrow
504+
and columns
505+
and len(columns) == 2
506+
and list_field_values_files.suffix == ".parquet"
507+
):
508+
# This gives following error, not sure why. Opened an issue for followup:
509+
# https://github.com/geopandas/pyogrio/issues/XXX
510+
error_msg = (
511+
"This fails with 'pyarrow.lib.ArrowInvalid: ArrowArray struct has "
512+
"1 children, expected 0 for type extension<geoarrow.wkb>'"
513+
)
514+
request.node.add_marker(pytest.mark.xfail(reason=error_msg))
515+
516+
result = read_dataframe(
517+
list_field_values_files, use_arrow=use_arrow, columns=columns
518+
)
519+
520+
# Check result
521+
exp_columns = 7 if columns is None else len(columns) + 1 # +1 for geometry
522+
assert len(result.columns) == exp_columns
523+
524+
487525
@pytest.mark.requires_arrow_write_api
488526
@pytest.mark.skipif(
489527
not GDAL_HAS_PARQUET_DRIVER, reason="Parquet driver is not available"
@@ -3294,7 +3332,7 @@ def test_write_geojson_rfc7946_coordinates(tmp_path, use_arrow):
32943332
assert np.array_equal(gdf_in_appended.geometry.values, points + points_append)
32953333

32963334

3297-
@pytest.mark.requires_arrow_api
3335+
@pytest.mark.requires_arrow_write_api
32983336
@pytest.mark.skipif(
32993337
not GDAL_HAS_PARQUET_DRIVER, reason="Parquet driver is not available"
33003338
)

pyogrio/tests/test_raw_io.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -155,19 +155,20 @@ def test_read_no_geometry_no_columns_no_fids(naturalearth_lowres):
155155
)
156156

157157

158-
def test_read_columns(naturalearth_lowres):
159-
columns = ["NAME", "NAME_LONG"]
160-
meta, _, geometry, fields = read(
161-
naturalearth_lowres, columns=columns, read_geometry=False
162-
)
163-
array_equal(meta["fields"], columns)
164-
165-
# Repeats should be dropped
166-
columns = ["NAME", "NAME_LONG", "NAME"]
167-
meta, _, geometry, fields = read(
158+
@pytest.mark.parametrize(
159+
"descr, columns, exp_columns",
160+
[
161+
("all", None, ["pop_est", "continent", "name", "iso_a3", "gdp_md_est"]),
162+
("case_sensitive", ["NAME"], []),
163+
("repeats_dropped", ["continent", "continent", "name"], ["continent", "name"]),
164+
("keep_original_order", ["continent", "pop_est"], ["pop_est", "continent"]),
165+
],
166+
)
167+
def test_read_columns(naturalearth_lowres, descr, columns, exp_columns):
168+
meta, _fids, _geometry, _fields = read(
168169
naturalearth_lowres, columns=columns, read_geometry=False
169170
)
170-
array_equal(meta["fields"], columns[:2])
171+
assert array_equal(meta["fields"], exp_columns), f"Failed for {descr}"
171172

172173

173174
@pytest.mark.parametrize("skip_features", [10, 200])

0 commit comments

Comments
 (0)