From f73eae3102a1a1d4562a2dc8e8f5099fd71f0127 Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Mon, 8 Sep 2025 09:02:09 +0200
Subject: [PATCH 1/7] ENH: improve performance of read_dataframe

---
 pyogrio/_io.pyx | 88 +++++++++++++++++++++++++++++++++++--------------
 1 file changed, 63 insertions(+), 25 deletions(-)

diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx
index a4aed843..f063f483 100644
--- a/pyogrio/_io.pyx
+++ b/pyogrio/_io.pyx
@@ -1076,11 +1076,14 @@ cdef get_features(
     uint8_t return_fids,
     bint datetime_as_string
 ):
-    cdef OGRFeatureH ogr_feature = NULL
     cdef int n_fields
     cdef int i
     cdef int field_index
+    chunk_size = 50_000
+
+    if num_features >= 0:
+        chunk_size = num_features
 
     # make sure layer is read from beginning
     OGR_L_ResetReading(ogr_layer)
@@ -1089,17 +1092,16 @@ cdef get_features(
         apply_skip_features(ogr_layer, skip_features)
 
     if return_fids:
-        fid_data = np.empty(shape=(num_features), dtype=np.int64)
-        fid_view = fid_data[:]
+        fid_data = [np.empty(shape=(chunk_size, ), dtype=np.int64)]
+        fid_view = fid_data[0][:]
     else:
-        fid_data = None
+        fid_data = [None]
 
     if read_geometry:
-        geometries = np.empty(shape=(num_features, ), dtype="object")
-        geom_view = geometries[:]
-
+        geometries = [np.empty(shape=(chunk_size, ), dtype="object")]
+        geom_view = geometries[0][:]
     else:
-        geometries = None
+        geometries = [None]
 
     n_fields = fields.shape[0]
     field_indexes = fields[:, 0]
@@ -1114,17 +1116,20 @@ cdef get_features(
         else:
             dtype = fields[field_index, 3]
 
-        field_data.append(np.empty(shape=(num_features, ), dtype=dtype))
+        field_data.append([np.empty(shape=(chunk_size, ), dtype=dtype)])
 
-    field_data_view = [field_data[field_index][:] for field_index in range(n_fields)]
+    field_data_view = [field_data[field_index][0][:] for field_index in range(n_fields)]
 
     if num_features == 0:
-        return fid_data, geometries, field_data
+        field_data = [data_field[0] for data_field in field_data]
+        return fid_data[0], geometries[0], field_data
 
     i = 0
+    i_total = 0
+    chunk_index = 0
     while True:
         try:
-            if num_features > 0 and i == num_features:
+            if num_features > 0 and i_total == num_features:
                 break
 
             try:
@@ -1137,7 +1142,7 @@ cdef get_features(
             except CPLE_BaseError as exc:
                 raise FeatureError(str(exc))
 
-            if i >= num_features:
+            if num_features > 0 and i_total >= num_features:
                 raise FeatureError(
                     "GDAL returned more records than expected based on the count of "
                     "records that may meet your combination of filters against this "
@@ -1146,6 +1151,28 @@ cdef get_features(
                     "encountering this error."
                 ) from None
 
+            if i_total > 0 and i_total % chunk_size == 0:
+                # need to add a new chunk
+                i = 0
+                chunk_index += 1
+
+                if return_fids:
+                    fid_data.append(np.empty(shape=(chunk_size,), dtype=np.int64))
+                    fid_view = fid_data[chunk_index][:]
+
+                if read_geometry:
+                    geometries.append(np.empty(shape=(chunk_size,), dtype="object"))
+                    geom_view = geometries[chunk_index][:]
+
+                for field_index in range(n_fields):
+                    field_data[field_index].append(
+                        np.empty(
+                            shape=(chunk_size,),
+                            dtype=field_data[field_index][0].dtype,
+                        )
+                    )
+                    field_data_view[field_index] = field_data[field_index][chunk_index][:]
+
             if return_fids:
                 fid_view[i] = OGR_F_GetFID(ogr_feature)
 
@@ -1157,21 +1184,27 @@ cdef get_features(
                 process_geometry(ogr_feature, i, geom_view, force_2d)
 
             process_fields(
                 ogr_feature, i, n_fields, field_data, field_data_view,
                 field_indexes, field_ogr_types, encoding, datetime_as_string
             )
             i += 1
+            i_total += 1
         finally:
             if ogr_feature != NULL:
                 OGR_F_Destroy(ogr_feature)
                 ogr_feature = NULL
 
-    # There may be fewer rows available than expected from OGR_L_GetFeatureCount,
-    # such as features with bounding boxes that intersect the bbox
-    # but do not themselves intersect the bbox.
-    # Empty rows are dropped.
-    if i < num_features:
+    # The last chunk wasn't completely full, so drop empty rows
+    if i < chunk_size:
         if return_fids:
-            fid_data = fid_data[:i]
+            fid_data[chunk_index] = fid_data[chunk_index][:i]
         if read_geometry:
-            geometries = geometries[:i]
-        field_data = [data_field[:i] for data_field in field_data]
+            geometries[chunk_index] = geometries[chunk_index][:i]
+        for data_field in field_data:
+            data_field[chunk_index] = data_field[chunk_index][:i]
+
+    fid_data = np.concatenate(fid_data) if len(fid_data) > 1 else fid_data[0]
+    geometries = np.concatenate(geometries) if len(geometries) > 1 else geometries[0]
+    field_data = [
+        np.concatenate(data_field) if len(data_field) > 1 else data_field[0]
+        for data_field in field_data
+    ]
 
     return fid_data, geometries, field_data
 
@@ -1490,10 +1523,15 @@ def ogr_read(
             apply_geometry_filter(ogr_layer, mask)
 
         # Limit feature range to available range
-        skip_features, num_features = validate_feature_range(
-            ogr_layer, skip_features, max_features
-        )
-
+        num_features = -1
+        if max_features > 0:
+            num_features = max_features
+        
+        if skip_features > 0:
+            skip_features, num_features = validate_feature_range(
+                ogr_layer, skip_features, max_features
+            )
+        
         fid_data, geometries, field_data = get_features(
             ogr_layer,
             fields,

From ec645b53069028f142f7f4a37a09770be28de91b Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sat, 13 Sep 2025 03:02:16 +0200
Subject: [PATCH 2/7] ENH: improve performance of read_dataframe if a filter
 is used

---
 pyogrio/_io.pyx                    | 152 ++++++++++++++++-------------
 pyogrio/tests/test_geopandas_io.py |  21 ++++
 2 files changed, 106 insertions(+), 67 deletions(-)

diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx
index f063f483..5aec1010 100644
--- a/pyogrio/_io.pyx
+++ b/pyogrio/_io.pyx
@@ -1080,56 +1080,55 @@ cdef get_features(
     cdef int n_fields
     cdef int i
     cdef int field_index
-    chunk_size = 50_000
 
-    if num_features >= 0:
-        chunk_size = num_features
-
-    # make sure layer is read from beginning
+    # Make sure the layer is read from the beginning.
     OGR_L_ResetReading(ogr_layer)
 
     if skip_features > 0:
         apply_skip_features(ogr_layer, skip_features)
 
-    if return_fids:
-        fid_data = [np.empty(shape=(chunk_size, ), dtype=np.int64)]
-        fid_view = fid_data[0][:]
-    else:
-        fid_data = [None]
+    # We don't (always) know beforehand how many features will be read, so
+    # allocate output arrays as needed (in chunks), and return the concatenated results.
+    last_chunk_size = 1_000
+    max_chunk_size = 50_000
 
-    if read_geometry:
-        geometries = [np.empty(shape=(chunk_size, ), dtype="object")]
-        geom_view = geometries[0][:]
-    else:
-        geometries = [None]
+    if num_features >= 0:
+        last_chunk_size = num_features
 
+    fid_chunks = []
+    geom_chunks = []
     n_fields = fields.shape[0]
-    field_indexes = fields[:, 0]
-    field_ogr_types = fields[:, 1]
+    field_data_chunks: list[list[np.ndarray]] = []
 
-    field_data = []
-    for field_index in range(n_fields):
-        if datetime_as_string and fields[field_index, 3].startswith("datetime"):
-            dtype = "object"
-        elif fields[field_index, 3].startswith("list"):
-            dtype = "object"
-        else:
-            dtype = fields[field_index, 3]
-
-        field_data.append([np.empty(shape=(chunk_size, ), dtype=dtype)])
+    def add_data_chunk(size):
+        """Add a new data chunk to the output array lists."""
+        if return_fids:
+            fid_chunks.append(np.empty(shape=(size, ), dtype=np.int64))
+        if read_geometry:
+            geom_chunks.append(np.empty(shape=(size, ), dtype="object"))
+
+        field_data_chunk: list[np.ndarray] = []
+        for field_index in range(n_fields):
+            if datetime_as_string and fields[field_index, 3].startswith("datetime"):
+                dtype = "object"
+            elif fields[field_index, 3].startswith("list"):
+                dtype = "object"
+            else:
+                dtype = fields[field_index, 3]
 
-    field_data_view = [field_data[field_index][0][:] for field_index in range(n_fields)]
+            field_data_chunk.append(np.empty(shape=(size, ), dtype=dtype))
 
-    if num_features == 0:
-        field_data = [data_field[0] for data_field in field_data]
-        return fid_data[0], geometries[0], field_data
+        field_data_chunks.append(field_data_chunk)
 
     i = 0
     i_total = 0
-    chunk_index = 0
+    last_chunk_index = -1
+    field_indexes = fields[:, 0]
+    field_ogr_types = fields[:, 1]
+
     while True:
         try:
-            if num_features > 0 and i_total == num_features:
+            if num_features >= 0 and i_total == num_features:
                 break
 
             try:
@@ -1151,27 +1150,23 @@ cdef get_features(
                     "encountering this error."
                 ) from None
 
-            if i_total > 0 and i_total % chunk_size == 0:
-                # need to add a new chunk
+            if i_total == 0 or i == last_chunk_size:
+                # No chunk yet or last chunk filled up: add a new chunk.
                 i = 0
-                chunk_index += 1
+                last_chunk_index += 1
+                if num_features < 0 and last_chunk_size < max_chunk_size:
+                    # Double chunk size for next chunk, up to max_chunk_size
+                    last_chunk_size = min(last_chunk_size * 2, max_chunk_size)
+                add_data_chunk(last_chunk_size)
 
                 if return_fids:
-                    fid_data.append(np.empty(shape=(chunk_size,), dtype=np.int64))
-                    fid_view = fid_data[chunk_index][:]
-
+                    fid_view = fid_chunks[last_chunk_index][:]
                 if read_geometry:
-                    geometries.append(np.empty(shape=(chunk_size,), dtype="object"))
-                    geom_view = geometries[chunk_index][:]
-
-                for field_index in range(n_fields):
-                    field_data[field_index].append(
-                        np.empty(
-                            shape=(chunk_size,),
-                            dtype=field_data[field_index][0].dtype,
-                        )
-                    )
-                    field_data_view[field_index] = field_data[field_index][chunk_index][:]
+                    geom_view = geom_chunks[last_chunk_index][:]
+                field_data_view = [
+                    field_data_chunks[last_chunk_index][field_index][:]
+                    for field_index in range(n_fields)
+                ]
 
             if return_fids:
                 fid_view[i] = OGR_F_GetFID(ogr_feature)
@@ -1180,33 +1175,56 @@ cdef get_features(
                 process_geometry(ogr_feature, i, geom_view, force_2d)
 
             process_fields(
-                ogr_feature, i, n_fields, field_data, field_data_view,
-                field_indexes, field_ogr_types, encoding, datetime_as_string
+                ogr_feature, i, n_fields, field_data_chunks[last_chunk_index],
+                field_data_view, field_indexes, field_ogr_types, encoding,
+                datetime_as_string
             )
+
             i += 1
             i_total += 1
+
         finally:
             if ogr_feature != NULL:
                 OGR_F_Destroy(ogr_feature)
                 ogr_feature = NULL
 
-    # The last chunk wasn't completely full, so drop empty rows
-    if i < chunk_size:
+    # If no features were read, add an empty chunk
+    if i_total == 0:
+        add_data_chunk(0)
+        last_chunk_size = 0
+        last_chunk_index = 0
+
+    # If the last chunk wasn't completely full, drop empty rows
+    if i < last_chunk_size:
         if return_fids:
-            fid_data[chunk_index] = fid_data[chunk_index][:i]
+            fid_chunks[last_chunk_index] = fid_chunks[last_chunk_index][:i]
         if read_geometry:
-            geometries[chunk_index] = geometries[chunk_index][:i]
-        for data_field in field_data:
-            data_field[chunk_index] = data_field[chunk_index][:i]
+            geom_chunks[last_chunk_index] = geom_chunks[last_chunk_index][:i]
+        for field_index in range(n_fields):
+            field_data_chunks[last_chunk_index][field_index] = (
+                field_data_chunks[last_chunk_index][field_index][:i]
+            )
 
-    fid_data = np.concatenate(fid_data) if len(fid_data) > 1 else fid_data[0]
-    geometries = np.concatenate(geometries) if len(geometries) > 1 else geometries[0]
-    field_data = [
-        np.concatenate(data_field) if len(data_field) > 1 else data_field[0]
-        for data_field in field_data
-    ]
+    if last_chunk_index == 0:
+        # Only one chunk, so avoid concatenation
+        fid_data = fid_chunks[0] if return_fids else None
+        geom_data = geom_chunks[0] if read_geometry else None
+
+        return fid_data, geom_data, field_data_chunks[0]
+
+    else:
+        # Concatenate all chunks
+        fid_data = np.concatenate(fid_chunks) if return_fids else None
+        geom_data = np.concatenate(geom_chunks) if read_geometry else None
+
+        field_data = []
+        for field_index in range(n_fields):
+            data_to_concat = []
+            for chunk in range(last_chunk_index + 1):
+                data_to_concat.append(field_data_chunks[chunk][field_index])
+            field_data.append(np.concatenate(data_to_concat))
 
-    return fid_data, geometries, field_data
+        return fid_data, geom_data, field_data
 
 
 @cython.boundscheck(False)  # Deactivate bounds checking
@@ -1526,12 +1544,12 @@ def ogr_read(
         num_features = -1
         if max_features > 0:
             num_features = max_features
-        
+
         if skip_features > 0:
             skip_features, num_features = validate_feature_range(
                 ogr_layer, skip_features, max_features
             )
-        
+
         fid_data, geometries, field_data = get_features(
             ogr_layer,
             fields,
diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index e9a71aac..10b79f05 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -193,6 +193,27 @@ def test_read_dataframe(naturalearth_lowres_all_ext):
     ]
 
 
+def test_read_dataframe_multi_chunks(tmp_path, naturalearth_lowres):
+    """Test reading a file where multiple chunks are used.
+
+    `ogr_read` reads features in chunks. Read a sufficient number of features
+    in this test so multiple chunks will be used.
+    """
+    # Create test file with enough features to require multiple chunks.
+    # > 3000 features will result in 3 chunks.
+    min_features = 3300
+    df = read_dataframe(naturalearth_lowres)
+    test_path = tmp_path / "test.gpkg"
+    df = pd.concat([df] * (min_features // len(df) + 1), ignore_index=True)
+    df = df.reset_index(drop=True)
+    assert len(df) > min_features
+    write_dataframe(df, test_path)
+
+    # Read the test file and compare to original dataframe
+    result = read_dataframe(test_path, use_arrow=False)
+    assert_geodataframe_equal(result, df)
+
+
 def test_read_dataframe_vsi(naturalearth_lowres_vsi, use_arrow):
     df = read_dataframe(naturalearth_lowres_vsi[1], use_arrow=use_arrow)
     assert len(df) == 177

From 1fad77a31e781ec96d969c1fda8a6e71a4e146e6 Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sat, 13 Sep 2025 03:12:06 +0200
Subject: [PATCH 3/7] Update CHANGES.md

---
 CHANGES.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGES.md b/CHANGES.md
index ee4a01f5..ad9af7d8 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -11,6 +11,7 @@
 
 - Add listing of GDAL data types and subtypes to `read_info` (#556).
 - Add support to read list fields without arrow (#558).
+- Improve performance of `read_dataframe`, especially if a filter is used (#577).
 
 ### Bug fixes

From af294f23fcc4e628ba8d682497b3f146194a7809 Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sat, 13 Sep 2025 03:27:02 +0200
Subject: [PATCH 4/7] Update _io.pyx

---
 pyogrio/_io.pyx | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx
index 5aec1010..67d939da 100644
--- a/pyogrio/_io.pyx
+++ b/pyogrio/_io.pyx
@@ -1139,6 +1139,9 @@ cdef get_features(
                 break
 
             except CPLE_BaseError as exc:
+                if "failed to prepare SQL" in str(exc):
+                    raise ValueError(f"Invalid SQL query: {str(exc)}") from None
+
                 raise FeatureError(str(exc))
 
             if num_features > 0 and i_total >= num_features:

From d3fcf152ad3c7e069df18b0b17b719a40994c41d Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sat, 13 Sep 2025 20:17:36 +0200
Subject: [PATCH 5/7] Try to fix for pandas 3

---
 pyogrio/tests/test_geopandas_io.py | 44 ++++++++++++++++--------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index 10b79f05..2399158b 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -193,27 +193,6 @@ def test_read_dataframe(naturalearth_lowres_all_ext):
     ]
 
 
-def test_read_dataframe_multi_chunks(tmp_path, naturalearth_lowres):
-    """Test reading a file where multiple chunks are used.
-
-    `ogr_read` reads features in chunks. Read a sufficient number of features
-    in this test so multiple chunks will be used.
-    """
-    # Create test file with enough features to require multiple chunks.
-    # > 3000 features will result in 3 chunks.
-    min_features = 3300
-    df = read_dataframe(naturalearth_lowres)
-    test_path = tmp_path / "test.gpkg"
-    df = pd.concat([df] * (min_features // len(df) + 1), ignore_index=True)
-    df = df.reset_index(drop=True)
-    assert len(df) > min_features
-    write_dataframe(df, test_path)
-
-    # Read the test file and compare to original dataframe
-    result = read_dataframe(test_path, use_arrow=False)
-    assert_geodataframe_equal(result, df)
-
-
 def test_read_dataframe_vsi(naturalearth_lowres_vsi, use_arrow):
     df = read_dataframe(naturalearth_lowres_vsi[1], use_arrow=use_arrow)
     assert len(df) == 177
@@ -2092,6 +2071,29 @@ def test_read_invalid_poly_ring(tmp_path, use_arrow, on_invalid, message, expect
     assert df.geometry.iloc[0].wkt == expected_wkt
 
 
+def test_read_multi_chunks(tmp_path, naturalearth_lowres):
+    """Test reading a file where multiple chunks are used.
+
+    `ogr_read` reads features in chunks. Read a sufficient number of features
+    in this test so multiple chunks will be used.
+    """
+    # Create test file with enough features to require multiple chunks.
+    # > 3000 features will result in 3 chunks.
+    min_features = 3300
+    df = read_dataframe(naturalearth_lowres)
+    test_path = tmp_path / "test.gpkg"
+    nb_copies = min_features // len(df) + 1
+    df_list = [df.copy() for _ in range(nb_copies)]
+    df = pd.concat(df_list, ignore_index=True)
+    df = df.reset_index(drop=True)
+    assert len(df) > min_features
+    write_dataframe(df, test_path)
+
+    # Read the test file and compare to original dataframe
+    result = read_dataframe(test_path, use_arrow=False)
+    assert_geodataframe_equal(result, df)
+
+
 def test_read_multisurface(multisurface_file, use_arrow):
     if use_arrow:
         # TODO: revisit once https://github.com/geopandas/pyogrio/issues/478

From 8e1328a9b575247b34a8982a6f05742ee2c9e08f Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sat, 13 Sep 2025 21:39:52 +0200
Subject: [PATCH 6/7] Update test_geopandas_io.py

---
 pyogrio/tests/test_geopandas_io.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index 2399158b..c277c658 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -2084,7 +2084,7 @@ def test_read_multi_chunks(tmp_path, naturalearth_lowres):
     test_path = tmp_path / "test.gpkg"
     nb_copies = min_features // len(df) + 1
     df_list = [df.copy() for _ in range(nb_copies)]
-    df = pd.concat(df_list, ignore_index=True)
+    df = pd.concat(df_list, ignore_index=True, sort=False)
     df = df.reset_index(drop=True)
     assert len(df) > min_features
     write_dataframe(df, test_path)

From 6b816c3830dbb9bb7fc66a2931ee16b4936a54a7 Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sat, 13 Sep 2025 22:25:36 +0200
Subject: [PATCH 7/7] Update test_geopandas_io.py

---
 pyogrio/tests/test_geopandas_io.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index c277c658..714174c2 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -2071,7 +2071,7 @@ def test_read_invalid_poly_ring(tmp_path, use_arrow, on_invalid, message, expect
     assert df.geometry.iloc[0].wkt == expected_wkt
 
 
-def test_read_multi_chunks(tmp_path, naturalearth_lowres):
+def test_read_multi_chunks(tmp_path):
     """Test reading a file where multiple chunks are used.
 
     `ogr_read` reads features in chunks. Read a sufficient number of features
@@ -2079,14 +2079,13 @@ def test_read_multi_chunks(tmp_path, naturalearth_lowres):
     in this test so multiple chunks will be used.
     """
     # Create test file with enough features to require multiple chunks.
     # > 3000 features will result in 3 chunks.
-    min_features = 3300
-    df = read_dataframe(naturalearth_lowres)
+    nb_features = 3300
+    df = gp.GeoDataFrame(
+        {"col": [1.0] * nb_features},
+        geometry=[Point(1, 1)] * nb_features,
+        crs="EPSG:4326",
+    )
     test_path = tmp_path / "test.gpkg"
-    nb_copies = min_features // len(df) + 1
-    df_list = [df.copy() for _ in range(nb_copies)]
-    df = pd.concat(df_list, ignore_index=True, sort=False)
-    df = df.reset_index(drop=True)
-    assert len(df) > min_features
     write_dataframe(df, test_path)
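
Reviewer note on the chunked allocation strategy in PATCH 2/7: instead of allocating one output array sized by OGR_L_GetFeatureCount (which over-estimates when spatial or attribute filters are active), get_features now fills chunk arrays that grow geometrically from 1_000 up to 50_000 entries and concatenates them once at the end. Below is a minimal pure-Python sketch of that pattern for a single int64 column; read_chunked and its values iterable are illustrative stand-ins rather than pyogrio API, and the exact chunk-size sequence is only assumed to mirror add_data_chunk above.

    import numpy as np

    def read_chunked(values, initial_chunk=1_000, max_chunk=50_000):
        """Collect an unknown number of values into geometrically growing chunks."""
        chunks = []                  # allocated chunk arrays, oldest first
        chunk_size = initial_chunk
        current = None               # chunk currently being filled
        i = 0                        # write position inside `current`
        for value in values:
            if current is None or i == len(current):
                # No chunk yet, or the last chunk filled up: allocate the next
                # chunk, doubling its size up to max_chunk.
                chunk_size = min(chunk_size * 2, max_chunk)
                current = np.empty(chunk_size, dtype=np.int64)
                chunks.append(current)
                i = 0
            current[i] = value
            i += 1
        if not chunks:
            return np.empty(0, dtype=np.int64)
        chunks[-1] = chunks[-1][:i]  # drop the unfilled tail of the last chunk
        return chunks[0] if len(chunks) == 1 else np.concatenate(chunks)

    assert read_chunked(range(3300)).shape == (3300,)

Because chunk sizes double, reading n features allocates only O(log n) chunks and wastes at most one chunk's unfilled tail, and the single np.concatenate at the end replaces the repeated reallocation a naively growing array would need. The len(chunks) == 1 fast path mirrors the patch's last_chunk_index == 0 branch: when the feature count is known up front, one exactly-sized chunk is allocated and returned without any copy.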