From f73eae3102a1a1d4562a2dc8e8f5099fd71f0127 Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Mon, 8 Sep 2025 09:02:09 +0200
Subject: [PATCH 1/7] ENH: improve performance of read_dataframe

---
 pyogrio/_io.pyx | 88 +++++++++++++++++++++++++++++++++++--------------
 1 file changed, 63 insertions(+), 25 deletions(-)

diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx
index a4aed843..f063f483 100644
--- a/pyogrio/_io.pyx
+++ b/pyogrio/_io.pyx
@@ -1076,11 +1076,14 @@ cdef get_features(
     uint8_t return_fids,
     bint datetime_as_string
 ):
-    cdef OGRFeatureH ogr_feature = NULL
     cdef int n_fields
     cdef int i
     cdef int field_index
+    chunk_size = 50_000
+
+    if num_features >= 0:
+        chunk_size = num_features
 
     # make sure layer is read from beginning
     OGR_L_ResetReading(ogr_layer)
@@ -1089,17 +1092,16 @@ cdef get_features(
         apply_skip_features(ogr_layer, skip_features)
 
     if return_fids:
-        fid_data = np.empty(shape=(num_features), dtype=np.int64)
-        fid_view = fid_data[:]
+        fid_data = [np.empty(shape=(chunk_size, ), dtype=np.int64)]
+        fid_view = fid_data[0][:]
     else:
-        fid_data = None
+        fid_data = [None]
 
     if read_geometry:
-        geometries = np.empty(shape=(num_features, ), dtype="object")
-        geom_view = geometries[:]
-
+        geometries = [np.empty(shape=(chunk_size, ), dtype="object")]
+        geom_view = geometries[0][:]
     else:
-        geometries = None
+        geometries = [None]
 
     n_fields = fields.shape[0]
     field_indexes = fields[:, 0]
@@ -1114,17 +1116,20 @@ cdef get_features(
         else:
             dtype = fields[field_index, 3]
 
-        field_data.append(np.empty(shape=(num_features, ), dtype=dtype))
+        field_data.append([np.empty(shape=(chunk_size, ), dtype=dtype)])
 
-    field_data_view = [field_data[field_index][:] for field_index in range(n_fields)]
+    field_data_view = [field_data[field_index][0][:] for field_index in range(n_fields)]
 
     if num_features == 0:
-        return fid_data, geometries, field_data
+        field_data = [data_field[0] for data_field in field_data]
+        return fid_data[0], geometries[0], field_data
 
     i = 0
+    i_total = 0
+    chunk_index = 0
     while True:
         try:
-            if num_features > 0 and i == num_features:
+            if num_features > 0 and i_total == num_features:
                 break
 
             try:
@@ -1137,7 +1142,7 @@ cdef get_features(
             except CPLE_BaseError as exc:
                 raise FeatureError(str(exc))
 
-            if i >= num_features:
+            if num_features > 0 and i_total >= num_features:
                 raise FeatureError(
                     "GDAL returned more records than expected based on the count of "
                     "records that may meet your combination of filters against this "
@@ -1146,6 +1151,28 @@ cdef get_features(
                     "encountering this error."
                 ) from None
 
+            if i_total > 0 and i_total % chunk_size == 0:
+                # need to add a new chunk
+                i = 0
+                chunk_index += 1
+
+                if return_fids:
+                    fid_data.append(np.empty(shape=(chunk_size,), dtype=np.int64))
+                    fid_view = fid_data[chunk_index][:]
+
+                if read_geometry:
+                    geometries.append(np.empty(shape=(chunk_size,), dtype="object"))
+                    geom_view = geometries[chunk_index][:]
+
+                for field_index in range(n_fields):
+                    field_data[field_index].append(
+                        np.empty(
+                            shape=(chunk_size,),
+                            dtype=field_data[field_index][0].dtype,
+                        )
+                    )
+                    field_data_view[field_index] = field_data[field_index][chunk_index][:]
+
             if return_fids:
                 fid_view[i] = OGR_F_GetFID(ogr_feature)
 
@@ -1157,21 +1184,27 @@ cdef get_features(
                 process_geometry(ogr_feature, i, geom_view, force_2d)
 
             process_fields(
                 ogr_feature, i, n_fields, field_data, field_data_view,
                 field_indexes, field_ogr_types, encoding, datetime_as_string
             )
             i += 1
+            i_total += 1
         finally:
             if ogr_feature != NULL:
                 OGR_F_Destroy(ogr_feature)
                 ogr_feature = NULL
 
-    # There may be fewer rows available than expected from OGR_L_GetFeatureCount,
-    # such as features with bounding boxes that intersect the bbox
-    # but do not themselves intersect the bbox.
-    # Empty rows are dropped.
-    if i < num_features:
+    # The last chunk wasn't completely full, so drop empty rows
+    if i < chunk_size:
         if return_fids:
-            fid_data = fid_data[:i]
+            fid_data[chunk_index] = fid_data[chunk_index][:i]
         if read_geometry:
-            geometries = geometries[:i]
-        field_data = [data_field[:i] for data_field in field_data]
+            geometries[chunk_index] = geometries[chunk_index][:i]
+        for data_field in field_data:
+            data_field[chunk_index] = data_field[chunk_index][:i]
+
+    fid_data = np.concatenate(fid_data) if len(fid_data) > 1 else fid_data[0]
+    geometries = np.concatenate(geometries) if len(geometries) > 1 else geometries[0]
+    field_data = [
+        np.concatenate(data_field) if len(data_field) > 1 else data_field[0]
+        for data_field in field_data
+    ]
 
     return fid_data, geometries, field_data
 
@@ -1490,10 +1523,15 @@ def ogr_read(
             apply_geometry_filter(ogr_layer, mask)
 
         # Limit feature range to available range
-        skip_features, num_features = validate_feature_range(
-            ogr_layer, skip_features, max_features
-        )
-
+        num_features = -1
+        if max_features > 0:
+            num_features = max_features
+        
+        if skip_features > 0:
+            skip_features, num_features = validate_feature_range(
+                ogr_layer, skip_features, max_features
+            )
+        
         fid_data, geometries, field_data = get_features(
             ogr_layer,
             fields,

From ec645b53069028f142f7f4a37a09770be28de91b Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sat, 13 Sep 2025 03:02:16 +0200
Subject: [PATCH 2/7] ENH: improve performance of read_dataframe if a filter
 is used

---
 pyogrio/_io.pyx                    | 152 ++++++++++++++++-------------
 pyogrio/tests/test_geopandas_io.py |  21 ++++
 2 files changed, 106 insertions(+), 67 deletions(-)

diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx
index f063f483..5aec1010 100644
--- a/pyogrio/_io.pyx
+++ b/pyogrio/_io.pyx
@@ -1080,56 +1080,55 @@ cdef get_features(
     cdef int n_fields
     cdef int i
     cdef int field_index
-    chunk_size = 50_000
 
-    if num_features >= 0:
-        chunk_size = num_features
-
-    # make sure layer is read from beginning
+    # Make sure the layer is read from the beginning.
     OGR_L_ResetReading(ogr_layer)
 
     if skip_features > 0:
         apply_skip_features(ogr_layer, skip_features)
 
-    if return_fids:
-        fid_data = [np.empty(shape=(chunk_size, ), dtype=np.int64)]
-        fid_view = fid_data[0][:]
-    else:
-        fid_data = [None]
+    # We don't (always) know beforehand how many features will be read, so
+    # allocate output arrays as needed (in chunks), and return the concatenated results.
+    last_chunk_size = 1_000
+    max_chunk_size = 50_000
 
-    if read_geometry:
-        geometries = [np.empty(shape=(chunk_size, ), dtype="object")]
-        geom_view = geometries[0][:]
-    else:
-        geometries = [None]
+    if num_features >= 0:
+        last_chunk_size = num_features
 
+    fid_chunks = []
+    geom_chunks = []
     n_fields = fields.shape[0]
-    field_indexes = fields[:, 0]
-    field_ogr_types = fields[:, 1]
+    field_data_chunks: list[list[np.ndarray]] = []
 
-    field_data = []
-    for field_index in range(n_fields):
-        if datetime_as_string and fields[field_index, 3].startswith("datetime"):
-            dtype = "object"
-        elif fields[field_index, 3].startswith("list"):
-            dtype = "object"
-        else:
-            dtype = fields[field_index, 3]
-
-        field_data.append([np.empty(shape=(chunk_size, ), dtype=dtype)])
+    def add_data_chunk(size):
+        """Add a new data chunk to the output array lists."""
+        if return_fids:
+            fid_chunks.append(np.empty(shape=(size, ), dtype=np.int64))
+        if read_geometry:
+            geom_chunks.append(np.empty(shape=(size, ), dtype="object"))
+
+        field_data_chunk: list[np.ndarray] = []
+        for field_index in range(n_fields):
+            if datetime_as_string and fields[field_index, 3].startswith("datetime"):
+                dtype = "object"
+            elif fields[field_index, 3].startswith("list"):
+                dtype = "object"
+            else:
+                dtype = fields[field_index, 3]
 
-    field_data_view = [field_data[field_index][0][:] for field_index in range(n_fields)]
+            field_data_chunk.append(np.empty(shape=(size, ), dtype=dtype))
 
-    if num_features == 0:
-        field_data = [data_field[0] for data_field in field_data]
-        return fid_data[0], geometries[0], field_data
+        field_data_chunks.append(field_data_chunk)
 
     i = 0
     i_total = 0
-    chunk_index = 0
+    last_chunk_index = -1
+    field_indexes = fields[:, 0]
+    field_ogr_types = fields[:, 1]
+
     while True:
         try:
-            if num_features > 0 and i_total == num_features:
+            if num_features >= 0 and i_total == num_features:
                 break
 
             try:
@@ -1151,27 +1150,23 @@ cdef get_features(
                     "encountering this error."
                 ) from None
 
-            if i_total > 0 and i_total % chunk_size == 0:
-                # need to add a new chunk
+            if i_total == 0 or i == last_chunk_size:
+                # No chunk yet or last chunk filled up: add a new chunk.
                 i = 0
-                chunk_index += 1
+                last_chunk_index += 1
+                if num_features < 0 and last_chunk_size < max_chunk_size:
+                    # Double chunk size for next chunk, up to max_chunk_size
+                    last_chunk_size = min(last_chunk_size * 2, max_chunk_size)
+                add_data_chunk(last_chunk_size)
 
                 if return_fids:
-                    fid_data.append(np.empty(shape=(chunk_size,), dtype=np.int64))
-                    fid_view = fid_data[chunk_index][:]
-
+                    fid_view = fid_chunks[last_chunk_index][:]
                 if read_geometry:
-                    geometries.append(np.empty(shape=(chunk_size,), dtype="object"))
-                    geom_view = geometries[chunk_index][:]
-
-                for field_index in range(n_fields):
-                    field_data[field_index].append(
-                        np.empty(
-                            shape=(chunk_size,),
-                            dtype=field_data[field_index][0].dtype,
-                        )
-                    )
-                    field_data_view[field_index] = field_data[field_index][chunk_index][:]
+                    geom_view = geom_chunks[last_chunk_index][:]
+                field_data_view = [
+                    field_data_chunks[last_chunk_index][field_index][:]
+                    for field_index in range(n_fields)
+                ]
 
             if return_fids:
                 fid_view[i] = OGR_F_GetFID(ogr_feature)
@@ -1180,33 +1175,56 @@ cdef get_features(
                 process_geometry(ogr_feature, i, geom_view, force_2d)
 
             process_fields(
-                ogr_feature, i, n_fields, field_data, field_data_view,
-                field_indexes, field_ogr_types, encoding, datetime_as_string
+                ogr_feature, i, n_fields, field_data_chunks[last_chunk_index],
+                field_data_view, field_indexes, field_ogr_types, encoding,
+                datetime_as_string
             )
+
             i += 1
             i_total += 1
+
         finally:
             if ogr_feature != NULL:
                 OGR_F_Destroy(ogr_feature)
                 ogr_feature = NULL
 
-    # The last chunk wasn't completely full, so drop empty rows
-    if i < chunk_size:
+    # If no features were read, add an empty chunk
+    if i_total == 0:
+        add_data_chunk(0)
+        last_chunk_size = 0
+        last_chunk_index = 0
+
+    # If the last chunk wasn't completely full, drop empty rows
+    if i < last_chunk_size:
         if return_fids:
-            fid_data[chunk_index] = fid_data[chunk_index][:i]
+            fid_chunks[last_chunk_index] = fid_chunks[last_chunk_index][:i]
         if read_geometry:
-            geometries[chunk_index] = geometries[chunk_index][:i]
-        for data_field in field_data:
-            data_field[chunk_index] = data_field[chunk_index][:i]
+            geom_chunks[last_chunk_index] = geom_chunks[last_chunk_index][:i]
+        for field_index in range(n_fields):
+            field_data_chunks[last_chunk_index][field_index] = (
+                field_data_chunks[last_chunk_index][field_index][:i]
+            )
 
-    fid_data = np.concatenate(fid_data) if len(fid_data) > 1 else fid_data[0]
-    geometries = np.concatenate(geometries) if len(geometries) > 1 else geometries[0]
-    field_data = [
-        np.concatenate(data_field) if len(data_field) > 1 else data_field[0]
-        for data_field in field_data
-    ]
+    if last_chunk_index == 0:
+        # Only one chunk, so avoid concatenation
+        fid_data = fid_chunks[0] if return_fids else None
+        geom_data = geom_chunks[0] if read_geometry else None
+
+        return fid_data, geom_data, field_data_chunks[0]
+
+    else:
+        # Concatenate all chunks
+        fid_data = np.concatenate(fid_chunks) if return_fids else None
+        geom_data = np.concatenate(geom_chunks) if read_geometry else None
+
+        field_data = []
+        for field_index in range(n_fields):
+            data_to_concat = []
+            for chunk in range(last_chunk_index + 1):
+                data_to_concat.append(field_data_chunks[chunk][field_index])
+            field_data.append(np.concatenate(data_to_concat))
 
-    return fid_data, geometries, field_data
+        return fid_data, geom_data, field_data
 
 
 @cython.boundscheck(False)  # Deactivate bounds checking
@@ -1526,12 +1544,12 @@ def ogr_read(
         num_features = -1
         if max_features > 0:
             num_features = max_features
-        
+
         if skip_features > 0:
             skip_features, num_features = validate_feature_range(
                 ogr_layer, skip_features, max_features
             )
-        
+
         fid_data, geometries, field_data = get_features(
             ogr_layer,
             fields,
diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index e9a71aac..10b79f05 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -193,6 +193,27 @@ def test_read_dataframe(naturalearth_lowres_all_ext):
     ]
 
 
+def test_read_dataframe_multi_chunks(tmp_path, naturalearth_lowres):
+    """Test reading a file where multiple chunks are used.
+
+    `ogr_read` reads features in chunks. Read a sufficient number of features
+    in this test so multiple chunks will be used.
+    """
+    # Create test file with enough features to require multiple chunks.
+    # > 3000 features will result in 3 chunks.
+    min_features = 3300
+    df = read_dataframe(naturalearth_lowres)
+    test_path = tmp_path / "test.gpkg"
+    df = pd.concat([df] * (min_features // len(df) + 1), ignore_index=True)
+    df = df.reset_index(drop=True)
+    assert len(df) > min_features
+    write_dataframe(df, test_path)
+
+    # Read the test file and compare to original dataframe
+    result = read_dataframe(test_path, use_arrow=False)
+    assert_geodataframe_equal(result, df)
+
+
 def test_read_dataframe_vsi(naturalearth_lowres_vsi, use_arrow):
     df = read_dataframe(naturalearth_lowres_vsi[1], use_arrow=use_arrow)
     assert len(df) == 177

From 1fad77a31e781ec96d969c1fda8a6e71a4e146e6 Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sat, 13 Sep 2025 03:12:06 +0200
Subject: [PATCH 3/7] Update CHANGES.md

---
 CHANGES.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGES.md b/CHANGES.md
index ee4a01f5..ad9af7d8 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -11,6 +11,7 @@
 
 - Add listing of GDAL data types and subtypes to `read_info` (#556).
 - Add support to read list fields without arrow (#558).
+- Improve performance of `read_dataframe`, especially if a filter is used (#577).
 
 ### Bug fixes

From af294f23fcc4e628ba8d682497b3f146194a7809 Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sat, 13 Sep 2025 03:27:02 +0200
Subject: [PATCH 4/7] Update _io.pyx

---
 pyogrio/_io.pyx | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx
index 5aec1010..67d939da 100644
--- a/pyogrio/_io.pyx
+++ b/pyogrio/_io.pyx
@@ -1139,6 +1139,9 @@ cdef get_features(
                 break
 
             except CPLE_BaseError as exc:
+                if "failed to prepare SQL" in str(exc):
+                    raise ValueError(f"Invalid SQL query: {str(exc)}") from None
+
                 raise FeatureError(str(exc))
 
             if num_features > 0 and i_total >= num_features:

From d3fcf152ad3c7e069df18b0b17b719a40994c41d Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sat, 13 Sep 2025 20:17:36 +0200
Subject: [PATCH 5/7] Try to fix for pandas 3

---
 pyogrio/tests/test_geopandas_io.py | 44 ++++++++++++++++--------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index 10b79f05..2399158b 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -193,27 +193,6 @@ def test_read_dataframe(naturalearth_lowres_all_ext):
     ]
 
 
-def test_read_dataframe_multi_chunks(tmp_path, naturalearth_lowres):
-    """Test reading a file where multiple chunks are used.
-
-    `ogr_read` reads features in chunks. Read a sufficient number of features
-    in this test so multiple chunks will be used.
-    """
-    # Create test file with enough features to require multiple chunks.
-    # > 3000 features will result in 3 chunks.
-    min_features = 3300
-    df = read_dataframe(naturalearth_lowres)
-    test_path = tmp_path / "test.gpkg"
-    df = pd.concat([df] * (min_features // len(df) + 1), ignore_index=True)
-    df = df.reset_index(drop=True)
-    assert len(df) > min_features
-    write_dataframe(df, test_path)
-
-    # Read the test file and compare to original dataframe
-    result = read_dataframe(test_path, use_arrow=False)
-    assert_geodataframe_equal(result, df)
-
-
 def test_read_dataframe_vsi(naturalearth_lowres_vsi, use_arrow):
     df = read_dataframe(naturalearth_lowres_vsi[1], use_arrow=use_arrow)
     assert len(df) == 177
@@ -2092,6 +2071,29 @@ def test_read_invalid_poly_ring(tmp_path, use_arrow, on_invalid, message, expect
     assert df.geometry.iloc[0].wkt == expected_wkt
 
 
+def test_read_multi_chunks(tmp_path, naturalearth_lowres):
+    """Test reading a file where multiple chunks are used.
+
+    `ogr_read` reads features in chunks. Read a sufficient number of features
+    in this test so multiple chunks will be used.
+    """
+    # Create test file with enough features to require multiple chunks.
+    # > 3000 features will result in 3 chunks.
+    min_features = 3300
+    df = read_dataframe(naturalearth_lowres)
+    test_path = tmp_path / "test.gpkg"
+    nb_copies = min_features // len(df) + 1
+    df_list = [df.copy() for _ in range(nb_copies)]
+    df = pd.concat(df_list, ignore_index=True)
+    df = df.reset_index(drop=True)
+    assert len(df) > min_features
+    write_dataframe(df, test_path)
+
+    # Read the test file and compare to original dataframe
+    result = read_dataframe(test_path, use_arrow=False)
+    assert_geodataframe_equal(result, df)
+
+
 def test_read_multisurface(multisurface_file, use_arrow):
     if use_arrow:
         # TODO: revisit once https://github.com/geopandas/pyogrio/issues/478

From 8e1328a9b575247b34a8982a6f05742ee2c9e08f Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sat, 13 Sep 2025 21:39:52 +0200
Subject: [PATCH 6/7] Update test_geopandas_io.py

---
 pyogrio/tests/test_geopandas_io.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index 2399158b..c277c658 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -2084,7 +2084,7 @@ def test_read_multi_chunks(tmp_path, naturalearth_lowres):
     test_path = tmp_path / "test.gpkg"
     nb_copies = min_features // len(df) + 1
     df_list = [df.copy() for _ in range(nb_copies)]
-    df = pd.concat(df_list, ignore_index=True)
+    df = pd.concat(df_list, ignore_index=True, sort=False)
     df = df.reset_index(drop=True)
     assert len(df) > min_features
     write_dataframe(df, test_path)

From 6b816c3830dbb9bb7fc66a2931ee16b4936a54a7 Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sat, 13 Sep 2025 22:25:36 +0200
Subject: [PATCH 7/7] Update test_geopandas_io.py

---
 pyogrio/tests/test_geopandas_io.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index c277c658..714174c2 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -2071,7 +2071,7 @@ def test_read_invalid_poly_ring(tmp_path, use_arrow, on_invalid, message, expect
     assert df.geometry.iloc[0].wkt == expected_wkt
 
 
-def test_read_multi_chunks(tmp_path, naturalearth_lowres):
+def test_read_multi_chunks(tmp_path):
     """Test reading a file where multiple chunks are used.
 
     `ogr_read` reads features in chunks. Read a sufficient number of features
@@ -2079,14 +2079,13 @@ def test_read_multi_chunks(tmp_path, naturalearth_lowres):
     in this test so multiple chunks will be used.
     """
     # Create test file with enough features to require multiple chunks.
     # > 3000 features will result in 3 chunks.
-    min_features = 3300
-    df = read_dataframe(naturalearth_lowres)
+    nb_features = 3300
+    df = gp.GeoDataFrame(
+        {"col": [1.0] * nb_features},
+        geometry=[Point(1, 1)] * nb_features,
+        crs="EPSG:4326",
+    )
     test_path = tmp_path / "test.gpkg"
-    nb_copies = min_features // len(df) + 1
-    df_list = [df.copy() for _ in range(nb_copies)]
-    df = pd.concat(df_list, ignore_index=True, sort=False)
-    df = df.reset_index(drop=True)
-    assert len(df) > min_features
     write_dataframe(df, test_path)
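
Reviewer note on the chunked allocation strategy in PATCH 2/7: instead of allocating one output array sized by OGR_L_GetFeatureCount (which over-estimates when spatial or attribute filters are active), get_features now fills chunk arrays that grow geometrically from 1_000 up to 50_000 entries and concatenates them once at the end. Below is a minimal pure-Python sketch of that pattern for a single int64 column; read_chunked and its values iterable are illustrative stand-ins rather than pyogrio API, and the exact chunk-size sequence is only assumed to mirror add_data_chunk above.

    import numpy as np

    def read_chunked(values, initial_chunk=1_000, max_chunk=50_000):
        """Collect an unknown number of values into geometrically growing chunks."""
        chunks = []                  # allocated chunk arrays, oldest first
        chunk_size = initial_chunk
        current = None               # chunk currently being filled
        i = 0                        # write position inside `current`
        for value in values:
            if current is None or i == len(current):
                # No chunk yet, or the last chunk filled up: allocate the next
                # chunk, doubling its size up to max_chunk.
                chunk_size = min(chunk_size * 2, max_chunk)
                current = np.empty(chunk_size, dtype=np.int64)
                chunks.append(current)
                i = 0
            current[i] = value
            i += 1
        if not chunks:
            return np.empty(0, dtype=np.int64)
        chunks[-1] = chunks[-1][:i]  # drop the unfilled tail of the last chunk
        return chunks[0] if len(chunks) == 1 else np.concatenate(chunks)

    assert read_chunked(range(3300)).shape == (3300,)

Because chunk sizes double, reading n features allocates only O(log n) chunks and wastes at most one chunk's unfilled tail, and the single np.concatenate at the end replaces the repeated reallocation a naively growing array would need. The len(chunks) == 1 fast path mirrors the patch's last_chunk_index == 0 branch: when the feature count is known up front, one exactly-sized chunk is allocated and returned without any copy.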