Commit d4f51b3

BUG: fix support to read parquet files with list columns (#597)

1 parent 8020035 commit d4f51b3

File tree

6 files changed: +245 -30 lines

CHANGES.md

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@
 ### Improvements
 
 - Add listing of GDAL data types and subtypes to `read_info` (#556).
-- Add support to read list fields without arrow (#558).
+- Add support to read list fields without arrow (#558, #597).
 
 ### Bug fixes
 
pyogrio/geopandas.py

Lines changed: 22 additions & 4 deletions
@@ -332,9 +332,19 @@ def read_dataframe(
 
         del table
 
-        for ogr_subtype, c in zip(meta["ogr_subtypes"], df.columns):
+        for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"]):
             if ogr_subtype == "OFSTJSON":
-                df[c] = df[c].map(json.loads, na_action="ignore")
+                # When reading .parquet files with arrow, JSON fields are already
+                # parsed, so only parse if strings.
+                dtype = pd.api.types.infer_dtype(df[c])
+                if dtype == "string":
+                    try:
+                        df[c] = df[c].map(json.loads, na_action="ignore")
+                    except Exception:
+                        warnings.warn(
+                            f"Could not parse column '{c}' as JSON; leaving as string",
+                            stacklevel=2,
+                        )
 
         if fid_as_index:
             df = df.set_index(meta["fid_column"])
@@ -378,9 +388,17 @@ def read_dataframe(
         for dtype, c in zip(meta["dtypes"], df.columns):
             if dtype.startswith("datetime"):
                 df[c] = _try_parse_datetime(df[c])
-        for ogr_subtype, c in zip(meta["ogr_subtypes"], df.columns):
+        for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"]):
             if ogr_subtype == "OFSTJSON":
-                df[c] = df[c].map(json.loads, na_action="ignore")
+                dtype = pd.api.types.infer_dtype(df[c])
+                if dtype == "string":
+                    try:
+                        df[c] = df[c].map(json.loads, na_action="ignore")
+                    except Exception:
+                        warnings.warn(
+                            f"Could not parse column '{c}' as JSON; leaving as string",
+                            stacklevel=2,
+                        )
 
         if geometry is None or not read_geometry:
             return df
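The dtype check above matters because the Arrow read path can hand back OFSTJSON columns that already contain parsed Python objects (e.g. when GDAL's Parquet driver is used), while GeoJSON sources yield raw JSON strings. A minimal sketch of the distinction, using only pandas (not pyogrio code):

    import json

    import pandas as pd

    # e.g. Parquet via Arrow: values are already Python lists
    already_parsed = pd.Series([[0, 1], [2, 3], None])
    # e.g. GeoJSON: values are raw JSON strings
    raw_json = pd.Series(["[0, 1]", "[2, 3]", None])

    print(pd.api.types.infer_dtype(already_parsed))  # "mixed" -> leave as-is
    print(pd.api.types.infer_dtype(raw_json))        # "string" -> safe to json.loads

    parsed = raw_json.map(json.loads, na_action="ignore")
    print(parsed[0])  # [0, 1]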

pyogrio/tests/conftest.py

Lines changed: 103 additions & 7 deletions
@@ -1,14 +1,12 @@
+"""Module with helper functions, fixtures, and common test data for pyogrio tests."""
+
 from io import BytesIO
 from pathlib import Path
 from zipfile import ZIP_DEFLATED, ZipFile
 
 import numpy as np
 
-from pyogrio import (
-    __gdal_version_string__,
-    __version__,
-    list_drivers,
-)
+from pyogrio import __gdal_version_string__, __version__, list_drivers
 from pyogrio._compat import (
     GDAL_GE_37,
     HAS_ARROW_WRITE_API,
@@ -203,8 +201,7 @@ def no_geometry_file(tmp_path):
     return filename
 
 
-@pytest.fixture(scope="function")
-def list_field_values_file(tmp_path):
+def list_field_values_geojson_file(tmp_path):
     # Create a GeoJSON file with list values in a property
     list_geojson = """{
         "type": "FeatureCollection",
@@ -279,6 +276,66 @@ def list_field_values_file(tmp_path):
     return filename
 
 
+def list_field_values_parquet_file():
+    """Return the path to a Parquet file with list values in a property.
+
+    Because pyarrow.parquet is typically not available in the CI environments, we save
+    the file in the test data directory instead of always creating it from scratch.
+
+    The code to create it is here though, in case it needs to be recreated later.
+    """
+    # Check if the file already exists in the test data dir
+    fixture_path = _data_dir / "list_field_values_file.parquet"
+    if fixture_path.exists():
+        return fixture_path
+
+    # The file doesn't exist, so create it
+    try:
+        import pyarrow as pa
+        from pyarrow import parquet as pq
+
+        import shapely
+    except ImportError as ex:
+        raise RuntimeError(
+            f"test file {fixture_path} does not exist, but error importing: {ex}."
+        )
+
+    table = pa.table(
+        {
+            "geometry": shapely.to_wkb(shapely.points(np.ones((5, 2)))),
+            "int": [1, 2, 3, 4, 5],
+            "list_int": [[0, 1], [2, 3], [], None, None],
+            "list_double": [[0.0, 1.0], [2.0, 3.0], [], None, None],
+            "list_string": [
+                ["string1", "string2"],
+                ["string3", "string4", ""],
+                [],
+                None,
+                [""],
+            ],
+            "list_int_with_null": [[0, None], [2, 3], [], None, None],
+            "list_string_with_null": [
+                ["string1", None],
+                ["string3", "string4", ""],
+                [],
+                None,
+                [""],
+            ],
+        }
+    )
+    pq.write_table(table, fixture_path)
+
+    return fixture_path
+
+
+@pytest.fixture(scope="function", params=[".geojson", ".parquet"])
+def list_field_values_files(tmp_path, request):
+    if request.param == ".geojson":
+        return list_field_values_geojson_file(tmp_path)
+    elif request.param == ".parquet":
+        return list_field_values_parquet_file()
+
+
 @pytest.fixture(scope="function")
 def nested_geojson_file(tmp_path):
     # create GeoJSON file with nested properties
@@ -308,6 +365,45 @@ def list_field_values_file(tmp_path):
     return filename
 
 
+@pytest.fixture(scope="function")
+def list_nested_struct_parquet_file(tmp_path):
+    """Return the path to a Parquet file with nested values in a property.
+
+    Because pyarrow.parquet is typically not available in the CI environments, we save
+    the file in the test data directory instead of always creating it from scratch.
+
+    The code to create it is here though, in case it needs to be recreated later.
+    """
+    # Check if the file already exists in the test data dir
+    fixture_path = _data_dir / "list_nested_struct_file.parquet"
+    if fixture_path.exists():
+        return fixture_path
+
+    # The file doesn't exist, so create it
+    try:
+        import pyarrow as pa
+        from pyarrow import parquet as pq
+
+        import shapely
+    except ImportError as ex:
+        raise RuntimeError(
+            f"test file {fixture_path} does not exist, but error importing: {ex}."
+        )
+
+    table = pa.table(
+        {
+            "geometry": shapely.to_wkb(shapely.points(np.ones((3, 2)))),
+            "col_flat": [0, 1, 2],
+            "col_struct": [{"a": 1, "b": 2}] * 3,
+            "col_nested": [[{"a": 1, "b": 2}] * 2] * 3,
+            "col_list": [[1, 2, 3]] * 3,
+        }
+    )
+    pq.write_table(table, fixture_path)
+
+    return fixture_path
+
+
 @pytest.fixture(scope="function")
 def datetime_file(tmp_path):
     # create GeoJSON file with millisecond precision
2.87 KB — Binary file not shown.
2.63 KB — Binary file not shown.
(The two new Parquet test fixtures created by the conftest helpers above.)
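For reference, the parametrized fixture added in conftest.py fans a single test body out over both formats: pytest collects one test per param value. A minimal standalone sketch of the pattern (hypothetical names, not the pyogrio fixtures):

    import pytest


    @pytest.fixture(params=[".geojson", ".parquet"])
    def sample_file(request):
        # hypothetical stand-in for list_field_values_files
        return f"list_field_values{request.param}"


    def test_suffix(sample_file):
        # pytest generates test_suffix[.geojson] and test_suffix[.parquet]
        assert sample_file.endswith((".geojson", ".parquet"))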

pyogrio/tests/test_geopandas_io.py

Lines changed: 119 additions & 18 deletions
@@ -379,20 +379,29 @@ def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow):
     assert_series_equal(df_read.datetime_col, expected)
 
 
-def test_read_list_types(list_field_values_file, use_arrow):
+def test_read_list_types(list_field_values_files, use_arrow):
     """Test reading a geojson file containing fields with lists."""
-    info = read_info(list_field_values_file)
-    result = read_dataframe(list_field_values_file, use_arrow=use_arrow)
+    if list_field_values_files.suffix == ".parquet" and not GDAL_HAS_PARQUET_DRIVER:
+        pytest.skip(
+            "Skipping test for parquet as the GDAL Parquet driver is not available"
+        )
+
+    info = read_info(list_field_values_files)
+    suffix = list_field_values_files.suffix
 
+    result = read_dataframe(list_field_values_files, use_arrow=use_arrow)
+
+    # Check list_int column
     assert "list_int" in result.columns
     assert info["fields"][1] == "list_int"
-    assert info["ogr_types"][1] == "OFTIntegerList"
+    assert info["ogr_types"][1] in ("OFTIntegerList", "OFTInteger64List")
     assert result["list_int"][0].tolist() == [0, 1]
     assert result["list_int"][1].tolist() == [2, 3]
     assert result["list_int"][2].tolist() == []
     assert result["list_int"][3] is None
     assert result["list_int"][4] is None
 
+    # Check list_double column
     assert "list_double" in result.columns
     assert info["fields"][2] == "list_double"
     assert info["ogr_types"][2] == "OFTRealList"
@@ -402,6 +411,7 @@ def test_read_list_types(list_field_values_file, use_arrow):
     assert result["list_double"][3] is None
     assert result["list_double"][4] is None
 
+    # Check list_string column
     assert "list_string" in result.columns
     assert info["fields"][3] == "list_string"
     assert info["ogr_types"][3] == "OFTStringList"
@@ -411,31 +421,122 @@ def test_read_list_types(list_field_values_file, use_arrow):
     assert result["list_string"][3] is None
     assert result["list_string"][4] == [""]
 
-    # Once any row of a column contains a null value in a list (in the test geojson),
-    # the column isn't recognized as a list column anymore, but as a JSON column.
-    # Because JSON columns containing JSON Arrays are also parsed to python lists, the
-    # end result is the same...
+    # Check list_int_with_null column
+    if suffix == ".geojson":
+        # Once any row of a column contains a null value in a list, the column isn't
+        # recognized as a list column anymore for .geojson files, but as a JSON column.
+        # Because JSON columns containing JSON Arrays are also parsed to python lists,
+        # the end result is the same...
+        exp_type = "OFTString"
+        exp_subtype = "OFSTJSON"
+        exp_list_int_with_null_value = [0, None]
+    else:
+        # For .parquet files, the list column is preserved as a list column.
+        exp_type = "OFTInteger64List"
+        exp_subtype = "OFSTNone"
+        if use_arrow:
+            exp_list_int_with_null_value = [0.0, np.nan]
+        else:
+            exp_list_int_with_null_value = [0, 0]
+            # xfail: when reading a list of int with None values without Arrow from a
+            # .parquet file, the None values become 0, which is wrong.
+            # https://github.com/OSGeo/gdal/issues/13448
+
     assert "list_int_with_null" in result.columns
     assert info["fields"][4] == "list_int_with_null"
-    assert info["ogr_types"][4] == "OFTString"
-    assert info["ogr_subtypes"][4] == "OFSTJSON"
-    assert result["list_int_with_null"][0] == [0, None]
-    assert result["list_int_with_null"][1] == [2, 3]
-    assert result["list_int_with_null"][2] == []
+    assert info["ogr_types"][4] == exp_type
+    assert info["ogr_subtypes"][4] == exp_subtype
+    assert result["list_int_with_null"][0][0] == 0
+    if exp_list_int_with_null_value[1] == 0:
+        assert result["list_int_with_null"][0][1] == exp_list_int_with_null_value[1]
+    else:
+        assert pd.isna(result["list_int_with_null"][0][1])
+
+    if suffix == ".geojson":
+        # For .geojson, the lists are already python lists
+        assert result["list_int_with_null"][1] == [2, 3]
+        assert result["list_int_with_null"][2] == []
+    else:
+        # For .parquet, the lists are numpy arrays
+        assert result["list_int_with_null"][1].tolist() == [2, 3]
+        assert result["list_int_with_null"][2].tolist() == []
+
     assert pd.isna(result["list_int_with_null"][3])
     assert pd.isna(result["list_int_with_null"][4])
 
+    # Check list_string_with_null column
+    if suffix == ".geojson":
+        # Once any row of a column contains a null value in a list, the column isn't
+        # recognized as a list column anymore for .geojson files, but as a JSON column.
+        # Because JSON columns containing JSON Arrays are also parsed to python lists,
+        # the end result is the same...
+        exp_type = "OFTString"
+        exp_subtype = "OFSTJSON"
+    else:
+        # For .parquet files, the list column is preserved as a list column.
+        exp_type = "OFTStringList"
+        exp_subtype = "OFSTNone"
+
     assert "list_string_with_null" in result.columns
     assert info["fields"][5] == "list_string_with_null"
-    assert info["ogr_types"][5] == "OFTString"
-    assert info["ogr_subtypes"][5] == "OFSTJSON"
-    assert result["list_string_with_null"][0] == ["string1", None]
-    assert result["list_string_with_null"][1] == ["string3", "string4", ""]
-    assert result["list_string_with_null"][2] == []
+    assert info["ogr_types"][5] == exp_type
+    assert info["ogr_subtypes"][5] == exp_subtype
+
+    if suffix == ".geojson":
+        # For .geojson, the lists are already python lists
+        assert result["list_string_with_null"][0] == ["string1", None]
+        assert result["list_string_with_null"][1] == ["string3", "string4", ""]
+        assert result["list_string_with_null"][2] == []
+    else:
+        # For .parquet, the lists are numpy arrays
+        # When use_arrow=False, the None becomes an empty string, which is wrong.
+        exp_value = ["string1", ""] if not use_arrow else ["string1", None]
+        assert result["list_string_with_null"][0].tolist() == exp_value
+        assert result["list_string_with_null"][1].tolist() == ["string3", "string4", ""]
+        assert result["list_string_with_null"][2].tolist() == []
+
     assert pd.isna(result["list_string_with_null"][3])
     assert result["list_string_with_null"][4] == [""]
 
 
+@pytest.mark.requires_arrow_write_api
+@pytest.mark.skipif(
+    not GDAL_HAS_PARQUET_DRIVER, reason="Parquet driver is not available"
+)
+def test_read_list_nested_struct_parquet_file(
+    list_nested_struct_parquet_file, use_arrow
+):
+    """Test reading a Parquet file containing nested struct and list types."""
+    if not use_arrow:
+        pytest.skip(
+            "When use_arrow=False, gdal flattens nested columns to separate columns. "
+            "Not sure how we want to deal with this case, but for now just skip."
+        )
+
+    result = read_dataframe(list_nested_struct_parquet_file, use_arrow=use_arrow)
+
+    assert "col_flat" in result.columns
+    assert np.array_equal(result["col_flat"].to_numpy(), np.array([0, 1, 2]))
+
+    assert "col_list" in result.columns
+    assert result["col_list"].dtype == object
+    assert result["col_list"][0].tolist() == [1, 2, 3]
+    assert result["col_list"][1].tolist() == [1, 2, 3]
+    assert result["col_list"][2].tolist() == [1, 2, 3]
+
+    assert "col_nested" in result.columns
+    assert result["col_nested"].dtype == object
+    assert result["col_nested"][0].tolist() == [{"a": 1, "b": 2}, {"a": 1, "b": 2}]
+    assert result["col_nested"][1].tolist() == [{"a": 1, "b": 2}, {"a": 1, "b": 2}]
+    assert result["col_nested"][2].tolist() == [{"a": 1, "b": 2}, {"a": 1, "b": 2}]
+
+    assert "col_struct" in result.columns
+    assert result["col_struct"].dtype == object
+    assert result["col_struct"][0] == {"a": 1, "b": 2}
+    assert result["col_struct"][1] == {"a": 1, "b": 2}
+    assert result["col_struct"][2] == {"a": 1, "b": 2}
+
+
 @pytest.mark.filterwarnings(
     "ignore: Non-conformant content for record 1 in column dates"
 )
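The geojson/parquet branches in the test above encode a real driver difference: for GeoJSON, a list field that ever contains a null is reported by OGR as a JSON string field, while the Parquet driver keeps it as a native list field. A hedged sketch of inspecting this with read_info (hypothetical file paths; assumes a GDAL build with the Parquet driver):

    from pyogrio import read_info

    for path in ["list_field_values.geojson", "list_field_values.parquet"]:
        info = read_info(path)
        # look up the column by name rather than by position
        idx = list(info["fields"]).index("list_int_with_null")
        print(path, info["ogr_types"][idx], info["ogr_subtypes"][idx])

    # Expected (per the assertions above):
    # list_field_values.geojson OFTString OFSTJSON
    # list_field_values.parquet OFTInteger64List OFSTNone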
