Commit d4f51b3

BUG: fix support to read parquet files with list columns (#597)

1 parent 8020035 commit d4f51b3

File tree

6 files changed: +245 -30 lines

CHANGES.md

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@
 ### Improvements
 
 - Add listing of GDAL data types and subtypes to `read_info` (#556).
-- Add support to read list fields without arrow (#558).
+- Add support to read list fields without arrow (#558, #597).
 
 ### Bug fixes
 
pyogrio/geopandas.py

Lines changed: 22 additions & 4 deletions
@@ -332,9 +332,19 @@ def read_dataframe(
 
         del table
 
-        for ogr_subtype, c in zip(meta["ogr_subtypes"], df.columns):
+        for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"]):
             if ogr_subtype == "OFSTJSON":
-                df[c] = df[c].map(json.loads, na_action="ignore")
+                # When reading .parquet files with arrow, JSON fields are already
+                # parsed, so only parse if strings.
+                dtype = pd.api.types.infer_dtype(df[c])
+                if dtype == "string":
+                    try:
+                        df[c] = df[c].map(json.loads, na_action="ignore")
+                    except Exception:
+                        warnings.warn(
+                            f"Could not parse column '{c}' as JSON; leaving as string",
+                            stacklevel=2,
+                        )
 
         if fid_as_index:
             df = df.set_index(meta["fid_column"])
@@ -378,9 +388,17 @@ def read_dataframe(
         for dtype, c in zip(meta["dtypes"], df.columns):
             if dtype.startswith("datetime"):
                 df[c] = _try_parse_datetime(df[c])
-        for ogr_subtype, c in zip(meta["ogr_subtypes"], df.columns):
+        for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"]):
             if ogr_subtype == "OFSTJSON":
-                df[c] = df[c].map(json.loads, na_action="ignore")
+                dtype = pd.api.types.infer_dtype(df[c])
+                if dtype == "string":
+                    try:
+                        df[c] = df[c].map(json.loads, na_action="ignore")
+                    except Exception:
+                        warnings.warn(
+                            f"Could not parse column '{c}' as JSON; leaving as string",
+                            stacklevel=2,
+                        )
 
         if geometry is None or not read_geometry:
             return df
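The dtype check above matters because the Arrow read path can hand back OFSTJSON columns that already contain parsed Python objects (e.g. when GDAL's Parquet driver is used), while GeoJSON sources yield raw JSON strings. A minimal sketch of the distinction, using only pandas (not pyogrio code):

    import json

    import pandas as pd

    # e.g. Parquet via Arrow: values are already Python lists
    already_parsed = pd.Series([[0, 1], [2, 3], None])
    # e.g. GeoJSON: values are raw JSON strings
    raw_json = pd.Series(["[0, 1]", "[2, 3]", None])

    print(pd.api.types.infer_dtype(already_parsed))  # "mixed" -> leave as-is
    print(pd.api.types.infer_dtype(raw_json))        # "string" -> safe to json.loads

    parsed = raw_json.map(json.loads, na_action="ignore")
    print(parsed[0])  # [0, 1]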

pyogrio/tests/conftest.py

Lines changed: 103 additions & 7 deletions
@@ -1,14 +1,12 @@
+"""Module with helper functions, fixtures, and common test data for pyogrio tests."""
+
 from io import BytesIO
 from pathlib import Path
 from zipfile import ZIP_DEFLATED, ZipFile
 
 import numpy as np
 
-from pyogrio import (
-    __gdal_version_string__,
-    __version__,
-    list_drivers,
-)
+from pyogrio import __gdal_version_string__, __version__, list_drivers
 from pyogrio._compat import (
     GDAL_GE_37,
     HAS_ARROW_WRITE_API,
@@ -203,8 +201,7 @@ def no_geometry_file(tmp_path):
     return filename
 
 
-@pytest.fixture(scope="function")
-def list_field_values_file(tmp_path):
+def list_field_values_geojson_file(tmp_path):
     # Create a GeoJSON file with list values in a property
     list_geojson = """{
         "type": "FeatureCollection",
@@ -279,6 +276,66 @@ def list_field_values_file(tmp_path):
     return filename
 
 
+def list_field_values_parquet_file():
+    """Return the path to a Parquet file with list values in a property.
+
+    Because pyarrow.parquet is typically not available in the CI environments, we save
+    the file in the test data directory instead of always creating it from scratch.
+
+    The code to create it is here though, in case it needs to be recreated later.
+    """
+    # Check if the file already exists in the test data dir
+    fixture_path = _data_dir / "list_field_values_file.parquet"
+    if fixture_path.exists():
+        return fixture_path
+
+    # The file doesn't exist, so create it
+    try:
+        import pyarrow as pa
+        from pyarrow import parquet as pq
+
+        import shapely
+    except ImportError as ex:
+        raise RuntimeError(
+            f"test file {fixture_path} does not exist, but error importing: {ex}."
+        )
+
+    table = pa.table(
+        {
+            "geometry": shapely.to_wkb(shapely.points(np.ones((5, 2)))),
+            "int": [1, 2, 3, 4, 5],
+            "list_int": [[0, 1], [2, 3], [], None, None],
+            "list_double": [[0.0, 1.0], [2.0, 3.0], [], None, None],
+            "list_string": [
+                ["string1", "string2"],
+                ["string3", "string4", ""],
+                [],
+                None,
+                [""],
+            ],
+            "list_int_with_null": [[0, None], [2, 3], [], None, None],
+            "list_string_with_null": [
+                ["string1", None],
+                ["string3", "string4", ""],
+                [],
+                None,
+                [""],
+            ],
+        }
+    )
+    pq.write_table(table, fixture_path)
+
+    return fixture_path
+
+
+@pytest.fixture(scope="function", params=[".geojson", ".parquet"])
+def list_field_values_files(tmp_path, request):
+    if request.param == ".geojson":
+        return list_field_values_geojson_file(tmp_path)
+    elif request.param == ".parquet":
+        return list_field_values_parquet_file()
+
+
 @pytest.fixture(scope="function")
 def nested_geojson_file(tmp_path):
     # create GeoJSON file with nested properties
@@ -308,6 +365,45 @@ def list_field_values_file(tmp_path):
     return filename
 
 
+@pytest.fixture(scope="function")
+def list_nested_struct_parquet_file(tmp_path):
+    """Return the path to a Parquet file with nested values in a property.
+
+    Because pyarrow.parquet is typically not available in the CI environments, we save
+    the file in the test data directory instead of always creating it from scratch.
+
+    The code to create it is here though, in case it needs to be recreated later.
+    """
+    # Check if the file already exists in the test data dir
+    fixture_path = _data_dir / "list_nested_struct_file.parquet"
+    if fixture_path.exists():
+        return fixture_path
+
+    # The file doesn't exist, so create it
+    try:
+        import pyarrow as pa
+        from pyarrow import parquet as pq
+
+        import shapely
+    except ImportError as ex:
+        raise RuntimeError(
+            f"test file {fixture_path} does not exist, but error importing: {ex}."
+        )
+
+    table = pa.table(
+        {
+            "geometry": shapely.to_wkb(shapely.points(np.ones((3, 2)))),
+            "col_flat": [0, 1, 2],
+            "col_struct": [{"a": 1, "b": 2}] * 3,
+            "col_nested": [[{"a": 1, "b": 2}] * 2] * 3,
+            "col_list": [[1, 2, 3]] * 3,
+        }
+    )
+    pq.write_table(table, fixture_path)
+
+    return fixture_path
+
+
 @pytest.fixture(scope="function")
 def datetime_file(tmp_path):
     # create GeoJSON file with millisecond precision
2.87 KB — Binary file not shown.
2.63 KB — Binary file not shown.
(The two new Parquet test fixtures created by the conftest helpers above.)
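For reference, the parametrized fixture added in conftest.py fans a single test body out over both formats: pytest collects one test per param value. A minimal standalone sketch of the pattern (hypothetical names, not the pyogrio fixtures):

    import pytest


    @pytest.fixture(params=[".geojson", ".parquet"])
    def sample_file(request):
        # hypothetical stand-in for list_field_values_files
        return f"list_field_values{request.param}"


    def test_suffix(sample_file):
        # pytest generates test_suffix[.geojson] and test_suffix[.parquet]
        assert sample_file.endswith((".geojson", ".parquet"))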

pyogrio/tests/test_geopandas_io.py

Lines changed: 119 additions & 18 deletions
@@ -379,20 +379,29 @@ def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow):
     assert_series_equal(df_read.datetime_col, expected)
 
 
-def test_read_list_types(list_field_values_file, use_arrow):
+def test_read_list_types(list_field_values_files, use_arrow):
     """Test reading a geojson file containing fields with lists."""
-    info = read_info(list_field_values_file)
-    result = read_dataframe(list_field_values_file, use_arrow=use_arrow)
+    if list_field_values_files.suffix == ".parquet" and not GDAL_HAS_PARQUET_DRIVER:
+        pytest.skip(
+            "Skipping test for parquet as the GDAL Parquet driver is not available"
+        )
+
+    info = read_info(list_field_values_files)
+    suffix = list_field_values_files.suffix
 
+    result = read_dataframe(list_field_values_files, use_arrow=use_arrow)
+
+    # Check list_int column
     assert "list_int" in result.columns
     assert info["fields"][1] == "list_int"
-    assert info["ogr_types"][1] == "OFTIntegerList"
+    assert info["ogr_types"][1] in ("OFTIntegerList", "OFTInteger64List")
     assert result["list_int"][0].tolist() == [0, 1]
     assert result["list_int"][1].tolist() == [2, 3]
     assert result["list_int"][2].tolist() == []
     assert result["list_int"][3] is None
     assert result["list_int"][4] is None
 
+    # Check list_double column
     assert "list_double" in result.columns
     assert info["fields"][2] == "list_double"
     assert info["ogr_types"][2] == "OFTRealList"
@@ -402,6 +411,7 @@ def test_read_list_types(list_field_values_file, use_arrow):
     assert result["list_double"][3] is None
     assert result["list_double"][4] is None
 
+    # Check list_string column
     assert "list_string" in result.columns
     assert info["fields"][3] == "list_string"
     assert info["ogr_types"][3] == "OFTStringList"
@@ -411,31 +421,122 @@ def test_read_list_types(list_field_values_file, use_arrow):
     assert result["list_string"][3] is None
     assert result["list_string"][4] == [""]
 
-    # Once any row of a column contains a null value in a list (in the test geojson),
-    # the column isn't recognized as a list column anymore, but as a JSON column.
-    # Because JSON columns containing JSON Arrays are also parsed to python lists, the
-    # end result is the same...
+    # Check list_int_with_null column
+    if suffix == ".geojson":
+        # Once any row of a column contains a null value in a list, the column isn't
+        # recognized as a list column anymore for .geojson files, but as a JSON column.
+        # Because JSON columns containing JSON Arrays are also parsed to python lists,
+        # the end result is the same...
+        exp_type = "OFTString"
+        exp_subtype = "OFSTJSON"
+        exp_list_int_with_null_value = [0, None]
+    else:
+        # For .parquet files, the list column is preserved as a list column.
+        exp_type = "OFTInteger64List"
+        exp_subtype = "OFSTNone"
+        if use_arrow:
+            exp_list_int_with_null_value = [0.0, np.nan]
+        else:
+            exp_list_int_with_null_value = [0, 0]
+            # xfail: when reading a list of int with None values without Arrow from a
+            # .parquet file, the None values become 0, which is wrong.
+            # https://github.com/OSGeo/gdal/issues/13448
+
     assert "list_int_with_null" in result.columns
     assert info["fields"][4] == "list_int_with_null"
-    assert info["ogr_types"][4] == "OFTString"
-    assert info["ogr_subtypes"][4] == "OFSTJSON"
-    assert result["list_int_with_null"][0] == [0, None]
-    assert result["list_int_with_null"][1] == [2, 3]
-    assert result["list_int_with_null"][2] == []
+    assert info["ogr_types"][4] == exp_type
+    assert info["ogr_subtypes"][4] == exp_subtype
+    assert result["list_int_with_null"][0][0] == 0
+    if exp_list_int_with_null_value[1] == 0:
+        assert result["list_int_with_null"][0][1] == exp_list_int_with_null_value[1]
+    else:
+        assert pd.isna(result["list_int_with_null"][0][1])
+
+    if suffix == ".geojson":
+        # For .geojson, the lists are already python lists
+        assert result["list_int_with_null"][1] == [2, 3]
+        assert result["list_int_with_null"][2] == []
+    else:
+        # For .parquet, the lists are numpy arrays
+        assert result["list_int_with_null"][1].tolist() == [2, 3]
+        assert result["list_int_with_null"][2].tolist() == []
+
     assert pd.isna(result["list_int_with_null"][3])
     assert pd.isna(result["list_int_with_null"][4])
 
+    # Check list_string_with_null column
+    if suffix == ".geojson":
+        # Once any row of a column contains a null value in a list, the column isn't
+        # recognized as a list column anymore for .geojson files, but as a JSON column.
+        # Because JSON columns containing JSON Arrays are also parsed to python lists,
+        # the end result is the same...
+        exp_type = "OFTString"
+        exp_subtype = "OFSTJSON"
+    else:
+        # For .parquet files, the list column is preserved as a list column.
+        exp_type = "OFTStringList"
+        exp_subtype = "OFSTNone"
+
     assert "list_string_with_null" in result.columns
     assert info["fields"][5] == "list_string_with_null"
-    assert info["ogr_types"][5] == "OFTString"
-    assert info["ogr_subtypes"][5] == "OFSTJSON"
-    assert result["list_string_with_null"][0] == ["string1", None]
-    assert result["list_string_with_null"][1] == ["string3", "string4", ""]
-    assert result["list_string_with_null"][2] == []
+    assert info["ogr_types"][5] == exp_type
+    assert info["ogr_subtypes"][5] == exp_subtype
+
+    if suffix == ".geojson":
+        # For .geojson, the lists are already python lists
+        assert result["list_string_with_null"][0] == ["string1", None]
+        assert result["list_string_with_null"][1] == ["string3", "string4", ""]
+        assert result["list_string_with_null"][2] == []
+    else:
+        # For .parquet, the lists are numpy arrays
+        # When use_arrow=False, the None becomes an empty string, which is wrong.
+        exp_value = ["string1", ""] if not use_arrow else ["string1", None]
+        assert result["list_string_with_null"][0].tolist() == exp_value
+        assert result["list_string_with_null"][1].tolist() == ["string3", "string4", ""]
+        assert result["list_string_with_null"][2].tolist() == []
+
     assert pd.isna(result["list_string_with_null"][3])
     assert result["list_string_with_null"][4] == [""]
 
 
+@pytest.mark.requires_arrow_write_api
+@pytest.mark.skipif(
+    not GDAL_HAS_PARQUET_DRIVER, reason="Parquet driver is not available"
+)
+def test_read_list_nested_struct_parquet_file(
+    list_nested_struct_parquet_file, use_arrow
+):
+    """Test reading a Parquet file containing nested struct and list types."""
+    if not use_arrow:
+        pytest.skip(
+            "When use_arrow=False, gdal flattens nested columns to separate columns. "
+            "Not sure how we want to deal with this case, but for now just skip."
+        )
+
+    result = read_dataframe(list_nested_struct_parquet_file, use_arrow=use_arrow)
+
+    assert "col_flat" in result.columns
+    assert np.array_equal(result["col_flat"].to_numpy(), np.array([0, 1, 2]))
+
+    assert "col_list" in result.columns
+    assert result["col_list"].dtype == object
+    assert result["col_list"][0].tolist() == [1, 2, 3]
+    assert result["col_list"][1].tolist() == [1, 2, 3]
+    assert result["col_list"][2].tolist() == [1, 2, 3]
+
+    assert "col_nested" in result.columns
+    assert result["col_nested"].dtype == object
+    assert result["col_nested"][0].tolist() == [{"a": 1, "b": 2}, {"a": 1, "b": 2}]
+    assert result["col_nested"][1].tolist() == [{"a": 1, "b": 2}, {"a": 1, "b": 2}]
+    assert result["col_nested"][2].tolist() == [{"a": 1, "b": 2}, {"a": 1, "b": 2}]
+
+    assert "col_struct" in result.columns
+    assert result["col_struct"].dtype == object
+    assert result["col_struct"][0] == {"a": 1, "b": 2}
+    assert result["col_struct"][1] == {"a": 1, "b": 2}
+    assert result["col_struct"][2] == {"a": 1, "b": 2}
+
+
 @pytest.mark.filterwarnings(
     "ignore: Non-conformant content for record 1 in column dates"
 )
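The geojson/parquet branches in the test above encode a real driver difference: for GeoJSON, a list field that ever contains a null is reported by OGR as a JSON string field, while the Parquet driver keeps it as a native list field. A hedged sketch of inspecting this with read_info (hypothetical file paths; assumes a GDAL build with the Parquet driver):

    from pyogrio import read_info

    for path in ["list_field_values.geojson", "list_field_values.parquet"]:
        info = read_info(path)
        # look up the column by name rather than by position
        idx = list(info["fields"]).index("list_int_with_null")
        print(path, info["ogr_types"][idx], info["ogr_subtypes"][idx])

    # Expected (per the assertions above):
    # list_field_values.geojson OFTString OFSTJSON
    # list_field_values.parquet OFTInteger64List OFSTNone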
