@@ -379,20 +379,29 @@ def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow):
379379 assert_series_equal (df_read .datetime_col , expected )
380380
381381
382- def test_read_list_types (list_field_values_file , use_arrow ):
382+ def test_read_list_types (list_field_values_files , use_arrow ):
383383 """Test reading a geojson file containing fields with lists."""
384- info = read_info (list_field_values_file )
385- result = read_dataframe (list_field_values_file , use_arrow = use_arrow )
384+ if list_field_values_files .suffix == ".parquet" and not GDAL_HAS_PARQUET_DRIVER :
385+ pytest .skip (
386+ "Skipping test for parquet as the GDAL Parquet driver is not available"
387+ )
388+
389+ info = read_info (list_field_values_files )
390+ suffix = list_field_values_files .suffix
386391
392+ result = read_dataframe (list_field_values_files , use_arrow = use_arrow )
393+
394+ # Check list_int column
387395 assert "list_int" in result .columns
388396 assert info ["fields" ][1 ] == "list_int"
389- assert info ["ogr_types" ][1 ] == "OFTIntegerList"
397+ assert info ["ogr_types" ][1 ] in ( "OFTIntegerList" , "OFTInteger64List" )
390398 assert result ["list_int" ][0 ].tolist () == [0 , 1 ]
391399 assert result ["list_int" ][1 ].tolist () == [2 , 3 ]
392400 assert result ["list_int" ][2 ].tolist () == []
393401 assert result ["list_int" ][3 ] is None
394402 assert result ["list_int" ][4 ] is None
395403
404+ # Check list_double column
396405 assert "list_double" in result .columns
397406 assert info ["fields" ][2 ] == "list_double"
398407 assert info ["ogr_types" ][2 ] == "OFTRealList"
@@ -402,6 +411,7 @@ def test_read_list_types(list_field_values_file, use_arrow):
402411 assert result ["list_double" ][3 ] is None
403412 assert result ["list_double" ][4 ] is None
404413
414+ # Check list_string column
405415 assert "list_string" in result .columns
406416 assert info ["fields" ][3 ] == "list_string"
407417 assert info ["ogr_types" ][3 ] == "OFTStringList"
@@ -411,31 +421,122 @@ def test_read_list_types(list_field_values_file, use_arrow):
411421 assert result ["list_string" ][3 ] is None
412422 assert result ["list_string" ][4 ] == ["" ]
413423
414- # Once any row of a column contains a null value in a list (in the test geojson),
415- # the column isn't recognized as a list column anymore, but as a JSON column.
416- # Because JSON columns containing JSON Arrays are also parsed to python lists, the
417- # end result is the same...
424+ # Check list_int_with_null column
425+ if suffix == ".geojson" :
426+ # Once any row of a column contains a null value in a list, the column isn't
427+ # recognized as a list column anymore for .geojson files, but as a JSON column.
428+ # Because JSON columns containing JSON Arrays are also parsed to python lists,
429+ # the end result is the same...
430+ exp_type = "OFTString"
431+ exp_subtype = "OFSTJSON"
432+ exp_list_int_with_null_value = [0 , None ]
433+ else :
434+ # For .parquet files, the list column is preserved as a list column.
435+ exp_type = "OFTInteger64List"
436+ exp_subtype = "OFSTNone"
437+ if use_arrow :
438+ exp_list_int_with_null_value = [0.0 , np .nan ]
439+ else :
440+ exp_list_int_with_null_value = [0 , 0 ]
441+ # xfail: when reading a list of int with None values without Arrow from a
442+ # .parquet file, the None values become 0, which is wrong.
443+ # https://github.com/OSGeo/gdal/issues/13448
444+
418445 assert "list_int_with_null" in result .columns
419446 assert info ["fields" ][4 ] == "list_int_with_null"
420- assert info ["ogr_types" ][4 ] == "OFTString"
421- assert info ["ogr_subtypes" ][4 ] == "OFSTJSON"
422- assert result ["list_int_with_null" ][0 ] == [0 , None ]
423- assert result ["list_int_with_null" ][1 ] == [2 , 3 ]
424- assert result ["list_int_with_null" ][2 ] == []
447+ assert info ["ogr_types" ][4 ] == exp_type
448+ assert info ["ogr_subtypes" ][4 ] == exp_subtype
449+ assert result ["list_int_with_null" ][0 ][0 ] == 0
450+ if exp_list_int_with_null_value [1 ] == 0 :
451+ assert result ["list_int_with_null" ][0 ][1 ] == exp_list_int_with_null_value [1 ]
452+ else :
453+ assert pd .isna (result ["list_int_with_null" ][0 ][1 ])
454+
455+ if suffix == ".geojson" :
456+ # For .geojson, the lists are already python lists
457+ assert result ["list_int_with_null" ][1 ] == [2 , 3 ]
458+ assert result ["list_int_with_null" ][2 ] == []
459+ else :
460+ # For .parquet, the lists are numpy arrays
461+ assert result ["list_int_with_null" ][1 ].tolist () == [2 , 3 ]
462+ assert result ["list_int_with_null" ][2 ].tolist () == []
463+
425464 assert pd .isna (result ["list_int_with_null" ][3 ])
426465 assert pd .isna (result ["list_int_with_null" ][4 ])
427466
467+ # Check list_string_with_null column
468+ if suffix == ".geojson" :
469+ # Once any row of a column contains a null value in a list, the column isn't
470+ # recognized as a list column anymore for .geojson files, but as a JSON column.
471+ # Because JSON columns containing JSON Arrays are also parsed to python lists,
472+ # the end result is the same...
473+ exp_type = "OFTString"
474+ exp_subtype = "OFSTJSON"
475+ else :
476+ # For .parquet files, the list column is preserved as a list column.
477+ exp_type = "OFTStringList"
478+ exp_subtype = "OFSTNone"
479+
428480 assert "list_string_with_null" in result .columns
429481 assert info ["fields" ][5 ] == "list_string_with_null"
430- assert info ["ogr_types" ][5 ] == "OFTString"
431- assert info ["ogr_subtypes" ][5 ] == "OFSTJSON"
432- assert result ["list_string_with_null" ][0 ] == ["string1" , None ]
433- assert result ["list_string_with_null" ][1 ] == ["string3" , "string4" , "" ]
434- assert result ["list_string_with_null" ][2 ] == []
482+ assert info ["ogr_types" ][5 ] == exp_type
483+ assert info ["ogr_subtypes" ][5 ] == exp_subtype
484+
485+ if suffix == ".geojson" :
486+ # For .geojson, the lists are already python lists
487+ assert result ["list_string_with_null" ][0 ] == ["string1" , None ]
488+ assert result ["list_string_with_null" ][1 ] == ["string3" , "string4" , "" ]
489+ assert result ["list_string_with_null" ][2 ] == []
490+ else :
491+ # For .parquet, the lists are numpy arrays
492+ # When use_arrow=False, the None becomes an empty string, which is wrong.
493+ exp_value = ["string1" , "" ] if not use_arrow else ["string1" , None ]
494+ assert result ["list_string_with_null" ][0 ].tolist () == exp_value
495+ assert result ["list_string_with_null" ][1 ].tolist () == ["string3" , "string4" , "" ]
496+ assert result ["list_string_with_null" ][2 ].tolist () == []
497+
435498 assert pd .isna (result ["list_string_with_null" ][3 ])
436499 assert result ["list_string_with_null" ][4 ] == ["" ]
437500
438501
@pytest.mark.requires_arrow_write_api
@pytest.mark.skipif(
    not GDAL_HAS_PARQUET_DRIVER, reason="Parquet driver is not available"
)
def test_read_list_nested_struct_parquet_file(
    list_nested_struct_parquet_file, use_arrow
):
    """Test reading a Parquet file containing nested struct and list types.

    The fixture file has 3 rows with four columns: a flat integer column, a
    list-of-int column, a list-of-struct column, and a struct column. Each row
    holds the same values, so every row is checked against the same expected
    value below.
    """
    if not use_arrow:
        pytest.skip(
            "When use_arrow=False, gdal flattens nested columns to separate columns. "
            "Not sure how we want to deal with this case, but for now just skip."
        )

    result = read_dataframe(list_nested_struct_parquet_file, use_arrow=use_arrow)

    # Plain integer column: read as a regular (non-object) column.
    assert "col_flat" in result.columns
    assert np.array_equal(result["col_flat"].to_numpy(), np.array([0, 1, 2]))

    # List column: object dtype, one array-like of ints per row.
    assert "col_list" in result.columns
    assert result["col_list"].dtype == object
    for row in range(3):
        assert result["col_list"][row].tolist() == [1, 2, 3]

    # List-of-struct column: object dtype, one array-like of dicts per row.
    assert "col_nested" in result.columns
    assert result["col_nested"].dtype == object
    for row in range(3):
        assert result["col_nested"][row].tolist() == [{"a": 1, "b": 2}, {"a": 1, "b": 2}]

    # Struct column: object dtype, one dict per row.
    assert "col_struct" in result.columns
    assert result["col_struct"].dtype == object
    for row in range(3):
        assert result["col_struct"][row] == {"a": 1, "b": 2}
539+
439540@pytest .mark .filterwarnings (
440541 "ignore: Non-conformant content for record 1 in column dates"
441542)
0 commit comments