diff --git a/stac_geoparquet/arrow/_to_parquet.py b/stac_geoparquet/arrow/_to_parquet.py
index 7197d16..bdb6ba0 100644
--- a/stac_geoparquet/arrow/_to_parquet.py
+++ b/stac_geoparquet/arrow/_to_parquet.py
@@ -1,6 +1,6 @@
 import json
 from pathlib import Path
-from typing import Any, Iterable, Optional, Union
+from typing import Any, Dict, Iterable, Optional, Union
 
 import pyarrow as pa
 import pyarrow.parquet as pq
@@ -35,7 +35,9 @@ def parse_stac_ndjson_to_parquet(
         input_path, chunk_size=chunk_size, schema=schema
     )
     first_batch = next(batches_iter)
-    schema = first_batch.schema.with_metadata(_create_geoparquet_metadata())
+    schema = first_batch.schema.with_metadata(
+        _create_geoparquet_metadata(pa.Table.from_batches([first_batch]))
+    )
     with pq.ParquetWriter(output_path, schema, **kwargs) as writer:
         writer.write_batch(first_batch)
         for batch in batches_iter:
@@ -52,13 +54,13 @@ def to_parquet(table: pa.Table, where: Any, **kwargs: Any) -> None:
         where: The destination for saving.
     """
     metadata = table.schema.metadata or {}
-    metadata.update(_create_geoparquet_metadata())
+    metadata.update(_create_geoparquet_metadata(table))
 
     table = table.replace_schema_metadata(metadata)
     pq.write_table(table, where, **kwargs)
 
 
-def _create_geoparquet_metadata() -> dict[bytes, bytes]:
+def _create_geoparquet_metadata(table: pa.Table) -> dict[bytes, bytes]:
     # TODO: include bbox of geometries
     column_meta = {
         "encoding": "WKB",
@@ -75,9 +77,24 @@ def _create_geoparquet_metadata() -> dict[bytes, bytes]:
             }
         },
     }
-    geo_meta = {
+    geo_meta: Dict[str, Any] = {
         "version": "1.1.0-dev",
         "columns": {"geometry": column_meta},
         "primary_column": "geometry",
     }
+
+    if "proj:geometry" in table.schema.names:
+        # Note we don't include proj:bbox as a covering here for a couple different
+        # reasons. For one, it's very common for the projected geometries to have a
+        # different CRS in each row, so having statistics for proj:bbox wouldn't be
+        # useful. Additionally, because of this we leave proj:bbox as a list instead of
+        # a struct.
+        geo_meta["columns"]["proj:geometry"] = {
+            "encoding": "WKB",
+            "geometry_types": [],
+            # Note that we have to set CRS to `null` to signify that the CRS is unknown.
+            # If the CRS key is missing, it gets inferred as WGS84.
+            "crs": None,
+        }
+
     return {b"geo": json.dumps(geo_meta).encode("utf-8")}
diff --git a/tests/test_arrow.py b/tests/test_arrow.py
index f51787b..78584b0 100644
--- a/tests/test_arrow.py
+++ b/tests/test_arrow.py
@@ -1,7 +1,9 @@
 import json
+from io import BytesIO
 from pathlib import Path
 
 import pyarrow as pa
+import pyarrow.parquet as pq
 import pytest
 
 from stac_geoparquet.arrow import (
@@ -9,6 +11,7 @@
     parse_stac_ndjson_to_arrow,
     stac_table_to_items,
     stac_table_to_ndjson,
+    to_parquet,
 )
 
 from .json_equals import assert_json_value_equal
@@ -94,3 +97,23 @@ def test_from_arrow_deprecated():
         import stac_geoparquet.from_arrow
 
     stac_geoparquet.from_arrow.stac_table_to_items
+
+
+def test_to_parquet_two_geometry_columns():
+    """
+    When writing STAC Items that have a proj:geometry field, there should be two
+    geometry columns listed in the GeoParquet metadata.
+    """
+    with open(HERE / "data" / "3dep-lidar-copc-pc.json") as f:
+        items = json.load(f)
+
+    table = pa.Table.from_batches(parse_stac_items_to_arrow(items))
+    with BytesIO() as bio:
+        to_parquet(table, bio)
+        bio.seek(0)
+        pq_meta = pq.read_metadata(bio)
+
+    geo_meta = json.loads(pq_meta.metadata[b"geo"])
+    assert geo_meta["primary_column"] == "geometry"
+    assert "geometry" in geo_meta["columns"].keys()
+    assert "proj:geometry" in geo_meta["columns"].keys()