From 9a37bfa86f958d1d987f3b0d0e28c75e1fbb8905 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Sun, 2 Jun 2024 15:17:21 +0200 Subject: [PATCH 1/3] Include proj:geometry column in GeoParquet metadata --- stac_geoparquet/arrow/_to_parquet.py | 19 +++++++++++++++++-- tests/test_arrow.py | 28 +++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/stac_geoparquet/arrow/_to_parquet.py b/stac_geoparquet/arrow/_to_parquet.py index 294e216..2d00f29 100644 --- a/stac_geoparquet/arrow/_to_parquet.py +++ b/stac_geoparquet/arrow/_to_parquet.py @@ -52,13 +52,13 @@ def to_parquet(table: pa.Table, where: Any, **kwargs: Any) -> None: where: The destination for saving. """ metadata = table.schema.metadata or {} - metadata.update(_create_geoparquet_metadata()) + metadata.update(_create_geoparquet_metadata(table)) table = table.replace_schema_metadata(metadata) pq.write_table(table, where, **kwargs) -def _create_geoparquet_metadata() -> dict[bytes, bytes]: +def _create_geoparquet_metadata(table: pa.Table) -> dict[bytes, bytes]: # TODO: include bbox of geometries column_meta = { "encoding": "WKB", @@ -80,4 +80,19 @@ def _create_geoparquet_metadata() -> dict[bytes, bytes]: "columns": {"geometry": column_meta}, "primary_column": "geometry", } + + if "proj:geometry" in table.schema.names: + # Note we don't include proj:bbox as a covering here for a couple different + # reasons. For one, it's very common for the projected geometries to have a + # different CRS in each row, so having statistics for proj:bbox wouldn't be + # useful. Additionally, because of this we leave proj:bbox as a list instead of + # a struct. + geo_meta["columns"]["proj:geometry"] = { + "encoding": "WKB", + "geometry_types": [], + # Note that we have to set CRS to `null` to signify that the CRS is unknown. + # If the CRS key is missing, it gets inferred as WGS84. + "crs": None, + } + return {b"geo": json.dumps(geo_meta).encode("utf-8")} diff --git a/tests/test_arrow.py b/tests/test_arrow.py index 2b9bca4..55c95ce 100644 --- a/tests/test_arrow.py +++ b/tests/test_arrow.py @@ -1,13 +1,19 @@ import json import math +from io import BytesIO from pathlib import Path from typing import Any, Dict, Sequence, Union import pyarrow as pa +import pyarrow.parquet as pq import pytest from ciso8601 import parse_rfc3339 -from stac_geoparquet.arrow import parse_stac_items_to_arrow, stac_table_to_items +from stac_geoparquet.arrow import ( + parse_stac_items_to_arrow, + stac_table_to_items, + to_parquet, +) HERE = Path(__file__).parent @@ -237,3 +243,23 @@ def test_from_arrow_deprecated(): import stac_geoparquet.from_arrow stac_geoparquet.from_arrow.stac_table_to_items + + +def test_to_parquet_two_geometry_columns(): + """ + When writing STAC Items that have a proj:geometry field, there should be two + geometry columns listed in the GeoParquet metadata. + """ + with open(HERE / "data" / "3dep-lidar-copc-pc.json") as f: + items = json.load(f) + + table = pa.Table.from_batches(parse_stac_items_to_arrow(items)) + with BytesIO() as bio: + to_parquet(table, bio) + bio.seek(0) + pq_meta = pq.read_metadata(bio) + + geo_meta = json.loads(pq_meta.metadata[b"geo"]) + assert geo_meta["primary_column"] == "geometry" + assert "geometry" in geo_meta["columns"].keys() + assert "proj:geometry" in geo_meta["columns"].keys() From ffd6379f187f43b76b057e8f5c089c8d523684a1 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Sun, 2 Jun 2024 18:18:24 +0200 Subject: [PATCH 2/3] fix test --- stac_geoparquet/arrow/_to_parquet.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/stac_geoparquet/arrow/_to_parquet.py b/stac_geoparquet/arrow/_to_parquet.py index 2d00f29..fad9921 100644 --- a/stac_geoparquet/arrow/_to_parquet.py +++ b/stac_geoparquet/arrow/_to_parquet.py @@ -35,7 +35,9 @@ def parse_stac_ndjson_to_parquet( input_path, chunk_size=chunk_size, schema=schema ) first_batch = next(batches_iter) - schema = first_batch.schema.with_metadata(_create_geoparquet_metadata()) + schema = first_batch.schema.with_metadata( + _create_geoparquet_metadata(pa.Table.from_batches([first_batch])) + ) with pq.ParquetWriter(output_path, schema, **kwargs) as writer: writer.write_batch(first_batch) for batch in batches_iter: From d3c369644f306ed923553c715c7c235b47768d04 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Tue, 4 Jun 2024 13:03:53 -0400 Subject: [PATCH 3/3] fix type check --- stac_geoparquet/arrow/_to_parquet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stac_geoparquet/arrow/_to_parquet.py b/stac_geoparquet/arrow/_to_parquet.py index 95971e2..bdb6ba0 100644 --- a/stac_geoparquet/arrow/_to_parquet.py +++ b/stac_geoparquet/arrow/_to_parquet.py @@ -1,6 +1,6 @@ import json from pathlib import Path -from typing import Any, Iterable, Optional, Union +from typing import Any, Dict, Iterable, Optional, Union import pyarrow as pa import pyarrow.parquet as pq @@ -77,7 +77,7 @@ def _create_geoparquet_metadata(table: pa.Table) -> dict[bytes, bytes]: } }, } - geo_meta = { + geo_meta: Dict[str, Any] = { "version": "1.1.0-dev", "columns": {"geometry": column_meta}, "primary_column": "geometry",