Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/continuous-integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,5 @@ jobs:
run: uv run pytest tests -v
- name: Check docs
run: uv run mkdocs build --strict
- name: Check jsonschema
run: check-jsonschema --schemafile spec/json-schema/metadata.json spec/example-metadata.json
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,18 @@ pc = ["adlfs", "azure-data-tables", "psycopg[binary,pool]", "pypgstac", "tqdm"]

[dependency-groups]
dev = [
"check-jsonschema",
"mypy",
"numpy>=2",
"ruff",
"pre-commit",
"pytest-recording>=0.13.2",
"pytest",
"requests",
"ruff",
"stac-geoparquet[pc]",
"stac-geoparquet[pgstac]",
"types-python-dateutil",
"types-requests",
"pytest-recording>=0.13.2",
"vcrpy>=7.0.0",
]
docs = [
Expand Down
42 changes: 42 additions & 0 deletions spec/example-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{
"stac:geoparquet_version": "1.0.0",
"stac:collections": {
"simple-collection": {
"id": "simple-collection",
"type": "Collection",
"stac_extensions": [],
"stac_version": "1.1.0",
"description": "A simple collection demonstrating core catalog fields with links to a couple of items",
"title": "Simple Example Collection",
"keywords": [
"simple",
"example",
"collection"
],
"providers": [],
"extent": {
"spatial": {
"bbox": [
[
172.91173669923782,
1.3438851951615003,
172.95469614953714,
1.3690476620161975
]
]
},
"temporal": {
"interval": [
[
"2020-12-11T22:38:32.125Z",
"2020-12-14T18:02:31.437Z"
]
]
}
},
"license": "CC-BY-4.0",
"summaries": {},
"links": []
}
}
}
27 changes: 27 additions & 0 deletions spec/json-schema/metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://stac-utils.github.io/stac-geoparquet/json-schema/metadata.json",
"title": "STAC GeoParquet Metadata",
"description": "JSON Schema for STAC GeoParquet metadata stored in Parquet file metadata",
"type": "object",
"properties": {
"stac:geoparquet_version": {
"type": "string",
"enum": ["1.0.0"],
"description": "The stac-geoparquet metadata version."
},
"stac:collections": {
"type": "object",
"additionalProperties": {
"$ref": "https://schemas.stacspec.org/v1.1.0/collection-spec/json-schema/collection.json"
},
"description": "A mapping of collection IDs to STAC Collection objects"
},
"stac:collection": {
"$ref": "https://schemas.stacspec.org/v1.1.0/collection-spec/json-schema/collection.json",
"description": "DEPRECATED. Use stac:collections instead. A STAC Collection object.",
"deprecated": true
}
},
"required": ["stac:geoparquet_version"]
}
46 changes: 40 additions & 6 deletions spec/stac-geoparquet-spec.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,11 @@ most of the fields should be the same in STAC and in GeoParquet.
| _property columns_ | _varies_ | - | Each property should use the relevant Parquet type, and be pulled out of the properties object to be a top-level Parquet field |

- Must be valid GeoParquet, with proper metadata. Ideally the geometry types are defined and as narrow as possible.
- Strongly recommend to only have one GeoParquet per STAC 'Collection'. Not doing this will lead to an expanded GeoParquet schema (the union of all the schemas of the collection) with lots of empty data
- Strongly recommend storing items that are mostly homogeneous (i.e. have the same fields). Parquet is a columnar format; storing items with many different fields will lead to an expanded Parquet schema with lots of empty data. In practice, this means storing a single collection or only collections with very similar item properties in a single stac-geoparquet dataset.
- Any field in 'properties' of the STAC item should be moved up to be a top-level field in the GeoParquet.
- STAC GeoParquet does not support properties that are named such that they collide with a top-level key.
- datetime columns should be stored as a [native timestamp][timestamp], not as a string
- The Collection JSON should be included in the Parquet metadata. See [Collection JSON](#including-a-stac-collection-json-in-a-stac-geoparquet-collection) below.
- The Collection JSON objects should be included in the Parquet metadata. See [Collection JSON](#stac-collection-objects) below.
- Any other properties that would be stored as GeoJSON in a STAC JSON Item (e.g. `proj:geometry`) should be stored as a binary column with WKB encoding. This simplifies the handling of collections with multiple geometry types.

### Link Struct
Expand Down Expand Up @@ -69,12 +69,45 @@ To take advantage of Parquet's columnar nature and compression, the assets shoul

See [Asset Object][asset] for more.

## Including a STAC Collection JSON in a STAC Geoparquet Collection
### Parquet Metadata

stac-geoparquet uses Parquet [File Metadata](https://parquet.apache.org/docs/file-format/metadata/) to store metadata about the dataset.

See [`example-metadata.json`](https://github.com/stac-utils/stac-geoparquet/blob/main/spec/example-metadata.json) for an example.

A [jsonschema schema file](https://github.com/stac-utils/stac-geoparquet/blob/main/spec/json-schema/metadata.json) is provided
for tools to validate against.

| Field Name | Type | Description |
| ---------- | ---- | ----------- |
| `stac:geoparquet_version` | string | The stac-geoparquet metadata version. Currently, only `"1.0.0"` is allowed |
| `stac:collections` | Map of string to STAC Collection object | A mapping of collection IDs to STAC Collection objects |
| `stac:collection` | STAC Collection object | **Deprecated**. Use `stac:collections` instead. |

Note that this metadata is distinct from the file metadata required by
[geoparquet].

#### Geoparquet Version

The field `stac:geoparquet_version` stores the version of the stac-geoparquet
spec the data complies with. Readers can use this field to understand what
features and fields are available.

Currently, the only allowed value is the string `"1.0.0"`.

#### STAC Collection Objects

To make a stac-geoparquet file a fully self-contained representation, you can
include the Collection JSON in the Parquet metadata. If present in the [Parquet
file metadata][parquet-metadata], the key must be `stac:collection` and the
value must be a JSON string with the Collection JSON.
include the Collection JSON documents in the Parquet metadata under the
`stac:collections` key. This should contain a mapping of STAC Collection IDs to
[Collection] objects. As usual, the collection ID in the mapping keys must match
the `id` in the Collection object.

Note that a previous version of this spec allowed storing a single collection
under the `stac:collection` (singular) key. This field is deprecated in favor of
`stac:collections`. Readers aiming for maximum compatibility should first check
`stac:collections` for collection information and fall back to the deprecated
`stac:collection` field.

## Referencing a STAC GeoParquet Collection in a STAC Collection JSON

Expand Down Expand Up @@ -105,3 +138,4 @@ The principles here can likely be used to map into other geospatial data formats
[common-media-types]: https://github.com/radiantearth/stac-spec/blob/master/best-practices.md#common-media-types-in-stac
[timestamp]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp
[parquet-metadata]: https://github.com/apache/parquet-format#metadata
[Collection]: https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md
4 changes: 2 additions & 2 deletions stac_geoparquet/arrow/_delta_lake.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from stac_geoparquet.arrow._to_parquet import (
DEFAULT_PARQUET_SCHEMA_VERSION,
SUPPORTED_PARQUET_SCHEMA_VERSIONS,
create_geoparquet_metadata,
create_parquet_metadata,
)

if TYPE_CHECKING:
Expand Down Expand Up @@ -51,7 +51,7 @@ def parse_stac_ndjson_to_delta_lake(
input_path, chunk_size=chunk_size, schema=schema, limit=limit
)
schema = record_batch_reader.schema.with_metadata(
create_geoparquet_metadata(
create_parquet_metadata(
record_batch_reader.schema, schema_version=schema_version
)
)
Expand Down
9 changes: 6 additions & 3 deletions stac_geoparquet/arrow/_to_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,14 @@ def to_parquet(
reader = pa.RecordBatchReader.from_stream(table)

schema = reader.schema.with_metadata(
create_geoparquet_metadata(reader.schema, schema_version=schema_version)
create_parquet_metadata(reader.schema, schema_version=schema_version)
)
with pq.ParquetWriter(output_path, schema, **kwargs) as writer:
for batch in reader:
writer.write_batch(batch)


def create_geoparquet_metadata(
def create_parquet_metadata(
schema: pa.Schema,
*,
schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS,
Expand Down Expand Up @@ -141,7 +141,10 @@ def create_geoparquet_metadata(
"crs": None,
}

return {b"geo": json.dumps(geo_meta).encode("utf-8")}
return {
b"geo": json.dumps(geo_meta).encode("utf-8"),
b"stac:geoparquet_version": b"1.0.0",
}


def schema_version_has_bbox_mapping(
Expand Down
15 changes: 15 additions & 0 deletions tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,18 @@ def test_round_trip_via_parquet(collection_id: str, tmp_path: Path):

for result, expected in zip(items_result, items):
assert_json_value_equal(result, expected, precision=0)


def test_metadata(tmp_path: Path):
    """Convert a collection to Parquet and verify the file-level schema metadata."""
    source = HERE / "data" / "3dep-lidar-copc-pc.json"
    target = tmp_path / "file.parquet"

    # Write the items out as Parquet, then read the schema metadata back.
    parse_stac_ndjson_to_parquet(source, target)
    schema_meta = pq.read_table(target).schema.metadata

    # stac-geoparquet's own version marker must be present.
    assert schema_meta[b"stac:geoparquet_version"] == b"1.0.0"

    # The GeoParquet "geo" metadata is a JSON string with exactly these keys.
    geo_meta = json.loads(schema_meta[b"geo"])
    assert geo_meta["version"] == "1.1.0"
    assert set(geo_meta) == {"version", "columns", "primary_column"}
Loading