2 changes: 2 additions & 0 deletions .github/workflows/continuous-integration.yml
@@ -32,3 +32,5 @@ jobs:
        run: uv run pytest tests -v
      - name: Check docs
        run: uv run mkdocs build --strict
      - name: Check jsonschema
        run: check-jsonschema --schemafile spec/json-schema/metadata.json spec/example-metadata.json
8 changes: 8 additions & 0 deletions .pre-commit-config.yaml
@@ -14,3 +14,11 @@ repos:
      - id: trailing-whitespace
      - id: end-of-file-fixer
        exclude: tests/.*\.json
  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: v0.11.8
    hooks:
      # Run the linter.
      - id: ruff
      # Run the formatter.
      - id: ruff-format
6 changes: 6 additions & 0 deletions README.md
@@ -40,3 +40,9 @@ uv run pre-commit install
uv run pytest
scripts/lint
```

Validate the example collection metadata against the JSON Schema:

```shell
check-jsonschema --schemafile spec/json-schema/metadata.json spec/example-metadata.json
```
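
A rough Python equivalent using the `jsonschema` dev dependency (a sketch: the full example embeds `stac:collection`, whose remote STAC Collection `$ref` check-jsonschema resolves over the network, so this snippet validates a minimal instance that stays offline):

```python
import json
from pathlib import Path

import jsonschema

# Assumes this runs from the repository root.
schema = json.loads(Path("spec/json-schema/metadata.json").read_text())

# Minimal instance: the optional "stac:collection" key is omitted, so the
# remote STAC Collection $ref never needs to be resolved.
jsonschema.validate({"stac_geoparquet:version": "1.0.0"}, schema)
print("valid")
```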
6 changes: 4 additions & 2 deletions pyproject.toml
@@ -44,17 +44,19 @@ pc = ["adlfs", "azure-data-tables", "psycopg[binary,pool]", "pypgstac", "tqdm"]

[dependency-groups]
 dev = [
+    "check-jsonschema",
+    "jsonschema",
     "mypy",
     "numpy>=2",
-    "ruff",
     "pre-commit",
+    "pytest-recording>=0.13.2",
     "pytest",
     "requests",
+    "ruff",
     "stac-geoparquet[pc]",
     "stac-geoparquet[pgstac]",
     "types-python-dateutil",
     "types-requests",
-    "pytest-recording>=0.13.2",
     "vcrpy>=7.0.0",
 ]
docs = [
3 changes: 1 addition & 2 deletions scripts/lint
@@ -2,6 +2,5 @@

set -e

-uv run ruff check
-uv run ruff format --check
+uv run pre-commit run --all-files
uv run mypy stac_geoparquet
40 changes: 40 additions & 0 deletions spec/example-metadata.json
@@ -0,0 +1,40 @@
{
  "stac_geoparquet:version": "1.0.0",
  "stac:collection": {
    "id": "simple-collection",
    "type": "Collection",
    "stac_extensions": [],
    "stac_version": "1.1.0",
    "description": "A simple collection demonstrating core catalog fields with links to a couple of items",
    "title": "Simple Example Collection",
    "keywords": [
      "simple",
      "example",
      "collection"
    ],
    "providers": [],
    "extent": {
      "spatial": {
        "bbox": [
          [
            172.91173669923782,
            1.3438851951615003,
            172.95469614953714,
            1.3690476620161975
          ]
        ]
      },
      "temporal": {
        "interval": [
          [
            "2020-12-11T22:38:32.125Z",
            "2020-12-14T18:02:31.437Z"
          ]
        ]
      }
    },
    "license": "CC-BY-4.0",
    "summaries": {},
    "links": []
  }
}
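
For illustration, a hedged sketch of how the example file's two top-level fields map onto Parquet key/value metadata when writing with pyarrow (the empty table and output path are hypothetical):

```python
import json
from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq

example = json.loads(Path("spec/example-metadata.json").read_text())

# Each top-level field becomes one Parquet key/value metadata entry; the
# Collection object is serialized to a JSON string per the spec.
kv = {
    b"stac_geoparquet:version": example["stac_geoparquet:version"].encode(),
    b"stac:collection": json.dumps(example["stac:collection"]).encode(),
}

table = pa.table({"id": pa.array([], pa.string())})  # illustrative empty table
pq.write_table(table.replace_schema_metadata(kv), "example.parquet")
```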
20 changes: 20 additions & 0 deletions spec/json-schema/metadata.json
@@ -0,0 +1,20 @@
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "$id": "https://stac-utils.github.io/stac-geoparquet/json-schema/metadata.json",
  "title": "STAC GeoParquet Metadata",
  "description": "JSON Schema for STAC GeoParquet metadata stored in Parquet file metadata",
  "type": "object",
  "properties": {
    "stac_geoparquet:version": {
      "type": "string",
      "enum": ["1.0.0"],
      "description": "The stac-geoparquet metadata version."
    },
    "stac:collection": {
      "$ref": "https://schemas.stacspec.org/v1.1.0/collection-spec/json-schema/collection.json",
      "description": "This object represents a Collection in a SpatioTemporal Asset Catalog.",
      "deprecated": true
    }
  },
  "required": ["stac_geoparquet:version"]
}
37 changes: 31 additions & 6 deletions spec/stac-geoparquet-spec.md
@@ -31,11 +31,11 @@ most of the fields should be the same in STAC and in GeoParquet.
| _property columns_ | _varies_ | - | Each property should use the relevant Parquet type, and be pulled out of the properties object to be a top-level Parquet field |

- Must be valid GeoParquet, with proper metadata. Ideally the geometry types are defined and as narrow as possible.
-- Strongly recommend to only have one GeoParquet per STAC 'Collection'. Not doing this will lead to an expanded GeoParquet schema (the union of all the schemas of the collection) with lots of empty data
+- Strongly recommend storing items that are mostly homogeneous (i.e. have the same fields). Parquet is a columnar format; storing items with many different fields will lead to an expanded Parquet schema with lots of empty data. In practice, this means storing a single collection or only collections with very similar item properties in a single stac-geoparquet dataset.
- Any field in 'properties' of the STAC item should be moved up to be a top-level field in the GeoParquet.
- STAC GeoParquet does not support properties that are named such that they collide with a top-level key.
- datetime columns should be stored as a [native timestamp][timestamp], not as a string
-- The Collection JSON should be included in the Parquet metadata. See [Collection JSON](#including-a-stac-collection-json-in-a-stac-geoparquet-collection) below.
+- The Collection JSON object should be included in the Parquet metadata. See [Collection JSON](#stac-collection-objects) below.
- Any other properties that would be stored as GeoJSON in a STAC JSON Item (e.g. `proj:geometry`) should be stored as a binary column with WKB encoding. This simplifies the handling of collections with multiple geometry types.

### Link Struct
Expand Down Expand Up @@ -69,12 +69,36 @@ To take advantage of Parquet's columnar nature and compression, the assets shoul

See [Asset Object][asset] for more.

-## Including a STAC Collection JSON in a STAC Geoparquet Collection
+### Parquet Metadata

stac-geoparquet uses Parquet [File Metadata](https://parquet.apache.org/docs/file-format/metadata/) to store metadata about the dataset.

See [`example-metadata.json`](https://github.com/stac-utils/stac-geoparquet/blob/main/spec/example-metadata.json) for an example.

A [jsonschema schema file](https://github.com/stac-utils/stac-geoparquet/blob/main/spec/json-schema/metadata.json) is provided
for tools to validate against.

| Field Name | Type | Description |
| ------------------------- | -----------------------| ----------------------------------------------------------------------- |
| `stac_geoparquet:version` | string | The stac-geoparquet metadata version. Currently just "1.0.0" is allowed |
| `stac:collection` | STAC Collection object | STAC Collection metadata. |

Note that this metadata is distinct from the file metadata required by
[geoparquet].

#### Geoparquet Version

The field `stac_geoparquet:version` stores the version of the stac-geoparquet
spec the data complies with. Readers can use this field to understand what
features and fields are available.

Currently, the only allowed value is the string `"1.0.0"`.
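
A minimal sketch of how a reader might check this field with pyarrow (`items.parquet` is a hypothetical path):

```python
import pyarrow.parquet as pq

# read_schema only touches the file footer, not the data pages.
metadata = pq.read_schema("items.parquet").metadata or {}
version = metadata.get(b"stac_geoparquet:version")

if version is None:
    print("file is not stamped as stac-geoparquet")
else:
    print("stac-geoparquet version:", version.decode())
```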

#### STAC Collection Objects

To make a stac-geoparquet file a fully self-contained representation, you can
-include the Collection JSON in the Parquet metadata. If present in the [Parquet
-file metadata][parquet-metadata], the key must be `stac:collection` and the
-value must be a JSON string with the Collection JSON.
+include the Collection JSON document in the Parquet metadata under the
+`stac:collection` key. This should contain a STAC [Collection] object.
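
A companion sketch for recovering the embedded Collection (again with a hypothetical path; the key holds a JSON string):

```python
import json

import pyarrow.parquet as pq

metadata = pq.read_schema("items.parquet").metadata or {}
if b"stac:collection" in metadata:
    collection = json.loads(metadata[b"stac:collection"])
    print(collection["id"], collection.get("license"))
```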

## Referencing STAC Geoparquet Collections in a STAC Collection JSON

Expand Down Expand Up @@ -105,3 +129,4 @@ The principles here can likely be used to map into other geospatial data formats
[common-media-types]: https://github.com/radiantearth/stac-spec/blob/master/best-practices.md#common-media-types-in-stac
[timestamp]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp
[parquet-metadata]: https://github.com/apache/parquet-format#metadata
[Collection]: https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md#
4 changes: 2 additions & 2 deletions stac_geoparquet/arrow/_delta_lake.py
@@ -12,7 +12,7 @@
 from stac_geoparquet.arrow._to_parquet import (
     DEFAULT_PARQUET_SCHEMA_VERSION,
     SUPPORTED_PARQUET_SCHEMA_VERSIONS,
-    create_geoparquet_metadata,
+    create_parquet_metadata,
 )

if TYPE_CHECKING:
Expand Down Expand Up @@ -51,7 +51,7 @@ def parse_stac_ndjson_to_delta_lake(
         input_path, chunk_size=chunk_size, schema=schema, limit=limit
     )
     schema = record_batch_reader.schema.with_metadata(
-        create_geoparquet_metadata(
+        create_parquet_metadata(
             record_batch_reader.schema, schema_version=schema_version
         )
     )
13 changes: 9 additions & 4 deletions stac_geoparquet/arrow/_to_parquet.py
@@ -3,7 +3,7 @@
import json
from collections.abc import Iterable
from pathlib import Path
-from typing import Any
+from typing import Any, Literal

import pyarrow as pa
import pyarrow.parquet as pq
@@ -18,6 +18,8 @@
from stac_geoparquet.arrow._schema.models import InferredSchema
from stac_geoparquet.arrow.types import ArrowStreamExportable

STAC_GEOPARQUET_VERSION: Literal["1.0.0"] = "1.0.0"


def parse_stac_ndjson_to_parquet(
    input_path: str | Path | Iterable[str | Path],
@@ -90,14 +92,14 @@ def to_parquet(
     reader = pa.RecordBatchReader.from_stream(table)

     schema = reader.schema.with_metadata(
-        create_geoparquet_metadata(reader.schema, schema_version=schema_version)
+        create_parquet_metadata(reader.schema, schema_version=schema_version)
     )
     with pq.ParquetWriter(output_path, schema, **kwargs) as writer:
         for batch in reader:
             writer.write_batch(batch)


-def create_geoparquet_metadata(
+def create_parquet_metadata(
     schema: pa.Schema,
     *,
     schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS,
Expand Down Expand Up @@ -141,7 +143,10 @@ def create_geoparquet_metadata(
"crs": None,
}

return {b"geo": json.dumps(geo_meta).encode("utf-8")}
return {
b"geo": json.dumps(geo_meta).encode("utf-8"),
b"stac_geoparquet:version": STAC_GEOPARQUET_VERSION.encode(),
}


def schema_version_has_bbox_mapping(
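
Taken together, the rename plus the new metadata key can be sanity-checked end to end; a sketch assuming `parse_stac_ndjson_to_parquet` is importable from `stac_geoparquet.arrow` and that `items.ndjson` (hypothetical) holds newline-delimited STAC items:

```python
import pyarrow.parquet as pq

from stac_geoparquet.arrow import parse_stac_ndjson_to_parquet

parse_stac_ndjson_to_parquet("items.ndjson", "items.parquet")

meta = pq.read_schema("items.parquet").metadata
assert meta[b"stac_geoparquet:version"] == b"1.0.0"
assert b"geo" in meta  # geoparquet "geo" metadata is still written alongside
```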
21 changes: 21 additions & 0 deletions tests/test_parquet.py
@@ -1,6 +1,7 @@
import json
from pathlib import Path

import jsonschema
import pyarrow.parquet as pq
import pytest

@@ -51,3 +52,23 @@ def test_round_trip_via_parquet(collection_id: str, tmp_path: Path):

    for result, expected in zip(items_result, items):
        assert_json_value_equal(result, expected, precision=0)


def test_metadata(tmp_path: Path):
    collection_id = "3dep-lidar-copc"
    path = HERE / "data" / f"{collection_id}-pc.json"
    out_path = tmp_path / "file.parquet"
    # Convert to Parquet
    parse_stac_ndjson_to_parquet(path, out_path)
    table = pq.read_table(out_path)

    metadata = table.schema.metadata
    assert metadata[b"stac_geoparquet:version"] == b"1.0.0"
    geo = json.loads(metadata[b"geo"])
    assert geo["version"] == "1.1.0"
    assert set(geo) == {"version", "columns", "primary_column"}

    instance = {k.decode("utf-8"): v.decode("utf-8") for k, v in metadata.items()}

    schema = json.loads((HERE / "../spec/json-schema/metadata.json").read_text())
    jsonschema.validate(instance, schema)
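
A natural companion check (illustrative, not part of this PR) that leans on the module's existing imports (`json`, `jsonschema`, `pytest`, `HERE`): the schema's `enum` should reject any other version string.

```python
def test_metadata_rejects_unknown_version():
    # Illustrative: "2.0.0" is not in the schema's enum, so this must fail.
    schema = json.loads((HERE / "../spec/json-schema/metadata.json").read_text())
    with pytest.raises(jsonschema.exceptions.ValidationError):
        jsonschema.validate({"stac_geoparquet:version": "2.0.0"}, schema)
```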