2 changes: 2 additions & 0 deletions .github/workflows/continuous-integration.yml
@@ -32,3 +32,5 @@ jobs:
        run: uv run pytest tests -v
      - name: Check docs
        run: uv run mkdocs build --strict
      - name: Check jsonschema
        run: check-jsonschema --schemafile spec/json-schema/metadata.json spec/example-metadata.json
8 changes: 8 additions & 0 deletions .pre-commit-config.yaml
@@ -14,3 +14,11 @@ repos:
      - id: trailing-whitespace
      - id: end-of-file-fixer
        exclude: tests/.*\.json
  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: v0.11.8
    hooks:
      # Run the linter.
      - id: ruff
      # Run the formatter.
      - id: ruff-format
6 changes: 6 additions & 0 deletions README.md
@@ -40,3 +40,9 @@ uv run pre-commit install
uv run pytest
scripts/lint
```

Validate the example collection metadata against the JSON Schema:

```shell
check-jsonschema --schemafile spec/json-schema/metadata.json spec/example-metadata.json
```
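
A rough Python equivalent using the `jsonschema` dev dependency (a sketch: the full example embeds `stac:collection`, whose remote STAC Collection `$ref` check-jsonschema resolves over the network, so this snippet validates a minimal instance that stays offline):

```python
import json
from pathlib import Path

import jsonschema

# Assumes this runs from the repository root.
schema = json.loads(Path("spec/json-schema/metadata.json").read_text())

# Minimal instance: the optional "stac:collection" key is omitted, so the
# remote STAC Collection $ref never needs to be resolved.
jsonschema.validate({"stac_geoparquet:version": "1.0.0"}, schema)
print("valid")
```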
6 changes: 4 additions & 2 deletions pyproject.toml
@@ -44,17 +44,19 @@ pc = ["adlfs", "azure-data-tables", "psycopg[binary,pool]", "pypgstac", "tqdm"]

[dependency-groups]
 dev = [
+    "check-jsonschema",
+    "jsonschema",
     "mypy",
     "numpy>=2",
-    "ruff",
     "pre-commit",
+    "pytest-recording>=0.13.2",
     "pytest",
     "requests",
+    "ruff",
     "stac-geoparquet[pc]",
     "stac-geoparquet[pgstac]",
     "types-python-dateutil",
     "types-requests",
-    "pytest-recording>=0.13.2",
     "vcrpy>=7.0.0",
 ]
docs = [
3 changes: 1 addition & 2 deletions scripts/lint
@@ -2,6 +2,5 @@

set -e

-uv run ruff check
-uv run ruff format --check
+uv run pre-commit run --all-files
uv run mypy stac_geoparquet
40 changes: 40 additions & 0 deletions spec/example-metadata.json
@@ -0,0 +1,40 @@
{
  "stac_geoparquet:version": "1.0.0",
  "stac:collection": {
    "id": "simple-collection",
    "type": "Collection",
    "stac_extensions": [],
    "stac_version": "1.1.0",
    "description": "A simple collection demonstrating core catalog fields with links to a couple of items",
    "title": "Simple Example Collection",
    "keywords": [
      "simple",
      "example",
      "collection"
    ],
    "providers": [],
    "extent": {
      "spatial": {
        "bbox": [
          [
            172.91173669923782,
            1.3438851951615003,
            172.95469614953714,
            1.3690476620161975
          ]
        ]
      },
      "temporal": {
        "interval": [
          [
            "2020-12-11T22:38:32.125Z",
            "2020-12-14T18:02:31.437Z"
          ]
        ]
      }
    },
    "license": "CC-BY-4.0",
    "summaries": {},
    "links": []
  }
}
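
For illustration, a hedged sketch of how the example file's two top-level fields map onto Parquet key/value metadata when writing with pyarrow (the empty table and output path are hypothetical):

```python
import json
from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq

example = json.loads(Path("spec/example-metadata.json").read_text())

# Each top-level field becomes one Parquet key/value metadata entry; the
# Collection object is serialized to a JSON string per the spec.
kv = {
    b"stac_geoparquet:version": example["stac_geoparquet:version"].encode(),
    b"stac:collection": json.dumps(example["stac:collection"]).encode(),
}

table = pa.table({"id": pa.array([], pa.string())})  # illustrative empty table
pq.write_table(table.replace_schema_metadata(kv), "example.parquet")
```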
20 changes: 20 additions & 0 deletions spec/json-schema/metadata.json
@@ -0,0 +1,20 @@
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "$id": "https://stac-utils.github.io/stac-geoparquet/json-schema/metadata.json",
  "title": "STAC GeoParquet Metadata",
  "description": "JSON Schema for STAC GeoParquet metadata stored in Parquet file metadata",
  "type": "object",
  "properties": {
    "stac_geoparquet:version": {
      "type": "string",
      "enum": ["1.0.0"],
      "description": "The stac-geoparquet metadata version."
    },
    "stac:collection": {
      "$ref": "https://schemas.stacspec.org/v1.1.0/collection-spec/json-schema/collection.json",
      "description": "This object represents a Collection in a SpatioTemporal Asset Catalog.",
      "deprecated": true
    }
  },
  "required": ["stac_geoparquet:version"]
}
37 changes: 31 additions & 6 deletions spec/stac-geoparquet-spec.md
@@ -31,11 +31,11 @@ most of the fields should be the same in STAC and in GeoParquet.
| _property columns_ | _varies_ | - | Each property should use the relevant Parquet type, and be pulled out of the properties object to be a top-level Parquet field |

- Must be valid GeoParquet, with proper metadata. Ideally the geometry types are defined and as narrow as possible.
-- Strongly recommend to only have one GeoParquet per STAC 'Collection'. Not doing this will lead to an expanded GeoParquet schema (the union of all the schemas of the collection) with lots of empty data
+- Strongly recommend storing items that are mostly homogeneous (i.e. have the same fields). Parquet is a columnar format; storing items with many different fields will lead to an expanded Parquet schema with lots of empty data. In practice, this means storing a single collection or only collections with very similar item properties in a single stac-geoparquet dataset.
- Any field in 'properties' of the STAC item should be moved up to be a top-level field in the GeoParquet.
- STAC GeoParquet does not support properties that are named such that they collide with a top-level key.
- datetime columns should be stored as a [native timestamp][timestamp], not as a string
-- The Collection JSON should be included in the Parquet metadata. See [Collection JSON](#including-a-stac-collection-json-in-a-stac-geoparquet-collection) below.
+- The Collection JSON object should be included in the Parquet metadata. See [Collection JSON](#stac-collection-objects) below.
- Any other properties that would be stored as GeoJSON in a STAC JSON Item (e.g. `proj:geometry`) should be stored as a binary column with WKB encoding. This simplifies the handling of collections with multiple geometry types.

### Link Struct
Expand Down Expand Up @@ -69,12 +69,36 @@ To take advantage of Parquet's columnar nature and compression, the assets shoul

See [Asset Object][asset] for more.

-## Including a STAC Collection JSON in a STAC Geoparquet Collection
+### Parquet Metadata

stac-geoparquet uses Parquet [File Metadata](https://parquet.apache.org/docs/file-format/metadata/) to store metadata about the dataset.

See [`example-metadata.json`](https://github.com/stac-utils/stac-geoparquet/blob/main/spec/example-metadata.json) for an example.

A [jsonschema schema file](https://github.com/stac-utils/stac-geoparquet/blob/main/spec/json-schema/metadata.json) is provided
for tools to validate against.

| Field Name | Type | Description |
| ------------------------- | -----------------------| ----------------------------------------------------------------------- |
| `stac_geoparquet:version` | string | The stac-geoparquet metadata version. Currently just "1.0.0" is allowed |
| `stac:collection` | STAC Collection object | STAC Collection metadata. |

Note that this metadata is distinct from the file metadata required by
[geoparquet].

#### Geoparquet Version

The field `stac_geoparquet:version` stores the version of the stac-geoparquet
spec the data complies with. Readers can use this field to understand what
features and fields are available.

Currently, the only allowed value is the string `"1.0.0"`.
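
A minimal sketch of how a reader might check this field with pyarrow (`items.parquet` is a hypothetical path):

```python
import pyarrow.parquet as pq

# read_schema only touches the file footer, not the data pages.
metadata = pq.read_schema("items.parquet").metadata or {}
version = metadata.get(b"stac_geoparquet:version")

if version is None:
    print("file is not stamped as stac-geoparquet")
else:
    print("stac-geoparquet version:", version.decode())
```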

#### STAC Collection Objects

To make a stac-geoparquet file a fully self-contained representation, you can
-include the Collection JSON in the Parquet metadata. If present in the [Parquet
-file metadata][parquet-metadata], the key must be `stac:collection` and the
-value must be a JSON string with the Collection JSON.
+include the Collection JSON document in the Parquet metadata under the
+`stac:collection` key. This should contain a STAC [Collection] object.
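
A companion sketch for recovering the embedded Collection (again with a hypothetical path; the key holds a JSON string):

```python
import json

import pyarrow.parquet as pq

metadata = pq.read_schema("items.parquet").metadata or {}
if b"stac:collection" in metadata:
    collection = json.loads(metadata[b"stac:collection"])
    print(collection["id"], collection.get("license"))
```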

## Referencing STAC Geoparquet Collections in a STAC Collection JSON

Expand Down Expand Up @@ -105,3 +129,4 @@ The principles here can likely be used to map into other geospatial data formats
[common-media-types]: https://github.com/radiantearth/stac-spec/blob/master/best-practices.md#common-media-types-in-stac
[timestamp]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp
[parquet-metadata]: https://github.com/apache/parquet-format#metadata
[Collection]: https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md#
4 changes: 2 additions & 2 deletions stac_geoparquet/arrow/_delta_lake.py
@@ -12,7 +12,7 @@
 from stac_geoparquet.arrow._to_parquet import (
     DEFAULT_PARQUET_SCHEMA_VERSION,
     SUPPORTED_PARQUET_SCHEMA_VERSIONS,
-    create_geoparquet_metadata,
+    create_parquet_metadata,
 )

if TYPE_CHECKING:
Expand Down Expand Up @@ -51,7 +51,7 @@ def parse_stac_ndjson_to_delta_lake(
         input_path, chunk_size=chunk_size, schema=schema, limit=limit
     )
     schema = record_batch_reader.schema.with_metadata(
-        create_geoparquet_metadata(
+        create_parquet_metadata(
             record_batch_reader.schema, schema_version=schema_version
         )
     )
13 changes: 9 additions & 4 deletions stac_geoparquet/arrow/_to_parquet.py
@@ -3,7 +3,7 @@
import json
from collections.abc import Iterable
from pathlib import Path
-from typing import Any
+from typing import Any, Literal

import pyarrow as pa
import pyarrow.parquet as pq
@@ -18,6 +18,8 @@
from stac_geoparquet.arrow._schema.models import InferredSchema
from stac_geoparquet.arrow.types import ArrowStreamExportable

STAC_GEOPARQUET_VERSION: Literal["1.0.0"] = "1.0.0"


def parse_stac_ndjson_to_parquet(
    input_path: str | Path | Iterable[str | Path],
@@ -90,14 +92,14 @@ def to_parquet(
     reader = pa.RecordBatchReader.from_stream(table)

     schema = reader.schema.with_metadata(
-        create_geoparquet_metadata(reader.schema, schema_version=schema_version)
+        create_parquet_metadata(reader.schema, schema_version=schema_version)
     )
     with pq.ParquetWriter(output_path, schema, **kwargs) as writer:
         for batch in reader:
             writer.write_batch(batch)


-def create_geoparquet_metadata(
+def create_parquet_metadata(
     schema: pa.Schema,
     *,
     schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS,
Expand Down Expand Up @@ -141,7 +143,10 @@ def create_geoparquet_metadata(
"crs": None,
}

return {b"geo": json.dumps(geo_meta).encode("utf-8")}
return {
b"geo": json.dumps(geo_meta).encode("utf-8"),
b"stac_geoparquet:version": STAC_GEOPARQUET_VERSION.encode(),
}


def schema_version_has_bbox_mapping(
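
Taken together, the rename plus the new metadata key can be sanity-checked end to end; a sketch assuming `parse_stac_ndjson_to_parquet` is importable from `stac_geoparquet.arrow` and that `items.ndjson` (hypothetical) holds newline-delimited STAC items:

```python
import pyarrow.parquet as pq

from stac_geoparquet.arrow import parse_stac_ndjson_to_parquet

parse_stac_ndjson_to_parquet("items.ndjson", "items.parquet")

meta = pq.read_schema("items.parquet").metadata
assert meta[b"stac_geoparquet:version"] == b"1.0.0"
assert b"geo" in meta  # geoparquet "geo" metadata is still written alongside
```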
21 changes: 21 additions & 0 deletions tests/test_parquet.py
@@ -1,6 +1,7 @@
import json
from pathlib import Path

import jsonschema
import pyarrow.parquet as pq
import pytest

@@ -51,3 +52,23 @@ def test_round_trip_via_parquet(collection_id: str, tmp_path: Path):

    for result, expected in zip(items_result, items):
        assert_json_value_equal(result, expected, precision=0)


def test_metadata(tmp_path: Path):
    collection_id = "3dep-lidar-copc"
    path = HERE / "data" / f"{collection_id}-pc.json"
    out_path = tmp_path / "file.parquet"
    # Convert to Parquet
    parse_stac_ndjson_to_parquet(path, out_path)
    table = pq.read_table(out_path)

    metadata = table.schema.metadata
    assert metadata[b"stac_geoparquet:version"] == b"1.0.0"
    geo = json.loads(metadata[b"geo"])
    assert geo["version"] == "1.1.0"
    assert set(geo) == {"version", "columns", "primary_column"}

    instance = {k.decode("utf-8"): v.decode("utf-8") for k, v in metadata.items()}

    schema = json.loads((HERE / "../spec/json-schema/metadata.json").read_text())
    jsonschema.validate(instance, schema)
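
A natural companion check (illustrative, not part of this PR) that leans on the module's existing imports (`json`, `jsonschema`, `pytest`, `HERE`): the schema's `enum` should reject any other version string.

```python
def test_metadata_rejects_unknown_version():
    # Illustrative: "2.0.0" is not in the schema's enum, so this must fail.
    schema = json.loads((HERE / "../spec/json-schema/metadata.json").read_text())
    with pytest.raises(jsonschema.exceptions.ValidationError):
        jsonschema.validate({"stac_geoparquet:version": "2.0.0"}, schema)
```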