Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/continuous-integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,5 @@ jobs:
run: uv run pytest tests -v
- name: Check docs
run: uv run mkdocs build --strict
- name: Check jsonschema
run: check-jsonschema --schemafile spec/json-schema/metadata.json spec/example-metadata.json
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,18 @@ pc = ["adlfs", "azure-data-tables", "psycopg[binary,pool]", "pypgstac", "tqdm"]

[dependency-groups]
dev = [
"check-jsonschema",
"mypy",
"numpy>=2",
"ruff",
"pre-commit",
"pytest-recording>=0.13.2",
"pytest",
"requests",
"ruff",
"stac-geoparquet[pc]",
"stac-geoparquet[pgstac]",
"types-python-dateutil",
"types-requests",
"pytest-recording>=0.13.2",
"vcrpy>=7.0.0",
]
docs = [
Expand Down
42 changes: 42 additions & 0 deletions spec/example-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{
"stac:geoparquet_version": "1.0.0",
"stac:collections": {
"simple-collection": {
"id": "simple-collection",
"type": "Collection",
"stac_extensions": [],
"stac_version": "1.1.0",
"description": "A simple collection demonstrating core catalog fields with links to a couple of items",
"title": "Simple Example Collection",
"keywords": [
"simple",
"example",
"collection"
],
"providers": [],
"extent": {
"spatial": {
"bbox": [
[
172.91173669923782,
1.3438851951615003,
172.95469614953714,
1.3690476620161975
]
]
},
"temporal": {
"interval": [
[
"2020-12-11T22:38:32.125Z",
"2020-12-14T18:02:31.437Z"
]
]
}
},
"license": "CC-BY-4.0",
"summaries": {},
"links": []
}
}
}
27 changes: 27 additions & 0 deletions spec/json-schema/metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://stac-utils.github.io/stac-geoparquet/json-schema/metadata.json",
"title": "STAC GeoParquet Metadata",
"description": "JSON Schema for STAC GeoParquet metadata stored in Parquet file metadata",
"type": "object",
"properties": {
"stac:geoparquet_version": {
"type": "string",
"enum": ["1.0.0"],
"description": "The stac-geoparquet metadata version."
},
"stac:collections": {
"type": "object",
"additionalProperties": {
"$ref": "https://schemas.stacspec.org/v1.1.0/collection-spec/json-schema/collection.json"
},
"description": "A mapping of collection IDs to STAC Collection objects"
},
"stac:collection": {
"$ref": "https://schemas.stacspec.org/v1.1.0/collection-spec/json-schema/collection.json",
"description": "DEPRECATED. Use stac:collections instead. A STAC Collection object.",
"deprecated": true
}
},
"required": ["stac:geoparquet_version"]
}
46 changes: 40 additions & 6 deletions spec/stac-geoparquet-spec.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,11 @@ most of the fields should be the same in STAC and in GeoParquet.
| _property columns_ | _varies_ | - | Each property should use the relevant Parquet type, and be pulled out of the properties object to be a top-level Parquet field |

- Must be valid GeoParquet, with proper metadata. Ideally the geometry types are defined and as narrow as possible.
- Strongly recommend to only have one GeoParquet per STAC 'Collection'. Not doing this will lead to an expanded GeoParquet schema (the union of all the schemas of the collection) with lots of empty data
- Strongly recommend storing items that are mostly homogeneous (i.e. have the same fields). Parquet is a columnar format; storing items with many different fields will lead to an expanded Parquet schema with lots of empty data. In practice, this means storing a single collection or only collections with very similar item properties in a single stac-geoparquet dataset.
- Any field in 'properties' of the STAC item should be moved up to be a top-level field in the GeoParquet.
- STAC GeoParquet does not support properties that are named such that they collide with a top-level key.
- datetime columns should be stored as a [native timestamp][timestamp], not as a string
- The Collection JSON should be included in the Parquet metadata. See [Collection JSON](#including-a-stac-collection-json-in-a-stac-geoparquet-collection) below.
- The Collection JSON objects should be included in the Parquet metadata. See [Collection JSON](#stac-collection-objects) below.
- Any other properties that would be stored as GeoJSON in a STAC JSON Item (e.g. `proj:geometry`) should be stored as a binary column with WKB encoding. This simplifies the handling of collections with multiple geometry types.

### Link Struct
Expand Down Expand Up @@ -69,12 +69,45 @@ To take advantage of Parquet's columnar nature and compression, the assets shoul

See [Asset Object][asset] for more.

## Including a STAC Collection JSON in a STAC Geoparquet Collection
### Parquet Metadata

stac-geoparquet uses Parquet [File Metadata](https://parquet.apache.org/docs/file-format/metadata/) to store metadata about the dataset.

See [`example-metadata.json`](https://github.com/stac-utils/stac-geoparquet/blob/main/spec/example-metadata.json) for an example.

A [jsonschema schema file](https://github.com/stac-utils/stac-geoparquet/blob/main/spec/json-schema/metadata.json) is provided
for tools to validate against.

| Field Name | Type | Description |
| ---------- | ---- | ----------- |
| `stac:geoparquet_version` | string | The stac-geoparquet metadata version. Currently, only `"1.0.0"` is allowed |
| `stac:collections` | Map of string to STAC Collection object | A mapping of collection IDs to STAC Collection objects |
| `stac:collection` | STAC Collection object | **Deprecated**. Use `stac:collections` instead. |

Note that this metadata is distinct from the file metadata required by
[geoparquet].

#### Geoparquet Version

The field `stac:geoparquet_version` stores the version of the stac-geoparquet
spec the data complies with. Readers can use this field to understand what
features and fields are available.

Currently, the only allowed value is the string `"1.0.0"`.

#### STAC Collection Objects

To make a stac-geoparquet file a fully self-contained representation, you can
include the Collection JSON in the Parquet metadata. If present in the [Parquet
file metadata][parquet-metadata], the key must be `stac:collection` and the
value must be a JSON string with the Collection JSON.
include the Collection JSON documents in the Parquet metadata under the
`stac:collections` key. This should contain a mapping of STAC Collection IDs to
[Collection] objects. As usual, the collection ID in the mapping keys must match
the `id` in the Collection object.

Note that a previous version of this spec allowed storing a single collection
under the `stac:collection` (singular) key. This field is deprecated in favor of
`stac:collections`. Readers aiming for maximum compatibility should first check
`stac:collections` for collection information and fall back to the deprecated
`stac:collection` field.

## Referencing a STAC GeoParquet Collection in a STAC Collection JSON

Expand Down Expand Up @@ -105,3 +138,4 @@ The principles here can likely be used to map into other geospatial data formats
[common-media-types]: https://github.com/radiantearth/stac-spec/blob/master/best-practices.md#common-media-types-in-stac
[timestamp]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp
[parquet-metadata]: https://github.com/apache/parquet-format#metadata
[Collection]: https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md
4 changes: 2 additions & 2 deletions stac_geoparquet/arrow/_delta_lake.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from stac_geoparquet.arrow._to_parquet import (
DEFAULT_PARQUET_SCHEMA_VERSION,
SUPPORTED_PARQUET_SCHEMA_VERSIONS,
create_geoparquet_metadata,
create_parquet_metadata,
)

if TYPE_CHECKING:
Expand Down Expand Up @@ -51,7 +51,7 @@ def parse_stac_ndjson_to_delta_lake(
input_path, chunk_size=chunk_size, schema=schema, limit=limit
)
schema = record_batch_reader.schema.with_metadata(
create_geoparquet_metadata(
create_parquet_metadata(
record_batch_reader.schema, schema_version=schema_version
)
)
Expand Down
9 changes: 6 additions & 3 deletions stac_geoparquet/arrow/_to_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,14 @@ def to_parquet(
reader = pa.RecordBatchReader.from_stream(table)

schema = reader.schema.with_metadata(
create_geoparquet_metadata(reader.schema, schema_version=schema_version)
create_parquet_metadata(reader.schema, schema_version=schema_version)
)
with pq.ParquetWriter(output_path, schema, **kwargs) as writer:
for batch in reader:
writer.write_batch(batch)


def create_geoparquet_metadata(
def create_parquet_metadata(
schema: pa.Schema,
*,
schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS,
Expand Down Expand Up @@ -141,7 +141,10 @@ def create_geoparquet_metadata(
"crs": None,
}

return {b"geo": json.dumps(geo_meta).encode("utf-8")}
return {
b"geo": json.dumps(geo_meta).encode("utf-8"),
b"stac:geoparquet_version": b"1.0.0",
}


def schema_version_has_bbox_mapping(
Expand Down
15 changes: 15 additions & 0 deletions tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,18 @@ def test_round_trip_via_parquet(collection_id: str, tmp_path: Path):

for result, expected in zip(items_result, items):
assert_json_value_equal(result, expected, precision=0)


def test_metadata(tmp_path: Path):
    """Convert a collection to Parquet and verify the file-level schema metadata."""
    source = HERE / "data" / "3dep-lidar-copc-pc.json"
    target = tmp_path / "file.parquet"

    # Write the items out as Parquet, then read the schema metadata back.
    parse_stac_ndjson_to_parquet(source, target)
    schema_meta = pq.read_table(target).schema.metadata

    # stac-geoparquet's own version marker must be present.
    assert schema_meta[b"stac:geoparquet_version"] == b"1.0.0"

    # The GeoParquet "geo" metadata is a JSON string with exactly these keys.
    geo_meta = json.loads(schema_meta[b"geo"])
    assert geo_meta["version"] == "1.1.0"
    assert set(geo_meta) == {"version", "columns", "primary_column"}
Loading