Skip to content

Commit 780ac71

Browse files
COPDS-2954: stac validation at ingestion level (#170)
* feat: stac validation at ingestion level
* chore: resource stac-validation
* chore: integrated new get_extent error handling
1 parent 2b22c77 commit 780ac71

File tree

6 files changed

+151
-1
lines changed

6 files changed

+151
-1
lines changed

cads_catalogue/entry_points.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,7 @@ def update_catalogue(
285285
exclude_licences: bool = False,
286286
exclude_messages: bool = False,
287287
exclude_contents: bool = False,
288+
perform_stac_validation: bool = True,
288289
) -> None:
289290
"""Update the database with the catalogue data.
290291
@@ -306,6 +307,7 @@ def update_catalogue(
306307
:param exclude_licences: if True, do not consider input licences (default False)
307308
:param exclude_messages: if True, do not consider input messages (default False)
308309
:param exclude_contents: if True, do not consider input contents (default False)
310+
:param perform_stac_validation: if False, skip STAC compliance validation
309311
"""
310312
cads_common.logging.structlog_configure()
311313
cads_common.logging.logging_configure()
@@ -366,6 +368,7 @@ def update_catalogue(
366368
include=include,
367369
exclude=exclude,
368370
override_md=new_catalogue_update_md["override_md"],
371+
perform_stac_validation=perform_stac_validation,
369372
)
370373
if "messages" in to_process:
371374
logger.info("db updating of messages")

cads_catalogue/manager.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
layout_manager,
3838
object_storage,
3939
utils,
40+
validations,
4041
)
4142

4243
logger = structlog.get_logger(__name__)
@@ -818,6 +819,7 @@ def update_catalogue_resources_single_folder(
818819
include: List[str] = [],
819820
exclude: List[str] = [],
820821
override_md: dict[str, Any] = {},
822+
perform_stac_validation: bool = True,
821823
) -> List[str]:
822824
"""
823825
Load metadata of resources from files of a single input folder and sync each resource in the db.
@@ -832,6 +834,7 @@ def update_catalogue_resources_single_folder(
832834
include: list of include patterns for the resource uids
833835
exclude: list of exclude patterns for the resource uids
834836
override_md: dictionary of override metadata for resources
837+
perform_stac_validation: if False, skip STAC compliance validation
835838
836839
Returns
837840
-------
@@ -887,6 +890,10 @@ def update_catalogue_resources_single_folder(
887890
session, resource_folder_path, resource, storage_settings
888891
)
889892
resource["adaptor_properties_hash"] = compute_config_hash(resource)
893+
894+
if perform_stac_validation:
895+
validations.validate_resource_dict_stac_compliance(resource)
896+
890897
resource_sync(session, resource, storage_settings)
891898
logger.info("resource '%s' db sync successful" % resource_uid)
892899
except Exception: # noqa
@@ -905,6 +912,7 @@ def update_catalogue_resources(
905912
include: List[str] = [],
906913
exclude: List[str] = [],
907914
override_md: dict[str, Any] = {},
915+
perform_stac_validation: bool = False,
908916
) -> List[str]:
909917
"""
910918
Load metadata of resources from files and sync each resource in the db.
@@ -919,6 +927,7 @@ def update_catalogue_resources(
919927
include: list of include patterns for the resource uids
920928
exclude: list of exclude patterns for the resource uids
921929
override_md: dictionary of override metadata for resources
930+
perform_stac_validation: if False, skip STAC compliance validation
922931
923932
Returns
924933
-------
@@ -935,6 +944,7 @@ def update_catalogue_resources(
935944
include,
936945
exclude,
937946
override_md,
947+
perform_stac_validation,
938948
)
939949
involved_resource_uids += new_involved
940950
return involved_resource_uids

cads_catalogue/stac_helpers.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
"""STAC helpers."""
2+
3+
# Copyright 2025, European Union.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
import datetime
18+
19+
import pydantic
20+
import stac_pydantic
21+
import structlog
22+
23+
from . import database
24+
25+
logger = structlog.getLogger(__name__)
26+
27+
28+
WRONG_BBOX_LOGGED_IDS: set[str] = set()
29+
30+
31+
def get_extent(
    record: database.Resource,
) -> dict:
    """Build the STAC extent (spatial and temporal) of a resource.

    The bounding box is read from ``record.geo_extent`` (keys ``bboxW``,
    ``bboxS``, ``bboxE``, ``bboxN``); every missing key defaults to the
    corresponding whole-world bound. The temporal interval is built from
    ``record.begin_date`` and ``record.end_date``; a missing or falsy date
    yields an open (``None``) bound.

    If stac_pydantic rejects the bounding box, two outcomes are possible:

    * the only problem is a longitude expressed in the 0-360 convention
      (latitudes within [-90, 90], longitudes within [-180, 360]): the bbox
      is kept as is, bypassing validation;
    * anything else: a warning is logged (once per resource uid per process)
      and the extent falls back to the whole world.

    :param record: resource providing ``geo_extent``, ``begin_date``,
        ``end_date`` and ``resource_uid``
    :return: the dumped ``stac_pydantic.collection.Extent`` as a dictionary
    :raises ValueError, TypeError: if a bbox value cannot be converted to float
    :raises pydantic.ValidationError: if the temporal interval is rejected
    """
    spatial = record.geo_extent or {}

    west = float(spatial.get("bboxW", -180))
    south = float(spatial.get("bboxS", -90))
    east = float(spatial.get("bboxE", 180))
    north = float(spatial.get("bboxN", 90))
    bbox = [(west, south, east, north)]

    try:
        spatial_extent = stac_pydantic.collection.SpatialExtent(bbox=bbox)
    except pydantic.ValidationError as e:
        # stac_pydantic uses this same message for out-of-range latitudes too,
        # so the message alone is not enough to recognise the 0-360 convention:
        # the actual values must be checked as well.
        out_of_wgs84 = "Bounding box must be within (-180, -90, 180, 90)" in str(e)
        latitudes_ok = -90 <= south and north <= 90
        longitudes_ok = -180 <= west <= 360 and -180 <= east <= 360
        if out_of_wgs84 and latitudes_ok and longitudes_ok:
            # 0-360 longitude values are considered valid: build the model
            # without running the validators.
            spatial_extent = stac_pydantic.collection.SpatialExtent.model_construct(
                bbox=bbox
            )
        else:
            # The set stores string uids, so the lookup must use the same form,
            # otherwise the "log only once" guard never matches.
            logged_uid = str(record.resource_uid)
            if logged_uid not in WRONG_BBOX_LOGGED_IDS:
                # this log is important, but we don't want to flood the logs
                WRONG_BBOX_LOGGED_IDS.add(logged_uid)
                logger.warning(
                    "Bbox stac_pydantic validation failed, fallback to whole world",
                    error=e,
                    id=record.resource_uid,
                )
            spatial_extent = stac_pydantic.collection.SpatialExtent(
                bbox=[(-180, -90, 180, 90)]
            )

    begin_date_value = getattr(record, "begin_date", None)
    end_date_value = getattr(record, "end_date", None)

    return stac_pydantic.collection.Extent(
        spatial=spatial_extent,
        temporal=stac_pydantic.collection.TimeInterval(
            interval=[
                [
                    _to_utc_datetime(begin_date_value),
                    _to_utc_datetime(end_date_value),
                ]
            ],
        ),
    ).model_dump()


def _to_utc_datetime(value: "datetime.date | None") -> "datetime.datetime | None":
    """Return midnight UTC of ``value`` as an aware datetime, or None if falsy.

    We have datetime.date on DB, while STAC requires datetime with timezone.
    """
    if not value:
        return None
    midnight = datetime.datetime.min.time()
    return datetime.datetime.combine(value, midnight).replace(
        tzinfo=datetime.timezone.utc
    )

cads_catalogue/validations.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@
2323
from typing import Any
2424

2525
import cads_common.logging
26+
import pydantic
2627

27-
from cads_catalogue import layout_manager, utils
28+
from cads_catalogue import database, layout_manager, stac_helpers, utils
2829

2930
logger = logging.getLogger(__name__)
3031

@@ -470,3 +471,40 @@ def validate_datasets(datasets_folder: str, loglevel: str = "info") -> None:
470471
continue
471472
validate_dataset(dataset_folder, loglevel=None)
472473
print("----end of validations----")
474+
475+
476+
def validate_resource_dict_stac_compliance(
    resource_dict: dict[str, Any],
) -> None:
    """Validate that resource metadata dictionary will be STAC-compliant.

    A transient ``database.Resource`` is built from the ``resource_uid``,
    ``geo_extent``, ``begin_date`` and ``end_date`` entries of the dictionary
    and passed to ``stac_helpers.get_extent``. Dates given as ISO-format
    strings are converted to ``datetime.date`` first (an empty string is
    treated as a missing date).

    All steps run inside the same ``try`` so that a malformed date string is
    logged with the resource uid and reported like any other failure.

    :param resource_dict: metadata dictionary of a single resource
    :raises pydantic.ValidationError: if the extent is rejected by the STAC models
    :raises ValueError: for any other failure (the original error is chained)
    """
    resource_uid = resource_dict.get("resource_uid", "unknown")

    try:
        resource_model = database.Resource(
            resource_uid=resource_uid,
            geo_extent=resource_dict.get("geo_extent"),
            begin_date=_parse_iso_date(resource_dict.get("begin_date")),
            end_date=_parse_iso_date(resource_dict.get("end_date")),
        )
        # Only the absence of errors matters here; the extent is discarded.
        stac_helpers.get_extent(resource_model)
    except pydantic.ValidationError as e:
        logger.error(
            "STAC validation failed for resource dictionary '%s': %s",
            resource_uid,
            e.errors(),
        )
        raise
    except Exception as e:
        logger.error(
            "Unexpected error during STAC validation for resource '%s': %s",
            resource_uid,
            e,
        )
        raise ValueError(f"STAC validation failed: {e}") from e

    logger.info("STAC validation passed for resource dictionary '%s'", resource_uid)


def _parse_iso_date(value: Any) -> Any:
    """Convert an ISO-format date string to ``datetime.date``.

    An empty string becomes None; non-string values are returned unchanged.
    Raises ValueError if the string is not a valid ISO date.
    """
    if isinstance(value, str):
        return datetime.date.fromisoformat(value) if value else None
    return value

environment.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,13 @@ dependencies:
1111
- boto3
1212
- gitpython
1313
- jsonschema
14+
- pydantic
1415
- psycopg2
1516
- python-frontmatter
1617
- rfc3339-validator
1718
- sqlalchemy>=2.0.9
1819
- sqlalchemy-utils
1920
- structlog
2021
- typer
22+
- pip:
23+
- stac_pydantic

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ dependencies = [
2121
"cads-e2e-tests@git+https://github.com/ecmwf-projects/cads-e2e-tests.git",
2222
"gitpython",
2323
"jsonschema",
24+
"pydantic",
25+
"stac-pydantic",
2426
"python-frontmatter",
2527
"rfc3339-validator",
2628
"sqlalchemy>=2.0.9",

0 commit comments

Comments
 (0)