Skip to content

Commit 2b22c77

Browse files
authored
COPDS-2755 fair checker command (#169)
* FAIR checker command WIP * update_fair_score test * typecheck * explicit requests-types stub * ruff * Raise timeout to 1 minute * FAIR quick column * score_total to score_percent
1 parent 39924d7 commit 2b22c77

16 files changed

+341
-42
lines changed

alembic/env.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,10 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17+
import alembic.context
1718
import cads_common.logging
1819
import sqlalchemy as sa
1920

20-
import alembic.context
2121
import cads_catalogue
2222

2323

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
"""FAIR additions.
2+
3+
Revision ID: ddf161fdce37
4+
Revises: 9c47800f0dcf
5+
Create Date: 2025-09-17 13:42:00.808209
6+
7+
"""
8+
9+
import sqlalchemy as sa
10+
from sqlalchemy.dialects import postgresql as dialect_postgresql # needed for mypy
11+
12+
from alembic import op
13+
14+
# revision identifiers, used by Alembic.
15+
revision = "ddf161fdce37"
16+
down_revision = "9c47800f0dcf"
17+
branch_labels = None
18+
depends_on = None
19+
20+
21+
def upgrade() -> None:
    """Add the FAIR columns: ``resource_data.fair_data`` and ``resources.fair_timestamp``."""
    # JSONB column added to the resource_data table.
    fair_data_column = sa.Column("fair_data", dialect_postgresql.JSONB)
    op.add_column("resource_data", fair_data_column)

    # Nullable, timezone-aware timestamp column added to the resources table.
    fair_timestamp_column = sa.Column(
        "fair_timestamp",
        sa.DateTime(timezone=True),
        default=None,
        nullable=True,
    )
    op.add_column("resources", fair_timestamp_column)
29+
30+
31+
def downgrade() -> None:
    """Drop the FAIR columns again, in the same order the original migration used."""
    columns_to_drop = (
        ("resource_data", "fair_data"),
        ("resources", "fair_timestamp"),
    )
    for table_name, column_name in columns_to_drop:
        op.drop_column(table_name, column_name)

cads_catalogue/alembic_cli.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@
33
import os
44
from typing import Optional, Sequence
55

6-
import cads_catalogue
76
from alembic.config import CommandLine, Config
87

8+
import cads_catalogue
9+
910
alembic_ini_path = os.path.abspath(os.path.join(__file__, "..", "..", "alembic.ini"))
1011

1112

cads_catalogue/database.py

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,12 @@
1717
import datetime
1818
from typing import Any, List
1919

20+
import alembic.command
21+
import alembic.config
2022
import sqlalchemy as sa
2123
import sqlalchemy_utils
2224
from sqlalchemy.dialects import postgresql as dialect_postgresql # needed for mypy
2325

24-
import alembic.command
25-
import alembic.config
2626
from cads_catalogue import alembic_cli, config
2727

2828
metadata = sa.MetaData()
@@ -253,6 +253,7 @@ class ResourceData(BaseModel):
253253
constraints_data: Any = sa.Column(dialect_postgresql.JSONB)
254254
form_data: Any = sa.Column(dialect_postgresql.JSONB)
255255
mapping: Any = sa.Column(dialect_postgresql.JSONB)
256+
fair_data: Any = sa.Column(dialect_postgresql.JSONB, nullable=True)
256257

257258
resource_uid = sa.Column(
258259
sa.String, sa.ForeignKey("resources.resource_uid"), nullable=False
@@ -332,6 +333,9 @@ class Resource(BaseModel):
332333
variables: Any = sa.Column(dialect_postgresql.JSONB)
333334
content_size = sa.Column(sa.Float)
334335

336+
# FAIR
337+
fair_timestamp = sa.Column(sa.DateTime(timezone=True), default=None, nullable=True)
338+
335339
# fulltextsearch-related
336340
fulltext = sa.Column(sa.String)
337341
high_priority_terms = sa.Column(sa.String)
@@ -356,7 +360,7 @@ class Resource(BaseModel):
356360

357361
@sa.ext.hybrid.hybrid_property
358362
def has_adaptor_costing(self):
359-
"""Verificy is costing is defined in adaptor json."""
363+
"""Verify if costing is defined in adaptor json."""
360364
session = sa.orm.object_session(self)
361365
exists_query = sa.exists().where(
362366
sa.and_(
@@ -377,6 +381,49 @@ def has_adaptor_costing(self):
377381
)
378382
)
379383

384+
@sa.ext.hybrid.hybrid_property
def fair_score(self):
    """Return the FAIR score stored for this resource as an integer.

    The value is read from the ``fair_data`` JSONB column of the
    ``ResourceData`` row with the same ``resource_uid``, at the JSON path
    ``summary.score_percent.FAIR``.

    Returns ``None`` when the instance is not attached to a session, when
    nothing is found at that path, or when the value found cannot be
    converted with ``int()``.
    """
    # Same lookup style as has_adaptor_costing: no function-local import needed.
    session = sa.orm.object_session(self)
    if session is None:
        return None

    result = (
        session.query(
            # "#>" is the PostgreSQL "extract JSON sub-object at path" operator.
            ResourceData.fair_data.op("#>")(
                sa.text("'{summary,score_percent,FAIR}'")
            )
        )
        .filter(ResourceData.resource_uid == self.resource_uid)
        .scalar()
    )
    if result is None:
        return None

    # NOTE(review): int() truncates a fractional score, while the SQL
    # expression variant of this property casts inside the database --
    # confirm both agree for non-integer values.
    try:
        return int(result)
    except (ValueError, TypeError):
        return None
413+
414+
@fair_score.expression  # type: ignore[no-redef]
def fair_score(cls):
    """Build the SQL expression used when ``fair_score`` is accessed on the class.

    A correlated scalar subquery extracts ``summary.score_percent.FAIR`` from
    ``ResourceData.fair_data`` for the matching ``resource_uid`` and the
    result is cast to an integer.
    """
    score_at_path = ResourceData.fair_data.op("#>")(
        sa.text("'{summary,score_percent,FAIR}'")
    )
    score_subquery = (
        sa.select(score_at_path)
        .where(ResourceData.resource_uid == cls.resource_uid)
        .scalar_subquery()
    )
    return sa.cast(score_subquery, sa.Integer)
426+
380427
# relationship attributes
381428
resource_data = sa.orm.relationship(
382429
ResourceData, uselist=False, back_populates="resource", lazy="select"

cads_catalogue/entry_points.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
config,
2929
contents,
3030
database,
31+
fair,
3132
licence_manager,
3233
maintenance,
3334
manager,
@@ -508,6 +509,29 @@ def update_sanity_check(
508509
logger.info("db update of sanity check information completed.")
509510

510511

512+
@app.command()
def fair_checker(
    fair_checker_host: str,
) -> None:
    """Run FAIR checker on all resources in the catalogue.

    Evaluate FAIR score of every entry in the catalogue.
    Command is compatible with the F-UJI FAIR checker API.

    Parameters
    ----------
    :param fair_checker_host: host (optionally with port) of the FAIR checker service
    """
    # Database connection comes from the catalogue settings, not from the CLI.
    dbsettings = config.ensure_settings(config.dbsettings)
    connection_string = dbsettings.connection_string
    engine = sa.create_engine(connection_string)
    session_obj = sa.orm.sessionmaker(engine)
    logger.info("start FAIR checker on all resources in the catalogue.")
    # The session is closed when the with-block exits.
    with session_obj() as session:
        fair.update_fair_score(session, fair_checker_host)
    logger.info("FAIR checker process completed.")
533+
534+
511535
def main() -> None:
512536
"""Run main catalogue entry points."""
513537
app()

cads_catalogue/fair.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
"""FAIR related functionalities."""
2+
3+
# Copyright 2025, European Union.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
import datetime
18+
import os
19+
20+
import requests
21+
import sqlalchemy as sa
22+
import structlog
23+
from cads_common import portal
24+
25+
from cads_catalogue import database
26+
27+
# Basic-auth credentials for the FAIR checker service, read from the
# FAIR_CHECKER_USERNAME / FAIR_CHECKER_PASSWORD environment variables.
# NOTE(review): the constant names are spelled "CHEKER" while the environment
# variables are spelled "CHECKER"; the fallback credentials are hard-coded --
# confirm they are not real secrets.
FAIR_CHEKER_USERNAME = os.getenv("FAIR_CHECKER_USERNAME", "marvel")
28+
FAIR_CHEKER_PASSWORD = os.getenv("FAIR_CHECKER_PASSWORD", "wonderwoman")
29+
30+
# Module-level structured logger.
logger = structlog.get_logger(__name__)
31+
32+
33+
def call_fair_checker(fair_checker_host: str, site_base: str, dataset_uid: str) -> dict:
    """Ask the FAIR checker service to evaluate one dataset page.

    Posts an evaluation request for ``{site_base}/datasets/{dataset_uid}`` to
    the ``/fuji/api/v1/evaluate`` endpoint of the service and returns the
    decoded JSON report.

    Args:
        fair_checker_host: Hostname (with port, optionally) of the FAIR checker
            service. Plain ``http://`` is assumed unless the value already
            starts with ``http://`` or ``https://``.
        site_base: Base URL for DSS portal. A trailing slash is ignored.
        dataset_uid: Dataset unique identifier.

    Returns: JSON response from the FAIR checker service.

    Raises:
        requests.RequestException: on connection errors, on timeout (60
            seconds), or when the service answers with an HTTP error status.
    """
    # Accept both "host:port" (original behaviour) and a full base URL.
    if fair_checker_host.startswith(("http://", "https://")):
        service_base = fair_checker_host.rstrip("/")
    else:
        service_base = f"http://{fair_checker_host}".rstrip("/")

    payload = {
        # rstrip avoids "//datasets/..." when site_base ends with a slash.
        "object_identifier": f"{site_base.rstrip('/')}/datasets/{dataset_uid}",
        "metadata_service_endpoint": "",
        "metadata_service_type": "oai_pmh",
        "use_datacite": True,
        "use_github": False,
        "metric_version": "metrics_v0.8",
    }

    logger.info("Retrieving FAIR report for resource", dataset_uid=dataset_uid)
    response = requests.post(
        f"{service_base}/fuji/api/v1/evaluate",
        headers={
            "accept": "application/json",
            "Content-Type": "application/json",
        },
        json=payload,
        auth=(FAIR_CHEKER_USERNAME, FAIR_CHEKER_PASSWORD),
        timeout=60,
    )
    # Turn HTTP error statuses into exceptions instead of returning an error body.
    response.raise_for_status()
    return response.json()
65+
66+
67+
def update_fair_score(session: sa.orm.Session, fair_checker_host: str) -> None:
    """Refresh the stored FAIR report of every resource in the catalogue.

    For each ``Resource`` the FAIR checker is called on the dataset page of
    the resource's portal. The JSON report is written to
    ``ResourceData.fair_data`` and ``Resource.fair_timestamp`` is set to the
    current UTC time. Every resource is committed on its own, so a failure on
    one resource does not discard results already stored for the others.

    A resource is skipped (with a log entry) when the site URL of its portal
    cannot be determined, when the FAIR checker request fails, or when the
    database update fails.

    Args:
        session: SQLAlchemy session used to read the resources and to store
            the results.
        fair_checker_host: Hostname (with port, optionally) of the FAIR
            checker service.
    """
    resources = session.query(database.Resource).all()
    for resource in resources:
        site_base = portal.get_site_url(resource.portal) if resource.portal else None
        if not site_base:
            logger.warning(
                "Cannot determine site base URL for portal",
                portal=resource.portal,
                resource_id=resource.resource_uid,
            )
            continue
        # Read the identifier once and reuse it, so the error paths below do
        # not have to touch the ORM object again after a rollback.
        dataset_uid = resource.resource_uid
        # Kept from the original; presumably narrows the column type for mypy.
        assert dataset_uid is not None
        try:
            result = call_fair_checker(fair_checker_host, site_base, dataset_uid)
        except requests.RequestException as e:
            logger.error(
                "Error connecting to FAIR checker service",
                error=str(e),
                resource_id=dataset_uid,
            )
            continue
        # Timezone-aware UTC timestamp: fair_timestamp is a
        # DateTime(timezone=True) column, and the naive value returned by the
        # deprecated datetime.utcnow() carries no UTC marker.
        resource.fair_timestamp = datetime.datetime.now(datetime.timezone.utc)
        try:
            session.execute(
                sa.update(database.ResourceData)
                .where(database.ResourceData.resource_uid == dataset_uid)
                .values(fair_data=result),
            )
            session.add(resource)
            session.commit()
        except sa.exc.SQLAlchemyError as e:
            # Undo both the report update and the timestamp change.
            session.rollback()
            logger.error(
                "Error updating FAIR data",
                error=str(e),
                resource_id=dataset_uid,
            )
            continue
        logger.info("Updated FAIR data", resource_id=dataset_uid)

ci/environment-ci.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ dependencies:
2222
- pytest-mock
2323
- pyyaml
2424
- sqlalchemy-utils<0.42.0 # see https://github.com/kvesteri/sqlalchemy-utils/issues/791
25+
- types-requests
2526
- pip:
2627
- boto3-stubs
2728
- pytest-postgresql

tests/data/dumped_resources1.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@
6262
"high_priority_terms": "reanalysis ERA5 land",
6363
"popularity": 500,
6464
"search_field": "'1950':11A 'accur':92B 'across':65B 'back':87B 'climat':56B,96B 'combin':59B 'compar':38B 'complet':71B 'compon':51B 'consist':23B,73B 'data':9A,61B,82B 'dataset':20B,74B 'decad':33B,86B 'descript':93B 'ecmwf':54B 'enhanc':36B 'era5':3A,6A,15B,40B,42B,55B,101 'era5-land':5A,14B,41B 'evolut':27B 'global':70B 'goe':84B 'hour':8A 'land':4A,7A,16B,29B,43B,50B,102 'law':77B 'model':60B 'observ':63B 'past':99B 'physic':79B 'present':13A 'produc':46B,81B 'provid':21B,90B 'reanalysi':2A,19B,57B,58B,80B,100 'reanalysis-era5-land':1A 'replay':48B 'resolut':37B 'sever':32B,85B 'time':89B 'use':75B 'variabl':30B 'view':24B 'world':67B",
65-
"fts": "'era5':2 'land':3 'reanalysi':1"
65+
"fts": "'era5':2 'land':3 'reanalysi':1",
66+
"fair_timestamp": null
6667
},
6768
{
6869
"resource_id": 2,
@@ -125,6 +126,7 @@
125126
"high_priority_terms": "",
126127
"popularity": 1,
127128
"search_field": "'1950':14A 'accur':95B 'across':68B 'averag':11A 'back':90B 'biospher':110C 'c3s':112C 'climat':59B,99B,103C 'combin':62B 'compar':41B 'complet':74B 'compon':54B 'condit':113C 'consist':26B,76B 'copernicus':111C 'data':12A,64B,85B 'dataset':23B,77B 'decad':36B,89B 'descript':96B 'ecmwf':57B 'enhanc':39B 'era5':3A,8A,18B,43B,45B,58B,107C 'era5-land':7A,17B,44B 'evolut':30B 'global':73B 'goe':87B 'hydrolog':108C 'land':4A,9A,19B,32B,46B,53B,106C 'law':80B 'mean':6A,116C 'model':63B 'month':5A,10A,115C 'observ':66B 'past':102B,105C 'physic':82B,109C 'present':16A 'produc':49B,84B 'provid':24B,93B 'reanalysi':2A,22B,60B,61B,83B,104C 'reanalysis-era5-land-monthly-means':1A 'replay':51B 'resolut':40B 'sever':35B,88B 'time':92B 'use':78B 'variabl':33B,114C 'view':27B 'world':70B",
128-
"fts": ""
129+
"fts": "",
130+
"fair_timestamp": null
129131
}
130132
]

tests/data/dumped_resources2.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@
6262
"high_priority_terms": "reanalysis ERA5 land",
6363
"popularity": 500,
6464
"search_field": "'1950':11A 'accur':92B 'across':65B 'back':87B 'climat':56B,96B 'combin':59B 'compar':38B 'complet':71B 'compon':51B 'consist':23B,73B 'data':9A,61B,82B 'dataset':20B,74B 'decad':33B,86B 'descript':93B 'ecmwf':54B 'enhanc':36B 'era5':3A,6A,15B,40B,42B,55B,101 'era5-land':5A,14B,41B 'evolut':27B 'global':70B 'goe':84B 'hour':8A 'land':4A,7A,16B,29B,43B,50B,102 'law':77B 'model':60B 'observ':63B 'past':99B 'physic':79B 'present':13A 'produc':46B,81B 'provid':21B,90B 'reanalysi':2A,19B,57B,58B,80B,100 'reanalysis-era5-land':1A 'replay':48B 'resolut':37B 'sever':32B,85B 'time':89B 'use':75B 'variabl':30B 'view':24B 'world':67B",
65-
"fts": "'era5':2 'land':3 'reanalysi':1"
65+
"fts": "'era5':2 'land':3 'reanalysi':1",
66+
"fair_timestamp": null
6667
},
6768
{
6869
"resource_id": 2,
@@ -125,6 +126,7 @@
125126
"high_priority_terms": "",
126127
"popularity": 1,
127128
"search_field": "'1950':14A 'accur':95B 'across':68B 'averag':11A 'back':90B 'biospher':110C 'c3s':112C 'climat':59B,99B,103C 'combin':62B 'compar':41B 'complet':74B 'compon':54B 'condit':113C 'consist':26B,76B 'copernicus':111C 'data':12A,64B,85B 'dataset':23B,77B 'decad':36B,89B 'descript':96B 'ecmwf':57B 'enhanc':39B 'era5':3A,8A,18B,43B,45B,58B,107C 'era5-land':7A,17B,44B 'evolut':30B 'global':73B 'goe':87B 'hydrolog':108C 'land':4A,9A,19B,32B,46B,53B,106C 'law':80B 'mean':6A,116C 'model':63B 'month':5A,10A,115C 'observ':66B 'past':102B,105C 'physic':82B,109C 'present':16A 'produc':49B,84B 'provid':24B,93B 'reanalysi':2A,22B,60B,61B,83B,104C 'reanalysis-era5-land-monthly-means':1A 'replay':51B 'resolut':40B 'sever':35B,88B 'time':92B 'use':78B 'variabl':33B,114C 'view':27B 'world':70B",
128-
"fts": ""
129+
"fts": "",
130+
"fair_timestamp": null
129131
}
130132
]

tests/data/dumped_resources3.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
"high_priority_terms": "reanalysis ERA5 land",
6363
"popularity": 500,
6464
"search_field": "'1950':11A 'accur':92B 'across':65B 'back':87B 'climat':56B,96B 'combin':59B 'compar':38B 'complet':71B 'compon':51B 'consist':23B,73B 'data':9A,61B,82B 'dataset':20B,74B 'decad':33B,86B 'descript':93B 'ecmwf':54B 'enhanc':36B 'era5':3A,6A,15B,40B,42B,55B,101 'era5-land':5A,14B,41B 'evolut':27B 'global':70B 'goe':84B 'hour':8A 'land':4A,7A,16B,29B,43B,50B,102 'law':77B 'model':60B 'observ':63B 'past':99B 'physic':79B 'present':13A 'produc':46B,81B 'provid':21B,90B 'reanalysi':2A,19B,57B,58B,80B,100 'reanalysis-era5-land':1A 'replay':48B 'resolut':37B 'sever':32B,85B 'time':89B 'use':75B 'variabl':30B 'view':24B 'world':67B",
65-
"fts": "'era5':2 'land':3 'reanalysi':1"
65+
"fts": "'era5':2 'land':3 'reanalysi':1",
66+
"fair_timestamp": null
6667
}
6768
]

0 commit comments

Comments
 (0)