diff --git a/services/worker/src/worker/job_runners/dataset/croissant_crumbs.py b/services/worker/src/worker/job_runners/dataset/croissant_crumbs.py index 7b8b2b1e4..90eec21a8 100644 --- a/services/worker/src/worker/job_runners/dataset/croissant_crumbs.py +++ b/services/worker/src/worker/job_runners/dataset/croissant_crumbs.py @@ -70,6 +70,32 @@ def get_croissant_crumbs_from_dataset_infos( ) ] record_set = [] + + # Check if dataset has geospatial modality + is_geospatial = False + try: + dataset_modalities_response = get_previous_step_or_raise(kind="dataset-modalities", dataset=dataset) + modalities = dataset_modalities_response["content"].get("modalities", []) + is_geospatial = "geospatial" in modalities + except Exception: + # If modalities step fails, try direct file detection + try: + dataset_filetypes_response = get_previous_step_or_raise(kind="dataset-filetypes", dataset=dataset) + content = dataset_filetypes_response["content"] + if "filetypes" in content and isinstance(content["filetypes"], list): + geospatial_extensions = { + ".shp", ".shx", ".dbf", ".prj", ".cpg", ".kml", ".kmz", ".gpx", + ".geojson", ".topojson", ".gml", ".geoparquet", ".fgb", + ".img", ".bil", ".bip", ".bsq", ".gpkg", ".mbtiles", ".pmtiles", + ".tif", ".tiff" # GeoTIFF files + } + for filetype in content["filetypes"]: + if filetype["extension"] in geospatial_extensions and filetype["count"] > 0: + is_geospatial = True + break + except Exception: + pass + for info in infos: description_body = "" config = info["config_name"] @@ -197,16 +223,33 @@ def get_croissant_crumbs_from_dataset_infos( "source": "cr:source", "subField": "cr:subField", "transform": "cr:transform", + # GeoCroissant properties + "geocr": "http://mlcommons.org/croissant/geo/1.0", + "boundingBox": "geocr:boundingBox", + "geometry": "geocr:geometry", + "resolution": "geocr:resolution", + "crs": "geocr:crs", + "temporalExtent": "geocr:temporalExtent", + "spatialResolution": "geocr:spatialResolution", + "temporalResolution": "geocr:temporalResolution", + "label": "geocr:label", + "image": "geocr:image", + } + # Prepare base output + output = { + "@context": context, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.1", + "distribution": distribution, + "recordSet": record_set, } - return _remove_none_values( - { - "@context": context, - "@type": "sc:Dataset", - "conformsTo": "http://mlcommons.org/croissant/1.1", - "distribution": distribution, - "recordSet": record_set, - } - ) + + # Add GeoCroissant properties if dataset is geospatial + if is_geospatial: + # TODO: Extract geospatial metadata from user-provided metadata.json or dataset card + pass + + return _remove_none_values(output) def compute_croissant_crumbs_response(dataset: str) -> Mapping[str, Any]: diff --git a/services/worker/tests/job_runners/dataset/test_croissant_crumbs.py b/services/worker/tests/job_runners/dataset/test_croissant_crumbs.py index 2d20bb0c1..c3f4dee34 100644 --- a/services/worker/tests/job_runners/dataset/test_croissant_crumbs.py +++ b/services/worker/tests/job_runners/dataset/test_croissant_crumbs.py @@ -96,6 +96,17 @@ "source": "cr:source", "subField": "cr:subField", "transform": "cr:transform", + # GeoCroissant properties + "geocr": "http://mlcommons.org/croissant/geo/1.0", + "boundingBox": "geocr:boundingBox", + "geometry": "geocr:geometry", + "resolution": "geocr:resolution", + "crs": "geocr:crs", + "temporalExtent": "geocr:temporalExtent", + "spatialResolution": "geocr:spatialResolution", + "temporalResolution": "geocr:temporalResolution", + "label": "geocr:label", + "image": "geocr:image", } @@ -186,3 +197,11 @@ def test_get_croissant_crumbs_from_dataset_infos() -> None: assert "@id" in distribution if "containedIn" in distribution: assert "@id" in distribution["containedIn"] + + # Test that GeoCroissant context is included + assert "geocr" in croissant_crumbs["@context"] + assert croissant_crumbs["@context"]["geocr"] == "http://mlcommons.org/croissant/geo/1.0" + assert "boundingBox" in croissant_crumbs["@context"] + assert "crs" in croissant_crumbs["@context"] + assert "resolution" in croissant_crumbs["@context"] + assert "geometry" in croissant_crumbs["@context"] \ No newline at end of file