From 1d7be85ad2413e015b7015eb94cae271cc7977dd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 4 Dec 2025 00:33:15 +0000 Subject: [PATCH 1/9] Initial plan From 201b098059eb11aaa974c74bfcf4189c630f44b2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 4 Dec 2025 00:40:27 +0000 Subject: [PATCH 2/9] Add ecoCroissant specification, schema, and examples Co-authored-by: egrace479 <38985481+egrace479@users.noreply.github.com> --- LICENSE | 190 +++++++++++++ README.md | 140 +++++++++- docs/eco-spec.md | 513 ++++++++++++++++++++++++++++++++++ examples/treeoflife-200m.json | 283 +++++++++++++++++++ schema/eco-context.jsonld | 98 +++++++ 5 files changed, 1222 insertions(+), 2 deletions(-) create mode 100644 LICENSE create mode 100644 docs/eco-spec.md create mode 100644 examples/treeoflife-200m.json create mode 100644 schema/eco-context.jsonld diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..bfb6d93 --- /dev/null +++ b/LICENSE @@ -0,0 +1,190 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to the Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2024 Imageomics Institute + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index dc13b00..ac6d02b 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,138 @@ -# ecoCroissant -Repository for developing croissant-based biodiversity metadata schema following FAIR4AI principles. +# ecoCroissant 🥐🌿 + +**Croissant Extension for Biodiversity Metadata** + +[![Specification](https://img.shields.io/badge/spec-v1.0-green.svg)](docs/eco-spec.md) +[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE) + +## Overview + +ecoCroissant is an extension to the [Croissant format](https://github.com/mlcommons/croissant) designed to capture ecologically-relevant information from biodiversity datasets. It follows [FAIR4AI principles](https://www.nature.com/articles/s41597-022-01759-2) to ensure datasets are Findable, Accessible, Interoperable, and Reusable for AI/ML applications in ecological and biodiversity research. + +## Why ecoCroissant? 
+ +The standard Croissant format provides excellent support for ML-ready datasets, but biodiversity datasets have unique characteristics that require additional metadata: + +- **Taxonomic Information**: Species identification, taxonomic hierarchies, and nomenclature +- **Geographic Context**: Collection locations, habitats, elevation, and protected areas +- **Temporal Ecology**: Phenology, seasonality, and collection timelines +- **Ecological Relationships**: Trophic levels, species interactions, and ecological roles +- **Conservation Status**: IUCN categories, population trends, and threats +- **Data Quality**: Identification confidence, georeferencing accuracy, and sampling methods + +## Quick Start + +### Using ecoCroissant Properties + +Add the ecoCroissant context to your Croissant metadata: + +```json +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", "sc": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/", + "eco": "http://imageomics.org/ecoCroissant/", + "dct": "http://purl.org/dc/terms/" + }, + "@type": "sc:Dataset", + "name": "My Biodiversity Dataset", + "dct:conformsTo": [ + "http://mlcommons.org/croissant/1.0", + "http://imageomics.org/ecoCroissant/1.0" + ], + + "eco:taxon": "Lepidoptera", + "eco:taxonRank": "order", + "eco:habitat": ["tropical rainforest", "temperate forest"], + "eco:iucnStatus": "LC", + "eco:basisOfRecord": "PreservedSpecimen" +} +``` + +### Example Datasets + +See the [examples](examples/) directory for complete examples: + +- [TreeOfLife-200M](examples/treeoflife-200m.json) - Large-scale species image dataset + +## Documentation + +- **[ecoCroissant Specification](docs/eco-spec.md)** - Complete specification with property definitions +- **[JSON-LD Context](schema/eco-context.jsonld)** - JSON-LD context file for ecoCroissant + +## Property Categories + +### Taxonomic Properties +| Property | Description | +|----------|-------------| +| `eco:taxon` | Taxonomic name(s) of organisms | +| `eco:taxonRank` | Taxonomic rank 
(species, genus, family, etc.) | +| `eco:scientificName` | Full scientific name with authorship | +| `eco:taxonID` | Links to GBIF, NCBI, or other databases | +| `eco:higherClassification` | Full taxonomic hierarchy | + +### Geographic Properties +| Property | Description | +|----------|-------------| +| `eco:habitat` | Habitat type(s) | +| `eco:biome` | Major biome classification | +| `eco:locality` | Location description | +| `eco:protectedArea` | Protected areas where species occurs | + +### Conservation Properties +| Property | Description | +|----------|-------------| +| `eco:iucnStatus` | IUCN Red List category | +| `eco:populationTrend` | Population trend direction | +| `eco:threats` | Known threats to species | + +### Data Quality Properties +| Property | Description | +|----------|-------------| +| `eco:basisOfRecord` | Type of record (specimen, observation, etc.) | +| `eco:identificationVerificationStatus` | Verification level | +| `eco:samplingProtocol` | Data collection method | + +See the [full specification](docs/eco-spec.md) for all available properties. 
+ +## Integration with Standards + +ecoCroissant is designed to integrate with established biodiversity standards: + +- **[Darwin Core](https://dwc.tdwg.org/)** - Standard for biodiversity data sharing +- **[GBIF](https://www.gbif.org/)** - Global Biodiversity Information Facility +- **[IUCN Red List](https://www.iucnredlist.org/)** - Conservation status assessments +- **[Encyclopedia of Life](https://eol.org/)** - Species information aggregator + +## Related Resources + +- [Croissant Format](https://github.com/mlcommons/croissant) - Base ML dataset format +- [Croissant RAI Extension](https://github.com/mlcommons/croissant/blob/main/docs/croissant-rai-spec.md) - Responsible AI extension +- [TreeOfLife-200M Dataset](https://huggingface.co/datasets/imageomics/TreeOfLife-200M) - Example dataset using ecoCroissant +- [Imageomics Institute](https://imageomics.org/) - Advancing biological knowledge through images + +## Contributing + +We welcome contributions! Please see our contributing guidelines for more information. + +## License + +This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. + +## Citation + +If you use ecoCroissant in your research, please cite: + +```bibtex +@misc{ecoCroissant2024, + title={ecoCroissant: A Croissant Extension for Biodiversity Metadata}, + author={Imageomics Institute}, + year={2024}, + url={https://github.com/Imageomics/ecoCroissant} +} +``` + +## Acknowledgments + +This work builds upon the [Croissant format](https://github.com/mlcommons/croissant) developed by the MLCommons Datasets Working Group. We thank the biodiversity informatics community for their contributions to standards like Darwin Core that inform this work. 
diff --git a/docs/eco-spec.md b/docs/eco-spec.md new file mode 100644 index 0000000..1ef9317 --- /dev/null +++ b/docs/eco-spec.md @@ -0,0 +1,513 @@ +# ecoCroissant Specification + +## Croissant Extension for Biodiversity Metadata + +Version 1.0 + + + +## Introduction + +ecoCroissant is an extension to the [Croissant format](http://mlcommons.org/croissant/1.0) designed to capture ecologically-relevant information from biodiversity datasets. It follows [FAIR4AI principles](https://www.nature.com/articles/s41597-022-01759-2) to ensure datasets are Findable, Accessible, Interoperable, and Reusable for AI/ML applications in ecological and biodiversity research. + +Biodiversity datasets contain unique characteristics that are not adequately captured by the base Croissant format, including: + +- **Taxonomic information**: Species identification, taxonomic hierarchy, and nomenclature +- **Geographic and temporal context**: Collection locations, habitats, and temporal coverage +- **Ecological relationships**: Trophic levels, species interactions, and ecological roles +- **Collection methodology**: Observation methods, specimen handling, and data quality indicators +- **Conservation context**: IUCN status, protected areas, and population data + +The ecoCroissant extension addresses these needs by providing a standardized vocabulary for documenting ecological and biodiversity metadata in ML-ready datasets. + +## Prerequisites + +The ecoCroissant vocabulary builds on the [schema.org/Dataset](http://schema.org/Dataset) vocabulary and the [Croissant core vocabulary](http://mlcommons.org/croissant/1.0). + +### Namespace + +The ecoCroissant vocabulary is defined in its own namespace, identified by the IRI: + +``` +http://imageomics.org/ecoCroissant/ +``` + +We abbreviate this namespace IRI using the prefix `eco`. 
+ +### Related Vocabularies + +ecoCroissant integrates with established biodiversity standards: + +| Prefix | IRI | Description | +|--------|-----|-------------| +| sc | https://schema.org/ | The schema.org namespace | +| cr | http://mlcommons.org/croissant/ | MLCommons Croissant namespace | +| dwc | http://rs.tdwg.org/dwc/terms/ | Darwin Core terms | +| gbif | https://www.gbif.org/species/ | GBIF Species API | +| ncbi | https://www.ncbi.nlm.nih.gov/taxonomy/ | NCBI Taxonomy | +| eol | https://eol.org/pages/ | Encyclopedia of Life | +| iucn | https://www.iucnredlist.org/ | IUCN Red List | + +### Conformance + +ecoCroissant datasets must declare conformance to this specification: + +```json +"dct:conformsTo": "http://imageomics.org/ecoCroissant/1.0" +``` + +## Use Cases + +### Use Case 1: Taxonomic Discovery and Classification + +ML models for species identification require rich taxonomic context. ecoCroissant enables: + +- **Hierarchical taxonomy**: Complete taxonomic lineage from kingdom to subspecies +- **Taxonomic identifiers**: Links to authoritative databases (GBIF, NCBI, EOL) +- **Nomenclature history**: Synonyms, basionyms, and taxonomic revisions +- **Vernacular names**: Common names across languages and regions + +### Use Case 2: Geographic and Habitat Context + +Ecological datasets require spatial and habitat information: + +- **Geolocation**: Coordinates with precision and datum information +- **Habitat classification**: Biome, ecosystem, and microhabitat descriptions +- **Elevation and depth**: Altitude/depth ranges for species occurrences +- **Protected areas**: National parks, reserves, and conservation zones + +### Use Case 3: Temporal Ecology + +Understanding temporal patterns in biodiversity data: + +- **Seasonality**: Phenological timing, migration patterns +- **Collection timeline**: When observations or specimens were collected +- **Historical context**: Changes in distribution or abundance over time + +### Use Case 4: Species Interactions and 
Ecology + +Capturing ecological relationships: + +- **Trophic relationships**: Predator-prey, herbivore-plant interactions +- **Symbiotic relationships**: Mutualism, parasitism, commensalism +- **Pollination and dispersal**: Plant-animal interactions +- **Ecological roles**: Keystone species, ecosystem engineers + +### Use Case 5: Conservation and Population Status + +Conservation-relevant metadata: + +- **IUCN Red List status**: Global and regional threat assessments +- **Population trends**: Increasing, stable, decreasing +- **Threats**: Habitat loss, climate change, invasive species +- **Protection status**: Legal protection levels + +### Use Case 6: Data Quality and Provenance + +Ensuring data reliability for ML applications: + +- **Identification confidence**: Expert-verified vs. citizen science observations +- **Data collection method**: Field observation, museum specimen, remote sensing +- **Georeferencing quality**: GPS accuracy, geocoding method +- **Temporal precision**: Exact date vs. date range + +## ecoCroissant Properties + +### Taxonomic Properties + +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| eco:taxon | sc:Taxon or sc:Text | MANY | The taxonomic name(s) of organisms in the dataset | +| eco:taxonRank | sc:Text | ONE | The taxonomic rank (e.g., species, genus, family) | +| eco:scientificName | sc:Text | ONE | The full scientific name including authorship | +| eco:taxonID | sc:URL | MANY | Identifier(s) from taxonomic databases (GBIF, NCBI, etc.) | +| eco:higherClassification | sc:Text | ONE | Full taxonomic hierarchy (Kingdom > Phylum > Class > Order > Family > Genus > Species) | +| eco:vernacularName | sc:Text | MANY | Common name(s) in various languages | +| eco:taxonomicStatus | sc:Text | ONE | Status of the taxon name (accepted, synonym, etc.) 
| + +### Geographic Properties + +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| eco:locality | sc:Text | ONE | Description of the location | +| eco:habitat | sc:Text | MANY | Habitat type(s) where organisms occur | +| eco:biome | sc:Text | ONE | Major biome classification | +| eco:continent | sc:Text | MANY | Continent(s) of occurrence | +| eco:country | sc:Text | MANY | Country/countries of occurrence | +| eco:coordinateUncertaintyInMeters | sc:Number | ONE | Uncertainty radius for coordinates | +| eco:minimumElevationInMeters | sc:Number | ONE | Minimum elevation of occurrences | +| eco:maximumElevationInMeters | sc:Number | ONE | Maximum elevation of occurrences | +| eco:minimumDepthInMeters | sc:Number | ONE | Minimum depth (for aquatic organisms) | +| eco:maximumDepthInMeters | sc:Number | ONE | Maximum depth (for aquatic organisms) | + +### Temporal Properties + +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| eco:eventDate | sc:Date or sc:DateTime | MANY | Date(s) when data was collected | +| eco:eventDateStart | sc:Date | ONE | Start of collection period | +| eco:eventDateEnd | sc:Date | ONE | End of collection period | +| eco:seasonality | sc:Text | MANY | Seasonal patterns in the data | +| eco:lifeStage | sc:Text | MANY | Life stage(s) represented (egg, larva, adult, etc.) | + +### Ecological Properties + +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| eco:trophicLevel | sc:Text | ONE | Position in food chain (producer, primary consumer, etc.) | +| eco:ecologicalRole | sc:Text | MANY | Ecological function (pollinator, predator, decomposer, etc.) 
| +| eco:speciesInteractions | sc:Text | MANY | Description of species interactions in the dataset | +| eco:diet | sc:Text | MANY | Diet composition for animals | +| eco:hostOrganism | sc:Text | MANY | Host species (for parasites, symbionts) | + +### Conservation Properties + +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| eco:iucnStatus | sc:Text | ONE | IUCN Red List category (LC, NT, VU, EN, CR, EW, EX) | +| eco:iucnStatusSource | sc:URL | ONE | Link to IUCN assessment | +| eco:populationTrend | sc:Text | ONE | Population trend (increasing, stable, decreasing, unknown) | +| eco:threats | sc:Text | MANY | Known threats to the species | +| eco:conservationActions | sc:Text | MANY | Conservation actions in place or recommended | +| eco:protectedArea | sc:Text | MANY | Protected areas where species occurs | + +### Data Quality Properties + +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| eco:identificationVerificationStatus | sc:Text | ONE | Verification level of taxonomic identifications | +| eco:identifiedBy | sc:Text | MANY | Who identified the specimens/observations | +| eco:samplingProtocol | sc:Text | ONE | Method used to collect data | +| eco:dataGeneralizations | sc:Text | ONE | Any data generalizations applied (e.g., coordinate obscuring) | +| eco:informationWithheld | sc:Text | ONE | Information intentionally withheld (e.g., for endangered species) | +| eco:basisOfRecord | sc:Text | MANY | Type(s) of record (PreservedSpecimen, HumanObservation, MachineObservation, etc.) | + +### Image and Observation Properties + +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| eco:imageLicense | sc:URL | ONE | License for images in the dataset | +| eco:imageType | sc:Text | MANY | Type of images (photograph, illustration, microscopy, etc.) 
| +| eco:viewAngle | sc:Text | MANY | View angle of specimens in images (dorsal, ventral, lateral, etc.) | +| eco:anatomicalFeatures | sc:Text | MANY | Anatomical features visible or annotated | +| eco:phenotype | sc:Text | MANY | Observable phenotypic characteristics | + +## JSON-LD Context + +The recommended JSON-LD context for ecoCroissant: + +```json +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "sc": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/", + "eco": "http://imageomics.org/ecoCroissant/", + "dwc": "http://rs.tdwg.org/dwc/terms/", + "dct": "http://purl.org/dc/terms/", + + "taxon": "eco:taxon", + "taxonRank": "eco:taxonRank", + "scientificName": "eco:scientificName", + "taxonID": "eco:taxonID", + "higherClassification": "eco:higherClassification", + "vernacularName": "eco:vernacularName", + "taxonomicStatus": "eco:taxonomicStatus", + + "locality": "eco:locality", + "habitat": "eco:habitat", + "biome": "eco:biome", + "continent": "eco:continent", "country": "eco:country", + "coordinateUncertaintyInMeters": "eco:coordinateUncertaintyInMeters", + "minimumElevationInMeters": "eco:minimumElevationInMeters", + "maximumElevationInMeters": "eco:maximumElevationInMeters", + "minimumDepthInMeters": "eco:minimumDepthInMeters", + "maximumDepthInMeters": "eco:maximumDepthInMeters", + + "eventDate": "eco:eventDate", + "eventDateStart": "eco:eventDateStart", + "eventDateEnd": "eco:eventDateEnd", + "seasonality": "eco:seasonality", + "lifeStage": "eco:lifeStage", + + "trophicLevel": "eco:trophicLevel", + "ecologicalRole": "eco:ecologicalRole", + "speciesInteractions": "eco:speciesInteractions", + "diet": "eco:diet", + "hostOrganism": "eco:hostOrganism", + + "iucnStatus": "eco:iucnStatus", + "iucnStatusSource": "eco:iucnStatusSource", + "populationTrend": "eco:populationTrend", + "threats": "eco:threats", + "conservationActions": "eco:conservationActions", + "protectedArea": "eco:protectedArea", + + "identificationVerificationStatus": 
"eco:identificationVerificationStatus", + "identifiedBy": "eco:identifiedBy", + "samplingProtocol": "eco:samplingProtocol", + "dataGeneralizations": "eco:dataGeneralizations", + "informationWithheld": "eco:informationWithheld", + "basisOfRecord": "eco:basisOfRecord", + + "imageLicense": "eco:imageLicense", + "imageType": "eco:imageType", + "viewAngle": "eco:viewAngle", + "anatomicalFeatures": "eco:anatomicalFeatures", + "phenotype": "eco:phenotype" + } +} +``` + +## Examples + +### Example 1: Species Image Dataset (TreeOfLife-200M style) + +```json +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/", + "eco": "http://imageomics.org/ecoCroissant/", + "dct": "http://purl.org/dc/terms/" + }, + "@type": "sc:Dataset", + "name": "TreeOfLife-200M", + "description": "A large-scale dataset of 200 million images spanning the tree of life, designed for training species identification models.", + "license": "https://creativecommons.org/licenses/by-nc-sa/4.0/", + "url": "https://huggingface.co/datasets/imageomics/TreeOfLife-200M", + "dct:conformsTo": [ + "http://mlcommons.org/croissant/1.0", + "http://imageomics.org/ecoCroissant/1.0" + ], + + "eco:taxon": ["Animalia", "Plantae", "Fungi"], + "eco:taxonRank": "kingdom", + "eco:higherClassification": "Life > Eukaryota > Multiple Kingdoms", + + "eco:habitat": ["terrestrial", "freshwater", "marine"], + "eco:continent": ["Africa", "Antarctica", "Asia", "Europe", "North America", "Oceania", "South America"], + + "eco:basisOfRecord": ["HumanObservation", "PreservedSpecimen", "MachineObservation"], + "eco:imageType": ["photograph", "museum specimen"], + "eco:identificationVerificationStatus": "mixed - includes expert-verified and community-validated observations", + + "eco:samplingProtocol": "Images collected from multiple sources including iNaturalist, museum collections, and research projects", + + "distribution": [ + { + "@type": "cr:FileObject", + "@id": 
"images.tar.gz", + "contentUrl": "https://huggingface.co/datasets/imageomics/TreeOfLife-200M/resolve/main/images.tar.gz", + "encodingFormat": "application/gzip" + } + ], + + "recordSet": [ + { + "@type": "cr:RecordSet", + "@id": "species_images", + "field": [ + { + "@type": "cr:Field", + "@id": "species_images/image", + "dataType": "sc:ImageObject" + }, + { + "@type": "cr:Field", + "@id": "species_images/scientific_name", + "description": "Scientific name of the species", + "dataType": "sc:Text" + }, + { + "@type": "cr:Field", + "@id": "species_images/taxon_id", + "description": "GBIF taxon identifier", + "dataType": "sc:URL" + }, + { + "@type": "cr:Field", + "@id": "species_images/kingdom", + "description": "Taxonomic kingdom", + "dataType": "sc:Text" + } + ] + } + ] +} +``` + +### Example 2: Butterfly Specimen Dataset + +```json +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/", + "eco": "http://imageomics.org/ecoCroissant/", + "dct": "http://purl.org/dc/terms/" + }, + "@type": "sc:Dataset", + "name": "Heliconius Butterfly Wing Pattern Dataset", + "description": "High-resolution images of Heliconius butterfly specimens with wing pattern annotations for studying mimicry and adaptation.", + "license": "https://creativecommons.org/licenses/by/4.0/", + "dct:conformsTo": [ + "http://mlcommons.org/croissant/1.0", + "http://imageomics.org/ecoCroissant/1.0" + ], + + "eco:taxon": "Heliconius", + "eco:taxonRank": "genus", + "eco:scientificName": "Heliconius Kluk, 1780", + "eco:taxonID": ["https://www.gbif.org/species/1932585"], + "eco:higherClassification": "Animalia > Arthropoda > Insecta > Lepidoptera > Nymphalidae > Heliconiinae > Heliconius", + "eco:vernacularName": ["Longwing butterflies", "Heliconius butterflies"], + + "eco:habitat": ["tropical rainforest", "forest edge", "secondary forest"], + "eco:biome": "tropical moist broadleaf forest", + "eco:continent": ["South America", "Central America"], + 
"eco:country": ["Ecuador", "Peru", "Colombia", "Panama", "Costa Rica"], + "eco:minimumElevationInMeters": 0, + "eco:maximumElevationInMeters": 2000, + + "eco:lifeStage": ["adult"], + "eco:trophicLevel": "primary consumer", + "eco:ecologicalRole": ["pollinator", "Müllerian mimic"], + "eco:diet": ["pollen", "nectar"], + "eco:hostOrganism": ["Passiflora (host plant for larvae)"], + "eco:speciesInteractions": "Müllerian mimicry complex with other Heliconius species; larvae feed exclusively on Passiflora plants", + + "eco:iucnStatus": "LC", + "eco:populationTrend": "stable", + + "eco:basisOfRecord": "PreservedSpecimen", + "eco:identificationVerificationStatus": "expert-verified", + "eco:identifiedBy": ["Museum taxonomists", "Heliconius specialists"], + "eco:samplingProtocol": "Museum specimens imaged with standardized dorsal and ventral views", + + "eco:imageType": ["museum specimen photograph"], + "eco:viewAngle": ["dorsal", "ventral"], + "eco:anatomicalFeatures": ["forewing", "hindwing", "wing pattern"], + "eco:phenotype": "wing color pattern" +} +``` + +### Example 3: Camera Trap Dataset + +```json +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", "sc": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/", + "eco": "http://imageomics.org/ecoCroissant/", + "dct": "http://purl.org/dc/terms/" + }, + "@type": "sc:Dataset", + "name": "Amazon Rainforest Camera Trap Survey", + "description": "Camera trap images of mammals from the Amazon rainforest for biodiversity monitoring and species identification.", + "license": "https://creativecommons.org/licenses/by-nc/4.0/", + "dct:conformsTo": [ + "http://mlcommons.org/croissant/1.0", + "http://imageomics.org/ecoCroissant/1.0" + ], + + "eco:taxon": "Mammalia", + "eco:taxonRank": "class", + "eco:higherClassification": "Animalia > Chordata > Mammalia", + + "eco:locality": "Yasuní National Park, Ecuador", + "eco:habitat": ["lowland tropical rainforest", "terra firme forest", "várzea forest"], + "eco:biome": "tropical moist 
broadleaf forest", + "eco:continent": "South America", + "eco:country": "Ecuador", + "eco:coordinateUncertaintyInMeters": 10, + "eco:minimumElevationInMeters": 200, + "eco:maximumElevationInMeters": 400, + + "eco:eventDateStart": "2020-01-01", + "eco:eventDateEnd": "2022-12-31", + "eco:seasonality": ["wet season", "dry season"], + + "eco:protectedArea": "Yasuní National Park", + "eco:threats": ["habitat fragmentation", "oil extraction", "hunting"], + + "eco:basisOfRecord": "MachineObservation", + "eco:identificationVerificationStatus": "expert-verified with AI-assisted pre-classification", + "eco:samplingProtocol": "Camera traps deployed at 1 km intervals, active 24/7", + "eco:dataGeneralizations": "Exact coordinates obscured for sensitive species locations", + "eco:informationWithheld": "Precise locations of endangered species nesting sites withheld", + + "eco:imageType": ["camera trap photograph"], + "eco:imageLicense": "https://creativecommons.org/licenses/by-nc/4.0/" +} +``` + +## Alignment with Darwin Core + +ecoCroissant properties are designed to be compatible with [Darwin Core](https://dwc.tdwg.org/) terms where applicable. 
The following table shows the mapping: + +| ecoCroissant Property | Darwin Core Term | +|----------------------|------------------| +| eco:taxon | dwc:scientificName | +| eco:taxonRank | dwc:taxonRank | +| eco:higherClassification | dwc:higherClassification | +| eco:vernacularName | dwc:vernacularName | +| eco:locality | dwc:locality | +| eco:habitat | dwc:habitat | +| eco:continent | dwc:continent | +| eco:country | dwc:country | +| eco:coordinateUncertaintyInMeters | dwc:coordinateUncertaintyInMeters | +| eco:eventDate | dwc:eventDate | +| eco:lifeStage | dwc:lifeStage | +| eco:basisOfRecord | dwc:basisOfRecord | +| eco:identifiedBy | dwc:identifiedBy | +| eco:samplingProtocol | dwc:samplingProtocol | + +## Integration with FAIR4AI Principles + +ecoCroissant supports FAIR4AI principles: + +### Findable +- Standardized metadata fields enable discovery across repositories +- Links to authoritative taxonomic databases (GBIF, NCBI, EOL) +- Rich keyword and classification support + +### Accessible +- Clear licensing information for data and images +- Information about data access restrictions or withheld information +- Links to data sources and repositories + +### Interoperable +- JSON-LD format compatible with Croissant ecosystem +- Alignment with Darwin Core for biodiversity data exchange +- Integration with schema.org for web discoverability + +### Reusable +- Detailed provenance and methodology documentation +- Data quality indicators and verification status +- Conservation context for ethical use considerations + +## References + +1. [Croissant: A Metadata Format for ML-Ready Datasets](https://doi.org/10.1145/3650203.3663326) +2. [Darwin Core Standard](https://dwc.tdwg.org/) +3. [GBIF Data Quality](https://www.gbif.org/data-quality-requirements) +4. [FAIR4AI Principles](https://www.nature.com/articles/s41597-022-01759-2) +5. [IUCN Red List Categories and Criteria](https://www.iucnredlist.org/resources/categories-and-criteria) +6. 
[Encyclopedia of Life](https://eol.org/) + +## License + +This specification is released under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0). + +## Contributors + +- Imageomics Institute + +## Acknowledgments + +This work builds upon the [Croissant format](https://github.com/mlcommons/croissant) developed by the MLCommons Datasets Working Group. diff --git a/examples/treeoflife-200m.json b/examples/treeoflife-200m.json new file mode 100644 index 0000000..54bde31 --- /dev/null +++ b/examples/treeoflife-200m.json @@ -0,0 +1,283 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "sc": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/", + "eco": "http://imageomics.org/ecoCroissant/", + "dct": "http://purl.org/dc/terms/" + }, + "@type": "sc:Dataset", + "name": "TreeOfLife-200M", + "description": "Tree of Life 200M (ToL-200M) is a comprehensive phylogenetically-guided image dataset containing approximately 200 million images from iNaturalist, spanning nearly all known species across the tree of life. 
The dataset is designed for training and evaluating computer vision models for species identification and biodiversity research.", + "license": "https://creativecommons.org/licenses/by-nc-sa/4.0/", + "url": "https://huggingface.co/datasets/imageomics/TreeOfLife-200M", + "creator": { + "@type": "Organization", + "name": "Imageomics Institute", + "url": "https://imageomics.org/" + }, + "datePublished": "2024", + "keywords": [ + "species identification", + "biodiversity", + "computer vision", + "iNaturalist", + "phylogeny", + "tree of life", + "image classification" + ], + "dct:conformsTo": [ + "http://mlcommons.org/croissant/1.0", + "http://imageomics.org/ecoCroissant/1.0" + ], + + "eco:taxon": [ + "Animalia", + "Plantae", + "Fungi", + "Chromista", + "Protozoa", + "Bacteria", + "Archaea" + ], + "eco:taxonRank": "kingdom", + "eco:higherClassification": "All domains of life represented", + "eco:taxonomicStatus": "accepted names from iNaturalist taxonomy", + + "eco:habitat": [ + "terrestrial", + "freshwater", + "marine", + "urban", + "agricultural", + "forest", + "grassland", + "desert", + "wetland", + "alpine" + ], + "eco:biome": "all major biomes represented", + "eco:continent": [ + "Africa", + "Antarctica", + "Asia", + "Europe", + "North America", + "Oceania", + "South America" + ], + + "eco:basisOfRecord": [ + "HumanObservation" + ], + "eco:imageType": [ + "photograph" + ], + "eco:identificationVerificationStatus": "Research Grade observations from iNaturalist (community consensus with at least 2/3 agreement)", + "eco:identifiedBy": [ + "iNaturalist community", + "Subject matter experts" + ], + "eco:samplingProtocol": "Community science observations from iNaturalist platform", + "eco:dataGeneralizations": "Coordinates obscured for sensitive species (geoprivacy settings applied by observers)", + + "eco:lifeStage": [ + "egg", + "larva", + "juvenile", + "adult", + "various" + ], + + "eco:trophicLevel": "multiple - dataset spans producers, consumers, and decomposers", + 
"eco:ecologicalRole": [ + "primary producer", + "herbivore", + "carnivore", + "omnivore", + "decomposer", + "pollinator", + "parasite", + "symbiont" + ], + + "eco:iucnStatus": "multiple - includes species across all IUCN categories", + "eco:populationTrend": "multiple - includes increasing, stable, and decreasing populations", + + "eco:imageLicense": "https://creativecommons.org/licenses/by-nc/4.0/", + "eco:viewAngle": [ + "various" + ], + "eco:anatomicalFeatures": [ + "whole organism", + "diagnostic features", + "various body parts" + ], + "eco:phenotype": "natural variation in appearance, coloration, and morphology", + + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "metadata.parquet", + "name": "metadata.parquet", + "description": "Parquet file containing image metadata including species labels, taxonomic hierarchy, and observation details", + "contentUrl": "https://huggingface.co/datasets/imageomics/TreeOfLife-200M/resolve/main/metadata.parquet", + "encodingFormat": "application/x-parquet" + } + ], + + "recordSet": [ + { + "@type": "cr:RecordSet", + "@id": "species_observations", + "name": "Species Observations", + "description": "Records of species observations with associated images and taxonomic metadata", + "field": [ + { + "@type": "cr:Field", + "@id": "species_observations/image_id", + "name": "image_id", + "description": "Unique identifier for the image", + "dataType": "sc:Text" + }, + { + "@type": "cr:Field", + "@id": "species_observations/image_url", + "name": "image_url", + "description": "URL to the image file", + "dataType": "sc:URL" + }, + { + "@type": "cr:Field", + "@id": "species_observations/scientific_name", + "name": "scientific_name", + "description": "Scientific name of the observed species", + "dataType": "sc:Text" + }, + { + "@type": "cr:Field", + "@id": "species_observations/common_name", + "name": "common_name", + "description": "Common name of the observed species", + "dataType": "sc:Text" + }, + { + "@type": "cr:Field", + 
"@id": "species_observations/taxon_id", + "name": "taxon_id", + "description": "iNaturalist taxon identifier", + "dataType": "sc:Integer" + }, + { + "@type": "cr:Field", + "@id": "species_observations/kingdom", + "name": "kingdom", + "description": "Taxonomic kingdom", + "dataType": "sc:Text" + }, + { + "@type": "cr:Field", + "@id": "species_observations/phylum", + "name": "phylum", + "description": "Taxonomic phylum", + "dataType": "sc:Text" + }, + { + "@type": "cr:Field", + "@id": "species_observations/class", + "name": "class", + "description": "Taxonomic class", + "dataType": "sc:Text" + }, + { + "@type": "cr:Field", + "@id": "species_observations/order", + "name": "order", + "description": "Taxonomic order", + "dataType": "sc:Text" + }, + { + "@type": "cr:Field", + "@id": "species_observations/family", + "name": "family", + "description": "Taxonomic family", + "dataType": "sc:Text" + }, + { + "@type": "cr:Field", + "@id": "species_observations/genus", + "name": "genus", + "description": "Taxonomic genus", + "dataType": "sc:Text" + }, + { + "@type": "cr:Field", + "@id": "species_observations/latitude", + "name": "latitude", + "description": "Latitude of observation (may be obscured for sensitive species)", + "dataType": "sc:Float" + }, + { + "@type": "cr:Field", + "@id": "species_observations/longitude", + "name": "longitude", + "description": "Longitude of observation (may be obscured for sensitive species)", + "dataType": "sc:Float" + }, + { + "@type": "cr:Field", + "@id": "species_observations/observed_on", + "name": "observed_on", + "description": "Date when the observation was made", + "dataType": "sc:Date" + }, + { + "@type": "cr:Field", + "@id": "species_observations/quality_grade", + "name": "quality_grade", + "description": "iNaturalist quality grade (research, needs_id, casual)", + "dataType": "sc:Text" + } + ] + }, + { + "@type": "cr:RecordSet", + "@id": "taxonomic_hierarchy", + "name": "Taxonomic Hierarchy", + "description": "Complete taxonomic 
classification for species in the dataset", + "dataType": "sc:Enumeration", + "key": {"@id": "taxonomic_hierarchy/taxon_id"}, + "field": [ + { + "@type": "cr:Field", + "@id": "taxonomic_hierarchy/taxon_id", + "name": "taxon_id", + "description": "Unique taxon identifier", + "dataType": "sc:Integer" + }, + { + "@type": "cr:Field", + "@id": "taxonomic_hierarchy/scientific_name", + "name": "scientific_name", + "description": "Scientific name", + "dataType": "sc:Text" + }, + { + "@type": "cr:Field", + "@id": "taxonomic_hierarchy/rank", + "name": "rank", + "description": "Taxonomic rank", + "dataType": "sc:Text" + }, + { + "@type": "cr:Field", + "@id": "taxonomic_hierarchy/parent_taxon_id", + "name": "parent_taxon_id", + "description": "Parent taxon identifier for hierarchical relationships", + "dataType": "sc:Integer" + } + ] + } + ] +} diff --git a/schema/eco-context.jsonld b/schema/eco-context.jsonld new file mode 100644 index 0000000..e6668f4 --- /dev/null +++ b/schema/eco-context.jsonld @@ -0,0 +1,98 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "sc": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/", + "eco": "http://imageomics.org/ecoCroissant/", + "dwc": "http://rs.tdwg.org/dwc/terms/", + "dct": "http://purl.org/dc/terms/", + + "taxon": "eco:taxon", + "taxonRank": "eco:taxonRank", + "scientificName": "eco:scientificName", + "taxonID": "eco:taxonID", + "higherClassification": "eco:higherClassification", + "vernacularName": "eco:vernacularName", + "taxonomicStatus": "eco:taxonomicStatus", + + "locality": "eco:locality", + "habitat": "eco:habitat", + "biome": "eco:biome", + "continent": "eco:continent", + "coordinateUncertaintyInMeters": "eco:coordinateUncertaintyInMeters", + "minimumElevationInMeters": "eco:minimumElevationInMeters", + "maximumElevationInMeters": "eco:maximumElevationInMeters", + "minimumDepthInMeters": "eco:minimumDepthInMeters", + "maximumDepthInMeters": "eco:maximumDepthInMeters", + + 
"eventDate": "eco:eventDate", + "eventDateStart": "eco:eventDateStart", + "eventDateEnd": "eco:eventDateEnd", + "seasonality": "eco:seasonality", + "lifeStage": "eco:lifeStage", + + "trophicLevel": "eco:trophicLevel", + "ecologicalRole": "eco:ecologicalRole", + "speciesInteractions": "eco:speciesInteractions", + "diet": "eco:diet", + "hostOrganism": "eco:hostOrganism", + + "iucnStatus": "eco:iucnStatus", + "iucnStatusSource": "eco:iucnStatusSource", + "populationTrend": "eco:populationTrend", + "threats": "eco:threats", + "conservationActions": "eco:conservationActions", + "protectedArea": "eco:protectedArea", + + "identificationVerificationStatus": "eco:identificationVerificationStatus", + "identifiedBy": "eco:identifiedBy", + "samplingProtocol": "eco:samplingProtocol", + "dataGeneralizations": "eco:dataGeneralizations", + "informationWithheld": "eco:informationWithheld", + "basisOfRecord": "eco:basisOfRecord", + + "imageLicense": "eco:imageLicense", + "imageType": "eco:imageType", + "viewAngle": "eco:viewAngle", + "anatomicalFeatures": "eco:anatomicalFeatures", + "phenotype": "eco:phenotype", + + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform" + } +} From 
05b7998c950d8dcf16e11e95a63a322eea0d717d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 4 Dec 2025 00:42:03 +0000 Subject: [PATCH 3/9] Add missing country property to JSON-LD context Co-authored-by: egrace479 <38985481+egrace479@users.noreply.github.com> --- schema/eco-context.jsonld | 1 + 1 file changed, 1 insertion(+) diff --git a/schema/eco-context.jsonld b/schema/eco-context.jsonld index e6668f4..fe139a2 100644 --- a/schema/eco-context.jsonld +++ b/schema/eco-context.jsonld @@ -20,6 +20,7 @@ "habitat": "eco:habitat", "biome": "eco:biome", "continent": "eco:continent", + "country": "eco:country", "coordinateUncertaintyInMeters": "eco:coordinateUncertaintyInMeters", "minimumElevationInMeters": "eco:minimumElevationInMeters", "maximumElevationInMeters": "eco:maximumElevationInMeters", From c679a0fa3b8bf4d76c40b08749be3d82c445f49d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 17 Dec 2025 18:12:44 +0000 Subject: [PATCH 4/9] Restructure to use Darwin Core directly and add FAIR4AI features - Use Darwin Core terms directly instead of redefining as eco: prefixed terms - Add AI-specific properties for data distribution, preprocessing, splits - Add model provenance properties for AI-generated data - Add API/streaming properties for scalable data access - Add recordType property to distinguish occurrence vs image-based data - Update examples to demonstrate Darwin Core + AI-ready metadata - Update README to clarify Darwin Core integration and FAIR4AI requirements Co-authored-by: egrace479 <38985481+egrace479@users.noreply.github.com> --- README.md | 115 +++++---- docs/eco-spec.md | 472 +++++++++++++++++++++++----------- examples/treeoflife-200m.json | 295 ++++++++------------- schema/eco-context.jsonld | 92 ++++--- 4 files changed, 556 insertions(+), 418 deletions(-) diff --git a/README.md b/README.md index ac6d02b..ab2a0c3 100644 --- 
a/README.md +++ b/README.md @@ -7,24 +7,31 @@ ## Overview -ecoCroissant is an extension to the [Croissant format](https://github.com/mlcommons/croissant) designed to capture ecologically-relevant information from biodiversity datasets. It follows [FAIR4AI principles](https://www.nature.com/articles/s41597-022-01759-2) to ensure datasets are Findable, Accessible, Interoperable, and Reusable for AI/ML applications in ecological and biodiversity research. +ecoCroissant makes biodiversity datasets **AI-ready** by integrating [Darwin Core](https://dwc.tdwg.org/) terms with [FAIR4AI](https://www.nature.com/articles/s41597-022-01759-2) requirements. Rather than redefining existing standards, ecoCroissant uses Darwin Core terms directly and adds AI-specific metadata for machine learning applications. -## Why ecoCroissant? +### FAIR4AI Requirements -The standard Croissant format provides excellent support for ML-ready datasets, but biodiversity datasets have unique characteristics that require additional metadata: +ecoCroissant addresses the three key FAIR4AI requirements: -- **Taxonomic Information**: Species identification, taxonomic hierarchies, and nomenclature -- **Geographic Context**: Collection locations, habitats, elevation, and protected areas -- **Temporal Ecology**: Phenology, seasonality, and collection timelines -- **Ecological Relationships**: Trophic levels, species interactions, and ecological roles -- **Conservation Status**: IUCN categories, population trends, and threats -- **Data Quality**: Identification confidence, georeferencing accuracy, and sampling methods +1. **Queryable Metadata**: Data/metadata can be queried without downloading large files (via Parquet/queryable formats) +2. **Ontology Integration**: Darwin Core terms are queryable with synonyms from GBIF, NCBI, EOL +3. 
**Content/Context Extraction**: Clear distinction between occurrence-based and image-based records + +### AI-Ready Features + +ecoCroissant ensures datasets are AI-ready by documenting: + +- **Data Distribution**: Class distributions, long-tail characteristics, stratification details +- **Preprocessing Pipeline**: Standardization methods, augmentation, normalization +- **Train/Val/Test Splits**: Rationale, stratification variables, split proportions +- **Model Provenance**: For AI-generated annotations or labels +- **Streaming Support**: API endpoints and rate limits for scalable data access ## Quick Start -### Using ecoCroissant Properties +### Using Darwin Core with AI-Ready Metadata -Add the ecoCroissant context to your Croissant metadata: +ecoCroissant uses Darwin Core terms directly, adding AI-specific properties: ```json { @@ -33,6 +40,7 @@ Add the ecoCroissant context to your Croissant metadata: "@vocab": "https://schema.org/", "cr": "http://mlcommons.org/croissant/", "eco": "http://imageomics.org/ecoCroissant/", + "dwc": "http://rs.tdwg.org/dwc/terms/", "dct": "http://purl.org/dc/terms/" }, "@type": "sc:Dataset", @@ -42,11 +50,15 @@ Add the ecoCroissant context to your Croissant metadata: "http://imageomics.org/ecoCroissant/1.0" ], - "eco:taxon": "Lepidoptera", - "eco:taxonRank": "order", - "eco:habitat": ["tropical rainforest", "temperate forest"], - "eco:iucnStatus": "LC", - "eco:basisOfRecord": "PreservedSpecimen" + "dwc:scientificName": "Lepidoptera", + "dwc:taxonRank": "order", + "dwc:habitat": ["tropical rainforest", "temperate forest"], + "dwc:basisOfRecord": "PreservedSpecimen", + + "eco:recordType": "image-based", + "eco:dataDistribution": "long-tailed: 5K species, 10-1000 images each", + "eco:trainTestSplit": "80/10/10 stratified by family", + "eco:preprocessingSteps": ["resized to 224x224", "ImageNet normalization"] } ``` @@ -54,7 +66,7 @@ Add the ecoCroissant context to your Croissant metadata: See the [examples](examples/) directory for 
complete examples: -- [TreeOfLife-200M](examples/treeoflife-200m.json) - Large-scale species image dataset +- [TreeOfLife-200M](examples/treeoflife-200m.json) - AI-ready species image dataset with 200M images ## Documentation @@ -63,38 +75,43 @@ See the [examples](examples/) directory for complete examples: ## Property Categories -### Taxonomic Properties -| Property | Description | -|----------|-------------| -| `eco:taxon` | Taxonomic name(s) of organisms | -| `eco:taxonRank` | Taxonomic rank (species, genus, family, etc.) | -| `eco:scientificName` | Full scientific name with authorship | -| `eco:taxonID` | Links to GBIF, NCBI, or other databases | -| `eco:higherClassification` | Full taxonomic hierarchy | - -### Geographic Properties -| Property | Description | -|----------|-------------| -| `eco:habitat` | Habitat type(s) | -| `eco:biome` | Major biome classification | -| `eco:locality` | Location description | -| `eco:protectedArea` | Protected areas where species occurs | - -### Conservation Properties -| Property | Description | -|----------|-------------| -| `eco:iucnStatus` | IUCN Red List category | -| `eco:populationTrend` | Population trend direction | -| `eco:threats` | Known threats to species | - -### Data Quality Properties -| Property | Description | -|----------|-------------| -| `eco:basisOfRecord` | Type of record (specimen, observation, etc.) | -| `eco:identificationVerificationStatus` | Verification level | -| `eco:samplingProtocol` | Data collection method | - -See the [full specification](docs/eco-spec.md) for all available properties. 
+### Darwin Core Terms (Used Directly) + +ecoCroissant uses Darwin Core terms without redefinition: + +#### Taxonomic +`dwc:scientificName`, `dwc:taxonRank`, `dwc:kingdom`, `dwc:phylum`, `dwc:class`, `dwc:order`, `dwc:family`, `dwc:genus`, `dwc:taxonID`, `dwc:higherClassification`, `dwc:vernacularName` + +#### Geographic +`dwc:locality`, `dwc:habitat`, `dwc:continent`, `dwc:country`, `dwc:decimalLatitude`, `dwc:decimalLongitude`, `dwc:coordinateUncertaintyInMeters`, `dwc:minimumElevationInMeters`, `dwc:maximumElevationInMeters` + +#### Temporal +`dwc:eventDate`, `dwc:year`, `dwc:month`, `dwc:day`, `dwc:lifeStage` + +#### Data Quality +`dwc:basisOfRecord`, `dwc:identifiedBy`, `dwc:identificationVerificationStatus`, `dwc:samplingProtocol`, `dwc:dataGeneralizations`, `dwc:informationWithheld` + +### AI-Specific ecoCroissant Extensions + +#### Data Distribution & Preprocessing +`eco:dataDistribution`, `eco:preprocessingSteps`, `eco:standardizationMethod`, `eco:trainTestSplit`, `eco:stratificationVariable`, `eco:dataSplitRationale` + +#### Model Provenance +`eco:generatedBy`, `eco:modelConfidence`, `eco:humanVerified`, `eco:generationMethod` + +#### API & Streaming +`eco:apiEndpoint`, `eco:rateLimitRequests`, `eco:rateLimitPeriod`, `eco:streamingSupported`, `eco:bulkDownloadSize` + +#### Record Type Context +`eco:recordType`, `eco:occurrenceToImageRatio`, `eco:imageAnnotationType` + +#### Ecological Extensions +`eco:biome`, `eco:trophicLevel`, `eco:ecologicalRole`, `eco:speciesInteractions`, `eco:diet` + +#### Conservation Extensions +`eco:iucnStatus`, `eco:populationTrend`, `eco:threats`, `eco:protectedArea` + +See the [full specification](docs/eco-spec.md) for complete property definitions. 
## Integration with Standards diff --git a/docs/eco-spec.md b/docs/eco-spec.md index 1ef9317..238e904 100644 --- a/docs/eco-spec.md +++ b/docs/eco-spec.md @@ -8,17 +8,33 @@ Version 1.0 ## Introduction -ecoCroissant is an extension to the [Croissant format](http://mlcommons.org/croissant/1.0) designed to capture ecologically-relevant information from biodiversity datasets. It follows [FAIR4AI principles](https://www.nature.com/articles/s41597-022-01759-2) to ensure datasets are Findable, Accessible, Interoperable, and Reusable for AI/ML applications in ecological and biodiversity research. +ecoCroissant is an extension to the [Croissant format](http://mlcommons.org/croissant/1.0) designed to make biodiversity datasets **AI-ready** by integrating Darwin Core terms with FAIR4AI-specific requirements. Rather than redefining existing biodiversity standards, ecoCroissant builds upon [Darwin Core](https://dwc.tdwg.org/) terms and enhances them with AI-specific metadata needed for machine learning applications. -Biodiversity datasets contain unique characteristics that are not adequately captured by the base Croissant format, including: +### FAIR4AI Requirements -- **Taxonomic information**: Species identification, taxonomic hierarchy, and nomenclature -- **Geographic and temporal context**: Collection locations, habitats, and temporal coverage -- **Ecological relationships**: Trophic levels, species interactions, and ecological roles -- **Collection methodology**: Observation methods, specimen handling, and data quality indicators -- **Conservation context**: IUCN status, protected areas, and population data +ecoCroissant specifically addresses FAIR4AI requirements: -The ecoCroissant extension addresses these needs by providing a standardized vocabulary for documenting ecological and biodiversity metadata in ML-ready datasets. +1. **Queryable Metadata**: Data and metadata can be queried without downloading large files or specialized file types +2. 
**Ontology Integration**: Darwin Core terms are directly integrated, queryable with synonyms from other biodiversity ontologies (GBIF, NCBI, EOL) +3. **Content/Context Extraction**: Clear distinction between occurrence-based and image-based data records + +### AI-Ready Data Requirements + +ecoCroissant ensures datasets are **AI-ready** by including: + +- **Distribution Information**: Data splits, stratification details, and class distributions in usable form +- **Preprocessing Documentation**: Information about whether data has been processed or standardized and how +- **Model Provenance**: The generating model, generation method, confidence, and human-verification status of any AI-generated annotations or classifications +- **Rate Limiting**: API endpoints and request rate limits needed to stream data from source servers + +### Extension Scope + +ecoCroissant extends Croissant by: + +- **Direct Darwin Core Integration**: Using Darwin Core terms natively rather than redefining them +- **AI-Specific Metadata**: Adding properties for model provenance, data splits, and preprocessing pipelines +- **Ecological Context**: Properties for ecological relationships and conservation status not in Darwin Core +- **Image-Specific Metadata**: Properties for anatomical features, view angles, and image types relevant to biodiversity ML ## Prerequisites @@ -111,57 +127,114 @@ Ensuring data reliability for ML applications: - **Georeferencing quality**: GPS accuracy, geocoding method - **Temporal precision**: Exact date vs. date range -## ecoCroissant Properties +## Properties + +ecoCroissant uses Darwin Core terms directly where applicable and adds new properties only when needed for AI-specific requirements or ecological concepts not covered by Darwin Core. 
+ +### Darwin Core Properties (Used Directly) -### Taxonomic Properties +The following Darwin Core terms are used directly without redefinition: +#### Taxonomic Terms | Property | Expected Type | Cardinality | Description | |----------|---------------|-------------|-------------| -| eco:taxon | sc:Taxon or sc:Text | MANY | The taxonomic name(s) of organisms in the dataset | -| eco:taxonRank | sc:Text | ONE | The taxonomic rank (e.g., species, genus, family) | -| eco:scientificName | sc:Text | ONE | The full scientific name including authorship | -| eco:taxonID | sc:URL | MANY | Identifier(s) from taxonomic databases (GBIF, NCBI, etc.) | -| eco:higherClassification | sc:Text | ONE | Full taxonomic hierarchy (Kingdom > Phylum > Class > Order > Family > Genus > Species) | -| eco:vernacularName | sc:Text | MANY | Common name(s) in various languages | -| eco:taxonomicStatus | sc:Text | ONE | Status of the taxon name (accepted, synonym, etc.) | - -### Geographic Properties +| dwc:scientificName | sc:Text | ONE | The full scientific name including authorship | +| dwc:taxonRank | sc:Text | ONE | The taxonomic rank (e.g., species, genus, family) | +| dwc:kingdom | sc:Text | ONE | Taxonomic kingdom | +| dwc:phylum | sc:Text | ONE | Taxonomic phylum | +| dwc:class | sc:Text | ONE | Taxonomic class | +| dwc:order | sc:Text | ONE | Taxonomic order | +| dwc:family | sc:Text | ONE | Taxonomic family | +| dwc:genus | sc:Text | ONE | Taxonomic genus | +| dwc:higherClassification | sc:Text | ONE | Full taxonomic hierarchy | +| dwc:vernacularName | sc:Text | MANY | Common name(s) in various languages | +| dwc:taxonomicStatus | sc:Text | ONE | Status of the taxon name (accepted, synonym, etc.) | +| dwc:taxonID | sc:URL | MANY | Identifier from taxonomic databases (GBIF, NCBI, etc.) 
| + +#### Geographic Terms +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| dwc:locality | sc:Text | ONE | Description of the location | +| dwc:habitat | sc:Text | MANY | Habitat type(s) where organisms occur | +| dwc:continent | sc:Text | ONE | Continent of occurrence | +| dwc:country | sc:Text | MANY | Country/countries of occurrence | +| dwc:coordinateUncertaintyInMeters | sc:Number | ONE | Uncertainty radius for coordinates | +| dwc:minimumElevationInMeters | sc:Number | ONE | Minimum elevation of occurrences | +| dwc:maximumElevationInMeters | sc:Number | ONE | Maximum elevation of occurrences | +| dwc:minimumDepthInMeters | sc:Number | ONE | Minimum depth (for aquatic organisms) | +| dwc:maximumDepthInMeters | sc:Number | ONE | Maximum depth (for aquatic organisms) | +| dwc:decimalLatitude | sc:Float | ONE | Latitude in decimal degrees | +| dwc:decimalLongitude | sc:Float | ONE | Longitude in decimal degrees | +| dwc:geodeticDatum | sc:Text | ONE | Spatial reference system (e.g., WGS84) | + +#### Temporal Terms +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| dwc:eventDate | sc:Date or sc:DateTime | MANY | Date(s) when data was collected | +| dwc:year | sc:Integer | ONE | Year of collection | +| dwc:month | sc:Integer | ONE | Month of collection | +| dwc:day | sc:Integer | ONE | Day of collection | +| dwc:lifeStage | sc:Text | MANY | Life stage(s) represented (egg, larva, adult, etc.) 
| +#### Data Quality Terms | Property | Expected Type | Cardinality | Description | |----------|---------------|-------------|-------------| -| eco:locality | sc:Text | ONE | Description of the location | -| eco:habitat | sc:Text | MANY | Habitat type(s) where organisms occur | -| eco:biome | sc:Text | ONE | Major biome classification | -| eco:continent | sc:Text | ONE | Continent of occurrence | -| eco:country | sc:Text | MANY | Country/countries of occurrence | -| eco:coordinateUncertaintyInMeters | sc:Number | ONE | Uncertainty radius for coordinates | -| eco:minimumElevationInMeters | sc:Number | ONE | Minimum elevation of occurrences | -| eco:maximumElevationInMeters | sc:Number | ONE | Maximum elevation of occurrences | -| eco:minimumDepthInMeters | sc:Number | ONE | Minimum depth (for aquatic organisms) | -| eco:maximumDepthInMeters | sc:Number | ONE | Maximum depth (for aquatic organisms) | +| dwc:identificationVerificationStatus | sc:Text | ONE | Verification level of taxonomic identifications | +| dwc:identifiedBy | sc:Text | MANY | Who identified the specimens/observations | +| dwc:samplingProtocol | sc:Text | ONE | Method used to collect data | +| dwc:dataGeneralizations | sc:Text | ONE | Any data generalizations applied (e.g., coordinate obscuring) | +| dwc:informationWithheld | sc:Text | ONE | Information intentionally withheld (e.g., for endangered species) | +| dwc:basisOfRecord | sc:Text | ONE | Type of record (PreservedSpecimen, HumanObservation, MachineObservation, etc.) 
| +| dwc:occurrenceStatus | sc:Text | ONE | Whether organism was present or absent | -### Temporal Properties +### AI-Specific Properties (ecoCroissant Extensions) +These properties are ecoCroissant additions for AI-ready data requirements: + +#### Data Distribution and Preprocessing | Property | Expected Type | Cardinality | Description | |----------|---------------|-------------|-------------| -| eco:eventDate | sc:Date or sc:DateTime | MANY | Date(s) when data was collected | -| eco:eventDateStart | sc:Date | ONE | Start of collection period | -| eco:eventDateEnd | sc:Date | ONE | End of collection period | -| eco:seasonality | sc:Text | MANY | Seasonal patterns in the data | -| eco:lifeStage | sc:Text | MANY | Life stage(s) represented (egg, larva, adult, etc.) | +| eco:dataDistribution | sc:Text | ONE | Description of class distribution (e.g., "long-tailed", "balanced", stratification details) | +| eco:preprocessingSteps | sc:Text | MANY | List of preprocessing steps applied (e.g., "resized to 224x224", "normalized to [-1,1]") | +| eco:standardizationMethod | sc:Text | ONE | Method used for data standardization if applicable | +| eco:trainTestSplit | sc:Text | ONE | Description of train/test/validation splits with proportions | +| eco:stratificationVariable | sc:Text | MANY | Variables used for stratification (e.g., "taxonomic family", "geographic region") | +| eco:dataSplitRationale | sc:Text | ONE | Rationale for data splitting strategy | + +#### Model Provenance for AI-Generated Data +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| eco:generatedBy | sc:Text | ONE | Name/version of model that generated annotations or classifications | +| eco:modelConfidence | sc:Float | ONE | Confidence score for AI-generated labels (0-1) | +| eco:humanVerified | sc:Boolean | ONE | Whether AI-generated data has been human-verified | +| eco:generationMethod | sc:Text | ONE | Method used for generation 
(e.g., "automated classification", "bounding box detection") | + +#### API and Streaming Information +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| eco:apiEndpoint | sc:URL | ONE | API endpoint for streaming data access | +| eco:rateLimitRequests | sc:Integer | ONE | Maximum number of requests allowed per rate-limit period (see eco:rateLimitPeriod) | +| eco:rateLimitPeriod | sc:Text | ONE | Time period for rate limit (e.g., "per minute", "per hour") | +| eco:streamingSupported | sc:Boolean | ONE | Whether data can be streamed rather than downloaded | +| eco:bulkDownloadSize | sc:Text | ONE | Approximate size of full dataset download | -### Ecological Properties +#### Record Type Context (FAIR4AI Requirement) +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| eco:recordType | sc:Text | ONE | Type of data record: one of "occurrence-based", "image-based", or "mixed" | +| eco:occurrenceToImageRatio | sc:Float | ONE | Ratio of occurrence records to images (relevant for mixed datasets) | +| eco:imageAnnotationType | sc:Text | MANY | Type(s) of image annotations (e.g., "bounding box", "segmentation", "whole image classification") | +#### Ecological Extensions (Not in Darwin Core) | Property | Expected Type | Cardinality | Description | |----------|---------------|-------------|-------------| +| eco:biome | sc:Text | ONE | Major biome classification | | eco:trophicLevel | sc:Text | ONE | Position in food chain (producer, primary consumer, etc.) | | eco:ecologicalRole | sc:Text | MANY | Ecological function (pollinator, predator, decomposer, etc.) 
| | eco:speciesInteractions | sc:Text | MANY | Description of species interactions in the dataset | | eco:diet | sc:Text | MANY | Diet composition for animals | -| eco:hostOrganism | sc:Text | MANY | Host species (for parasites, symbionts) | - -### Conservation Properties +#### Conservation Extensions | Property | Expected Type | Cardinality | Description | |----------|---------------|-------------|-------------| | eco:iucnStatus | sc:Text | ONE | IUCN Red List category (LC, NT, VU, EN, CR, EW, EX) | @@ -171,19 +244,7 @@ Ensuring data reliability for ML applications: | eco:conservationActions | sc:Text | MANY | Conservation actions in place or recommended | | eco:protectedArea | sc:Text | MANY | Protected areas where species occurs | -### Data Quality Properties - -| Property | Expected Type | Cardinality | Description | -|----------|---------------|-------------|-------------| -| eco:identificationVerificationStatus | sc:Text | ONE | Verification level of taxonomic identifications | -| eco:identifiedBy | sc:Text | MANY | Who identified the specimens/observations | -| eco:samplingProtocol | sc:Text | ONE | Method used to collect data | -| eco:dataGeneralizations | sc:Text | ONE | Any data generalizations applied (e.g., coordinate obscuring) | -| eco:informationWithheld | sc:Text | ONE | Information intentionally withheld (e.g., for endangered species) | -| eco:basisOfRecord | sc:Text | ONE | Type of record (PreservedSpecimen, HumanObservation, MachineObservation, etc.) | - -### Image and Observation Properties - +#### Image-Specific Extensions | Property | Expected Type | Cardinality | Description | |----------|---------------|-------------|-------------| | eco:imageLicense | sc:URL | ONE | License for images in the dataset | @@ -191,10 +252,12 @@ Ensuring data reliability for ML applications: | eco:viewAngle | sc:Text | MANY | View angle of specimens in images (dorsal, ventral, lateral, etc.) 
| | eco:anatomicalFeatures | sc:Text | MANY | Anatomical features visible or annotated | | eco:phenotype | sc:Text | MANY | Observable phenotypic characteristics | +| eco:imageResolution | sc:Text | ONE | Resolution of images (e.g., "1024x1024", "variable") | +| eco:imageFormat | sc:Text | MANY | Image file formats (e.g., "JPEG", "PNG", "TIFF") | ## JSON-LD Context -The recommended JSON-LD context for ecoCroissant: +The recommended JSON-LD context for ecoCroissant uses Darwin Core terms directly: ```json { @@ -207,35 +270,73 @@ The recommended JSON-LD context for ecoCroissant: "dwc": "http://rs.tdwg.org/dwc/terms/", "dct": "http://purl.org/dc/terms/", - "taxon": "eco:taxon", - "taxonRank": "eco:taxonRank", - "scientificName": "eco:scientificName", - "taxonID": "eco:taxonID", - "higherClassification": "eco:higherClassification", - "vernacularName": "eco:vernacularName", - "taxonomicStatus": "eco:taxonomicStatus", + "scientificName": "dwc:scientificName", + "taxonRank": "dwc:taxonRank", + "kingdom": "dwc:kingdom", + "phylum": "dwc:phylum", + "class": "dwc:class", + "order": "dwc:order", + "family": "dwc:family", + "genus": "dwc:genus", + "taxonID": "dwc:taxonID", + "higherClassification": "dwc:higherClassification", + "vernacularName": "dwc:vernacularName", + "taxonomicStatus": "dwc:taxonomicStatus", - "locality": "eco:locality", - "habitat": "eco:habitat", - "biome": "eco:biome", - "continent": "eco:continent", - "coordinateUncertaintyInMeters": "eco:coordinateUncertaintyInMeters", - "minimumElevationInMeters": "eco:minimumElevationInMeters", - "maximumElevationInMeters": "eco:maximumElevationInMeters", - "minimumDepthInMeters": "eco:minimumDepthInMeters", - "maximumDepthInMeters": "eco:maximumDepthInMeters", + "locality": "dwc:locality", + "habitat": "dwc:habitat", + "continent": "dwc:continent", + "country": "dwc:country", + "decimalLatitude": "dwc:decimalLatitude", + "decimalLongitude": "dwc:decimalLongitude", + "coordinateUncertaintyInMeters": 
"dwc:coordinateUncertaintyInMeters", + "minimumElevationInMeters": "dwc:minimumElevationInMeters", + "maximumElevationInMeters": "dwc:maximumElevationInMeters", + "minimumDepthInMeters": "dwc:minimumDepthInMeters", + "maximumDepthInMeters": "dwc:maximumDepthInMeters", + "geodeticDatum": "dwc:geodeticDatum", + + "eventDate": "dwc:eventDate", + "year": "dwc:year", + "month": "dwc:month", + "day": "dwc:day", + "lifeStage": "dwc:lifeStage", - "eventDate": "eco:eventDate", - "eventDateStart": "eco:eventDateStart", - "eventDateEnd": "eco:eventDateEnd", - "seasonality": "eco:seasonality", - "lifeStage": "eco:lifeStage", + "identificationVerificationStatus": "dwc:identificationVerificationStatus", + "identifiedBy": "dwc:identifiedBy", + "samplingProtocol": "dwc:samplingProtocol", + "dataGeneralizations": "dwc:dataGeneralizations", + "informationWithheld": "dwc:informationWithheld", + "basisOfRecord": "dwc:basisOfRecord", + "occurrenceStatus": "dwc:occurrenceStatus", + "dataDistribution": "eco:dataDistribution", + "preprocessingSteps": "eco:preprocessingSteps", + "standardizationMethod": "eco:standardizationMethod", + "trainTestSplit": "eco:trainTestSplit", + "stratificationVariable": "eco:stratificationVariable", + "dataSplitRationale": "eco:dataSplitRationale", + + "generatedBy": "eco:generatedBy", + "modelConfidence": "eco:modelConfidence", + "humanVerified": "eco:humanVerified", + "generationMethod": "eco:generationMethod", + + "apiEndpoint": "eco:apiEndpoint", + "rateLimitRequests": "eco:rateLimitRequests", + "rateLimitPeriod": "eco:rateLimitPeriod", + "streamingSupported": "eco:streamingSupported", + "bulkDownloadSize": "eco:bulkDownloadSize", + + "recordType": "eco:recordType", + "occurrenceToImageRatio": "eco:occurrenceToImageRatio", + "imageAnnotationType": "eco:imageAnnotationType", + + "biome": "eco:biome", "trophicLevel": "eco:trophicLevel", "ecologicalRole": "eco:ecologicalRole", "speciesInteractions": "eco:speciesInteractions", "diet": "eco:diet", - 
"hostOrganism": "eco:hostOrganism", "iucnStatus": "eco:iucnStatus", "iucnStatusSource": "eco:iucnStatusSource", @@ -244,25 +345,57 @@ The recommended JSON-LD context for ecoCroissant: "conservationActions": "eco:conservationActions", "protectedArea": "eco:protectedArea", - "identificationVerificationStatus": "eco:identificationVerificationStatus", - "identifiedBy": "eco:identifiedBy", - "samplingProtocol": "eco:samplingProtocol", - "dataGeneralizations": "eco:dataGeneralizations", - "informationWithheld": "eco:informationWithheld", - "basisOfRecord": "eco:basisOfRecord", - "imageLicense": "eco:imageLicense", "imageType": "eco:imageType", "viewAngle": "eco:viewAngle", "anatomicalFeatures": "eco:anatomicalFeatures", - "phenotype": "eco:phenotype" + "phenotype": "eco:phenotype", + "imageResolution": "eco:imageResolution", + "imageFormat": "eco:imageFormat", + + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform" } } ``` ## Examples -### Example 1: Species Image Dataset (TreeOfLife-200M style) +### Example 1: AI-Ready Species Image Dataset (TreeOfLife-200M style) ```json { @@ -271,11 +404,12 @@ The recommended JSON-LD context for ecoCroissant: "@vocab": 
"https://schema.org/", "cr": "http://mlcommons.org/croissant/", "eco": "http://imageomics.org/ecoCroissant/", + "dwc": "http://rs.tdwg.org/dwc/terms/", "dct": "http://purl.org/dc/terms/" }, "@type": "sc:Dataset", "name": "TreeOfLife-200M", - "description": "A large-scale dataset of 200 million images spanning the tree of life, designed for training species identification models.", + "description": "AI-ready dataset of 200M images spanning the tree of life with stratified splits for species identification. Images from iNaturalist Research Grade observations.", "license": "https://creativecommons.org/licenses/by-nc-sa/4.0/", "url": "https://huggingface.co/datasets/imageomics/TreeOfLife-200M", "dct:conformsTo": [ @@ -283,25 +417,38 @@ The recommended JSON-LD context for ecoCroissant: "http://imageomics.org/ecoCroissant/1.0" ], - "eco:taxon": ["Animalia", "Plantae", "Fungi"], - "eco:taxonRank": "kingdom", - "eco:higherClassification": "Life > Eukaryota > Multiple Kingdoms", + "dwc:basisOfRecord": ["HumanObservation"], + "dwc:identificationVerificationStatus": "Research Grade (2/3+ community agreement)", + "dwc:samplingProtocol": "Community science observations via iNaturalist platform", + "dwc:dataGeneralizations": "Coordinates obscured for sensitive species per observer privacy settings", - "eco:habitat": ["terrestrial", "freshwater", "marine"], - "eco:continent": ["Africa", "Antarctica", "Asia", "Europe", "North America", "Oceania", "South America"], + "eco:recordType": "image-based", + "eco:dataDistribution": "long-tailed: 500K species with 1-10,000 images each, stratified by taxonomic family", + "eco:preprocessingSteps": ["resized to 224x224", "normalized to ImageNet stats", "augmented with random crops and flips"], + "eco:standardizationMethod": "ImageNet normalization (mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])", + "eco:trainTestSplit": "80% train, 10% validation, 10% test - stratified by species", + "eco:stratificationVariable": ["dwc:family", 
"dwc:genus"], + "eco:dataSplitRationale": "Stratified to ensure representation across taxonomic groups; temporal split avoided due to seasonal biases", - "eco:basisOfRecord": ["HumanObservation", "PreservedSpecimen", "MachineObservation"], - "eco:imageType": ["photograph", "museum specimen"], - "eco:identificationVerificationStatus": "mixed - includes expert-verified and community-validated observations", + "eco:apiEndpoint": "https://huggingface.co/api/datasets/imageomics/TreeOfLife-200M", + "eco:streamingSupported": true, + "eco:bulkDownloadSize": "~15TB uncompressed", + "eco:rateLimitRequests": 1000, + "eco:rateLimitPeriod": "per hour", - "eco:samplingProtocol": "Images collected from multiple sources including iNaturalist, museum collections, and research projects", + "eco:imageType": ["photograph"], + "eco:imageResolution": "variable - minimum 224x224, maximum 4096x4096", + "eco:imageFormat": ["JPEG"], + "eco:imageLicense": "https://creativecommons.org/licenses/by-nc/4.0/", "distribution": [ { "@type": "cr:FileObject", - "@id": "images.tar.gz", - "contentUrl": "https://huggingface.co/datasets/imageomics/TreeOfLife-200M/resolve/main/images.tar.gz", - "encodingFormat": "application/gzip" + "@id": "metadata.parquet", + "name": "metadata.parquet", + "description": "Queryable metadata without downloading images", + "contentUrl": "https://huggingface.co/datasets/imageomics/TreeOfLife-200M/resolve/main/metadata.parquet", + "encodingFormat": "application/x-parquet" } ], @@ -312,25 +459,35 @@ The recommended JSON-LD context for ecoCroissant: "field": [ { "@type": "cr:Field", - "@id": "species_images/image", - "dataType": "sc:ImageObject" + "@id": "species_images/image_id", + "dataType": "sc:Text" }, { "@type": "cr:Field", - "@id": "species_images/scientific_name", - "description": "Scientific name of the species", - "dataType": "sc:Text" + "@id": "species_images/scientificName", + "description": "Darwin Core scientific name", + "dataType": "dwc:scientificName" }, { 
"@type": "cr:Field", - "@id": "species_images/taxon_id", - "description": "GBIF taxon identifier", - "dataType": "sc:URL" + "@id": "species_images/taxonID", + "description": "iNaturalist taxon identifier linking to GBIF", + "dataType": "dwc:taxonID" }, { "@type": "cr:Field", "@id": "species_images/kingdom", - "description": "Taxonomic kingdom", + "dataType": "dwc:kingdom" + }, + { + "@type": "cr:Field", + "@id": "species_images/family", + "dataType": "dwc:family" + }, + { + "@type": "cr:Field", + "@id": "species_images/split", + "description": "train/val/test split assignment", "dataType": "sc:Text" } ] @@ -339,7 +496,7 @@ The recommended JSON-LD context for ecoCroissant: } ``` -### Example 2: Butterfly Specimen Dataset +### Example 2: Butterfly Specimen Dataset with AI-Generated Annotations ```json { @@ -348,50 +505,64 @@ The recommended JSON-LD context for ecoCroissant: "@vocab": "https://schema.org/", "cr": "http://mlcommons.org/croissant/", "eco": "http://imageomics.org/ecoCroissant/", + "dwc": "http://rs.tdwg.org/dwc/terms/", "dct": "http://purl.org/dc/terms/" }, "@type": "sc:Dataset", "name": "Heliconius Butterfly Wing Pattern Dataset", - "description": "High-resolution images of Heliconius butterfly specimens with wing pattern annotations for studying mimicry and adaptation.", + "description": "AI-ready dataset of museum specimens with standardized imaging and AI-assisted wing pattern segmentations.", "license": "https://creativecommons.org/licenses/by/4.0/", "dct:conformsTo": [ "http://mlcommons.org/croissant/1.0", "http://imageomics.org/ecoCroissant/1.0" ], - "eco:taxon": "Heliconius", - "eco:taxonRank": "genus", - "eco:scientificName": "Heliconius Kluk, 1780", - "eco:taxonID": ["https://www.gbif.org/species/1932585"], - "eco:higherClassification": "Animalia > Arthropoda > Insecta > Lepidoptera > Nymphalidae > Heliconiinae > Heliconius", - "eco:vernacularName": ["Longwing butterflies", "Heliconius butterflies"], + "dwc:scientificName": "Heliconius Kluk, 
1780", + "dwc:taxonRank": "genus", + "dwc:taxonID": "https://www.gbif.org/species/1932585", + "dwc:higherClassification": "Animalia > Arthropoda > Insecta > Lepidoptera > Nymphalidae > Heliconiinae > Heliconius", + "dwc:vernacularName": ["Longwing butterflies"], + "dwc:kingdom": "Animalia", + "dwc:class": "Insecta", + "dwc:order": "Lepidoptera", + "dwc:family": "Nymphalidae", + "dwc:genus": "Heliconius", - "eco:habitat": ["tropical rainforest", "forest edge", "secondary forest"], - "eco:biome": "tropical moist broadleaf forest", - "eco:continent": ["South America", "Central America"], - "eco:country": ["Ecuador", "Peru", "Colombia", "Panama", "Costa Rica"], - "eco:minimumElevationInMeters": 0, - "eco:maximumElevationInMeters": 2000, + "dwc:habitat": ["tropical rainforest", "forest edge", "secondary forest"], + "dwc:continent": ["South America", "Central America"], + "dwc:country": ["Ecuador", "Peru", "Colombia", "Panama", "Costa Rica"], + "dwc:minimumElevationInMeters": 0, + "dwc:maximumElevationInMeters": 2000, + + "dwc:lifeStage": ["adult"], + "dwc:basisOfRecord": "PreservedSpecimen", + "dwc:identificationVerificationStatus": "expert-verified", + "dwc:identifiedBy": ["Museum taxonomists", "Heliconius specialists"], + "dwc:samplingProtocol": "Museum specimens imaged with standardized dorsal and ventral views at 300 DPI", - "eco:lifeStage": ["adult"], + "eco:biome": "tropical moist broadleaf forest", "eco:trophicLevel": "primary consumer", "eco:ecologicalRole": ["pollinator", "Müllerian mimic"], "eco:diet": ["pollen", "nectar"], - "eco:hostOrganism": ["Passiflora (host plant for larvae)"], - "eco:speciesInteractions": "Müllerian mimicry complex with other Heliconius species; larvae feed exclusively on Passiflora plants", + "eco:speciesInteractions": "Müllerian mimicry complex; larvae on Passiflora", "eco:iucnStatus": "LC", "eco:populationTrend": "stable", - "eco:basisOfRecord": "PreservedSpecimen", - "eco:identificationVerificationStatus": "expert-verified", - 
"eco:identifiedBy": ["Museum taxonomists", "Heliconius specialists"], - "eco:samplingProtocol": "Museum specimens imaged with standardized dorsal and ventral views", - + "eco:recordType": "image-based", + "eco:preprocessingSteps": ["white background removal", "standardized to 1024x1024", "color-corrected"], "eco:imageType": ["museum specimen photograph"], + "eco:imageResolution": "1024x1024", + "eco:imageFormat": ["TIFF", "JPEG"], "eco:viewAngle": ["dorsal", "ventral"], "eco:anatomicalFeatures": ["forewing", "hindwing", "wing pattern"], - "eco:phenotype": "wing color pattern" + "eco:phenotype": "wing color pattern", + "eco:imageAnnotationType": ["segmentation"], + + "eco:generatedBy": "Mask R-CNN v2.1 trained on 5K hand-annotated specimens", + "eco:modelConfidence": 0.92, + "eco:humanVerified": true, + "eco:generationMethod": "automated wing boundary segmentation with manual correction" } ``` @@ -446,26 +617,29 @@ The recommended JSON-LD context for ecoCroissant: } ``` -## Alignment with Darwin Core - -ecoCroissant properties are designed to be compatible with [Darwin Core](https://dwc.tdwg.org/) terms where applicable. The following table shows the mapping: - -| ecoCroissant Property | Darwin Core Term | -|----------------------|------------------| -| eco:taxon | dwc:scientificName | -| eco:taxonRank | dwc:taxonRank | -| eco:higherClassification | dwc:higherClassification | -| eco:vernacularName | dwc:vernacularName | -| eco:locality | dwc:locality | -| eco:habitat | dwc:habitat | -| eco:continent | dwc:continent | -| eco:country | dwc:country | -| eco:coordinateUncertaintyInMeters | dwc:coordinateUncertaintyInMeters | -| eco:eventDate | dwc:eventDate | -| eco:lifeStage | dwc:lifeStage | -| eco:basisOfRecord | dwc:basisOfRecord | -| eco:identifiedBy | dwc:identifiedBy | -| eco:samplingProtocol | dwc:samplingProtocol | +## Darwin Core Integration + +ecoCroissant uses Darwin Core terms directly as part of its vocabulary. 
There is no separate "ecoCroissant version" of Darwin Core terms — the standard Darwin Core terms are used natively through the `dwc:` namespace. + +### Direct Darwin Core Usage + +All Darwin Core terms are available for use in ecoCroissant datasets. The most commonly used terms include: + +- **Taxonomic**: `dwc:scientificName`, `dwc:taxonRank`, `dwc:kingdom`, `dwc:phylum`, `dwc:class`, `dwc:order`, `dwc:family`, `dwc:genus`, `dwc:higherClassification`, `dwc:taxonID`, `dwc:vernacularName`, `dwc:taxonomicStatus` +- **Geographic**: `dwc:locality`, `dwc:habitat`, `dwc:continent`, `dwc:country`, `dwc:decimalLatitude`, `dwc:decimalLongitude`, `dwc:coordinateUncertaintyInMeters`, `dwc:geodeticDatum`, elevation and depth terms +- **Temporal**: `dwc:eventDate`, `dwc:year`, `dwc:month`, `dwc:day`, `dwc:lifeStage` +- **Data Quality**: `dwc:basisOfRecord`, `dwc:identifiedBy`, `dwc:identificationVerificationStatus`, `dwc:samplingProtocol`, `dwc:dataGeneralizations`, `dwc:informationWithheld`, `dwc:occurrenceStatus` + +### Queryability with Ontology Synonyms + +Darwin Core terms in ecoCroissant datasets can be queried using synonyms from other biodiversity ontologies: + +- **GBIF Backbone Taxonomy**: `dwc:taxonID` can link to GBIF species pages +- **NCBI Taxonomy**: records can be cross-referenced via taxon identifiers +- **Encyclopedia of Life (EOL)**: species concepts can be linked across systems +- **Integrated Taxonomic Information System (ITIS)**: provides standard taxonomic references + +This satisfies the FAIR4AI requirement that "ontology used can be queried with synonyms from other ontologies." 
## Integration with FAIR4AI Principles diff --git a/examples/treeoflife-200m.json b/examples/treeoflife-200m.json index 54bde31..89ff321 100644 --- a/examples/treeoflife-200m.json +++ b/examples/treeoflife-200m.json @@ -5,11 +5,12 @@ "sc": "https://schema.org/", "cr": "http://mlcommons.org/croissant/", "eco": "http://imageomics.org/ecoCroissant/", + "dwc": "http://rs.tdwg.org/dwc/terms/", "dct": "http://purl.org/dc/terms/" }, "@type": "sc:Dataset", "name": "TreeOfLife-200M", - "description": "Tree of Life 200M (ToL-200M) is a comprehensive phylogenetically-guided image dataset containing approximately 200 million images from iNaturalist, spanning nearly all known species across the tree of life. The dataset is designed for training and evaluating computer vision models for species identification and biodiversity research.", + "description": "AI-ready dataset of 200M images from iNaturalist Research Grade observations spanning the tree of life. Includes stratified train/val/test splits, queryable metadata, and preprocessing documentation for species identification ML models.", "license": "https://creativecommons.org/licenses/by-nc-sa/4.0/", "url": "https://huggingface.co/datasets/imageomics/TreeOfLife-200M", "creator": { @@ -25,257 +26,171 @@ "iNaturalist", "phylogeny", "tree of life", - "image classification" + "image classification", + "AI-ready", + "FAIR4AI" ], "dct:conformsTo": [ "http://mlcommons.org/croissant/1.0", "http://imageomics.org/ecoCroissant/1.0" ], - "eco:taxon": [ - "Animalia", - "Plantae", - "Fungi", - "Chromista", - "Protozoa", - "Bacteria", - "Archaea" - ], - "eco:taxonRank": "kingdom", - "eco:higherClassification": "All domains of life represented", - "eco:taxonomicStatus": "accepted names from iNaturalist taxonomy", - - "eco:habitat": [ - "terrestrial", - "freshwater", - "marine", - "urban", - "agricultural", - "forest", - "grassland", - "desert", - "wetland", - "alpine" - ], - "eco:biome": "all major biomes represented", - "eco:continent": [ - 
"Africa", - "Antarctica", - "Asia", - "Europe", - "North America", - "Oceania", - "South America" - ], + "dwc:basisOfRecord": "HumanObservation", + "dwc:identificationVerificationStatus": "Research Grade (2/3+ community agreement)", + "dwc:samplingProtocol": "Community science observations via iNaturalist platform, 2008-2023", + "dwc:dataGeneralizations": "Coordinates obscured for sensitive/threatened species per observer privacy settings", - "eco:basisOfRecord": [ - "HumanObservation" + "eco:recordType": "image-based", + "eco:dataDistribution": "Long-tailed distribution: ~500K species with 1-10,000 images per species. Top 1% of species account for 30% of images. Stratified by taxonomic family to ensure representation.", + "eco:preprocessingSteps": [ + "Resized to 224x224 pixels (aspect ratio preserved, zero-padded)", + "Normalized using ImageNet statistics (mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])", + "Training augmentation: random horizontal flip, random crop, color jitter" ], - "eco:imageType": [ - "photograph" - ], - "eco:identificationVerificationStatus": "Research Grade observations from iNaturalist (community consensus with at least 2/3 agreement)", - "eco:identifiedBy": [ - "iNaturalist community", - "Subject matter experts" - ], - "eco:samplingProtocol": "Community science observations from iNaturalist platform", - "eco:dataGeneralizations": "Coordinates obscured for sensitive species (geoprivacy settings applied by observers)", + "eco:standardizationMethod": "ImageNet normalization applied to all images for transfer learning compatibility", + "eco:trainTestSplit": "80% train (160M images), 10% validation (20M images), 10% test (20M images)", + "eco:stratificationVariable": ["dwc:family", "dwc:genus"], + "eco:dataSplitRationale": "Stratified by taxonomic family to ensure phylogenetic diversity across splits. Species with <10 images excluded from test set to avoid data leakage. 
Temporal split avoided due to seasonal observation biases.", - "eco:lifeStage": [ - "egg", - "larva", - "juvenile", - "adult", - "various" - ], + "eco:apiEndpoint": "https://huggingface.co/api/datasets/imageomics/TreeOfLife-200M", + "eco:streamingSupported": true, + "eco:bulkDownloadSize": "~15TB uncompressed images + 5GB metadata", + "eco:rateLimitRequests": 1000, + "eco:rateLimitPeriod": "per hour", - "eco:trophicLevel": "multiple - dataset spans producers, consumers, and decomposers", - "eco:ecologicalRole": [ - "primary producer", - "herbivore", - "carnivore", - "omnivore", - "decomposer", - "pollinator", - "parasite", - "symbiont" - ], - - "eco:iucnStatus": "multiple - includes species across all IUCN categories", - "eco:populationTrend": "multiple - includes increasing, stable, and decreasing populations", - - "eco:imageLicense": "https://creativecommons.org/licenses/by-nc/4.0/", - "eco:viewAngle": [ - "various" - ], - "eco:anatomicalFeatures": [ - "whole organism", - "diagnostic features", - "various body parts" - ], - "eco:phenotype": "natural variation in appearance, coloration, and morphology", + "eco:imageType": "photograph", + "eco:imageResolution": "Variable source resolution (224x224 to 4096x4096), standardized to 224x224 for ML", + "eco:imageFormat": "JPEG", + "eco:imageLicense": "CC-BY-NC 4.0 (individual images may have more permissive licenses)", "distribution": [ { "@type": "cr:FileObject", "@id": "metadata.parquet", "name": "metadata.parquet", - "description": "Parquet file containing image metadata including species labels, taxonomic hierarchy, and observation details", + "description": "Queryable metadata file with Darwin Core terms. Can be queried without downloading images. 
Includes taxonomy, geography, temporal info, and split assignments.", "contentUrl": "https://huggingface.co/datasets/imageomics/TreeOfLife-200M/resolve/main/metadata.parquet", - "encodingFormat": "application/x-parquet" + "encodingFormat": "application/x-parquet", + "contentSize": "5GB" + }, + { + "@type": "cr:FileSet", + "@id": "train-images", + "name": "train-images", + "description": "Training split images", + "contentUrl": "https://huggingface.co/datasets/imageomics/TreeOfLife-200M/resolve/main/images/train/", + "encodingFormat": "image/jpeg" } ], "recordSet": [ { "@type": "cr:RecordSet", - "@id": "species_observations", + "@id": "observations", "name": "Species Observations", - "description": "Records of species observations with associated images and taxonomic metadata", + "description": "Image-based species observations with Darwin Core metadata", "field": [ { "@type": "cr:Field", - "@id": "species_observations/image_id", + "@id": "observations/image_id", "name": "image_id", "description": "Unique identifier for the image", "dataType": "sc:Text" }, { "@type": "cr:Field", - "@id": "species_observations/image_url", + "@id": "observations/image_url", "name": "image_url", "description": "URL to the image file", "dataType": "sc:URL" }, { "@type": "cr:Field", - "@id": "species_observations/scientific_name", - "name": "scientific_name", - "description": "Scientific name of the observed species", - "dataType": "sc:Text" + "@id": "observations/scientificName", + "name": "scientificName", + "description": "Darwin Core scientific name of the observed species", + "dataType": "sc:Text", + "source": { + "fileObject": {"@id": "metadata.parquet"}, + "extract": {"column": "scientificName"} + } }, { "@type": "cr:Field", - "@id": "species_observations/common_name", - "name": "common_name", - "description": "Common name of the observed species", - "dataType": "sc:Text" - }, - { - "@type": "cr:Field", - "@id": "species_observations/taxon_id", - "name": "taxon_id", - 
"description": "iNaturalist taxon identifier", - "dataType": "sc:Integer" + "@id": "observations/taxonID", + "name": "taxonID", + "description": "iNaturalist taxon identifier (links to GBIF)", + "dataType": "sc:Text", + "source": { + "fileObject": {"@id": "metadata.parquet"}, + "extract": {"column": "taxonID"} + } }, { "@type": "cr:Field", - "@id": "species_observations/kingdom", + "@id": "observations/kingdom", "name": "kingdom", - "description": "Taxonomic kingdom", - "dataType": "sc:Text" - }, - { - "@type": "cr:Field", - "@id": "species_observations/phylum", - "name": "phylum", - "description": "Taxonomic phylum", - "dataType": "sc:Text" + "description": "Taxonomic kingdom (Darwin Core)", + "dataType": "sc:Text", + "source": { + "fileObject": {"@id": "metadata.parquet"}, + "extract": {"column": "kingdom"} + } }, { "@type": "cr:Field", - "@id": "species_observations/class", - "name": "class", - "description": "Taxonomic class", - "dataType": "sc:Text" - }, - { - "@type": "cr:Field", - "@id": "species_observations/order", - "name": "order", - "description": "Taxonomic order", - "dataType": "sc:Text" - }, - { - "@type": "cr:Field", - "@id": "species_observations/family", + "@id": "observations/family", "name": "family", - "description": "Taxonomic family", - "dataType": "sc:Text" - }, - { - "@type": "cr:Field", - "@id": "species_observations/genus", - "name": "genus", - "description": "Taxonomic genus", - "dataType": "sc:Text" - }, - { - "@type": "cr:Field", - "@id": "species_observations/latitude", - "name": "latitude", - "description": "Latitude of observation (may be obscured for sensitive species)", - "dataType": "sc:Float" - }, - { - "@type": "cr:Field", - "@id": "species_observations/longitude", - "name": "longitude", - "description": "Longitude of observation (may be obscured for sensitive species)", - "dataType": "sc:Float" + "description": "Taxonomic family (Darwin Core)", + "dataType": "sc:Text", + "source": { + "fileObject": {"@id": "metadata.parquet"}, 
+ "extract": {"column": "family"} + } }, { "@type": "cr:Field", - "@id": "species_observations/observed_on", - "name": "observed_on", - "description": "Date when the observation was made", - "dataType": "sc:Date" + "@id": "observations/decimalLatitude", + "name": "decimalLatitude", + "description": "Latitude in decimal degrees (may be obscured for sensitive species)", + "dataType": "sc:Float", + "source": { + "fileObject": {"@id": "metadata.parquet"}, + "extract": {"column": "decimalLatitude"} + } }, { "@type": "cr:Field", - "@id": "species_observations/quality_grade", - "name": "quality_grade", - "description": "iNaturalist quality grade (research, needs_id, casual)", - "dataType": "sc:Text" - } - ] - }, - { - "@type": "cr:RecordSet", - "@id": "taxonomic_hierarchy", - "name": "Taxonomic Hierarchy", - "description": "Complete taxonomic classification for species in the dataset", - "dataType": "sc:Enumeration", - "key": {"@id": "taxonomic_hierarchy/taxon_id"}, - "field": [ - { - "@type": "cr:Field", - "@id": "taxonomic_hierarchy/taxon_id", - "name": "taxon_id", - "description": "Unique taxon identifier", - "dataType": "sc:Integer" + "@id": "observations/decimalLongitude", + "name": "decimalLongitude", + "description": "Longitude in decimal degrees (may be obscured for sensitive species)", + "dataType": "sc:Float", + "source": { + "fileObject": {"@id": "metadata.parquet"}, + "extract": {"column": "decimalLongitude"} + } }, { "@type": "cr:Field", - "@id": "taxonomic_hierarchy/scientific_name", - "name": "scientific_name", - "description": "Scientific name", - "dataType": "sc:Text" - }, - { - "@type": "cr:Field", - "@id": "taxonomic_hierarchy/rank", - "name": "rank", - "description": "Taxonomic rank", - "dataType": "sc:Text" + "@id": "observations/eventDate", + "name": "eventDate", + "description": "Date of observation (Darwin Core)", + "dataType": "sc:Date", + "source": { + "fileObject": {"@id": "metadata.parquet"}, + "extract": {"column": "eventDate"} + } }, { 
"@type": "cr:Field", - "@id": "taxonomic_hierarchy/parent_taxon_id", - "name": "parent_taxon_id", - "description": "Parent taxon identifier for hierarchical relationships", - "dataType": "sc:Integer" + "@id": "observations/split", + "name": "split", + "description": "Data split assignment: train, val, or test", + "dataType": "sc:Text", + "source": { + "fileObject": {"@id": "metadata.parquet"}, + "extract": {"column": "split"} + } } ] } diff --git a/schema/eco-context.jsonld b/schema/eco-context.jsonld index fe139a2..d2d6d21 100644 --- a/schema/eco-context.jsonld +++ b/schema/eco-context.jsonld @@ -8,36 +8,73 @@ "dwc": "http://rs.tdwg.org/dwc/terms/", "dct": "http://purl.org/dc/terms/", - "taxon": "eco:taxon", - "taxonRank": "eco:taxonRank", - "scientificName": "eco:scientificName", - "taxonID": "eco:taxonID", - "higherClassification": "eco:higherClassification", - "vernacularName": "eco:vernacularName", - "taxonomicStatus": "eco:taxonomicStatus", + "scientificName": "dwc:scientificName", + "taxonRank": "dwc:taxonRank", + "kingdom": "dwc:kingdom", + "phylum": "dwc:phylum", + "class": "dwc:class", + "order": "dwc:order", + "family": "dwc:family", + "genus": "dwc:genus", + "taxonID": "dwc:taxonID", + "higherClassification": "dwc:higherClassification", + "vernacularName": "dwc:vernacularName", + "taxonomicStatus": "dwc:taxonomicStatus", - "locality": "eco:locality", - "habitat": "eco:habitat", - "biome": "eco:biome", - "continent": "eco:continent", - "country": "eco:country", - "coordinateUncertaintyInMeters": "eco:coordinateUncertaintyInMeters", - "minimumElevationInMeters": "eco:minimumElevationInMeters", - "maximumElevationInMeters": "eco:maximumElevationInMeters", - "minimumDepthInMeters": "eco:minimumDepthInMeters", - "maximumDepthInMeters": "eco:maximumDepthInMeters", + "locality": "dwc:locality", + "habitat": "dwc:habitat", + "continent": "dwc:continent", + "country": "dwc:country", + "decimalLatitude": "dwc:decimalLatitude", + "decimalLongitude": 
"dwc:decimalLongitude", + "coordinateUncertaintyInMeters": "dwc:coordinateUncertaintyInMeters", + "minimumElevationInMeters": "dwc:minimumElevationInMeters", + "maximumElevationInMeters": "dwc:maximumElevationInMeters", + "minimumDepthInMeters": "dwc:minimumDepthInMeters", + "maximumDepthInMeters": "dwc:maximumDepthInMeters", + "geodeticDatum": "dwc:geodeticDatum", + + "eventDate": "dwc:eventDate", + "year": "dwc:year", + "month": "dwc:month", + "day": "dwc:day", + "lifeStage": "dwc:lifeStage", + + "identificationVerificationStatus": "dwc:identificationVerificationStatus", + "identifiedBy": "dwc:identifiedBy", + "samplingProtocol": "dwc:samplingProtocol", + "dataGeneralizations": "dwc:dataGeneralizations", + "informationWithheld": "dwc:informationWithheld", + "basisOfRecord": "dwc:basisOfRecord", + "occurrenceStatus": "dwc:occurrenceStatus", + + "dataDistribution": "eco:dataDistribution", + "preprocessingSteps": "eco:preprocessingSteps", + "standardizationMethod": "eco:standardizationMethod", + "trainTestSplit": "eco:trainTestSplit", + "stratificationVariable": "eco:stratificationVariable", + "dataSplitRationale": "eco:dataSplitRationale", - "eventDate": "eco:eventDate", - "eventDateStart": "eco:eventDateStart", - "eventDateEnd": "eco:eventDateEnd", - "seasonality": "eco:seasonality", - "lifeStage": "eco:lifeStage", + "generatedBy": "eco:generatedBy", + "modelConfidence": "eco:modelConfidence", + "humanVerified": "eco:humanVerified", + "generationMethod": "eco:generationMethod", + "apiEndpoint": "eco:apiEndpoint", + "rateLimitRequests": "eco:rateLimitRequests", + "rateLimitPeriod": "eco:rateLimitPeriod", + "streamingSupported": "eco:streamingSupported", + "bulkDownloadSize": "eco:bulkDownloadSize", + + "recordType": "eco:recordType", + "occurrenceToImageRatio": "eco:occurrenceToImageRatio", + "imageAnnotationType": "eco:imageAnnotationType", + + "biome": "eco:biome", "trophicLevel": "eco:trophicLevel", "ecologicalRole": "eco:ecologicalRole", "speciesInteractions": 
"eco:speciesInteractions", "diet": "eco:diet", - "hostOrganism": "eco:hostOrganism", "iucnStatus": "eco:iucnStatus", "iucnStatusSource": "eco:iucnStatusSource", @@ -46,18 +83,13 @@ "conservationActions": "eco:conservationActions", "protectedArea": "eco:protectedArea", - "identificationVerificationStatus": "eco:identificationVerificationStatus", - "identifiedBy": "eco:identifiedBy", - "samplingProtocol": "eco:samplingProtocol", - "dataGeneralizations": "eco:dataGeneralizations", - "informationWithheld": "eco:informationWithheld", - "basisOfRecord": "eco:basisOfRecord", - "imageLicense": "eco:imageLicense", "imageType": "eco:imageType", "viewAngle": "eco:viewAngle", "anatomicalFeatures": "eco:anatomicalFeatures", "phenotype": "eco:phenotype", + "imageResolution": "eco:imageResolution", + "imageFormat": "eco:imageFormat", "column": "cr:column", "conformsTo": "dct:conformsTo", From c3a0014658240f74bcf7effcbd44d94ffe6d1017 Mon Sep 17 00:00:00 2001 From: egrace479 Date: Wed, 17 Dec 2025 17:11:21 -0500 Subject: [PATCH 5/9] Move Claude-generated code to designated folder intended for reference (what we do and don't want), not use --- LICENSE => AI-generated-test/LICENSE | 0 README.md => AI-generated-test/README.md | 0 {docs => AI-generated-test/docs}/eco-spec.md | 0 {examples => AI-generated-test/examples}/treeoflife-200m.json | 0 {schema => AI-generated-test/schema}/eco-context.jsonld | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename LICENSE => AI-generated-test/LICENSE (100%) rename README.md => AI-generated-test/README.md (100%) rename {docs => AI-generated-test/docs}/eco-spec.md (100%) rename {examples => AI-generated-test/examples}/treeoflife-200m.json (100%) rename {schema => AI-generated-test/schema}/eco-context.jsonld (100%) diff --git a/LICENSE b/AI-generated-test/LICENSE similarity index 100% rename from LICENSE rename to AI-generated-test/LICENSE diff --git a/README.md b/AI-generated-test/README.md similarity index 100% rename from README.md rename to 
AI-generated-test/README.md diff --git a/docs/eco-spec.md b/AI-generated-test/docs/eco-spec.md similarity index 100% rename from docs/eco-spec.md rename to AI-generated-test/docs/eco-spec.md diff --git a/examples/treeoflife-200m.json b/AI-generated-test/examples/treeoflife-200m.json similarity index 100% rename from examples/treeoflife-200m.json rename to AI-generated-test/examples/treeoflife-200m.json diff --git a/schema/eco-context.jsonld b/AI-generated-test/schema/eco-context.jsonld similarity index 100% rename from schema/eco-context.jsonld rename to AI-generated-test/schema/eco-context.jsonld From 77c1450802b21f9a769f4aae0adad9fca38b90f7 Mon Sep 17 00:00:00 2001 From: egrace479 Date: Wed, 17 Dec 2025 17:11:44 -0500 Subject: [PATCH 6/9] Provide some more description and context for repo --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..7b2ed49 --- /dev/null +++ b/README.md @@ -0,0 +1,7 @@ +# ecoCroissant + +Repository for developing croissant-based biodiversity metadata schema following FAIR4AI principles. + +Our goal in this repo is to first incorporate Darwin Core into the Croissant format, then start adding our particular ecological FAIR4AI terms that are needed. + +See for instance [croissant-spec-1.1](https://github.com/mlcommons/croissant/blob/main/docs/croissant-spec-1.1.md) as a basis, potentially using the [RAI spec](https://github.com/mlcommons/croissant/blob/main/docs/croissant-rai-spec.md) as a model for building on top of existing schema. Note the use of schema.org as a basis, with only new terms defined. See also the [croissant.ttl](https://github.com/mlcommons/croissant/blob/main/docs/croissant.ttl). 
From 1f4c0a7b518a5906531f353c4c33e863bacfc619 Mon Sep 17 00:00:00 2001 From: egrace479 Date: Wed, 17 Dec 2025 17:15:03 -0500 Subject: [PATCH 7/9] Add an example jsonld format def --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 7b2ed49..372fcc3 100644 --- a/README.md +++ b/README.md @@ -5,3 +5,5 @@ Repository for developing croissant-based biodiversity metadata schema following Our goal in this repo is to first incorporate Darwin Core into the Croissant format, then start adding our particular ecological FAIR4AI terms that are needed. See for instance [croissant-spec-1.1](https://github.com/mlcommons/croissant/blob/main/docs/croissant-spec-1.1.md) as a basis, potentially using the [RAI spec](https://github.com/mlcommons/croissant/blob/main/docs/croissant-rai-spec.md) as a model for building on top of existing schema. Note the use of schema.org as a basis, with only new terms defined. See also the [croissant.ttl](https://github.com/mlcommons/croissant/blob/main/docs/croissant.ttl). + +See also, this [example JSONLD](https://doi.org/10.7717/peerj.12618). From 5d3bc21a6fe780a1d90ceb763f1c9cb2405c16a7 Mon Sep 17 00:00:00 2001 From: egrace479 Date: Thu, 18 Dec 2025 09:57:42 -0500 Subject: [PATCH 8/9] Add new note with pointer for Google Datasets --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 372fcc3..10d68e5 100644 --- a/README.md +++ b/README.md @@ -7,3 +7,6 @@ Our goal in this repo is to first incorporate Darwin Core into the Croissant for See for instance [croissant-spec-1.1](https://github.com/mlcommons/croissant/blob/main/docs/croissant-spec-1.1.md) as a basis, potentially using the [RAI spec](https://github.com/mlcommons/croissant/blob/main/docs/croissant-rai-spec.md) as a model for building on top of existing schema. Note the use of schema.org as a basis, with only new terms defined. 
See also the [croissant.ttl](https://github.com/mlcommons/croissant/blob/main/docs/croissant.ttl). See also, this [example JSONLD](https://doi.org/10.7717/peerj.12618). + +We should also look into the Google Datasets format and indexing process (see [Dataset schema](https://schema.org/Dataset)). [Example Query](https://datasetsearch.research.google.com/search?src=0&query=tree%20of%20life&docid=L2cvMTF4ZmJyZ2s0cQ%3D%3D); for [TreeOfLife-200M](https://huggingface.co/datasets/imageomics/TreeOfLife-200M) it does capture the DOI and the description we provide, but then the update date is "May 1, 2024", which is the GBIF snapshot we use (at least the DOI date would be logical). The correct date doesn't actually seem to be captured by Croissant either [existing-format-ex/TOL-200M-croissant](existing-format-ex/TOL-200M-croissant.jsonld). + From ad2992aa5be7c9030e2102b404259c4441885bb3 Mon Sep 17 00:00:00 2001 From: egrace479 Date: Thu, 18 Dec 2025 09:58:10 -0500 Subject: [PATCH 9/9] Add example of the TreeOfLife-200M dataset in croissant format for comparison --- existing-format-ex/TOL-200M-croissant.jsonld | 414 +++++++++++++++++++ 1 file changed, 414 insertions(+) create mode 100644 existing-format-ex/TOL-200M-croissant.jsonld diff --git a/existing-format-ex/TOL-200M-croissant.jsonld b/existing-format-ex/TOL-200M-croissant.jsonld new file mode 100644 index 0000000..d13528b --- /dev/null +++ b/existing-format-ex/TOL-200M-croissant.jsonld @@ -0,0 +1,414 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "arrayShape": "cr:arrayShape", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataBiases": "cr:dataBiases", + "dataCollection": "cr:dataCollection", + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "extract": "cr:extract", + "field": "cr:field", +
"fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isArray": "cr:isArray", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "personalSensitiveInformation": "cr:personalSensitiveInformation", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "containedIn": "cr:containedIn" + }, + "@type": "sc:Dataset", + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "repo", + "name": "repo", + "description": "The Hugging Face git repository.", + "contentUrl": "https://huggingface.co/datasets/imageomics/TreeOfLife-200M/tree/refs%2Fconvert%2Fparquet", + "encodingFormat": "git+https", + "sha256": "https://github.com/mlcommons/croissant/issues/80" + }, + { + "@type": "cr:FileSet", + "@id": "parquet-files-for-config-default", + "containedIn": { + "@id": "repo" + }, + "encodingFormat": "application/x-parquet", + "includes": "default/*/*.parquet" + } + ], + "recordSet": [ + { + "@type": "cr:RecordSet", + "dataType": "cr:Split", + "key": { + "@id": "default_splits/split_name" + }, + "@id": "default_splits", + "name": "default_splits", + "description": "Splits for the default config.", + "field": [ + { + "@type": "cr:Field", + "@id": "default_splits/split_name", + "dataType": "sc:Text" + } + ], + "data": [ + { + "default_splits/split_name": "train" + } + ] + }, + { + "@type": "cr:RecordSet", + "@id": "default", + "description": "imageomics/TreeOfLife-200M - 'default' subset", + "field": [ + { + "@type": "cr:Field", + "@id": "default/split", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": 
"parquet-files-for-config-default" + }, + "extract": { + "fileProperty": "fullpath" + }, + "transform": { + "regex": "default/(?:partial-)?(train)/.+parquet$" + } + }, + "references": { + "field": { + "@id": "default_splits/split_name" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/uuid", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "uuid" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/source_url", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "source_url" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/kingdom", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "kingdom" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/phylum", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "phylum" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/class", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "class" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/order", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "order" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/family", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "family" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/genus", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "genus" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/species", + "dataType": "sc:Text", + "source": { + 
"fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "species" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/scientific_name", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "scientific_name" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/common", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "common" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/data_source", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "data_source" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/publisher", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "publisher" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/basis_of_record", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "basis_of_record" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/img_type", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "img_type" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/source_id", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "source_id" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/shard_filename", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "shard_filename" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/shard_file_path", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" 
+ }, + "extract": { + "column": "shard_file_path" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/base_dataset_file_path", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "base_dataset_file_path" + } + } + } + ] + } + ], + "conformsTo": "http://mlcommons.org/croissant/1.1", + "name": "TreeOfLife-200M", + "identifier": "10.57967/hf/6786", + "description": "With nearly 214 million images representing 952,257 taxa across the tree of life, TreeOfLife-200M is the largest and most diverse public ML-ready dataset for computer vision models in biology at release. This dataset combines images and metadata from four core biodiversity data providers: Global Biodiversity Information Facility (GBIF), Encyclopedia of Life (EOL), BIOSCAN-5M, and FathomNet to more than double the number of unique taxa covered by TreeOfLife-10M, adding 50 million more images than BioTrove (and nearly triple the unique taxa). TreeOfLife-200M also increases image context diversity with museum specimen, camera trap, and citizen science images well-represented. 
Our rigorous curation process ensures each image has the most specific taxonomic label possible and that the overall dataset provides a well-rounded foundation for training BioCLIP 2 and future biology foundation models.", + "alternateName": [ + "imageomics/TreeOfLife-200M", + "TreeOfLife-200M" + ], + "creator": { + "@type": "Organization", + "name": "HDR Imageomics Institute", + "url": "https://huggingface.co/imageomics" + }, + "keywords": [ + "image-classification", + "zero-shot-classification", + "English", + "Latin", + "cc0-1.0", + "100M - 1B", + "parquet", + "Text", + "Image", + "Datasets", + "pandas", + "Croissant", + "Polars", + "arxiv:2505.23883", + "doi:10.57967/hf/6786", + "🇺🇸 Region: US", + "biology", + "image", + "imageomics", + "animals", + "evolutionary biology", + "CV", + "multimodal", + "clip", + "species", + "taxonomy", + "knowledge-guided", + "imbalanced" + ], + "license": "https://choosealicense.com/licenses/cc0-1.0/", + "sameAs": "https://imageomics.github.io/bioclip-2/", + "url": "https://huggingface.co/datasets/imageomics/TreeOfLife-200M" +}