diff --git a/AI-generated-test/LICENSE b/AI-generated-test/LICENSE new file mode 100644 index 0000000..bfb6d93 --- /dev/null +++ b/AI-generated-test/LICENSE @@ -0,0 +1,190 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to the Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + Copyright 2024 Imageomics Institute + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/AI-generated-test/README.md b/AI-generated-test/README.md new file mode 100644 index 0000000..ab2a0c3 --- /dev/null +++ b/AI-generated-test/README.md @@ -0,0 +1,155 @@ +# ecoCroissant 🥐🌿 + +**Croissant Extension for Biodiversity Metadata** + +[![Specification](https://img.shields.io/badge/spec-v1.0-green.svg)](docs/eco-spec.md) +[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE) + +## Overview + +ecoCroissant makes biodiversity datasets **AI-ready** by integrating [Darwin Core](https://dwc.tdwg.org/) terms with [FAIR4AI](https://www.nature.com/articles/s41597-022-01759-2) requirements. Rather than redefining existing standards, ecoCroissant uses Darwin Core terms directly and adds AI-specific metadata for machine learning applications. + +### FAIR4AI Requirements + +ecoCroissant addresses the three key FAIR4AI requirements: + +1. **Queryable Metadata**: Data/metadata can be queried without downloading large files (via Parquet/queryable formats) +2. **Ontology Integration**: Darwin Core terms are queryable with synonyms from GBIF, NCBI, EOL +3. 
**Content/Context Extraction**: Clear distinction between occurrence-based and image-based records + +### AI-Ready Features + +ecoCroissant ensures datasets are AI-ready by documenting: + +- **Data Distribution**: Class distributions, long-tail characteristics, stratification details +- **Preprocessing Pipeline**: Standardization methods, augmentation, normalization +- **Train/Val/Test Splits**: Rationale, stratification variables, split proportions +- **Model Provenance**: For AI-generated annotations or labels +- **Streaming Support**: API endpoints and rate limits for scalable data access + +## Quick Start + +### Using Darwin Core with AI-Ready Metadata + +ecoCroissant uses Darwin Core terms directly, adding AI-specific properties: + +```json +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", "sc": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/", + "eco": "http://imageomics.org/ecoCroissant/", + "dwc": "http://rs.tdwg.org/dwc/terms/", + "dct": "http://purl.org/dc/terms/" + }, + "@type": "sc:Dataset", + "name": "My Biodiversity Dataset", + "dct:conformsTo": [ + "http://mlcommons.org/croissant/1.0", + "http://imageomics.org/ecoCroissant/1.0" + ], + + "dwc:scientificName": "Lepidoptera", + "dwc:taxonRank": "order", + "dwc:habitat": ["tropical rainforest", "temperate forest"], + "dwc:basisOfRecord": "PreservedSpecimen", + + "eco:recordType": "image-based", + "eco:dataDistribution": "long-tailed: 5K species, 10-1000 images each", + "eco:trainTestSplit": "80/10/10 stratified by family", + "eco:preprocessingSteps": ["resized to 224x224", "ImageNet normalization"] +} +``` + +### Example Datasets + +See the [examples](examples/) directory for complete examples: + +- [TreeOfLife-200M](examples/treeoflife-200m.json) - AI-ready species image dataset with 200M images + +## Documentation + +- **[ecoCroissant Specification](docs/eco-spec.md)** - Complete specification with property definitions +- **[JSON-LD Context](schema/eco-context.jsonld)** - JSON-LD context 
file for ecoCroissant + +## Property Categories + +### Darwin Core Terms (Used Directly) + +ecoCroissant uses Darwin Core terms without redefinition: + +#### Taxonomic +`dwc:scientificName`, `dwc:taxonRank`, `dwc:kingdom`, `dwc:phylum`, `dwc:class`, `dwc:order`, `dwc:family`, `dwc:genus`, `dwc:taxonID`, `dwc:higherClassification`, `dwc:vernacularName` + +#### Geographic +`dwc:locality`, `dwc:habitat`, `dwc:continent`, `dwc:country`, `dwc:decimalLatitude`, `dwc:decimalLongitude`, `dwc:coordinateUncertaintyInMeters`, `dwc:minimumElevationInMeters`, `dwc:maximumElevationInMeters` + +#### Temporal +`dwc:eventDate`, `dwc:year`, `dwc:month`, `dwc:day`, `dwc:lifeStage` + +#### Data Quality +`dwc:basisOfRecord`, `dwc:identifiedBy`, `dwc:identificationVerificationStatus`, `dwc:samplingProtocol`, `dwc:dataGeneralizations`, `dwc:informationWithheld` + +### AI-Specific ecoCroissant Extensions + +#### Data Distribution & Preprocessing +`eco:dataDistribution`, `eco:preprocessingSteps`, `eco:standardizationMethod`, `eco:trainTestSplit`, `eco:stratificationVariable`, `eco:dataSplitRationale` + +#### Model Provenance +`eco:generatedBy`, `eco:modelConfidence`, `eco:humanVerified`, `eco:generationMethod` + +#### API & Streaming +`eco:apiEndpoint`, `eco:rateLimitRequests`, `eco:rateLimitPeriod`, `eco:streamingSupported`, `eco:bulkDownloadSize` + +#### Record Type Context +`eco:recordType`, `eco:occurrenceToImageRatio`, `eco:imageAnnotationType` + +#### Ecological Extensions +`eco:biome`, `eco:trophicLevel`, `eco:ecologicalRole`, `eco:speciesInteractions`, `eco:diet` + +#### Conservation Extensions +`eco:iucnStatus`, `eco:populationTrend`, `eco:threats`, `eco:protectedArea` + +See the [full specification](docs/eco-spec.md) for complete property definitions. 
+ +## Integration with Standards + +ecoCroissant is designed to integrate with established biodiversity standards: + +- **[Darwin Core](https://dwc.tdwg.org/)** - Standard for biodiversity data sharing +- **[GBIF](https://www.gbif.org/)** - Global Biodiversity Information Facility +- **[IUCN Red List](https://www.iucnredlist.org/)** - Conservation status assessments +- **[Encyclopedia of Life](https://eol.org/)** - Species information aggregator + +## Related Resources + +- [Croissant Format](https://github.com/mlcommons/croissant) - Base ML dataset format +- [Croissant RAI Extension](https://github.com/mlcommons/croissant/blob/main/docs/croissant-rai-spec.md) - Responsible AI extension +- [TreeOfLife-200M Dataset](https://huggingface.co/datasets/imageomics/TreeOfLife-200M) - Example dataset using ecoCroissant +- [Imageomics Institute](https://imageomics.org/) - Advancing biological knowledge through images + +## Contributing + +We welcome contributions! Please see our contributing guidelines for more information. + +## License + +This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. + +## Citation + +If you use ecoCroissant in your research, please cite: + +```bibtex +@misc{ecoCroissant2024, + title={ecoCroissant: A Croissant Extension for Biodiversity Metadata}, + author={Imageomics Institute}, + year={2024}, + url={https://github.com/Imageomics/ecoCroissant} +} +``` + +## Acknowledgments + +This work builds upon the [Croissant format](https://github.com/mlcommons/croissant) developed by the MLCommons Datasets Working Group. We thank the biodiversity informatics community for their contributions to standards like Darwin Core that inform this work. 
diff --git a/AI-generated-test/docs/eco-spec.md b/AI-generated-test/docs/eco-spec.md new file mode 100644 index 0000000..238e904 --- /dev/null +++ b/AI-generated-test/docs/eco-spec.md @@ -0,0 +1,687 @@ +# ecoCroissant Specification + +## Croissant Extension for Biodiversity Metadata + +Version 1.0 + + + +## Introduction + +ecoCroissant is an extension to the [Croissant format](http://mlcommons.org/croissant/1.0) designed to make biodiversity datasets **AI-ready** by integrating Darwin Core terms with FAIR4AI-specific requirements. Rather than redefining existing biodiversity standards, ecoCroissant builds upon [Darwin Core](https://dwc.tdwg.org/) terms and enhances them with AI-specific metadata needed for machine learning applications. + +### FAIR4AI Requirements + +ecoCroissant specifically addresses three FAIR4AI requirements: + +1. **Queryable Metadata**: Data and metadata can be queried without downloading large files or relying on specialized file types +2. **Ontology Integration**: Darwin Core terms are integrated directly and can be queried using synonyms from other biodiversity ontologies (GBIF, NCBI, EOL) +3. 
**Content/Context Extraction**: Clear distinction between occurrence-based and image-based data records + +### AI-Ready Data Requirements + +ecoCroissant ensures datasets are **AI-ready** by including: + +- **Distribution Information**: Data splits, stratification details, and class distributions in a directly usable form +- **Preprocessing Documentation**: Whether the data has been processed or standardized and, if so, how +- **Model Provenance**: The generating model, confidence, and human-verification status of AI-generated annotations or classifications +- **Rate Limiting**: API endpoints and request rate limits needed to stream data from source servers + +### Extension Scope + +ecoCroissant extends Croissant by: + +- **Direct Darwin Core Integration**: Using Darwin Core terms natively rather than redefining them +- **AI-Specific Metadata**: Adding properties for model provenance, data splits, and preprocessing pipelines +- **Ecological Context**: Adding properties for ecological relationships and conservation status not covered by Darwin Core +- **Image-Specific Metadata**: Adding properties for anatomical features, view angles, and image types relevant to biodiversity ML + +## Prerequisites + +The ecoCroissant vocabulary builds on the [schema.org/Dataset](http://schema.org/Dataset) vocabulary and the [Croissant core vocabulary](http://mlcommons.org/croissant/1.0). + +### Namespace + +The ecoCroissant vocabulary is defined in its own namespace, identified by the IRI: + +``` +http://imageomics.org/ecoCroissant/ +``` + +We abbreviate this namespace IRI using the prefix `eco`. 
+ +### Related Vocabularies + +ecoCroissant uses the following namespace prefixes for schema.org, Croissant, and established biodiversity standards: + +| Prefix | IRI | Description | +|--------|-----|-------------| +| sc | https://schema.org/ | The schema.org namespace | +| cr | http://mlcommons.org/croissant/ | MLCommons Croissant namespace | +| dwc | http://rs.tdwg.org/dwc/terms/ | Darwin Core terms | +| gbif | https://www.gbif.org/species/ | GBIF species pages | +| ncbi | https://www.ncbi.nlm.nih.gov/taxonomy/ | NCBI Taxonomy | +| eol | https://eol.org/pages/ | Encyclopedia of Life | +| iucn | https://www.iucnredlist.org/ | IUCN Red List | + +### Conformance + +ecoCroissant datasets must declare conformance to this specification: + +```json +"dct:conformsTo": "http://imageomics.org/ecoCroissant/1.0" +``` + +## Use Cases + +### Use Case 1: Taxonomic Discovery and Classification + +ML models for species identification require rich taxonomic context. ecoCroissant enables: + +- **Hierarchical taxonomy**: Complete taxonomic lineage from kingdom to subspecies +- **Taxonomic identifiers**: Links to authoritative databases (GBIF, NCBI, EOL) +- **Nomenclature history**: Synonyms, basionyms, and taxonomic revisions +- **Vernacular names**: Common names across languages and regions + +### Use Case 2: Geographic and Habitat Context + +Ecological datasets require spatial and habitat information: + +- **Geolocation**: Coordinates with precision and datum information +- **Habitat classification**: Biome, ecosystem, and microhabitat descriptions +- **Elevation and depth**: Altitude/depth ranges for species occurrences +- **Protected areas**: National parks, reserves, and conservation zones + +### Use Case 3: Temporal Ecology + +Understanding temporal patterns in biodiversity data requires: + +- **Seasonality**: Phenological timing, migration patterns +- **Collection timeline**: When observations or specimens were collected +- **Historical context**: Changes in distribution or abundance over time + +### Use Case 4: Species Interactions and 
Ecology + +Capturing ecological relationships: + +- **Trophic relationships**: Predator-prey, herbivore-plant interactions +- **Symbiotic relationships**: Mutualism, parasitism, commensalism +- **Pollination and dispersal**: Plant-animal interactions +- **Ecological roles**: Keystone species, ecosystem engineers + +### Use Case 5: Conservation and Population Status + +Conservation-relevant metadata: + +- **IUCN Red List status**: Global and regional threat assessments +- **Population trends**: Increasing, stable, decreasing +- **Threats**: Habitat loss, climate change, invasive species +- **Protection status**: Legal protection levels + +### Use Case 6: Data Quality and Provenance + +Ensuring data reliability for ML applications: + +- **Identification confidence**: Expert-verified vs. citizen science observations +- **Data collection method**: Field observation, museum specimen, remote sensing +- **Georeferencing quality**: GPS accuracy, geocoding method +- **Temporal precision**: Exact date vs. date range + +## Properties + +ecoCroissant uses Darwin Core terms directly where applicable and adds new properties only when needed for AI-specific requirements or ecological concepts not covered by Darwin Core. 
+ +### Darwin Core Properties (Used Directly) + +The following Darwin Core terms are used directly without redefinition: + +#### Taxonomic Terms +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| dwc:scientificName | sc:Text | ONE | The full scientific name including authorship | +| dwc:taxonRank | sc:Text | ONE | The taxonomic rank (e.g., species, genus, family) | +| dwc:kingdom | sc:Text | ONE | Taxonomic kingdom | +| dwc:phylum | sc:Text | ONE | Taxonomic phylum | +| dwc:class | sc:Text | ONE | Taxonomic class | +| dwc:order | sc:Text | ONE | Taxonomic order | +| dwc:family | sc:Text | ONE | Taxonomic family | +| dwc:genus | sc:Text | ONE | Taxonomic genus | +| dwc:higherClassification | sc:Text | ONE | Full taxonomic hierarchy | +| dwc:vernacularName | sc:Text | MANY | Common name(s) in various languages | +| dwc:taxonomicStatus | sc:Text | ONE | Status of the taxon name (accepted, synonym, etc.) | +| dwc:taxonID | sc:URL | MANY | Identifier from taxonomic databases (GBIF, NCBI, etc.) 
| + +#### Geographic Terms +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| dwc:locality | sc:Text | ONE | Description of the location | +| dwc:habitat | sc:Text | MANY | Habitat type(s) where organisms occur | +| dwc:continent | sc:Text | ONE | Continent of occurrence | +| dwc:country | sc:Text | MANY | Country/countries of occurrence | +| dwc:coordinateUncertaintyInMeters | sc:Number | ONE | Uncertainty radius for coordinates | +| dwc:minimumElevationInMeters | sc:Number | ONE | Minimum elevation of occurrences | +| dwc:maximumElevationInMeters | sc:Number | ONE | Maximum elevation of occurrences | +| dwc:minimumDepthInMeters | sc:Number | ONE | Minimum depth (for aquatic organisms) | +| dwc:maximumDepthInMeters | sc:Number | ONE | Maximum depth (for aquatic organisms) | +| dwc:decimalLatitude | sc:Float | ONE | Latitude in decimal degrees | +| dwc:decimalLongitude | sc:Float | ONE | Longitude in decimal degrees | +| dwc:geodeticDatum | sc:Text | ONE | Spatial reference system (e.g., WGS84) | + +#### Temporal Terms +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| dwc:eventDate | sc:Date or sc:DateTime | MANY | Date(s) when data was collected | +| dwc:year | sc:Integer | ONE | Year of collection | +| dwc:month | sc:Integer | ONE | Month of collection | +| dwc:day | sc:Integer | ONE | Day of collection | +| dwc:lifeStage | sc:Text | MANY | Life stage(s) represented (egg, larva, adult, etc.) 
| + +#### Data Quality Terms +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| dwc:identificationVerificationStatus | sc:Text | ONE | Verification level of taxonomic identifications | +| dwc:identifiedBy | sc:Text | MANY | Who identified the specimens/observations | +| dwc:samplingProtocol | sc:Text | ONE | Method used to collect data | +| dwc:dataGeneralizations | sc:Text | ONE | Any data generalizations applied (e.g., coordinate obscuring) | +| dwc:informationWithheld | sc:Text | ONE | Information intentionally withheld (e.g., for endangered species) | +| dwc:basisOfRecord | sc:Text | ONE | Type of record (PreservedSpecimen, HumanObservation, MachineObservation, etc.) | +| dwc:occurrenceStatus | sc:Text | ONE | Whether organism was present or absent | + +### AI-Specific Properties (ecoCroissant Extensions) + +These properties are ecoCroissant additions for AI-ready data requirements: + +#### Data Distribution and Preprocessing +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| eco:dataDistribution | sc:Text | ONE | Description of class distribution (e.g., "long-tailed", "balanced", stratification details) | +| eco:preprocessingSteps | sc:Text | MANY | List of preprocessing steps applied (e.g., "resized to 224x224", "normalized to [-1,1]") | +| eco:standardizationMethod | sc:Text | ONE | Method used for data standardization if applicable | +| eco:trainTestSplit | sc:Text | ONE | Description of train/test/validation splits with proportions | +| eco:stratificationVariable | sc:Text | MANY | Variables used for stratification (e.g., "taxonomic family", "geographic region") | +| eco:dataSplitRationale | sc:Text | ONE | Rationale for data splitting strategy | + +#### Model Provenance for AI-Generated Data +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| 
eco:generatedBy | sc:Text | ONE | Name/version of model that generated annotations or classifications | +| eco:modelConfidence | sc:Float | ONE | Confidence score for AI-generated labels (0-1) | +| eco:humanVerified | sc:Boolean | ONE | Whether AI-generated data has been human-verified | +| eco:generationMethod | sc:Text | ONE | Method used for generation (e.g., "automated classification", "bounding box detection") | + +#### API and Streaming Information +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| eco:apiEndpoint | sc:URL | ONE | API endpoint for streaming data access | +| eco:rateLimitRequests | sc:Integer | ONE | Maximum requests per time period | +| eco:rateLimitPeriod | sc:Text | ONE | Time period for rate limit (e.g., "per minute", "per hour") | +| eco:streamingSupported | sc:Boolean | ONE | Whether data can be streamed rather than downloaded | +| eco:bulkDownloadSize | sc:Text | ONE | Approximate size of full dataset download | + +#### Record Type Context (FAIR4AI Requirement) +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| eco:recordType | sc:Text | ONE | Type of data record: "occurrence-based" or "image-based" or "mixed" | +| eco:occurrenceToImageRatio | sc:Float | ONE | Ratio of occurrence records to images (relevant for mixed datasets) | +| eco:imageAnnotationType | sc:Text | MANY | Type of image annotations (e.g., "bounding box", "segmentation", "whole image classification") | + +#### Ecological Extensions (Not in Darwin Core) +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| eco:biome | sc:Text | ONE | Major biome classification | +| eco:trophicLevel | sc:Text | ONE | Position in food chain (producer, primary consumer, etc.) | +| eco:ecologicalRole | sc:Text | MANY | Ecological function (pollinator, predator, decomposer, etc.) 
| +| eco:speciesInteractions | sc:Text | MANY | Description of species interactions in the dataset | +| eco:diet | sc:Text | MANY | Diet composition for animals | + +#### Conservation Extensions +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| eco:iucnStatus | sc:Text | ONE | IUCN Red List category (LC, NT, VU, EN, CR, EW, EX) | +| eco:iucnStatusSource | sc:URL | ONE | Link to IUCN assessment | +| eco:populationTrend | sc:Text | ONE | Population trend (increasing, stable, decreasing, unknown) | +| eco:threats | sc:Text | MANY | Known threats to the species | +| eco:conservationActions | sc:Text | MANY | Conservation actions in place or recommended | +| eco:protectedArea | sc:Text | MANY | Protected areas where species occurs | + +#### Image-Specific Extensions +| Property | Expected Type | Cardinality | Description | +|----------|---------------|-------------|-------------| +| eco:imageLicense | sc:URL | ONE | License for images in the dataset | +| eco:imageType | sc:Text | MANY | Type of images (photograph, illustration, microscopy, etc.) | +| eco:viewAngle | sc:Text | MANY | View angle of specimens in images (dorsal, ventral, lateral, etc.) 
| +| eco:anatomicalFeatures | sc:Text | MANY | Anatomical features visible or annotated | +| eco:phenotype | sc:Text | MANY | Observable phenotypic characteristics | +| eco:imageResolution | sc:Text | ONE | Resolution of images (e.g., "1024x1024", "variable") | +| eco:imageFormat | sc:Text | MANY | Image file formats (e.g., "JPEG", "PNG", "TIFF") | + +## JSON-LD Context + +The recommended JSON-LD context for ecoCroissant uses Darwin Core terms directly: + +```json +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "sc": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/", + "eco": "http://imageomics.org/ecoCroissant/", + "dwc": "http://rs.tdwg.org/dwc/terms/", + "dct": "http://purl.org/dc/terms/", + + "scientificName": "dwc:scientificName", + "taxonRank": "dwc:taxonRank", + "kingdom": "dwc:kingdom", + "phylum": "dwc:phylum", + "class": "dwc:class", + "order": "dwc:order", + "family": "dwc:family", + "genus": "dwc:genus", + "taxonID": "dwc:taxonID", + "higherClassification": "dwc:higherClassification", + "vernacularName": "dwc:vernacularName", + "taxonomicStatus": "dwc:taxonomicStatus", + + "locality": "dwc:locality", + "habitat": "dwc:habitat", + "continent": "dwc:continent", + "country": "dwc:country", + "decimalLatitude": "dwc:decimalLatitude", + "decimalLongitude": "dwc:decimalLongitude", + "coordinateUncertaintyInMeters": "dwc:coordinateUncertaintyInMeters", + "minimumElevationInMeters": "dwc:minimumElevationInMeters", + "maximumElevationInMeters": "dwc:maximumElevationInMeters", + "minimumDepthInMeters": "dwc:minimumDepthInMeters", + "maximumDepthInMeters": "dwc:maximumDepthInMeters", + "geodeticDatum": "dwc:geodeticDatum", + + "eventDate": "dwc:eventDate", + "year": "dwc:year", + "month": "dwc:month", + "day": "dwc:day", + "lifeStage": "dwc:lifeStage", + + "identificationVerificationStatus": "dwc:identificationVerificationStatus", + "identifiedBy": "dwc:identifiedBy", + "samplingProtocol": "dwc:samplingProtocol", + 
"dataGeneralizations": "dwc:dataGeneralizations", + "informationWithheld": "dwc:informationWithheld", + "basisOfRecord": "dwc:basisOfRecord", + "occurrenceStatus": "dwc:occurrenceStatus", + + "dataDistribution": "eco:dataDistribution", + "preprocessingSteps": "eco:preprocessingSteps", + "standardizationMethod": "eco:standardizationMethod", + "trainTestSplit": "eco:trainTestSplit", + "stratificationVariable": "eco:stratificationVariable", + "dataSplitRationale": "eco:dataSplitRationale", + + "generatedBy": "eco:generatedBy", + "modelConfidence": "eco:modelConfidence", + "humanVerified": "eco:humanVerified", + "generationMethod": "eco:generationMethod", + + "apiEndpoint": "eco:apiEndpoint", + "rateLimitRequests": "eco:rateLimitRequests", + "rateLimitPeriod": "eco:rateLimitPeriod", + "streamingSupported": "eco:streamingSupported", + "bulkDownloadSize": "eco:bulkDownloadSize", + + "recordType": "eco:recordType", + "occurrenceToImageRatio": "eco:occurrenceToImageRatio", + "imageAnnotationType": "eco:imageAnnotationType", + + "biome": "eco:biome", + "trophicLevel": "eco:trophicLevel", + "ecologicalRole": "eco:ecologicalRole", + "speciesInteractions": "eco:speciesInteractions", + "diet": "eco:diet", + + "iucnStatus": "eco:iucnStatus", + "iucnStatusSource": "eco:iucnStatusSource", + "populationTrend": "eco:populationTrend", + "threats": "eco:threats", + "conservationActions": "eco:conservationActions", + "protectedArea": "eco:protectedArea", + + "imageLicense": "eco:imageLicense", + "imageType": "eco:imageType", + "viewAngle": "eco:viewAngle", + "anatomicalFeatures": "eco:anatomicalFeatures", + "phenotype": "eco:phenotype", + "imageResolution": "eco:imageResolution", + "imageFormat": "eco:imageFormat", + + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + 
"field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform" + } +} +``` + +## Examples + +### Example 1: AI-Ready Species Image Dataset (TreeOfLife-200M style) + +```json +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/", + "eco": "http://imageomics.org/ecoCroissant/", + "dwc": "http://rs.tdwg.org/dwc/terms/", + "dct": "http://purl.org/dc/terms/" + }, + "@type": "sc:Dataset", + "name": "TreeOfLife-200M", + "description": "AI-ready dataset of 200M images spanning the tree of life with stratified splits for species identification. 
Images from iNaturalist Research Grade observations.", + "license": "https://creativecommons.org/licenses/by-nc-sa/4.0/", + "url": "https://huggingface.co/datasets/imageomics/TreeOfLife-200M", + "dct:conformsTo": [ + "http://mlcommons.org/croissant/1.0", + "http://imageomics.org/ecoCroissant/1.0" + ], + + "dwc:basisOfRecord": ["HumanObservation"], + "dwc:identificationVerificationStatus": "Research Grade (2/3+ community agreement)", + "dwc:samplingProtocol": "Community science observations via iNaturalist platform", + "dwc:dataGeneralizations": "Coordinates obscured for sensitive species per observer privacy settings", + + "eco:recordType": "image-based", + "eco:dataDistribution": "long-tailed: 500K species with 1-10,000 images each, stratified by taxonomic family", + "eco:preprocessingSteps": ["resized to 224x224", "normalized to ImageNet stats", "augmented with random crops and flips"], + "eco:standardizationMethod": "ImageNet normalization (mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])", + "eco:trainTestSplit": "80% train, 10% validation, 10% test - stratified by species", + "eco:stratificationVariable": ["dwc:family", "dwc:genus"], + "eco:dataSplitRationale": "Stratified to ensure representation across taxonomic groups; temporal split avoided due to seasonal biases", + + "eco:apiEndpoint": "https://huggingface.co/api/datasets/imageomics/TreeOfLife-200M", + "eco:streamingSupported": true, + "eco:bulkDownloadSize": "~15TB uncompressed", + "eco:rateLimitRequests": 1000, + "eco:rateLimitPeriod": "per hour", + + "eco:imageType": ["photograph"], + "eco:imageResolution": "variable - minimum 224x224, maximum 4096x4096", + "eco:imageFormat": ["JPEG"], + "eco:imageLicense": "https://creativecommons.org/licenses/by-nc/4.0/", + + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "metadata.parquet", + "name": "metadata.parquet", + "description": "Queryable metadata without downloading images", + "contentUrl": 
"https://huggingface.co/datasets/imageomics/TreeOfLife-200M/resolve/main/metadata.parquet", + "encodingFormat": "application/x-parquet" + } + ], + + "recordSet": [ + { + "@type": "cr:RecordSet", + "@id": "species_images", + "field": [ + { + "@type": "cr:Field", + "@id": "species_images/image_id", + "dataType": "sc:Text" + }, + { + "@type": "cr:Field", + "@id": "species_images/scientificName", + "description": "Darwin Core scientific name", + "dataType": "dwc:scientificName" + }, + { + "@type": "cr:Field", + "@id": "species_images/taxonID", + "description": "iNaturalist taxon identifier linking to GBIF", + "dataType": "dwc:taxonID" + }, + { + "@type": "cr:Field", + "@id": "species_images/kingdom", + "dataType": "dwc:kingdom" + }, + { + "@type": "cr:Field", + "@id": "species_images/family", + "dataType": "dwc:family" + }, + { + "@type": "cr:Field", + "@id": "species_images/split", + "description": "train/val/test split assignment", + "dataType": "sc:Text" + } + ] + } + ] +} +``` + +### Example 2: Butterfly Specimen Dataset with AI-Generated Annotations + +```json +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/", + "eco": "http://imageomics.org/ecoCroissant/", + "dwc": "http://rs.tdwg.org/dwc/terms/", + "dct": "http://purl.org/dc/terms/" + }, + "@type": "sc:Dataset", + "name": "Heliconius Butterfly Wing Pattern Dataset", + "description": "AI-ready dataset of museum specimens with standardized imaging and AI-assisted wing pattern segmentations.", + "license": "https://creativecommons.org/licenses/by/4.0/", + "dct:conformsTo": [ + "http://mlcommons.org/croissant/1.0", + "http://imageomics.org/ecoCroissant/1.0" + ], + + "dwc:scientificName": "Heliconius Kluk, 1780", + "dwc:taxonRank": "genus", + "dwc:taxonID": "https://www.gbif.org/species/1932585", + "dwc:higherClassification": "Animalia > Arthropoda > Insecta > Lepidoptera > Nymphalidae > Heliconiinae > Heliconius", + "dwc:vernacularName": ["Longwing 
butterflies"], + "dwc:kingdom": "Animalia", + "dwc:class": "Insecta", + "dwc:order": "Lepidoptera", + "dwc:family": "Nymphalidae", + "dwc:genus": "Heliconius", + + "dwc:habitat": ["tropical rainforest", "forest edge", "secondary forest"], + "dwc:continent": ["South America", "Central America"], + "dwc:country": ["Ecuador", "Peru", "Colombia", "Panama", "Costa Rica"], + "dwc:minimumElevationInMeters": 0, + "dwc:maximumElevationInMeters": 2000, + + "dwc:lifeStage": ["adult"], + "dwc:basisOfRecord": "PreservedSpecimen", + "dwc:identificationVerificationStatus": "expert-verified", + "dwc:identifiedBy": ["Museum taxonomists", "Heliconius specialists"], + "dwc:samplingProtocol": "Museum specimens imaged with standardized dorsal and ventral views at 300 DPI", + + "eco:biome": "tropical moist broadleaf forest", + "eco:trophicLevel": "primary consumer", + "eco:ecologicalRole": ["pollinator", "Müllerian mimic"], + "eco:diet": ["pollen", "nectar"], + "eco:speciesInteractions": "Müllerian mimicry complex; larvae on Passiflora", + + "eco:iucnStatus": "LC", + "eco:populationTrend": "stable", + + "eco:recordType": "image-based", + "eco:preprocessingSteps": ["white background removal", "standardized to 1024x1024", "color-corrected"], + "eco:imageType": ["museum specimen photograph"], + "eco:imageResolution": "1024x1024", + "eco:imageFormat": ["TIFF", "JPEG"], + "eco:viewAngle": ["dorsal", "ventral"], + "eco:anatomicalFeatures": ["forewing", "hindwing", "wing pattern"], + "eco:phenotype": "wing color pattern", + "eco:imageAnnotationType": ["segmentation"], + + "eco:generatedBy": "Mask R-CNN v2.1 trained on 5K hand-annotated specimens", + "eco:modelConfidence": 0.92, + "eco:humanVerified": true, + "eco:generationMethod": "automated wing boundary segmentation with manual correction" +} +``` + +### Example 3: Camera Trap Dataset + +```json +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/", + "eco": 
"http://imageomics.org/ecoCroissant/", + "dct": "http://purl.org/dc/terms/" + }, + "@type": "sc:Dataset", + "name": "Amazon Rainforest Camera Trap Survey", + "description": "Camera trap images of mammals from the Amazon rainforest for biodiversity monitoring and species identification.", + "license": "https://creativecommons.org/licenses/by-nc/4.0/", + "dct:conformsTo": [ + "http://mlcommons.org/croissant/1.0", + "http://imageomics.org/ecoCroissant/1.0" + ], + + "eco:taxon": "Mammalia", + "eco:taxonRank": "class", + "eco:higherClassification": "Animalia > Chordata > Mammalia", + + "eco:locality": "Yasuní National Park, Ecuador", + "eco:habitat": ["lowland tropical rainforest", "terra firme forest", "várzea forest"], + "eco:biome": "tropical moist broadleaf forest", + "eco:continent": "South America", + "eco:country": "Ecuador", + "eco:coordinateUncertaintyInMeters": 10, + "eco:minimumElevationInMeters": 200, + "eco:maximumElevationInMeters": 400, + + "eco:eventDateStart": "2020-01-01", + "eco:eventDateEnd": "2022-12-31", + "eco:seasonality": ["wet season", "dry season"], + + "eco:protectedArea": "Yasuní National Park", + "eco:threats": ["habitat fragmentation", "oil extraction", "hunting"], + + "eco:basisOfRecord": "MachineObservation", + "eco:identificationVerificationStatus": "expert-verified with AI-assisted pre-classification", + "eco:samplingProtocol": "Camera traps deployed at 1km intervals, active 24/7", + "eco:dataGeneralizations": "Exact coordinates obscured for sensitive species locations", + "eco:informationWithheld": "Precise locations of endangered species nesting sites withheld", + + "eco:imageType": ["camera trap photograph"], + "eco:imageLicense": "https://creativecommons.org/licenses/by-nc/4.0/" +} +``` + +## Darwin Core Integration + +ecoCroissant uses Darwin Core terms directly as part of its vocabulary. 
There is no separate "ecoCroissant version" of Darwin Core terms - the standard Darwin Core terms are used natively through the `dwc:` namespace. + +### Direct Darwin Core Usage + +All Darwin Core terms are available for use in ecoCroissant datasets. The most commonly used terms include: + +- **Taxonomic**: `dwc:scientificName`, `dwc:taxonRank`, `dwc:kingdom`, `dwc:phylum`, `dwc:class`, `dwc:order`, `dwc:family`, `dwc:genus`, `dwc:higherClassification`, `dwc:taxonID`, `dwc:vernacularName`, `dwc:taxonomicStatus` +- **Geographic**: `dwc:locality`, `dwc:habitat`, `dwc:continent`, `dwc:country`, `dwc:decimalLatitude`, `dwc:decimalLongitude`, `dwc:coordinateUncertaintyInMeters`, `dwc:geodeticDatum`, elevation and depth terms +- **Temporal**: `dwc:eventDate`, `dwc:year`, `dwc:month`, `dwc:day` +- **Data Quality**: `dwc:basisOfRecord`, `dwc:identifiedBy`, `dwc:identificationVerificationStatus`, `dwc:samplingProtocol`, `dwc:dataGeneralizations`, `dwc:informationWithheld` + +### Queryability with Ontology Synonyms + +Darwin Core terms in ecoCroissant datasets can be queried using synonyms from other biodiversity ontologies: + +- **GBIF Backbone Taxonomy**: `dwc:taxonID` can link to GBIF species pages +- **NCBI Taxonomy**: Cross-reference via taxon identifiers +- **Encyclopedia of Life (EOL)**: Link species concepts across systems +- **Integrated Taxonomic Information System (ITIS)**: Standard taxonomic references + +This satisfies the FAIR4AI requirement that "ontology used can be queried with synonyms from other ontologies." 
+ +## Integration with FAIR4AI Principles + +ecoCroissant supports FAIR4AI principles: + +### Findable +- Standardized metadata fields enable discovery across repositories +- Links to authoritative taxonomic databases (GBIF, NCBI, EOL) +- Rich keyword and classification support + +### Accessible +- Clear licensing information for data and images +- Information about data access restrictions or withheld information +- Links to data sources and repositories + +### Interoperable +- JSON-LD format compatible with Croissant ecosystem +- Alignment with Darwin Core for biodiversity data exchange +- Integration with schema.org for web discoverability + +### Reusable +- Detailed provenance and methodology documentation +- Data quality indicators and verification status +- Conservation context for ethical use considerations + +## References + +1. [Croissant: A Metadata Format for ML-Ready Datasets](https://doi.org/10.1145/3650203.3663326) +2. [Darwin Core Standard](https://dwc.tdwg.org/) +3. [GBIF Data Quality](https://www.gbif.org/data-quality-requirements) +4. [FAIR4AI Principles](https://www.nature.com/articles/s41597-022-01759-2) +5. [IUCN Red List Categories and Criteria](https://www.iucnredlist.org/resources/categories-and-criteria) +6. [Encyclopedia of Life](https://eol.org/) + +## License + +This specification is released under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0). + +## Contributors + +- Imageomics Institute + +## Acknowledgments + +This work builds upon the [Croissant format](https://github.com/mlcommons/croissant) developed by the MLCommons Datasets Working Group. 
diff --git a/AI-generated-test/examples/treeoflife-200m.json b/AI-generated-test/examples/treeoflife-200m.json new file mode 100644 index 0000000..89ff321 --- /dev/null +++ b/AI-generated-test/examples/treeoflife-200m.json @@ -0,0 +1,198 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "sc": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/", + "eco": "http://imageomics.org/ecoCroissant/", + "dwc": "http://rs.tdwg.org/dwc/terms/", + "dct": "http://purl.org/dc/terms/" + }, + "@type": "sc:Dataset", + "name": "TreeOfLife-200M", + "description": "AI-ready dataset of 200M images from iNaturalist Research Grade observations spanning the tree of life. Includes stratified train/val/test splits, queryable metadata, and preprocessing documentation for species identification ML models.", + "license": "https://creativecommons.org/licenses/by-nc-sa/4.0/", + "url": "https://huggingface.co/datasets/imageomics/TreeOfLife-200M", + "creator": { + "@type": "Organization", + "name": "Imageomics Institute", + "url": "https://imageomics.org/" + }, + "datePublished": "2024", + "keywords": [ + "species identification", + "biodiversity", + "computer vision", + "iNaturalist", + "phylogeny", + "tree of life", + "image classification", + "AI-ready", + "FAIR4AI" + ], + "dct:conformsTo": [ + "http://mlcommons.org/croissant/1.0", + "http://imageomics.org/ecoCroissant/1.0" + ], + + "dwc:basisOfRecord": "HumanObservation", + "dwc:identificationVerificationStatus": "Research Grade (2/3+ community agreement)", + "dwc:samplingProtocol": "Community science observations via iNaturalist platform, 2008-2023", + "dwc:dataGeneralizations": "Coordinates obscured for sensitive/threatened species per observer privacy settings", + + "eco:recordType": "image-based", + "eco:dataDistribution": "Long-tailed distribution: ~500K species with 1-10,000 images per species. Top 1% of species account for 30% of images. 
Stratified by taxonomic family to ensure representation.", + "eco:preprocessingSteps": [ + "Resized to 224x224 pixels (aspect ratio preserved, zero-padded)", + "Normalized using ImageNet statistics (mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])", + "Training augmentation: random horizontal flip, random crop, color jitter" + ], + "eco:standardizationMethod": "ImageNet normalization applied to all images for transfer learning compatibility", + "eco:trainTestSplit": "80% train (160M images), 10% validation (20M images), 10% test (20M images)", + "eco:stratificationVariable": ["dwc:family", "dwc:genus"], + "eco:dataSplitRationale": "Stratified by taxonomic family to ensure phylogenetic diversity across splits. Species with <10 images excluded from test set to avoid data leakage. Temporal split avoided due to seasonal observation biases.", + + "eco:apiEndpoint": "https://huggingface.co/api/datasets/imageomics/TreeOfLife-200M", + "eco:streamingSupported": true, + "eco:bulkDownloadSize": "~15TB uncompressed images + 5GB metadata", + "eco:rateLimitRequests": 1000, + "eco:rateLimitPeriod": "per hour", + + "eco:imageType": "photograph", + "eco:imageResolution": "Variable source resolution (224x224 to 4096x4096), standardized to 224x224 for ML", + "eco:imageFormat": "JPEG", + "eco:imageLicense": "CC-BY-NC 4.0 (individual images may have more permissive licenses)", + + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "metadata.parquet", + "name": "metadata.parquet", + "description": "Queryable metadata file with Darwin Core terms. Can be queried without downloading images. 
Includes taxonomy, geography, temporal info, and split assignments.", + "contentUrl": "https://huggingface.co/datasets/imageomics/TreeOfLife-200M/resolve/main/metadata.parquet", + "encodingFormat": "application/x-parquet", + "contentSize": "5GB" + }, + { + "@type": "cr:FileSet", + "@id": "train-images", + "name": "train-images", + "description": "Training split images", + "contentUrl": "https://huggingface.co/datasets/imageomics/TreeOfLife-200M/resolve/main/images/train/", + "encodingFormat": "image/jpeg" + } + ], + + "recordSet": [ + { + "@type": "cr:RecordSet", + "@id": "observations", + "name": "Species Observations", + "description": "Image-based species observations with Darwin Core metadata", + "field": [ + { + "@type": "cr:Field", + "@id": "observations/image_id", + "name": "image_id", + "description": "Unique identifier for the image", + "dataType": "sc:Text" + }, + { + "@type": "cr:Field", + "@id": "observations/image_url", + "name": "image_url", + "description": "URL to the image file", + "dataType": "sc:URL" + }, + { + "@type": "cr:Field", + "@id": "observations/scientificName", + "name": "scientificName", + "description": "Darwin Core scientific name of the observed species", + "dataType": "sc:Text", + "source": { + "fileObject": {"@id": "metadata.parquet"}, + "extract": {"column": "scientificName"} + } + }, + { + "@type": "cr:Field", + "@id": "observations/taxonID", + "name": "taxonID", + "description": "iNaturalist taxon identifier (links to GBIF)", + "dataType": "sc:Text", + "source": { + "fileObject": {"@id": "metadata.parquet"}, + "extract": {"column": "taxonID"} + } + }, + { + "@type": "cr:Field", + "@id": "observations/kingdom", + "name": "kingdom", + "description": "Taxonomic kingdom (Darwin Core)", + "dataType": "sc:Text", + "source": { + "fileObject": {"@id": "metadata.parquet"}, + "extract": {"column": "kingdom"} + } + }, + { + "@type": "cr:Field", + "@id": "observations/family", + "name": "family", + "description": "Taxonomic family (Darwin 
Core)", + "dataType": "sc:Text", + "source": { + "fileObject": {"@id": "metadata.parquet"}, + "extract": {"column": "family"} + } + }, + { + "@type": "cr:Field", + "@id": "observations/decimalLatitude", + "name": "decimalLatitude", + "description": "Latitude in decimal degrees (may be obscured for sensitive species)", + "dataType": "sc:Float", + "source": { + "fileObject": {"@id": "metadata.parquet"}, + "extract": {"column": "decimalLatitude"} + } + }, + { + "@type": "cr:Field", + "@id": "observations/decimalLongitude", + "name": "decimalLongitude", + "description": "Longitude in decimal degrees (may be obscured for sensitive species)", + "dataType": "sc:Float", + "source": { + "fileObject": {"@id": "metadata.parquet"}, + "extract": {"column": "decimalLongitude"} + } + }, + { + "@type": "cr:Field", + "@id": "observations/eventDate", + "name": "eventDate", + "description": "Date of observation (Darwin Core)", + "dataType": "sc:Date", + "source": { + "fileObject": {"@id": "metadata.parquet"}, + "extract": {"column": "eventDate"} + } + }, + { + "@type": "cr:Field", + "@id": "observations/split", + "name": "split", + "description": "Data split assignment: train, val, or test", + "dataType": "sc:Text", + "source": { + "fileObject": {"@id": "metadata.parquet"}, + "extract": {"column": "split"} + } + } + ] + } + ] +} diff --git a/AI-generated-test/schema/eco-context.jsonld b/AI-generated-test/schema/eco-context.jsonld new file mode 100644 index 0000000..d2d6d21 --- /dev/null +++ b/AI-generated-test/schema/eco-context.jsonld @@ -0,0 +1,131 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "sc": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/", + "eco": "http://imageomics.org/ecoCroissant/", + "dwc": "http://rs.tdwg.org/dwc/terms/", + "dct": "http://purl.org/dc/terms/", + + "scientificName": "dwc:scientificName", + "taxonRank": "dwc:taxonRank", + "kingdom": "dwc:kingdom", + "phylum": "dwc:phylum", + "class": "dwc:class", + 
"order": "dwc:order", + "family": "dwc:family", + "genus": "dwc:genus", + "taxonID": "dwc:taxonID", + "higherClassification": "dwc:higherClassification", + "vernacularName": "dwc:vernacularName", + "taxonomicStatus": "dwc:taxonomicStatus", + + "locality": "dwc:locality", + "habitat": "dwc:habitat", + "continent": "dwc:continent", + "country": "dwc:country", + "decimalLatitude": "dwc:decimalLatitude", + "decimalLongitude": "dwc:decimalLongitude", + "coordinateUncertaintyInMeters": "dwc:coordinateUncertaintyInMeters", + "minimumElevationInMeters": "dwc:minimumElevationInMeters", + "maximumElevationInMeters": "dwc:maximumElevationInMeters", + "minimumDepthInMeters": "dwc:minimumDepthInMeters", + "maximumDepthInMeters": "dwc:maximumDepthInMeters", + "geodeticDatum": "dwc:geodeticDatum", + + "eventDate": "dwc:eventDate", + "year": "dwc:year", + "month": "dwc:month", + "day": "dwc:day", + "lifeStage": "dwc:lifeStage", + + "identificationVerificationStatus": "dwc:identificationVerificationStatus", + "identifiedBy": "dwc:identifiedBy", + "samplingProtocol": "dwc:samplingProtocol", + "dataGeneralizations": "dwc:dataGeneralizations", + "informationWithheld": "dwc:informationWithheld", + "basisOfRecord": "dwc:basisOfRecord", + "occurrenceStatus": "dwc:occurrenceStatus", + + "dataDistribution": "eco:dataDistribution", + "preprocessingSteps": "eco:preprocessingSteps", + "standardizationMethod": "eco:standardizationMethod", + "trainTestSplit": "eco:trainTestSplit", + "stratificationVariable": "eco:stratificationVariable", + "dataSplitRationale": "eco:dataSplitRationale", + + "generatedBy": "eco:generatedBy", + "modelConfidence": "eco:modelConfidence", + "humanVerified": "eco:humanVerified", + "generationMethod": "eco:generationMethod", + + "apiEndpoint": "eco:apiEndpoint", + "rateLimitRequests": "eco:rateLimitRequests", + "rateLimitPeriod": "eco:rateLimitPeriod", + "streamingSupported": "eco:streamingSupported", + "bulkDownloadSize": "eco:bulkDownloadSize", + + "recordType": 
"eco:recordType", + "occurrenceToImageRatio": "eco:occurrenceToImageRatio", + "imageAnnotationType": "eco:imageAnnotationType", + + "biome": "eco:biome", + "trophicLevel": "eco:trophicLevel", + "ecologicalRole": "eco:ecologicalRole", + "speciesInteractions": "eco:speciesInteractions", + "diet": "eco:diet", + + "iucnStatus": "eco:iucnStatus", + "iucnStatusSource": "eco:iucnStatusSource", + "populationTrend": "eco:populationTrend", + "threats": "eco:threats", + "conservationActions": "eco:conservationActions", + "protectedArea": "eco:protectedArea", + + "imageLicense": "eco:imageLicense", + "imageType": "eco:imageType", + "viewAngle": "eco:viewAngle", + "anatomicalFeatures": "eco:anatomicalFeatures", + "phenotype": "eco:phenotype", + "imageResolution": "eco:imageResolution", + "imageFormat": "eco:imageFormat", + + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform" + } +} diff --git a/README.md b/README.md index dc13b00..10d68e5 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,12 @@ # ecoCroissant + Repository for developing croissant-based biodiversity metadata schema following FAIR4AI principles. 
+ +Our goal in this repo is to first incorporate Darwin Core into the Croissant format, then start adding our particular ecological FAIR4AI terms that are needed. + +See for instance [croissant-spec-1.1](https://github.com/mlcommons/croissant/blob/main/docs/croissant-spec-1.1.md) as a basis, potentially using the [RAI spec](https://github.com/mlcommons/croissant/blob/main/docs/croissant-rai-spec.md) as a model for building on top of existing schema. Note the use of schema.org as a basis, with only new terms defined. See also the [croissant.ttl](https://github.com/mlcommons/croissant/blob/main/docs/croissant.ttl). + +See also, this [example JSONLD](https://doi.org/10.7717/peerj.12618). + +We should also look into the Google Datasets format and indexing process (see [Dataset schema](https://schema.org/Dataset)). [Example Query](https://datasetsearch.research.google.com/search?src=0&query=tree%20of%20life&docid=L2cvMTF4ZmJyZ2s0cQ%3D%3D); for [TreeOfLife-200M](https://huggingface.co/datasets/imageomics/TreeOfLife-200M) it does capture the DOI and the description we provide, but then the update date is "May 1, 2024", which is the GBIF snapshot we use (at least the DOI date would be logical). The correct date doesn't actually seem to be captured by Croissant either (see [existing-format-ex/TOL-200M-croissant](existing-format-ex/TOL-200M-croissant.jsonld)). 
+ diff --git a/existing-format-ex/TOL-200M-croissant.jsonld b/existing-format-ex/TOL-200M-croissant.jsonld new file mode 100644 index 0000000..d13528b --- /dev/null +++ b/existing-format-ex/TOL-200M-croissant.jsonld @@ -0,0 +1,414 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "arrayShape": "cr:arrayShape", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataBiases": "cr:dataBiases", + "dataCollection": "cr:dataCollection", + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isArray": "cr:isArray", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "personalSensitiveInformation": "cr:personalSensitiveInformation", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "containedIn": "cr:containedIn" + }, + "@type": "sc:Dataset", + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "repo", + "name": "repo", + "description": "The Hugging Face git repository.", + "contentUrl": "https://huggingface.co/datasets/imageomics/TreeOfLife-200M/tree/refs%2Fconvert%2Fparquet", + "encodingFormat": "git+https", + "sha256": "https://github.com/mlcommons/croissant/issues/80" + }, + { + "@type": "cr:FileSet", + "@id": "parquet-files-for-config-default", + "containedIn": { + "@id": "repo" + }, + "encodingFormat": 
"application/x-parquet", + "includes": "default/*/*.parquet" + } + ], + "recordSet": [ + { + "@type": "cr:RecordSet", + "dataType": "cr:Split", + "key": { + "@id": "default_splits/split_name" + }, + "@id": "default_splits", + "name": "default_splits", + "description": "Splits for the default config.", + "field": [ + { + "@type": "cr:Field", + "@id": "default_splits/split_name", + "dataType": "sc:Text" + } + ], + "data": [ + { + "default_splits/split_name": "train" + } + ] + }, + { + "@type": "cr:RecordSet", + "@id": "default", + "description": "imageomics/TreeOfLife-200M - 'default' subset", + "field": [ + { + "@type": "cr:Field", + "@id": "default/split", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "fileProperty": "fullpath" + }, + "transform": { + "regex": "default/(?:partial-)?(train)/.+parquet$" + } + }, + "references": { + "field": { + "@id": "default_splits/split_name" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/uuid", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "uuid" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/source_url", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "source_url" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/kingdom", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "kingdom" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/phylum", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "phylum" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/class", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "class" + 
} + } + }, + { + "@type": "cr:Field", + "@id": "default/order", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "order" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/family", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "family" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/genus", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "genus" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/species", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "species" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/scientific_name", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "scientific_name" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/common", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "common" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/data_source", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "data_source" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/publisher", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "publisher" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/basis_of_record", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "basis_of_record" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/img_type", + "dataType": "sc:Text", + 
"source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "img_type" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/source_id", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "source_id" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/shard_filename", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "shard_filename" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/shard_file_path", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "shard_file_path" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/base_dataset_file_path", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "base_dataset_file_path" + } + } + } + ] + } + ], + "conformsTo": "http://mlcommons.org/croissant/1.1", + "name": "TreeOfLife-200M", + "identifier": "10.57967/hf/6786", + "description": "With nearly 214 million images representing 952,257 taxa across the tree of life, TreeOfLife-200M is the largest and most diverse public ML-ready dataset for computer vision models in biology at release. This dataset combines images and metadata from four core biodiversity data providers: Global Biodiversity Information Facility (GBIF), Encyclopedia of Life (EOL), BIOSCAN-5M, and FathomNet to more than double the number of unique taxa covered by TreeOfLife-10M, adding 50 million more images than BioTrove (and nearly triple the unique taxa). TreeOfLife-200M also increases image context diversity with museum specimen, camera trap, and citizen science images well-represented. 
Our rigorous curation process ensures each image has the most specific taxonomic label possible and that the overall dataset provides a well-rounded foundation for training BioCLIP 2 and future biology foundation models.", + "alternateName": [ + "imageomics/TreeOfLife-200M", + "TreeOfLife-200M" + ], + "creator": { + "@type": "Organization", + "name": "HDR Imageomics Institute", + "url": "https://huggingface.co/imageomics" + }, + "keywords": [ + "image-classification", + "zero-shot-classification", + "English", + "Latin", + "cc0-1.0", + "100M - 1B", + "parquet", + "Text", + "Image", + "Datasets", + "pandas", + "Croissant", + "Polars", + "arxiv:2505.23883", + "doi:10.57967/hf/6786", + "🇺🇸 Region: US", + "biology", + "image", + "imageomics", + "animals", + "evolutionary biology", + "CV", + "multimodal", + "clip", + "species", + "taxonomy", + "knowledge-guided", + "imbalanced" + ], + "license": "https://choosealicense.com/licenses/cc0-1.0/", + "sameAs": "https://imageomics.github.io/bioclip-2/", + "url": "https://huggingface.co/datasets/imageomics/TreeOfLife-200M" +}