diff --git a/data/sheets_d4dassistant/cm4ai_d4d.yaml b/data/sheets_d4dassistant/cm4ai_d4d.yaml new file mode 100644 index 00000000..8fe31e34 --- /dev/null +++ b/data/sheets_d4dassistant/cm4ai_d4d.yaml @@ -0,0 +1,279 @@ +# Consolidated D4D Datasheet for Cell Maps for Artificial Intelligence (CM4AI) +# Generated by D4D Assistant from CM4AI publications and data releases +# Sources: cm4ai.org, Virginia Dataverse releases, CM4AI publications +# Generated: 2025-11-07 + +id: "cm4ai" +name: "Cell Maps for Artificial Intelligence (CM4AI)" +title: "Cell Maps for Artificial Intelligence - Comprehensive Dataset Collection" +description: > + Cell Maps for Artificial Intelligence (CM4AI) is the Functional Genomics Grand Challenge in the NIH Bridge2AI + program. CM4AI aims to map the spatiotemporal architecture of human cells and utilize these maps for interpretable + genotype-phenotype learning. The project generates comprehensive, AI-ready datasets combining three complementary + mapping strategies: (1) CRISPR/Cas9 genetic perturbation screens with perturb-seq, (2) proteomic mass spectrometry + (SEC-MS) for protein-protein interactions, and (3) cellular imaging for spatial subcellular organization mapping. + + The CM4AI dataset collection includes data from multiple cell types: undifferentiated KOLF2.1J human induced + pluripotent stem cells (hiPSCs), iPSC-derived neural progenitor cells (NPCs), neurons, cardiomyocytes, and + MDA-MB-468 breast cancer cells. All data are packaged with provenance graphs and rich metadata as AI-ready + datasets in RO-Crate format using the FAIRSCAPE framework. Data releases are available quarterly through the + University of Virginia Dataverse repository. + + As of the March 2025 Beta release, the dataset includes: 53,788 immunofluorescence images, 1,792 proteins + investigated, 11,739 genes targeted via CRISPR, 1,374 protein interactions mapped, totaling 22.7 TB of data. 
+ CM4AI is a multi-institutional collaboration led by UC San Diego with partners at UCSF, Stanford, University + of Virginia, Yale, University of Texas at Austin, University of Alabama at Birmingham, Simon Fraser University, + and the Hastings Center. + +doi: "doi:10.18130/V3/CM4AI" +publisher: "https://dataverse.lib.virginia.edu" +issued: "2025-03-03" +page: "https://cm4ai.org" +created_on: "2024-05-11" +license: "CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/)" +version: "March 2025 Beta (v0.6)" +conforms_to: "https://w3id.org/ro/crate" +language: "en" + +keywords: + - AI + - affinity purification + - AP-MS + - artificial intelligence + - breast cancer + - Bridge2AI + - cardiomyocyte + - cell mapping + - CM4AI + - CRISPR/Cas9 + - FAIRSCAPE + - functional genomics + - induced pluripotent stem cell + - iPSC + - KOLF2.1J + - machine learning + - mass spectroscopy + - MDA-MB-468 + - neural progenitor cell + - NPC + - neuron + - paclitaxel + - perturb-seq + - perturbation sequencing + - protein-protein interaction + - protein localization + - RO-Crate + - single-cell RNA sequencing + - scRNAseq + - SEC-MS + - size exclusion chromatography + - spatial proteomics + - subcellular imaging + - vorinostat + +created_by: + - "Ideker T (University of California San Diego) - ORCID: https://orcid.org/0000-0002-1708-8454" + - "Clark T (University of Virginia) - ORCID: https://orcid.org/0000-0003-4060-7360" + - "Parker J (University of California San Diego) - ORCID: https://orcid.org/0000-0003-4535-3486" + - "Krogan N (University of California San Francisco) - ORCID: https://orcid.org/0000-0003-4902-337X" + - "Lundberg E (Stanford University) - ORCID: https://orcid.org/0000-0001-7034-0850" + - "Mali P (University of California San Diego) - ORCID: https://orcid.org/0000-0002-3383-1287" + +distribution_formats: + - description: + - "RO-Crate packages (JSON-LD metadata with embedded provenance graphs)" + - "ZIP archives for imaging data (multi-channel TIFF images)" + - 
"Raw sequence data files (FASTQ format)" + - "Processed data matrices (CSV, H5AD formats)" + - "Mass spectrometry data (standard proteomics formats, planned PRIDE repository submission)" + +distribution_dates: + - description: "Alpha v0.5: May 11, 2024; March 2025 Beta: March 3, 2025; June 2025 Beta: July 1, 2025; Quarterly updates planned" + +license_and_use_terms: + description: + - "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)" + - "Attribution required to copyright holders and authors" + - "Publications using this data must cite: (1) the relevant data release DOI, and (2) Clark T, et al. Cell Maps for Artificial Intelligence: AI-Ready Maps of Human Cell Architecture from Disease-Relevant Cell Lines. bioRxiv 2024. doi: 10.1101/2024.05.21.589311" + - "Commercial use requires separate licensing arrangement" + +ip_restrictions: + description: + - "Copyright (c) 2025 The Regents of the University of California except where otherwise noted" + - "Spatial proteomics imaging data: Copyright (c) 2025 The Board of Trustees of the Leland Stanford Junior University" + - "Non-commercial license - commercial applications require permission" + +maintainers: + - description: + - "Primary repository: University of Virginia Dataverse (LibraData)" + - "Point of Contact: Trey Ideker (University of California San Diego)" + - "Data curation and stewardship: Tim Clark and team (University of Virginia)" + - "Contact via CM4AI website (https://cm4ai.org) or Dataverse dataset pages" + +updates: + description: + - "Quarterly data releases planned through project completion (estimated 2026)" + - "Data will be augmented regularly with new assay types, cell types, and experimental conditions" + - "Version history maintained through Dataverse with DOI assignment for each release" + - "Future planned additions: Additional iPSC differentiation states, expanded protein and gene targets, additional disease-relevant cell lines" + +use_repository: + - 
description: "All CM4AI data releases are publicly available via the University of Virginia Dataverse repository (https://dataverse.lib.virginia.edu/dataverse/CM4AI). Individual data releases are accessible by DOI. Large datasets may require selective file download rather than bulk download due to size (22.7 TB total)." + +existing_uses: + - description: "Clark T, Parker J, Schaffer L, et al. Cell Maps for Artificial Intelligence: AI-Ready Maps of Human Cell Architecture from Disease-Relevant Cell Lines. bioRxiv 2024. doi: 10.1101/2024.05.21.589311" + - description: "Nourreddine S, et al. A Perturbation Cell Atlas of Human Induced Pluripotent Stem Cells. bioRxiv 2024. doi: 10.1101/2024.11.03.621734" + - description: "Schaffer LV, Hu M, Qian G, et al. Multimodal cell maps as a foundation for structural and functional genomics. Nature 2025. doi: 10.1038/s41586-025-08878-3" + - description: "Lenkiewicz J, Churas C, Hu M, et al. Cell Mapping Toolkit: an end-to-end pipeline for mapping subcellular organization. Bioinformatics 2025. doi: 10.1093/bioinformatics/btaf205" + - description: "Al Manir S, Levinson MA, Niestroy J, et al. FAIRSCAPE: An Evolving AI-readiness Framework for Biomedical Research. bioRxiv 2024. doi: 10.1101/2024.12.23.629818" + - description: "Clark T, Caufield H, Parker JA, et al. AI-readiness for Biomedical Data: Bridge2AI Recommendations. bioRxiv 2024. doi: 10.1101/2024.10.23.619844" + +resources: + - id: "cm4ai_crispr_perturbation_atlas" + name: "CRISPR Perturbation Cell Atlas" + title: "CRISPR Perturbation Cell Atlas in hiPSCs" + description: > + Expressed genome-scale CRISPRi Perturbation Cell Atlas in undifferentiated KOLF2.1J human induced + pluripotent stem cells (hiPSCs) mapping transcriptional and fitness phenotypes associated with 11,739 + targeted genes. Findings validated via phenotypic, protein-interaction, and metabolic tracing assays. 
+ format: "RO-Crate" + media_type: "application/json" + license: "CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/)" + purposes: + - response: "AI-ready functional genomics data release for CM4AI to support machine learning research" + tasks: + - response: "Single-cell perturb-seq analysis and model development" + - response: "Gene function and fitness phenotype modeling" + instances: + - representation: "CRISPR perturbation atlas with single-cell RNA sequencing data" + data_type: "Single-cell RNA sequencing data with CRISPR perturbation metadata" + acquisition_methods: + - description: "CRISPR interference (CRISPRi) with perturb-seq in KOLF2.1J hiPSCs targeting 11,739 genes" + collection_timeframes: + - description: "Data creation date: 2025-02-27; Published: 2025-03-03" + data_collectors: + - description: "CM4AI consortium; work conducted at UC San Diego and collaborating institutions" + funders: + - grantor: + id: "https://reporter.nih.gov/" + name: National Institutes of Health + grant: + name: "Bridge2AI CM4AI" + grant_number: "1OT2OD032742-01" + existing_uses: + - description: "Related publication: Cell Maps for Artificial Intelligence (bioRxiv 2024.05.21.589311)" + - description: "Related publication: A Perturbation Cell Atlas of Human Induced Pluripotent Stem Cells (bioRxiv 2024.11.03.621734)" + distribution_formats: + - description: "RO-Crate (JSON metadata)" + distribution_dates: + - description: "2025-03-03" + license_and_use_terms: + description: + - "Attribution required to copyright holders and authors; cite related publication and this data collection" + - "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International" + ip_restrictions: + description: + - "Copyright (c) 2025 The Regents of the University of California except where otherwise noted" + maintainers: + - description: + - "Hosted by University of Virginia Dataverse; contact via dataset page" + - "Point of Contact: Trey Ideker (University of California San Diego)" + conforms_to: 
"https://w3id.org/ro/crate" + + - id: "cm4ai_protein_protein_interaction" + name: "Protein-Protein Interaction SEC-MS" + title: "Protein-Protein Interaction SEC-MS Data" + description: > + Size exclusion chromatography-mass spectrometry (SEC-MS) on undifferentiated KOLF2.1J human induced + pluripotent stem cells (hiPSCs), generated in the Nevan Krogan laboratory at UCSF as part of CM4AI + (NIH Bridge2AI program). The data will be uploaded to PRIDE when available. + format: "RO-Crate" + media_type: "application/json" + license: "CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/)" + purposes: + - response: "Provide AI-ready proteomics interaction data for modeling protein complexes" + tasks: + - response: "Protein complex inference and PPI network analysis from SEC-MS profiles" + instances: + - representation: "SEC-MS assay data with protein complex elution profiles" + data_type: "Mass spectrometry data with protein-protein interaction annotations" + acquisition_methods: + - description: "Size exclusion chromatography followed by mass spectrometry (SEC-MS) in KOLF2.1J hiPSCs" + collection_timeframes: + - description: "Data creation date: 2025-02-27; Published: 2025-03-03" + data_collectors: + - description: "Nevan Krogan Laboratory, University of California San Francisco" + funders: + - grantor: + id: "https://reporter.nih.gov/" + name: National Institutes of Health + grant: + name: "Bridge2AI CM4AI" + grant_number: "1OT2OD032742-01" + external_resources: + - external_resources: "PRIDE repository (planned upload when available)" + distribution_formats: + - description: "RO-Crate (JSON metadata)" + distribution_dates: + - description: "2025-03-03" + license_and_use_terms: + description: + - "Attribution required; cite related publication and this data collection" + - "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International" + ip_restrictions: + description: + - "Copyright (c) 2025 The Regents of the University of California" + 
maintainers: + - description: + - "Hosted by University of Virginia Dataverse; contact via dataset page" + - "Point of Contact: Trey Ideker (University of California San Diego)" + + - id: "cm4ai_protein_localization_imaging" + name: "Protein Localization Subcellular Images" + title: "Protein Localization Subcellular Imaging (MDA-MB-468)" + description: > + Spatial localization of 563 proteins of interest in MDA-MB-468 breast cancer cells under untreated, + paclitaxel-treated, and vorinostat-treated conditions, imaged by immunofluorescence-based staining (ICC-IF) + and confocal microscopy in the Lundberg Lab at Stanford University. Four-channel imaging: DAPI (nuclei, blue), + calreticulin antibody (ER, yellow), tubulin antibody (microtubules, red), protein-of-interest antibody (green). + format: "ZIP" + compression: "ZIP" + media_type: "application/zip" + license: "CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/)" + purposes: + - response: "AI-ready subcellular imaging for protein localization analysis and ML benchmarking" + tasks: + - response: "Subcellular localization classification and feature learning from IF images" + - response: "Drug response phenotyping from imaging data" + instances: + - representation: "Immunofluorescence confocal microscopy images (multi-condition)" + data_type: "Multichannel TIFF imagery within ZIP archive" + acquisition_methods: + - description: "ICC-IF staining and confocal microscopy; channels: DAPI (nuclei), calreticulin (ER), tubulin (microtubules), protein-of-interest" + collection_timeframes: + - description: "Published: 2025-03-03" + data_collectors: + - description: "Lundberg Lab, Stanford University" + funders: + - grantor: + id: "https://reporter.nih.gov/" + name: National Institutes of Health + grant: + name: "Bridge2AI CM4AI" + grant_number: "1OT2OD032742-01" + distribution_formats: + - description: "ZIP archive of imaging data (multi-channel TIFF files)" + distribution_dates: + - description: 
"2025-03-03" + license_and_use_terms: + description: + - "Attribution required; cite related publication and this data collection" + - "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International" + ip_restrictions: + description: + - "Spatial proteomics raw image data copyright (c) 2025 The Board of Trustees of the Leland Stanford Junior University" + - "Other data copyright (c) 2025 The Regents of the University of California except where otherwise noted" + maintainers: + - description: + - "Hosted by University of Virginia Dataverse; contact via dataset page" + - "Point of Contact: Trey Ideker (University of California San Diego)"