From 3b64a2b372f6a55eb7a1e2da24a2d321045abfd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Boris=20Cl=C3=A9net?= Date: Tue, 18 Mar 2025 11:06:24 +0100 Subject: [PATCH 001/107] Adding dir at dataset level --- src/schema/rules/directories.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/schema/rules/directories.yaml b/src/schema/rules/directories.yaml index db3db194fe..2d1ba9125c 100644 --- a/src/schema/rules/directories.yaml +++ b/src/schema/rules/directories.yaml @@ -22,6 +22,7 @@ raw: - code - derivatives - phenotype + - prov - sourcedata - stimuli - subject @@ -37,6 +38,10 @@ raw: name: phenotype level: optional opaque: false + prov: + name: prov + level: optional + opaque: false sourcedata: name: sourcedata level: optional @@ -70,6 +75,7 @@ derivative: - code - derivatives - phenotype + - prov - sourcedata - stimuli - subject @@ -85,6 +91,10 @@ derivative: name: phenotype level: optional opaque: false + prov: + name: prov + level: optional + opaque: false sourcedata: name: sourcedata level: optional From 45f6af471ce4ad3b54ff211448552d83439f8a69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Boris=20Cl=C3=A9net?= Date: Tue, 18 Mar 2025 11:09:00 +0100 Subject: [PATCH 002/107] Adding dir at any level --- src/schema/rules/directories.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/schema/rules/directories.yaml b/src/schema/rules/directories.yaml index 2d1ba9125c..29e4295482 100644 --- a/src/schema/rules/directories.yaml +++ b/src/schema/rules/directories.yaml @@ -58,16 +58,20 @@ raw: - oneOf: - session - datatype + - prov session: entity: session level: optional opaque: false subdirs: - datatype + - prov datatype: value: datatype level: required opaque: false + subdirs: + - prov derivative: root: @@ -110,13 +114,17 @@ derivative: subdirs: - session - datatype + - prov session: entity: session level: optional opaque: false subdirs: - datatype + - prov datatype: value: datatype level: optional opaque: false + subdirs: + - prov From 
4c315ce384f275c2da75e517490dbf92aefea5b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Boris=20Cl=C3=A9net?= Date: Tue, 18 Mar 2025 11:10:08 +0100 Subject: [PATCH 003/107] Adding rule for dir at dataset level --- src/schema/rules/files/common/core.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/schema/rules/files/common/core.yaml b/src/schema/rules/files/common/core.yaml index 9f54db072f..4a5324d52b 100644 --- a/src/schema/rules/files/common/core.yaml +++ b/src/schema/rules/files/common/core.yaml @@ -36,6 +36,9 @@ code: derivatives: level: optional path: derivatives +prov: + level: optional + path: prov sourcedata: level: optional path: sourcedata From b490c91ad7853c3126283d6ca1a91c80559fb4ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Boris=20Cl=C3=A9net?= Date: Tue, 18 Mar 2025 11:18:43 +0100 Subject: [PATCH 004/107] Adding objects: JSONLD extension + top level dir --- src/schema/objects/extensions.yaml | 9 +++++++++ src/schema/objects/files.yaml | 7 +++++++ 2 files changed, 16 insertions(+) diff --git a/src/schema/objects/extensions.yaml b/src/schema/objects/extensions.yaml index 1984661e87..268ecdc173 100644 --- a/src/schema/objects/extensions.yaml +++ b/src/schema/objects/extensions.yaml @@ -130,6 +130,15 @@ json: These sidecar files follow the inheritance principle. There are also a few special cases of JSON files being first-order data files, such as `genetic_info.json`. +jsonld: + value: .jsonld + display_name: JavaScript Object Notation for Linked Data + description: | + A JSON-LD file. + + JSON-LD files are used to describe relations between objects in the dataset. + + For example, the description of provenance uses this file format. 
kdf: value: .kdf display_name: KRISS KDF diff --git a/src/schema/objects/files.yaml b/src/schema/objects/files.yaml index d2d8761410..cee42d520e 100644 --- a/src/schema/objects/files.yaml +++ b/src/schema/objects/files.yaml @@ -102,6 +102,13 @@ code: (for example the one used to generate the derivatives from the raw data). See the [Code section](SPEC_ROOT/modality-agnostic-files.md#code) for more information. +prov: + display_name: Provenance Records + file_type: directory + description: | + A directory in which to store provenance metadata. + See the [Provenance section](SPEC_ROOT/modality-agnostic-files.md#provenance) + for more information. derivatives: display_name: Derivative data file_type: directory From f2a67a67bf87a786f03c46f7394d2e82c2e78029 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Boris=20Cl=C3=A9net?= Date: Tue, 18 Mar 2025 11:21:18 +0100 Subject: [PATCH 005/107] Adding object : prov entity --- src/schema/objects/entities.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/schema/objects/entities.yaml b/src/schema/objects/entities.yaml index a5638bc984..a94f5c8e1d 100644 --- a/src/schema/objects/entities.yaml +++ b/src/schema/objects/entities.yaml @@ -229,6 +229,14 @@ processing: exploited. type: string format: label +prov: + name: prov + display_name: Provenance group + description: | + A grouping of provenance records. + Defining multiple provenance records groups is appropriate when several processings have been performed on data. 
+ type: string + format: label reconstruction: name: rec display_name: Reconstruction From d9bb7b04b9ab39fe93cfdb168b848b2cf3a45c1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Boris=20Cl=C3=A9net?= Date: Tue, 18 Mar 2025 11:57:42 +0100 Subject: [PATCH 006/107] Add suffixes in object.suffixes --- src/schema/objects/suffixes.yaml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/schema/objects/suffixes.yaml b/src/schema/objects/suffixes.yaml index 1e2825ced2..cb4613e33a 100644 --- a/src/schema/objects/suffixes.yaml +++ b/src/schema/objects/suffixes.yaml @@ -502,6 +502,16 @@ VFA: Depending on the provided metadata fields and the sequence type, data may be eligible for DESPOT1, DESPOT2 and their variants ([Deoni et al. 2005](https://doi.org/10.1002/mrm.20314)). +act: + value: act + display_name: Provenance Activity Records + description: | + Provenance Activity records for a group of provenance. Activities represent the transformations that have been applied to the data. +all: + value: all + display_name: Provenance Records + description: | + All types of provenance records (Activity, Entity, Environment, Software) for a group of provenance. angio: value: angio display_name: Angiogram @@ -528,6 +538,11 @@ asllabeling: A deidentified screenshot of the planning of the labeling slab/plane with respect to the imaging slab or slices. This screenshot is based on DICOM macro C.8.13.5.14. +base: + value: base + display_name: Base for Provenance Records + description: | + Base fields for provenance records of a group of provenance. beh: value: beh display_name: Behavioral recording @@ -612,6 +627,16 @@ electrodes: display_name: Electrodes description: | File that gives the location of (i)EEG electrodes. +ent: + value: ent + display_name: Provenance Entity Records + description: | + Provenance Entity records for a group of provenance. Entities are inputs and outputs of Activities. 
+env: + value: env + display_name: Provenance Environment Records + description: | + Provenance Environment records for a group of provenance. Environments specify the software environment in which the provenance record was obtained. epi: value: epi display_name: EPI @@ -817,6 +842,11 @@ probseg: A probabilistic segmentation. This suffix may only be used in derivative datasets. +soft: + value: soft + display_name: Provenance Software Records + description: | + Provenance Software records for a group of provenance. The Software specifies the software package with which transformations were applied to the data. sbref: value: sbref display_name: Single-band reference image From 64e2c6593d7ed73aaa7fed6c5239e16f764848b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Boris=20Cl=C3=A9net?= Date: Tue, 18 Mar 2025 13:59:39 +0100 Subject: [PATCH 007/107] GeneratedByProv and SidecarGeneratedBy metadata --- src/schema/objects/metadata.yaml | 12 ++++++++++++ src/schema/rules/dataset_metadata.yaml | 2 ++ 2 files changed, 14 insertions(+) diff --git a/src/schema/objects/metadata.yaml b/src/schema/objects/metadata.yaml index 09984125ad..fd8c2a8b10 100644 --- a/src/schema/objects/metadata.yaml +++ b/src/schema/objects/metadata.yaml @@ -1281,6 +1281,12 @@ GeneratedBy: URI: type: string format: uri +GeneratedByProv: + name: GeneratedByProv + display_name: Generated By Prov + description: | + Specify the provenance of a file in the dataset. + type: object GeneticLevel: name: GeneticLevel display_name: Genetic Level @@ -3292,6 +3298,12 @@ ShortChannelCount: The number of short channels. 0 indicates no short channels. type: integer minimum: 0 +SidecarGeneratedBy: + name: SidecarGeneratedBy + display_name: Sidecar Generated By + description: | + Specify the provenance of a sidecar JSON. 
+ type: object SinglesRate: name: SinglesRate display_name: Singles Rate diff --git a/src/schema/rules/dataset_metadata.yaml b/src/schema/rules/dataset_metadata.yaml index d4ecca7da1..504baf6498 100644 --- a/src/schema/rules/dataset_metadata.yaml +++ b/src/schema/rules/dataset_metadata.yaml @@ -17,6 +17,7 @@ dataset_description: ReferencesAndLinks: optional DatasetDOI: optional GeneratedBy: recommended + GeneratedByProv: recommended SourceDatasets: recommended dataset_authors: @@ -38,6 +39,7 @@ derivative_description: - path == "/dataset_description.json" - json.DatasetType == "derivative" fields: + GeneratedByProv: recommended GeneratedBy: required dataset_description_with_genetics: From acc6af0f8c72ab857a965ad7d244506151eb6872 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Boris=20Cl=C3=A9net?= Date: Wed, 19 Mar 2025 10:28:50 +0100 Subject: [PATCH 008/107] prov entitiy and provenance json/jsonld files --- src/schema/rules/entities.yaml | 1 + src/schema/rules/files/common/tables.yaml | 31 +++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/src/schema/rules/entities.yaml b/src/schema/rules/entities.yaml index 44174d333e..e1d118b754 100644 --- a/src/schema/rules/entities.yaml +++ b/src/schema/rules/entities.yaml @@ -32,3 +32,4 @@ - density - label - description +- prov diff --git a/src/schema/rules/files/common/tables.yaml b/src/schema/rules/files/common/tables.yaml index 21f03f6579..31888b198a 100644 --- a/src/schema/rules/files/common/tables.yaml +++ b/src/schema/rules/files/common/tables.yaml @@ -5,12 +5,14 @@ participants: extensions: - .tsv - .json + samples: level: optional stem: samples extensions: - .tsv - .json + scans: level: optional suffixes: @@ -21,6 +23,7 @@ scans: entities: subject: required session: optional # session is required if session is present in the dataset. + sessions: # This file may only exist if session is present in the dataset. 
level: optional suffixes: @@ -43,3 +46,31 @@ phenotype: extensions: - .tsv - .json + +# Provenance json files +provenance: + level: optional + suffixes: + - act + - base + - ent + - env + - soft + extensions: + - .json + entities: + subject: optional + session: optional + prov: required + +# Provenance ld files +provenanceld: + level: optional + suffixes: + - all + extensions: + - .jsonld + entities: + subject: optional + session: optional + prov: required From 34fafe1633f4b569bb68e308aaf5f6ce730f8be8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Boris=20Cl=C3=A9net?= Date: Wed, 19 Mar 2025 10:40:20 +0100 Subject: [PATCH 009/107] provenance json/jsonld files inside new rules file --- .../rules/files/common/modality_agnostic.yaml | 30 +++++++++++++++++++ src/schema/rules/files/common/tables.yaml | 28 ----------------- 2 files changed, 30 insertions(+), 28 deletions(-) create mode 100644 src/schema/rules/files/common/modality_agnostic.yaml diff --git a/src/schema/rules/files/common/modality_agnostic.yaml b/src/schema/rules/files/common/modality_agnostic.yaml new file mode 100644 index 0000000000..2343f4d443 --- /dev/null +++ b/src/schema/rules/files/common/modality_agnostic.yaml @@ -0,0 +1,30 @@ +--- +# Files and directories that can be found at any level of a dataset. 
+ +# Provenance json files +provenance: + level: optional + suffixes: + - act + - base + - ent + - env + - soft + extensions: + - .json + entities: + subject: optional + session: optional + prov: required + +# Provenance ld files +provenanceld: + level: optional + suffixes: + - all + extensions: + - .jsonld + entities: + subject: optional + session: optional + prov: required diff --git a/src/schema/rules/files/common/tables.yaml b/src/schema/rules/files/common/tables.yaml index 31888b198a..7c818acc66 100644 --- a/src/schema/rules/files/common/tables.yaml +++ b/src/schema/rules/files/common/tables.yaml @@ -46,31 +46,3 @@ phenotype: extensions: - .tsv - .json - -# Provenance json files -provenance: - level: optional - suffixes: - - act - - base - - ent - - env - - soft - extensions: - - .json - entities: - subject: optional - session: optional - prov: required - -# Provenance ld files -provenanceld: - level: optional - suffixes: - - all - extensions: - - .jsonld - entities: - subject: optional - session: optional - prov: required From a1fbb8a29ff4d50b0dd3b2b0aaf3442e557843f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Boris=20Cl=C3=A9net?= Date: Thu, 10 Apr 2025 14:51:29 +0200 Subject: [PATCH 010/107] [SCHEMA] removing provenance subdir level description --- src/schema/objects/metadata.yaml | 24 ++++++++++++++++--- src/schema/rules/directories.yaml | 7 ------ .../rules/files/common/modality_agnostic.yaml | 6 ++--- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/schema/objects/metadata.yaml b/src/schema/objects/metadata.yaml index fd8c2a8b10..6e290a7f9c 100644 --- a/src/schema/objects/metadata.yaml +++ b/src/schema/objects/metadata.yaml @@ -1281,11 +1281,23 @@ GeneratedBy: URI: type: string format: uri +GeneratedById: + name: GeneratedBy + display_name: Generated By + description: | + Specify the provenance of a file in the dataset, referencing the Id of the Activity (or Activities) responsible for its creation. 
+ anyOf: + - type: string + format: uri + - type: array + items: + type: string + format: uri GeneratedByProv: name: GeneratedByProv display_name: Generated By Prov description: | - Specify the provenance of a file in the dataset. + Specify the provenance of the dataset using BIDS-Prov specifications. type: object GeneticLevel: name: GeneticLevel @@ -3302,8 +3314,14 @@ SidecarGeneratedBy: name: SidecarGeneratedBy display_name: Sidecar Generated By description: | - Specify the provenance of a sidecar JSON. - type: object + Specify the provenance of a sidecar JSON, referencing the Id of the Activity (or Activities) responsible for its creation. + anyOf: + - type: string + format: uri + - type: array + items: + type: string + format: uri SinglesRate: name: SinglesRate display_name: Singles Rate diff --git a/src/schema/rules/directories.yaml b/src/schema/rules/directories.yaml index 29e4295482..06988660ae 100644 --- a/src/schema/rules/directories.yaml +++ b/src/schema/rules/directories.yaml @@ -58,20 +58,17 @@ raw: - oneOf: - session - datatype - - prov session: entity: session level: optional opaque: false subdirs: - datatype - - prov datatype: value: datatype level: required opaque: false subdirs: - - prov derivative: root: @@ -114,17 +111,13 @@ derivative: subdirs: - session - datatype - - prov session: entity: session level: optional opaque: false subdirs: - datatype - - prov datatype: value: datatype level: optional opaque: false - subdirs: - - prov diff --git a/src/schema/rules/files/common/modality_agnostic.yaml b/src/schema/rules/files/common/modality_agnostic.yaml index 2343f4d443..3dbab51678 100644 --- a/src/schema/rules/files/common/modality_agnostic.yaml +++ b/src/schema/rules/files/common/modality_agnostic.yaml @@ -4,6 +4,7 @@ # Provenance json files provenance: level: optional + path: prov/* suffixes: - act - base @@ -13,18 +14,15 @@ provenance: extensions: - .json entities: - subject: optional - session: optional prov: required # Provenance ld files 
provenanceld: level: optional + path: prov/* suffixes: - all extensions: - .jsonld entities: - subject: optional - session: optional prov: required From 6622d082ee0ec1de4fa2c17b706a8bd462c30d70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Boris=20Cl=C3=A9net?= Date: Thu, 10 Apr 2025 15:13:54 +0200 Subject: [PATCH 011/107] Yaml lint --- src/schema/objects/metadata.yaml | 6 ++++-- src/schema/objects/suffixes.yaml | 15 ++++++++++----- src/schema/rules/checks/deprecations.yml | 1 + 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/schema/objects/metadata.yaml b/src/schema/objects/metadata.yaml index 6e290a7f9c..8966fdfc0e 100644 --- a/src/schema/objects/metadata.yaml +++ b/src/schema/objects/metadata.yaml @@ -1285,7 +1285,8 @@ GeneratedById: name: GeneratedBy display_name: Generated By description: | - Specify the provenance of a file in the dataset, referencing the Id of the Activity (or Activities) responsible for its creation. + Specify the provenance of a file in the dataset, referencing the Id + of the Activity (or Activities) responsible for its creation. anyOf: - type: string format: uri @@ -3314,7 +3315,8 @@ SidecarGeneratedBy: name: SidecarGeneratedBy display_name: Sidecar Generated By description: | - Specify the provenance of a sidecar JSON, referencing the Id of the Activity (or Activities) responsible for its creation. + Specify the provenance of a sidecar JSON, + referencing the Id of the Activity (or Activities) responsible for its creation. anyOf: - type: string format: uri diff --git a/src/schema/objects/suffixes.yaml b/src/schema/objects/suffixes.yaml index cb4613e33a..e9971c99b0 100644 --- a/src/schema/objects/suffixes.yaml +++ b/src/schema/objects/suffixes.yaml @@ -506,12 +506,14 @@ act: value: act display_name: Provenance Activity Records description: | - Provenance Activity records for a group of provenance. Activities represent the transformations that have been applied to the data. 
+ Provenance Activity records for a group of provenance. + Activities represent the transformations that have been applied to the data. all: value: all display_name: Provenance Records description: | - All types of provenance records (Activity, Entity, Environment, Software) for a group of provenance. + All types of provenance records (Activity, Entity, Environment, Software) + for a group of provenance. angio: value: angio display_name: Angiogram @@ -631,12 +633,14 @@ ent: value: ent display_name: Provenance Entity Records description: | - Provenance Entity records for a group of provenance. Entities are inputs and outputs of Activities. + Provenance Entity records for a group of provenance. + Entities are inputs and outputs of Activities. env: value: env display_name: Provenance Environment Records description: | - Provenance Environment records for a group of provenance. Environments specify the software environment in which the provenance record was obtained. + Provenance Environment records for a group of provenance. + Environments specify the software environment in which the provenance record was obtained. epi: value: epi display_name: EPI @@ -846,7 +850,8 @@ soft: value: soft display_name: Provenance Software Records description: | - Provenance Software records for a group of provenance. The Software specifies the software package with which transformations were applied to the data. + Provenance Software records for a group of provenance. + The Software specifies the software package with which transformations were applied to the data. 
sbref: value: sbref display_name: Single-band reference image diff --git a/src/schema/rules/checks/deprecations.yml b/src/schema/rules/checks/deprecations.yml index d7a3c27372..d45be8356a 100644 --- a/src/schema/rules/checks/deprecations.yml +++ b/src/schema/rules/checks/deprecations.yml @@ -1,3 +1,4 @@ +--- AnatomicalLandmarkCoordinateSystemDeprecation: issue: code: ELEKTA_NEUROMAG_DEPRECATED From b05b95007181a641626bf6556a35badeeb946b85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Boris=20Cl=C3=A9net?= Date: Thu, 10 Apr 2025 17:00:49 +0200 Subject: [PATCH 012/107] [SPEC] first draft for BEP028 BIDS-Prov --- mkdocs.yml | 6 +- src/modality-agnostic-files/code.md | 16 + .../data-description.md} | 233 +----- .../dataset-description.md | 213 +++++ src/modality-agnostic-files/provenance.md | 781 ++++++++++++++++++ 5 files changed, 1016 insertions(+), 233 deletions(-) create mode 100644 src/modality-agnostic-files/code.md rename src/{modality-agnostic-files.md => modality-agnostic-files/data-description.md} (57%) create mode 100644 src/modality-agnostic-files/dataset-description.md create mode 100644 src/modality-agnostic-files/provenance.md diff --git a/mkdocs.yml b/mkdocs.yml index ef36d6eb5c..8bda7a2a16 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -5,7 +5,11 @@ nav: - The BIDS Specification: - Introduction: introduction.md - Common principles: common-principles.md - - Modality agnostic files: modality-agnostic-files.md + - Modality agnostic files: + - Dataset description: modality-agnostic-files/dataset-description.md + - Data description: modality-agnostic-files/data-description.md + - Code: modality-agnostic-files/code.md + - Provenance: modality-agnostic-files/provenance.md - Modality specific files: - Magnetic Resonance Imaging: modality-specific-files/magnetic-resonance-imaging-data.md - Magnetoencephalography: modality-specific-files/magnetoencephalography.md diff --git a/src/modality-agnostic-files/code.md b/src/modality-agnostic-files/code.md new file mode 100644 
index 0000000000..0e894d06a7 --- /dev/null +++ b/src/modality-agnostic-files/code.md @@ -0,0 +1,16 @@ +# Code + +Template: `code/*` + +Source code of scripts that were used to prepare the dataset MAY be stored here. +Examples include deidentification or defacing of the data, or +the conversion from the format of the source data to the BIDS format +(see [source vs. raw vs. derived data](../common-principles.md#source-vs-raw-vs-derived-data)). +Extra care should be taken to avoid including original IDs or +any identifiable information with the source code. +There are no limitations or recommendations on the language and/or +code organization of these scripts at the moment. + + + +[bids uris]: ../common-principles.md#bids-uri diff --git a/src/modality-agnostic-files.md b/src/modality-agnostic-files/data-description.md similarity index 57% rename from src/modality-agnostic-files.md rename to src/modality-agnostic-files/data-description.md index 84df1a2dca..57fa736ceb 100644 --- a/src/modality-agnostic-files.md +++ b/src/modality-agnostic-files/data-description.md @@ -1,218 +1,4 @@ -# Modality agnostic files - -## Dataset description - -Templates: - -- `dataset_description.json` -- `README[.md|.rst|.txt]` -- `CITATION.cff` -- `CHANGES` -- `LICENSE[.md|.rst|.txt]` - -### `dataset_description.json` - - -{{ MACROS___render_text("objects.files.dataset_description.description") }} - -Every dataset MUST include this file with the following fields: - - -{{ MACROS___make_metadata_table( - { - "Name": "REQUIRED", - "BIDSVersion": "REQUIRED", - "HEDVersion": "RECOMMENDED", - "DatasetLinks": "REQUIRED if [BIDS URIs][] are used", - "DatasetType": "RECOMMENDED", - "License": "RECOMMENDED", - "Authors": "RECOMMENDED if CITATION.cff is not present", - "Acknowledgements": "OPTIONAL", - "HowToAcknowledge": "OPTIONAL", - "Funding": "OPTIONAL", - "EthicsApprovals": "OPTIONAL", - "ReferencesAndLinks": "OPTIONAL", - "DatasetDOI": "OPTIONAL", - "GeneratedBy": "RECOMMENDED", - "SourceDatasets":
"RECOMMENDED", - } -) }} - -Each object in the `GeneratedBy` array includes the following REQUIRED, RECOMMENDED -and OPTIONAL keys: - - -{{ MACROS___make_subobject_table("metadata.GeneratedBy.items") }} - -Example: - -```JSON -{ - "Name": "The mother of all experiments", - "BIDSVersion": "1.6.0", - "DatasetType": "raw", - "License": "CC0", - "Authors": [ - "Paul Broca", - "Carl Wernicke" - ], - "Acknowledgements": "Special thanks to Korbinian Brodmann for help in formatting this dataset in BIDS. We thank Alan Lloyd Hodgkin and Andrew Huxley for helpful comments and discussions about the experiment and manuscript; Hermann Ludwig Helmholtz for administrative support; and Claudius Galenus for providing data for the medial-to-lateral index analysis.", - "HowToAcknowledge": "Please cite this paper: https://www.ncbi.nlm.nih.gov/pubmed/001012092119281", - "Funding": [ - "National Institute of Neuroscience Grant F378236MFH1", - "National Institute of Neuroscience Grant 5RMZ0023106" - ], - "EthicsApprovals": [ - "Army Human Research Protections Office (Protocol ARL-20098-10051, ARL 12-040, and ARL 12-041)" - ], - "ReferencesAndLinks": [ - "https://www.ncbi.nlm.nih.gov/pubmed/001012092119281", - "Alzheimer A., & Kraepelin, E. (2015). Neural correlates of presenile dementia in humans. Journal of Neuroscientific Data, 2, 234001. doi:1920.8/jndata.2015.7" - ], - "DatasetDOI": "doi:10.0.2.3/dfjj.10", - "HEDVersion": "8.0.0", - "GeneratedBy": [ - { - "Name": "reproin", - "Version": "0.6.0", - "Container": { - "Type": "docker", - "Tag": "repronim/reproin:0.6.0" - } - } - ], - "SourceDatasets": [ - { - "URL": "s3://dicoms/studies/correlates", - "Version": "April 11 2011" - } - ] -} -``` - -#### Derived dataset and pipeline description - -As for any BIDS dataset, a `dataset_description.json` file MUST be found at the -top level of every derived dataset: -`/derivatives//dataset_description.json`. 
- -In contrast to raw BIDS datasets, derived BIDS datasets MUST include a -`GeneratedBy` key: - - -{{ MACROS___make_metadata_table( - { - "GeneratedBy": "REQUIRED" - } -) }} - -If a derived dataset is stored as a subdirectory of the raw dataset, then the `Name` field -of the first `GeneratedBy` object MUST be a substring of the derived dataset directory name. -That is, in a directory `/derivatives/[-]/`, the first -`GeneratedBy` object should have a `Name` of ``. - -Example: - -```JSON -{ - "Name": "FMRIPREP Outputs", - "BIDSVersion": "1.6.0", - "DatasetType": "derivative", - "GeneratedBy": [ - { - "Name": "fmriprep", - "Version": "1.4.1", - "Container": { - "Type": "docker", - "Tag": "poldracklab/fmriprep:1.4.1" - } - }, - { - "Name": "Manual", - "Description": "Re-added RepetitionTime metadata to bold.json files" - } - ], - "SourceDatasets": [ - { - "DOI": "doi:10.18112/openneuro.ds000114.v1.0.1", - "URL": "https://openneuro.org/datasets/ds000114/versions/1.0.1", - "Version": "1.0.1" - } - ] -} -``` - -### `README` - - -{{ MACROS___render_text("objects.files.README.description") }} - -### `CITATION.cff` - - -{{ MACROS___render_text("objects.files.CITATION.description") }} - -For most redundant fields between `CITATION.cff` and `dataset_description.json`, -the `CITATION.cff` SHOULD take precedence. -To avoid inconsistency, metadata present in `CITATION.cff` SHOULD NOT be -be included in `dataset_description.json`, with the exception of `Name` and -`DatasetDOI`, to ensure that `CITATION.cff`-unaware tools can generate -references to the dataset. -In particular, if `CITATION.cff` is present, -the `"Authors"` field of `dataset_description.json` MUST be omitted, -and the `"HowToAcknowledge"`, `"License"` and `"ReferencesAndLinks"` SHOULD be omitted -in favor of the `CITATION.cff` fields `message`/`preferred-citation`, `license` and -`references`. 
- -### `CHANGES` - - -{{ MACROS___render_text("objects.files.CHANGES.description") }} - -Example: - -```Text -1.0.1 2015-08-27 - - Fixed slice timing information. - -1.0.0 2015-08-17 - - Initial release. -``` - -### `LICENSE` - - -{{ MACROS___render_text("objects.files.LICENSE.description") }} +# Data description ## Participants file @@ -528,20 +314,3 @@ ses-predrug 2009-06-15T13:45:30 120 ses-postdrug 2009-06-16T13:45:30 100 ses-followup 2009-06-17T13:45:30 110 ``` - -## Code - -Template: `code/*` - -Source code of scripts that were used to prepare the dataset MAY be stored here. -Examples include deidentification or defacing of the data, or -the conversion from the format of the source data to the BIDS format -(see [source vs. raw vs. derived data](./common-principles.md#source-vs-raw-vs-derived-data)). -Extra care should be taken to avoid including original IDs or -any identifiable information with the source code. -There are no limitations or recommendations on the language and/or -code organization of these scripts at the moment. 
- - - -[bids uris]: ./common-principles.md#bids-uri diff --git a/src/modality-agnostic-files/dataset-description.md b/src/modality-agnostic-files/dataset-description.md new file mode 100644 index 0000000000..a4c36a7207 --- /dev/null +++ b/src/modality-agnostic-files/dataset-description.md @@ -0,0 +1,213 @@ +# Dataset description + +Templates: + +- `dataset_description.json` +- `README[.md|.rst|.txt]` +- `CITATION.cff` +- `CHANGES` +- `LICENSE[.md|.rst|.txt]` + +## `dataset_description.json` + + +{{ MACROS___render_text("objects.files.dataset_description.description") }} + +Every dataset MUST include this file with the following fields: + + +{{ MACROS___make_metadata_table( + { + "Name": "REQUIRED", + "BIDSVersion": "REQUIRED", + "HEDVersion": "RECOMMENDED", + "DatasetLinks": "REQUIRED if [BIDS URIs][] are used", + "DatasetType": "RECOMMENDED", + "License": "RECOMMENDED", + "Authors": "RECOMMENDED if CITATION.cff is not present", + "Acknowledgements": "OPTIONAL", + "HowToAcknowledge": "OPTIONAL", + "Funding": "OPTIONAL", + "EthicsApprovals": "OPTIONAL", + "ReferencesAndLinks": "OPTIONAL", + "DatasetDOI": "OPTIONAL", + "GeneratedBy": "RECOMMENDED", + "SourceDatasets": "RECOMMENDED", + } +) }} + +Each object in the `GeneratedBy` array includes the following REQUIRED, RECOMMENDED +and OPTIONAL keys: + + +{{ MACROS___make_subobject_table("metadata.GeneratedBy.items") }} + +Example: + +```JSON +{ + "Name": "The mother of all experiments", + "BIDSVersion": "1.6.0", + "DatasetType": "raw", + "License": "CC0", + "Authors": [ + "Paul Broca", + "Carl Wernicke" + ], + "Acknowledgements": "Special thanks to Korbinian Brodmann for help in formatting this dataset in BIDS. 
We thank Alan Lloyd Hodgkin and Andrew Huxley for helpful comments and discussions about the experiment and manuscript; Hermann Ludwig Helmholtz for administrative support; and Claudius Galenus for providing data for the medial-to-lateral index analysis.", + "HowToAcknowledge": "Please cite this paper: https://www.ncbi.nlm.nih.gov/pubmed/001012092119281", + "Funding": [ + "National Institute of Neuroscience Grant F378236MFH1", + "National Institute of Neuroscience Grant 5RMZ0023106" + ], + "EthicsApprovals": [ + "Army Human Research Protections Office (Protocol ARL-20098-10051, ARL 12-040, and ARL 12-041)" + ], + "ReferencesAndLinks": [ + "https://www.ncbi.nlm.nih.gov/pubmed/001012092119281", + "Alzheimer A., & Kraepelin, E. (2015). Neural correlates of presenile dementia in humans. Journal of Neuroscientific Data, 2, 234001. doi:1920.8/jndata.2015.7" + ], + "DatasetDOI": "doi:10.0.2.3/dfjj.10", + "HEDVersion": "8.0.0", + "GeneratedBy": [ + { + "Name": "reproin", + "Version": "0.6.0", + "Container": { + "Type": "docker", + "Tag": "repronim/reproin:0.6.0" + } + } + ], + "SourceDatasets": [ + { + "URL": "s3://dicoms/studies/correlates", + "Version": "April 11 2011" + } + ] +} +``` + +### Derived dataset and pipeline description + +As for any BIDS dataset, a `dataset_description.json` file MUST be found at the +top level of every derived dataset: +`<dataset>/derivatives/<pipeline_name>/dataset_description.json`. + +In contrast to raw BIDS datasets, derived BIDS datasets MUST include a +`GeneratedBy` key: + + +{{ MACROS___make_metadata_table( + { + "GeneratedBy": "REQUIRED" + } +) }} + +If a derived dataset is stored as a subdirectory of the raw dataset, then the `Name` field +of the first `GeneratedBy` object MUST be a substring of the derived dataset directory name. +That is, in a directory `<dataset>/derivatives/<pipeline>[-<variant>]/`, the first +`GeneratedBy` object should have a `Name` of `<pipeline>`.
+ +Example: + +```JSON +{ + "Name": "FMRIPREP Outputs", + "BIDSVersion": "1.6.0", + "DatasetType": "derivative", + "GeneratedBy": [ + { + "Name": "fmriprep", + "Version": "1.4.1", + "Container": { + "Type": "docker", + "Tag": "poldracklab/fmriprep:1.4.1" + } + }, + { + "Name": "Manual", + "Description": "Re-added RepetitionTime metadata to bold.json files" + } + ], + "SourceDatasets": [ + { + "DOI": "doi:10.18112/openneuro.ds000114.v1.0.1", + "URL": "https://openneuro.org/datasets/ds000114/versions/1.0.1", + "Version": "1.0.1" + } + ] +} +``` + +## `README` + + +{{ MACROS___render_text("objects.files.README.description") }} + +## `CITATION.cff` + + +{{ MACROS___render_text("objects.files.CITATION.description") }} + +For most redundant fields between `CITATION.cff` and `dataset_description.json`, +the `CITATION.cff` SHOULD take precedence. +To avoid inconsistency, metadata present in `CITATION.cff` SHOULD NOT be +included in `dataset_description.json`, with the exception of `Name` and +`DatasetDOI`, to ensure that `CITATION.cff`-unaware tools can generate +references to the dataset. +In particular, if `CITATION.cff` is present, +the `"Authors"` field of `dataset_description.json` MUST be omitted, +and the `"HowToAcknowledge"`, `"License"` and `"ReferencesAndLinks"` SHOULD be omitted +in favor of the `CITATION.cff` fields `message`/`preferred-citation`, `license` and +`references`. + +## `CHANGES` + + +{{ MACROS___render_text("objects.files.CHANGES.description") }} + +Example: + +```Text +1.0.1 2015-08-27 + - Fixed slice timing information. + +1.0.0 2015-08-17 + - Initial release. +``` + +## `LICENSE` + + +{{ MACROS___render_text("objects.files.LICENSE.description") }} diff --git a/src/modality-agnostic-files/provenance.md b/src/modality-agnostic-files/provenance.md new file mode 100644 index 0000000000..8ec67dd78e --- /dev/null +++ b/src/modality-agnostic-files/provenance.md @@ -0,0 +1,781 @@ +# Provenance + +## 1.
Overview + +### 1.1 Goals + +Interpreting and comparing scientific results and enabling reusable data and analysis output require understanding provenance, i.e. how the data were generated and processed. To be useful, the provenance must be comprehensive, understandable, easily communicated, and captured automatically in machine accessible form. Provenance records are thus used to encode transformations between digital objects. + +This specification is aimed at describing the provenance of a BIDS dataset. This description is retrospective, i.e. it describes a set of steps that were executed in order to obtain the dataset (this is different from prospective descriptions of workflows that could for instance list all sets of steps that can be run on this dataset). + +### 1.2 Which type of provenance is covered in BIDS ? + +Provenance comes up in many different contexts in BIDS. This specification focuses on representing the processings that were applied to a dataset. These could be for instance: + +1. The raw conversion from DICOM images or other instrument native formats to BIDS layout, details of stimulus presentation and cognitive paradigms, and clinical and neuropsychiatric assessments, each come with their own details of provenance. +2. In BIDS derivatives, the consideration of outputs requires knowledge of which inputs from the BIDS dataset were used together with what software was run in what environment and with what parameters. + +But provenance comes up in other contexts as well, which might be addressed at a later stage: + +3. For datasets and derivatives, provenance can also include details of why the data were collected in the first place covering hypotheses, claims, and prior publications. Provenance can encode support for which claims were supported by future analyses. +4. Provenance can involve information about people and institutions involved in a study. +5. 
Provenance records can highlight reuse of datasets while providing appropriate attribution to the original dataset generators as well as future transformers. + +Provenance can be captured using different mechanisms, but independent of encoding, always reflects transformations by either humans or software. The interpretability of provenance records requires a consistent vocabulary for provenance as well as an expectation for a consistent terminology for the objects being encoded. + +### 1.3 Principles for encoding provenance in BIDS + +1. Provenance information SHOULD be included in a BIDS dataset when possible. +2. If provenance records are included, these MUST be described using the conventions detailed by this specification. +3. Provenance records MAY be used to reflect the provenance of a dataset, a collection of files or a specific file at any level of the BIDS hierarchy. +4. Provenance information SHOULD be anonymized/de-identified as necessary. + +### 1.4 Provenance format + +Provenance metadata is written in JSON or JSON-LD. JSON-LD is a specific type of JSON that allows encoding graph-like structures with the Resource Description Framework.[^1] + +Provenance records use the PROV model ontology [^2], augmented by terms curated in this specification, and defined in the [BIDS-Prov context](/context.json). + +A skeleton for a BIDS-Prov JSON-LD file looks like this: +``` +{ + "@context": "https://purl.org/nidash/bidsprov/context.json", + "BIDSProvVersion": "0.0.1", + "Records": { + "Agent": [ + { + <...Agent 1...> + }, + { + <...Agent 2...> + } + ], + "Activity": [ + { + <...Activity 1...> + }, + { + <...Activity 2...> + } + ], + "Entity": [ + { + <...Entity 1...> + }, + { + <...Entity 2...> + } + ], + "Environment": [ + { + <...Environment 1...> + }, + { + <...Environment 2...> + } + ] + } +} +``` + + + + + + + + + + + + + + + + + + +
Key name + Description +
@context + REQUIRED. A URL to the BIDS-Prov JSON-LD context. Value MUST be "https://purl.org/nidash/bidsprov/context.json". +
BIDSProvVersion + REQUIRED. A string identifying the version of the specification adhered to. +
Records + REQUIRED. A list of provenance records (Activity, Entity, Agent, Environment), describing the provenance (see the 2. Provenance records section below). +
+ +BIDS-Prov allows this skeleton to be split into several *JSON* files. This is described in sections [3.1.3 Suffixes](#3-1-3-suffixes) +and [3.2 Provenance description levels](#3-2-provenance-description-levels). + +Using tools provided by BIDS-Prov ([5. Tools](#5-tools)), these JSON contents can be merged back to a structured JSON-LD as described above. + +> [!NOTE] +> Since the JSON-LD documents are graph objects, they can be aggregated using RDF tools without the need to apply the inheritance principle. + +> [!WARNING] +> A group of provenance records MUST be described: +> * either in several `.json` files ; +> * or in several `.jsonld` files. + +A complete schema for the model file to facilitate specification and validation is available from [https://github.com/bids-standard/BEP028_BIDSprov](https://github.com/bids-standard/BEP028_BIDSprov). In the event of disagreements between the schema and the specification, the specification is authoritative. + +## 2. Provenance records + +BIDS-Prov metadata consists of a set of records. There are 4 types of records: `Activity`, `Entity`, `Agent`, and `Environment`. + +Activities represent the transformations that have been applied to the data. Each Activity can use Entities as inputs and outputs. The Agent specifies the software package. Environments specify the software environment in which the provenance record was obtained. + +![](img/records.svg) + +### 2.1 Activity +Each Activity record is a JSON Object with the following fields: + +> [!CAUTION] +> TODO: AssociatedWith and Used can also entirely describe the Agent (resp. Entity) +> TODO: AssociatedWith and Used can be lists +> TODO: Can an Activity represent a group of command lines ? If so, Command can be a list + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Key name + Description +
Id + REQUIRED. Unique URIs (for example a UUID). Identifier for the activity. +
Label + REQUIRED. String. Name of the tool, script, or function used (e.g. "bet", "recon-all", "myFunc", "docker"). +
Command + REQUIRED. String. Command used to run the tool, including all parameters. +
AssociatedWith + OPTIONAL. UUID. Identifier of the software package used to compute this activity (the corresponding Agent must be defined with its own Agent record). +
Used + OPTIONAL. List. Identifiers (UUIDs) of entities or environments used by this activity. The corresponding Entities (resp. Environments) must be defined with their own Entity (resp. Environment) record. +
Type + OPTIONAL. URI. A term from a controlled vocabulary that more specifically describes the Activity. +
StartedAtTime + OPTIONAL. xsd:dateTime. A timestamp tracking when this activity started. +
EndedAtTime + OPTIONAL. xsd:dateTime. A timestamp tracking when this activity ended. +
+ +Here is an example of an Activity record: +```JSON +{ + "Id": "bids::prov/#conversion-00f3a18f", + "Label": "Dicom to Nifti conversion", + "Command": "dcm2niix -o . -f sub-%i/anat/sub-%i_T1w sourcedata/dicoms", + "AssociatedWith": "bids::prov/#dcm2niix-khhkm7u1", + "Used": [ + "bids::prov/#fedora-uldfv058", + "bids::sourcedata/dicoms" + ], + "Type": "Activity", + "StartedAtTime": "2025-03-13T10:26:00", + "EndedAtTime": "2025-03-13T10:26:05" +} +``` + +### 2.2 Entity +Each Entity record is a JSON Object with the following fields: + +> [!CAUTION] +> TODO: GeneratedBy can also entirely describe the Activity +> TODO: GeneratedBy can be a list + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Key name + Description +
Id + REQUIRED. Unique URIs (for example a UUID). Identifier for the entity. +
Label + REQUIRED. String. A name for the entity. +
AtLocation + OPTIONAL. String. For input files, this is the relative path to the file on disk. +
GeneratedBy + OPTIONAL. UUID. Identifier of the activity which generated this entity (the corresponding Activity must be defined with its own Activity record). +
Type + OPTIONAL. URI. A term from a controlled vocabulary that more specifically describes the Entity. +
Digest + RECOMMENDED. Dict. For files, this would include checksums of files. It would take the form {"<checksum-name>": "value"}. +
+ +Here is an example of an Entity record: +```JSON +{ + "Id": "bids::sub-02/anat/sub-02_T1w.nii", + "Label": "sub-02_T1w.nii", + "AtLocation": "sub-02/anat/sub-02_T1w.nii", + "GeneratedBy": "bids::prov/#conversion-00f3a18f", + "Type": "Activity", + "Digest": { + "SHA-256": "42d8faeaa6d4988a9233a95860ef3f481fb0daccce4c81bc2c1634ea8cf89e52" + } +} +``` + +### 2.3 Agent (Optional) +Agent records are OPTIONAL. If included, each Agent record is a JSON Object with the following fields: + +> [!CAUTION] +> TODO: do we need a Type field for Agent? +> TODO: shall we use `Software`, `Agent`, `SoftwareAgent` ? + + + + + + + + + + + + + + + + + + + + + + +
Key name + Description +
Id + REQUIRED. A unique identifier like a UUID that will be used to associate activities with this software (e.g., "urn:1264-1233-11231-12312", "urn:bet-o1ef4rt"). +
AltIdentifier + OPTIONAL. URI. URI of the RRID for this software package (cf. scicrunch). +
Label + REQUIRED. String. Name of the software. +
Version + REQUIRED. String. Version of the software. +
+ +Here is an example of an Agent record: +```JSON +{ + "Id": "bids::prov/#dcm2niix-khhkm7u1", + "AltIdentifier": "RRID:SCR_023517", + "Label": "dcm2niix", + "Version": "v1.0.20220720" +} +``` + +### 2.4 Environment (Optional) +Environment records are OPTIONAL. If included, each Environment record is a JSON Object with the following fields: + +> [!CAUTION] +> TODO: do we need a Type field for Environment? +> TODO: Environment not currently defined in the BIDS-Prov context + + + + + + + + + + + + + + + + + + + + + + + + + + +
Key name + Description +
Id + REQUIRED. Unique URIs (for example a UUID). Identifier for the environment (this identifier will be used to associate activities with this environment). +
Label + REQUIRED. String. Name of the environment. +
EnvVars + OPTIONAL. Dict. A dictionary defining the environment variables as key-value pairs. +
OperatingSystem + OPTIONAL. String. Name of the operating system. +
Dependencies + OPTIONAL. Dict. A dictionary defining the software used and their versions as key-value pairs. +
+ +Here is an example of an Environment record: +```JSON +{ + "Id": "bids::prov/#fedora-uldfv058", + "Label": "Fedora release 36 (Thirty Six)", + "OperatingSystem": "GNU/Linux 6.2.15-100.fc36.x86_64" +} +``` + +## 3. Additions to BIDS + +### 3.1 File naming + +This section describes additions to the BIDS naming conventions for BIDS-Prov files. + +For further information about naming conventions, please consult the BIDS specification ([https://bids-specification.readthedocs.io](https://bids-specification.readthedocs.io)). Until these conventions are established in BIDS, it is RECOMMENDED to use the following. + +#### 3.1.1 File extensions + +> [!CAUTION] +> TODO: do we keep a `.prov.json` or `.prov.jsonld` extension ? + +BIDS-Prov files contain JSON or JSON-LD data, hence having either a `.json` or a `.jsonld` extension. + +When using a `.jsonld` extension, the contents of the file must be JSON-LD. + +As JSON-LD is JSON, `*.jsonld` files can contain JSON. + +#### 3.1.2 The `prov` entity + +> [!CAUTION] +> TODO: is the use of this entity mandatory ? + +BIDS-Prov introduces the following entity: + +`prov` +* Full name: Provenance records +* Format: `prov-