Skip to content

Commit 04ae5f9

Browse files
bendhousearteffigies
authored and committed
added script to retrieve pet data from openneuro
1 parent fd6af92 commit 04ae5f9

File tree

2 files changed

+232
-0
lines changed

2 files changed

+232
-0
lines changed

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ test = [
8080
"pytest-cov >= 2.11",
8181
"pytest-env",
8282
"pytest-xdist >= 2.5",
83+
"datalad",
84+
"datalad-osf",
8385
]
8486
maint = [
8587
"fuzzywuzzy",

scripts/collect_test_data.py

Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
1+
from datalad import api
2+
from tempfile import TemporaryDirectory
3+
from pathlib import Path
4+
from os.path import join
5+
import shutil
6+
import subprocess
7+
import bids
8+
import pandas as pd
9+
import sys
10+
import json
11+
import argparse
12+
import os
13+
14+
readme_template = """# PETPrep Test Data Collection
15+
16+
## Overview
17+
18+
This dataset contains a curated collection of PET imaging data from multiple OpenNeuro datasets, compiled for testing and development of the PETPrep software pipeline. The data has been selected to provide a diverse range of PET imaging scenarios for comprehensive software testing.
19+
20+
## Dataset Information
21+
22+
- **Dataset Type**: Raw BIDS data
23+
- **BIDS Version**: 1.7.0
24+
- **License**: CC0 (Public Domain)
25+
- **Compiled for**: PETPrep software testing and development
26+
27+
## Included Datasets
28+
29+
This collection includes data from the following OpenNeuro datasets:
30+
31+
{dataset_list}
32+
## Data Structure
33+
34+
The dataset follows the Brain Imaging Data Structure (BIDS) specification:
35+
36+
```
37+
├── dataset_description.json
38+
├── participants.tsv
39+
├── sub-*/ # Subject directories
40+
│ ├── anat/ # Anatomical data
41+
│ │ └── sub-*_T1w.nii.gz
42+
│ └── pet/ # PET data
43+
│ ├── sub-*_pet.nii.gz
44+
│ ├── sub-*_pet.json
45+
│ └── sub-*_blood.tsv # Blood data (if available)
46+
```
47+
48+
## Usage
49+
50+
This dataset is intended for:
51+
- PETPrep software testing and validation
52+
- Development of PET preprocessing pipelines
53+
- Educational purposes in PET data analysis
54+
55+
## Citation
56+
57+
If you use this test dataset, please cite:
58+
- The original OpenNeuro datasets
59+
- The PETPrep software: [PETPrep GitHub Repository](https://github.com/nipreps/petprep)
60+
61+
## Acknowledgments
62+
63+
- OpenNeuro for hosting the original datasets
64+
- The BIDS community for data organization standards
65+
- Contributors to the PETPrep project
66+
67+
## Contact
68+
69+
For questions about this test dataset or PETPrep:
70+
- PETPrep GitHub: https://github.com/nipreps/petprep
71+
- OpenNeuro: https://openneuro.org
72+
73+
---
74+
75+
*This is a test dataset compiled for software development purposes. Please refer to the original datasets for research use.*
76+
"""
77+
78+
# Create dataset_description.json content
def create_dataset_description():
    """Build the BIDS dataset_description.json payload for the collection.

    Returns a plain dict ready to be serialized with json.dump().
    """
    description = dict(
        Name="PETPrep Test Data Collection",
        BIDSVersion="1.7.0",
        DatasetType="raw",
        License="CC0",
        Authors=[
            "datalad",
            "python",
            "make",
            "openneuro",
        ],
        HowToAcknowledge="Please cite the original datasets and PETPrep software.",
        Funding=[
            "This test data collection was created for PETPrep development and testing purposes"
        ],
        EthicsApprovals=[
            "This is a test dataset compiled from publicly available BIDS datasets for software testing purposes"
        ],
        ReferencesAndLinks=[
            "https://github.com/nipreps/petprep",
            "https://openneuro.org",
        ],
        DatasetDOI="10.18112/openneuro.ds000000.v1.0.0",
        HEDVersion="8.0.0",
    )
    return description
106+
107+
# Create README.md content
def create_readme_content(pet_datasets, readme_template):
    """Render the README by numbering each dataset's description.

    Each entry becomes a line "N. **<dataset_id>**: <description>\\n", and the
    joined list is substituted into the template's {dataset_list} slot.
    """
    entries = [
        f"{index}. **{dataset_id}**: {meta['description']}\n"
        for index, (dataset_id, meta) in enumerate(pet_datasets.items(), start=1)
    ]
    return readme_template.format(dataset_list="".join(entries))
117+
118+
119+
# OpenNeuro datasets to pull PET test data from.  Each entry records a
# version string (informational only -- NOTE(review): api.install below
# clones the default branch; the pinned version is never checked out --
# confirm), a description used verbatim in the generated README, and the
# subset of subject IDs to download (an empty list means every subject).
pet_datasets = {
    "ds005619": {
        "version": "1.1.0",
        "description": "[18F]SF51, a Novel 18F-labeled PET Radioligand for Translocator Protein 18kDa (TSPO) in Brain, Works Well in Monkeys but Fails in Humans",
        "subject_ids": ["sf02"]
    },
    "ds004868": {
        "version": "1.0.4",
        "description": "[11C]PS13 demonstrates pharmacologically selective and substantial binding to cyclooxygenase-1 (COX-1) in the human brain",
        "subject_ids": ["PSBB01"]
    },
    "ds004869": {
        "version": "1.1.1",
        "description": "https://openneuro.org/datasets/ds004869/versions/1.1.1",
        "subject_ids": ["01"]
    },
}

# git clone URL template for the OpenNeuroDatasets GitHub mirrors; filled in
# with a dataset ID via .format(DATASET_ID=...).
openneuro_template_string = "https://github.com/OpenNeuroDatasets/{DATASET_ID}.git"
138+
139+
140+
141+
def _fetch_subject_files(dataset, dataset_path, subject_id, output_directory):
    """Download every file under sub-<subject_id>/ and copy it to output_directory.

    Files are fetched through datalad, unlocked (so the annexed symlink becomes
    a plain readable file), and copied preserving their path relative to the
    dataset root.
    """
    subject_dir = dataset_path / f"sub-{subject_id}"
    if not subject_dir.exists():
        return
    relative_files = [
        file_path.relative_to(dataset_path)
        for file_path in subject_dir.rglob("*")
        if file_path.is_file()
    ]
    for relative_path in relative_files:
        source_file = dataset_path / relative_path
        print(str(relative_path))
        result = dataset.get(source_file)
        print(result)
        # datalad returns a list of status records; guard against an empty
        # list before inspecting the first record.
        if not result:
            continue
        if result[0].get("status") == "ok" or result[0].get("message") == "already present":
            # Unlock so the annexed file content is copyable.
            api.unlock(path=str(source_file), dataset=str(dataset_path))
            target_file = Path(output_directory) / relative_path
            target_file.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(source_file, target_file)


def _write_collection_metadata(combined_participants_tsv, combined_subjects, output_directory):
    """Write the merged participants.tsv, dataset_description.json and README.md."""
    combined_subjects = [f"sub-{s}" for s in combined_subjects]

    # Keep only the rows for subjects we actually collected.  Guard against
    # the case where no dataset provided a participants.tsv at all (the
    # accumulated DataFrame would then have no 'participant_id' column).
    if "participant_id" in combined_participants_tsv.columns:
        combined_participants = combined_participants_tsv[
            combined_participants_tsv["participant_id"].isin(combined_subjects)
        ]
    else:
        combined_participants = combined_participants_tsv
    print(combined_participants)

    output_directory = Path(output_directory)
    output_directory.mkdir(parents=True, exist_ok=True)

    with open(output_directory / "dataset_description.json", "w") as f:
        json.dump(create_dataset_description(), f, indent=4)

    with open(output_directory / "README.md", "w") as f:
        f.write(create_readme_content(pet_datasets, readme_template))

    combined_participants.to_csv(output_directory / "participants.tsv", sep="\t", index=False)


def _collect_test_data(data_path, output_directory):
    """Install each configured dataset into data_path and merge it into output_directory."""
    combined_participants_tsv = pd.DataFrame()
    combined_subjects = []
    for dataset_id, meta in pet_datasets.items():
        dataset_path = Path(data_path) / dataset_id
        if dataset_path.is_dir():
            # A stale clone from a previous run blocks api.install; remove it.
            # (Path.rmdir() would fail here because a clone is never empty.)
            shutil.rmtree(dataset_path)
        dataset = api.install(
            path=dataset_path,
            source=openneuro_template_string.format(DATASET_ID=dataset_id),
        )
        dataset.unlock()

        # Let pybids index the (raw-only) dataset.
        layout = bids.layout.BIDSLayout(dataset_path, derivatives=False)

        # Accumulate this dataset's participants.tsv, if present.
        participants_files = layout.get(suffix="participants", extension=".tsv", return_type="file")
        if participants_files:
            participants_df = pd.read_csv(participants_files[0], sep="\t")
            combined_participants_tsv = pd.concat(
                [combined_participants_tsv, participants_df], ignore_index=True
            )

        subject_ids = meta.get("subject_ids") or []
        if subject_ids:
            # A subject subset was requested: fetch only those subjects.
            for subject_id in subject_ids:
                combined_subjects.append(subject_id)
                _fetch_subject_files(dataset, dataset_path, subject_id, output_directory)
        else:
            # No subset requested: download and copy the whole dataset.
            combined_subjects += layout.get(return_type="id", target="subject")
            dataset.get(dataset_path)
            api.unlock(path=str(dataset_path), dataset=str(dataset_path))
            # Merge into the (possibly pre-existing) output tree, skipping
            # datalad/git internals that are not part of the BIDS dataset.
            shutil.copytree(
                dataset_path,
                output_directory,
                dirs_exist_ok=True,
                ignore=shutil.ignore_patterns(".git", ".datalad", ".gitattributes"),
            )

    _write_collection_metadata(combined_participants_tsv, combined_subjects, output_directory)


def download_test_data(working_directory=None, output_directory=None):
    """Download PET test datasets from OpenNeuro and combine them into one BIDS tree.

    Parameters
    ----------
    working_directory : None, str, or context manager yielding a path
        Scratch space for the datalad clones.  None (the default) creates a
        fresh TemporaryDirectory; a TemporaryDirectory instance (or any
        context manager yielding a path) is entered and cleaned up as before;
        a plain string path is used directly and created if missing.
    output_directory : str or None
        Destination for the merged dataset; defaults to the current working
        directory at call time (not at import time, as a default-argument
        os.getcwd() would).
    """
    if output_directory is None:
        output_directory = os.getcwd()
    if working_directory is None:
        # Create the temporary directory per call -- a TemporaryDirectory()
        # default argument would be built once at import time and be unusable
        # on any call after the first.
        working_directory = TemporaryDirectory()
    if hasattr(working_directory, "__enter__"):
        # TemporaryDirectory or another context manager yielding a path.
        with working_directory as data_path:
            _collect_test_data(data_path, output_directory)
    else:
        # Plain path string, e.g. forwarded from the command line.
        data_path = str(working_directory)
        os.makedirs(data_path, exist_ok=True)
        _collect_test_data(data_path, output_directory)
221+
222+
223+
224+
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="PETPrepTestDataCollector",
        description=(
            "Collects PET datasets from OpenNeuro.org and combines them into a "
            "single BIDS dataset using datalad and pandas"
        ),
    )
    parser.add_argument(
        "--working-directory",
        "-w",
        type=str,
        # None (not a TemporaryDirectory object, which a type=str flag could
        # never produce) lets download_test_data create its own temp dir.
        default=None,
        help="Working directory for downloading and combining datasets, defaults to a temporary directory.",
    )
    parser.add_argument(
        "--output-directory",
        "-o",
        type=str,
        default=os.getcwd(),
        # os.getcwd() must be *called*; interpolating the bare function
        # object printed its repr in the help text.
        help=f"Output directory of combined dataset, defaults where this script is called from, presently {os.getcwd()}",
    )
    args = parser.parse_args()

    # Forward the working directory only when the user supplied one, so the
    # function's own default behavior applies otherwise.
    kwargs = {"output_directory": args.output_directory}
    if args.working_directory is not None:
        kwargs["working_directory"] = args.working_directory
    download_test_data(**kwargs)

0 commit comments

Comments
 (0)