Skip to content

Commit 04ae5f9

Browse files
bendhousearteffigies
authored and committed
added script to retrieve pet data from openneuro
1 parent fd6af92 commit 04ae5f9

File tree

2 files changed

+232
-0
lines changed

2 files changed

+232
-0
lines changed

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ test = [
8080
"pytest-cov >= 2.11",
8181
"pytest-env",
8282
"pytest-xdist >= 2.5",
83+
"datalad",
84+
"datalad-osf",
8385
]
8486
maint = [
8587
"fuzzywuzzy",

scripts/collect_test_data.py

Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
1+
from datalad import api
2+
from tempfile import TemporaryDirectory
3+
from pathlib import Path
4+
from os.path import join
5+
import shutil
6+
import subprocess
7+
import bids
8+
import pandas as pd
9+
import sys
10+
import json
11+
import argparse
12+
import os
13+
14+
readme_template = """# PETPrep Test Data Collection
15+
16+
## Overview
17+
18+
This dataset contains a curated collection of PET imaging data from multiple OpenNeuro datasets, compiled for testing and development of the PETPrep software pipeline. The data has been selected to provide a diverse range of PET imaging scenarios for comprehensive software testing.
19+
20+
## Dataset Information
21+
22+
- **Dataset Type**: Raw BIDS data
23+
- **BIDS Version**: 1.7.0
24+
- **License**: CC0 (Public Domain)
25+
- **Compiled for**: PETPrep software testing and development
26+
27+
## Included Datasets
28+
29+
This collection includes data from the following OpenNeuro datasets:
30+
31+
{dataset_list}
32+
## Data Structure
33+
34+
The dataset follows the Brain Imaging Data Structure (BIDS) specification:
35+
36+
```
37+
├── dataset_description.json
38+
├── participants.tsv
39+
├── sub-*/ # Subject directories
40+
│ ├── anat/ # Anatomical data
41+
│ │ └── sub-*_T1w.nii.gz
42+
│ └── pet/ # PET data
43+
│ ├── sub-*_pet.nii.gz
44+
│ ├── sub-*_pet.json
45+
│ └── sub-*_blood.tsv # Blood data (if available)
46+
```
47+
48+
## Usage
49+
50+
This dataset is intended for:
51+
- PETPrep software testing and validation
52+
- Development of PET preprocessing pipelines
53+
- Educational purposes in PET data analysis
54+
55+
## Citation
56+
57+
If you use this test dataset, please cite:
58+
- The original OpenNeuro datasets
59+
- The PETPrep software: [PETPrep GitHub Repository](https://github.com/nipreps/petprep)
60+
61+
## Acknowledgments
62+
63+
- OpenNeuro for hosting the original datasets
64+
- The BIDS community for data organization standards
65+
- Contributors to the PETPrep project
66+
67+
## Contact
68+
69+
For questions about this test dataset or PETPrep:
70+
- PETPrep GitHub: https://github.com/nipreps/petprep
71+
- OpenNeuro: https://openneuro.org
72+
73+
---
74+
75+
*This is a test dataset compiled for software development purposes. Please refer to the original datasets for research use.*
76+
"""
77+
78+
# Create dataset_description.json content
def create_dataset_description():
    """Build the BIDS dataset_description.json payload for the collection.

    Returns a plain dict ready to be serialized with json.dump().
    """
    description = dict(
        Name="PETPrep Test Data Collection",
        BIDSVersion="1.7.0",
        DatasetType="raw",
        License="CC0",
        Authors=[
            "datalad",
            "python",
            "make",
            "openneuro",
        ],
        HowToAcknowledge="Please cite the original datasets and PETPrep software.",
        Funding=[
            "This test data collection was created for PETPrep development and testing purposes"
        ],
        EthicsApprovals=[
            "This is a test dataset compiled from publicly available BIDS datasets for software testing purposes"
        ],
        ReferencesAndLinks=[
            "https://github.com/nipreps/petprep",
            "https://openneuro.org",
        ],
        DatasetDOI="10.18112/openneuro.ds000000.v1.0.0",
        HEDVersion="8.0.0",
    )
    return description
106+
107+
# Create README.md content
def create_readme_content(pet_datasets, readme_template):
    """Render the README by numbering each dataset's description.

    Each entry becomes a line "N. **<dataset_id>**: <description>\\n", and the
    joined list is substituted into the template's {dataset_list} slot.
    """
    entries = [
        f"{index}. **{dataset_id}**: {meta['description']}\n"
        for index, (dataset_id, meta) in enumerate(pet_datasets.items(), start=1)
    ]
    return readme_template.format(dataset_list="".join(entries))
117+
118+
119+
# OpenNeuro datasets to pull PET test data from.  Each entry records a
# version string (informational only -- NOTE(review): api.install below
# clones the default branch; the pinned version is never checked out --
# confirm), a description used verbatim in the generated README, and the
# subset of subject IDs to download (an empty list means every subject).
pet_datasets = {
    "ds005619": {
        "version": "1.1.0",
        "description": "[18F]SF51, a Novel 18F-labeled PET Radioligand for Translocator Protein 18kDa (TSPO) in Brain, Works Well in Monkeys but Fails in Humans",
        "subject_ids": ["sf02"]
    },
    "ds004868": {
        "version": "1.0.4",
        "description": "[11C]PS13 demonstrates pharmacologically selective and substantial binding to cyclooxygenase-1 (COX-1) in the human brain",
        "subject_ids": ["PSBB01"]
    },
    "ds004869": {
        "version": "1.1.1",
        "description": "https://openneuro.org/datasets/ds004869/versions/1.1.1",
        "subject_ids": ["01"]
    },
}

# git clone URL template for the OpenNeuroDatasets GitHub mirrors; filled in
# with a dataset ID via .format(DATASET_ID=...).
openneuro_template_string = "https://github.com/OpenNeuroDatasets/{DATASET_ID}.git"
138+
139+
140+
141+
def _fetch_subject_files(dataset, dataset_path, subject_id, output_directory):
    """Download every file under sub-<subject_id>/ and copy it to output_directory.

    Files are fetched through datalad, unlocked (so the annexed symlink becomes
    a plain readable file), and copied preserving their path relative to the
    dataset root.
    """
    subject_dir = dataset_path / f"sub-{subject_id}"
    if not subject_dir.exists():
        return
    relative_files = [
        file_path.relative_to(dataset_path)
        for file_path in subject_dir.rglob("*")
        if file_path.is_file()
    ]
    for relative_path in relative_files:
        source_file = dataset_path / relative_path
        print(str(relative_path))
        result = dataset.get(source_file)
        print(result)
        # datalad returns a list of status records; guard against an empty
        # list before inspecting the first record.
        if not result:
            continue
        if result[0].get("status") == "ok" or result[0].get("message") == "already present":
            # Unlock so the annexed file content is copyable.
            api.unlock(path=str(source_file), dataset=str(dataset_path))
            target_file = Path(output_directory) / relative_path
            target_file.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(source_file, target_file)


def _write_collection_metadata(combined_participants_tsv, combined_subjects, output_directory):
    """Write the merged participants.tsv, dataset_description.json and README.md."""
    combined_subjects = [f"sub-{s}" for s in combined_subjects]

    # Keep only the rows for subjects we actually collected.  Guard against
    # the case where no dataset provided a participants.tsv at all (the
    # accumulated DataFrame would then have no 'participant_id' column).
    if "participant_id" in combined_participants_tsv.columns:
        combined_participants = combined_participants_tsv[
            combined_participants_tsv["participant_id"].isin(combined_subjects)
        ]
    else:
        combined_participants = combined_participants_tsv
    print(combined_participants)

    output_directory = Path(output_directory)
    output_directory.mkdir(parents=True, exist_ok=True)

    with open(output_directory / "dataset_description.json", "w") as f:
        json.dump(create_dataset_description(), f, indent=4)

    with open(output_directory / "README.md", "w") as f:
        f.write(create_readme_content(pet_datasets, readme_template))

    combined_participants.to_csv(output_directory / "participants.tsv", sep="\t", index=False)


def _collect_test_data(data_path, output_directory):
    """Install each configured dataset into data_path and merge it into output_directory."""
    combined_participants_tsv = pd.DataFrame()
    combined_subjects = []
    for dataset_id, meta in pet_datasets.items():
        dataset_path = Path(data_path) / dataset_id
        if dataset_path.is_dir():
            # A stale clone from a previous run blocks api.install; remove it.
            # (Path.rmdir() would fail here because a clone is never empty.)
            shutil.rmtree(dataset_path)
        dataset = api.install(
            path=dataset_path,
            source=openneuro_template_string.format(DATASET_ID=dataset_id),
        )
        dataset.unlock()

        # Let pybids index the (raw-only) dataset.
        layout = bids.layout.BIDSLayout(dataset_path, derivatives=False)

        # Accumulate this dataset's participants.tsv, if present.
        participants_files = layout.get(suffix="participants", extension=".tsv", return_type="file")
        if participants_files:
            participants_df = pd.read_csv(participants_files[0], sep="\t")
            combined_participants_tsv = pd.concat(
                [combined_participants_tsv, participants_df], ignore_index=True
            )

        subject_ids = meta.get("subject_ids") or []
        if subject_ids:
            # A subject subset was requested: fetch only those subjects.
            for subject_id in subject_ids:
                combined_subjects.append(subject_id)
                _fetch_subject_files(dataset, dataset_path, subject_id, output_directory)
        else:
            # No subset requested: download and copy the whole dataset.
            combined_subjects += layout.get(return_type="id", target="subject")
            dataset.get(dataset_path)
            api.unlock(path=str(dataset_path), dataset=str(dataset_path))
            # Merge into the (possibly pre-existing) output tree, skipping
            # datalad/git internals that are not part of the BIDS dataset.
            shutil.copytree(
                dataset_path,
                output_directory,
                dirs_exist_ok=True,
                ignore=shutil.ignore_patterns(".git", ".datalad", ".gitattributes"),
            )

    _write_collection_metadata(combined_participants_tsv, combined_subjects, output_directory)


def download_test_data(working_directory=None, output_directory=None):
    """Download PET test datasets from OpenNeuro and combine them into one BIDS tree.

    Parameters
    ----------
    working_directory : None, str, or context manager yielding a path
        Scratch space for the datalad clones.  None (the default) creates a
        fresh TemporaryDirectory; a TemporaryDirectory instance (or any
        context manager yielding a path) is entered and cleaned up as before;
        a plain string path is used directly and created if missing.
    output_directory : str or None
        Destination for the merged dataset; defaults to the current working
        directory at call time (not at import time, as a default-argument
        os.getcwd() would).
    """
    if output_directory is None:
        output_directory = os.getcwd()
    if working_directory is None:
        # Create the temporary directory per call -- a TemporaryDirectory()
        # default argument would be built once at import time and be unusable
        # on any call after the first.
        working_directory = TemporaryDirectory()
    if hasattr(working_directory, "__enter__"):
        # TemporaryDirectory or another context manager yielding a path.
        with working_directory as data_path:
            _collect_test_data(data_path, output_directory)
    else:
        # Plain path string, e.g. forwarded from the command line.
        data_path = str(working_directory)
        os.makedirs(data_path, exist_ok=True)
        _collect_test_data(data_path, output_directory)
221+
222+
223+
224+
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="PETPrepTestDataCollector",
        description=(
            "Collects PET datasets from OpenNeuro.org and combines them into a "
            "single BIDS dataset using datalad and pandas"
        ),
    )
    parser.add_argument(
        "--working-directory",
        "-w",
        type=str,
        # None (not a TemporaryDirectory object, which a type=str flag could
        # never produce) lets download_test_data create its own temp dir.
        default=None,
        help="Working directory for downloading and combining datasets, defaults to a temporary directory.",
    )
    parser.add_argument(
        "--output-directory",
        "-o",
        type=str,
        default=os.getcwd(),
        # os.getcwd() must be *called*; interpolating the bare function
        # object printed its repr in the help text.
        help=f"Output directory of combined dataset, defaults where this script is called from, presently {os.getcwd()}",
    )
    args = parser.parse_args()

    # Forward the working directory only when the user supplied one, so the
    # function's own default behavior applies otherwise.
    kwargs = {"output_directory": args.output_directory}
    if args.working_directory is not None:
        kwargs["working_directory"] = args.working_directory
    download_test_data(**kwargs)

0 commit comments

Comments
 (0)