Commit 4db49fd

Merge pull request #71 from bendhouseart/download-test-data
Download test data
2 parents fd6af92 + 73429c5 commit 4db49fd

File tree

1 file changed: +323 -0 lines changed
scripts/collect_test_data.py

Lines changed: 323 additions & 0 deletions
@@ -0,0 +1,323 @@
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "datalad",
#     "pandas",
#     "pybids",
# ]
# ///
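# The header above is PEP 723 inline script metadata, so a compatible runner
# can resolve the dependencies automatically, e.g. (assuming `uv` is installed):
#     uv run scripts/collect_test_data.py --output-directory ./test_data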
import argparse
import json
import os
import shutil
import sys
from contextlib import nullcontext
from pathlib import Path
from tempfile import TemporaryDirectory

import bids
import pandas as pd
from datalad import api

# fmt: skip
readme_template = """# PETPrep Test Data Collection

## Overview

This dataset contains a curated collection of PET imaging data from multiple
OpenNeuro datasets, compiled for testing and development of the PETPrep software
pipeline. The data has been selected to provide a diverse range of PET imaging
scenarios for comprehensive software testing.

## Dataset Information

- **Dataset Type**: Raw BIDS data
- **BIDS Version**: 1.7.0
- **License**: CC0 (Public Domain)
- **Compiled for**: PETPrep software testing and development

## Included Datasets

This collection includes data from the following OpenNeuro datasets:

{dataset_list}
## Data Structure

The dataset follows the Brain Imaging Data Structure (BIDS) specification:

```
├── dataset_description.json
├── participants.tsv
├── sub-*/                      # Subject directories
│   ├── anat/                   # Anatomical data
│   │   └── sub-*_T1w.nii.gz
│   └── pet/                    # PET data
│       ├── sub-*_pet.nii.gz
│       ├── sub-*_pet.json
│       └── sub-*_blood.tsv     # Blood data (if available)
```

## Usage

This dataset is intended for:
- PETPrep software testing and validation
- Development of PET preprocessing pipelines
- Educational purposes in PET data analysis

## Citation

If you use this test dataset, please cite:
- The original OpenNeuro datasets
- The PETPrep software: [PETPrep GitHub Repository](https://github.com/nipreps/petprep)

## Acknowledgments

- OpenNeuro for hosting the original datasets
- The BIDS community for data organization standards
- Contributors to the PETPrep project

## Contact

For questions about this test dataset or PETPrep:
- PETPrep GitHub: https://github.com/nipreps/petprep
- OpenNeuro: https://openneuro.org

---

*This is a test dataset compiled for software development purposes. Please refer to the original
datasets for research use.*
"""


# Create dataset_description.json content
def create_dataset_description():
    """Create BIDS dataset_description.json content."""
    # fmt: skip
    return {
        'Name': 'PETPrep Test Data Collection',
        'BIDSVersion': '1.7.0',
        'DatasetType': 'raw',
        'License': 'CC0',
        'Authors': ['datalad', 'python', 'make', 'openneuro'],
        'HowToAcknowledge': 'Please cite the original datasets and PETPrep software.',
        'Funding': [
            'This test data collection was created for PETPrep development and testing purposes'
        ],
        'EthicsApprovals': [
            'This is a test dataset compiled from publicly available BIDS datasets '
            'for software testing purposes',
        ],
        'ReferencesAndLinks': [
            'https://github.com/nipreps/petprep',
            'https://openneuro.org',
        ],
        'DatasetDOI': '10.18112/openneuro.ds000000.v1.0.0',
        'HEDVersion': '8.0.0',
    }


# Create README.md content
def create_readme_content(pet_datasets, readme_template):
    """Create README content dynamically based on the datasets."""
    # Generate the numbered dataset list dynamically
    dataset_list = ''
    for i, (dataset_id, meta) in enumerate(pet_datasets.items(), 1):
        dataset_list += f'{i}. **{dataset_id}**: {meta["description"]}\n'

    return readme_template.format(dataset_list=dataset_list)
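
# For example, with the default `pet_datasets` below, the generated list is:
#   1. **ds005619**: [18F]SF51, a Novel 18F-labeled PET Radioligand for ...
#   2. **ds004868**: [11C]PS13 demonstrates pharmacologically selective and ...
#   3. **ds004869**: https://openneuro.org/datasets/ds004869/versions/1.1.1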

pet_datasets = {
    'ds005619': {
        'version': '1.1.0',
        'description': '[18F]SF51, a Novel 18F-labeled PET Radioligand for '
        'Translocator Protein 18kDa (TSPO) in Brain, Works Well '
        'in Monkeys but Fails in Humans',
        'subject_ids': ['sf02'],
    },
    'ds004868': {
        'version': '1.0.4',
        'description': '[11C]PS13 demonstrates pharmacologically selective and '
        'substantial binding to cyclooxygenase-1 (COX-1) in the '
        'human brain',
        'subject_ids': ['PSBB01'],
    },
    'ds004869': {
        'version': '1.1.1',
        'description': 'https://openneuro.org/datasets/ds004869/versions/1.1.1',
        'subject_ids': ['01'],
    },
}

openneuro_template_string = 'https://github.com/OpenNeuroDatasets/{DATASET_ID}.git'
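
# e.g. openneuro_template_string.format(DATASET_ID='ds005619')
# yields 'https://github.com/OpenNeuroDatasets/ds005619.git'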


def download_test_data(
    working_directory: TemporaryDirectory | str | None = None,
    output_directory: Path | str = '',
    pet_datasets_json=None,  # Default to None, not the dict
):
    """Download the test datasets and combine them into a single BIDS dataset."""
    # Use default datasets if no JSON file provided
    if pet_datasets_json is None:
        datasets_to_use = pet_datasets  # Use the default defined at module level
    else:
        # Load from JSON file
        with open(pet_datasets_json) as infile:
            datasets_to_use = json.load(infile)

    if not working_directory:
        working_directory = TemporaryDirectory()

    if not output_directory:
        output_directory = os.getcwd()

    # A plain path string is not a context manager, so wrap it with nullcontext;
    # this lets both a TemporaryDirectory and a user-supplied path work below.
    if not isinstance(working_directory, TemporaryDirectory):
        working_directory = nullcontext(str(working_directory))

    with working_directory as data_path:
        combined_participants_tsv = pd.DataFrame()
        combined_subjects = []
        for dataset_id, meta in datasets_to_use.items():
            dataset_path = Path(data_path) / Path(dataset_id)
            # Remove a leftover clone if present (rmdir only removes empty directories)
            if dataset_path.is_dir() and len(sys.argv) <= 1:
                dataset_path.rmdir()
            dataset = api.install(
                path=dataset_path,
                source=openneuro_template_string.format(DATASET_ID=dataset_id),
            )
            dataset.unlock()

            # see how pybids handles this datalad layout
            # (when PET derivatives are a thing, we'll think about using pybids to get them)
            b = bids.layout.BIDSLayout(dataset_path, derivatives=False)

            # Access participants.tsv
            participants_files = b.get(suffix='participants', extension='.tsv', return_type='file')
            if participants_files:
                participants_file = participants_files[0]

                # Read participants.tsv as a pandas DataFrame
                participants_df = pd.read_csv(participants_file, sep='\t')

                # Combine with the overall participants DataFrame
                combined_participants_tsv = pd.concat(
                    [combined_participants_tsv, participants_df], ignore_index=True
                )
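
                # Note: pd.concat aligns on column names, so a column present in only
                # one dataset's participants.tsv becomes NaN for rows from the others.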

            # If a subset of subjects is specified, collect only those subjects
            if meta.get('subject_ids', []) != []:
                for _id in meta['subject_ids']:
                    combined_subjects.append(_id)
                    # Get the entire subject directory content including git-annex files
                    subject_dir = dataset_path / f'sub-{_id}'
                    if subject_dir.exists():
                        # First, get all content in the subject directory
                        # (this retrieves git-annex files)
                        dataset.get(str(subject_dir))

                        # Then collect all files after they've been retrieved
                        all_files = []
                        for file_path in subject_dir.rglob('*'):
                            if file_path.is_file():
                                relative_path = file_path.relative_to(dataset_path)
                                all_files.append(str(relative_path))

                        # Copy all files to the output directory
                        for f in all_files:
                            print(f)
                            # Unlock the file to make it writable
                            api.unlock(path=str(dataset_path / f), dataset=str(dataset_path))
                            source_file = dataset_path / f
                            relative_path = source_file.relative_to(dataset_path)
                            target_file = Path(output_directory) / relative_path
                            target_file.parent.mkdir(parents=True, exist_ok=True)
                            shutil.copy2(source_file, target_file)

            else:
                combined_subjects += b.get(return_type='id', target='subject')
                # Get all files first
                dataset.get(dataset_path)
                api.unlock(path=str(dataset_path), dataset=dataset)
                # dirs_exist_ok lets the copy merge into an existing output directory
                shutil.copytree(dataset_path, output_directory, dirs_exist_ok=True)

        combined_subjects = [f'sub-{s}' for s in combined_subjects]

        # Filter the participants DataFrame to keep only subjects in combined_subjects
        combined_participants = combined_participants_tsv[
            combined_participants_tsv['participant_id'].isin(combined_subjects)
        ]

        # Write the top-level BIDS metadata files to the output directory
        dataset_desc_path = Path(output_directory) / 'dataset_description.json'
        readme_path = Path(output_directory) / 'README.md'

        with open(dataset_desc_path, 'w') as f:
            json.dump(create_dataset_description(), f, indent=4)

        with open(readme_path, 'w') as f:
            # Describe the datasets actually used, not just the module-level defaults
            f.write(create_readme_content(datasets_to_use, readme_template))
        combined_participants.to_csv(
            Path(output_directory) / 'participants.tsv', sep='\t', index=False
        )
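
        # With the default `pet_datasets`, the combined output contains
        # sub-sf02, sub-PSBB01, and sub-01, plus the top-level
        # dataset_description.json, README.md, and filtered participants.tsv.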


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        prog='PETPrepTestDataCollector',
        description='Collects PET datasets from OpenNeuro.org and '
        'combines them into a single BIDS dataset using datalad and pandas',
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        '--working-directory',
        '-w',
        type=str,
        default=None,  # download_test_data creates a TemporaryDirectory when unset
        help='Working directory for downloading and combining datasets, '
        'defaults to a temporary directory.',
    )
    parser.add_argument(
        '--output-directory',
        '-o',
        type=str,
        default=os.getcwd(),
        help='Output directory of the combined dataset, '
        f'defaults to where this script is called from, presently {os.getcwd()}',
    )
    parser.add_argument(
        '--datasets-json',
        '-j',
        type=str,
        default=None,
        help="""Use a custom JSON file of datasets;
a subset of subjects can also be specified.
The default is structured like the following:

{
    "ds005619": {
        "version": "1.1.0",
        "description": "[description]",
        "subject_ids": ["sf02"]
    },
    "ds004868": {
        "version": "1.0.4",
        "description": "[description]",
        "subject_ids": ["PSBB01"]
    },
    "ds004869": {
        "version": "1.1.1",
        "description": "[description]",
        "subject_ids": ["01"]
    }
}""",
    )
    args = parser.parse_args()

    download_test_data(
        working_directory=args.working_directory,
        output_directory=args.output_directory,
        pet_datasets_json=args.datasets_json,  # None if not provided
    )
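
A minimal invocation sketch (the output path is illustrative; with no `--working-directory`, the script creates and cleans up a temporary directory):

    python scripts/collect_test_data.py --output-directory ./petprep_test_data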
