Commit e232b62

bendhousearteffigies authored and committed
updated help menu
1 parent 04ae5f9 commit e232b62

File tree

1 file changed (+117 −43)

scripts/collect_test_data.py

Lines changed: 117 additions & 43 deletions
@@ -2,6 +2,7 @@
 from tempfile import TemporaryDirectory
 from pathlib import Path
 from os.path import join
+import pprint
 import shutil
 import subprocess
 import bids
@@ -75,6 +76,7 @@
 *This is a test dataset compiled for software development purposes. Please refer to the original datasets for research use.*
 """
 
+
 # Create dataset_description.json content
 def create_dataset_description():
     """Create BIDS dataset_description.json content."""
@@ -83,12 +85,7 @@ def create_dataset_description():
         "BIDSVersion": "1.7.0",
         "DatasetType": "raw",
         "License": "CC0",
-        "Authors": [
-            "datalad",
-            "python",
-            "make",
-            "openneuro"
-        ],
+        "Authors": ["datalad", "python", "make", "openneuro"],
         "HowToAcknowledge": "Please cite the original datasets and PETPrep software.",
         "Funding": [
             "This test data collection was created for PETPrep development and testing purposes"
@@ -98,72 +95,97 @@ def create_dataset_description():
         ],
         "ReferencesAndLinks": [
             "https://github.com/nipreps/petprep",
-            "https://openneuro.org"
+            "https://openneuro.org",
         ],
         "DatasetDOI": "10.18112/openneuro.ds000000.v1.0.0",
-        "HEDVersion": "8.0.0"
+        "HEDVersion": "8.0.0",
     }
 
+
 # Create README.md content
 def create_readme_content(pet_datasets, readme_template):
     """Create README content dynamically based on the datasets."""
-
+
     # Generate dataset list dynamically
     dataset_list = ""
     for i, (dataset_id, meta) in enumerate(pet_datasets.items(), 1):
         dataset_list += f"{i}. **{dataset_id}**: {meta['description']}\n"
-
+
     return readme_template.format(dataset_list=dataset_list)
 
 
 pet_datasets = {
     "ds005619": {
         "version": "1.1.0",
-        "description": "[18F]SF51, a Novel 18F-labeled PET Radioligand for Translocator Protein 18kDa (TSPO) in Brain, Works Well in Monkeys but Fails in Humans",
-        "subject_ids": ["sf02"]
+        "description": "[18F]SF51, a Novel 18F-labeled PET Radioligand for "
+        "Translocator Protein 18kDa (TSPO) in Brain, Works Well "
+        "in Monkeys but Fails in Humans",
+        "subject_ids": ["sf02"],
     },
     "ds004868": {
         "version": "1.0.4",
-        "description": "[11C]PS13 demonstrates pharmacologically selective and substantial binding to cyclooxygenase-1 (COX-1) in the human brain",
-        "subject_ids": ["PSBB01"]
+        "description": "[11C]PS13 demonstrates pharmacologically selective and "
+        "substantial binding to cyclooxygenase-1 (COX-1) in the "
+        "human brain",
+        "subject_ids": ["PSBB01"],
     },
     "ds004869": {
         "version": "1.1.1",
         "description": "https://openneuro.org/datasets/ds004869/versions/1.1.1",
-        "subject_ids": ["01"]
+        "subject_ids": ["01"],
     },
 }
 
 openneuro_template_string = "https://github.com/OpenNeuroDatasets/{DATASET_ID}.git"
 
 
-
-def download_test_data(working_directory=TemporaryDirectory(), output_directory=os.getcwd()):
+def download_test_data(
+    working_directory=TemporaryDirectory(),
+    output_directory=os.getcwd(),
+    pet_datasets_json=None,  # Default to None, not the dict
+):
+    # Use default datasets if no JSON file provided
+    if pet_datasets_json is None:
+        datasets_to_use = pet_datasets  # Use the default defined at module level
+    else:
+        # Load from JSON file
+        with open(pet_datasets_json, "r") as infile:
+            datasets_to_use = json.load(infile)
+
     with working_directory as data_path:
         combined_participants_tsv = pd.DataFrame()
         combined_subjects = []
         combined_dataset_files = []
-        for dataset_id, meta in pet_datasets.items():
+        for dataset_id, meta in datasets_to_use.items():  # Use datasets_to_use instead of pet_datasets
             dataset_path = Path(data_path) / Path(dataset_id)
             if dataset_path.is_dir() and len(sys.argv) <= 1:
                 dataset_path.rmdir()
-            dataset = api.install(path=dataset_path, source=openneuro_template_string.format(DATASET_ID=dataset_id))
-            #api.unlock(str(dataset_path))
+            dataset = api.install(
+                path=dataset_path,
+                source=openneuro_template_string.format(DATASET_ID=dataset_id),
+            )
+            # api.unlock(str(dataset_path))
             dataset.unlock()
 
             # see how pybids handles this datalad nonsense
-            b = bids.layout.BIDSLayout(dataset_path, derivatives=False) # when petderivatives are a thing, we'll think about using pybids to get them
-
+            b = bids.layout.BIDSLayout(
+                dataset_path, derivatives=False
+            )  # when petderivatives are a thing, we'll think about using pybids to get them
+
             # Access participants.tsv
-            participants_files = b.get(suffix="participants", extension=".tsv", return_type="file")
+            participants_files = b.get(
+                suffix="participants", extension=".tsv", return_type="file"
+            )
             if participants_files:
                 participants_file = participants_files[0]
-
+
                 # Read participants.tsv as pandas DataFrame
                 participants_df = pd.read_csv(participants_file, sep="\t")
-
+
                 # Combine with overall participants DataFrame
-                combined_participants_tsv = pd.concat([combined_participants_tsv, participants_df], ignore_index=True)
+                combined_participants_tsv = pd.concat(
+                    [combined_participants_tsv, participants_df], ignore_index=True
+                )
             # if a subset of subjects are specified collect only those subjects in the install
             if meta.get("subject_ids", []) != []:
                 for id in meta["subject_ids"]:
@@ -182,16 +204,20 @@ def download_test_data(working_directory=TemporaryDirectory(), output_directory=
                     print(f)
                     # Get the file relative to the dataset path
                     result = dataset.get(dataset_path / f)
-                    print(result)
-                    if result[0].get("status") == "ok" or result[0].get("message") == "already present":
+                    if (
+                        result[0].get("status") == "ok"
+                        or result[0].get("message") == "already present"
+                    ):
                         # Then unlock it to make it writable
-                        api.unlock(path=str(dataset_path / f), dataset=str(dataset_path))
+                        api.unlock(
+                            path=str(dataset_path / f), dataset=str(dataset_path)
+                        )
                         source_file = dataset_path / f
                         relative_path = source_file.relative_to(dataset_path)
                         target_file = Path(output_directory) / relative_path
                         target_file.parent.mkdir(parents=True, exist_ok=True)
                         shutil.copy2(source_file, target_file)
-
+
         else:
             combined_subjects += b.get(return_type="id", target="subject")
             # Get all files first
@@ -200,31 +226,79 @@ def download_test_data(working_directory=TemporaryDirectory(), output_directory=
             shutil.copytree(dataset_path, output_directory)
 
         combined_subjects = [f"sub-{s}" for s in combined_subjects]
-
+
         # Filter participants DataFrame to keep only subjects in combined_subjects list
         combined_participants = combined_participants_tsv[
-            combined_participants_tsv['participant_id'].isin(combined_subjects)
+            combined_participants_tsv["participant_id"].isin(combined_subjects)
         ]
-
+
         print(combined_participants)
 
         # Only write files if a specific download path was provided
         dataset_desc_path = Path(output_directory) / "dataset_description.json"
         readme_path = Path(output_directory) / "README.md"
-
-        with open(dataset_desc_path, 'w') as f:
+
+        with open(dataset_desc_path, "w") as f:
             json.dump(create_dataset_description(), f, indent=4)
-
-        with open(readme_path, 'w') as f:
-            f.write(create_readme_content(pet_datasets, readme_template))
-        combined_participants.to_csv(Path(output_directory) / "participants.tsv", sep="\t", index=False)
 
+        with open(readme_path, "w") as f:
+            f.write(create_readme_content(pet_datasets, readme_template))
+        combined_participants.to_csv(
+            Path(output_directory) / "participants.tsv", sep="\t", index=False
+        )
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(prog="PETPrepTestDataCollector", description="Collects PET datasets from OpenNeuro.org and combines them into a single BIDS dataset using datalad and pandas",)
-    parser.add_argument("--working-directory", "-w", type=str, default=TemporaryDirectory(), help="Working directory for downloading and combining datasets, defaults to a temporary directory.")
-    parser.add_argument("--output-directory", "-o", type=str, default=os.getcwd(), help=f"Output directory of combined dataset, defaults where this script is called from, presently {os.getcwd}")
+    parser = argparse.ArgumentParser(
+        prog="PETPrepTestDataCollector",
+        description="Collects PET datasets from OpenNeuro.org and combines them into a single BIDS dataset using datalad and pandas",
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    parser.add_argument(
+        "--working-directory",
+        "-w",
+        type=str,
+        default=TemporaryDirectory(),
+        help="Working directory for downloading and combining datasets, defaults to a temporary directory.",
+    )
+    parser.add_argument(
+        "--output-directory",
+        "-o",
+        type=str,
+        default=os.getcwd(),
+        help=f"Output directory of combined dataset, defaults where this script is called from, presently {os.getcwd()}",
+    )
+    parser.add_argument(
+        "--datasets-json",
+        "-j",
+        type=str,
+        default=None,
+        help="""Use a custom json of datasets along
+a subset of subjects can also be specified.
+The default is structured like the following:
+
+{
+    "ds005619": {
+        "version": "1.1.0",
+        "description": "[description]",
+        "subject_ids": ["sf02"]
+    },
+    "ds004868": {
+        "version": "1.0.4",
+        "description": "[description]",
+        "subject_ids": ["PSBB01"]
+    },
+    "ds004869": {
+        "version": "1.1.1",
+        "description": "[description]",
+        "subject_ids": ["01"]
+    }
+},""",
+    )
     args = parser.parse_args()
 
-    download_test_data(working_directory=args.working_directory, output_directory=args.output_directory)
+    download_test_data(
+        working_directory=args.working_directory,
+        output_directory=args.output_directory,
+        pet_datasets_json=args.datasets_json  # This will be None if not provided
+    )
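
A minimal sketch of exercising the new --datasets-json option added in this commit, mirroring the structure documented in the updated help text. The file name my_datasets.json and the output directory test_data are hypothetical, and the snippet assumes it is run from the repository root; the "[description]" placeholder is copied from the help text, not a real dataset description.

    # Sketch only: drives scripts/collect_test_data.py with a custom datasets JSON.
    import json
    import subprocess

    # One dataset from the default pet_datasets dict, in the format the
    # --datasets-json help text describes.
    custom_datasets = {
        "ds005619": {
            "version": "1.1.0",
            "description": "[description]",
            "subject_ids": ["sf02"],
        }
    }

    # Write the custom datasets file (hypothetical name).
    with open("my_datasets.json", "w") as f:
        json.dump(custom_datasets, f, indent=4)

    # Equivalent to: python scripts/collect_test_data.py -j my_datasets.json -o test_data
    subprocess.run(
        [
            "python",
            "scripts/collect_test_data.py",
            "--datasets-json",
            "my_datasets.json",
            "--output-directory",
            "test_data",  # hypothetical output directory
        ],
        check=True,
    )

Omitting --datasets-json leaves pet_datasets_json=None, in which case download_test_data falls back to the module-level pet_datasets dict, preserving the script's previous behavior.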
