@@ -2,6 +2,7 @@
 from tempfile import TemporaryDirectory
 from pathlib import Path
 from os.path import join
+import pprint
 import shutil
 import subprocess
 import bids
@@ -75,6 +76,7 @@
 *This is a test dataset compiled for software development purposes. Please refer to the original datasets for research use.*
 """
 
+
 # Create dataset_description.json content
 def create_dataset_description():
     """Create BIDS dataset_description.json content."""
@@ -83,12 +85,7 @@ def create_dataset_description():
         "BIDSVersion": "1.7.0",
         "DatasetType": "raw",
         "License": "CC0",
-        "Authors": [
-            "datalad",
-            "python",
-            "make",
-            "openneuro"
-        ],
+        "Authors": ["datalad", "python", "make", "openneuro"],
         "HowToAcknowledge": "Please cite the original datasets and PETPrep software.",
         "Funding": [
             "This test data collection was created for PETPrep development and testing purposes"
@@ -98,72 +95,97 @@ def create_dataset_description():
         ],
         "ReferencesAndLinks": [
             "https://github.com/nipreps/petprep",
-            "https://openneuro.org"
+            "https://openneuro.org",
         ],
         "DatasetDOI": "10.18112/openneuro.ds000000.v1.0.0",
-        "HEDVersion": "8.0.0"
+        "HEDVersion": "8.0.0",
     }
 
+
 # Create README.md content
 def create_readme_content(pet_datasets, readme_template):
     """Create README content dynamically based on the datasets."""
-
+
     # Generate dataset list dynamically
     dataset_list = ""
     for i, (dataset_id, meta) in enumerate(pet_datasets.items(), 1):
         dataset_list += f"{i}. **{dataset_id}**: {meta['description']}\n"
-
+
     return readme_template.format(dataset_list=dataset_list)
 
 
 pet_datasets = {
     "ds005619": {
         "version": "1.1.0",
-        "description": "[18F]SF51, a Novel 18F-labeled PET Radioligand for Translocator Protein 18kDa (TSPO) in Brain, Works Well in Monkeys but Fails in Humans",
-        "subject_ids": ["sf02"]
+        "description": "[18F]SF51, a Novel 18F-labeled PET Radioligand for "
+        "Translocator Protein 18kDa (TSPO) in Brain, Works Well "
+        "in Monkeys but Fails in Humans",
+        "subject_ids": ["sf02"],
     },
     "ds004868": {
         "version": "1.0.4",
-        "description": "[11C]PS13 demonstrates pharmacologically selective and substantial binding to cyclooxygenase-1 (COX-1) in the human brain",
-        "subject_ids": ["PSBB01"]
+        "description": "[11C]PS13 demonstrates pharmacologically selective and "
+        "substantial binding to cyclooxygenase-1 (COX-1) in the "
+        "human brain",
+        "subject_ids": ["PSBB01"],
     },
     "ds004869": {
         "version": "1.1.1",
         "description": "https://openneuro.org/datasets/ds004869/versions/1.1.1",
-        "subject_ids": ["01"]
+        "subject_ids": ["01"],
     },
 }
 
 openneuro_template_string = "https://github.com/OpenNeuroDatasets/{DATASET_ID}.git"
 
 
-
-def download_test_data(working_directory=TemporaryDirectory(), output_directory=os.getcwd()):
+def download_test_data(
+    working_directory=TemporaryDirectory(),
+    output_directory=os.getcwd(),
+    pet_datasets_json=None,  # Default to None, not the dict
+):
+    # Use default datasets if no JSON file provided
+    if pet_datasets_json is None:
+        datasets_to_use = pet_datasets  # Use the default defined at module level
+    else:
+        # Load from JSON file
+        with open(pet_datasets_json, "r") as infile:
+            datasets_to_use = json.load(infile)
+
     with working_directory as data_path:
         combined_participants_tsv = pd.DataFrame()
         combined_subjects = []
         combined_dataset_files = []
-        for dataset_id, meta in pet_datasets.items():
+        for dataset_id, meta in datasets_to_use.items():  # Use datasets_to_use instead of pet_datasets
             dataset_path = Path(data_path) / Path(dataset_id)
             if dataset_path.is_dir() and len(sys.argv) <= 1:
                 dataset_path.rmdir()
-            dataset = api.install(path=dataset_path, source=openneuro_template_string.format(DATASET_ID=dataset_id))
-            #api.unlock(str(dataset_path))
+            dataset = api.install(
+                path=dataset_path,
+                source=openneuro_template_string.format(DATASET_ID=dataset_id),
+            )
+            # api.unlock(str(dataset_path))
             dataset.unlock()
 
             # see how pybids handles this datalad nonsense
-            b = bids.layout.BIDSLayout(dataset_path, derivatives=False)  # when petderivatives are a thing, we'll think about using pybids to get them
-
+            b = bids.layout.BIDSLayout(
+                dataset_path, derivatives=False
+            )  # when petderivatives are a thing, we'll think about using pybids to get them
+
             # Access participants.tsv
-            participants_files = b.get(suffix="participants", extension=".tsv", return_type="file")
+            participants_files = b.get(
+                suffix="participants", extension=".tsv", return_type="file"
+            )
             if participants_files:
                 participants_file = participants_files[0]
-
+
                 # Read participants.tsv as pandas DataFrame
                 participants_df = pd.read_csv(participants_file, sep="\t")
-
+
                 # Combine with overall participants DataFrame
-                combined_participants_tsv = pd.concat([combined_participants_tsv, participants_df], ignore_index=True)
+                combined_participants_tsv = pd.concat(
+                    [combined_participants_tsv, participants_df], ignore_index=True
+                )
             # if a subset of subjects are specified collect only those subjects in the install
             if meta.get("subject_ids", []) != []:
                 for id in meta["subject_ids"]:
@@ -182,16 +204,20 @@ def download_test_data(working_directory=TemporaryDirectory(), output_directory=
                     print(f)
                     # Get the file relative to the dataset path
                     result = dataset.get(dataset_path / f)
-                    print(result)
-                    if result[0].get("status") == "ok" or result[0].get("message") == "already present":
+                    if (
+                        result[0].get("status") == "ok"
+                        or result[0].get("message") == "already present"
+                    ):
                         # Then unlock it to make it writable
-                        api.unlock(path=str(dataset_path / f), dataset=str(dataset_path))
+                        api.unlock(
+                            path=str(dataset_path / f), dataset=str(dataset_path)
+                        )
                         source_file = dataset_path / f
                         relative_path = source_file.relative_to(dataset_path)
                         target_file = Path(output_directory) / relative_path
                         target_file.parent.mkdir(parents=True, exist_ok=True)
                         shutil.copy2(source_file, target_file)
-
+
             else:
                 combined_subjects += b.get(return_type="id", target="subject")
                 # Get all files first
@@ -200,31 +226,79 @@ def download_test_data(working_directory=TemporaryDirectory(), output_directory=
                 shutil.copytree(dataset_path, output_directory)
 
         combined_subjects = [f"sub-{s}" for s in combined_subjects]
-
+
         # Filter participants DataFrame to keep only subjects in combined_subjects list
         combined_participants = combined_participants_tsv[
-            combined_participants_tsv['participant_id'].isin(combined_subjects)
+            combined_participants_tsv["participant_id"].isin(combined_subjects)
         ]
-
+
         print(combined_participants)
 
         # Only write files if a specific download path was provided
         dataset_desc_path = Path(output_directory) / "dataset_description.json"
         readme_path = Path(output_directory) / "README.md"
-
-        with open(dataset_desc_path, 'w') as f:
+
+        with open(dataset_desc_path, "w") as f:
             json.dump(create_dataset_description(), f, indent=4)
-
-        with open(readme_path, 'w') as f:
-            f.write(create_readme_content(pet_datasets, readme_template))
-        combined_participants.to_csv(Path(output_directory) / "participants.tsv", sep="\t", index=False)
 
+        with open(readme_path, "w") as f:
+            f.write(create_readme_content(datasets_to_use, readme_template))
+        combined_participants.to_csv(
+            Path(output_directory) / "participants.tsv", sep="\t", index=False
+        )
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(prog="PETPrepTestDataCollector", description="Collects PET datasets from OpenNeuro.org and combines them into a single BIDS dataset using datalad and pandas",)
-    parser.add_argument("--working-directory", "-w", type=str, default=TemporaryDirectory(), help="Working directory for downloading and combining datasets, defaults to a temporary directory.")
-    parser.add_argument("--output-directory", "-o", type=str, default=os.getcwd(), help=f"Output directory of combined dataset, defaults where this script is called from, presently {os.getcwd}")
+    parser = argparse.ArgumentParser(
+        prog="PETPrepTestDataCollector",
+        description="Collects PET datasets from OpenNeuro.org and combines them into a single BIDS dataset using datalad and pandas",
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    parser.add_argument(
+        "--working-directory",
+        "-w",
+        type=str,
+        default=TemporaryDirectory(),
+        help="Working directory for downloading and combining datasets, defaults to a temporary directory.",
+    )
+    parser.add_argument(
+        "--output-directory",
+        "-o",
+        type=str,
+        default=os.getcwd(),
+        help=f"Output directory of combined dataset, defaults to where this script is called from, presently {os.getcwd()}",
+    )
+    parser.add_argument(
+        "--datasets-json",
+        "-j",
+        type=str,
+        default=None,
+        help="""Use a custom JSON of datasets; a subset of subjects
+can also be specified. The default is structured like the following:
+
+{
+    "ds005619": {
+        "version": "1.1.0",
+        "description": "[description]",
+        "subject_ids": ["sf02"]
+    },
+    "ds004868": {
+        "version": "1.0.4",
+        "description": "[description]",
+        "subject_ids": ["PSBB01"]
+    },
+    "ds004869": {
+        "version": "1.1.1",
+        "description": "[description]",
+        "subject_ids": ["01"]
+    }
+}""",
+    )
     args = parser.parse_args()
 
-    download_test_data(working_directory=args.working_directory, output_directory=args.output_directory)
+    download_test_data(
+        working_directory=args.working_directory,
+        output_directory=args.output_directory,
+        pet_datasets_json=args.datasets_json,  # This will be None if not provided
+    )
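
For quick reference, a minimal sketch of how the new `--datasets-json` option could be exercised end to end. The script filename (`collect_test_data.py`) and the output directory name are assumptions for illustration; the dataset entry mirrors the ds004869 record from the module-level defaults.

```python
import json
import subprocess

# Hypothetical custom spec following the structure documented in the help text;
# the ds004869 entry is copied from the script's module-level defaults.
custom_datasets = {
    "ds004869": {
        "version": "1.1.1",
        "description": "https://openneuro.org/datasets/ds004869/versions/1.1.1",
        "subject_ids": ["01"],
    }
}

# Write the spec to disk, then point the collector at it.
with open("my_datasets.json", "w") as f:
    json.dump(custom_datasets, f, indent=4)

# Invoke the collector with the custom spec (script name assumed).
subprocess.run(
    [
        "python",
        "collect_test_data.py",
        "--output-directory", "combined_pet_bids",
        "--datasets-json", "my_datasets.json",
    ],
    check=True,
)
```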