
Commit 36f4b4e

enh/rf: add option to fetch derivatives (if available), light refactoring

1 parent 589fa49 · commit 36f4b4e

1 file changed: scripts/collect_test_data.py (69 additions, 39 deletions)
@@ -128,7 +128,7 @@ def create_readme_content(pet_datasets, readme_template):
     return readme_template.format(dataset_list=dataset_list)
 
 
-pet_datasets = {
+DEFAULT_PET_DATASETS = {
     'ds005619': {
         'version': '1.1.0',
         'description': '[18F]SF51, a Novel 18F-labeled PET Radioligand for '
@@ -150,22 +150,26 @@ def create_readme_content(pet_datasets, readme_template):
     },
 }
 
-openneuro_template_string = 'https://github.com/OpenNeuroDatasets/{DATASET_ID}.git'
+OPENNEURO_TEMPLATE_STRING = 'https://github.com/OpenNeuroDatasets/{DATASET_ID}.git'
 
 
 def download_test_data(
-    working_directory: TemporaryDirectory | None = None,
-    output_directory: Path | str = '',
-    pet_datasets_json=None, # Default to None, not the dict
+    working_directory: Path | None = None,
+    output_directory: Path | None = None,
+    pet_datasets_json: dict = None, # Default to None, not the dict
+    derivatives: list[str] | None = None,
 ):
     # Use default datasets if no JSON file provided
     if pet_datasets_json is None:
-        datasets_to_use = pet_datasets # Use the default defined at module level
+        datasets_to_use = DEFAULT_PET_DATASETS # Use the default defined at module level
     else:
         # Load from JSON file
         with open(pet_datasets_json) as infile:
             datasets_to_use = json.load(infile)
 
+    if derivatives is None:
+        derivatives = []
+
     if not working_directory:
         working_directory = TemporaryDirectory()
 
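For orientation, the updated signature can be exercised programmatically like this (a minimal sketch; 'petprep' is a hypothetical derivative name, and passing pet_datasets_json=None falls back to DEFAULT_PET_DATASETS as shown in the hunk above):

    from pathlib import Path

    download_test_data(
        output_directory=Path('combined_test_data'),
        pet_datasets_json=None,       # use DEFAULT_PET_DATASETS
        derivatives=['petprep'],      # new in this commit; treated as [] when omitted
    )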

@@ -184,18 +188,25 @@ def download_test_data(
             dataset_path.rmdir()
         dataset = api.install(
             path=dataset_path,
-            source=openneuro_template_string.format(DATASET_ID=dataset_id),
+            source=OPENNEURO_TEMPLATE_STRING.format(DATASET_ID=dataset_id),
         )
         # api.unlock(str(dataset_path))
         dataset.unlock()
 
         # see how pybids handles this datalad nonsense
         b = bids.layout.BIDSLayout(
-            dataset_path, derivatives=False
+            dataset_path,
+            derivatives=False,
+            validate=False,
         ) # when petderivatives are a thing, we'll think about using pybids to get them
 
         # Access participants.tsv
-        participants_files = b.get(suffix='participants', extension='.tsv', return_type='file')
+        participants_files = b.get(
+            suffix='participants',
+            extension='.tsv',
+            return_type='file',
+            scope='raw',
+        )
         if participants_files:
             participants_file = participants_files[0]
 

@@ -207,33 +218,47 @@ def download_test_data(
                 [combined_participants_tsv, participants_df], ignore_index=True
             )
         # if a subset of subjects are specified collect only those subjects in the install
-        if meta.get('subject_ids', []) != []:
-            for _id in meta['subject_ids']:
-                combined_subjects.append(_id)
+        if meta.get('subject_ids', []):
+            for sid in meta['subject_ids']:
+                combined_subjects.append(sid)
                 # Get the entire subject directory content including git-annex files
-                subject_dir = dataset_path / f'sub-{_id}'
-                if subject_dir.exists():
-                    # First, get all content in the subject directory
-                    # (this retrieves git-annex files)
-                    dataset.get(str(subject_dir))
-
-                    # Then collect all files after they've been retrieved
-                    all_files = []
-                    for file_path in subject_dir.rglob('*'):
-                        if file_path.is_file():
-                            relative_path = file_path.relative_to(dataset_path)
+                subject_dir = dataset_path / f'sub-{sid}'
+                if not subject_dir.exists():
+                    continue
+                # First, get all content in the subject directory
+                # (this retrieves git-annex files)
+                dataset.get(str(subject_dir))
+
+                # Then collect all files after they've been retrieved
+                all_files = []
+                for file_path in subject_dir.rglob('*'):
+                    if file_path.is_file():
+                        relative_path = file_path.relative_to(dataset_path)
+                        all_files.append(str(relative_path))
+
+                for deriv in derivatives:
+                    print(f'Getting derivative: {deriv}/sub-{sid}')
+                    deriv_dir = dataset_path / 'derivatives' / deriv / f'sub-{sid}'
+                    try:
+                        dataset.get(str(deriv_dir))
+                    except Exception as e: # noqa: BLE001
+                        print(f'Error getting derivative {deriv}/sub-{sid}: {e}')
+                        continue
+                    for dv in deriv_dir.rglob('*'):
+                        if dv.is_file():
+                            relative_path = dv.relative_to(dataset_path)
                             all_files.append(str(relative_path))
 
-                    # Copy all files to output directory
-                    for f in all_files:
-                        print(f)
-                        # Unlock the file to make it writable
-                        api.unlock(path=str(dataset_path / f), dataset=str(dataset_path))
-                        source_file = dataset_path / f
-                        relative_path = source_file.relative_to(dataset_path)
-                        target_file = Path(output_directory) / relative_path
-                        target_file.parent.mkdir(parents=True, exist_ok=True)
-                        shutil.copy2(source_file, target_file)
+                # Copy all files to output directory
+                for f in all_files:
+                    print(f)
+                    # Unlock the file to make it writable
+                    api.unlock(path=str(dataset_path / f), dataset=str(dataset_path))
+                    source_file = dataset_path / f
+                    relative_path = source_file.relative_to(dataset_path)
+                    target_file = Path(output_directory) / relative_path
+                    target_file.parent.mkdir(parents=True, exist_ok=True)
+                    shutil.copy2(source_file, target_file)
 
         else:
            combined_subjects += b.get(return_type='id', target='subject')
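For reference, the derivative lookup in the hunk above assumes the conventional BIDS layout inside each installed dataset; the subject ID and pipeline name below are placeholders:

    ds005619/
    ├── sub-01/                   # raw subject data, fetched first
    └── derivatives/
        └── <pipeline>/
            └── sub-01/           # fetched per subject; failures are printed and skipped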
@@ -257,7 +282,7 @@ def download_test_data(
         json.dump(create_dataset_description(), f, indent=4)
 
     with open(readme_path, 'w') as f:
-        f.write(create_readme_content(pet_datasets, readme_template))
+        f.write(create_readme_content(datasets_to_use, readme_template))
     combined_participants.to_csv(
         Path(output_directory) / 'participants.tsv', sep='\t', index=False
     )
@@ -273,19 +298,23 @@ def download_test_data(
     parser.add_argument(
         '--working-directory',
         '-w',
-        type=str,
         default=TemporaryDirectory(),
         help='Working directory for downloading and combining datasets,'
         'defaults to a temporary directory.',
     )
     parser.add_argument(
         '--output-directory',
         '-o',
-        type=str,
-        default=os.getcwd(),
+        default=Path.cwd(),
         help='Output directory of combined dataset,'
-        'defaults where this script is called from, presently {os.getcwd()}',
-        required=True,
+        'defaults where this script is called from, presently current working directory.',
+    )
+    parser.add_argument(
+        '--derivatives',
+        '-d',
+        nargs='+',
+        type=str,
+        help='Additional derivatives to include alongside the BIDS data.',
     )
     parser.add_argument(
         '--datasets-json',
@@ -320,4 +349,5 @@ def download_test_data(
         working_directory=args.working_directory,
         output_directory=args.output_directory,
         pet_datasets_json=args.datasets_json, # This will be None if not provided
+        derivatives=args.derivatives,
     )
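Taken together, a command-line invocation using the new flag might look like the following (a sketch; 'petprep' and 'petprep_hmc' are placeholder pipeline names, only fetched if the dataset actually ships those derivatives):

    python scripts/collect_test_data.py \
        --output-directory ./combined_test_data \
        --derivatives petprep petprep_hmc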
