Skip to content

Commit 5b07fea

Browse files
committed
add required_files filter
1 parent 1cecbb8 commit 5b07fea

File tree

3 files changed

+560
-2
lines changed

3 files changed

+560
-2
lines changed

babs/input_dataset.py

Lines changed: 218 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""This module is for input dataset(s)."""
22

3+
import fnmatch
34
import os
45
import re
56
import warnings
@@ -159,7 +160,7 @@ def generate_inclusion_dataframe(self, initial_inclu_df=None):
159160
if self.is_zipped:
160161
inclu_df = self._get_sub_ses_from_zipped_input()
161162
else:
162-
inclu_df = self._get_sub_ses_from_nonzipped_input()
163+
inclu_df = self.get_sub_ses_from_nonzipped_input()
163164

164165
if inclu_df.empty:
165166
if self.processing_level == 'session':
@@ -168,6 +169,15 @@ def generate_inclusion_dataframe(self, initial_inclu_df=None):
168169
columns = ['job_id', 'task_id', 'sub_id', 'has_results']
169170
return pd.DataFrame(columns=columns)
170171

172+
# Filter based on required_files if specified
173+
if self.required_files:
174+
if self._is_input_dataset:
175+
print(
176+
f'Filtering subjects (sessions) based on required_files '
177+
f'in dataset {self.name}...'
178+
)
179+
inclu_df = self.filter_by_required_files(inclu_df)
180+
171181
return inclu_df
172182

173183
def _get_sub_ses_from_zipped_input(self):
@@ -208,7 +218,7 @@ def _get_sub_ses_from_zipped_input(self):
208218

209219
return pd.DataFrame(found_sub_ses)
210220

211-
def _get_sub_ses_from_nonzipped_input(self):
221+
def get_sub_ses_from_nonzipped_input(self):
212222
"""Find the subjects (and sessions) available as directories in the input dataset.
213223
No validation is done on the directories.
214224
@@ -254,6 +264,212 @@ def _get_sub_ses_from_nonzipped_input(self):
254264

255265
return df
256266

267+
def filter_by_required_files(self, inclu_df):
    """
    Restrict an inclusion dataframe to subjects/sessions that have all required files.

    Parameters
    ----------
    inclu_df: pandas DataFrame
        DataFrame with columns 'sub_id' and optionally 'ses_id'

    Returns
    -------
    filtered_df: pandas DataFrame
        Filtered DataFrame containing only subjects/sessions with all required files
    """
    # Nothing to do for an empty frame or when no requirements are configured.
    if inclu_df.empty or not self.required_files:
        return inclu_df

    # One boolean per row: does this subject (and session, if present)
    # satisfy every required-file pattern?
    keep_mask = [
        self.has_required_files(row['sub_id'], row.get('ses_id', None))
        for _, row in inclu_df.iterrows()
    ]

    filtered_df = inclu_df[keep_mask].copy()

    # Report how many rows were dropped, if any.
    excluded_count = len(inclu_df) - len(filtered_df)
    if excluded_count > 0:
        print(
            f'Excluded {excluded_count} subject(s)/session(s) '
            f'that do not have all required files.'
        )

    return filtered_df
310+
def has_required_files(self, sub_id, ses_id=None):
    """
    Check if a subject/session has all required files.

    Parameters
    ----------
    sub_id: str
        Subject ID (e.g., 'sub-01')
    ses_id: str or None
        Session ID (e.g., 'ses-01') or None for subject-level processing

    Returns
    -------
    bool
        True if all required files are present, False otherwise
    """
    # With no requirements configured, every subject/session passes.
    if not self.required_files:
        return True

    # Dispatch on dataset layout: zip archives vs. plain directories.
    checker = (
        self.check_required_files_in_zip
        if self.is_zipped
        else self.check_required_files_in_dir
    )
    return checker(sub_id, ses_id)
334+
def check_required_files_in_dir(self, sub_id, ses_id=None):
    """
    Check if required files exist in the unzipped dataset directory.

    Parameters
    ----------
    sub_id: str
        Subject ID (e.g., 'sub-01')
    ses_id: str or None
        Session ID (e.g., 'ses-01') or None for subject-level processing

    Returns
    -------
    bool
        True if all required files are present, False otherwise
    """
    # Base directory for this subject (and session, when given):
    # <analysis_path>/<subject_dir_root>/<sub_id>[/<ses_id>]
    path_parts = [
        self.babs_project_analysis_path,
        self.unzipped_path_containing_subject_dirs,
        sub_id,
    ]
    if ses_id is not None:
        path_parts.append(ses_id)
    base_path = os.path.join(*path_parts)

    # Every pattern (relative to the subject/session directory,
    # e.g. "func/*_bold.nii*" or "anat/*_T1w.nii*") must match at
    # least one file on disk.
    return all(
        glob(os.path.join(base_path, pattern)) for pattern in self.required_files
    )
378+
def check_required_files_in_zip(self, sub_id, ses_id=None):
    """
    Check if required files exist in the zipped dataset.

    The subject's (or session's) zip archive is located by glob, fetched
    via ``datalad get``, scanned for the required-file patterns, and then
    dropped again via ``datalad drop`` in a ``finally`` block so the fetch
    does not accumulate local copies.

    Parameters
    ----------
    sub_id: str
        Subject ID (e.g., 'sub-01')
    ses_id: str or None
        Session ID (e.g., 'ses-01') or None for subject-level processing

    Returns
    -------
    bool
        True if all required files are present, False otherwise.
        Also returns False when the zip cannot be located, fetched,
        or read (a warning is printed in those cases).
    """
    # Find the zip file for this subject/session.
    # Expected zip name: <sub>[_<ses>]_<dataset-name>*.zip
    zip_name = self.name if self._is_input_dataset else ''
    if ses_id is not None:
        zip_pattern = f'{sub_id}_{ses_id}_{zip_name}*.zip'
    else:
        zip_pattern = f'{sub_id}_{zip_name}*.zip'

    zip_files = glob(os.path.join(self.babs_project_analysis_path, zip_pattern))

    # No matching archive -> the subject/session cannot satisfy the requirements.
    if not zip_files:
        return False

    # Use the first matching zip file (should only be one per subject/session).
    zip_file = zip_files[0]

    # Fetch the zip content from datalad; the annexed file may be a
    # broken symlink until `get` succeeds. NOTE(review): this may
    # download the full archive over the network — confirm acceptable.
    try:
        dlapi.get(path=zip_file, dataset=self.babs_project_analysis_path)
    except Exception as e:
        # If we can't get the file, assume it doesn't exist.
        print(f'Warning: Could not get zip file {zip_file} from datalad: {e}')
        return False

    try:
        # Check each required file pattern against the archive listing.
        with zipfile.ZipFile(zip_file) as zf:
            zip_contents = zf.namelist()

            # Determine the base path within the zip file.
            # For zipped datasets, there's usually a root directory (e.g., dataset name)
            # followed by subject/session directories.
            if ses_id is not None:
                # Session-level: dataset_name/sub-01/ses-01/
                base_prefix = f'{self.name}/{sub_id}/{ses_id}/'
            else:
                # Subject-level: dataset_name/sub-01/
                base_prefix = f'{self.name}/{sub_id}/'

            # Every pattern must match at least one archive member.
            for pattern in self.required_files:
                # Pattern is relative to the subject/session directory,
                # e.g., "func/*_bold.nii*" or "anat/*_T1w.nii*".
                search_pattern = base_prefix + pattern

                # Use fnmatch to match glob-style patterns.
                # NOTE(review): fnmatch's '*' also matches '/' (unlike
                # shell glob), so a pattern can match files in deeper
                # subdirectories — confirm this is intended.
                matches = [
                    name for name in zip_contents if fnmatch.fnmatch(name, search_pattern)
                ]

                # Fallback: try without the dataset-name root (in case the
                # zip was built with subject dirs at the archive top level).
                if not matches:
                    if ses_id is not None:
                        alt_prefix = f'{sub_id}/{ses_id}/'
                    else:
                        alt_prefix = f'{sub_id}/'
                    alt_pattern = alt_prefix + pattern
                    matches = [
                        name for name in zip_contents if fnmatch.fnmatch(name, alt_pattern)
                    ]

                # First unmatched pattern -> fail fast (finally still drops the zip).
                if not matches:
                    return False

            return True
    except Exception as e:
        # If there's an error reading the zip (corrupt/not a zip), assume
        # the files don't exist rather than aborting the whole filter.
        print(f'Warning: Error reading zip file {zip_file} to check required files: {e}')
        return False
    finally:
        # Clean up: drop the zip file content from datalad again.
        # Errors here are printed but swallowed so cleanup never masks
        # the actual True/False result (or the exception above).
        try:
            dlapi.drop(path=zip_file, dataset=self.babs_project_analysis_path)
        except Exception as e:  # noqa: S110
            print(
                f'Warning: Could not drop zip file {zip_file} from datalad during cleanup: {e}'
            )
257473
def as_dict(self):
258474
"""Return the input dataset as a dictionary."""
259475
# Ensure unzipped_path_containing_subject_dirs is set correctly

babs/input_datasets.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,14 @@ def generate_inclusion_dataframe(self):
122122
if self.initial_inclu_df is not None:
123123
print('Using the subjects (sessions) provided in the initial inclusion list.')
124124
inclu_df = self.initial_inclu_df
125+
# Filter the initial inclusion list based on required_files for each dataset
126+
for dataset in self._datasets:
127+
if dataset.required_files:
128+
print(
129+
f'Filtering initial inclusion list based on required_files '
130+
f'in dataset {dataset.name}...'
131+
)
132+
inclu_df = dataset.filter_by_required_files(inclu_df)
125133
else:
126134
initial_inclusion_dfs = [
127135
dataset.generate_inclusion_dataframe() for dataset in self._datasets

0 commit comments

Comments
 (0)