11"""This module is for input dataset(s)."""
22
3+ import fnmatch
34import os
45import re
56import warnings
@@ -159,7 +160,7 @@ def generate_inclusion_dataframe(self, initial_inclu_df=None):
159160 if self .is_zipped :
160161 inclu_df = self ._get_sub_ses_from_zipped_input ()
161162 else :
162- inclu_df = self ._get_sub_ses_from_nonzipped_input ()
163+ inclu_df = self .get_sub_ses_from_nonzipped_input ()
163164
164165 if inclu_df .empty :
165166 if self .processing_level == 'session' :
@@ -168,6 +169,15 @@ def generate_inclusion_dataframe(self, initial_inclu_df=None):
168169 columns = ['job_id' , 'task_id' , 'sub_id' , 'has_results' ]
169170 return pd .DataFrame (columns = columns )
170171
172+ # Filter based on required_files if specified
173+ if self .required_files :
174+ if self ._is_input_dataset :
175+ print (
176+ f'Filtering subjects (sessions) based on required_files '
177+ f'in dataset { self .name } ...'
178+ )
179+ inclu_df = self .filter_by_required_files (inclu_df )
180+
171181 return inclu_df
172182
173183 def _get_sub_ses_from_zipped_input (self ):
@@ -208,7 +218,7 @@ def _get_sub_ses_from_zipped_input(self):
208218
209219 return pd .DataFrame (found_sub_ses )
210220
211- def _get_sub_ses_from_nonzipped_input (self ):
221+ def get_sub_ses_from_nonzipped_input (self ):
212222 """Find the subjects (and sessions) available as directories in the input dataset.
213223 No validation is done on the directories.
214224
@@ -254,6 +264,212 @@ def _get_sub_ses_from_nonzipped_input(self):
254264
255265 return df
256266
267+ def filter_by_required_files (self , inclu_df ):
268+ """
269+ Filter inclusion dataframe to only include subjects/sessions that have all required files.
270+
271+ Parameters
272+ ----------
273+ inclu_df: pandas DataFrame
274+ DataFrame with columns 'sub_id' and optionally 'ses_id'
275+
276+ Returns
277+ -------
278+ filtered_df: pandas DataFrame
279+ Filtered DataFrame containing only subjects/sessions with all required files
280+ """
281+ if inclu_df .empty :
282+ return inclu_df
283+
284+ if not self .required_files :
285+ return inclu_df
286+
287+ # Track which rows to keep
288+ keep_mask = []
289+
290+ for _ , row in inclu_df .iterrows ():
291+ sub_id = row ['sub_id' ]
292+ ses_id = row .get ('ses_id' , None )
293+
294+ if self .has_required_files (sub_id , ses_id ):
295+ keep_mask .append (True )
296+ else :
297+ keep_mask .append (False )
298+
299+ filtered_df = inclu_df [keep_mask ].copy ()
300+
301+ if len (filtered_df ) < len (inclu_df ):
302+ excluded_count = len (inclu_df ) - len (filtered_df )
303+ print (
304+ f'Excluded { excluded_count } subject(s)/session(s) '
305+ f'that do not have all required files.'
306+ )
307+
308+ return filtered_df
309+
310+ def has_required_files (self , sub_id , ses_id = None ):
311+ """
312+ Check if a subject/session has all required files.
313+
314+ Parameters
315+ ----------
316+ sub_id: str
317+ Subject ID (e.g., 'sub-01')
318+ ses_id: str or None
319+ Session ID (e.g., 'ses-01') or None for subject-level processing
320+
321+ Returns
322+ -------
323+ bool
324+ True if all required files are present, False otherwise
325+ """
326+ if not self .required_files :
327+ return True
328+
329+ if self .is_zipped :
330+ return self .check_required_files_in_zip (sub_id , ses_id )
331+ else :
332+ return self .check_required_files_in_dir (sub_id , ses_id )
333+
334+ def check_required_files_in_dir (self , sub_id , ses_id = None ):
335+ """
336+ Check if required files exist in the unzipped dataset directory.
337+
338+ Parameters
339+ ----------
340+ sub_id: str
341+ Subject ID (e.g., 'sub-01')
342+ ses_id: str or None
343+ Session ID (e.g., 'ses-01') or None for subject-level processing
344+
345+ Returns
346+ -------
347+ bool
348+ True if all required files are present, False otherwise
349+ """
350+ # Build the base path to the subject/session directory
351+ if ses_id is not None :
352+ # Session-level: path/to/dataset/sub-01/ses-01
353+ base_path = os .path .join (
354+ self .babs_project_analysis_path ,
355+ self .unzipped_path_containing_subject_dirs ,
356+ sub_id ,
357+ ses_id ,
358+ )
359+ else :
360+ # Subject-level: path/to/dataset/sub-01
361+ base_path = os .path .join (
362+ self .babs_project_analysis_path ,
363+ self .unzipped_path_containing_subject_dirs ,
364+ sub_id ,
365+ )
366+
367+ # Check each required file pattern
368+ for pattern in self .required_files :
369+ # Pattern is relative to the subject/session directory
370+ # e.g., "func/*_bold.nii*" or "anat/*_T1w.nii*"
371+ search_path = os .path .join (base_path , pattern )
372+ matches = glob (search_path )
373+ if not matches :
374+ return False
375+
376+ return True
377+
    def check_required_files_in_zip(self, sub_id, ses_id=None):
        """
        Check if required files exist in the zipped dataset.

        The subject's (or session's) zip file is fetched via datalad, its
        member list is matched against each pattern in ``self.required_files``
        with ``fnmatch``, and the zip content is dropped again afterwards.

        Parameters
        ----------
        sub_id: str
            Subject ID (e.g., 'sub-01')
        ses_id: str or None
            Session ID (e.g., 'ses-01') or None for subject-level processing

        Returns
        -------
        bool
            True if all required files are present, False otherwise
        """
        # Zip files are expected to be named '<sub>[_<ses>]_<name>*.zip';
        # when this is not an input dataset the name component is empty.
        zip_name = self.name if self._is_input_dataset else ''
        if ses_id is not None:
            zip_pattern = f'{sub_id}_{ses_id}_{zip_name}*.zip'
        else:
            zip_pattern = f'{sub_id}_{zip_name}*.zip'

        zip_files = glob(os.path.join(self.babs_project_analysis_path, zip_pattern))

        if not zip_files:
            return False

        # Use the first matching zip file (should only be one per subject/session)
        zip_file = zip_files[0]

        # Get the zip file from datalad if needed (annexed content may not be
        # locally present until fetched).
        try:
            dlapi.get(path=zip_file, dataset=self.babs_project_analysis_path)
        except Exception as e:
            # If we can't get the file, assume it doesn't exist
            print(f'Warning: Could not get zip file {zip_file} from datalad: {e}')
            return False

        try:
            # Check each required file pattern
            with zipfile.ZipFile(zip_file) as zf:
                zip_contents = zf.namelist()

                # Determine the base path within the zip file.
                # For zipped datasets, there's usually a root directory (e.g., dataset name)
                # followed by subject/session directories.
                if ses_id is not None:
                    # Session-level: dataset_name/sub-01/ses-01/
                    base_prefix = f'{self.name}/{sub_id}/{ses_id}/'
                else:
                    # Subject-level: dataset_name/sub-01/
                    base_prefix = f'{self.name}/{sub_id}/'

                # Check each required file pattern
                for pattern in self.required_files:
                    # Pattern is relative to the subject/session directory
                    # e.g., "func/*_bold.nii*" or "anat/*_T1w.nii*"
                    # We need to search within the zip contents
                    search_pattern = base_prefix + pattern

                    # Use fnmatch to match glob patterns
                    matches = [
                        name for name in zip_contents if fnmatch.fnmatch(name, search_pattern)
                    ]

                    # Also try without the dataset name prefix (in case zip structure is different)
                    if not matches:
                        if ses_id is not None:
                            alt_prefix = f'{sub_id}/{ses_id}/'
                        else:
                            alt_prefix = f'{sub_id}/'
                        alt_pattern = alt_prefix + pattern
                        matches = [
                            name for name in zip_contents if fnmatch.fnmatch(name, alt_pattern)
                        ]

                    # A single unmatched pattern disqualifies this subject/session.
                    if not matches:
                        return False

            return True
        except Exception as e:
            # If there's an error reading the zip, assume files don't exist
            print(f'Warning: Error reading zip file {zip_file} to check required files: {e}')
            return False
        finally:
            # Clean up: drop the zip file from datalad
            # Ignore errors during cleanup to avoid masking the actual result
            # (this runs on every exit path above, including early returns).
            try:
                dlapi.drop(path=zip_file, dataset=self.babs_project_analysis_path)
            except Exception as e:  # noqa: S110
                print(
                    f'Warning: Could not drop zip file {zip_file} from datalad during cleanup: {e}'
                )
472+
257473 def as_dict (self ):
258474 """Return the input dataset as a dictionary."""
259475 # Ensure unzipped_path_containing_subject_dirs is set correctly
0 commit comments