|
| 1 | +import os.path as op |
| 2 | +import re |
| 3 | + |
| 4 | +import boto3 |
| 5 | + |
| 6 | +from .base import InputS3Keys |
| 7 | + |
| 8 | + |
def single_subject_s3_keys(subject, site, raw_keys, derivative_keys):
    """Get the S3 keys for a single subject's input files

    Parameters
    ----------
    subject : string
        Subject ID on which to filter the s3 keys

    site : string
        Site ID from which to collect raw data

    raw_keys : sequence
        Sequence of raw data s3 keys to filter

    derivative_keys : sequence
        Sequence of derivative data s3 keys to filter

    Returns
    -------
    InputS3Keys namedtuple
        If all prerequisite s3 keys are present, return a namedtuple of
        s3 keys. Otherwise, use the default None values.
    """
    # Match the subject ID only as a full path component. A bare substring
    # test (`subject in k`) would let 'sub-1' also match keys belonging to
    # 'sub-10', 'sub-11', etc., silently mixing files across subjects.
    sub_component = '/' + subject + '/'

    # Get only the s3 keys corresponding to this subject
    sub_dwi_files = [k for k in raw_keys
                     if sub_component in k and '/dwi/' in k]
    sub_fmap_files = [k for k in raw_keys
                      if sub_component in k and '/fmap/' in k]
    sub_deriv_files = [k for k in derivative_keys if sub_component in k]

    # Get the dwi files, bvec files, and bval files.
    # 'TRACEW' / 'fMRI' exclusions filter out trace-weighted and
    # functional series that share the dwi/fmap folders.
    dwi = [f for f in sub_dwi_files
           if f.endswith('.nii.gz') and 'TRACEW' not in f]
    bvec = [f for f in sub_dwi_files if f.endswith('.bvec')]
    bval = [f for f in sub_dwi_files if f.endswith('.bval')]
    epi_nii = [f for f in sub_fmap_files if f.endswith('epi.nii.gz')
               and 'fMRI' not in f]
    epi_json = [f for f in sub_fmap_files if f.endswith('epi.json')
                and 'fMRI' not in f]
    t1w = [f for f in sub_deriv_files if f.endswith('/T1w.nii.gz')]
    freesurfer = [f for f in sub_deriv_files
                  if '/freesurfer/' in f]

    # Use truthiness of non-empty lists to verify that all
    # of the required prereq files exist in `s3_keys`
    # TODO: If some of the files are missing, look farther up in the directory
    # TODO: structure to see if there are files we should inherit
    if all([dwi, bval, bvec, epi_nii, epi_json, t1w, freesurfer]):
        return InputS3Keys(
            subject=subject,
            site=site,
            valid=True,
            s3_keys=dict(
                dwi=dwi,
                bvec=bvec,
                bval=bval,
                epi_nii=epi_nii,
                epi_json=epi_json,
                freesurfer=freesurfer,
                t1w=t1w,
            ),
        )
    else:
        return InputS3Keys(
            subject=subject,
            site=site,
            valid=False,
            s3_keys=None,
        )
| 76 | + |
| 77 | + |
def get_all_s3_keys(prefix, sites, bucket='fcp-indi'):
    """Collect per-subject input S3 keys for every site.

    Parameters
    ----------
    prefix : string
        S3 prefix designating the S3 "directory" in which to search.
        Do not include the site ID in the prefix.

    sites : sequence of strings
        Site IDs from which to collect raw data

    bucket : string
        AWS S3 bucket in which to search

    Returns
    -------
    dict
        A dictionary with keys corresponding to `sites` and values
        that are a list of `InputS3Keys` namedtuples

    Raises
    ------
    ValueError
        If a site has no objects under the given prefix.
    """
    s3 = boto3.client('s3')
    subjects = {}

    # Avoid duplicate trailing slash in prefix
    prefix = prefix.rstrip('/')

    # Compile once; this pattern is applied to every key of every site.
    subject_pattern = re.compile(r'/sub-[0-9a-zA-Z]*/')

    def get_subject_id(key):
        """Return the 'sub-XXX' path component of `key`, or None."""
        match = subject_pattern.search(key)
        if match is not None:
            return match.group().strip('/')
        return None

    for site in sites:
        site_prefix = prefix + '/' + site + '/'
        response = s3.list_objects_v2(
            Bucket=bucket,
            Prefix=site_prefix,
        )

        try:
            # `Contents` is absent (so .get() returns None and indexing
            # raises TypeError) when nothing matches the prefix.
            keys = [d['Key'] for d in response.get('Contents')]
        except TypeError:
            raise ValueError(
                'There are no subject files in the S3 bucket with prefix '
                '{pfix:s} and site {site:s}'.format(pfix=prefix, site=site)
            )

        # Page through remaining results; list_objects_v2 returns at most
        # 1000 keys per call.
        while response['IsTruncated']:
            response = s3.list_objects_v2(
                Bucket=bucket,
                Prefix=site_prefix,
                ContinuationToken=response['NextContinuationToken'],
            )
            # Default to [] so a continuation page without `Contents`
            # cannot raise an uncaught TypeError.
            keys += [d['Key'] for d in response.get('Contents', [])]

        derivative_keys = [
            k for k in keys
            if k.startswith(site_prefix + 'derivatives/sub-')
        ]

        raw_keys = [
            k for k in keys
            if k.startswith(site_prefix + 'sub-')
        ]

        subs_with_dwi = {
            get_subject_id(k) for k in raw_keys
            if '/dwi/' in k
        }

        subs_with_epi_nii = {
            get_subject_id(k) for k in raw_keys
            if (
                k.endswith('epi.nii.gz')
                and '/fmap/' in k
                and 'fMRI' not in k
            )
        }

        subs_with_epi_json = {
            get_subject_id(k) for k in raw_keys
            if (
                k.endswith('epi.json')
                and '/fmap/' in k
                and 'fMRI' not in k
            )
        }

        subs_with_freesurfer = {
            get_subject_id(k) for k in derivative_keys
            if '/freesurfer/' in k
        }

        subs_with_t1w = {
            get_subject_id(k) for k in derivative_keys
            if k.endswith('T1w.nii.gz')
        }

        # A subject is a candidate only if it has every prerequisite type
        valid_subjects = (
            subs_with_dwi
            & subs_with_epi_nii
            & subs_with_epi_json
            & subs_with_freesurfer
            & subs_with_t1w
        )
        # get_subject_id returns None for keys that lack a '/sub-XXX/'
        # component; drop it so we never filter on a None subject below.
        valid_subjects.discard(None)

        subject_s3_keys = [
            single_subject_s3_keys(s, site, raw_keys, derivative_keys)
            for s in valid_subjects
        ]

        # Keep only subjects that passed the per-subject validation too
        subjects[site] = [sub for sub in subject_s3_keys if sub.valid]

    return subjects
0 commit comments