|
| 1 | +import os.path as op |
| 2 | +import re |
| 3 | + |
| 4 | +import boto3 |
| 5 | + |
| 6 | +from .base import InputS3Keys |
| 7 | + |
| 8 | + |
def single_subject_s3_keys(subject, site, raw_keys, derivative_keys):
    """Get the S3 keys for a single subject's input files

    Parameters
    ----------
    subject : string
        Subject ID on which to filter the s3 keys

    site : string
        Site ID from which to collect raw data

    raw_keys : sequence
        Sequence of raw data s3 keys to filter

    derivative_keys : sequence
        Sequence of derivative data s3 keys to filter

    Returns
    -------
    InputS3Keys namedtuple
        If all prerequisite s3 keys are present, return a namedtuple of
        s3 keys. Otherwise, use the default None values.
    """
    # Match the subject ID only as a full path component. A bare substring
    # test (`subject in k`) would let 'sub-1' also match keys belonging to
    # 'sub-10', 'sub-11', etc., silently mixing files across subjects.
    sub_component = '/' + subject + '/'

    # Get only the s3 keys corresponding to this subject
    sub_dwi_files = [k for k in raw_keys
                     if sub_component in k and '/dwi/' in k]
    sub_fmap_files = [k for k in raw_keys
                      if sub_component in k and '/fmap/' in k]
    sub_deriv_files = [k for k in derivative_keys if sub_component in k]

    # Get the dwi files, bvec files, and bval files.
    # 'TRACEW' / 'fMRI' exclusions filter out trace-weighted and
    # functional series that share the dwi/fmap folders.
    dwi = [f for f in sub_dwi_files
           if f.endswith('.nii.gz') and 'TRACEW' not in f]
    bvec = [f for f in sub_dwi_files if f.endswith('.bvec')]
    bval = [f for f in sub_dwi_files if f.endswith('.bval')]
    epi_nii = [f for f in sub_fmap_files if f.endswith('epi.nii.gz')
               and 'fMRI' not in f]
    epi_json = [f for f in sub_fmap_files if f.endswith('epi.json')
                and 'fMRI' not in f]
    t1w = [f for f in sub_deriv_files if f.endswith('/T1w.nii.gz')]
    freesurfer = [f for f in sub_deriv_files
                  if '/freesurfer/' in f]

    # Use truthiness of non-empty lists to verify that all
    # of the required prereq files exist in `s3_keys`
    # TODO: If some of the files are missing, look farther up in the directory
    # TODO: structure to see if there are files we should inherit
    if all([dwi, bval, bvec, epi_nii, epi_json, t1w, freesurfer]):
        return InputS3Keys(
            subject=subject,
            site=site,
            valid=True,
            s3_keys=dict(
                dwi=dwi,
                bvec=bvec,
                bval=bval,
                epi_nii=epi_nii,
                epi_json=epi_json,
                freesurfer=freesurfer,
                t1w=t1w,
            ),
        )
    else:
        return InputS3Keys(
            subject=subject,
            site=site,
            valid=False,
            s3_keys=None,
        )
| 76 | + |
| 77 | + |
def get_all_s3_keys(prefix, sites, bucket='fcp-indi'):
    """Collect per-subject input S3 keys for every site.

    Parameters
    ----------
    prefix : string
        S3 prefix designating the S3 "directory" in which to search.
        Do not include the site ID in the prefix.

    sites : sequence of strings
        Site IDs from which to collect raw data

    bucket : string
        AWS S3 bucket in which to search

    Returns
    -------
    dict
        A dictionary with keys corresponding to `sites` and values
        that are a list of `InputS3Keys` namedtuples

    Raises
    ------
    ValueError
        If a site has no objects under the given prefix.
    """
    s3 = boto3.client('s3')
    subjects = {}

    # Avoid duplicate trailing slash in prefix
    prefix = prefix.rstrip('/')

    # Compile once; this pattern is applied to every key of every site.
    subject_pattern = re.compile(r'/sub-[0-9a-zA-Z]*/')

    def get_subject_id(key):
        """Return the 'sub-XXX' path component of `key`, or None."""
        match = subject_pattern.search(key)
        if match is not None:
            return match.group().strip('/')
        return None

    for site in sites:
        site_prefix = prefix + '/' + site + '/'
        response = s3.list_objects_v2(
            Bucket=bucket,
            Prefix=site_prefix,
        )

        try:
            # `Contents` is absent (so .get() returns None and indexing
            # raises TypeError) when nothing matches the prefix.
            keys = [d['Key'] for d in response.get('Contents')]
        except TypeError:
            raise ValueError(
                'There are no subject files in the S3 bucket with prefix '
                '{pfix:s} and site {site:s}'.format(pfix=prefix, site=site)
            )

        # Page through remaining results; list_objects_v2 returns at most
        # 1000 keys per call.
        while response['IsTruncated']:
            response = s3.list_objects_v2(
                Bucket=bucket,
                Prefix=site_prefix,
                ContinuationToken=response['NextContinuationToken'],
            )
            # Default to [] so a continuation page without `Contents`
            # cannot raise an uncaught TypeError.
            keys += [d['Key'] for d in response.get('Contents', [])]

        derivative_keys = [
            k for k in keys
            if k.startswith(site_prefix + 'derivatives/sub-')
        ]

        raw_keys = [
            k for k in keys
            if k.startswith(site_prefix + 'sub-')
        ]

        subs_with_dwi = {
            get_subject_id(k) for k in raw_keys
            if '/dwi/' in k
        }

        subs_with_epi_nii = {
            get_subject_id(k) for k in raw_keys
            if (
                k.endswith('epi.nii.gz')
                and '/fmap/' in k
                and 'fMRI' not in k
            )
        }

        subs_with_epi_json = {
            get_subject_id(k) for k in raw_keys
            if (
                k.endswith('epi.json')
                and '/fmap/' in k
                and 'fMRI' not in k
            )
        }

        subs_with_freesurfer = {
            get_subject_id(k) for k in derivative_keys
            if '/freesurfer/' in k
        }

        subs_with_t1w = {
            get_subject_id(k) for k in derivative_keys
            if k.endswith('T1w.nii.gz')
        }

        # A subject is a candidate only if it has every prerequisite type
        valid_subjects = (
            subs_with_dwi
            & subs_with_epi_nii
            & subs_with_epi_json
            & subs_with_freesurfer
            & subs_with_t1w
        )
        # get_subject_id returns None for keys that lack a '/sub-XXX/'
        # component; drop it so we never filter on a None subject below.
        valid_subjects.discard(None)

        subject_s3_keys = [
            single_subject_s3_keys(s, site, raw_keys, derivative_keys)
            for s in valid_subjects
        ]

        # Keep only subjects that passed the per-subject validation too
        subjects[site] = [sub for sub in subject_s3_keys if sub.valid]

    return subjects
0 commit comments