Skip to content
This repository was archived by the owner on Dec 27, 2022. It is now read-only.

Commit 1a8c058

Browse files
author
Adam Richie-Halford
committed
Add base.py, __init__.py, and fetch_hbn.py to preafq dir
1 parent 819d964 commit 1a8c058

File tree

3 files changed

+314
-0
lines changed

3 files changed

+314
-0
lines changed

preafq/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from .base import download_prereqs, upload_to_s3 # noqa
2+
from . import fetch_hbn # noqa

preafq/base.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
import os.path as op
2+
from collections import namedtuple
3+
from pathlib import Path
4+
5+
import boto3
6+
7+
# Define the different namedtuple return types

# Bundle of S3 keys for one subject. `valid` flags whether every
# prerequisite file was found; `s3_keys` maps file type -> list of keys.
InputS3Keys = namedtuple('InputS3Keys', 'subject site valid s3_keys')

# Local file paths for one subject's downloaded input files.
InputFiles = namedtuple('InputFiles', 'subject site files')
15+
16+
17+
def download_prereqs(subject_keys, bucket='fcp-indi', directory='./input'):
    """Download one subject's prerequisite input files from S3.

    Parameters
    ----------
    subject_keys : InputS3Keys namedtuple
        Input s3 keys stored in namedtuple. Must have the fields
        'subject': subjectID,
        'site': siteID,
        's3_keys': dictionary of S3 keys

    bucket : string
        S3 bucket from which to extract files

    directory : string
        Local directory to which to save files

    Returns
    -------
    files : InputFiles namedtuple
        Input file paths stored in namedtuple. Has the fields
        'subject': subjectID,
        'site' : siteID,
        'files' : local file paths
    """
    s3 = boto3.client('s3')
    subject = subject_keys.subject
    site = subject_keys.site

    # Map each S3 key to a local path of the form
    # <directory>/<site>/<portion of the key after the site component>
    input_files = InputFiles(
        subject=subject,
        site=site,
        files={
            ftype: [op.abspath(op.join(
                directory, site, key.split('/' + site + '/')[-1]
            )) for key in keys]
            for ftype, keys in subject_keys.s3_keys.items()
        },
    )

    for ftype, keys in subject_keys.s3_keys.items():
        for key, fname in zip(keys, input_files.files[ftype]):
            # Create the parent directory if necessary. There is no need
            # to touch the file first: s3.download_file creates (or
            # overwrites) the destination file itself.
            Path(op.dirname(fname)).mkdir(parents=True, exist_ok=True)

            # Download the file
            s3.download_file(Bucket=bucket, Key=key, Filename=fname)

    return input_files
67+
68+
69+
def upload_to_s3(output_files, bucket, prefix, site, session, subject):
    """Upload output files to S3, using key format specified by input params

    Parameters
    ----------
    output_files : dict of filenames
        Output files to transfer to S3. Assume that the user has passed in
        relative paths that are appropriate to fill in after the 'preAFQ'
        directory.

    bucket : string
        Output S3 bucket

    prefix : string
        Output S3 prefix

    site : string
        Site ID, e.g. 'side-SI'

    session : string
        Session ID, e.g. 'sess-001'

    subject : string
        Subject ID, e.g. 'sub-ABCXYZ'

    Returns
    -------
    dict
        S3 keys for each output file. The dict keys are the same as the keys
        for the input parameter `output_files`
    """
    s3 = boto3.client('s3')

    def filename2s3key(filename):
        # Keys have the form
        # <prefix>/<site>/<subject>/<session>/derivatives/preAFQ/<basename>
        return '/'.join([
            prefix, site, subject, session,
            'derivatives', 'preAFQ',
            op.basename(filename)
        ])

    # Compute the destination key for every output file up front
    s3_keys = {label: filename2s3key(fname)
               for label, fname in output_files.items()}

    # Upload each file to its destination key
    for label, fname in output_files.items():
        with open(fname, 'rb') as fp:
            s3.put_object(
                Bucket=bucket,
                Body=fp,
                Key=s3_keys[label],
            )

    return s3_keys

preafq/fetch_hbn.py

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
import os.path as op
2+
import re
3+
4+
import boto3
5+
6+
from .base import InputS3Keys
7+
8+
9+
def single_subject_s3_keys(subject, site, raw_keys, derivative_keys):
    """Get the S3 keys for a single subject's input files

    Parameters
    ----------
    subject : string
        Subject ID on which to filter the s3 keys

    site : string
        Site ID from which to collect raw data

    raw_keys : sequence
        Sequence of raw data s3 keys to filter

    derivative_keys : sequence
        Sequence of derivative data s3 keys to filter

    Returns
    -------
    InputS3Keys namedtuple
        If all prerequisite s3 keys are present, return a namedtuple of
        s3 keys. Otherwise, use the default None values.
    """
    # Match the subject ID as a complete path component so that e.g.
    # 'sub-AB' does not also match keys belonging to 'sub-ABC'
    sub_tag = '/' + subject + '/'

    # Get only the s3 keys corresponding to this subject
    sub_dwi_files = [k for k in raw_keys if sub_tag in k and '/dwi/' in k]
    sub_fmap_files = [k for k in raw_keys if sub_tag in k and '/fmap/' in k]
    sub_deriv_files = [k for k in derivative_keys if sub_tag in k]

    # Get the dwi files, bvec files, and bval files
    dwi = [f for f in sub_dwi_files
           if f.endswith('.nii.gz') and 'TRACEW' not in f]
    bvec = [f for f in sub_dwi_files if f.endswith('.bvec')]
    bval = [f for f in sub_dwi_files if f.endswith('.bval')]
    epi_nii = [f for f in sub_fmap_files if f.endswith('epi.nii.gz')
               and 'fMRI' not in f]
    epi_json = [f for f in sub_fmap_files if f.endswith('epi.json')
                and 'fMRI' not in f]
    t1w = [f for f in sub_deriv_files if f.endswith('/T1w.nii.gz')]
    freesurfer = [f for f in sub_deriv_files
                  if '/freesurfer/' in f]

    # Use truthiness of non-empty lists to verify that all
    # of the required prereq files exist in `s3_keys`
    # TODO: If some of the files are missing, look farther up in the directory
    # TODO: structure to see if there are files we should inherit
    if all([dwi, bval, bvec, epi_nii, epi_json, t1w, freesurfer]):
        return InputS3Keys(
            subject=subject,
            site=site,
            valid=True,
            s3_keys=dict(
                dwi=dwi,
                bvec=bvec,
                bval=bval,
                epi_nii=epi_nii,
                epi_json=epi_json,
                freesurfer=freesurfer,
                t1w=t1w,
            ),
        )
    else:
        return InputS3Keys(
            subject=subject,
            site=site,
            valid=False,
            s3_keys=None,
        )
76+
77+
78+
def get_all_s3_keys(prefix, sites, bucket='fcp-indi'):
    """
    Parameters
    ----------
    prefix : string
        S3 prefix designating the S3 "directory" in which to search.
        Do not include the site ID in the prefix.

    sites : sequence of strings
        Site IDs from which to collect raw data

    bucket : string
        AWS S3 bucket in which to search

    Returns
    -------
    dict
        A dictionary with keys corresponding to `sites` and values
        that are a list of `InputS3Keys` namedtuples

    Raises
    ------
    ValueError
        If a site prefix contains no objects at all
    """
    s3 = boto3.client('s3')
    subjects = {}

    # Avoid duplicate trailing slash in prefix
    prefix = prefix.rstrip('/')

    for site in sites:
        site_prefix = prefix + '/' + site + '/'

        # Collect every key under this site, following pagination. A single
        # loop handles both the first page and continuation pages, so an
        # empty page raises the same ValueError wherever it occurs.
        keys = []
        continuation_token = None
        while True:
            list_kwargs = dict(Bucket=bucket, Prefix=site_prefix)
            if continuation_token is not None:
                list_kwargs['ContinuationToken'] = continuation_token
            response = s3.list_objects_v2(**list_kwargs)

            try:
                keys += [d['Key'] for d in response.get('Contents')]
            except TypeError:
                # response['Contents'] is absent when there are no objects
                raise ValueError(
                    'There are no subject files in the S3 bucket with prefix '
                    '{pfix:s} and site {site:s}'.format(pfix=prefix, site=site)
                )

            if not response['IsTruncated']:
                break
            continuation_token = response['NextContinuationToken']

        def get_subject_id(key):
            """Extract the 'sub-...' path component from an S3 key, or None."""
            match = re.search('/sub-[0-9a-zA-Z]*/', key)
            if match is not None:
                return match.group().strip('/')
            else:
                return None

        derivative_keys = [
            k for k in keys
            if k.startswith(site_prefix + 'derivatives/sub-')
        ]

        raw_keys = [
            k for k in keys
            if k.startswith(site_prefix + 'sub-')
        ]

        subs_with_dwi = {
            get_subject_id(k) for k in raw_keys
            if '/dwi/' in k
        }

        subs_with_epi_nii = {
            get_subject_id(k) for k in raw_keys
            if (
                k.endswith('epi.nii.gz')
                and '/fmap/' in k
                and 'fMRI' not in k
            )
        }

        subs_with_epi_json = {
            get_subject_id(k) for k in raw_keys
            if (
                k.endswith('epi.json')
                and '/fmap/' in k
                and 'fMRI' not in k
            )
        }

        subs_with_freesurfer = {
            get_subject_id(k) for k in derivative_keys
            if '/freesurfer/' in k
        }

        subs_with_t1w = {
            get_subject_id(k) for k in derivative_keys
            if k.endswith('T1w.nii.gz')
        }

        valid_subjects = (
            subs_with_dwi
            & subs_with_epi_nii
            & subs_with_epi_json
            & subs_with_freesurfer
            & subs_with_t1w
        )

        # get_subject_id returns None for keys without a 'sub-...' path
        # component; drop it so we never query for a subject ID of None
        valid_subjects.discard(None)

        subject_s3_keys = [
            single_subject_s3_keys(s, site, raw_keys, derivative_keys)
            for s in valid_subjects
        ]

        subjects[site] = [sub for sub in subject_s3_keys if sub.valid]

    return subjects

0 commit comments

Comments
 (0)