Skip to content

Commit 3cc9c57

Browse files
authored
Merge pull request #187 from dartmouth-pbs/enh-mih-case
WiP: ReproIn heuristic - support usecase from @mih et al.
2 parents 4eff1a6 + 41ec485 commit 3cc9c57

File tree

4 files changed

+120
-63
lines changed

4 files changed

+120
-63
lines changed

.travis.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ before_install:
2525
- pip install --upgrade virtualenv
2626
- virtualenv --python=python venv
2727
- source venv/bin/activate
28+
- pip --version # check again since seems that python_requires='>=3.5' in secretstorage is not in effect
2829
- python --version # just to check
2930
- pip install -r dev-requirements.txt
3031
- pip install requests # below installs pyld but that assumes we have requests already

heudiconv/bids.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -293,10 +293,14 @@ def get_formatted_scans_key_row(item):
293293
force=True))
294294
# we need to store filenames and acquisition times
295295
# parse date and time and get it into isoformat
296-
date = mw.dcm_data.ContentDate
297-
time = mw.dcm_data.ContentTime.split('.')[0]
298-
td = time + date
299-
acq_time = datetime.strptime(td, '%H%M%S%Y%m%d').isoformat()
296+
try:
297+
date = mw.dcm_data.ContentDate
298+
time = mw.dcm_data.ContentTime.split('.')[0]
299+
td = time + date
300+
acq_time = datetime.strptime(td, '%H%M%S%Y%m%d').isoformat()
301+
except AttributeError as exc:
302+
lgr.warning("Failed to get date/time for the content: %s", str(exc))
303+
acq_time = None
300304
# add random string
301305
randstr = ''.join(map(chr, sample(k=8, population=range(33, 127))))
302306
try:
@@ -326,6 +330,10 @@ def convert_sid_bids(subject_id):
326330
"""
327331
cleaner = lambda y: ''.join([x for x in y if x.isalnum()])
328332
sid = cleaner(subject_id)
333+
if not sid:
334+
raise ValueError(
335+
"Subject ID became empty after cleanup. Please provide manually "
336+
"a suitable alphanumeric subject ID")
329337
lgr.warning('{0} contained nonalphanumeric character(s), subject '
330338
'ID was cleaned to be {1}'.format(subject_id, sid))
331339
return sid, subject_id

heudiconv/convert.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,12 @@ def prep_conversion(sid, dicoms, outdir, heuristic, converter, anon_sid,
8787
else:
8888
raise ValueError("neither dicoms nor seqinfo dict was provided")
8989

90-
if bids and not sid.isalnum(): # alphanumeric only
91-
sid, old_sid = convert_sid_bids(sid)
90+
if bids:
91+
if not sid:
92+
raise ValueError(
93+
"BIDS requires alphanumeric subject ID. Got an empty value")
94+
if not sid.isalnum(): # alphanumeric only
95+
sid, old_sid = convert_sid_bids(sid)
9296

9397
if not anon_sid:
9498
anon_sid = sid

heudiconv/heuristics/reproin.py

Lines changed: 101 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,10 @@
2828
Sequence names on the scanner must follow this specification to avoid manual
2929
conversion/handling:
3030
31-
<seqtype[-label]>[_ses-<SESID>][_task-<TASKID>][_acq-<ACQLABEL>][_run-<RUNID>][_dir-<DIR>][<more BIDS>][__<custom>]
31+
[PREFIX:]<seqtype[-label]>[_ses-<SESID>][_task-<TASKID>][_acq-<ACQLABEL>][_run-<RUNID>][_dir-<DIR>][<more BIDS>][__<custom>]
3232
3333
where
34+
[PREFIX:] - leading capital letters followed by : are stripped/ignored
3435
<...> - value to be entered
3536
[...] - optional -- might be nearly mandatory for some modalities (e.g.,
3637
run for functional) and very optional for others
@@ -114,6 +115,18 @@
114115
import logging
115116
lgr = logging.getLogger('heudiconv')
116117

118+
# Terminology to hamornise and use to name variables etc
119+
# experiment
120+
# subject
121+
# [session]
122+
# exam (AKA scanning session) - currently seqinfo, unless brought together from multiple
123+
# series (AKA protocol?)
124+
# - series_spec - deduced from fields the spec (literal value)
125+
# - series_info - the dictionary with fields parsed from series_spec
126+
127+
# Which fields in seqinfo (in this order) to check for the ReproIn spec
128+
series_spec_fields = ('protocol_name', 'series_description')
129+
117130
# dictionary from accession-number to runs that need to be marked as bad
118131
# NOTE: even if filename has number that is 0-padded, internally no padding
119132
# is done
@@ -227,7 +240,6 @@
227240
('_test', ''),
228241
],
229242
}
230-
keys2replace = ['protocol_name', 'series_description']
231243

232244
# list containing StudyInstanceUID to skip -- hopefully doesn't happen too often
233245
dicoms2skip = [
@@ -341,13 +353,13 @@ def fix_canceled_runs(seqinfo, accession2run=fix_accession2run):
341353
if re.match(badruns_pattern, s.series_id):
342354
lgr.info('Fixing bad run {0}'.format(s.series_id))
343355
fixedkwargs = dict()
344-
for key in keys2replace:
356+
for key in series_spec_fields:
345357
fixedkwargs[key] = 'cancelme_' + getattr(s, key)
346358
seqinfo[i] = s._replace(**fixedkwargs)
347359
return seqinfo
348360

349361

350-
def fix_dbic_protocol(seqinfo, keys=keys2replace, subsdict=protocols2fix):
362+
def fix_dbic_protocol(seqinfo, keys=series_spec_fields, subsdict=protocols2fix):
351363
"""Ad-hoc fixup for existing protocols
352364
"""
353365
study_hash = get_study_hash(seqinfo)
@@ -413,12 +425,13 @@ def infotodict(seqinfo):
413425
skipped, skipped_unknown = [], []
414426
current_run = 0
415427
run_label = None # run-
416-
image_data_type = None
428+
dcm_image_iod_spec = None
429+
skip_derived = False
417430
for s in seqinfo:
418431
# XXX: skip derived sequences, we don't store them to avoid polluting
419432
# the directory, unless it is the motion corrected ones
420433
# (will get _rec-moco suffix)
421-
if s.is_derived and not s.is_motion_corrected:
434+
if skip_derived and s.is_derived and not s.is_motion_corrected:
422435
skipped.append(s.series_id)
423436
lgr.debug("Ignoring derived data %s", s.series_id)
424437
continue
@@ -433,40 +446,56 @@ def infotodict(seqinfo):
433446

434447
# figure out type of image from s.image_info -- just for checking ATM
435448
# since we primarily rely on encoded in the protocol name information
436-
prev_image_data_type = image_data_type
437-
image_data_type = s.image_type[2]
438-
image_type_seqtype = {
439-
'P': 'fmap', # phase
440-
'FMRI': 'func',
441-
'MPR': 'anat',
442-
# 'M': 'func', "magnitude" -- can be for scout, anat, bold, fmap
443-
'DIFFUSION': 'dwi',
444-
'MIP_SAG': 'anat', # angiography
445-
'MIP_COR': 'anat', # angiography
446-
'MIP_TRA': 'anat', # angiography
447-
}.get(image_data_type, None)
448-
449-
protocol_name_tuned = s.protocol_name
450-
# Few common replacements
451-
if protocol_name_tuned in {'AAHead_Scout'}:
452-
protocol_name_tuned = 'anat-scout'
453-
454-
regd = parse_dbic_protocol_name(protocol_name_tuned)
455-
456-
if image_data_type.startswith('MIP'):
457-
regd['acq'] = regd.get('acq', '') + sanitize_str(image_data_type)
458-
459-
if not regd:
449+
prev_dcm_image_iod_spec = dcm_image_iod_spec
450+
if len(s.image_type) > 2:
451+
# https://dicom.innolitics.com/ciods/cr-image/general-image/00080008
452+
# 0 - ORIGINAL/DERIVED
453+
# 1 - PRIMARY/SECONDARY
454+
# 3 - Image IOD specific specialization (optional)
455+
dcm_image_iod_spec = s.image_type[2]
456+
image_type_seqtype = {
457+
'P': 'fmap', # phase
458+
'FMRI': 'func',
459+
'MPR': 'anat',
460+
# 'M': 'func', "magnitude" -- can be for scout, anat, bold, fmap
461+
'DIFFUSION': 'dwi',
462+
'MIP_SAG': 'anat', # angiography
463+
'MIP_COR': 'anat', # angiography
464+
'MIP_TRA': 'anat', # angiography
465+
}.get(dcm_image_iod_spec, None)
466+
else:
467+
dcm_image_iod_spec = image_type_seqtype = None
468+
469+
series_info = {} # For please lintian and its friends
470+
for sfield in series_spec_fields:
471+
svalue = getattr(s, sfield)
472+
series_info = parse_series_spec(svalue)
473+
if series_info: # looks like a valid spec - we are done
474+
series_spec = svalue
475+
break
476+
else:
477+
lgr.debug(
478+
"Failed to parse reproin spec in .%s=%r",
479+
sfield, svalue)
480+
481+
if not series_info:
482+
series_spec = None # we cannot know better
483+
lgr.warning(
484+
"Could not determine the series name by looking at "
485+
"%s fields", ', '.join(series_spec_fields))
460486
skipped_unknown.append(s.series_id)
461487
continue
462488

463-
seqtype = regd.pop('seqtype')
464-
seqtype_label = regd.pop('seqtype_label', None)
489+
if dcm_image_iod_spec and dcm_image_iod_spec.startswith('MIP'):
490+
series_info['acq'] = series_info.get('acq', '') + sanitize_str(dcm_image_iod_spec)
491+
492+
seqtype = series_info.pop('seqtype')
493+
seqtype_label = series_info.pop('seqtype_label', None)
465494

466495
if image_type_seqtype and seqtype != image_type_seqtype:
467496
lgr.warning(
468497
"Deduced seqtype to be %s from DICOM, but got %s out of %s",
469-
image_type_seqtype, seqtype, protocol_name_tuned)
498+
image_type_seqtype, seqtype, series_spec)
470499

471500
# if s.is_derived:
472501
# # Let's for now stash those close to original images
@@ -483,32 +512,34 @@ def infotodict(seqinfo):
483512

484513
# analyze s.protocol_name (series_id is based on it) for full name mapping etc
485514
if seqtype == 'func' and not seqtype_label:
486-
if '_pace_' in protocol_name_tuned:
515+
if '_pace_' in series_spec:
487516
seqtype_label = 'pace' # or should it be part of seq-
488517
else:
489518
# assume bold by default
490519
seqtype_label = 'bold'
491520

492521
if seqtype == 'fmap' and not seqtype_label:
522+
if not dcm_image_iod_spec:
523+
raise ValueError("Do not know image data type yet to make decision")
493524
seqtype_label = {
494525
'M': 'magnitude', # might want explicit {file_index} ?
495526
'P': 'phasediff',
496527
'DIFFUSION': 'epi', # according to KODI those DWI are the EPIs we need
497-
}[image_data_type]
528+
}[dcm_image_iod_spec]
498529

499530
# label for dwi as well
500531
if seqtype == 'dwi' and not seqtype_label:
501532
seqtype_label = 'dwi'
502533

503-
run = regd.get('run')
534+
run = series_info.get('run')
504535
if run is not None:
505536
# so we have an indicator for a run
506537
if run == '+':
507538
# some sequences, e.g. fmap, would generate two (or more?)
508539
# sequences -- e.g. one for magnitude(s) and other ones for
509540
# phases. In those we must not increment run!
510-
if image_data_type == 'P':
511-
if prev_image_data_type != 'M':
541+
if dcm_image_iod_spec and dcm_image_iod_spec == 'P':
542+
if prev_dcm_image_iod_spec != 'M':
512543
# XXX if we have a known earlier study, we need to always
513544
# increase the run counter for phasediff because magnitudes
514545
# were not acquired
@@ -517,7 +548,7 @@ def infotodict(seqinfo):
517548
else:
518549
raise RuntimeError(
519550
"Was expecting phase image to follow magnitude "
520-
"image, but previous one was %r", prev_image_data_type)
551+
"image, but previous one was %r", prev_dcm_image_iod_spec)
521552
# else we do nothing special
522553
else: # and otherwise we go to the next run
523554
current_run += 1
@@ -547,16 +578,16 @@ def infotodict(seqinfo):
547578
# if s.is_motion_corrected:
548579
# assert s.is_derived, "Motion corrected images must be 'derived'"
549580

550-
if s.is_motion_corrected and 'rec-' in regd.get('bids', ''):
581+
if s.is_motion_corrected and 'rec-' in series_info.get('bids', ''):
551582
raise NotImplementedError("want to add _acq-moco but there is _acq- already")
552583

553584
suffix_parts = [
554-
None if not regd.get('task') else "task-%s" % regd['task'],
555-
None if not regd.get('acq') else "acq-%s" % regd['acq'],
585+
None if not series_info.get('task') else "task-%s" % series_info['task'],
586+
None if not series_info.get('acq') else "acq-%s" % series_info['acq'],
556587
# But we want to add an indicator in case it was motion corrected
557588
# in the magnet. ref sample /2017/01/03/qa
558589
None if not s.is_motion_corrected else 'rec-moco',
559-
regd.get('bids'),
590+
series_info.get('bids'),
560591
run_label,
561592
seqtype_label,
562593
]
@@ -664,7 +695,7 @@ def infotoids(seqinfos, outdir):
664695

665696
# So -- use `outdir` and locator etc to see if for a given locator/subject
666697
# and possible ses+ in the sequence names, so we would provide a sequence
667-
# So might need to go through parse_dbic_protocol_name(s.protocol_name)
698+
# So might need to go through parse_series_spec(s.protocol_name)
668699
# to figure out presence of sessions.
669700
ses_markers = []
670701

@@ -675,7 +706,7 @@ def infotoids(seqinfos, outdir):
675706
for s in seqinfos:
676707
if s.is_derived:
677708
continue
678-
session_ = parse_dbic_protocol_name(s.protocol_name).get('session', None)
709+
session_ = parse_series_spec(s.protocol_name).get('session', None)
679710
if session_ and '{' in session_:
680711
# there was a marker for something we could provide from our seqinfo
681712
# e.g. {date}
@@ -745,22 +776,31 @@ def sanitize_str(value):
745776
return _delete_chars(value, '#!@$%^&.,:;_-')
746777

747778

748-
def parse_dbic_protocol_name(protocol_name):
779+
def parse_series_spec(series_spec):
749780
"""Parse protocol name according to our convention with minimal set of fixups
750781
"""
751782
# Since Yarik didn't know better place to put it in, but could migrate outside
752-
# at some point
753-
protocol_name = protocol_name.replace("anat_T1w", "anat-T1w")
754-
protocol_name = protocol_name.replace("hardi_64", "dwi_acq-hardi64")
783+
# at some point. TODO
784+
series_spec = series_spec.replace("anat_T1w", "anat-T1w")
785+
series_spec = series_spec.replace("hardi_64", "dwi_acq-hardi64")
786+
series_spec = series_spec.replace("AAHead_Scout", "anat-scout")
787+
788+
# Parse the name according to our convention/specification
789+
790+
# leading or trailing spaces do not matter
791+
series_spec = series_spec.strip(' ')
792+
793+
# Strip off leading CAPITALS: prefix to accommodate some reported usecases:
794+
# https://github.com/ReproNim/reproin/issues/14
795+
# where PU: prefix is added by the scanner
796+
series_spec = re.sub("^[A-Z]*:", "", series_spec)
755797

756-
# Parse the name according to our convention
757-
# https://docs.google.com/document/d/1R54cgOe481oygYVZxI7NHrifDyFUZAjOBwCTu7M7y48/edit?usp=sharing
758798
# Remove possible suffix we don't care about after __
759-
protocol_name = protocol_name.split('__', 1)[0]
799+
series_spec = series_spec.split('__', 1)[0]
760800

761801
bids = None # we don't know yet for sure
762802
# We need to figure out if it is a valid bids
763-
split = protocol_name.split('_')
803+
split = series_spec.split('_')
764804
prefix = split[0]
765805

766806
# Fixups
@@ -940,8 +980,8 @@ def test_fixupsubjectid():
940980
assert fixup_subjectid("SID30") == "sid000030"
941981

942982

943-
def test_parse_dbic_protocol_name():
944-
pdpn = parse_dbic_protocol_name
983+
def test_parse_series_spec():
984+
pdpn = parse_series_spec
945985

946986
assert pdpn("nondbic_func-bold") == {}
947987
assert pdpn("cancelme_func-bold") == {}
@@ -951,15 +991,19 @@ def test_parse_dbic_protocol_name():
951991
{'seqtype': 'func', 'seqtype_label': 'bold'}
952992

953993
# pdpn("bids_func_ses+_task-boo_run+") == \
954-
# order should not matter
955-
assert pdpn("bids_func_ses+_run+_task-boo") == \
994+
# order and PREFIX: should not matter, as well as trailing spaces
995+
assert \
996+
pdpn(" PREFIX:bids_func_ses+_task-boo_run+ ") == \
997+
pdpn("PREFIX:bids_func_ses+_task-boo_run+") == \
998+
pdpn("bids_func_ses+_run+_task-boo") == \
956999
{
9571000
'seqtype': 'func',
9581001
# 'seqtype_label': 'bold',
9591002
'session': '+',
9601003
'run': '+',
9611004
'task': 'boo',
9621005
}
1006+
9631007
# TODO: fix for that
9641008
assert pdpn("bids_func-pace_ses-1_task-boo_acq-bu_bids-please_run-2__therest") == \
9651009
pdpn("bids_func-pace_ses-1_run-2_task-boo_acq-bu_bids-please__therest") == \

0 commit comments

Comments
 (0)