Skip to content

Commit f0738ce

Browse files
committed
RF+ENH: allow matching study using regexp on study_description
1 parent 75460f5 commit f0738ce

File tree

2 files changed

+49
-28
lines changed

2 files changed

+49
-28
lines changed

heudiconv/heuristics/reproin.py

Lines changed: 45 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -392,39 +392,58 @@ def fix_canceled_runs(seqinfo):
392392

393393

394394
def fix_dbic_protocol(seqinfo):
    """Ad-hoc fixup for existing protocols.

    It will operate in 3 stages on `protocols2fix` records.
    1. consider a record which has md5sum of study_description
    2. apply all substitutions, where key is a regular expression which
       successfully searches (not necessarily matches, so anchor
       appropriately) study_description
    3. apply "catch all" substitutions in the key containing an empty string

    3. is somewhat redundant since `re.compile('.*')` could match any, but is
    kept for simplicity of its specification.

    `seqinfo` is modified in place (records are replaced by
    `_apply_substitutions`) and the very same object is returned.
    """
    study_hash = get_study_hash(seqinfo)
    # NOTE(review): get_study_description is defined elsewhere; in case it
    # can return None, fall back to an empty string so that .search() below
    # does not raise TypeError -- confirm against its implementation
    study_description = get_study_description(seqinfo) or ''

    # We will consider first study specific (based on hash)
    if study_hash in protocols2fix:
        _apply_substitutions(
            seqinfo,
            protocols2fix[study_hash],
            'study (%s) specific' % study_hash)

    # Then go through all regexps returning regex "search" result
    # on study_description
    for sub, substitutions in protocols2fix.items():
        # Keys are either plain strings (a hash or '') or compiled regular
        # expressions.  Duck-type on .search instead of checking against
        # re.Pattern, since that name is not exposed by older Pythons.
        search = getattr(sub, 'search', None)
        if search is not None and search(study_description):
            _apply_substitutions(
                seqinfo,
                substitutions,
                '%r regex matching' % sub.pattern)

    # and at the end - global
    if '' in protocols2fix:
        _apply_substitutions(seqinfo, protocols2fix[''], 'global')

    return seqinfo
426428

427429

430+
def _apply_substitutions(seqinfo, substitutions, subs_scope):
    """Apply regex `substitutions` to the series spec fields of `seqinfo`.

    Every record of `seqinfo` is replaced in place by a copy in which each
    field listed in `series_spec_fields` went through all
    `(pattern, replacement)` pairs of `substitutions`, applied in order via
    `re.sub`.  `subs_scope` is used only in the log message announcing which
    set of substitutions is being considered.  Nothing is returned.
    """
    lgr.info("Considering %s substitutions", subs_scope)
    for idx, record in enumerate(seqinfo):
        updates = {}
        # both protocol_name and series_description need to be fixed up
        for field in series_spec_fields:
            original = getattr(record, field)
            fixed = original
            # chain all the substitutions, each one sees the previous result
            for pattern, repl in substitutions:
                fixed = re.sub(pattern, repl, fixed)
            if fixed != original:
                lgr.info(" %s: %r -> %r", field, original, fixed)
                updates[field] = fixed
        # records are namedtuples, thus immutable - store a modified copy
        seqinfo[idx] = record._replace(**updates)
445+
446+
428447
def fix_seqinfo(seqinfo):
429448
"""Just a helper on top of both fixers
430449
"""

heudiconv/heuristics/test_reproin.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#
44
from collections import OrderedDict
55
from mock import patch
6+
import re
67

78
from . import reproin
89
from .reproin import (
@@ -110,12 +111,13 @@ def test_fix_dbic_protocol():
110111
'nochangeplease',
111112
'nochangeeither')
112113

113-
114114
seqinfos = [seq1, seq2]
115115
protocols2fix = {
116116
md5sum('mystudy'):
117-
[('scout_run\+', 'THESCOUT'),
117+
[('scout_run\+', 'THESCOUT-runX'),
118118
('run-life[0-9]', 'run+_task-life')],
119+
re.compile('^my.*'):
120+
[('THESCOUT-runX', 'THESCOUT')],
119121
# rely on 'catch-all' to fix up above scout
120122
'': [('THESCOUT', 'scout')]
121123
}

0 commit comments

Comments
 (0)