Skip to content

Commit 4247eb6

Browse files
authored
Merge pull request #425 from dbic/reproin-catchallremap
ReproIn: make specification of protocols2fix more flexible + do not bind fixup structs in func signatures
2 parents fdea77c + febd0fd commit 4247eb6

File tree

2 files changed

+65
-25
lines changed

2 files changed

+65
-25
lines changed

heudiconv/heuristics/reproin.py

Lines changed: 47 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,10 @@
126126
import logging
127127
lgr = logging.getLogger('heudiconv')
128128

129+
# Pythons before 3.7 didn't have re.Pattern; the class was a protected
130+
# _sre.SRE_Pattern, so let's just sample a class of the compiled regex
131+
re_Pattern = re.compile('.').__class__
132+
129133
# Terminology to harmonise and use to name variables etc
130134
# experiment
131135
# subject
@@ -372,14 +376,14 @@ def get_study_hash(seqinfo):
372376
return md5sum(get_study_description(seqinfo))
373377

374378

375-
def fix_canceled_runs(seqinfo, accession2run=fix_accession2run):
379+
def fix_canceled_runs(seqinfo):
376380
"""Function that adds cancelme_ to known bad runs which were forgotten
377381
"""
378382
accession_number = get_unique(seqinfo, 'accession_number')
379-
if accession_number in accession2run:
383+
if accession_number in fix_accession2run:
380384
lgr.info("Considering some runs possibly marked to be "
381385
"canceled for accession %s", accession_number)
382-
badruns = accession2run[accession_number]
386+
badruns = fix_accession2run[accession_number]
383387
badruns_pattern = '|'.join(badruns)
384388
for i, s in enumerate(seqinfo):
385389
if re.match(badruns_pattern, s.series_id):
@@ -391,39 +395,65 @@ def fix_canceled_runs(seqinfo, accession2run=fix_accession2run):
391395
return seqinfo
392396

393397

394-
def fix_dbic_protocol(seqinfo, keys=series_spec_fields, subsdict=protocols2fix):
395-
"""Ad-hoc fixup for existing protocols
398+
def fix_dbic_protocol(seqinfo):
399+
"""Ad-hoc fixup for existing protocols.
400+
401+
It will operate in 3 stages on `protocols2fix` records.
402+
1. apply the record (if any) whose key is the md5sum of study_description
403+
2. apply all substitutions whose key is a compiled regular expression which
404+
successfully searches (not necessarily matches, so anchor appropriately)
405+
study_description
406+
3. apply "catch all" substitutions stored under the empty string ('') key
407+
408+
Stage 3 is somewhat redundant since `re.compile('.*')` would match any study_description, but is
409+
kept for simplicity of its specification.
396410
"""
411+
397412
study_hash = get_study_hash(seqinfo)
413+
study_description = get_study_description(seqinfo)
414+
415+
# We will consider first study specific (based on hash)
416+
if study_hash in protocols2fix:
417+
_apply_substitutions(seqinfo,
418+
protocols2fix[study_hash],
419+
'study (%s) specific' % study_hash)
420+
# Then go through all regexps returning regex "search" result
421+
# on study_description
422+
for sub, substitutions in protocols2fix.items():
423+
if isinstance(sub, re_Pattern) and sub.search(study_description):
424+
_apply_substitutions(seqinfo,
425+
substitutions,
426+
'%r regex matching' % sub.pattern)
427+
# and at the end - global
428+
if '' in protocols2fix:
429+
_apply_substitutions(seqinfo, protocols2fix[''], 'global')
430+
431+
return seqinfo
398432

399-
if study_hash not in subsdict:
400-
raise ValueError("I don't know how to fix {0}".format(study_hash))
401433

402-
# need to replace both protocol_name series_description
403-
substitutions = subsdict[study_hash]
434+
def _apply_substitutions(seqinfo, substitutions, subs_scope):
435+
lgr.info("Considering %s substitutions", subs_scope)
404436
for i, s in enumerate(seqinfo):
405437
fixed_kwargs = dict()
406-
for key in keys:
407-
value = getattr(s, key)
438+
# need to replace both protocol_name and series_description
439+
for key in series_spec_fields:
440+
oldvalue = value = getattr(s, key)
408441
# replace all I need to replace
409442
for substring, replacement in substitutions:
410443
value = re.sub(substring, replacement, value)
444+
if oldvalue != value:
445+
lgr.info(" %s: %r -> %r", key, oldvalue, value)
411446
fixed_kwargs[key] = value
412447
# namedtuples are immutable
413448
seqinfo[i] = s._replace(**fixed_kwargs)
414449

415-
return seqinfo
416-
417450

418451
def fix_seqinfo(seqinfo):
419452
"""Just a helper on top of both fixers
420453
"""
421454
# add cancelme to known bad runs
422455
seqinfo = fix_canceled_runs(seqinfo)
423-
study_hash = get_study_hash(seqinfo)
424-
if study_hash in protocols2fix:
425-
lgr.info("Fixing up protocol for {0}".format(study_hash))
426-
seqinfo = fix_dbic_protocol(seqinfo)
456+
seqinfo = fix_dbic_protocol(seqinfo)
427457
return seqinfo
428458

429459

heudiconv/heuristics/test_reproin.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22
# Tests for reproin.py
33
#
44
from collections import OrderedDict
5+
from mock import patch
6+
import re
7+
8+
from . import reproin
59
from .reproin import (
610
filter_files,
711
fix_canceled_runs,
@@ -78,7 +82,8 @@ def test_fix_canceled_runs():
7882
'accession1': ['^01-', '^03-']
7983
}
8084

81-
seqinfo_ = fix_canceled_runs(seqinfo, fake_accession2run)
85+
with patch.object(reproin, 'fix_accession2run', fake_accession2run):
86+
seqinfo_ = fix_canceled_runs(seqinfo)
8287

8388
for i, s in enumerate(seqinfo_, 1):
8489
output = runname
@@ -106,16 +111,20 @@ def test_fix_dbic_protocol():
106111
'nochangeplease',
107112
'nochangeeither')
108113

109-
110114
seqinfos = [seq1, seq2]
111-
keys = ['field1']
112-
subsdict = {
115+
protocols2fix = {
113116
md5sum('mystudy'):
114-
[('scout_run\+', 'scout'),
117+
[('scout_run\+', 'THESCOUT-runX'),
115118
('run-life[0-9]', 'run+_task-life')],
119+
re.compile('^my.*'):
120+
[('THESCOUT-runX', 'THESCOUT')],
121+
# rely on 'catch-all' to fix up above scout
122+
'': [('THESCOUT', 'scout')]
116123
}
117124

118-
seqinfos_ = fix_dbic_protocol(seqinfos, keys=keys, subsdict=subsdict)
125+
with patch.object(reproin, 'protocols2fix', protocols2fix), \
126+
patch.object(reproin, 'series_spec_fields', ['field1']):
127+
seqinfos_ = fix_dbic_protocol(seqinfos)
119128
assert(seqinfos[1] == seqinfos_[1])
120129
# field2 shouldn't have changed since it is not listed in series_spec_fields
121130
assert(seqinfos_[0] == FakeSeqInfo(accession_number,
@@ -124,8 +133,9 @@ def test_fix_dbic_protocol():
124133
seq1.field2))
125134

126135
# change also field2 please
127-
keys = ['field1', 'field2']
128-
seqinfos_ = fix_dbic_protocol(seqinfos, keys=keys, subsdict=subsdict)
136+
with patch.object(reproin, 'protocols2fix', protocols2fix), \
137+
patch.object(reproin, 'series_spec_fields', ['field1', 'field2']):
138+
seqinfos_ = fix_dbic_protocol(seqinfos)
129139
assert(seqinfos[1] == seqinfos_[1])
130140
# now everything should have changed
131141
assert(seqinfos_[0] == FakeSeqInfo(accession_number,

0 commit comments

Comments
 (0)