126
126
import logging
127
127
lgr = logging .getLogger ('heudiconv' )
128
128
129
# Pythons before 3.7 did not expose re.Pattern (the class was the protected
# _sre.SRE_Pattern), so sample the class of a compiled regex instead.
re_Pattern = type(re.compile('.'))

129
133
# Terminology to harmonise and use to name variables etc
130
134
# experiment
131
135
# subject
@@ -372,14 +376,14 @@ def get_study_hash(seqinfo):
372
376
return md5sum (get_study_description (seqinfo ))
373
377
374
378
375
- def fix_canceled_runs (seqinfo , accession2run = fix_accession2run ):
379
+ def fix_canceled_runs (seqinfo ):
376
380
"""Function that adds cancelme_ to known bad runs which were forgotten
377
381
"""
378
382
accession_number = get_unique (seqinfo , 'accession_number' )
379
- if accession_number in accession2run :
383
+ if accession_number in fix_accession2run :
380
384
lgr .info ("Considering some runs possibly marked to be "
381
385
"canceled for accession %s" , accession_number )
382
- badruns = accession2run [accession_number ]
386
+ badruns = fix_accession2run [accession_number ]
383
387
badruns_pattern = '|' .join (badruns )
384
388
for i , s in enumerate (seqinfo ):
385
389
if re .match (badruns_pattern , s .series_id ):
@@ -391,39 +395,65 @@ def fix_canceled_runs(seqinfo, accession2run=fix_accession2run):
391
395
return seqinfo
392
396
393
397
394
def fix_dbic_protocol(seqinfo):
    """Ad-hoc fixup for existing protocols.

    Operates in three stages on the `protocols2fix` records:

    1. apply the record whose key is the md5sum of the study description
       (as returned by `get_study_hash`), if there is one;
    2. apply the substitutions of every record whose key is a compiled
       regular expression that successfully *searches* (not necessarily
       matches, so anchor appropriately) the study description;
    3. apply the "catch all" substitutions stored under the empty string
       key, if present.

    Stage 3 is somewhat redundant since `re.compile('.*')` could match
    anything, but it is kept for simplicity of specification.

    Parameters
    ----------
    seqinfo : list
      Sequence records; entries are replaced in place by
      `_apply_substitutions`.

    Returns
    -------
    list
      The same `seqinfo` object that was passed in.
    """
    study_hash = get_study_hash(seqinfo)
    study_description = get_study_description(seqinfo)

    # Stage 1: study specific fixes, keyed by the study hash
    if study_hash in protocols2fix:
        _apply_substitutions(seqinfo,
                             protocols2fix[study_hash],
                             'study (%s) specific' % study_hash)

    # Stage 2: every record keyed by a compiled regex which finds a hit
    # anywhere in the study description; string keys are skipped here
    for sub, substitutions in protocols2fix.items():
        if isinstance(sub, re_Pattern) and sub.search(study_description):
            _apply_substitutions(seqinfo,
                                 substitutions,
                                 '%r regex matching' % sub.pattern)

    # Stage 3: global fixes, stored under the empty string key
    if '' in protocols2fix:
        _apply_substitutions(seqinfo, protocols2fix[''], 'global')

    return seqinfo


434
def _apply_substitutions(seqinfo, substitutions, subs_scope):
    """Apply regex substitutions to the series spec fields of each record.

    Parameters
    ----------
    seqinfo : list
      Records exposing `_replace` (namedtuple-like); modified in place,
      each entry being swapped for its fixed-up copy.
    substitutions : iterable of (pattern, replacement)
      Pairs handed to `re.sub`, applied in the given order, so a later
      pair sees the output of the earlier ones.
    subs_scope : str
      Human readable description of where the substitutions came from;
      used only in the log message.

    Returns nothing: the result is communicated through `seqinfo`.
    """
    lgr.info("Considering %s substitutions", subs_scope)
    for i, s in enumerate(seqinfo):
        fixed_kwargs = dict()
        # need to replace both protocol_name series_description
        for key in series_spec_fields:
            oldvalue = value = getattr(s, key)
            # replace all I need to replace
            for substring, replacement in substitutions:
                value = re.sub(substring, replacement, value)
            # report only the fields which actually changed
            if oldvalue != value:
                lgr.info(" %s: %r -> %r", key, oldvalue, value)
            fixed_kwargs[key] = value
        # namedtuples are immutable, so store a modified copy instead
        seqinfo[i] = s._replace(**fixed_kwargs)

417
450
418
451
def fix_seqinfo(seqinfo):
    """Just a helper on top of both fixers.

    Runs `fix_canceled_runs` and then `fix_dbic_protocol`, feeding the
    result of the former into the latter, and returns the final result.
    """
    # add cancelme to known bad runs
    seqinfo = fix_canceled_runs(seqinfo)
    # protocol fixes are always attempted; fix_dbic_protocol itself decides
    # which (if any) substitutions are applicable
    seqinfo = fix_dbic_protocol(seqinfo)
    return seqinfo
428
458
429
459
0 commit comments