Merge pull request #511 from dbic/bf-anon-cmd

yarikoptic · web-flow · commit c4956b35b38d · 2021-10-14T12:47:42.000-04:00
BF (TST): make anonymize_script actually output anything and map deterministically
diff --git a/heudiconv/tests/anonymize_script.py b/heudiconv/tests/anonymize_script.py
@@ -2,12 +2,12 @@
 
 import sys
 import re
-import ctypes
+import hashlib
 
 
 def bids_id_(sid):
     parsed_id = re.compile(r"^(?:sub-|)(.+)$").search(sid).group(1)
-    return str(ctypes.c_size_t(hash(parsed_id)).value)
+    return hashlib.md5(parsed_id.encode()).hexdigest()[:8]
 
 
 def main():
@@ -16,4 +16,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
+    print(main())
diff --git a/heudiconv/tests/test_regression.py b/heudiconv/tests/test_regression.py
@@ -2,6 +2,7 @@
 from glob import glob
 import os
 import os.path as op
+import re
 
 import pytest
 
@@ -19,30 +20,48 @@
 
 
 @pytest.mark.skipif(not have_datalad, reason="no datalad")
-@pytest.mark.parametrize('subject', ['sub-sid000143'])
+@pytest.mark.parametrize('subject', ['sid000143'])
 @pytest.mark.parametrize('heuristic', ['reproin.py'])
 @pytest.mark.parametrize('anon_cmd', [None, 'anonymize_script.py'])
 def test_conversion(tmpdir, subject, heuristic, anon_cmd):
     tmpdir.chdir()
     try:
         datadir = fetch_data(tmpdir.strpath,
                              "dbic/QA",  # path from datalad database root
-                             getpath=op.join('sourcedata', subject))
+                             getpath=op.join('sourcedata', f'sub-{subject}'))
     except IncompleteResultsError as exc:
         pytest.skip("Failed to fetch test data: %s" % str(exc))
     outdir = tmpdir.mkdir('out').strpath
 
     args = gen_heudiconv_args(
         datadir, outdir, subject, heuristic, anon_cmd,
-        template=op.join('sourcedata/{subject}/*/*/*.tgz')
+        template='sourcedata/sub-{subject}/*/*/*.tgz'
     )
     runner(args)  # run conversion
 
+    # Get the possibly anonymized subject id and verify that it was
+    # anonymized or not:
+    subject_maybe_anon = glob(f'{outdir}/sub-*')
+    assert len(subject_maybe_anon) == 1  # just one should be there
+    subject_maybe_anon = op.basename(subject_maybe_anon[0])[4:]
+
+    if anon_cmd:
+        assert subject_maybe_anon != subject
+    else:
+        assert subject_maybe_anon == subject
+
     # verify functionals were converted
-    assert (
-        glob('{}/{}/func/*'.format(outdir, subject)) ==
-        glob('{}/{}/func/*'.format(datadir, subject))
-    )
+    outfiles = sorted([f[len(outdir):] for f in glob(f'{outdir}/sub-{subject_maybe_anon}/func/*')])
+    assert outfiles
+    datafiles = sorted([f[len(datadir):] for f in glob(f'{datadir}/sub-{subject}/ses-*/func/*')])
+    # original data has ses- but because we are converting only func, and not
+    # providing any session, we will not "match". Let's strip away the session
+    datafiles = [re.sub(r'[/\\_]ses-[^/\\_]*', '', f) for f in datafiles]
+    if not anon_cmd:
+        assert outfiles == datafiles
+    else:
+        assert outfiles != datafiles  # sid was anonymized
+        assert len(outfiles) == len(datafiles)  # but we have the same number of files
 
     # compare some json metadata
     json_ = '{}/task-rest_acq-24mm64sl1000tr32te600dyn_bold.json'.format
diff --git a/heudiconv/utils.py b/heudiconv/utils.py
@@ -111,7 +111,13 @@ def dec(obj):
 
 
 def anonymize_sid(sid, anon_sid_cmd):
-
+    """
+    Raises
+    ------
+    ValueError
+      if script returned an empty string (after whitespace stripping),
+      or output with multiple words/lines.
+    """
     cmd = [anon_sid_cmd, sid]
     shell_return = check_output(cmd)
 
@@ -120,7 +126,14 @@ def anonymize_sid(sid, anon_sid_cmd):
     else:
         anon_sid = shell_return
 
-    return anon_sid.strip()
+    anon_sid = anon_sid.strip()
+    if not anon_sid:
+        raise ValueError(f"{anon_sid_cmd!r} {sid!r} returned empty sid")
+    # rudimentary check for sanity: no multiple lines or words (in general
+    # ok, but not ok for BIDS) in the output
+    if len(anon_sid.split()) > 1:
+        raise ValueError(f"{anon_sid_cmd!r} {sid!r} returned multiline output")
+    return anon_sid
 
 
 def create_file_if_missing(filename, content):