Merge pull request #2 from nipraxis-fall-2022-forks/iqr-outliers

mtach · web-flow · commit cf075b82247e · 2022-09-07T16:44:42.000+01:00
An exercise to complete diagnostics code
diff --git a/data/group-00/hash_list.txt b/data/group-00/hash_list.txt
@@ -0,0 +1,40 @@
+5eaa2e01f05cee170e576f5b1e3d4661c75af764 group-00/sub-01/func/sub-01_task-taskzero_run-01_bold.nii.gz
+7e39dbebcb9504b26dc90ab97da6925c7229ecdd group-00/sub-01/func/sub-01_task-taskzero_run-01_events.tsv
+164738abad431b2e251d32ae5df39ac32d492662 group-00/sub-01/func/sub-01_task-taskzero_run-02_bold.nii.gz
+ab264a6822196940f06044f2fcbb611fc5954441 group-00/sub-01/func/sub-01_task-taskzero_run-02_events.tsv
+25a80c818325e9b94c604d0cb7ad02fc143eba64 group-00/sub-02/func/sub-02_task-taskzero_run-01_bold.nii.gz
+60d3100d2f98cd467482bd0b0a86aab87f63250c group-00/sub-02/func/sub-02_task-taskzero_run-01_events.tsv
+20708ce8c3fceb23b70818504103cb54dc11cd45 group-00/sub-02/func/sub-02_task-taskzero_run-02_bold.nii.gz
+97c6e697ad5d2837bce14169eca4bf80ed074d70 group-00/sub-02/func/sub-02_task-taskzero_run-02_events.tsv
+a5b57e14242e1333cdd5aaf7b0feb6d612d5bd52 group-00/sub-03/func/sub-03_task-taskzero_run-01_bold.nii.gz
+13a9fcaefc9e2f3f8d0f962246f894ec3f4cd13c group-00/sub-03/func/sub-03_task-taskzero_run-01_events.tsv
+bef1f787f29a6d76d4779d60da3a1d9ce69cdc22 group-00/sub-03/func/sub-03_task-taskzero_run-02_bold.nii.gz
+2e432cde2492a32c98c85e393a387a8042cac39c group-00/sub-03/func/sub-03_task-taskzero_run-02_events.tsv
+2b43b2821ea004ddbb172085ab7d52e7e679d07e group-00/sub-04/func/sub-04_task-taskzero_run-01_bold.nii.gz
+ec65e4be6b734b11e6b7d78bbb6088a6f8b50b53 group-00/sub-04/func/sub-04_task-taskzero_run-01_events.tsv
+e0fc9db87118f0765f2fb82c3b3915d72a7f50aa group-00/sub-04/func/sub-04_task-taskzero_run-02_bold.nii.gz
+486096db175b3e5fd173fb05b91035a90ee63c9b group-00/sub-04/func/sub-04_task-taskzero_run-02_events.tsv
+a04d0d79245e0dc301789ec02a03e98ce1b7a9b5 group-00/sub-05/func/sub-05_task-taskzero_run-01_bold.nii.gz
+dc2ab8324999e0a7317e6551305c476d1c0160ae group-00/sub-05/func/sub-05_task-taskzero_run-01_events.tsv
+69c697238d7930f07b2339b2afd4a7bf11a52f18 group-00/sub-05/func/sub-05_task-taskzero_run-02_bold.nii.gz
+76692e6435353d8b7fd3885f69a10bc64614f55f group-00/sub-05/func/sub-05_task-taskzero_run-02_events.tsv
+aaa411bb54454d39a9c245d5c4446899c17a495a group-00/sub-06/func/sub-06_task-taskzero_run-01_bold.nii.gz
+fd5f7feed8ddd12ffe4467a4020cc5931cd49b26 group-00/sub-06/func/sub-06_task-taskzero_run-01_events.tsv
+9a18f7addc6f9a13b1a4d06b631e67cc20380ea8 group-00/sub-06/func/sub-06_task-taskzero_run-02_bold.nii.gz
+2fe32e9ca3338512db646eb3da8b54d3a918ed53 group-00/sub-06/func/sub-06_task-taskzero_run-02_events.tsv
+86ecf9bdf2b834233c855569a64cc3ef0698522a group-00/sub-07/func/sub-07_task-taskzero_run-01_bold.nii.gz
+e80fd4d390c5be3635a963c14fb5c0d59b740e13 group-00/sub-07/func/sub-07_task-taskzero_run-01_events.tsv
+8c2f0d942a7c99c370f9544bc323714708b8e195 group-00/sub-07/func/sub-07_task-taskzero_run-02_bold.nii.gz
+b7f85a9b85e834727cfefcf356b9f69cd6f1ad46 group-00/sub-07/func/sub-07_task-taskzero_run-02_events.tsv
+dfce82fbf7b3e02d658918fa1341db093ab6e3c4 group-00/sub-08/func/sub-08_task-taskzero_run-01_bold.nii.gz
+a4eb6744d5e50a6783d3f365bbd5398151a650f3 group-00/sub-08/func/sub-08_task-taskzero_run-01_events.tsv
+4b416251f92ecb10f8c084878049001a224860e2 group-00/sub-08/func/sub-08_task-taskzero_run-02_bold.nii.gz
+ce183e545542d7c82b885bb27cb4c4bdfdb1e4b6 group-00/sub-08/func/sub-08_task-taskzero_run-02_events.tsv
+7df011856082ac011ac8b191f28e57d12f84bf67 group-00/sub-09/func/sub-09_task-taskzero_run-01_bold.nii.gz
+d527ddaee0ea05fdcedb41ef54b2107b655b05d8 group-00/sub-09/func/sub-09_task-taskzero_run-01_events.tsv
+36daa50dd2cbd064d652ad519457ad914f2fde12 group-00/sub-09/func/sub-09_task-taskzero_run-02_bold.nii.gz
+0734244bb63b92c814fa348bff4eee177bd0ea80 group-00/sub-09/func/sub-09_task-taskzero_run-02_events.tsv
+cdfa850cb3b626158bb28aa798803c11a5cd0049 group-00/sub-10/func/sub-10_task-taskzero_run-01_bold.nii.gz
+2ef1140403852426ed68f5e93e8bf228cb975ff0 group-00/sub-10/func/sub-10_task-taskzero_run-01_events.tsv
+b500a60b6ee1372317a93ebdfc5ae111082cb4fc group-00/sub-10/func/sub-10_task-taskzero_run-02_bold.nii.gz
+ab58f5933ca0cb8234a8d6f3713d07692c689437 group-00/sub-10/func/sub-10_task-taskzero_run-02_events.tsv
diff --git a/findoutlie/detectors.py b/findoutlie/detectors.py
@@ -0,0 +1,53 @@
+""" Utilities for detecting outliers
+
+These functions take a vector of values, and return a boolean vector of the
+same length as the input, where True indicates the corresponding value is an
+outlier.
+
+The outlier detection routines will likely be adapted to the specific measure
+that is being worked on.  So, some detector functions will work on values > 0,
+others on normally distributed values etc.  The routines should check that
+their requirements are met and raise an error otherwise.
+"""
+
+# Any imports you need
+# +++your code here+++
+
+
+def iqr_detector(measures, iqr_proportion=1.5):
+    """ Detect outliers in `measures` using interquartile range.
+
+    Returns a boolean vector of same length as `measures`, where True means the
+    corresponding value in `measures` is an outlier.
+
+    Call Q1, Q2 and Q3 the 25th, 50th and 75th percentiles of `measures`.
+
+    The interquartile range (IQR) is Q3 - Q1.
+
+    An outlier is any value in `measures` that is either:
+
+    * > Q3 + IQR * `iqr_proportion` or
+    * < Q1 - IQR * `iqr_proportion`.
+
+    See: https://en.wikipedia.org/wiki/Interquartile_range
+
+    Parameters
+    ----------
+    measures : 1D array
+        Values for which we will detect outliers
+    iqr_proportion : float, optional
+        Scalar to multiply the IQR to form upper and lower threshold (see
+        above).  Default is 1.5.
+
+    Returns
+    -------
+    outlier_tf : 1D boolean array
+        A boolean vector of same length as `measures`, where True means the
+        corresponding value in `measures` is an outlier.
+    """
+    # Any imports you need
+    # Hints:
+    # * investigate np.percentile (to find Q1 and Q3)
+    # * You'll likely need np.logical_or (to combine the two threshold tests)
+    #   see: https://textbook.nipraxis.org/numpy_logical.html
+    # +++your code here+++
diff --git a/findoutlie/spm_funcs.py b/findoutlie/spm_funcs.py
@@ -0,0 +1,56 @@
+"""
+This module defines functions implementing algorithms in SPM
+
+In the same directory as this file, you will find a 'tests' directory.
+
+Test this module with:
+
+    python3 findoutlie/tests/test_spm_funcs.py
+
+or better, in IPython::
+
+    %run findoutlie/tests/test_spm_funcs.py
+"""
+
+import numpy as np
+
+import nibabel as nib
+
+
+def spm_global(vol):
+    """ Calculate SPM global metric for array `vol`
+
+    Parameters
+    ----------
+    vol : array
+        Array giving image data, usually 3D.
+
+    Returns
+    -------
+    g : float
+        SPM global metric for `vol`
+    """
+    T = np.mean(vol) / 8  # threshold: one eighth of the mean over all of `vol`
+    return np.mean(vol[vol > T])  # mean of only the values above threshold
+
+
+def get_spm_globals(fname):
+    """ Calculate SPM global metrics for volumes in image filename `fname`
+
+    Parameters
+    ----------
+    fname : str
+        Filename of file containing 4D image
+
+    Returns
+    -------
+    spm_vals : array
+        SPM global metric for each 3D volume in the 4D image.
+    """
+    img = nib.load(fname)
+    data = img.get_fdata()  # image data as a floating point array
+    spm_vals = []
+    for i in range(data.shape[-1]):  # step along the last (volume) axis
+        vol = data[..., i]  # everything at index `i` of the last axis
+        spm_vals.append(spm_global(vol))
+    return np.array(spm_vals)  # one global value per volume
diff --git a/findoutlie/tests/ds107_sub012_t1r2_small.nii b/findoutlie/tests/ds107_sub012_t1r2_small.nii
diff --git a/findoutlie/tests/get_global_signals.m b/findoutlie/tests/get_global_signals.m
@@ -0,0 +1,11 @@
+% Get SPM global signal estimate for all volumes in the smaller 4D BOLD image
+file_volno = spm_select('ExtFPList', pwd, 'ds107_sub012_t1r2_small.nii', inf);
+V = spm_vol(file_volno);
+global_signals = ones([length(V), 1]);  % preallocate: one value per entry in V
+for i = 1:length(V)
+    global_signals(i) = spm_global(V(i));  % SPM's own global for entry i
+end
+% Save signal values to a text file, one per line with two decimal places
+fid = fopen('global_signals.txt','w');
+fprintf(fid,'%6.2f\n', global_signals);
+fclose(fid);
diff --git a/findoutlie/tests/global_signals.txt b/findoutlie/tests/global_signals.txt
@@ -0,0 +1,10 @@
+376.53
+375.75
+375.26
+376.01
+376.83
+374.15
+372.54
+373.49
+374.23
+374.46
diff --git a/findoutlie/tests/test_detectors.py b/findoutlie/tests/test_detectors.py
@@ -0,0 +1,45 @@
+""" Test script for detector functions
+
+Run these tests with::
+
+    python3 findoutlie/tests/test_detectors.py
+
+or better, in IPython::
+
+    %run findoutlie/tests/test_detectors.py
+"""
+
+from pathlib import Path
+import sys
+
+MY_DIR = Path(__file__).parent
+
+# Here you should add the directory containing the findoutlie
+# directory to the Python path.
+# Hint: sys.path
+# Hint: see the solutions if you are stuck.
+# +++your code here+++
+
+import numpy as np
+
+# This import needs the directory holding detectors.py on the Python path
+# (NOTE(review): that is findoutlie itself, not its parent -- confirm).
+from detectors import iqr_detector
+
+
+def test_iqr_detector():
+    # Example values from: http://www.purplemath.com/modules/boxwhisk3.htm
+    example_values = np.array(
+        [10.2, 14.1, 14.4, 14.4, 14.4, 14.5, 14.5, 14.6, 14.7, 14.7, 14.7,
+         14.9, 15.1, 15.9, 16.4])
+    is_outlier = iqr_detector(example_values, 1.5)  # default, given explicitly
+    assert np.all(example_values[is_outlier] == [10.2, 15.9, 16.4])
+    # Test non-default value for outlier proportion (narrower thresholds)
+    is_outlier = iqr_detector(example_values, 0.5)
+    assert np.all(example_values[is_outlier] == [10.2, 14.1, 15.1, 15.9, 16.4])
+
+
+if __name__ == '__main__':
+    # File being executed as a script
+    test_iqr_detector()
+    print('Tests passed')
diff --git a/findoutlie/tests/test_spm_funcs.py b/findoutlie/tests/test_spm_funcs.py
@@ -0,0 +1,52 @@
+""" Test script for SPM functions
+
+Run these tests with::
+
+    python3 findoutlie/tests/test_spm_funcs.py
+
+or better, in IPython::
+
+    %run findoutlie/tests/test_spm_funcs.py
+"""
+
+from pathlib import Path
+import sys
+
+MY_DIR = Path(__file__).parent
+EXAMPLE_FILENAME = 'ds107_sub012_t1r2_small.nii'
+
+# Here you should add the directory containing the findoutlie
+# directory to the Python path.
+# Hint: sys.path
+# Hint: see the solutions if you are stuck.
+# +++your code here+++
+
+import numpy as np
+
+import nibabel as nib
+
+# This import needs the directory holding spm_funcs.py on the Python path
+# (NOTE(review): that is findoutlie itself, not its parent -- confirm).
+from spm_funcs import get_spm_globals, spm_global
+
+
+def test_spm_globals():
+    # Test get_spm_globals and spm_global functions
+    example_path = MY_DIR / EXAMPLE_FILENAME
+    expected_values = np.loadtxt(MY_DIR / 'global_signals.txt')
+    glob_vals = get_spm_globals(example_path)
+    assert glob_vals is not None, 'Did you forget to return the values?'
+    assert np.allclose(glob_vals, expected_values, rtol=1e-4)
+    img = nib.load(example_path)
+    data = img.get_fdata()
+    globals = []
+    for vol_no in range(data.shape[-1]):
+        vol = data[..., vol_no]
+        globals.append(spm_global(vol))
+    assert np.allclose(globals, expected_values, rtol=1e-4)
+
+
+if __name__ == '__main__':
+    # File being executed as a script
+    test_spm_globals()
+    print('Tests passed')
diff --git a/scripts/validate_data.py b/scripts/validate_data.py
@@ -2,13 +2,13 @@
 
 Run as:
 
-    python3 scripts/validata_data.py data
+    python3 scripts/validate_data.py
 """
 
 from pathlib import Path
-import sys
 import hashlib
 
+
 def file_hash(filename):
     """ Get byte contents of file `filename`, return SHA1 hash
 
@@ -53,19 +53,22 @@ def validate_data(data_directory):
     # If hash for filename is not the same as the one in the file, raise
     # ValueError
     # This is a placeholder, replace it to write your solution.
-    raise NotImplementedError('This is just a template -- you are expected to code this.')
+    raise NotImplementedError(
+        'This is just a template -- fill out the template with code.')
 
 
 def main():
     # This function (main) called when this file run as a script.
-    #
-    # Get the data directory from the command line arguments
-    if len(sys.argv) < 2:
-        raise RuntimeError("Please give data directory on "
-                           "command line")
-    data_directory = sys.argv[1]
+    group_directory = (Path(__file__).parent.parent / 'data')
+    groups = list(group_directory.glob('group-??'))
+    if len(groups) == 0:
+        raise RuntimeError('No group directory in data directory: '
+                           'have you downloaded and unpacked the data?')
+
+    if len(groups) > 1:
+        raise RuntimeError('Too many group directories in data directory')
     # Call function to validate data in data directory
-    validate_data(data_directory)
+    validate_data(groups[0])
 
 
 if __name__ == '__main__':
diff --git a/solutions/.solutions.toml b/solutions/.solutions.toml
@@ -5,3 +5,12 @@ out_path = '{one_down}/findoutlie/metrics.py'
 
 [solution.validate_data]
 out_path = '{one_down}/scripts/validate_data.py'
+
+[solution.test_spm_funcs]
+out_path = '{one_down}/findoutlie/tests/test_spm_funcs.py'
+
+[solution.detectors]
+out_path = '{one_down}/findoutlie/detectors.py'
+
+[solution.test_detectors]
+out_path = '{one_down}/findoutlie/tests/test_detectors.py'
diff --git a/solutions/detectors.py b/solutions/detectors.py
@@ -0,0 +1,65 @@
+""" Utilities for detecting outliers
+
+These functions take a vector of values, and return a boolean vector of the
+same length as the input, where True indicates the corresponding value is an
+outlier.
+
+The outlier detection routines will likely be adapted to the specific measure
+that is being worked on.  So, some detector functions will work on values > 0,
+others on normally distributed values etc.  The routines should check that
+their requirements are met and raise an error otherwise.
+"""
+
+# Any imports you need
+# LAB(begin solution)
+import numpy as np
+# LAB(replace solution)
+# +++your code here+++
+# LAB(end solution)
+
+
+def iqr_detector(measures, iqr_proportion=1.5):
+    """ Detect outliers in `measures` using interquartile range.
+
+    Returns a boolean vector of same length as `measures`, where True means the
+    corresponding value in `measures` is an outlier.
+
+    Call Q1, Q2 and Q3 the 25th, 50th and 75th percentiles of `measures`.
+
+    The interquartile range (IQR) is Q3 - Q1.
+
+    An outlier is any value in `measures` that is either:
+
+    * > Q3 + IQR * `iqr_proportion` or
+    * < Q1 - IQR * `iqr_proportion`.
+
+    See: https://en.wikipedia.org/wiki/Interquartile_range
+
+    Parameters
+    ----------
+    measures : 1D array
+        Values for which we will detect outliers
+    iqr_proportion : float, optional
+        Scalar to multiply the IQR to form upper and lower threshold (see
+        above).  Default is 1.5.
+
+    Returns
+    -------
+    outlier_tf : 1D boolean array
+        A boolean vector of same length as `measures`, where True means the
+        corresponding value in `measures` is an outlier.
+    """
+    # Any imports you need
+    # LAB(begin solution)
+    q1, q3 = np.percentile(measures, [25, 75])  # Q1 and Q3
+    iqr = q3 - q1  # interquartile range
+    up_thresh = q3 + iqr * iqr_proportion  # values above this are outliers
+    down_thresh = q1 - iqr * iqr_proportion  # values below this are outliers
+    return np.logical_or(measures > up_thresh, measures < down_thresh)
+    # LAB(replace solution)
+    # Hints:
+    # * investigate np.percentile (to find Q1 and Q3)
+    # * You'll likely need np.logical_or (to combine the two threshold tests)
+    #   see: https://textbook.nipraxis.org/numpy_logical.html
+    # +++your code here+++
+    # LAB(end solution)
diff --git a/solutions/test_detectors.py b/solutions/test_detectors.py
diff --git a/solutions/test_spm_funcs.py b/solutions/test_spm_funcs.py
diff --git a/solutions/validate_data.py b/solutions/validate_data.py
diff --git a/solutions/write_solutions.sh b/solutions/write_solutions.sh

-Original file line number
+Diff line change
@@ @@ -0,0 +1,10 @@ @@
 +376.53
 +375.75
 +375.26
 +376.01
 +376.83
 +374.15
 +372.54
 +373.49
 +374.23
 +374.46