Merge pull request #24 from jagruti8/add-detectors-metrics

jagruti8 · web-flow · commit e42d140a0550 · 2022-10-24T16:00:26.000+02:00
Added a new detector, modified README.md file and added an algorithm.txt file
diff --git a/README.md b/README.md
@@ -8,6 +8,24 @@ You should put the code in this `findoutlie` directory on your Python PATH.
 
 This README file has instructions on how to get, validate and process the data.
 
+## Clone the repository
+
+```
+git clone git@github.com:nipraxis-fall-2022/diagnostics-NME.git
+```
+
+## Open the repository
+
+```
+cd diagnostics-NME
+```
+
+## Install the dependencies
+
+Make sure to install everything listed in `requirements.txt` using `pip`:
+```
+pip3 install --user scipy matplotlib pandas scikit-image sympy nibabel jupyter ipython jupytext nipraxis okpy
+```
 ## Get the data
 
 ```
@@ -16,7 +34,13 @@ curl -L https://figshare.com/ndownloader/files/34951650 -o group_data.tar
 tar xvf group_data.tar
 ```
 
-Add the hash_list file to Git:
+First, check whether `hash_list.txt` has already been added to Git:
+```
+git status
+```
+
+If `git status` reports no modification, `hash_list.txt` is already added to Git.
+Otherwise, add the hash_list file to Git:
 
 ```
 git add data/group-*/hash_list.txt
@@ -35,6 +59,42 @@ cd ..
 python3 scripts/validate_data.py data
 ```
 
+## Install the new directory module 'findoutlie'
+
+To do this, first install the Flit Python package manager.
+Flit is a system for configuring and installing modules.
+You may be able to omit the `--user` option below:
+```
+python3 -m pip install --user flit
+```
+
+Next, install the module using Flit. Here the command differs on Windows compared to Linux or macOS.
+
+For macOS and Linux:
+
+(See below for Windows command)
+Use Flit to install the module.
+
+```
+python3 -m flit install --user -s
+```
+
+For Windows:
+(See above for macOS and Linux)
+Use Flit to install the module.
+
+```
+python3 -m flit install --user --pth-file
+```
+
+Now test that you can import the 'findoutlie' module by running the command below. The -c flag tells Python to run the code that follows the -c flag.
+
+```
+python3 -c 'import findoutlie'
+```
+
+This should give no error, because the previous step installed the 'findoutlie' directory module to somewhere on Python's search path. 
+
 ## Find outliers
 
 ```
@@ -54,9 +114,24 @@ identified as an outlier.  0 refers to the first volume.  For example (these
 outlier IDs are completely random, for illustration):
 
 ```
-data/sub-01/func/sub-01_task-taskzero_run-01_bold.nii.gz, 3, 21, 22, 104
-data/sub-01/func/sub-01_task-taskzero_run-02_bold.nii.gz, 11, 33, 91
-data/sub-03/func/sub-03_task-taskzero_run-02_bold.nii.gz, 101, 102, 132
-data/sub-08/func/sub-08_task-taskzero_run-01_bold.nii.gz, 0, 1, 2, 166, 167
-data/sub-09/func/sub-08_task-taskzero_run-01_bold.nii.gz, 3
+data/group-01/sub-08/func/sub-08_task-taskzero_run-01_bold.nii.gz, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 133, 134, 135, 136, 154, 155, 157
+data/group-01/sub-08/func/sub-08_task-taskzero_run-02_bold.nii.gz, 0, 1, 2, 3, 4, 5, 6, 9, 17, 53, 54, 63, 78, 79, 151, 152, 153
+data/group-01/sub-01/func/sub-01_task-taskzero_run-01_bold.nii.gz, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 21, 157, 158
+data/group-01/sub-01/func/sub-01_task-taskzero_run-02_bold.nii.gz, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 17, 19, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160
+data/group-01/sub-06/func/sub-06_task-taskzero_run-02_bold.nii.gz, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 153, 154, 155, 156, 157, 158, 159, 160, 161
+data/group-01/sub-06/func/sub-06_task-taskzero_run-01_bold.nii.gz, 0, 1, 2, 3, 4, 5, 6, 7, 8, 19, 24, 25, 26, 27, 28, 29, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159
+data/group-01/sub-07/func/sub-07_task-taskzero_run-02_bold.nii.gz, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28
+data/group-01/sub-07/func/sub-07_task-taskzero_run-01_bold.nii.gz, 0, 1, 2, 3, 4, 5, 6, 7, 132, 136, 137, 138, 139, 140, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161
+data/group-01/sub-09/func/sub-09_task-taskzero_run-01_bold.nii.gz, 0, 134, 135, 136, 143, 144, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160
+data/group-01/sub-09/func/sub-09_task-taskzero_run-02_bold.nii.gz, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 36, 79, 80, 150, 151, 152, 153, 154, 155, 156, 157
+data/group-01/sub-10/func/sub-10_task-taskzero_run-02_bold.nii.gz, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 26, 104
+data/group-01/sub-10/func/sub-10_task-taskzero_run-01_bold.nii.gz, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159
+data/group-01/sub-05/func/sub-05_task-taskzero_run-01_bold.nii.gz, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 25, 26, 27, 48, 49, 52, 76, 77, 150
+data/group-01/sub-05/func/sub-05_task-taskzero_run-02_bold.nii.gz, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 50, 51, 52, 54, 157, 158, 159, 160, 161
+data/group-01/sub-02/func/sub-02_task-taskzero_run-02_bold.nii.gz, 34, 65, 105, 106, 107, 135, 140, 148
+data/group-01/sub-02/func/sub-02_task-taskzero_run-01_bold.nii.gz, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+data/group-01/sub-03/func/sub-03_task-taskzero_run-02_bold.nii.gz, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 101, 102, 103, 160, 161
+data/group-01/sub-03/func/sub-03_task-taskzero_run-01_bold.nii.gz, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 136, 137, 138, 139, 140, 142, 156, 157, 158, 159
+data/group-01/sub-04/func/sub-04_task-taskzero_run-01_bold.nii.gz, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 59, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161
+data/group-01/sub-04/func/sub-04_task-taskzero_run-02_bold.nii.gz, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 32, 33, 34, 35, 36, 49, 50, 52, 53, 54, 55, 57, 58, 148, 149, 157
 ```
diff --git a/algorithm.txt b/algorithm.txt
@@ -0,0 +1,10 @@
+Algorithm:
+
+1. The 4D image is first segmented (using Otsu's threshold) to separate the brain voxels (x) from the background.
+2. The median (med_voxel(x)) and median absolute deviation (mad_voxel(x)) over time are calculated for each brain voxel x.
+3. The values of a brain voxel lying outside the interval [med_voxel(x)-a*mad_voxel(x), med_voxel(x)+a*mad_voxel(x)] are considered outliers, with a = 3.5.
+4. For each time t, the number of outlying voxels n(t) is counted.
+5. The median (n_med) and MAD (n_mad) of n(t) are calculated. Any time t with n(t) > n_med + 3.5*n_mad is considered an outlier.
+
+References:
+1. Cox, R.W. Outlier Detection in FMRI Time Series. ISMRM (2002).
diff --git a/findoutlie/detectors.py b/findoutlie/detectors.py
@@ -14,6 +14,65 @@
 # +++your code here+++
 import numpy as np
 
+from scipy.stats import norm
+
+def mad_voxel_detector(img,threshold=3.5):
+    """ Detect outliers in `img` using the median absolute deviation (MAD).
+
+    For each row (voxel) of `img`, `med` is the median and `mad` the median
+    absolute deviation across the last axis, both computed ignoring NaNs.
+    A value is an outlier when ``abs(value - med) > threshold * mad``.
+
+    Parameters
+    ----------
+    img : 2D array
+        Values for which we will detect outliers, one voxel per row.
+    threshold : float, optional
+        Scalar to multiply the median absolute deviation
+        to form the upper and lower threshold. Default is 3.5.
+
+    Returns
+    -------
+    outlier_tf : 2D boolean array
+        Boolean array of same shape as `img`, where True means the
+        corresponding value in `img` is an outlier.
+    """
+    # Median of each row, kept as a column so it broadcasts against `img`
+    med = np.expand_dims(np.nanmedian(img,axis=-1),axis=1)
+    # Median absolute deviation of each row (median of |img - med|)
+    mad = np.expand_dims(np.nanmedian(np.abs(img-med),axis=-1),axis=1)
+    # Flag values further than threshold * mad from their row median
+    outlier_tf = np.abs(img-med)>(threshold*mad)
+    return outlier_tf
+
+def mad_time_detector(measures, threshold=3.5):
+    """ Detect high outliers in `measures` using median absolute deviation.
+
+    With `med` the median and `mad` the median absolute deviation of
+    `measures`, a value is an outlier when it exceeds ``med + threshold * mad``.
+    Only the upper side is tested; low values are never flagged.
+
+    Parameters
+    ----------
+    measures : 1D array
+        Values for which we will detect outliers
+    threshold : float, optional
+        Scalar to multiply the median absolute deviation to form the upper
+        threshold. Default is 3.5.
+
+    Returns
+    -------
+    outlier_tf : 1D boolean array
+        1D boolean array of same length as `measures`, where True means the
+        corresponding value in `measures` is an outlier.
+    """
+    # Calculate median of measures
+    med = np.median(measures)
+    # Calculate median absolute deviation of measures
+    mad = np.median(np.abs(measures-med))
+    # One-sided test: only values above med + threshold * mad are outliers
+    outlier_tf = measures>med+threshold*mad
+    return outlier_tf
 
 def iqr_detector(measures, iqr_proportion=1.5):
     """ Detect outliers in `measures` using interquartile range.
@@ -59,6 +118,6 @@ def iqr_detector(measures, iqr_proportion=1.5):
     # Calculate the interquartile range
     IQR = Q3 - Q1
     # Calculate the outliers
-    outliers = np.logical_or(measures > (Q3 + IQR * iqr_proportion), measures < (Q1 - IQR * iqr_proportion))
-    return outliers
+    outlier_tf = np.logical_or(measures > (Q3 + IQR * iqr_proportion), measures < (Q1 - IQR * iqr_proportion))
+    return outlier_tf
     
diff --git a/findoutlie/outfind.py b/findoutlie/outfind.py
@@ -7,8 +7,61 @@
 
 import nibabel as nib
 
+from skimage.filters import threshold_otsu
+
 from .metrics import dvars
-from .detectors import iqr_detector
+from .detectors import iqr_detector,mad_voxel_detector,mad_time_detector
+
+def segment_brain(img):
+    """ Segment the brain from the background and return only brain voxels
+    Parameters
+    ----------
+    img : array
+        2D array with voxels in rows and timepoints in columns
+    Returns
+    -------
+    brain_voxels : array
+        2D array containing only brain voxels in rows and timepoints in columns
+    """
+    # calculate the mean of each voxel over time
+    mean_img = np.mean(img, axis=-1)
+    # Otsu threshold on the per-voxel means separates brain from background
+    threshold = threshold_otsu(mean_img)
+    mask = np.expand_dims(mean_img > threshold, axis=1)
+    mask_2D = np.tile(mask, (1, img.shape[-1]))
+    thresholded_img = np.where(mask_2D, img, np.nan)
+    # drop rows that are entirely NaN, i.e. the masked-out background voxels
+    brain_voxels = thresholded_img[~np.isnan(thresholded_img).all(axis=1)]
+    return brain_voxels
+    
+def detect_outliers_mean_absolute_deviation_mask(fname):
+    """ Detect outlier volumes given image file path `fname`
+
+    Parameters
+    ----------
+    fname : str or Path
+        Filename of 4D image, as string or Path object
+
+    Returns
+    -------
+    outliers : array
+        Indices of outlier volumes.
+    """
+    # Brain voxels are segmented from the background first; despite the function name, the detectors then use the median absolute deviation (MAD)
+    img = nib.load(fname)
+    img_data = img.get_fdata()
+    # reshape from 4D to 2D (voxels in rows, timepoints in columns)
+    img_data_2D = np.reshape(img_data, (-1,img_data.shape[-1]))
+    # segment brain from background
+    brain_voxels = segment_brain(img_data_2D)
+    # flag, per voxel, the timepoints that are MAD outliers
+    outliers_voxel = mad_voxel_detector(brain_voxels)
+    # calculate the number of outlying voxels for each time point
+    voxel_outliers_per_time = np.nansum(outliers_voxel,axis=0)
+    # find the timepoints with an unusually high number of outlying voxels
+    outliers_time = mad_time_detector(voxel_outliers_per_time)
+    # Return indices of True values from Boolean array.
+    return np.nonzero(outliers_time)[0]
 
 
 def detect_outliers(fname):
@@ -29,7 +82,7 @@ def detect_outliers(fname):
     dvs = dvars(img)
     is_outlier = iqr_detector(dvs, iqr_proportion=2)
     # Return indices of True values from Boolean array.
-    return np.nonzero(is_outlier)
+    return np.nonzero(is_outlier)[0]
 
 
 def find_outliers(data_directory):
@@ -49,6 +102,7 @@ def find_outliers(data_directory):
     image_fnames = Path(data_directory).glob("**/sub-*.nii.gz")
     outlier_dict = {}
     for fname in image_fnames:
-        outliers = detect_outliers(fname)
+        outliers = detect_outliers_mean_absolute_deviation_mask(fname)
+        #outliers = detect_outliers(fname)
         outlier_dict[fname] = outliers
     return outlier_dict