NF: start cleanup of redundant code in util

cindeem · cindeem · commit 9b514f6c3fc9 · 2013-08-20T17:53:34.000-07:00
diff --git a/brainx/tests/test_util.py b/brainx/tests/test_util.py
@@ -28,6 +28,17 @@ def test_slice_data():
     npt.assert_raises(IndexError, util.slice_data, data_5d, subjects, blocks)
 
 
+def test_all_positive():
+    jnk = np.random.random(40)
+    npt.assert_equal(util.all_positive(jnk), True)
+    # zeros counted as positive
+    jnk[0] = 0
+    npt.assert_equal(util.all_positive(jnk), True)
+    # find real negative
+    jnk = jnk - 0.5
+    npt.assert_equal(util.all_positive(jnk), False) 
+
+    
 def test_cost_size():
     n_nodes = 5
     npt.assert_warns(DeprecationWarning, util.cost_size, n_nodes)
@@ -116,4 +127,20 @@ def test_no_empty_modules():
     b[2] = []
     util.assert_no_empty_modules(a)
     nt.assert_raises(ValueError, util.assert_no_empty_modules, b)
-    
+
+def test_rescale_arr():
+    array = np.arange(5)
+    scaled = util.rescale_arr(array, 3, 6)
+    npt.assert_equal(scaled.min(), 3)
+    scaled = util.rescale_arr(array, -10, 10)
+    npt.assert_equal(scaled.min(), -10)
+    npt.assert_equal(scaled.max(), 10)
+
+def test_normalize():
+    array = np.arange(5)
+    result = util.normalize(array)
+    npt.assert_equal(result.min(), 0)
+    npt.assert_equal(result.max(), 1)
+    npt.assert_raises(ValueError, util.normalize, array, 'blueberry', (0,2)) 
+
+
diff --git a/brainx/util.py b/brainx/util.py
@@ -4,10 +4,8 @@
 #-----------------------------------------------------------------------------
 # Imports
 #-----------------------------------------------------------------------------
-from __future__ import print_function
 
 import warnings
-
 import numpy as np
 import networkx as nx
 
@@ -40,15 +38,22 @@ def slice_data(data, sub, block, subcond=None):
 
 
 def format_matrix(data,s,b,lk,co,idc = [],costlist=[],nouptri = False):
-    """ Function which formats matrix for a particular subject and particular block (thresholds, upper-tris it) so that we can make a graph object out of it
+    """ Function which thresholds the adjacency matrix for a particular 
+    subject and particular block, using lookuptable to find thresholds, 
+    cost value to find threshold, costlist
+    (thresholds, upper-tris it) so that we can use it with simulated annealing
 
-    Parameters:
+    Parameters
     -----------
-    data = full data array
+    data : full data array 4D (block, sub, node, node)
     s = subject
     b = block
     lk = lookup table for study
     co = cost value to threshold at
+    idc = index of ideal cost
+    costlist = list (size num_edges) with ordered values used to find 
+        threshold to control number of edges
+    nouptri = if False only keeps upper tri, True yields symmetric matrix
 """
 
     cmat = data[b,s]
@@ -65,15 +70,28 @@ def format_matrix(data,s,b,lk,co,idc = [],costlist=[],nouptri = False):
 def format_matrix2(data,s,sc,c,lk,co,idc = [],costlist=[],nouptri = False):
     """ Function which formats matrix for a particular subject and particular block (thresholds, upper-tris it) so that we can make a graph object out of it
 
-    Parameters:
-    -----------
-    data = full data array
-    s = subject
-    b = block
-    lk = lookup table for study
-    co = cost value to threshold at
+    Parameters
+    ----------
+    data : numpy array
+        full data array 5D (subcondition, condition, subject, node, node) 
+    s : int
+        index of subject
+    sc : int
+        index of sub condition
+    c : int
+        index of condition
+    lk : numpy array
+        lookup table for thresholds at each possible cost
+    co : float
+        cost value to threshold at
+    idc : float
+        ideal cost 
+    costlist : list
+        list of possible costs
+    nouptri : bool
+        False zeros out diag and below, True returns symmetric matrix
 """
-
+    
     cmat = data[sc,c,s]
     th = cost2thresh2(co,s,sc,c,lk,[],idc,costlist) #get the right threshold
     
@@ -85,6 +103,46 @@ def format_matrix2(data,s,sc,c,lk,co,idc = [],costlist=[],nouptri = False):
         
     return cmat
 
+def threshold_adjacency_matrix(adj_matrix, cost):
+    """docstring for threshold_adjacency_matrix(adj_matrix, cost"""
+    
+    pass
+
+
+def all_positive(adjacency_matrix):
+    """ checks if edge value sin adjacency matrix are all positive
+    or positive and negative 
+    Returns
+    -------
+    all_positive : bool
+        True if all values are >=0
+        False if values < 0
+    """
+    # add 1 so 0-> 1(True) , -1 -> 0 False
+    signs = set( np.sign(adjacency_matrix) + 1 )
+    return bool(sorted(signs)[0])
+
+
+def make_cost_thresh_lookup(adjacency_matrix):
+    """takes upper triangular (offset 1, no diagonal) of summetric 
+    adjacency matrix, sorts (lowest -> highest)
+    Returns
+    -------
+    lookup : numpy array
+        3 X number_of_edges, numpy array
+        row 0 is sorted thresholds
+        row 1 is cost at each threshold
+        row 2 is costs rounded to one decimal point
+    """
+
+    ind = np.triu_indices_from(adjacency_matrix, k = 1)
+    edges = adjacency_matrix[ind]
+    nedges = edges.shape[0]
+    lookup = np.zeros((3, nedges))
+    lookup[0,:] = sorted(edges)
+    lookup[1,:] = np.arange(nedges) / float(nedges)
+    lookup[2,:] = np.round(lookup[1,:], decimals = 1)
+    return lookup
 
 def cost_size(nnodes):
     warnings.warn('deprecated: use make_cost_array', DeprecationWarning)
@@ -125,9 +183,17 @@ def make_cost_array(n_nodes, cost=0.5):
     costs = np.array(range(int(tot_edges * cost)), dtype=float) / tot_edges
     return costs, tot_edges
     
+def metrics_to_pandas():
+    """docstring for metrics_to_pandas"""
+    pass
 
 def store_metrics(b, s, co, metd, arr):
-    """Store a set of metrics into a structured array"""
+    """Store a set of metrics into a structured array
+    b = block
+    s = subject
+    co = cost? float
+    metd = dict of metrics
+    arr : array?"""
 
     if arr.ndim == 3:
         idx = b,s,co
@@ -148,6 +214,8 @@ def regular_lattice(n,k):
 
     This type of graph is the starting point for the Watts-Strogatz small-world
     model, where connections are then rewired in a second phase.
+
+    XXX TODO Use as comparison, check networkx to see if its update worth redundancy
     """
     # Code simplified from the networkx.watts_strogatz_graph one
     G = nx.Graph()
@@ -282,18 +350,23 @@ def normalize(arr,mode='direct',folding_edges=None):
     """Normalize an array to [0,1] range.
 
     By default, this simply rescales the input array to [0,1].  But it has a
-    special 'folding' mode that allows for the normalization of an array with
-    negative and positive values by mapping the negative values to their
-    flipped sign
+    special 'folding' mode that allong absolute value of all values, in addition
+    values between the folding_edges (low_cutoff, high_cutoff)  will be zeroed.
 
     Parameters
     ----------
     arr : 1d array
-    
+        assumes dtype == float, if int32, will raise ValueError
+
     mode : string, one of ['direct','folding']
+        if direct rescale all values (pos and neg) between 0,1
+        if folding, zeros out values between folding_values (inclusive)
+        and normalizes absolute value of remaining values
 
     folding_edges : (float,float)
-      Only needed for folding mode, ignored in 'direct' mode.
+        (low_cutoff, high_cutoff) lower and upper values to zero out
+        (values are inclusive)
+        Only needed for folding mode, ignored in 'direct' mode.
 
     Examples
     --------
@@ -315,37 +388,23 @@ def normalize(arr,mode='direct',folding_edges=None):
     >>> c
     array([-0.8   , -0.6333, -0.4667, -0.3   ,  0.3   ,  0.4333,  0.5667,  0.7   ])
     >>> normalize(c,'folding',[-0.3,0.3])
-    array([ 1.    ,  0.6667,  0.3333,  0.    ,  0.    ,  0.2667,  0.5333,  0.8   ])
+    array([ 1.    ,  0.7917,  0.5833,  0.    ,  0.    ,  0.5417,  0.7083,  0.875   ])
     """
     if mode == 'direct':
         return rescale_arr(arr,0,1)
-    else:
-        fa, fb = folding_edges
+    elif mode == 'folding':
+        # cast folding_edges to floats in case inputs are ints
+        low_cutoff, high_cutoff = [float(x) for x in folding_edges]
         amin, amax = arr.min(), arr.max()
-        ra,rb = float(fa-amin),float(amax-fb) # in case inputs are ints
-        if ra<0 or rb<0:
+        low_diff, high_diff = low_cutoff-amin, amax-high_cutoff 
+        if low_diff < 0 or high_diff < 0:
             raise ValueError("folding edges must be within array range")
-        greater = arr>= fb
-        upper_idx = greater.nonzero()
-        lower_idx = (~greater).nonzero()
-        # Two folding scenarios, we map the thresholds to zero but the upper
-        # ranges must retain comparability.
-        if ra > rb:
-            lower = 1.0 - rescale_arr(arr[lower_idx],0,1.0)
-            upper = rescale_arr(arr[upper_idx],0,float(rb)/ra)
-        else:
-            upper = rescale_arr(arr[upper_idx],0,1)
-            # The lower range is trickier: we need to rescale it and then flip
-            # it, so the edge goes to 0.
-            resc_a = float(ra)/rb
-            lower = rescale_arr(arr[lower_idx],0,resc_a)
-            lower = resc_a - lower
-        # Now, make output array
-        out = np.empty_like(arr)
-        out[lower_idx] = lower
-        out[upper_idx] = upper
-        return out
-
+        mask = np.logical_and( arr >= low_cutoff, arr <= high_cutoff)
+        out = arr.copy()
+        out[mask] = 0
+        return rescale_arr(np.abs(out), 0, 1)
+    else:
+        raise ValueError('Unknown mode %s: valid options("direct", "folding")')
 
 def mat2graph(cmat,threshold=0.0,threshold2=None):
     """Make a weighted graph object out of an adjacency matrix.
@@ -559,45 +618,59 @@ def cost2thresh(cost, sub, bl, lk, idc=[], costlist=[]):
     return th
 
 
-def cost2thresh2(cost,sub,sc,c,lk,last,idc = [],costlist=[]):
-    """A definition for loading the lookup table and finding the threshold associated with a particular cost for a particular subject in a particular block
+def cost2thresh2(cost, sub, sc, c, lk, last = None, idc = [], costlist=[]):
+    """A definition for loading the lookup table and finding the threshold 
+    associated with a particular cost for a particular subject in a 
+    particular block of data
     
-    inputs:
-    cost: cost value for which we need the associated threshold
-    sub: subject number
-    bl: block number
-    lk: lookup table (block x subject x cost
-    last: last threshold value
-
-    output:
-    th: threshold value for this cost"""
-
-    #print cost,sub,bl
+    Inputs
+    ------
+    cost : float
+        cost value for which we need the associated threshold
+    sub : int 
+        (axis -2) subject number
+    axis1 : int
+        axis 1 into lookup (eg block number or condition)
+    axis0 : int
+        axis 0 into lookup (eg subcondition)
+    lk : numpy array 
+        lookup table (axis0 x axis1  x subject x 2 )
+    last : None
+        NOT USED last threshold value
+    idc : int or empty list
+        Index in costlist corresponding to cost currently being
+        processed.  By default, idc is an empty list.
+    costlist : array-like
+        List of costs that are being queried with the current function
+        in order.
     
-    ind=np.where(lk[sc,c,sub][1]==cost)
-    th=lk[sc,c,sub][0][ind]
+    Returns
+    -------
+    threshold : float
+        threshold value for this cost"""
+
+    subject_lookup = slice_data(lk, sub, c, subcond=sc) 
+    index = np.where(subject_lookup[1] == cost)
+    threshold = subject_lookup[0][ind]
     
-    if len(th)>1:
-        th=th[0] #if there are multiple thresholds, go down to the lower cost ####Is this right?!!!####
-        print('multiple thresh')
-    elif len(th)<1:
-        done = 1
-        while done:
-            idc = idc-1
-            newcost = costlist[idc]
-            print(idc,newcost)
-            ind=np.where(lk[bl][sub][1]==newcost)
-            th=lk[bl][sub][0][ind]
-            if len(th) > 1:
-                th = th[0]
-                done = 0
-        #th=last #if there is no associated thresh value because of repeats, just use the previous one
-        print('use previous thresh')
+    if len(threshold) > 1:
+        threshold = threshold[0] 
+        #if there are multiple thresholds, go down to the lower cost 
+        ####Is this right?!!!####
+        print('Subject %s has multiple thresholds at cost %s'%(sub, cost))
+        print('index 1: %s, index 2: %s'%(c, sc))
+    elif len(threshold) < 1:
+        idc = idc-1
+        newcost = costlist[idc]
+        threshold = cost2thresh2(newcost, sub, sc, c, lk, 
+                                 idc=idc, costlist = costlist) 
+        print(' '.join(['Subject %s does not have cost at %s'%(sub, cost),
+                        'index 1: %s, index 2: %s'%(c, sc),
+                        'nearest cost %s being used'%(newcost)]))
     else:
-        th=th[0]
+        threshold = threshold[0]
       
-    #print th    
-    return th
+    return threshold
 
 
 def apply_cost(corr_mat, cost, tot_edges):