Skip to content

Commit 29e70d2

Browse files
committed
0.0.1b10 and 0.0.1b11 released
1 parent df81e19 commit 29e70d2

31 files changed

+1145
-204
lines changed

README.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ This is a helper package for a variety of functions as described in the Overview
55
# Installation
66

77
* For standard installation > pip install more
8-
* For installing a particular version > pip install more==0.0.1b8
8+
* For installing a particular version > pip install more==0.0.1b10
99

1010
# Overview
1111

@@ -19,6 +19,17 @@ Check out the [examples](https://github.com/ngupta23/more/tree/master/examples)
1919

2020
# Version History
2121

22+
## 0.0.1b10 & 0.0.1b11
23+
24+
* Updated Visualization Helper to add function to plot Heatmap
25+
* Updated BaseClusterWithN to allow plotting of heatmap showing how "cluster feature means" vary between clusters
26+
* 0.0.1b11 included a small fix for a bug in 0.0.1b10
27+
28+
## 0.0.1b9
29+
30+
* Updated KMeans and Agglomerative Cluster Helpers to include evaluate_by argument
31+
32+
2233
## 0.0.1b8
2334

2435
* Added Cluster Helpers for KMeans and Agglomerative Clustering

build/lib/more/pandas_helper/__init__.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,54 @@ def drop_columns(self, drops, inplace=False):
108108
self.__set_all_feature_types()
109109
else:
110110
return(self._obj.drop(cols_to_del, axis=1, inplace=inplace))
111+
112+
def map_columns(self, mapping):
    """
    Rename columns of the underlying frame in place.

    mapping: dictionary mapping existing column names to new names.
    """
    # axis="columns" form is equivalent to the columns= keyword.
    self._obj.rename(mapping, axis="columns", inplace=True)
117+
118+
def add_columns(self, names, value=""):
    """
    Add one or more new columns, each filled with a single value for all rows.

    (Previously called add_new_col.)

    names: a single column name (str) or a list of column names to add.
    value: the value assigned to every row of each new column (default "").
    """
    # Bug fix: use isinstance instead of type(...) == str so str
    # subclasses are handled correctly.
    if isinstance(names, str):
        names = [names]
    for name in names:
        self._obj[name] = value
129+
130+
def filter_change(self, filterCol, filterValue, changeCol, changeValue):
    """
    Set changeCol to changeValue on every row where filterCol equals filterValue.

    filterCol: column name to filter by
    filterValue: value to match in filterCol
    changeCol: column whose values are updated
    changeValue: replacement value written into changeCol
    """
    matches = self._obj[filterCol] == filterValue
    self._obj.loc[matches, [changeCol]] = changeValue
138+
139+
def filter_delete(self, deleteCol, deleteValue):
    """
    Drop every row where deleteCol equals deleteValue.

    deleteCol: column name to filter by
    deleteValue: rows with this value in deleteCol are removed
    """
    # Bug fix: the original indexed self.data, which does not exist on
    # this accessor (the frame is stored as self._obj), raising
    # AttributeError on every call.
    self._obj = self._obj[self._obj[deleteCol] != deleteValue]
141+
142+
def concat_columns(self, newColName, column1, column2, concatBy=" "):
    """
    Create newColName by joining column1 and column2 with the
    separator concatBy (a single space by default).
    """
    joined = self._obj[column1] + concatBy + self._obj[column2]
    self._obj[newColName] = joined
144+
145+
def strip_columns(self, names):
    """
    Strip leading/trailing whitespace from the values of the given column(s).

    Values are converted to str first because some are treated as float.
    """
    converted = self._obj[names].astype(str)
    self._obj[names] = converted.map(str.strip)
147+
148+
def title_case(self, names):
    """
    Convert the values of the given column(s) to title case.

    Values are converted to str first because some are treated as float.
    """
    converted = self._obj[names].astype(str)
    self._obj[names] = converted.map(str.title)
150+
151+
def upper_case(self, names):
    """
    Convert the values of the given column(s) to upper case.

    Values are converted to str first because some are treated as float.
    """
    converted = self._obj[names].astype(str)
    self._obj[names] = converted.map(str.upper)
153+
154+
def select(self, names, inplace=False):
    """
    Return the subset of columns given by names.

    names: column label or list of labels to select.
    inplace: if True, also replace the stored frame with the selection.

    Bug fix: the original re-indexed the already-reduced object after an
    inplace selection (self._obj[names] applied twice), which raises when
    names is a single string label (the second lookup indexes a Series by
    a column name) and does redundant work for list selections.
    """
    selection = self._obj[names]
    if inplace:
        self._obj = selection
    return selection
111159

112160
#########################
113161
#### Private Methods ####

build/lib/more/scikit_helper/cluster/AgglomerativeHelper.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22
from . import BaseClusterWithN
33

44
class AgglomerativeHelper(BaseClusterWithN):
    """
    Class to train and evaluate a Agglomerative (Hierarchical) Cluster Model
    """

    def __init__(self, X, n_clusters, evaluate_by=None, linkage="ward",
                 scaled=True, random_state=101):
        """
        X: feature data to cluster
        n_clusters: number of clusters to fit
        evaluate_by: column name used to compare across clusters (forwarded to base)
        linkage: linkage criterion for AgglomerativeClustering (default "ward")
        scaled: whether features should be scaled (forwarded to base)
        random_state: seed forwarded to the base class
        """
        # Bug fix: 'scaled' was accepted here but never forwarded to the
        # base class, so the caller's choice was silently ignored.
        super().__init__(X=X, n_clusters=n_clusters, evaluate_by=evaluate_by,
                         scaled=scaled, random_state=random_state)
        self.linkage = linkage
        self.cluster_obj = AgglomerativeClustering(n_clusters=self.n_clusters,
                                                   linkage=self.linkage)
1212

build/lib/more/scikit_helper/cluster/BaseClusterWithN.py

Lines changed: 54 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,29 @@
11
import warnings
22
import pandas as pd
3+
import matplotlib.pyplot as plt
34
from sklearn.preprocessing import StandardScaler
45
from sklearn import metrics
56
from more import viz_helper as vh
7+
from more import pandas_helper
68

79
class BaseClusterWithN:
8-
def __init__(self, X, n_clusters, scaled = True, random_state = 101):
10+
def __init__(self, X, n_clusters, evaluate_by=None, scaled=True, random_state=101):
911
"""
1012
Class to train and evaluate a Base Cluster Class with Number of Clusters Specified
13+
evaluate_by = column name to use to compare across the clusters eventually
1114
"""
12-
self.X = X.reset_index(drop=True)
15+
self.evaluate_by = evaluate_by
16+
17+
if (self.evaluate_by != None):
18+
self.evaluate_by_values = X[self.evaluate_by]
19+
self.X = X.helper.drop_columns([self.evaluate_by])
20+
else:
21+
self.X = X
22+
23+
#self.X = X.reset_index(drop=True)
1324
self.n_clusters = n_clusters
25+
26+
1427
self.scaled = scaled
1528
self.random_state = random_state
1629
self.cluster_obj = None # Define in child class
@@ -39,7 +52,7 @@ def train(self, merge = True):
3952

4053
return(self) # Allows to cascade methods
4154

42-
def evaluate(self, metric = "silhoutte"):
55+
def evaluate_fit(self, metric = "silhoutte"):
4356
"""
4457
Provides the Goodness of Fit Statistics for the clustering algorithm
4558
"""
@@ -49,13 +62,17 @@ def evaluate(self, metric = "silhoutte"):
4962
else:
5063
self.silhoutte_score = metrics.silhouette_score(self.X, self.labels, random_state= self.random_state)
5164
else:
52-
warnings.warn("Metrix {} is not supported".format(metric))
65+
warnings.warn("Metric {} is not supported".format(metric))
5366

5467
print("Silhouette Coefficient: {}".format(self.silhoutte_score))
5568

5669
def merge_data_labels(self):
    """
    Attach the fitted cluster labels (and, when evaluate_by is set, the
    held-out evaluate_by column values) to both the raw and scaled
    feature frames, producing self.merged_data and self.merged_scaled_data.
    """
    labels = pd.Series(self.labels, name='labels')
    # 'is None' is the correct identity test; '== None' can misbehave on
    # objects (e.g. pandas Series) that overload equality.
    if self.evaluate_by is None:
        extras = [labels]
    else:
        extras = [labels, self.evaluate_by_values]
    self.merged_data = pd.concat([self.X] + extras, axis=1)
    self.merged_scaled_data = pd.concat([self.X_scaled] + extras, axis=1)
5976

6077
def cluster_obs_count(self):
6178
"""
@@ -66,14 +83,22 @@ def cluster_obs_count(self):
6683
def cluster_means(self):
    """
    Provides the means of the cluster features for each cluster
    (clusters as columns, features as rows).
    If evaluate_by is set, then clusters will be sorted by the mean value
    of the "evaluate_by" column.
    """
    means = self.merged_data.groupby('labels').mean()
    if self.evaluate_by is not None:
        means = means.sort_values(self.evaluate_by)
    return means.transpose()
92+
7293
def cluster_means_scaled(self):
    """
    Provides the means (scaled) of the cluster features for each cluster
    (clusters as columns, features as rows).
    If evaluate_by is set, then clusters will be sorted by the mean value
    of the "evaluate_by" column.
    """
    means = self.merged_scaled_data.groupby('labels').mean()
    if self.evaluate_by is not None:
        means = means.sort_values(self.evaluate_by)
    return means.transpose()
77102

78103
def plot_parallel_coordinates(self, scaled=True, frac=0.05, figsize=(12,6), xrot=0):
79104
"""
@@ -84,6 +109,26 @@ def plot_parallel_coordinates(self, scaled=True, frac=0.05, figsize=(12,6), xrot
84109
else:
85110
vh.plot_parallel_coordinates(data = self.merged_data, by = 'labels', normalize=False, frac=frac, figsize=figsize, xrot=xrot)
86111

112+
113+
def plot_headmap(self, scale_rows=True, cmap='viridis', figsize=(6,6)
                 , annot=False, valfmt="{x:.1f}", fontsize=12, fontweight="bold",textcolors=["white", "black"] ):
    """
    Plot a heatmap of the per-cluster feature means (see cluster_means).

    valfmt example: "{x:.1f}"
    NOTE(review): the name looks like a typo for 'plot_heatmap'; kept
    as-is so existing callers do not break.
    """
    means = self.cluster_means()
    cbarlabel = "Normalized Values" if scale_rows else "Values"

    fig, ax = plt.subplots(figsize=figsize)
    im, cbar = vh.heatmap(means.to_numpy(), row_labels=means.index,
                          col_labels=means.columns, ax=ax,
                          scale_rows=scale_rows, cmap=cmap,
                          cbarlabel=cbarlabel)

    if annot:
        vh.annotate_heatmap(im, valfmt=valfmt, size=fontsize,
                            fontweight=fontweight, textcolors=textcolors)
87132

88133

89134

build/lib/more/scikit_helper/cluster/KMeansHelper.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22
from . import BaseClusterWithN
33

44
class KMeansHelper(BaseClusterWithN):
    """
    Class to train and evaluate a KMeans Cluster Model
    """

    def __init__(self, X, n_clusters, evaluate_by=None, init="k-means++",
                 n_jobs=None, scaled=True, random_state=101):
        """
        X: feature data to cluster
        n_clusters: number of clusters to fit
        evaluate_by: column name used to compare across clusters (forwarded to base)
        init: KMeans initialization method (default "k-means++")
        n_jobs: parallelism passed through to KMeans
        scaled: whether features should be scaled (forwarded to base)
        random_state: seed used both by the base class and by KMeans
        """
        # Bug fix: 'scaled' was accepted here but never forwarded to the
        # base class, so the caller's choice was silently ignored.
        super().__init__(X=X, n_clusters=n_clusters, evaluate_by=evaluate_by,
                         scaled=scaled, random_state=random_state)
        self.init = init
        self.n_jobs = n_jobs
        self.cluster_obj = KMeans(n_clusters=self.n_clusters, init=self.init,
                                  random_state=self.random_state, n_jobs=self.n_jobs)

build/lib/more/viz_helper/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@
77
from .plot_data import plot_data
88
from .plot_parallel_coordinates import plot_parallel_coordinates
99
from .plot_similarity import plot_similarity
10+
from .plot_heatmap import heatmap, annotate_heatmap
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import numpy as np
2+
import matplotlib
3+
import matplotlib.pyplot as plt
4+
5+
6+
def heatmap(data, row_labels, col_labels, ax=None, scale_rows=False, xrot=0,
            cbar_kw=None, cbarlabel="", **kwargs):
    """
    Create a heatmap from a numpy array and two lists of labels.
    Code taken from : https://matplotlib.org/3.1.1/gallery/images_contours_and_fields/image_annotated_heatmap.html#sphx-glr-gallery-images-contours-and-fields-image-annotated-heatmap-py
    Addition made to
        Scale by each row individually -- useful when each row has its own scale
        Allow for Xlabel rotation

    Parameters
    ----------
    data
        A 2D numpy array of shape (N, M).
    row_labels
        A list or array of length N with the labels for the rows.
    col_labels
        A list or array of length M with the labels for the columns.
    ax
        A `matplotlib.axes.Axes` instance to which the heatmap is plotted. If
        not provided, use current axes or create a new one. Optional.
    scale_rows:
        Useful when each row has its own scale.
        Scales each row: (value - row min) / row range.
        This ensures that if the rows represent items with different ranges,
        the one with the max range does not overwhelm the plot.
    xrot:
        Rotation of the X-labels.
    cbar_kw
        A dictionary with arguments to `matplotlib.Figure.colorbar`. Optional.
    cbarlabel
        The label for the colorbar. Optional.
    **kwargs
        All other arguments are forwarded to `imshow`.
    """
    # Bug fix: the default for cbar_kw was a mutable dict literal, which is
    # shared across calls; use None as the sentinel instead.
    if cbar_kw is None:
        cbar_kw = {}

    if not ax:
        ax = plt.gca()

    # If rows need to be scaled, perform the scaling now
    if scale_rows:
        data = (data - np.min(data, axis=1)[:, np.newaxis]) / np.ptp(data, axis=1)[:, np.newaxis]

    # Plot the heatmap
    im = ax.imshow(data, interpolation='none', aspect='auto', **kwargs)

    # Create colorbar
    cbar = ax.figure.colorbar(im, ax=ax, **cbar_kw)
    cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom")

    # We want to show all ticks...
    ax.set_xticks(np.arange(data.shape[1]))
    ax.set_yticks(np.arange(data.shape[0]))
    # ... and label them with the respective list entries.
    ax.set_xticklabels(col_labels)
    ax.set_yticklabels(row_labels)

    # Let the horizontal axes labeling appear on top.
    ax.tick_params(top=True, bottom=True, labeltop=True, labelbottom=True)

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=xrot, ha="center", rotation_mode="anchor")

    # Turn spines off and create white grid.
    for edge, spine in ax.spines.items():
        spine.set_visible(False)

    ax.set_xticks(np.arange(data.shape[1] + 1) - .5, minor=True)
    ax.set_yticks(np.arange(data.shape[0] + 1) - .5, minor=True)
    ax.tick_params(which="minor", bottom=False, left=False)

    return im, cbar
78+
79+
80+
def annotate_heatmap(im, data=None, valfmt="{x:.2f}", textcolors=("black", "white"), threshold=None, **textkw):
    """
    A function to annotate a heatmap.
    Code taken from : https://matplotlib.org/3.1.1/gallery/images_contours_and_fields/image_annotated_heatmap.html#sphx-glr-gallery-images-contours-and-fields-image-annotated-heatmap-py

    Parameters
    ----------
    im
        The AxesImage to be labeled.
    data
        Data used to annotate. If None, the image's data is used. Optional.
    valfmt
        The format of the annotations inside the heatmap. This should either
        use the string format method, e.g. "$ {x:.2f}", or be a
        `matplotlib.ticker.Formatter`. Optional.
    textcolors
        A sequence of two color specifications. The first is used for
        values below a threshold, the second for those above. Optional.
    threshold
        Value in data units according to which the colors from textcolors are
        applied. If None (the default) uses the middle of the colormap as
        separation. Optional.
    **textkw
        All other arguments are forwarded to each call to `text` used to
        create the text labels.
    """
    # Bug fix: the default for textcolors was a mutable list shared across
    # calls; a tuple is safe and indexes identically.
    if not isinstance(data, (list, np.ndarray)):
        data = im.get_array()

    # Normalize the threshold to the image's color range.
    if threshold is not None:
        threshold = im.norm(threshold)
    else:
        threshold = im.norm(data.max()) / 2.

    # Set default alignment to center, but allow it to be
    # overwritten by textkw.
    kw = dict(horizontalalignment="center",
              verticalalignment="center")
    kw.update(textkw)

    # Get the formatter in case a string is supplied
    if isinstance(valfmt, str):
        valfmt = matplotlib.ticker.StrMethodFormatter(valfmt)

    # Loop over the data and create a `Text` for each "pixel".
    # Change the text's color depending on the data.
    texts = []
    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            kw.update(color=textcolors[int(im.norm(data[i, j]) > threshold)])
            text = im.axes.text(j, i, valfmt(data[i, j], None), **kw)
            texts.append(text)

    return texts
27.7 KB
Binary file not shown.

dist/more-0.0.1b10.tar.gz

18.1 KB
Binary file not shown.
27.7 KB
Binary file not shown.

0 commit comments

Comments
 (0)