ngupta23
diff --git a/‎README.md‎
Lines changed: 8 additions & 1 deletion b/‎README.md‎
Lines changed: 8 additions & 1 deletion
diff --git a/‎build/lib/more/scikit_helper/cluster/BaseClusterWithN.py‎
Lines changed: 39 additions & 5 deletions b/‎build/lib/more/scikit_helper/cluster/BaseClusterWithN.py‎
Lines changed: 39 additions & 5 deletions
diff --git a/‎build/lib/more/scikit_helper/cluster/KMeansHelper.py‎
Lines changed: 1 addition & 0 deletions b/‎build/lib/more/scikit_helper/cluster/KMeansHelper.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎build/lib/more/scikit_helper/cluster/__init__.py‎
Lines changed: 2 additions & 1 deletion b/‎build/lib/more/scikit_helper/cluster/__init__.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎build/lib/more/scikit_helper/cluster/plot_elbow_curve.py‎
Lines changed: 187 additions & 0 deletions b/‎build/lib/more/scikit_helper/cluster/plot_elbow_curve.py‎
Lines changed: 187 additions & 0 deletions
diff --git a/‎dist/more-0.0.1b12-py3-none-any.whl‎
31.3 KB b/‎dist/more-0.0.1b12-py3-none-any.whl‎
31.3 KB
diff --git a/‎dist/more-0.0.1b12.tar.gz‎
20.9 KB b/‎dist/more-0.0.1b12.tar.gz‎
20.9 KB
diff --git a/‎more.egg-info/PKG-INFO‎
Lines changed: 9 additions & 2 deletions b/‎more.egg-info/PKG-INFO‎
Lines changed: 9 additions & 2 deletions
diff --git a/‎more.egg-info/SOURCES.txt‎
Lines changed: 1 addition & 0 deletions b/‎more.egg-info/SOURCES.txt‎
Lines changed: 1 addition & 0 deletions
@@ -5,7 +5,7 @@ This is a helper package for a variety of functions as described in the Overview
 # Installation
 
 * For standard installation > pip install more 
-* For installing a particular version >  pip install more==0.0.1b10
+* For installing a particular version >  pip install more==0.0.1b12
 
 # Overview
 
@@ -19,6 +19,13 @@ Check out the  [examples](https://github.com/ngupta23/more/tree/master/examples)
 
 # Version History
 
+## 0.0.1b12
+
+* Add functions for plotting elbow curves. 
+    - Code modified from: https://github.com/reiinakano/scikit-plot
+    - Modifications made to support running for Hierarchical Clustering as well as support for plotting Silhouette Score
+* typo fixed in function name
+
 ## 0.0.1b10 & 0.0.1b11
 
 * Updated Visualization Helper to add function to plot Heatmap
 
@@ -5,9 +5,11 @@
 from sklearn import metrics
 from more import viz_helper as vh
 from more import pandas_helper
+from .plot_elbow_curve import plot_elbow_curve
+
 
 class BaseClusterWithN:
-    def __init__(self, X, n_clusters, evaluate_by=None, scaled=True, random_state=101):
+    def __init__(self, X, n_clusters=2, evaluate_by=None, scaled=True, random_state=101):
         """
         Class to train and evaluate a Base Cluster Class with Number of Clusters Specified
         evaluate_by = column name to use to compare across the clusters eventually
@@ -35,11 +37,43 @@ def __init__(self, X, n_clusters, evaluate_by=None, scaled=True, random_state=10
 
         std_scl = StandardScaler()
         self.X_scaled = pd.DataFrame(std_scl.fit_transform(self.X), columns=self.columns)
-                    
-    def train(self, merge = True):
+    
+    def plot_elbow_curve(self , cluster_ranges, second_metric='time', n_jobs=1, figsize=(6,6)):
         """
-        Train the clustering method
+        n_jobs: 
+                Different from the one in the object that is used for training.
+                This is because calculating the silhouette score can take up a lot of memory,
+                so it may be advisable to run it without parallelism. But training can still
+                occur in parallel, hence this option to set n_jobs is provided
+        """
+              
+        if (self.scaled):
+            # This plot_elbow_curve is not the same as self.plot_elbow_curve. 
+            # It is coming from the plot_elbow_curve.py file
+            plot_elbow_curve(self.cluster_obj,X=self.X_scaled
+                             ,cluster_ranges=cluster_ranges, second_metric=second_metric 
+                             ,n_jobs=n_jobs,figsize=figsize)
+        else:
+            plot_elbow_curve(self.cluster_obj,X=self.X
+                             ,cluster_ranges=cluster_ranges, second_metric=second_metric 
+                             ,n_jobs=n_jobs,figsize=figsize)
+            
+        plt.show()
+                
+    def train(self, n_clusters=None, merge=True):
         """
+        Train the clustering method
+        n_clusters: 
+            If specified, this will override the existing value. 
+            Useful when the value is determined after plotting elbow curve
+        merge (Default = True)
+            Should the data be merged with the labels. Recommended not to change
+            to False right now since that functionality has not been tested.
+        """
+        if (n_clusters != None):
+            self.n_clusters = n_clusters
+            setattr(self.cluster_obj, 'n_clusters', self.n_clusters)
+            
         if (self.scaled):
             self.cluster_obj.fit(self.X_scaled)
         else:
@@ -110,7 +144,7 @@ def plot_parallel_coordinates(self, scaled=True, frac=0.05, figsize=(12,6), xrot
             vh.plot_parallel_coordinates(data = self.merged_data, by = 'labels', normalize=False, frac=frac, figsize=figsize, xrot=xrot)
 
 
-    def plot_headmap(self, scale_rows=True, cmap='viridis', figsize=(6,6)
+    def plot_heatmap(self, scale_rows=True, cmap='viridis', figsize=(6,6)
                      , annot=False, valfmt="{x:.1f}", fontsize=12, fontweight="bold",textcolors=["white", "black"] ):
         """
         valfmt example: "{x:.1f}"
 
@@ -9,6 +9,7 @@ def __init__(self, X, n_clusters, evaluate_by = None, init = "k-means++", n_jobs
         super().__init__(X=X, n_clusters=n_clusters, evaluate_by=evaluate_by, random_state=random_state)
         self.init = init
         self.n_jobs = n_jobs
+        
         self.cluster_obj = KMeans(n_clusters=self.n_clusters, init=self.init, random_state=self.random_state, n_jobs=self.n_jobs)
 
 
 
@@ -1,4 +1,5 @@
 from .GaussianMixture import GaussianMixtureHelper 
 from .BaseClusterWithN import BaseClusterWithN 
 from .KMeansHelper import KMeansHelper
-from .AgglomerativeHelper import AgglomerativeHelper
+from .AgglomerativeHelper import AgglomerativeHelper
+from .plot_elbow_curve import plot_elbow_curve, _clone_and_score_clusterer
@@ -0,0 +1,187 @@
+"""
+Code modified from: https://github.com/reiinakano/scikit-plot
+Modifications made to support running for Hierarchical Clustering 
+as well as support for plotting Silhouette Score
+
+The :mod:`scikitplot.cluster` module includes plots built specifically for
+scikit-learn clusterer instances e.g. KMeans. You can use your own clusterers,
+but these plots assume specific properties shared by scikit-learn estimators.
+The specific requirements are documented per function.
+"""
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import time
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from sklearn.base import clone
+from joblib import Parallel, delayed
+
+import warnings
+import math
+from scipy.cluster.vq import vq
+from sklearn import metrics as mt
+
+
+def plot_elbow_curve(clf, X, title='Elbow Plot', cluster_ranges=None, n_jobs=1,
+                     show_second_metric=True, second_metric="time",
+                     ax=None, figsize=None,
+                     title_fontsize="large", text_fontsize="medium"):
+    """Plots elbow curve of different values of K for KMeans clustering.
+    Args:
+        clf: Clusterer instance that implements ``fit``,``fit_predict``, and
+            ``score`` methods, and an ``n_clusters`` hyperparameter.
+            e.g. :class:`sklearn.cluster.KMeans` instance
+        X (array-like, shape (n_samples, n_features)):
+            Data to cluster, where n_samples is the number of samples and
+            n_features is the number of features.
+        title (string, optional): Title of the generated plot. Defaults to
+            "Elbow Plot"
+        cluster_ranges (None or :obj:`list` of int, optional): List of
+            n_clusters for which to plot the sum of squared errors. Defaults
+            to ``range(1, 12, 2)``.
+        n_jobs (int, optional): Number of jobs to run in parallel. Defaults to
+            1.
+        show_second_metric [Previously: show_cluster_time] (bool, optional): 
+            Should plot of second metric be included
+        second_metric (string, optional): Metric to plot on the second axis.
+            Defaults to "time" for the time it took to cluster for a particular K.
+            The other option is 'silhoutte' (spelled as the code expects) for the Silhouette Score.
+        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to
+            plot the curve. If None, the plot is drawn on a new set of axes.
+        figsize (2-tuple, optional): Tuple denoting figure size of the plot
+            e.g. (6, 6). Defaults to ``None``.
+        title_fontsize (string or int, optional): Matplotlib-style fontsizes.
+            Use e.g. "small", "medium", "large" or integer-values. Defaults to
+            "large".
+        text_fontsize (string or int, optional): Matplotlib-style fontsizes.
+            Use e.g. "small", "medium", "large" or integer-values. Defaults to
+            "medium".
+    Returns:
+        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was
+            drawn.
+    Example:
+        >>> import scikitplot as skplt
+        >>> kmeans = KMeans(random_state=1)
+        >>> skplt.cluster.plot_elbow_curve(kmeans, cluster_ranges=range(1, 30))
+        <matplotlib.axes._subplots.AxesSubplot object at 0x7fe967d64490>
+        >>> plt.show()
+        .. image:: _static/examples/plot_elbow_curve.png
+           :align: center
+           :alt: Elbow Curve
+    """
+    if (second_metric != 'time' and second_metric != 'silhoutte'):
+        warnings.warn("\nSecond Metric is not allowed. Must be one of ['time','silhoutte'].\nYou entered '{}'. This will be reset to 'time'".format(second_metric))
+        second_metric = 'time'
+    
+    if cluster_ranges is None:
+        cluster_ranges = range(1, 12, 2)
+    else:
+        cluster_ranges = sorted(cluster_ranges)
+
+    if not hasattr(clf, 'n_clusters'):
+        raise TypeError('"n_clusters" attribute not in classifier. '
+                        'Cannot plot elbow method.')
+
+    tuples = Parallel(n_jobs=n_jobs)(delayed(_clone_and_score_clusterer)
+                                     (clf, X, i, second_metric) for i in cluster_ranges)
+    clfs, second_metric_score = zip(*tuples)
+
+    if ax is None:
+        fig, ax = plt.subplots(1, 1, figsize=figsize)
+
+    ax.set_title(title, fontsize=title_fontsize)
+    ax.plot(cluster_ranges, np.absolute(clfs), 'b*-')
+    ax.grid(True)
+    ax.set_xlabel('Number of clusters', fontsize=text_fontsize)
+    ax.set_ylabel('Sum of Squared Errors', fontsize=text_fontsize)
+    ax.tick_params(labelsize=text_fontsize)
+
+    if show_second_metric:
+        y_label = 'Clustering duration (seconds)' # Default Value
+        
+        # Overwrite if needed
+        # technically checking for time is not required but kept in there for consistency 
+        # (in case default changes later)
+        if (second_metric == 'time'):
+            y_label = 'Clustering duration (seconds)'
+        elif (second_metric == 'silhoutte'):
+            y_label = 'Silhoutte Score'
+            
+        ax2_color = 'green'
+        ax2 = ax.twinx()
+        ax2.plot(cluster_ranges, second_metric_score, ':', alpha=0.75, color=ax2_color)
+        ax2.set_ylabel(y_label,
+                       color=ax2_color, alpha=0.75,
+                       fontsize=text_fontsize)
+        ax2.tick_params(colors=ax2_color, labelsize=text_fontsize)
+
+    return ax
+
+
+def _clone_and_score_clusterer(clf, X, n_clusters, second_metric):
+    """Clones and scores clusterer instance.
+    Args:
+        # NOTE: 
+            In this modified implementation, the score method is not needed anymore
+            since the SSE is calculated manually        
+        clf: Clusterer instance that implements ``fit``,``fit_predict``, and
+            ``score`` methods, and an ``n_clusters`` hyperparameter.
+            e.g. :class:`sklearn.cluster.KMeans` instance
+        X (array-like, shape (n_samples, n_features)):
+            Data to cluster, where n_samples is the number of samples and
+            n_features is the number of features.
+        n_clusters (int): Number of clusters
+        second_metric (string): Second metric to return. First is always SSE
+    Returns:
+        score: Within-cluster sum of squared distances (SSW) for the fitted clusters
+        second_metric: Elapsed seconds for 'time', Silhouette Score for 'silhoutte', otherwise NaN
+    """
+    start = time.time()
+    clf = clone(clf)
+    setattr(clf, 'n_clusters', n_clusters)
+    clf.fit(X)
+    
+    labels = clf.labels_
+    
+    
+    # Not every clustering algorithm returns the centers (hence calculating manually)
+    # centers = clf.cluster_centers_ 
+    
+    if (True):
+        num_features = X.shape[1]
+        centers = np.empty((num_features,0))
+
+        for i in range(len(set(labels))):
+            single_cluster_means = X[labels == i].mean().to_numpy().reshape(num_features,1)
+            centers = np.concatenate((centers, single_cluster_means), axis=1, out=None)
+
+        centers = centers.T
+ 
+    # Calculating SSE    
+    # https://stats.stackexchange.com/questions/81954/ssb-sum-of-squares-between-clusters
+    partition, euc_distance_to_centroids = vq(obs = X, code_book = centers)
+
+    TSS = np.sum((X-X.mean(0))**2)
+    SSW = np.sum(euc_distance_to_centroids**2)
+    SSB = TSS - SSW
+    
+#     # The 'direct' way
+#     B = []
+#     c = scaled_data.mean(0)
+#     for i in range(partition.max()+1):
+#         ci = X[partition == i].mean(0)
+#         B.append(np.bincount(partition)[i]*np.sum((ci - c)**2))
+#     SSB_ = np.sum(B)
+
+    #print(TSS, SSW, SSB, SSB_)
+    #print(n_clusters, clf.score(X), SSW)
+    second_metric_score = math.nan
+    if (second_metric == 'time'):
+        second_metric_score = time.time() - start
+    elif (second_metric == 'silhoutte'): 
+        second_metric_score = mt.silhouette_score(X, labels, random_state=101)
+    
+    return SSW, second_metric_score
+
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: more
-Version: 0.0.1b11
+Version: 0.0.1b12
 Summary: A helper library for Pandas, Visualizations and Scikit-learn
 Home-page: https://github.com/ngupta23/more
 Author: Nikhil Gupta
@@ -13,7 +13,7 @@ Description: # "More" Package
         # Installation
 
         * For standard installation > pip install more 
-        * For installing a particular version >  pip install more==0.0.1b10
+        * For installing a particular version >  pip install more==0.0.1b12
 
         # Overview
 
@@ -27,6 +27,13 @@ Description: # "More" Package
 
         # Version History
 
+        ## 0.0.1b12
+        
+        * Add functions for plotting elbow curves. 
+            - Code modified from: https://github.com/reiinakano/scikit-plot
+            - Modifications made to support running for Hierarchical Clustering as well as support for plotting Silhouette Score
+        * typo fixed in function name
+        
         ## 0.0.1b10 & 0.0.1b11
 
         * Updated Visualization Helper to add function to plot Heatmap
 
@@ -17,6 +17,7 @@ more/scikit_helper/cluster/BaseClusterWithN.py
 more/scikit_helper/cluster/GaussianMixture.py
 more/scikit_helper/cluster/KMeansHelper.py
 more/scikit_helper/cluster/__init__.py
+more/scikit_helper/cluster/plot_elbow_curve.py
 more/viz_helper/__init__.py
 more/viz_helper/pca_helper.py
 more/viz_helper/plot_corr.py