|
| 1 | +""" |
| 2 | +Code modified from: https://github.com/reiinakano/scikit-plot |
| 3 | +Modifications made to support running for Hierarchical Clustering |
| 4 | +as well as support for plotting the Silhouette Score |
| 5 | +
|
| 6 | +The :mod:`scikitplot.cluster` module includes plots built specifically for |
| 7 | +scikit-learn clusterer instances e.g. KMeans. You can use your own clusterers, |
| 8 | +but these plots assume specific properties shared by scikit-learn estimators. |
| 9 | +The specific requirements are documented per function. |
| 10 | +""" |
| 11 | +from __future__ import absolute_import, division, print_function, unicode_literals |
| 12 | + |
| 13 | +import time |
| 14 | + |
| 15 | +import matplotlib.pyplot as plt |
| 16 | +import numpy as np |
| 17 | + |
| 18 | +from sklearn.base import clone |
| 19 | +from joblib import Parallel, delayed |
| 20 | + |
| 21 | +import warnings |
| 22 | +import math |
| 23 | +from scipy.cluster.vq import vq |
| 24 | +from sklearn import metrics as mt |
| 25 | + |
| 26 | + |
def plot_elbow_curve(clf, X, title='Elbow Plot', cluster_ranges=None, n_jobs=1,
                     show_second_metric=True, second_metric="time",
                     ax=None, figsize=None,
                     title_fontsize="large", text_fontsize="medium"):
    """Plot an elbow curve (SSE versus number of clusters) for a clusterer.

    One clone of ``clf`` is fitted per entry of ``cluster_ranges`` via
    ``_clone_and_score_clusterer``; the absolute value of the first returned
    score is plotted as 'Sum of Squared Errors' and, optionally, a second
    metric is drawn on a twin y-axis.

    Args:
        clf: Clusterer instance exposing an ``n_clusters`` hyperparameter,
            e.g. a :class:`sklearn.cluster.KMeans` instance.
        X (array-like, shape (n_samples, n_features)):
            Data to cluster, where n_samples is the number of samples and
            n_features is the number of features.
        title (string, optional): Title of the generated plot. Defaults to
            "Elbow Plot".
        cluster_ranges (None or iterable of int, optional): Values of
            ``n_clusters`` to evaluate. They are sorted before use. Defaults
            to ``range(1, 12, 2)``.
        n_jobs (int, optional): Number of jobs to run in parallel. Defaults
            to 1.
        show_second_metric [previously: show_cluster_time] (bool, optional):
            Whether the second metric is drawn on a twin y-axis. When False,
            the potentially expensive silhouette computation is not requested
            from the workers.
        second_metric (string, optional): Metric to plot on the second axis.
            ``'time'`` (default) plots the clustering duration for each K;
            ``'silhoutte'`` (legacy spelling) or ``'silhouette'`` plots the
            Silhouette Score for each K. Any other value triggers a warning
            and falls back to ``'time'``.
        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to
            plot the curve. If None, the plot is drawn on a new set of axes.
        figsize (2-tuple, optional): Tuple denoting figure size of the plot,
            e.g. (6, 6). Defaults to ``None``.
        title_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer values. Defaults
            to "large".
        text_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer values. Defaults
            to "medium".

    Returns:
        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was
            drawn.

    Raises:
        TypeError: If ``clf`` has no ``n_clusters`` attribute.
        ValueError: If ``cluster_ranges`` is empty.

    Example:
        >>> import scikitplot as skplt
        >>> kmeans = KMeans(random_state=1)
        >>> skplt.cluster.plot_elbow_curve(kmeans, X, cluster_ranges=range(1, 30))
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fe967d64490>
        >>> plt.show()
        .. image:: _static/examples/plot_elbow_curve.png
           :align: center
           :alt: Elbow Curve
    """
    # 'silhoutte' is the historical (misspelled) key that the scoring helper
    # understands; accept the correct spelling and map it onto that key.
    if second_metric == 'silhouette':
        second_metric = 'silhoutte'

    if second_metric not in ('time', 'silhoutte'):
        warnings.warn(
            "\nSecond Metric is not allowed. Must be one of "
            "['time','silhoutte','silhouette'].\nYou entered '{}'. "
            "This will be reset to 'time'".format(second_metric))
        second_metric = 'time'

    if cluster_ranges is None:
        cluster_ranges = range(1, 12, 2)
    else:
        cluster_ranges = sorted(cluster_ranges)

    if not hasattr(clf, 'n_clusters'):
        raise TypeError('"n_clusters" attribute not in classifier. '
                        'Cannot plot elbow method.')

    # Fail early with a clear message instead of an unpacking error below.
    if not cluster_ranges:
        raise ValueError('"cluster_ranges" must contain at least one value '
                         'of n_clusters.')

    # When the second axis is hidden, ask the workers for the cheap 'time'
    # metric so no silhouette score is computed just to be thrown away.
    worker_metric = second_metric if show_second_metric else 'time'

    results = Parallel(n_jobs=n_jobs)(
        delayed(_clone_and_score_clusterer)(clf, X, k, worker_metric)
        for k in cluster_ranges)
    sse_values, second_metric_scores = zip(*results)

    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=figsize)

    ax.set_title(title, fontsize=title_fontsize)
    ax.plot(cluster_ranges, np.absolute(sse_values), 'b*-')
    ax.grid(True)
    ax.set_xlabel('Number of clusters', fontsize=text_fontsize)
    ax.set_ylabel('Sum of Squared Errors', fontsize=text_fontsize)
    ax.tick_params(labelsize=text_fontsize)

    if show_second_metric:
        # Label of the twin axis for each supported metric key.
        second_axis_labels = {
            'time': 'Clustering duration (seconds)',
            'silhoutte': 'Silhouette Score',
        }
        ax2_color = 'green'
        ax2 = ax.twinx()
        ax2.plot(cluster_ranges, second_metric_scores, ':', alpha=0.75,
                 color=ax2_color)
        ax2.set_ylabel(second_axis_labels[second_metric],
                       color=ax2_color, alpha=0.75,
                       fontsize=text_fontsize)
        ax2.tick_params(colors=ax2_color, labelsize=text_fontsize)

    return ax
| 121 | + |
| 122 | + |
def _clone_and_score_clusterer(clf, X, n_clusters, second_metric):
    """Clone a clusterer, fit it with ``n_clusters`` and score the result.

    The within-cluster sum of squared errors (SSE) is computed manually from
    the fitted ``labels_`` rather than through ``clf.score`` or
    ``cluster_centers_``, because not every clustering algorithm (e.g.
    hierarchical clustering) provides them.

    Args:
        clf: Clusterer instance that implements ``fit``, exposes ``labels_``
            after fitting, and has an ``n_clusters`` hyperparameter,
            e.g. a :class:`sklearn.cluster.KMeans` instance.
        X (array-like, shape (n_samples, n_features)):
            Data to cluster, where n_samples is the number of samples and
            n_features is the number of features. NumPy arrays and pandas
            DataFrames are both accepted.
        n_clusters (int): Number of clusters.
        second_metric (string): Second metric to return; the first is always
            the SSE. ``'time'`` returns the fit duration, ``'silhoutte'``
            (legacy spelling) or ``'silhouette'`` returns the Silhouette
            Score.

    Returns:
        score (float): Sum over all samples of the squared Euclidean distance
            to the centroid of the cluster the sample was assigned to.
        second_metric_score (float): Seconds spent cloning and fitting for
            ``'time'``; the Silhouette Score for the silhouette metric
            (``nan`` when it is undefined, i.e. fewer than 2 clusters or one
            cluster per sample); ``nan`` for any other metric name.
    """
    start = time.time()
    estimator = clone(clf)
    estimator.n_clusters = n_clusters
    estimator.fit(X)
    # Stop the clock right after fitting so the reported duration does not
    # include the SSE post-processing below.
    fit_seconds = time.time() - start

    labels = np.asarray(estimator.labels_)
    data = np.asarray(X, dtype=float)
    unique_labels = np.unique(labels)

    # SSE relative to each sample's own cluster centroid. Iterating over the
    # labels actually present avoids assuming they are exactly 0..k-1, and
    # measuring against the assigned centroid (not the nearest one) keeps the
    # value correct for clusterers whose partitions are not Voronoi cells.
    sse = 0.0
    for label in unique_labels:
        members = data[labels == label]
        sse += float(np.sum((members - members.mean(axis=0)) ** 2))

    second_metric_score = math.nan
    if second_metric == 'time':
        second_metric_score = fit_seconds
    elif second_metric in ('silhoutte', 'silhouette'):
        # The silhouette score is only defined for
        # 2 <= n_labels <= n_samples - 1; report nan outside that range
        # instead of letting scikit-learn raise.
        if 2 <= unique_labels.size <= data.shape[0] - 1:
            second_metric_score = mt.silhouette_score(
                X, labels, random_state=101)

    return sse, second_metric_score
| 187 | + |
0 commit comments