Skip to content

Commit 8401562

Browse files
committed
0.0.1b12 released to PIP
1 parent 29e70d2 commit 8401562

18 files changed

+477
-16
lines changed

README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ This is a helper package for a variety of functions as described in the Overview
55
# Installation
66

77
* For standard installation > pip install more
8-
* For installing a particular version > pip install more==0.0.1b10
8+
* For installing a particular version > pip install more==0.0.1b12
99

1010
# Overview
1111

@@ -19,6 +19,13 @@ Check out the [examples](https://github.com/ngupta23/more/tree/master/examples)
1919

2020
# Version History
2121

22+
## 0.0.1b12
23+
24+
* Add functions for plotting elbow curves.
25+
- Code modified from: https://github.com/reiinakano/scikit-plot
26+
- Modifications made to support running for Hierarchical Clustering as well as support for plotting Silhouette Score
27+
* typo fixed in function name
28+
2229
## 0.0.1b10 & 0.0.1b11
2330

2431
* Updated Visualization Helper to add function to plot Heatmap

build/lib/more/scikit_helper/cluster/BaseClusterWithN.py

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@
55
from sklearn import metrics
66
from more import viz_helper as vh
77
from more import pandas_helper
8+
from .plot_elbow_curve import plot_elbow_curve
9+
810

911
class BaseClusterWithN:
10-
def __init__(self, X, n_clusters, evaluate_by=None, scaled=True, random_state=101):
12+
def __init__(self, X, n_clusters=2, evaluate_by=None, scaled=True, random_state=101):
1113
"""
1214
Class to train and evaluate a Base Cluster Class with Number of Clusters Specified
1315
evaluate_by = column name to use to compare across the clusters eventually
@@ -35,11 +37,43 @@ def __init__(self, X, n_clusters, evaluate_by=None, scaled=True, random_state=10
3537

3638
std_scl = StandardScaler()
3739
self.X_scaled = pd.DataFrame(std_scl.fit_transform(self.X), columns=self.columns)
38-
39-
def train(self, merge = True):
40+
41+
def plot_elbow_curve(self , cluster_ranges, second_metric='time', n_jobs=1, figsize=(6,6)):
    """Plot the elbow curve for this clusterer over ``cluster_ranges``.

    n_jobs:
        Separate from the ``n_jobs`` stored on the object for training.
        Computing the silhouette score can take up a lot of memory, so it
        may be advisable to run this evaluation without parallelism even
        though training itself can still occur in parallel — hence this
        independent ``n_jobs`` option.
    """
    # Evaluate on the same representation the model is trained on.
    features = self.X_scaled if self.scaled else self.X

    # NOTE: this resolves to the module-level plot_elbow_curve imported
    # from plot_elbow_curve.py, not to this method (no recursion).
    plot_elbow_curve(self.cluster_obj, X=features
                     , cluster_ranges=cluster_ranges, second_metric=second_metric
                     , n_jobs=n_jobs, figsize=figsize)

    plt.show()
62+
63+
def train(self, n_clusters=None, merge=True):
4264
"""
65+
Train the clustering method
66+
n_clusters:
67+
If specified, this will override the existing value.
68+
Useful when the value is determined after plotting elbow curve
69+
merge (Default = True)
70+
Should the data be merged with the labels. Recommended not to change
71+
to False right now since that functionality has not been tested.
72+
"""
73+
if (n_clusters != None):
74+
self.n_clusters = n_clusters
75+
setattr(self.cluster_obj, 'n_clusters', self.n_clusters)
76+
4377
if (self.scaled):
4478
self.cluster_obj.fit(self.X_scaled)
4579
else:
@@ -110,7 +144,7 @@ def plot_parallel_coordinates(self, scaled=True, frac=0.05, figsize=(12,6), xrot
110144
vh.plot_parallel_coordinates(data = self.merged_data, by = 'labels', normalize=False, frac=frac, figsize=figsize, xrot=xrot)
111145

112146

113-
def plot_headmap(self, scale_rows=True, cmap='viridis', figsize=(6,6)
147+
def plot_heatmap(self, scale_rows=True, cmap='viridis', figsize=(6,6)
114148
, annot=False, valfmt="{x:.1f}", fontsize=12, fontweight="bold",textcolors=["white", "black"] ):
115149
"""
116150
valfmt example: "{x:.1f}"

build/lib/more/scikit_helper/cluster/KMeansHelper.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ def __init__(self, X, n_clusters, evaluate_by = None, init = "k-means++", n_jobs
99
super().__init__(X=X, n_clusters=n_clusters, evaluate_by=evaluate_by, random_state=random_state)
1010
self.init = init
1111
self.n_jobs = n_jobs
12+
1213
self.cluster_obj = KMeans(n_clusters=self.n_clusters, init=self.init, random_state=self.random_state, n_jobs=self.n_jobs)
1314

1415

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from .GaussianMixture import GaussianMixtureHelper
22
from .BaseClusterWithN import BaseClusterWithN
33
from .KMeansHelper import KMeansHelper
4-
from .AgglomerativeHelper import AgglomerativeHelper
4+
from .AgglomerativeHelper import AgglomerativeHelper
5+
from .plot_elbow_curve import plot_elbow_curve, _clone_and_score_clusterer
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
"""
2+
Code modified from: https://github.com/reiinakano/scikit-plot
3+
Modifications made to support running for Hierarchical Clustering
4+
as well as support for plotting Silhouette Score
5+
6+
The :mod:`scikitplot.cluster` module includes plots built specifically for
7+
scikit-learn clusterer instances e.g. KMeans. You can use your own clusterers,
8+
but these plots assume specific properties shared by scikit-learn estimators.
9+
The specific requirements are documented per function.
10+
"""
11+
from __future__ import absolute_import, division, print_function, unicode_literals
12+
13+
import time
14+
15+
import matplotlib.pyplot as plt
16+
import numpy as np
17+
18+
from sklearn.base import clone
19+
from joblib import Parallel, delayed
20+
21+
import warnings
22+
import math
23+
from scipy.cluster.vq import vq
24+
from sklearn import metrics as mt
25+
26+
27+
def plot_elbow_curve(clf, X, title='Elbow Plot', cluster_ranges=None, n_jobs=1,
                     show_second_metric=True, second_metric="time",
                     ax=None, figsize=None,
                     title_fontsize="large", text_fontsize="medium"):
    """Plot the elbow curve (SSE vs. number of clusters) for a clusterer.

    Args:
        clf: Clusterer instance that implements ``fit``, ``fit_predict``,
            and ``score`` methods, and an ``n_clusters`` hyperparameter,
            e.g. a :class:`sklearn.cluster.KMeans` instance.
        X (array-like, shape (n_samples, n_features)): Data to cluster,
            where n_samples is the number of samples and n_features is
            the number of features.
        title (string, optional): Title of the generated plot. Defaults
            to "Elbow Plot".
        cluster_ranges (None or list of int, optional): Values of
            ``n_clusters`` for which to plot the explained variances.
            Defaults to ``range(1, 12, 2)``.
        n_jobs (int, optional): Number of jobs to run in parallel.
            Defaults to 1.
        show_second_metric (bool, optional): Whether to include the plot
            of the second metric (previously ``show_cluster_time``).
        second_metric (string, optional): Metric to plot on the second
            axis. Defaults to "time" (time it took to cluster for a
            particular K); 'silhoutte' plots the silhouette score.
            Any other value is reset to "time" with a warning.
        ax (:class:`matplotlib.axes.Axes`, optional): Axes upon which to
            plot the curve. If None, the plot is drawn on a new set of
            axes.
        figsize (2-tuple, optional): Figure size of the plot, e.g.
            (6, 6). Defaults to ``None``.
        title_fontsize (string or int, optional): Matplotlib-style font
            size for the title. Defaults to "large".
        text_fontsize (string or int, optional): Matplotlib-style font
            size for labels/ticks. Defaults to "medium".

    Returns:
        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot
        was drawn.
    """
    # Unknown second metric: warn and fall back to the default ('time').
    if second_metric not in ('time', 'silhoutte'):
        warnings.warn("\nSecond Metric is not allowed. Must be one of ['time','silhoutte'].\nYou entered '{}'. This will be reset to 'time'".format(second_metric))
        second_metric = 'time'

    cluster_ranges = range(1, 12, 2) if cluster_ranges is None else sorted(cluster_ranges)

    if not hasattr(clf, 'n_clusters'):
        raise TypeError('"n_clusters" attribute not in classifier. '
                        'Cannot plot elbow method.')

    # Fit one clone of the clusterer per candidate K (possibly in parallel).
    # Each result is a (SSE, second_metric_score) pair.
    results = Parallel(n_jobs=n_jobs)(delayed(_clone_and_score_clusterer)
                                      (clf, X, k, second_metric) for k in cluster_ranges)
    sse_scores, second_metric_scores = zip(*results)

    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=figsize)

    ax.set_title(title, fontsize=title_fontsize)
    ax.plot(cluster_ranges, np.absolute(sse_scores), 'b*-')
    ax.grid(True)
    ax.set_xlabel('Number of clusters', fontsize=text_fontsize)
    ax.set_ylabel('Sum of Squared Errors', fontsize=text_fontsize)
    ax.tick_params(labelsize=text_fontsize)

    if show_second_metric:
        # Label the secondary axis according to the metric shown there.
        if second_metric == 'silhoutte':
            y_label = 'Silhoutte Score'
        else:
            # 'time' is both the explicit option and the default fallback.
            y_label = 'Clustering duration (seconds)'

        twin_color = 'green'
        twin_ax = ax.twinx()
        twin_ax.plot(cluster_ranges, second_metric_scores, ':', alpha=0.75, color=twin_color)
        twin_ax.set_ylabel(y_label,
                           color=twin_color, alpha=0.75,
                           fontsize=text_fontsize)
        twin_ax.tick_params(colors=twin_color, labelsize=text_fontsize)

    return ax
121+
122+
123+
def _clone_and_score_clusterer(clf, X, n_clusters, second_metric):
    """Clone and score a clusterer instance for one value of ``n_clusters``.

    NOTE: In this modified implementation (vs. scikit-plot), the
    clusterer's ``score`` method is not needed anymore since the SSE is
    calculated manually. This allows clusterers without ``score`` (e.g.
    hierarchical clustering) to be evaluated as well.

    Args:
        clf: Clusterer instance that implements ``fit``, ``fit_predict``,
            and ``score`` methods, and an ``n_clusters`` hyperparameter,
            e.g. a :class:`sklearn.cluster.KMeans` instance.
        X (array-like, shape (n_samples, n_features)): Data to cluster.
            NOTE(review): the per-cluster means use ``.to_numpy()``, so X
            is assumed to be a pandas DataFrame — confirm at call sites.
        n_clusters (int): Number of clusters to fit.
        second_metric (string): Second metric to return; the first is
            always the SSE. 'time' returns the fit duration in seconds,
            'silhoutte' the silhouette score; anything else yields NaN.

    Returns:
        tuple: ``(SSW, second_metric_score)`` — the within-cluster sum of
        squared distances to the cluster centers, and the second metric.
    """
    start = time.time()
    clf = clone(clf)
    setattr(clf, 'n_clusters', n_clusters)
    clf.fit(X)

    labels = clf.labels_

    # Not every clustering algorithm exposes cluster_centers_, so compute
    # the centers manually as the per-cluster feature means.
    num_features = X.shape[1]
    centers = np.empty((num_features, 0))

    for i in range(len(set(labels))):
        single_cluster_means = X[labels == i].mean().to_numpy().reshape(num_features, 1)
        centers = np.concatenate((centers, single_cluster_means), axis=1, out=None)

    centers = centers.T

    # SSW (the elbow-curve y value): sum of squared Euclidean distances of
    # each sample to its nearest center. Background:
    # https://stats.stackexchange.com/questions/81954/ssb-sum-of-squares-between-clusters
    _, euc_distance_to_centroids = vq(obs=X, code_book=centers)
    SSW = np.sum(euc_distance_to_centroids**2)

    second_metric_score = math.nan
    if second_metric == 'time':
        second_metric_score = time.time() - start
    elif second_metric == 'silhoutte':
        # Fixed random_state keeps the (sampled) silhouette reproducible.
        second_metric_score = mt.silhouette_score(X, labels, random_state=101)

    return SSW, second_metric_score
187+
31.3 KB
Binary file not shown.

dist/more-0.0.1b12.tar.gz

20.9 KB
Binary file not shown.

more.egg-info/PKG-INFO

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Metadata-Version: 2.1
22
Name: more
3-
Version: 0.0.1b11
3+
Version: 0.0.1b12
44
Summary: A helper library for Pandas, Visualizations and Scikit-learn
55
Home-page: https://github.com/ngupta23/more
66
Author: Nikhil Gupta
@@ -13,7 +13,7 @@ Description: # "More" Package
1313
# Installation
1414

1515
* For standard installation > pip install more
16-
* For installing a particular version > pip install more==0.0.1b10
16+
* For installing a particular version > pip install more==0.0.1b12
1717

1818
# Overview
1919

@@ -27,6 +27,13 @@ Description: # "More" Package
2727

2828
# Version History
2929

30+
## 0.0.1b12
31+
32+
* Add functions for plotting elbow curves.
33+
- Code modified from: https://github.com/reiinakano/scikit-plot
34+
- Modifications made to support running for Hierarchical Clustering as well as support for plotting Silhouette Score
35+
* typo fixed in function name
36+
3037
## 0.0.1b10 & 0.0.1b11
3138

3239
* Updated Visualization Helper to add function to plot Heatmap

more.egg-info/SOURCES.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ more/scikit_helper/cluster/BaseClusterWithN.py
1717
more/scikit_helper/cluster/GaussianMixture.py
1818
more/scikit_helper/cluster/KMeansHelper.py
1919
more/scikit_helper/cluster/__init__.py
20+
more/scikit_helper/cluster/plot_elbow_curve.py
2021
more/viz_helper/__init__.py
2122
more/viz_helper/pca_helper.py
2223
more/viz_helper/plot_corr.py

0 commit comments

Comments
 (0)