Skip to content

Commit 29e70d2

Browse files
committed
0.0.1b10 and 0.0.1b11 released
1 parent df81e19 commit 29e70d2

31 files changed

+1145
-204
lines changed

README.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ This is a helper package for a variety of functions as described in the Overview
55
# Installation
66

77
* For standard installation > pip install more
8-
* For installing a particular version > pip install more==0.0.1b8
8+
* For installing a particular version > pip install more==0.0.1b10
99

1010
# Overview
1111

@@ -19,6 +19,17 @@ Check out the [examples](https://github.com/ngupta23/more/tree/master/examples)
1919

2020
# Version History
2121

22+
## 0.0.1b10 & 0.0.1b11
23+
24+
* Updated Visualization Helper to add function to plot Heatmap
25+
* Updated BaseClusterWithN to allow plotting of heatmap showing how "cluster feature means" vary between clusters
26+
* 0.0.1b11 included a small fix for a bug in 0.0.1b10
27+
28+
## 0.0.1b9
29+
30+
* Updated KMeans and Agglomerative Cluster Helpers to include evaluate_by argument
31+
32+
2233
## 0.0.1b8
2334

2435
* Added Cluster Helpers for KMeans and Agglomerative Clustering

build/lib/more/pandas_helper/__init__.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,54 @@ def drop_columns(self, drops, inplace=False):
108108
self.__set_all_feature_types()
109109
else:
110110
return(self._obj.drop(cols_to_del, axis=1, inplace=inplace))
111+
112+
def map_columns(self, mapping):
    """
    Rename columns of the underlying frame in place.

    mapping: dictionary mapping existing column names to new names.
    """
    # axis="columns" form is equivalent to the columns= keyword.
    self._obj.rename(mapping, axis="columns", inplace=True)
117+
118+
def add_columns(self, names, value=""):
    """
    Add one or more new columns, each filled with a single value for all rows.

    (Previously called add_new_col.)

    names: a single column name (str) or a list of column names to add.
    value: the value assigned to every row of each new column (default "").
    """
    # Bug fix: use isinstance instead of type(...) == str so str
    # subclasses are handled correctly.
    if isinstance(names, str):
        names = [names]
    for name in names:
        self._obj[name] = value
129+
130+
def filter_change(self, filterCol, filterValue, changeCol, changeValue):
    """
    Set changeCol to changeValue on every row where filterCol equals filterValue.

    filterCol: column name to filter by
    filterValue: value to match in filterCol
    changeCol: column whose values are updated
    changeValue: replacement value written into changeCol
    """
    matches = self._obj[filterCol] == filterValue
    self._obj.loc[matches, [changeCol]] = changeValue
138+
139+
def filter_delete(self, deleteCol, deleteValue):
    """
    Drop every row where deleteCol equals deleteValue.

    deleteCol: column name to filter by
    deleteValue: rows with this value in deleteCol are removed
    """
    # Bug fix: the original indexed self.data, which does not exist on
    # this accessor (the frame is stored as self._obj), raising
    # AttributeError on every call.
    self._obj = self._obj[self._obj[deleteCol] != deleteValue]
141+
142+
def concat_columns(self, newColName, column1, column2, concatBy=" "):
    """
    Create newColName by joining column1 and column2 with the
    separator concatBy (a single space by default).
    """
    joined = self._obj[column1] + concatBy + self._obj[column2]
    self._obj[newColName] = joined
144+
145+
def strip_columns(self, names):
    """
    Strip leading/trailing whitespace from the values of the given column(s).

    Values are converted to str first because some are treated as float.
    """
    converted = self._obj[names].astype(str)
    self._obj[names] = converted.map(str.strip)
147+
148+
def title_case(self, names):
    """
    Convert the values of the given column(s) to title case.

    Values are converted to str first because some are treated as float.
    """
    converted = self._obj[names].astype(str)
    self._obj[names] = converted.map(str.title)
150+
151+
def upper_case(self, names):
    """
    Convert the values of the given column(s) to upper case.

    Values are converted to str first because some are treated as float.
    """
    converted = self._obj[names].astype(str)
    self._obj[names] = converted.map(str.upper)
153+
154+
def select(self, names, inplace=False):
    """
    Return the subset of columns given by names.

    names: column label or list of labels to select.
    inplace: if True, also replace the stored frame with the selection.

    Bug fix: the original re-indexed the already-reduced object after an
    inplace selection (self._obj[names] applied twice), which raises when
    names is a single string label (the second lookup indexes a Series by
    a column name) and does redundant work for list selections.
    """
    selection = self._obj[names]
    if inplace:
        self._obj = selection
    return selection
111159

112160
#########################
113161
#### Private Methods ####

build/lib/more/scikit_helper/cluster/AgglomerativeHelper.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22
from . import BaseClusterWithN
33

44
class AgglomerativeHelper(BaseClusterWithN):
    """
    Class to train and evaluate a Agglomerative (Hierarchical) Cluster Model
    """

    def __init__(self, X, n_clusters, evaluate_by=None, linkage="ward",
                 scaled=True, random_state=101):
        """
        X: feature data to cluster
        n_clusters: number of clusters to fit
        evaluate_by: column name used to compare across clusters (forwarded to base)
        linkage: linkage criterion for AgglomerativeClustering (default "ward")
        scaled: whether features should be scaled (forwarded to base)
        random_state: seed forwarded to the base class
        """
        # Bug fix: 'scaled' was accepted here but never forwarded to the
        # base class, so the caller's choice was silently ignored.
        super().__init__(X=X, n_clusters=n_clusters, evaluate_by=evaluate_by,
                         scaled=scaled, random_state=random_state)
        self.linkage = linkage
        self.cluster_obj = AgglomerativeClustering(n_clusters=self.n_clusters,
                                                   linkage=self.linkage)
1212

build/lib/more/scikit_helper/cluster/BaseClusterWithN.py

Lines changed: 54 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,29 @@
11
import warnings
22
import pandas as pd
3+
import matplotlib.pyplot as plt
34
from sklearn.preprocessing import StandardScaler
45
from sklearn import metrics
56
from more import viz_helper as vh
7+
from more import pandas_helper
68

79
class BaseClusterWithN:
8-
def __init__(self, X, n_clusters, scaled = True, random_state = 101):
10+
def __init__(self, X, n_clusters, evaluate_by=None, scaled=True, random_state=101):
911
"""
1012
Class to train and evaluate a Base Cluster Class with Number of Clusters Specified
13+
evaluate_by = column name to use to compare across the clusters eventually
1114
"""
12-
self.X = X.reset_index(drop=True)
15+
self.evaluate_by = evaluate_by
16+
17+
if (self.evaluate_by != None):
18+
self.evaluate_by_values = X[self.evaluate_by]
19+
self.X = X.helper.drop_columns([self.evaluate_by])
20+
else:
21+
self.X = X
22+
23+
#self.X = X.reset_index(drop=True)
1324
self.n_clusters = n_clusters
25+
26+
1427
self.scaled = scaled
1528
self.random_state = random_state
1629
self.cluster_obj = None # Define in child class
@@ -39,7 +52,7 @@ def train(self, merge = True):
3952

4053
return(self) # Allows to cascade methods
4154

42-
def evaluate(self, metric = "silhoutte"):
55+
def evaluate_fit(self, metric = "silhoutte"):
4356
"""
4457
Provides the Goodness of Fit Statistics for the clustering algorithm
4558
"""
@@ -49,13 +62,17 @@ def evaluate(self, metric = "silhoutte"):
4962
else:
5063
self.silhoutte_score = metrics.silhouette_score(self.X, self.labels, random_state= self.random_state)
5164
else:
52-
warnings.warn("Metrix {} is not supported".format(metric))
65+
warnings.warn("Metric {} is not supported".format(metric))
5366

5467
print("Silhouette Coefficient: {}".format(self.silhoutte_score))
5568

5669
def merge_data_labels(self):
    """
    Attach the fitted cluster labels (and, when evaluate_by is set, the
    held-out evaluate_by column values) to both the raw and scaled
    feature frames, producing self.merged_data and self.merged_scaled_data.
    """
    labels = pd.Series(self.labels, name='labels')
    # 'is None' is the correct identity test; '== None' can misbehave on
    # objects (e.g. pandas Series) that overload equality.
    if self.evaluate_by is None:
        extras = [labels]
    else:
        extras = [labels, self.evaluate_by_values]
    self.merged_data = pd.concat([self.X] + extras, axis=1)
    self.merged_scaled_data = pd.concat([self.X_scaled] + extras, axis=1)
5976

6077
def cluster_obs_count(self):
6178
"""
@@ -66,14 +83,22 @@ def cluster_obs_count(self):
6683
def cluster_means(self):
    """
    Provides the means of the cluster features for each cluster
    (clusters as columns, features as rows).
    If evaluate_by is set, then clusters will be sorted by the mean value
    of the "evaluate_by" column.
    """
    means = self.merged_data.groupby('labels').mean()
    if self.evaluate_by is not None:
        means = means.sort_values(self.evaluate_by)
    return means.transpose()
92+
7293
def cluster_means_scaled(self):
    """
    Provides the means (scaled) of the cluster features for each cluster
    (clusters as columns, features as rows).
    If evaluate_by is set, then clusters will be sorted by the mean value
    of the "evaluate_by" column.
    """
    means = self.merged_scaled_data.groupby('labels').mean()
    if self.evaluate_by is not None:
        means = means.sort_values(self.evaluate_by)
    return means.transpose()
77102

78103
def plot_parallel_coordinates(self, scaled=True, frac=0.05, figsize=(12,6), xrot=0):
79104
"""
@@ -84,6 +109,26 @@ def plot_parallel_coordinates(self, scaled=True, frac=0.05, figsize=(12,6), xrot
84109
else:
85110
vh.plot_parallel_coordinates(data = self.merged_data, by = 'labels', normalize=False, frac=frac, figsize=figsize, xrot=xrot)
86111

112+
113+
def plot_headmap(self, scale_rows=True, cmap='viridis', figsize=(6,6)
                 , annot=False, valfmt="{x:.1f}", fontsize=12, fontweight="bold",textcolors=["white", "black"] ):
    """
    Plot a heatmap of the per-cluster feature means (see cluster_means).

    valfmt example: "{x:.1f}"
    NOTE(review): the name looks like a typo for 'plot_heatmap'; kept
    as-is so existing callers do not break.
    """
    means = self.cluster_means()
    cbarlabel = "Normalized Values" if scale_rows else "Values"

    fig, ax = plt.subplots(figsize=figsize)
    im, cbar = vh.heatmap(means.to_numpy(), row_labels=means.index,
                          col_labels=means.columns, ax=ax,
                          scale_rows=scale_rows, cmap=cmap,
                          cbarlabel=cbarlabel)

    if annot:
        vh.annotate_heatmap(im, valfmt=valfmt, size=fontsize,
                            fontweight=fontweight, textcolors=textcolors)
87132

88133

89134

build/lib/more/scikit_helper/cluster/KMeansHelper.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22
from . import BaseClusterWithN
33

44
class KMeansHelper(BaseClusterWithN):
    """
    Class to train and evaluate a KMeans Cluster Model
    """

    def __init__(self, X, n_clusters, evaluate_by=None, init="k-means++",
                 n_jobs=None, scaled=True, random_state=101):
        """
        X: feature data to cluster
        n_clusters: number of clusters to fit
        evaluate_by: column name used to compare across clusters (forwarded to base)
        init: KMeans initialization method (default "k-means++")
        n_jobs: parallelism passed through to KMeans
        scaled: whether features should be scaled (forwarded to base)
        random_state: seed used both by the base class and by KMeans
        """
        # Bug fix: 'scaled' was accepted here but never forwarded to the
        # base class, so the caller's choice was silently ignored.
        super().__init__(X=X, n_clusters=n_clusters, evaluate_by=evaluate_by,
                         scaled=scaled, random_state=random_state)
        self.init = init
        self.n_jobs = n_jobs
        self.cluster_obj = KMeans(n_clusters=self.n_clusters, init=self.init,
                                  random_state=self.random_state, n_jobs=self.n_jobs)

build/lib/more/viz_helper/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@
77
from .plot_data import plot_data
88
from .plot_parallel_coordinates import plot_parallel_coordinates
99
from .plot_similarity import plot_similarity
10+
from .plot_heatmap import heatmap, annotate_heatmap
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import numpy as np
2+
import matplotlib
3+
import matplotlib.pyplot as plt
4+
5+
6+
def heatmap(data, row_labels, col_labels, ax=None, scale_rows=False, xrot=0,
            cbar_kw=None, cbarlabel="", **kwargs):
    """
    Create a heatmap from a numpy array and two lists of labels.
    Code taken from : https://matplotlib.org/3.1.1/gallery/images_contours_and_fields/image_annotated_heatmap.html#sphx-glr-gallery-images-contours-and-fields-image-annotated-heatmap-py
    Addition made to
        Scale by each row individually -- useful when each row has its own scale
        Allow for Xlabel rotation

    Parameters
    ----------
    data
        A 2D numpy array of shape (N, M).
    row_labels
        A list or array of length N with the labels for the rows.
    col_labels
        A list or array of length M with the labels for the columns.
    ax
        A `matplotlib.axes.Axes` instance to which the heatmap is plotted. If
        not provided, use current axes or create a new one. Optional.
    scale_rows:
        Useful when each row has its own scale.
        Scales each row: (value - row min) / row range.
        This ensures that if the rows represent items with different ranges,
        the one with the max range does not overwhelm the plot.
    xrot:
        Rotation of the X-labels.
    cbar_kw
        A dictionary with arguments to `matplotlib.Figure.colorbar`. Optional.
    cbarlabel
        The label for the colorbar. Optional.
    **kwargs
        All other arguments are forwarded to `imshow`.
    """
    # Bug fix: the default for cbar_kw was a mutable dict literal, which is
    # shared across calls; use None as the sentinel instead.
    if cbar_kw is None:
        cbar_kw = {}

    if not ax:
        ax = plt.gca()

    # If rows need to be scaled, perform the scaling now
    if scale_rows:
        data = (data - np.min(data, axis=1)[:, np.newaxis]) / np.ptp(data, axis=1)[:, np.newaxis]

    # Plot the heatmap
    im = ax.imshow(data, interpolation='none', aspect='auto', **kwargs)

    # Create colorbar
    cbar = ax.figure.colorbar(im, ax=ax, **cbar_kw)
    cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom")

    # We want to show all ticks...
    ax.set_xticks(np.arange(data.shape[1]))
    ax.set_yticks(np.arange(data.shape[0]))
    # ... and label them with the respective list entries.
    ax.set_xticklabels(col_labels)
    ax.set_yticklabels(row_labels)

    # Let the horizontal axes labeling appear on top.
    ax.tick_params(top=True, bottom=True, labeltop=True, labelbottom=True)

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=xrot, ha="center", rotation_mode="anchor")

    # Turn spines off and create white grid.
    for edge, spine in ax.spines.items():
        spine.set_visible(False)

    ax.set_xticks(np.arange(data.shape[1] + 1) - .5, minor=True)
    ax.set_yticks(np.arange(data.shape[0] + 1) - .5, minor=True)
    ax.tick_params(which="minor", bottom=False, left=False)

    return im, cbar
78+
79+
80+
def annotate_heatmap(im, data=None, valfmt="{x:.2f}", textcolors=("black", "white"), threshold=None, **textkw):
    """
    A function to annotate a heatmap.
    Code taken from : https://matplotlib.org/3.1.1/gallery/images_contours_and_fields/image_annotated_heatmap.html#sphx-glr-gallery-images-contours-and-fields-image-annotated-heatmap-py

    Parameters
    ----------
    im
        The AxesImage to be labeled.
    data
        Data used to annotate. If None, the image's data is used. Optional.
    valfmt
        The format of the annotations inside the heatmap. This should either
        use the string format method, e.g. "$ {x:.2f}", or be a
        `matplotlib.ticker.Formatter`. Optional.
    textcolors
        A sequence of two color specifications. The first is used for
        values below a threshold, the second for those above. Optional.
    threshold
        Value in data units according to which the colors from textcolors are
        applied. If None (the default) uses the middle of the colormap as
        separation. Optional.
    **textkw
        All other arguments are forwarded to each call to `text` used to
        create the text labels.
    """
    # Bug fix: the default for textcolors was a mutable list shared across
    # calls; a tuple is safe and indexes identically.
    if not isinstance(data, (list, np.ndarray)):
        data = im.get_array()

    # Normalize the threshold to the image's color range.
    if threshold is not None:
        threshold = im.norm(threshold)
    else:
        threshold = im.norm(data.max()) / 2.

    # Set default alignment to center, but allow it to be
    # overwritten by textkw.
    kw = dict(horizontalalignment="center",
              verticalalignment="center")
    kw.update(textkw)

    # Get the formatter in case a string is supplied
    if isinstance(valfmt, str):
        valfmt = matplotlib.ticker.StrMethodFormatter(valfmt)

    # Loop over the data and create a `Text` for each "pixel".
    # Change the text's color depending on the data.
    texts = []
    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            kw.update(color=textcolors[int(im.norm(data[i, j]) > threshold)])
            text = im.axes.text(j, i, valfmt(data[i, j], None), **kw)
            texts.append(text)

    return texts
27.7 KB
Binary file not shown.

dist/more-0.0.1b10.tar.gz

18.1 KB
Binary file not shown.
27.7 KB
Binary file not shown.

0 commit comments

Comments
 (0)