Skip to content

Commit e0ef4bc

Browse files
committed
Added GaussianMixtureHelper + 2 Bug Fixes
1 parent 77b063e commit e0ef4bc

File tree

20 files changed

+846
-30
lines changed

20 files changed

+846
-30
lines changed

README.md

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ This is a helper package for a variety of functions as described in the Overview
44

55
# Installation
66

7-
pip install more==0.0.1b6
7+
pip install more==0.0.1b7
88

99
# Overview
1010

@@ -15,3 +15,12 @@ This is a helper package for a variety of functions
1515

1616
# Examples
1717
Check out the [examples](https://github.com/ngupta23/more/tree/master/examples) folder for details on usage
18+
19+
# Version History
20+
21+
## 0.0.1b7
22+
23+
* Added Cluster Helper for Gaussian Clusters
24+
* Fixed bug in plot_parallel_coordinates, which did not work correctly for multi-level categorical labels
25+
* Fixed bug in the pandas helper when describing categorical and numeric fields — it now warns if the dataframe has no categorical or numeric columns when the respective describe function is called.
26+

build/lib/more/pandas_helper/__init__.py

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import pandas as pd
2-
# import numpy as np
32
import warnings
43

54
@pd.api.extensions.register_dataframe_accessor("helper")
@@ -42,23 +41,30 @@ def describe_categorical(self, verbose=False):
4241
the dataframe contains both numeric and categorical variables.
4342
This extension provides more flexibility
4443
"""
45-
self.__print_dashes(45)
46-
print("Summary Statictics for Categorical Variables")
47-
self.__print_dashes(45)
48-
print(self._obj[self.cat_features].describe())
49-
50-
if (verbose):
51-
self.level_counts()
44+
if (self._cat_exists()):
45+
self.__print_dashes(45)
46+
print("Summary Statictics for Categorical Variables")
47+
self.__print_dashes(45)
48+
print(self._obj[self.cat_features].describe())
49+
50+
if (verbose):
51+
self.level_counts()
52+
else:
53+
warnings.warn("Data does not have any categorical columns")
54+
5255

5356
def describe_numeric(self):
    """
    Print summary statistics for the numeric columns of the wrapped frame.

    Same output as pd.DataFrame.describe() restricted to numeric features,
    but emits a warning instead of failing when the frame has no numeric
    columns at all.
    """
    if (self._num_exists()):
        self.__print_dashes(40)
        # Typo fixed in the printed header: "Statictics" -> "Statistics".
        print("Summary Statistics for Numeric Variables")
        self.__print_dashes(40)
        print(self._obj[self.num_features].describe())
    else:
        warnings.warn("Data does not have any numeric columns")
6268

6369
def describe(self,verbose=False):
6470
"""
@@ -116,6 +122,18 @@ def __set_num_features(self):
116122
def __set_all_feature_types(self):
117123
self.__set_cat_features()
118124
self.__set_num_features()
125+
126+
def _cat_exists(self):
    """Return True when the frame has at least one categorical column."""
    return len(self.cat_features) > 0
131+
132+
def _num_exists(self):
    """Return True when the frame has at least one numeric column."""
    return len(self.num_features) > 0
119137

120138
def __print_dashes(self, num=20):
    """Print a separator line made of `num` dash characters."""
    print(num * "-")
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import numpy as np
2+
import pandas as pd
3+
import matplotlib.pyplot as plt
4+
5+
# this code has been manipulated from the source available on sklearn's website documentation
6+
import itertools
7+
from sklearn import metrics as mt
8+
from scipy import linalg
9+
import matplotlib as mpl
10+
from sklearn import mixture
11+
12+
class GaussianMixtureHelper:
    """
    Train and evaluate Gaussian Mixture clustering models.

    Fits one sklearn GaussianMixture per (covariance type, number of
    components) combination, records the BIC and AIC of each fit, and keeps
    the best model under each criterion. The model exposed through
    get_best_model()/predict() is selected by `metric` ('bic' or 'aic').

    Plotting code adapted from the example in sklearn's documentation.
    """

    def __init__(self, X, y=None
                 , n_components_range=range(2, 3), cov_types=None
                 , metric='bic', random_state=101):
        """
        Parameters
        ----------
        X : feature data to cluster (indexed by column name in
            plot_best_model, so presumably a pandas DataFrame — confirm).
        y : optional true labels; only used by clusters_vs_true_labels().
        n_components_range : iterable of component counts to try.
        cov_types : list of covariance types accepted by
            sklearn.mixture.GaussianMixture; defaults to ['spherical'].
        metric : 'bic' or 'aic' — criterion used to pick the best model.
        random_state : seed passed to every GaussianMixture fit.

        Raises
        ------
        ValueError if `metric` is not 'bic' or 'aic'.
        """
        self.X = X
        self.y = y
        self.n_components_range = n_components_range
        # None sentinel avoids the shared mutable-default-argument pitfall;
        # the effective default is unchanged.
        self.cov_types = ['spherical'] if cov_types is None else cov_types
        # Validation was a TODO in the original implementation.
        self._check_metric(metric)
        self.metric = metric
        self.random_state = random_state
        self.y_pred = None
        self.best_gmm = None
        self.best_gmm_bic = None
        self.best_gmm_aic = None
        self.bic = []
        self.aic = []
        # np.inf: the np.infty alias was removed in NumPy 2.0.
        self.lowest_bic = np.inf
        self.lowest_aic = np.inf

    @staticmethod
    def _check_metric(metric):
        """Raise ValueError for any metric other than 'bic'/'aic'."""
        if metric not in ('bic', 'aic'):
            raise ValueError("metric must be 'bic' or 'aic', got %r" % (metric,))

    def train(self):
        """
        Train a Gaussian Mixture Model across the configured range of
        cluster counts and covariance types.

        Tracks the lowest-BIC and lowest-AIC models as training proceeds,
        then selects the best model per `self.metric` and caches the
        predictions for the training data in `self.y_pred`.
        Returns self so calls can be chained.
        """
        for cov_type in self.cov_types:
            for n_components in self.n_components_range:
                # Fit a mixture of Gaussians with EM
                gmm = mixture.GaussianMixture(n_components=n_components
                                              , covariance_type=cov_type
                                              , random_state=self.random_state)
                gmm.fit(self.X)
                self.bic.append(gmm.bic(self.X))
                self.aic.append(gmm.aic(self.X))

                if self.bic[-1] < self.lowest_bic:
                    self.lowest_bic = self.bic[-1]
                    self.best_gmm_bic = gmm

                if self.aic[-1] < self.lowest_aic:
                    self.lowest_aic = self.aic[-1]
                    self.best_gmm_aic = gmm

        self.set_best_model()
        self.y_pred = self.predict(self.X)
        return(self)

    def set_metric(self, metric):
        """Change the selection metric ('bic' or 'aic'); raises ValueError otherwise."""
        self._check_metric(metric)
        self.metric = metric

    def set_best_model(self):
        """
        Use to set the best model to the one based on a specific metric
        Default Metric = 'bic'; Other Option(s): 'aic'
        """
        if (self.metric == 'bic'):
            self.best_gmm = self.best_gmm_bic
        elif (self.metric == 'aic'):
            self.best_gmm = self.best_gmm_aic
        else:
            # Defensive: metric is validated on entry, so this is unreachable
            # unless the attribute was set directly.
            raise ValueError("metric must be 'bic' or 'aic', got %r" % (self.metric,))

    def get_best_model(self):
        """Return the currently selected best GaussianMixture (or None before train())."""
        return(self.best_gmm)

    def plot_metrics(self, figsize=(12, 4)):
        """
        Plot BIC (left) and AIC (right) bar charts, one bar group per
        covariance type, with '*' marking the best (lowest) score.
        Returns the matplotlib.pyplot module so the caller can show/save.

        Adapted from the sklearn documentation example.
        """
        plt.figure(figsize=figsize)
        # Work on local ndarray copies: the original reassigned
        # self.bic/self.aic to ndarrays, which broke subsequent .append()
        # calls if train() was run again.
        self._plot_scores(np.array(self.bic), 'BIC score per model', 1)
        self._plot_scores(np.array(self.aic), 'AIC score per model', 2)
        plt.tight_layout()
        return(plt)

    def _plot_scores(self, scores, title, position):
        """Bar chart of `scores` per (cov_type, n_components); '*' marks the minimum."""
        spl = plt.subplot(1, 2, position)
        color_iter = itertools.cycle(['k', 'r', 'b', 'g', 'c', 'm', 'y'])
        n_range = len(self.n_components_range)
        bars = []
        # NOTE: the original bound the loop variable to self.cov_type,
        # silently clobbering instance state; a local name is used instead.
        for i, (cov_type, color) in enumerate(zip(self.cov_types, color_iter)):
            xpos = np.array(self.n_components_range) + .2 * (i - 2)
            bars.append(plt.bar(xpos,
                                scores[i * n_range:(i + 1) * n_range],
                                width=.2, color=color))
        plt.xticks(self.n_components_range)
        plt.ylim([scores.min() * 1.01 - .01 * scores.max(), scores.max()])
        plt.title(title)
        # Position the '*' above the bar holding the best score.
        xpos = (np.min(self.n_components_range) - 0.4
                + np.mod(scores.argmin(), n_range)
                + .2 * np.floor(scores.argmin() / n_range))
        plt.text(xpos, scores.min() * 0.97 + .03 * scores.max(), '*', fontsize=14)
        spl.set_xlabel('Number of components')
        spl.legend([b[0] for b in bars], self.cov_types)

    def predict(self, X):
        """Predict cluster labels for X with the currently selected best model."""
        clf = self.get_best_model()
        if clf is None:
            raise ValueError("No trained model available - call train() first.")
        y_pred = clf.predict(X)
        return(y_pred)

    def plot_best_model(self, feat_x, feat_y):
        """
        Scatter the data for two named features colored by predicted
        cluster, overlaying one ellipse per Gaussian component.

        Assumes self.X supports boolean-mask indexing followed by column
        selection (i.e. a pandas DataFrame) — confirm against callers.
        """
        plt.figure(figsize=(12, 6))
        splot = plt.subplot(1, 1, 1)

        color_iter = itertools.cycle(['k', 'r', 'b', 'g', 'c', 'm', 'y'])
        clf = self.get_best_model()

        for i, (mean, covar, color) in enumerate(zip(clf.means_,
                                                     clf.covariances_,
                                                     color_iter)):
            # Normalize covar to a full 2x2 matrix regardless of cov_type
            # ('spherical' yields a scalar, 'diag'/'tied' a vector).
            if len(covar.shape) < 2:
                tmp = np.zeros((2, 2))
                np.fill_diagonal(tmp, covar)
                covar = tmp
            elif covar.shape[0] != covar.shape[1]:
                covar = np.diag(covar)

            v, w = linalg.eigh(covar)
            if not np.any(self.y_pred == i):
                # Skip components that own no points.
                continue

            plt.scatter(self.X[self.y_pred == i][feat_x],
                        self.X[self.y_pred == i][feat_y], 5, color=color)

            # Plot an ellipse to show the Gaussian component
            angle = np.arctan2(w[0][1], w[0][0])
            angle = 180 * angle / np.pi  # convert to degrees
            v *= 4
            # `angle` passed by keyword: positional use is deprecated/removed
            # in recent matplotlib releases.
            ell = mpl.patches.Ellipse(mean, v[0], v[1], angle=180 + angle,
                                      color=color)
            ell.set_clip_box(splot.bbox)
            ell.set_alpha(.5)
            splot.add_artist(ell)

        plt.title('Selected GMM')
        plt.show()

    def clusters_vs_true_labels(self):
        """Print the confusion matrix of true labels vs predicted cluster labels."""
        if self.y is None:
            raise ValueError("True labels (y) were not provided at construction.")
        self.y_pred = self.predict(self.X)
        num_true_classes = len(set(self.y))
        print(mt.confusion_matrix(self.y, self.y_pred)[0:num_true_classes, :])
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .GaussianMixture import GaussianMixtureHelper

build/lib/more/viz_helper/plot_parallel_coordinates.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ def plot_parallel_coordinates(data, by, sample=True, frac=1.0, normalize=True
2121
else:
2222
df_sub = data.copy(deep=False)
2323

24-
df_sub[by] = df_sub[by] == 1 # converting categorical variable into number for plotting
24+
# Commenting out since the by variable could have more than 2 levels
25+
#df_sub[by] = df_sub[by] == 1 # converting categorical variable into number for plotting
2526

2627

2728
# This plot is more meaningful when values are normalized

dist/more-0.0.1b7-py3-none-any.whl

22.2 KB
Binary file not shown.

dist/more-0.0.1b7.tar.gz

14.1 KB
Binary file not shown.

0 commit comments

Comments
 (0)