separating all features

tawabshakeel · tawabshakeel · commit 2273cf9097a9 · 2020-12-08T00:16:13.000+05:00
diff --git a/__init__.py b/__init__.py
@@ -1 +1,3 @@
 from explainx.explain import *
+
+from explainx.main import *
diff --git a/demo-explainx-with-sound.gif b/demo-explainx-with-sound.gif
diff --git a/explain.py b/explain.py
@@ -1,8 +1,6 @@
 import os
 import sys
-
 import re
-
 from pathlib import Path
 from sys import platform
 import subprocess
@@ -19,33 +17,25 @@
 from calculate_shap import *
 from analytics import Analytics
 
-"""
-This class calculates feature importance
-
-Input: 
-
-
-"""
-
-
 class explain():
     def __init__(self):
         super(explain, self).__init__()
         self.param = {}
 
     # is classification function?
 
-    def is_classification_given_y_array(self, y_test):
-        is_classification = False
-        total = len(y_test)
-        total_unique = len(set(y_test))
-        if total < 30:
-            if total_unique < 10:
-                is_classification = True
-        else:
-            if total_unique < 20:
-                is_classification = True
-        return is_classification
+    # def is_classification_given_y_array(self, y_test):
+    #     is_classification = False
+    #     total = len(y_test)
+    #     total_unique = len(set(y_test))
+    #     if total < 30:
+    #         if total_unique < 10:
+    #             is_classification = True
+    #     else:
+    #         if total_unique < 20:
+    #             is_classification = True
+    #     return is_classification
+    
 
     def random_string_generator(self):
         random_str = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10))
@@ -54,6 +44,8 @@ def random_string_generator(self):
     def ai(self, df, y, model, model_name="xgboost", mode=None):
         y_variable = "y_actual"
         y_variable_predict = "y_prediction"
+        
+        #Code for Analytics
         instance_id = self.random_string_generator()
         analytics = Analytics()
         analytics['ip'] = analytics.finding_ip()
@@ -69,11 +61,6 @@ def ai(self, df, y, model, model_name="xgboost", mode=None):
         analytics['finish_time'] = ''
         analytics.insert_data()
 
-        # If yes, then different shap functuions are required.
-        # get the shap value based on predcton and make a new dataframe.
-
-        # find predictions first as shap values need that.
-
         prediction_col = []
 
         if model_name == "xgboost":
@@ -88,40 +75,48 @@ def ai(self, df, y, model, model_name="xgboost", mode=None):
             prediction_col = model.predict(df.to_numpy())
 
         else:
-            prediction_col = model.predict(df.to_numpy())
+            prediction_col = model.predict(df)
 
         # is classification?
-        is_classification = self.is_classification_given_y_array(prediction_col)
+        #is_classification = self.is_classification_given_y_array(prediction_col)
+        ModelType = lambda model: True if is_classifier(model) else False
+        is_classification = ModelType(model)
 
         # shap
         c = calculate_shap()
         self.df_final, self.explainer = c.find(model, df, prediction_col, is_classification, model_name=model_name)
 
-        # prediction col
+        #Append Model Decision & True Labels Columns into the dataset.
         self.df_final[y_variable_predict] = prediction_col
-
         self.df_final[y_variable] = y
 
         # additional inputs.
         if is_classification == True:
             # find and add probabilities in the dataset.
-            prediction_col_prob = model.predict_proba(df.to_numpy())
-            pd_prediction_col_prob = pd.DataFrame(prediction_col_prob)
+            #prediction_col_prob = model.predict_proba(df)
+            #pd_prediction_col_prob = pd.DataFrame(prediction_col_prob)
 
-            for c in pd_prediction_col_prob.columns:
-                self.df_final["probability_of_predicting_class_" + str(c)] = list(pd_prediction_col_prob[c])
+            probabilities = model.predict_proba(df)
 
-            classes = []
-            for c in pd_prediction_col_prob.columns:
-                classes.append(str(c))
-            self.param["classes"] = classes
+            for i in range(len(np.unique(prediction_col))):
+                self.df_final['Probability: {}'.format(np.unique(prediction_col)[i])] = probabilities[:,i]
+            
+            self.param['classes'] = np.unique(prediction_col)
+
+            #for c in pd_prediction_col_prob.columns:
+             #   self.df_final["probability_of_predicting_class_" + str(c)] = list(pd_prediction_col_prob[c])
+
+            #classes = []
+            #for c in pd_prediction_col_prob.columns:
+             #   classes.append(str(c))
+            #self.param["classes"] = classes
 
             try:
                 expected_values_by_class = self.explainer.expected_value
             except:
                 expected_values_by_class = []
-                for c in range(len(classes)):
-                    expected_values_by_class.append(1 / len(classes))
+                for c in range(len(np.unique(prediction_col))):
+                    expected_values_by_class.append(1 / len(np.unique(prediction_col)))
 
             self.param["expected_values"] = expected_values_by_class
         else:
diff --git a/lib/analytics.py b/lib/analytics.py
@@ -9,27 +9,18 @@ def __init__(self):
 
     @staticmethod
     def finding_address():
-        try:
-            val = get_mac()
-            return val
-
-        except Exception as e :
-            return None
+        val = get_mac()
+        return val
 
     @staticmethod
     def finding_ip():
-        try:
-            val = socket.gethostbyname(socket.gethostname())
-            return val
-        except Exception as e:
-            return None
+        val = socket.gethostbyname(socket.gethostname())
+        return val
 
     @staticmethod
     def finding_system():
-        try:
-            return platform.system()
-        except Exception as e:
-            return None
+        return platform.system()
+
     def __setitem__(self, key, val):
         self.dict[key] = val
 
diff --git a/lib/calculate_shap.py b/lib/calculate_shap.py
@@ -9,7 +9,6 @@
 
 """
 
-
 class calculate_shap():
     def __init__(self):
         super(calculate_shap, self).__init__()
diff --git a/lib/dashboard.py b/lib/dashboard.py
@@ -464,7 +464,6 @@ def toggle_collapse(n, is_open):
             return is_open
 
         #Cohort Analysis - Callbacks
-
         @app.callback(
         Output("modal", "is_open"),
         [Input("open", "n_clicks"), Input("close", "n_clicks")],
@@ -675,6 +674,7 @@ def update_graph(xaxis_column_name, third_axis_name, sql_query):
             g = plotly_graphs()
             graph_type = 'pdp'
             df3 = self.caching_data_manager(df, sql_query, graph_type, g.partial_dependence_plot)
+            print(df3)
             fig = g.pdp_plot(df3, df3[xaxis_column_name], df3[xaxis_column_name+"_impact"], df3[third_axis_name])
             return fig
 
diff --git a/lib/encode_decode_cat_col.py b/lib/encode_decode_cat_col.py
@@ -1,13 +1,7 @@
 from imports import *
 from sklearn.preprocessing import OneHotEncoder
 import numpy as np
-"""
-This class calculates feature importance
 
-Input: 
-
-
-"""
 
 
 class encode_decode_cat_col():
diff --git a/lib/feature_impact.py b/lib/feature_impact.py
@@ -1,22 +1,14 @@
 from imports import *
 
-"""
-This class calculates feature impact
-
-Input: 
-
-
-"""
-
-
 class feature_impact():
     def __init__(self):
         super(feature_impact, self).__init__()
         self.param= None
 
 
     def find(self,  df):
-
+        df  = pd.DataFrame(df)
+        print(df)
         variables = [col for col in df.columns if '_impact' in col]
         y = []
         for i in range(len(variables)):
diff --git a/lib/feature_impact_classification.py b/lib/feature_impact_classification.py
@@ -1,14 +1,5 @@
 from imports import *
 
-"""
-This class calculates feature impact
-
-Input: 
-
-
-"""
-
-
 class feature_impact_classification():
     def __init__(self):
         super(feature_impact_classification, self).__init__()
diff --git a/lib/feature_importance.py b/lib/feature_importance.py
@@ -1,14 +1,5 @@
 from imports import *
 
-"""
-This class calculates feature importance
-
-Input: 
-
-
-"""
-
-
 class feature_importance():
     def __init__(self):
         super(feature_importance, self).__init__()
diff --git a/lib/feature_importance_classification.py b/lib/feature_importance_classification.py
@@ -1,13 +1,5 @@
 from imports import *
 
-"""
-This class calculates feature importance
-
-Input: 
-
-
-"""
-
 
 class feature_importance_classification():
     def __init__(self):
diff --git a/lib/imports.py b/lib/imports.py
@@ -31,6 +31,8 @@
 from config_det import data_det
 from collections import deque
 from sklearn import metrics
+from sklearn.base import is_classifier, is_regressor
+import pytest
 
 firebase_app = pyrebase.initialize_app(data_det)
 ref = firebase_app.database()
diff --git a/lib/plotly_css.py b/lib/plotly_css.py
@@ -114,7 +114,7 @@
            'border-radius': '15px 15px 15px 15px',
            'box-shadow': '0 4px 8px 0 rgba(0, 0, 0, 0.2), 0 6px 20px 0 rgba(0, 0, 0, 0.19)',
            'border-right': '1px solid #2c3e50', 'border-bottom': '1px solid #2c3e50',
-           'marginTop': 50, 'width': '95%'}
+           'marginTop': 50, 'width': '100%'}
 
 style17 = {'backgroundColor': '#fff',
            'color': 'black',
@@ -147,8 +147,8 @@
 
 style20 = {'marginBottom': 50,
            'marginTop': 50,
-           'marginLeft': "1%",
-           'width': '95%', }
+           #'marginLeft': "1%",
+           'width': '100%', }
 
 style21 = {'backgroundColor': '#fff',
            'color': 'black',
diff --git a/lib/plotly_graphs.py b/lib/plotly_graphs.py
@@ -56,7 +56,7 @@ def summary_plot(self, df,classification=False):
         return df2
 
 
-    def summary_plot_graph(self, df,classification=False):
+    def summary_plot_graph(self, df):
         summary_plot = px.scatter(df, x="Feature Impact on Outcome", y="Feature Name", color="Rescaled Feature Value",
                                   hover_data=["Original Feature Value"], color_continuous_scale="Bluered_r", template="plotly_white")
         return summary_plot
@@ -70,7 +70,7 @@ def partial_dependence_plot(self, df, v1=None, v2=None, v3=None):
 
     def pdp_plot(self, df, v1, v2, v3):
         g = px.scatter(df, x=v1, y=v2, color=v3, color_continuous_scale="Bluered_r",
-                       color_discrete_sequence=px.colors.sequential.Plasma_r, template="plotly_white")
+                       color_discrete_sequence= px.colors.sequential.Plasma_r, template="plotly_white")
         return g
 
 
diff --git a/lib/rescale_numeric_feature.py b/lib/rescale_numeric_feature.py
@@ -1,13 +1,5 @@
 from imports import *
 
-"""
-This class calculates feature importance
-
-Input: 
-
-
-"""
-
 
 class get_cols():
     def __init__(self):
diff --git a/lib/shap_pdp.py b/lib/shap_pdp.py
@@ -1,13 +1,5 @@
 from imports import *
 
-"""
-This class calculates feature importance
-
-Input: 
-
-
-"""
-
 
 class shap_pdp():
     def __init__(self):
diff --git a/lib/summary_plot.py b/lib/summary_plot.py
@@ -1,12 +1,5 @@
 from imports import *
 from rescale_numeric_feature import *
-"""
-This class helps to plot summary plot
-
-Input: 
-
-
-"""
 
 
 class summary_plot():
@@ -17,15 +10,11 @@ def __init__(self):
 
 
     def find(self,  df):
-
         column = get_cols()
         self.original_columns = column.get_all_cols(df)
-
         re= rescale_numeric_features()
         df_with_rescaled_features= re.rescale(df)
-
-        final_dataframe= self.rearrange_dataframe(df_with_rescaled_features )
-
+        final_dataframe= self.rearrange_dataframe( df_with_rescaled_features )
         return final_dataframe
 
     def rearrange_dataframe(self, df_re ):

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| `@@ -1 +1,3 @@` | | |
| `1` | `1` | `from explainx.explain import *` |
| | `2` | `+` |
| | `3` | `+from explainx.main import *` |