Merge pull request #189 from Thilakraj1998/main

Thilakraj1998 · web-flow · commit b157042b8476 · 2021-11-15T16:15:46.000+05:30
Major changes (Save & load)
diff --git a/blobcity/code_gen/PyMeta.py b/blobcity/code_gen/PyMeta.py
@@ -82,5 +82,5 @@ class PyComments:
         'datasplit':"\n### Train & Test\n# The train-test split is a procedure for evaluating the performance of an algorithm.\n# The procedure involves taking a dataset and dividing it into two subsets.\n# The first subset is utilized to fit/train the model.\n# The second subset is used for prediction.\n# The main motive is to estimate the performance of the model on new data.\n",
         'metrics':"\n### Accuracy Metrics\n# Performance metrics are a part of every machine learning pipeline. \n# They tell you if you're making progress, and put a number on it. All machine learning models,\n# whether it's linear regression, or a SOTA technique like BERT, need a metric to judge performance.\n",
         'x&y':"\n### Feature Selection\n# It is the process of reducing the number of input variables when developing a predictive model.\n# Used to reduce the number of input variables to reduce the computational cost of modelling and,\n# in some cases,to improve the performance of the model.\n",
-        'cor_matrix': "### Correlation Matrix\n# In order to check the correlation between the features, we will plot a correlation matrix.\n# It is effective in summarizing a large amount of data where the goal is to see patterns."
+        'cor_matrix': "### Correlation Matrix\n# In order to check the correlation between the features, we will plot a correlation matrix.\n# It is effective in summarizing a large amount of data where the goal is to see patterns.\n"
     }
diff --git a/blobcity/config/tuner.py b/blobcity/config/tuner.py
@@ -95,8 +95,8 @@ def classification_metrics(y_true,y_pred):
     """
     result=dict()
     result['F1-Score']=f1_score(y_true, y_pred, average="weighted")
-    result['precision']=precision_score(y_true, y_pred,average="weighted")
-    result['recall']=recall_score(y_true, y_pred,average="weighted")
+    result['Precision']=precision_score(y_true, y_pred,average="weighted")
+    result['Recall']=recall_score(y_true, y_pred,average="weighted")
     return result
 
 def metricResults(y_true,y_pred,ptype):
diff --git a/blobcity/main/driver.py b/blobcity/main/driver.py
@@ -12,18 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
-import pickle
+import os
+import dill
 import numpy as np
 import pandas as pd
+import autokeras as ak
+import tensorflow as tf
 from blobcity.store import DictClass
 from blobcity.utils import get_dataframe_type,dataCleaner
 from blobcity.utils import AutoFeatureSelection as AFS
 from blobcity.main.modelSelection import model_search
 from blobcity.code_gen import yml_reader,code_generator
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.feature_selection import SelectKBest,f_regression,f_classif
-def train(file=None, df=None, target=None,features=None,accuracy_criteria=0.99):
+def train(file=None, df=None, target=None,features=None,use_neural=False,accuracy_criteria=0.99):
     """
     param1: string: dataset file path 
 
@@ -56,35 +58,41 @@ def train(file=None, df=None, target=None,features=None,accuracy_criteria=0.99):
         CleanedDF=dataCleaner(dataframe,features,target,dict_class)
     #model search space
     accuracy_criteria= accuracy_criteria if accuracy_criteria<=1.0 else (accuracy_criteria/100)
-    modelClass = model_search(CleanedDF,target,dict_class,use_neural=False,accuracy_criteria=accuracy_criteria)
+    modelClass = model_search(CleanedDF,target,dict_class,use_neural=use_neural,accuracy_criteria=accuracy_criteria)
     modelClass.yamldata=dict_class.getdict()
     modelClass.feature_importance_=dict_class.feature_importance if(features==None) else calculate_feature_importance(CleanedDF.drop(target,axis=1),CleanedDF[target],dict_class)
     dict_class.resetVar()
     return modelClass
 
-def load(modelFile,h5_path=None):
+def load(model_path=None):
         """
         param1: string: (required) the filepath to the stored model. Supports .pkl models.
-        param2: string: the filepath to the stored h5 file, provide only if saved h5 file.
         returns: Model file
 
-        function loads the serialized model from .pkl or .h5 format to usable format.
+        function loads the serialized model from .pkl format to usable format.
         """
-        path_components = modelFile.split('.')
-        extension = path_components[1] if len(path_components)<=2 else path_components[-1]
-         
-        if extension == 'pkl' and h5_path in [None,""]:
-            model = pickle.load(open(modelFile, 'rb'))
-
-        """ elif os.path.splitext(h5_path)[1] == '.h5' and h5_path!=None:
-            print("pkl path: {}, h5 path : {}".format(os.path.splitext(modelFile),os.path.splitext(h5_path)))
-            if os.path.splitext(h5_path)[0] == os.path.splitext(modelFile)[0]:
-                tfmodel = tf.keras.models.load_model(h5_path)
-                model=pickle.load(open(modelFile, 'rb'))
-                model.model=tfmodel
+        if model_path not in [None,""]:
+            path_components = model_path.split('.')
+            extension = path_components[1] if len(path_components)<=2 else path_components[-1]
+            base_path=os.path.splitext(model_path)[0]
+            if extension == 'pkl':
+                model = dill.load(open(model_path, 'rb'))  
+                if model.yamldata['model']['type'] in ['TF','tf','Tensorflow']:
+                    if model.yamldata['model']['save_type']=='h5':
+                        h5_path=base_path+".h5"
+                        if os.path.isfile(h5_path):model.model=tf.keras.models.load_model(h5_path)
+                        else: raise FileNotFoundError(f"{h5_path} file doest exists in the directory")
+                    elif model.yamldata['model']['save_type']=='pb':
+                        if os.path.isdir(base_path):model.model=tf.keras.models.load_model(base_path, custom_objects=ak.CUSTOM_OBJECTS)
+                        else: raise FileNotFoundError(f"{base_path} Folder doest exists")
+                    else:
+                        raise TypeError(f"{model.yamldata['model']['save_type']}, not supported save format")
+                return model
             else:
-                raise ValueError("file name for pickle and h5 file should be same") """
-        return model
+                raise TypeError(f"{extension}, file type must be .pkl")
+        else:
+            raise TypeError(f"{model_path}, path can't be None or Null")
+        
 
 def spill(filepath,yaml_path=None,doc=None):
     """
diff --git a/blobcity/store/Model.py b/blobcity/store/Model.py
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 import os
-import pickle
-import time
+import dill
 import numpy as np
 import pandas as pd
 import seaborn as sns
@@ -141,30 +140,22 @@ def save(self, model_path=None):
         if model_path not in [None,""]:
             path_components = model_path.split('.')
             extension = path_components[1] if len(path_components)<=2 else path_components[-1]
-
-            if extension == '/':
-                final_path = os.path.join(model_path, 'autoaimodel.pkl')
-                pickle.dump(self, open(final_path, 'wb'))
-                print("The model is stored at {}".format(final_path))
-            elif extension == 'pkl':
+            if extension == 'pkl' and self.yamldata['model']['type'] not in ['TF','tf','Tensorflow']:
                 final_path = model_path
-                pickle.dump(self, open(final_path, 'wb'))
+                dill.dump(self, open(final_path, 'wb'))
                 print("The model is stored at {}".format(final_path))
-             
-                """ 
-            elif extension == 'h5' or self.yamldata['model']['type'] in ['TF','tf']:
-                model_path = model_path if model_path!="./" else os.path.join(model_path, 'autoaimodel.h5')
-                class_path = model_path if model_path!="./" else os.path.join(model_path, 'autoaimodel.pkl')
-                try:
-                    tfmodel_temp=self.model
-                    self.model.save(model_path)
-                    self.model=None
-                    pickle.dump(self, open(class_path, 'wb'))
-                    self.model=tfmodel_temp
-                    print("The model is stored at {}".format(model_path))
-                    return model_path
-                except:
-                    raise TypeError("Your model is not a Keras model of type .h5. Try .pkl extension.") """  
+            elif extension=='pkl' and self.yamldata['model']['type'] in ['TF','tf','Tensorflow']:
+                base_path=os.path.splitext(model_path)[0]
+                tmp=self.model
+                if self.yamldata['problem']['type']=="Classification":
+                    tmp.export_model().save(base_path+".h5")
+                elif self.yamldata['problem']['type']=="Regression":
+                    tmp.export_model().save(base_path, save_format="tf")
+                else:
+                    raise TypeError("Wrong problem type identified")
+                self.model=None
+                dill.dump(self, open(model_path, 'wb'))
+                self.model=tmp
             else:
                 raise TypeError(f"{extension} file type must be .pkl")
         else:
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
+dill>=0.3.4
 cliff>=3.6.0
 joblib>=1.0.0
 numpy>=1.21.0
diff --git a/setup.cfg b/setup.cfg
@@ -19,6 +19,7 @@ classifiers =
 packages = find:
 python_requires = >=3.6
 install_requires=
+    dill>=0.3.4
     cliff>=3.6.0
     joblib>=1.0.0
     numpy>=1.21.0

Original file line number	Diff line number	Diff line change
`@@ -82,5 +82,5 @@ class PyComments:`
`82`	`82`	`'datasplit':"\n### Train & Test\n# The train-test split is a procedure for evaluating the performance of an algorithm.\n# The procedure involves taking a dataset and dividing it into two subsets.\n# The first subset is utilized to fit/train the model.\n# The second subset is used for prediction.\n# The main motive is to estimate the performance of the model on new data.\n",`
`83`	`83`	`'metrics':"\n### Accuracy Metrics\n# Performance metrics are a part of every machine learning pipeline. \n# They tell you if you're making progress, and put a number on it. All machine learning models,\n# whether it's linear regression, or a SOTA technique like BERT, need a metric to judge performance.\n",`
`84`	`84`	`'x&y':"\n### Feature Selection\n# It is the process of reducing the number of input variables when developing a predictive model.\n# Used to reduce the number of input variables to reduce the computational cost of modelling and,\n# in some cases,to improve the performance of the model.\n",`
`85`		`- 'cor_matrix': "### Correlation Matrix\n# In order to check the correlation between the features, we will plot a correlation matrix.\n# It is effective in summarizing a large amount of data where the goal is to see patterns."`
	`85`	`+ 'cor_matrix': "### Correlation Matrix\n# In order to check the correlation between the features, we will plot a correlation matrix.\n# It is effective in summarizing a large amount of data where the goal is to see patterns.\n"`
`86`	`86`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+dill>=0.3.4`
`1`	`2`	`cliff>=3.6.0`
`2`	`3`	`joblib>=1.0.0`
`3`	`4`	`numpy>=1.21.0`