From 163e5f7d03b3f415479031d72c4cf6f227463f02 Mon Sep 17 00:00:00 2001 From: Shankar Pandala Date: Sat, 2 Nov 2024 18:36:06 +0000 Subject: [PATCH 01/12] update CI workflow: add push trigger for dev branch and adjust Sphinx requirements path --- .github/workflows/docs.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 300d8a5..df89052 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -1,7 +1,9 @@ name: "Pull Request Docs Check" on: pull_request: - + push: + branches: + - dev jobs: docs: runs-on: ubuntu-latest @@ -17,7 +19,7 @@ jobs: run: | python -m pip install --upgrade pip pip install sphinx - pip install -r docs/requirements.txt # Ensure you have a requirements file for Sphinx + pip install -r requirements.txt # Ensure you have a requirements file for Sphinx - name: Build documentation run: | From 30a7cf73ec35c1697abc3c3a6a61662d0c6f93f5 Mon Sep 17 00:00:00 2001 From: Shankar Pandala Date: Sat, 2 Nov 2024 18:38:24 +0000 Subject: [PATCH 02/12] update CI workflow: change documentation deployment branch from gh-pages to dev --- .github/workflows/docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index df89052..3cdae32 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -32,7 +32,7 @@ jobs: - name: Commit documentation changes run: | - git clone https://github.com/shankarpandala/lazypredict.git --branch gh-pages --single-branch gh-pages + git clone https://github.com/shankarpandala/lazypredict.git --branch dev --single-branch dev cp -r docs/_build/html/* gh-pages/ cd gh-pages git config --local user.email "action@github.com" From cbf9ece5aaad3862f900e7c42f29003c8e1e1656 Mon Sep 17 00:00:00 2001 From: Shankar Pandala Date: Sat, 2 Nov 2024 18:43:26 +0000 Subject: [PATCH 03/12] added gh-pages directory --- gh-pages/README.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 gh-pages/README.md diff --git a/gh-pages/README.md b/gh-pages/README.md new file mode 100644 index 0000000..e69de29 From 1022c6e2601a748cc419498645b4e35af0205528 Mon Sep 17 00:00:00 2001 From: Shankar Pandala Date: Sat, 2 Nov 2024 18:45:48 +0000 Subject: [PATCH 04/12] update CI workflow: change documentation deployment branch from dev to gh-pages --- .github/workflows/docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 3cdae32..df89052 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -32,7 +32,7 @@ jobs: - name: Commit documentation changes run: | - git clone https://github.com/shankarpandala/lazypredict.git --branch dev --single-branch dev + git clone https://github.com/shankarpandala/lazypredict.git --branch gh-pages --single-branch gh-pages cp -r docs/_build/html/* gh-pages/ cd gh-pages git config --local user.email "action@github.com" From 785d6beebf24fc093f77d0aebd2849a233e48ff8 Mon Sep 17 00:00:00 2001 From: Shankar Pandala Date: Sat, 2 Nov 2024 18:47:33 +0000 Subject: [PATCH 05/12] removed gh-pages --- gh-pages/README.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 gh-pages/README.md diff --git a/gh-pages/README.md b/gh-pages/README.md deleted file mode 100644 index e69de29..0000000 From 7d19304503dce35c1cc8a309d291511f215212ad Mon Sep 17 00:00:00 2001 From: Mo Qo Date: Sat, 4 Jan 2025 12:18:43 -0500 Subject: [PATCH 06/12] Fix ROC-AUC for classifiers --- lazypredict/Supervised.py | 57 +++++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/lazypredict/Supervised.py b/lazypredict/Supervised.py index a79c60d..20d3897 100644 --- a/lazypredict/Supervised.py +++ b/lazypredict/Supervised.py @@ -5,7 +5,7 @@ import numpy as np import pandas as pd -from tqdm import tqdm +from tqdm.autonotebook import tqdm import datetime import time from sklearn.pipeline import Pipeline @@ -18,6 +18,7 @@ from sklearn.metrics import ( accuracy_score, balanced_accuracy_score, + euclidean_distances, roc_auc_score, f1_score, r2_score, @@ -289,30 +290,64 @@ def fit(self, X_train, X_test, y_train, y_test): start = time.time() try: if "random_state" in model().get_params().keys(): - pipe = Pipeline( - steps=[ - ("preprocessor", preprocessor), - ("classifier", model(random_state=self.random_state)), - ] - ) + if "probability" not in model().get_params().keys(): + pipe = Pipeline( + steps=[ + ("preprocessor", preprocessor), + ("classifier", model( + random_state=self.random_state)), + ] + ) + else: + pipe = Pipeline( + steps=[ + ("preprocessor", preprocessor), + ("classifier", model( + random_state=self.random_state, probability=True)), + ] + ) else: - pipe = Pipeline( - steps=[("preprocessor", preprocessor), ("classifier", model())] - ) + if "probability" not in model().get_params().keys(): + pipe = Pipeline( + steps=[("preprocessor", preprocessor), + ("classifier", model())] + ) + else: + pipe = Pipeline( + steps=[("preprocessor", preprocessor), + ("classifier", model(probability=True))] + ) pipe.fit(X_train, y_train) self.models[name] = pipe y_pred = pipe.predict(X_test) + + try: + y_score = pipe.predict_proba(X_test)[:, 1] + except: + try: + y_score = pipe.decision_function(X_test) + except: + # Predict centroids and distances + centroids = pipe.named_steps['classifier'].centroids_ + distances = euclidean_distances(X_test, centroids) + + # Use negative distances to the positive class centroid as the score + # (Smaller distance => Higher score for positive class) + # Assuming binary classification with class labels 0 and 1 + y_score = -distances[:, 1] + accuracy = accuracy_score(y_test, y_pred, normalize=True) b_accuracy = balanced_accuracy_score(y_test, y_pred) f1 = f1_score(y_test, y_pred, average="weighted") try: - roc_auc = roc_auc_score(y_test, y_pred) + roc_auc = roc_auc_score(y_test, y_score) except Exception as exception: roc_auc = None if self.ignore_warnings is False: print("ROC AUC couldn't be calculated for " + name) print(exception) + names.append(name) Accuracy.append(accuracy) B_Accuracy.append(b_accuracy) From 50678c6de2a3777669fa247a9a6e265d0286f933 Mon Sep 17 00:00:00 2001 From: Mo Qo Date: Sun, 2 Feb 2025 19:35:40 -0500 Subject: [PATCH 07/12] Add weighted precision and recall to the table --- .gitignore | 2 ++ lazypredict/Supervised.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/.gitignore b/.gitignore index aebb864..f4bdec0 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ __pycache__/ *.py[cod] *$py.class +tests/mq/ + # C extensions *.so diff --git a/lazypredict/Supervised.py b/lazypredict/Supervised.py index 20d3897..9639dd4 100644 --- a/lazypredict/Supervised.py +++ b/lazypredict/Supervised.py @@ -19,6 +19,8 @@ accuracy_score, balanced_accuracy_score, euclidean_distances, + precision_score, + recall_score, roc_auc_score, f1_score, r2_score, @@ -247,6 +249,8 @@ def fit(self, X_train, X_test, y_train, y_test): B_Accuracy = [] ROC_AUC = [] F1 = [] + PRECISION = [] + RECALL = [] names = [] TIME = [] predictions = {} @@ -340,6 +344,8 @@ def fit(self, X_train, X_test, y_train, y_test): accuracy = accuracy_score(y_test, y_pred, normalize=True) b_accuracy = balanced_accuracy_score(y_test, y_pred) f1 = f1_score(y_test, y_pred, average="weighted") + precision = precision_score(y_test, y_pred, average="weighted") + recall = recall_score(y_test, y_pred, average="weighted") try: roc_auc = roc_auc_score(y_test, y_score) except Exception as exception: @@ -353,6 +359,8 @@ def fit(self, X_train, X_test, y_train, y_test): B_Accuracy.append(b_accuracy) ROC_AUC.append(roc_auc) F1.append(f1) + PRECISION.append(precision) + RECALL.append(recall) TIME.append(time.time() - start) if self.custom_metric is not None: custom_metric = self.custom_metric(y_test, y_pred) @@ -366,6 +374,8 @@ def fit(self, X_train, X_test, y_train, y_test): "Balanced Accuracy": b_accuracy, "ROC AUC": roc_auc, "F1 Score": f1, + "Precision": precision, + "Recall": recall, self.custom_metric.__name__: custom_metric, "Time taken": time.time() - start, } @@ -378,6 +388,8 @@ def fit(self, X_train, X_test, y_train, y_test): "Balanced Accuracy": b_accuracy, "ROC AUC": roc_auc, "F1 Score": f1, + "Precision": precision, + "Recall": recall, "Time taken": time.time() - start, } ) @@ -395,6 +407,8 @@ def fit(self, X_train, X_test, y_train, y_test): "Balanced Accuracy": B_Accuracy, "ROC AUC": ROC_AUC, "F1 Score": F1, + "Precision": PRECISION, + "Recall": RECALL, "Time Taken": TIME, } ) @@ -406,6 +420,8 @@ def fit(self, X_train, X_test, y_train, y_test): "Balanced Accuracy": B_Accuracy, "ROC AUC": ROC_AUC, "F1 Score": F1, + "Precision": PRECISION, + "Recall": RECALL, self.custom_metric.__name__: CUSTOM_METRIC, "Time Taken": TIME, } From 067604c34257603ca57772f34324684f2923b3bb Mon Sep 17 00:00:00 2001 From: Mohammad Qodrati <9105400+qomhmd@users.noreply.github.com> Date: Tue, 11 Mar 2025 17:37:52 -0400 Subject: [PATCH 08/12] Sort by ROC-AUC --- lazypredict/Supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lazypredict/Supervised.py b/lazypredict/Supervised.py index 9639dd4..d0999fd 100644 --- a/lazypredict/Supervised.py +++ b/lazypredict/Supervised.py @@ -426,7 +426,7 @@ def fit(self, X_train, X_test, y_train, y_test): "Time Taken": TIME, } ) - scores = scores.sort_values(by="Balanced Accuracy", ascending=False).set_index( + scores = scores.sort_values(by="ROC AUC", ascending=False).set_index( "Model" ) From c1427a06096607aa36da1d6532df4164bd3b77ef Mon Sep 17 00:00:00 2001 From: Mo Qo Date: Mon, 24 Mar 2025 00:29:31 -0400 Subject: [PATCH 09/12] Make ColumnTransformer optional --- lazypredict/Supervised.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/lazypredict/Supervised.py b/lazypredict/Supervised.py index d0999fd..b68d4d1 100644 --- a/lazypredict/Supervised.py +++ b/lazypredict/Supervised.py @@ -213,6 +213,7 @@ def __init__( predictions=False, random_state=42, classifiers="all", + transformers=True, ): self.verbose = verbose self.ignore_warnings = ignore_warnings @@ -221,6 +222,7 @@ def __init__( self.models = {} self.random_state = random_state self.classifiers = classifiers + self.transformers = transformers def fit(self, X_train, X_test, y_train, y_test): """Fit Classification algorithms to X_train and y_train, predict and score on X_test, y_test. @@ -262,20 +264,29 @@ def fit(self, X_train, X_test, y_train, y_test): X_train = pd.DataFrame(X_train) X_test = pd.DataFrame(X_test) - numeric_features = X_train.select_dtypes(include=[np.number]).columns - categorical_features = X_train.select_dtypes(include=["object"]).columns + if self.transformers is True: + numeric_features = X_train.select_dtypes(include=[np.number]).columns + categorical_features = X_train.select_dtypes(include=["object"]).columns - categorical_low, categorical_high = get_card_split( - X_train, categorical_features - ) + categorical_low, categorical_high = get_card_split( + X_train, categorical_features + ) + + preprocessor = ColumnTransformer( + transformers=[ + ("numeric", numeric_transformer, numeric_features), + ("categorical_low", categorical_transformer_low, categorical_low), + ("categorical_high", categorical_transformer_high, categorical_high), + ] + ) + elif self.transformers is False or self.transformers is None: + preprocessor = ColumnTransformer( + transformers=[], + remainder="passthrough" + ) + elif isinstance(self.transformers, ColumnTransformer): + preprocessor = self.transformers - preprocessor = ColumnTransformer( - transformers=[ - ("numeric", numeric_transformer, numeric_features), - ("categorical_low", categorical_transformer_low, categorical_low), - ("categorical_high", categorical_transformer_high, categorical_high), - ] - ) if self.classifiers == "all": self.classifiers = CLASSIFIERS @@ -432,7 +443,7 @@ def fit(self, X_train, X_test, y_train, y_test): if self.predictions: predictions_df = pd.DataFrame.from_dict(predictions) - return scores, predictions_df if self.predictions is True else scores + return scores, predictions_df if self.predictions is True else None def provide_models(self, X_train, X_test, y_train, y_test): """ From 2385fccef9a6df0fdf26d967cef407bb23b6bdb2 Mon Sep 17 00:00:00 2001 From: Mohammad Qodrati <9105400+qomhmd@users.noreply.github.com> Date: Tue, 27 May 2025 17:15:34 +0330 Subject: [PATCH 10/12] Add AUPRC --- lazypredict/Supervised.py | 113 ++++++++++++++------------------------ 1 file changed, 41 insertions(+), 72 deletions(-) diff --git a/lazypredict/Supervised.py b/lazypredict/Supervised.py index b68d4d1..321defe 100644 --- a/lazypredict/Supervised.py +++ b/lazypredict/Supervised.py @@ -25,6 +25,7 @@ f1_score, r2_score, mean_squared_error, + average_precision_score, ) import warnings import xgboost @@ -225,31 +226,10 @@ def __init__( self.transformers = transformers def fit(self, X_train, X_test, y_train, y_test): - """Fit Classification algorithms to X_train and y_train, predict and score on X_test, y_test. - Parameters - ---------- - X_train : array-like, - Training vectors, where rows is the number of samples - and columns is the number of features. - X_test : array-like, - Testing vectors, where rows is the number of samples - and columns is the number of features. - y_train : array-like, - Training vectors, where rows is the number of samples - and columns is the number of features. - y_test : array-like, - Testing vectors, where rows is the number of samples - and columns is the number of features. - Returns - ------- - scores : Pandas DataFrame - Returns metrics of all the models in a Pandas DataFrame. - predictions : Pandas DataFrame - Returns predictions of all the models in a Pandas DataFrame. - """ Accuracy = [] B_Accuracy = [] ROC_AUC = [] + PR_SCORE = [] F1 = [] PRECISION = [] RECALL = [] @@ -280,14 +260,10 @@ def fit(self, X_train, X_test, y_train, y_test): ] ) elif self.transformers is False or self.transformers is None: - preprocessor = ColumnTransformer( - transformers=[], - remainder="passthrough" - ) + preprocessor = ColumnTransformer(transformers=[], remainder="passthrough") elif isinstance(self.transformers, ColumnTransformer): preprocessor = self.transformers - if self.classifiers == "all": self.classifiers = CLASSIFIERS else: @@ -309,47 +285,38 @@ def fit(self, X_train, X_test, y_train, y_test): pipe = Pipeline( steps=[ ("preprocessor", preprocessor), - ("classifier", model( - random_state=self.random_state)), + ("classifier", model(random_state=self.random_state)), ] ) else: pipe = Pipeline( steps=[ ("preprocessor", preprocessor), - ("classifier", model( - random_state=self.random_state, probability=True)), + ("classifier", model(random_state=self.random_state, probability=True)), ] ) else: if "probability" not in model().get_params().keys(): pipe = Pipeline( - steps=[("preprocessor", preprocessor), - ("classifier", model())] + steps=[("preprocessor", preprocessor), ("classifier", model())] ) else: pipe = Pipeline( - steps=[("preprocessor", preprocessor), - ("classifier", model(probability=True))] + steps=[("preprocessor", preprocessor), ("classifier", model(probability=True))] ) pipe.fit(X_train, y_train) self.models[name] = pipe y_pred = pipe.predict(X_test) - + try: y_score = pipe.predict_proba(X_test)[:, 1] except: try: y_score = pipe.decision_function(X_test) except: - # Predict centroids and distances centroids = pipe.named_steps['classifier'].centroids_ distances = euclidean_distances(X_test, centroids) - - # Use negative distances to the positive class centroid as the score - # (Smaller distance => Higher score for positive class) - # Assuming binary classification with class labels 0 and 1 y_score = -distances[:, 1] accuracy = accuracy_score(y_test, y_pred, normalize=True) @@ -364,52 +331,52 @@ def fit(self, X_train, X_test, y_train, y_test): if self.ignore_warnings is False: print("ROC AUC couldn't be calculated for " + name) print(exception) - + + try: + pr_score = average_precision_score(y_test, y_score) + except Exception as exception: + pr_score = None + if self.ignore_warnings is False: + print("Precision-Recall AUC couldn't be calculated for " + name) + print(exception) + names.append(name) Accuracy.append(accuracy) B_Accuracy.append(b_accuracy) ROC_AUC.append(roc_auc) + PR_SCORE.append(pr_score) F1.append(f1) PRECISION.append(precision) RECALL.append(recall) TIME.append(time.time() - start) + if self.custom_metric is not None: custom_metric = self.custom_metric(y_test, y_pred) CUSTOM_METRIC.append(custom_metric) + if self.verbose > 0: + output = { + "Model": name, + "Accuracy": accuracy, + "Balanced Accuracy": b_accuracy, + "ROC AUC": roc_auc, + "Precision-Recall AUC": pr_score, + "F1 Score": f1, + "Precision": precision, + "Recall": recall, + "Time taken": time.time() - start, + } if self.custom_metric is not None: - print( - { - "Model": name, - "Accuracy": accuracy, - "Balanced Accuracy": b_accuracy, - "ROC AUC": roc_auc, - "F1 Score": f1, - "Precision": precision, - "Recall": recall, - self.custom_metric.__name__: custom_metric, - "Time taken": time.time() - start, - } - ) - else: - print( - { - "Model": name, - "Accuracy": accuracy, - "Balanced Accuracy": b_accuracy, - "ROC AUC": roc_auc, - "F1 Score": f1, - "Precision": precision, - "Recall": recall, - "Time taken": time.time() - start, - } - ) + output[self.custom_metric.__name__] = custom_metric + print(output) + if self.predictions: predictions[name] = y_pred except Exception as exception: if self.ignore_warnings is False: print(name + " model failed to execute") print(exception) + if self.custom_metric is None: scores = pd.DataFrame( { @@ -417,6 +384,7 @@ def fit(self, X_train, X_test, y_train, y_test): "Accuracy": Accuracy, "Balanced Accuracy": B_Accuracy, "ROC AUC": ROC_AUC, + "Precision-Recall AUC": PR_SCORE, "F1 Score": F1, "Precision": PRECISION, "Recall": RECALL, @@ -430,6 +398,7 @@ def fit(self, X_train, X_test, y_train, y_test): "Accuracy": Accuracy, "Balanced Accuracy": B_Accuracy, "ROC AUC": ROC_AUC, + "Precision-Recall AUC": PR_SCORE, "F1 Score": F1, "Precision": PRECISION, "Recall": RECALL, @@ -437,14 +406,14 @@ def fit(self, X_train, X_test, y_train, y_test): "Time Taken": TIME, } ) - scores = scores.sort_values(by="ROC AUC", ascending=False).set_index( - "Model" - ) + + scores = scores.sort_values(by="ROC AUC", ascending=False).set_index("Model") if self.predictions: predictions_df = pd.DataFrame.from_dict(predictions) - return scores, predictions_df if self.predictions is True else None + return scores, predictions_df + return scores, None def provide_models(self, X_train, X_test, y_train, y_test): """ This function returns all the model objects trained in fit function. From 3d5368769fe7000190c05af7ec3b4cb786218c05 Mon Sep 17 00:00:00 2001 From: Mohammad Qodrati <9105400+qomhmd@users.noreply.github.com> Date: Tue, 27 May 2025 17:24:24 +0330 Subject: [PATCH 11/12] Update __init__.py version to 0.2.13 --- lazypredict/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lazypredict/__init__.py b/lazypredict/__init__.py index ce94e0a..28f5277 100644 --- a/lazypredict/__init__.py +++ b/lazypredict/__init__.py @@ -4,4 +4,4 @@ __author__ = """Shankar Rao Pandala""" __email__ = "shankar.pandala@live.com" -__version__ = '0.2.12' +__version__ = '0.2.13' From 09558cca2d8c3469b7c6950a0e59e28df13c38bc Mon Sep 17 00:00:00 2001 From: Mohammad Qodrati <9105400+qomhmd@users.noreply.github.com> Date: Tue, 27 May 2025 17:24:52 +0330 Subject: [PATCH 12/12] Update setup.py version 0.2.13 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 77a64f3..ced0c2c 100644 --- a/setup.py +++ b/setup.py @@ -51,6 +51,6 @@ test_suite="tests", tests_require=test_requirements, url="https://github.com/shankarpandala/lazypredict", - version='0.2.12', + version='0.2.13', zip_safe=False, )