Commit ebaa2ce

Release 1.0.2 (#228)

* fix typo
* adds missing param to docstrings
* fix typos in notebook
* fix bug in select by target proxy
* fix bug target mean selection
* adds kaggle kernels to docs
* update version and changelog
* fixes docs bugs

1 parent 47a8b7a commit ebaa2ce

File tree

10 files changed
+138 -89 lines changed

docs/blogs.rst

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ Blogs
 - `Feature-engine: A new open-source Python package for feature engineering <https://www.trainindatablog.com/feature-engine-a-new-open-source-python-package-for-feature-engineering/>`_.
 - `Practical Code Implementations of Feature Engineering for Machine Learning with Python <https://www.trainindatablog.com/practical-code-implementations-of-feature-engineering-for-machine-learning-with-python/>`_.
 - `Streamlining Feature Engineering Pipelines with Feature-engine <https://towardsdatascience.com/streamlining-feature-engineering-pipelines-with-feature-engine-e781d551f470?gi=e0fa6e5c0c1a/>`_.
-- `Feature Engineering for Machine Learning: A comprehensive Overvoew <https://www.trainindatablog.com/feature-engineering-for-machine-learning-comprehensive-overview/>`_.
+- `Feature Engineering for Machine Learning: A comprehensive Overview <https://www.trainindatablog.com/feature-engineering-for-machine-learning-comprehensive-overview/>`_.
 - `Feature Selection for Machine Learning: A comprehensive Overview <https://www.trainindatablog.com/feature-selection-for-machine-learning-comprehensive-overview/>`_.
 
 

docs/tutorials.rst

Lines changed: 8 additions & 0 deletions
@@ -7,6 +7,14 @@ Code tutorials
 Coming Soon!
 
 
+Kaggle Kernels
+--------------
+
+- `Feature selection for bank customer satisfaction prediction <https://www.kaggle.com/solegalli/feature-selection-with-feature-engine>`_
+- `Feature engineering and selection for house price prediction <https://www.kaggle.com/solegalli/predict-house-price-with-feature-engine>`_
+- `Feature creation for wine quality prediction <https://www.kaggle.com/solegalli/create-new-features-with-feature-engine>`_
+
+
 Video tutorials
 ---------------
 

docs/whats_new/v1.rst

Lines changed: 29 additions & 0 deletions
@@ -1,3 +1,32 @@
+Version 1.0.2
+=============
+
+Deployed: 22nd January 2021
+
+Contributors
+------------
+- Nicolas Galli
+- Pradumna Suryawanshi
+- Elamraoui Sohayb
+- Soledad Galli
+
+New transformers
+----------------
+- **CombineWithReferenceFeature**: applies mathematical operations between a group of variables and reference variables (**by Nicolas Galli**)
+- **DropMissingData**: removes missing observations from a dataset (**by Pradumna Suryawanshi**)
+
+Bug fixes
+---------
+- Fix bugs in SelectByTargetMeanPerformance.
+- Fix documentation and jupyter notebook typos.
+
+Tutorials
+---------
+
+- **Creation**: updated "how to" examples on how to combine variables into new features (**by Elamraoui Sohayb and Nicolas Galli**)
+- **Kaggle Kernels**: include links to Kaggle kernels
+
+
 Version 1.0.1
 =============
 
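As a quick illustration of the new DropMissingData transformer listed in the changelog above, here is a minimal sketch; the import path is an assumption based on the 1.0 module layout, and the data is invented.

```python
# Hedged sketch of DropMissingData (added in 1.0.2); import path assumed.
import numpy as np
import pandas as pd

from feature_engine.imputation import DropMissingData  # assumed module

df = pd.DataFrame({
    "age": [25.0, np.nan, 40.0, 31.0],
    "income": [2800.0, 3500.0, np.nan, 4200.0],
})

# with default arguments the transformer drops every row that contains
# at least one missing value in the variables it inspects
dropper = DropMissingData()
print(dropper.fit_transform(df))  # only the fully observed rows remain
```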

examples/creation/CombineWithReferenceFeature.ipynb

Lines changed: 8 additions & 8 deletions
@@ -824,7 +824,7 @@
 "value_pipe = pipe([\n",
 " \n",
 " # Create new features\n",
-" ('substraction', CombineWithReferenceFeature(\n",
+" ('subtraction', CombineWithReferenceFeature(\n",
 " variables_to_combine=['total sulfur dioxide'],\n",
 " reference_variables=['free sulfur dioxide'],\n",
 " operations=['sub'],\n",
@@ -864,7 +864,7 @@
 {
 "data": {
 "text/plain": [
-"Pipeline(steps=[('substraction',\n",
+"Pipeline(steps=[('subtraction',\n",
 " CombineWithReferenceFeature(new_variables_names=['non_free_sulfur_dioxide'],\n",
 " reference_variables=['free sulfur '\n",
 " 'dioxide'],\n",
@@ -920,19 +920,19 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"LogisticRegression Model train accuracy score: 0.7477414871438499\n",
+"Logistic Regression Model train accuracy score: 0.7477414871438499\n",
 "\n",
-"LogisticRegression Model train accuracy score: 0.75\n"
+"Logistic Regression Model test accuracy score: 0.75\n"
 ]
 }
 ],
 "source": [
-"print('LogisticRegression Model train accuracy score: {}'.format(\n",
+"print('Logistic Regression Model train accuracy score: {}'.format(\n",
 " accuracy_score(y_train, pred_train)))\n",
 "\n",
 "print()\n",
 "\n",
-"print('LogisticRegression Model train accuracy score: {}'.format(\n",
+"print('Logistic Regression Model test accuracy score: {}'.format(\n",
 " accuracy_score(y_test, pred_test)))"
 ]
 },
@@ -945,7 +945,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"LogisticRegression Model test classification report: \n",
+"Logistic Regression Model test classification report: \n",
 "\n",
 " precision recall f1-score support\n",
 "\n",
@@ -960,7 +960,7 @@
 }
 ],
 "source": [
-"print('LogisticRegression Model test classification report: \\n\\n {}'.format(\n",
+"print('Logistic Regression Model test classification report: \\n\\n {}'.format(\n",
 " classification_report(y_test, pred_test)))"
 ]
 },
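For reference, the renamed 'subtraction' step can be reproduced outside the notebook with a short, self-contained sketch. Only the parameter names that appear in the diff above are used; the toy wine-style data is invented.

```python
# Self-contained sketch of the 'subtraction' pipeline step; data made up.
import pandas as pd
from sklearn.pipeline import Pipeline

from feature_engine.creation import CombineWithReferenceFeature

data = pd.DataFrame({
    "total sulfur dioxide": [34.0, 67.0, 54.0, 41.0],
    "free sulfur dioxide": [11.0, 25.0, 15.0, 9.0],
})

value_pipe = Pipeline([
    ("subtraction", CombineWithReferenceFeature(
        variables_to_combine=["total sulfur dioxide"],
        reference_variables=["free sulfur dioxide"],
        operations=["sub"],
        new_variables_names=["non_free_sulfur_dioxide"],
    )),
])

# the new column holds total minus free sulfur dioxide
print(value_pipe.fit_transform(data))
```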

examples/creation/MathematicalCombination.ipynb

Lines changed: 16 additions & 10 deletions
@@ -578,7 +578,7 @@
 ")\n",
 "\n",
 "\n",
-"# Fit the Mean Combinator on training data\n",
+"# Fit the Combinator to the training data\n",
 "multiple_combinator.fit(data)\n",
 "\n",
 "# Transform the data\n",
@@ -793,11 +793,12 @@
 "metadata": {},
 "source": [
 "We can put all these transformations into a single pipeline:\n",
+"\n",
 "1. Create new variables\n",
 "2. Scale features\n",
-"3. Apply simple LogisticRegression classifier to predict the wine quality range\n",
+"3. Train a Logistic Regression model to predict wine quality\n",
 "\n",
-"See more on how to use Piplines in these **[examples](https://github.com/solegalli/feature_engine/tree/master/examples/Pipelines)**"
+"See more on how to use Feature-engine within Scikit-learn Pipelines in these **[examples](https://github.com/solegalli/feature_engine/tree/master/examples/Pipelines)**"
 ]
 },
 {
@@ -838,12 +839,12 @@
 "outputs": [],
 "source": [
 "value_pipe = pipe([\n",
-" # Create two new features using the min and max combinators\n",
+"\n",
+" # Create the new features\n",
 " ('math_combinator_mean', MathematicalCombination(variables_to_combine=['fixed acidity', 'volatile acidity'],\n",
 " math_operations=['mean'],\n",
 " new_variables_names=['avg_acidity'])),\n",
 "\n",
-" # Create three new features using the mean and sum combinators\n",
 " ('math_combinator_sum', MathematicalCombination(variables_to_combine=['total sulfur dioxide', 'sulphates'],\n",
 " math_operations=['sum'],\n",
 " new_variables_names=['total_minerals'])),\n",
@@ -910,15 +911,15 @@
 "text": [
 "LogisticRegression Model train accuracy score: 0.744266851980542\n",
 "\n",
-"LogisticRegression Model train accuracy score: 0.75\n"
+"LogisticRegression Model test accuracy score: 0.75\n"
 ]
 }
 ],
 "source": [
-"print('LogisticRegression Model train accuracy score: {}'.format(\n",
+"print('Logistic Regression Model train accuracy score: {}'.format(\n",
 " accuracy_score(y_train, pred_train)))\n",
 "print()\n",
-"print('LogisticRegression Model train accuracy score: {}'.format(\n",
+"print('Logistic Regression Model test accuracy score: {}'.format(\n",
 " accuracy_score(y_test, pred_test)))"
 ]
 },
@@ -946,7 +947,7 @@
 }
 ],
 "source": [
-"print('LogisticRegression Model test classification report: \\n\\n {}'.format(\n",
+"print('Logistic Regression Model test classification report: \\n\\n {}'.format(\n",
 " classification_report(y_test, pred_test)))"
 ]
 },
@@ -1042,7 +1043,12 @@
 "title_cell": "Table of Contents",
 "title_sidebar": "Contents",
 "toc_cell": false,
-"toc_position": {},
+"toc_position": {
+"height": "calc(100% - 180px)",
+"left": "10px",
+"top": "150px",
+"width": "197.6px"
+},
 "toc_section_display": true,
 "toc_window_display": true
 }
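The two MathematicalCombination steps edited above can likewise be sketched standalone. The parameters mirror the diff; the wine-style values are made up.

```python
# Standalone sketch of the two combination steps from the notebook pipeline.
import pandas as pd
from sklearn.pipeline import Pipeline

from feature_engine.creation import MathematicalCombination

data = pd.DataFrame({
    "fixed acidity": [7.4, 7.8, 6.9, 7.2],
    "volatile acidity": [0.70, 0.88, 0.50, 0.64],
    "total sulfur dioxide": [34.0, 67.0, 54.0, 41.0],
    "sulphates": [0.56, 0.68, 0.58, 0.61],
})

value_pipe = Pipeline([
    # average the two acidity columns into one new feature
    ("math_combinator_mean", MathematicalCombination(
        variables_to_combine=["fixed acidity", "volatile acidity"],
        math_operations=["mean"],
        new_variables_names=["avg_acidity"],
    )),
    # sum the two mineral-related columns into one new feature
    ("math_combinator_sum", MathematicalCombination(
        variables_to_combine=["total sulfur dioxide", "sulphates"],
        math_operations=["sum"],
        new_variables_names=["total_minerals"],
    )),
])

print(value_pipe.fit_transform(data))
```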

feature_engine/VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-1.0.1
+1.0.2

feature_engine/creation/combine_with_reference_feature.py

Lines changed: 5 additions & 5 deletions
@@ -104,11 +104,11 @@ class CombineWithReferenceFeature(BaseEstimator, TransformerMixin):
     Methods
     -------
 
-    fit:
+    fit :
         This transformer does not learn parameters.
-    transform:
+    transform :
         Combine the variables with the mathematical operations.
-    fit_transform:
+    fit_transform :
         Fit to the data, then transform it.
 
     Notes
@@ -219,8 +219,8 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
             If any of the reference variables contain null values and the
             mathematical operation is 'div'.
 
-        Returns:
-        --------
+        Returns
+        -------
         self
         """
 

feature_engine/selection/smart_correlation_selection.py

Lines changed: 16 additions & 0 deletions
@@ -58,6 +58,22 @@ class SmartCorrelatedSelection(BaseSelector):
         Takes values 'raise' and 'ignore'. Whether the missing values should be raised
         as error or ignored when determining correlation.
 
+    selection_method : str, default="missing_values"
+        Takes the values "missing_values", "cardinality", "variance" and
+        "model_performance".
+
+        "missing_values": keeps the feature from the correlated group with the least
+        missing observations.
+
+        "cardinality": keeps the feature from the correlated group with the highest
+        cardinality.
+
+        "variance": keeps the feature from the correlated group with the highest
+        variance.
+
+        "model_performance": trains a machine learning model using the correlated
+        feature group and retains the feature with the highest importance.
+
     estimator : object, default = None
         A Scikit-learn estimator for regression or classification.
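To see how the newly documented selection_method behaves, here is a hedged sketch with synthetic data; the import path is assumed from the file location, and the correlated columns are invented.

```python
# Sketch of the selection_method parameter documented in the diff above.
import numpy as np
import pandas as pd

from feature_engine.selection import SmartCorrelatedSelection  # assumed path

rng = np.random.default_rng(0)
base = rng.normal(size=200)
X = pd.DataFrame({
    "var_a": base,
    "var_b": 2 * base + rng.normal(scale=0.01, size=200),  # near-duplicate of var_a
    "var_c": rng.normal(size=200),                         # uncorrelated
})

# from each group of correlated features, keep the one with the highest variance
sel = SmartCorrelatedSelection(selection_method="variance")
X_t = sel.fit_transform(X)

print(X_t.columns.tolist())  # var_a is dropped; var_b carries more variance
```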

feature_engine/selection/target_mean_selection.py

Lines changed: 14 additions & 20 deletions
@@ -2,7 +2,7 @@
 
 import pandas as pd
 from sklearn.metrics import roc_auc_score, r2_score
-from sklearn.model_selection import StratifiedKFold
+from sklearn.model_selection import KFold
 from sklearn.pipeline import Pipeline
 
 from feature_engine.dataframe_checks import (
@@ -78,15 +78,15 @@ class SelectByTargetMeanPerformance(BaseSelector):
         This indicates the metrics score to perform the feature selection.
         The current implementation supports 'roc_auc_score' and 'r2_score'.
 
-    threshold : float, default = 0.5
+    threshold : float, default = None
         The performance threshold above which a feature will be selected.
 
     bins : int, default = 5
         If the dataset contains numerical variables, the number of bins into which
         the values will be sorted.
 
     strategy : str, default = equal_width
-        whether to create the bins for discretisation of numerical variables of
+        whether to create the bins for discretization of numerical variables of
         equal width or equal frequency.
 
     cv : int, default=3
@@ -132,21 +132,9 @@ def __init__(
                 "'scoring'"
             )
 
-        if not isinstance(threshold, (int, float)):
+        if threshold and not isinstance(threshold, (int, float)):
             raise ValueError("threshold can only take integer or float")
 
-        if scoring == "roc_auc_score" and (threshold < 0.5 or threshold > 1):
-            raise ValueError(
-                "roc-auc score should vary between 0.5 and 1. Pick a "
-                "threshold within this interval."
-            )
-
-        if scoring == "r2_score" and (threshold < 0 or threshold > 1):
-            raise ValueError(
-                "r2 score should vary between 0 and 1. Pick a "
-                "threshold within this interval."
-            )
-
         if not isinstance(bins, int):
             raise TypeError("'bins' takes only integers")
 
@@ -195,6 +183,8 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
         # check if df contains na
         _check_contains_na(X, self.variables)
 
+        self.input_shape_ = X.shape
+
         # limit df to variables to smooth code below
         X = X[self.variables].copy()
 
@@ -205,7 +195,7 @@
         )
 
         # obtain cross-validation indices
-        skf = StratifiedKFold(
+        skf = KFold(
             n_splits=self.cv, shuffle=True, random_state=self.random_state
         )
         skf.get_n_splits(X, y)
@@ -245,14 +235,18 @@
             axis=1
         ).to_dict()
 
+        # select features
+        if not self.threshold:
+            threshold = pd.Series(self.feature_performance_).mean()
+        else:
+            threshold = self.threshold
+
         self.features_to_drop_ = [
             f
             for f in self.variables
-            if self.feature_performance_[f] < self.threshold
+            if self.feature_performance_[f] < threshold
         ]
 
-        self.input_shape_ = X.shape
-
         return self
 
     def _make_numerical_pipeline(self):
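The net effect of these changes: threshold is now optional, and when left as None the transformer drops features scoring below the average cross-validated performance across all features; KFold replaces StratifiedKFold, presumably so that continuous targets scored with r2_score also work. A hedged sketch of the new default behaviour, with invented data:

```python
# Sketch of the new threshold=None default in SelectByTargetMeanPerformance:
# features scoring below the mean feature performance are dropped.
import numpy as np
import pandas as pd

from feature_engine.selection import SelectByTargetMeanPerformance

rng = np.random.default_rng(42)
y = pd.Series(rng.integers(0, 2, size=500))
X = pd.DataFrame({
    "informative": 2.0 * y + rng.normal(size=500),  # related to the target
    "noise": rng.normal(size=500),                  # unrelated to the target
})

sel = SelectByTargetMeanPerformance(
    scoring="roc_auc_score",  # supported per the docstring above
    threshold=None,           # new default: mean feature performance is used
    bins=5,
    cv=3,
    random_state=0,
)
sel.fit(X, y)

print(sel.feature_performance_)  # per-feature roc-auc from cross-validation
print(sel.features_to_drop_)     # here: the below-average 'noise' feature
```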
