
Commit dbf990e

unify selector params, fix various bugs, add seed to shuffle; closes #192 (#201)

* unifies param to drop features, fixes automatic variable selection
* fix stylechecks
* creates baseselector and unifies classes
* fix flake error
1 parent 6b99174 · commit dbf990e

18 files changed: +512 additions, -473 deletions

feature_engine/selection/base_selector.py

Lines changed: 39 additions & 0 deletions
@@ -1,4 +1,12 @@
 import numpy as np
+import pandas as pd
+from sklearn.base import TransformerMixin, BaseEstimator
+from sklearn.utils.validation import check_is_fitted
+
+from feature_engine.dataframe_checks import (
+    _is_dataframe,
+    _check_input_matches_training_df,
+)


 def get_feature_importances(estimator):
@@ -17,3 +25,34 @@ def get_feature_importances(estimator):
             importances = np.linalg.norm(coef_, axis=0, ord=len(estimator.coef_))

     return list(importances)
+
+
+class BaseSelector(BaseEstimator, TransformerMixin):
+    """Transformation shared by all selectors"""
+
+    def transform(self, X: pd.DataFrame):
+        """
+        Return dataframe with selected features.
+
+        Parameters
+        ----------
+        X : pandas dataframe of shape = [n_samples, n_features].
+            The input dataframe.
+
+        Returns
+        -------
+        X_transformed: pandas dataframe of shape = [n_samples, n_selected_features]
+            Pandas dataframe with the selected features.
+        """
+
+        # check if fit is performed prior to transform
+        check_is_fitted(self)
+
+        # check if input is a dataframe
+        X = _is_dataframe(X)
+
+        # check if number of columns in test dataset matches to train dataset
+        _check_input_matches_training_df(X, self.input_shape_[1])
+
+        # return the dataframe with the selected features
+        return X.drop(columns=self.features_to_drop_)
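
Note (not part of the commit, for illustration only): a concrete selector built on the new BaseSelector only needs a fit() that records input_shape_ and features_to_drop_; the shared transform() then handles the fitted check, the dataframe check, the column-count check and the drop. The class below is hypothetical and kept deliberately minimal; the inherited TransformerMixin also provides fit_transform() for free.

import pandas as pd

from feature_engine.dataframe_checks import _is_dataframe
from feature_engine.selection.base_selector import BaseSelector


class DropAllNumericFeatures(BaseSelector):
    """Hypothetical selector used only to illustrate the BaseSelector contract."""

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        # validate the input with the same helper the library uses
        X = _is_dataframe(X)

        # whatever fit() stores here is dropped by the inherited transform()
        self.features_to_drop_ = X.select_dtypes(include="number").columns.tolist()

        # the inherited transform() checks new data against this shape
        self.input_shape_ = X.shape

        return self

    # transform() is inherited from BaseSelector unchanged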

feature_engine/selection/drop_constant_features.py

Lines changed: 12 additions & 41 deletions
@@ -2,23 +2,21 @@

 import numpy as np
 import pandas as pd
-from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.utils.validation import check_is_fitted

 from feature_engine.dataframe_checks import (
     _is_dataframe,
-    _check_input_matches_training_df,
     _check_contains_na,
 )
 from feature_engine.variable_manipulation import (
     _check_input_parameter_variables,
     _find_all_variables,
 )
+from feature_engine.selection.base_selector import BaseSelector

 Variables = Union[None, int, str, List[Union[str, int]]]


-class DropConstantFeatures(TransformerMixin, BaseEstimator):
+class DropConstantFeatures(BaseSelector):
     """
     Drop constant and quasi-constant variables from a dataframe. Constant variables
     show the same value across all the observations in the dataset. Quasi-constant
@@ -51,7 +49,7 @@ class DropConstantFeatures(TransformerMixin, BaseEstimator):

     Attributes
     ----------
-    constant_features_:
+    features_to_drop_:
         List with constant and quasi-constant features.

     Methods
@@ -90,7 +88,6 @@ def __init__(
         self.missing_values = missing_values

     def fit(self, X: pd.DataFrame, y: pd.Series = None):
-
         """
         Find constant and quasi-constant features.

@@ -121,13 +118,13 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None):

         # find constant features
         if self.tol == 1:
-            self.constant_features_ = [
+            self.features_to_drop_ = [
                 feature for feature in self.variables if X[feature].nunique() == 1
             ]

         # find constant and quasi-constant features
         else:
-            self.constant_features_ = []
+            self.features_to_drop_ = []

             for feature in self.variables:
                 # find most frequent value / category in the variable
@@ -138,10 +135,10 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None):
                 )

                 if predominant >= self.tol:
-                    self.constant_features_.append(feature)
+                    self.features_to_drop_.append(feature)

         # check we are not dropping all the columns in the df
-        if len(self.constant_features_) == len(X.columns):
+        if len(self.features_to_drop_) == len(X.columns):
             raise ValueError(
                 "The resulting dataframe will have no columns after dropping all "
                 "constant or quasi-constant features. Try changing the tol value."
@@ -151,36 +148,10 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None):

         return self

-    def transform(self, X: pd.DataFrame):
-        """
-        Drop the constant and quasi-constant features from a dataframe.
-
-        Parameters
-        ----------
-        X : pandas dataframe of shape = [n_samples, n_features].
-            The input samples.
-
-        Returns
-        -------
-        X_transformed : pandas dataframe,
-            shape = [n_samples, n_features - (constant + quasi_constant features)]
-            The transformed dataframe with the remaining subset of variables.
-        """
-
-        # check if fit is performed prior to transform
-        check_is_fitted(self)
-
-        # check if input is a dataframe
-        X = _is_dataframe(X)
-
-        # check if number of columns in test dataset matches to train dataset
-        _check_input_matches_training_df(X, self.input_shape_[1])
-
-        if self.missing_values == "raise":
-            # check if dataset contains na
-            _check_contains_na(X, self.variables)
-
-        # returned selected features
-        X = X.drop(columns=self.constant_features_)
+    # Ugly work around to import the docstring for Sphinx, otherwise not necessary
+    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+        X = super().transform(X)

         return X
+
+    transform.__doc__ = BaseSelector.transform.__doc__
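
For context only, a usage sketch of the renamed attribute (not from the commit; the toy data and the feature_engine.selection import path are assumptions): after fit(), the features flagged for removal are exposed as features_to_drop_ instead of constant_features_.

import pandas as pd

from feature_engine.selection import DropConstantFeatures  # assumed import path

df = pd.DataFrame({
    "const": [1, 1, 1, 1],          # constant -> dropped
    "quasi": ["a", "a", "a", "b"],  # one category in 75% of rows
    "varied": [1, 2, 3, 4],
})

selector = DropConstantFeatures(tol=0.7)  # quasi-constant threshold
df_t = selector.fit_transform(df)

print(selector.features_to_drop_)  # expected: ['const', 'quasi']
print(df_t.columns.tolist())       # expected: ['varied']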

feature_engine/selection/drop_correlated_features.py

Lines changed: 16 additions & 48 deletions
@@ -1,23 +1,21 @@
 from typing import List, Union

 import pandas as pd
-from sklearn.base import TransformerMixin, BaseEstimator
-from sklearn.utils.validation import check_is_fitted

 from feature_engine.dataframe_checks import (
     _is_dataframe,
-    _check_input_matches_training_df,
     _check_contains_na,
 )
 from feature_engine.variable_manipulation import (
     _find_or_check_numerical_variables,
     _check_input_parameter_variables,
 )
+from feature_engine.selection.base_selector import BaseSelector

 Variables = Union[None, int, str, List[Union[str, int]]]


-class DropCorrelatedFeatures(BaseEstimator, TransformerMixin):
+class DropCorrelatedFeatures(BaseSelector):
     """
     DropCorrelatedFeatures() finds and removes correlated features. Correlation is
     calculated with `pandas.corr()`.
@@ -52,14 +50,11 @@ class DropCorrelatedFeatures(BaseEstimator, TransformerMixin):

     Attributes
     ----------
-    correlated_features_:
-        Set with the correlated features.
+    features_to_drop_:
+        Set with the correlated features that will be dropped.

     correlated_feature_sets_:
-        Groups of correlated features. Each list is a group of correlated features.
-
-    correlated_matrix_:
-        The correlation matrix.
+        Groups of correlated features. Each list is a group of correlated features.

     Methods
     -------
@@ -128,20 +123,20 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None):
             _check_contains_na(X, self.variables)

         # set to collect features that are correlated
-        self.correlated_features_ = set()
+        self.features_to_drop_ = set()

         # create tuples of correlated feature groups
         self.correlated_feature_sets_ = []

         # the correlation matrix
-        self.correlated_matrix_ = X[self.variables].corr(method=self.method)
+        _correlated_matrix = X[self.variables].corr(method=self.method)

         # create set of examined features, helps to determine feature combinations
         # to evaluate below
         _examined_features = set()

         # for each feature in the dataset (columns of the correlation matrix)
-        for feature in self.correlated_matrix_.columns:
+        for feature in _correlated_matrix.columns:

             if feature not in _examined_features:

@@ -155,20 +150,18 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None):
                 # features that have not been examined, are not currently examined and
                 # were not found correlated
                 _features_to_compare = [
-                    f
-                    for f in self.correlated_matrix_.columns
-                    if f not in _examined_features
+                    f for f in _correlated_matrix.columns if f not in _examined_features
                 ]

                 # create combinations:
                 for f2 in _features_to_compare:

                     # if the correlation is higher than the threshold
                     # we are interested in absolute correlation coefficient value
-                    if abs(self.correlated_matrix_.loc[f2, feature]) > self.threshold:
+                    if abs(_correlated_matrix.loc[f2, feature]) > self.threshold:

                         # add feature (f2) to our correlated set
-                        self.correlated_features_.add(f2)
+                        self.features_to_drop_.add(f2)
                         _temp_set.add(f2)
                         _examined_features.add(f2)

@@ -180,35 +173,10 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None):

         return self

-    def transform(self, X):
-        """
-        Drop the correlated features from a dataframe.
-
-        Parameters
-        ----------
-        X : pandas dataframe of shape = [n_samples, n_features].
-            The input samples.
-
-        Returns
-        -------
-        X_transformed : pandas dataframe
-            shape = [n_samples, n_features - (correlated features)]
-            The transformed dataframe with the remaining subset of variables.
-        """
-        # check if fit is performed prior to transform
-        check_is_fitted(self)
-
-        # check if input is a dataframe
-        X = _is_dataframe(X)
-
-        # check if number of columns in test dataset matches to train dataset
-        _check_input_matches_training_df(X, self.input_shape_[1])
-
-        if self.missing_values == "raise":
-            # check if dataset contains na
-            _check_contains_na(X, self.variables)
-
-        # returned non-correlated features
-        X = X.drop(columns=self.correlated_features_)
+    # Ugly work around to import the docstring for Sphinx, otherwise not necessary
+    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+        X = super().transform(X)

         return X
+
+    transform.__doc__ = BaseSelector.transform.__doc__
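
Similarly for DropCorrelatedFeatures, a hedged usage sketch (not from the commit; the data and the expected outputs are illustrative): features_to_drop_ replaces correlated_features_, correlated_feature_sets_ is kept, and the correlation matrix is no longer stored on the transformer.

import numpy as np
import pandas as pd

from feature_engine.selection import DropCorrelatedFeatures  # assumed import path

rng = np.random.RandomState(42)
x1 = rng.normal(size=200)
df = pd.DataFrame({
    "x1": x1,
    "x2": 2 * x1 + rng.normal(scale=0.01, size=200),  # near-duplicate of x1
    "x3": rng.normal(size=200),                       # independent
})

selector = DropCorrelatedFeatures(threshold=0.9)
df_t = selector.fit_transform(df)

print(selector.features_to_drop_)         # expected: {'x2'}
print(selector.correlated_feature_sets_)  # expected: [{'x1', 'x2'}]
print(df_t.columns.tolist())              # expected: ['x1', 'x3']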
