minor changes to paro1234-feature/drop (#217)

ragrawal · paro1234 · ragrawal · web-flow · commit fb01a2332f24 · 2020-09-07T09:29:10.000-07:00
* explicit drop feature + tests

* updated as per rules

* updated as per PR comments

* updated version number and using empty list as default

Co-authored-by: Parul Singh <prlsngh43@gmail.com>
Co-authored-by: ragrawal <ragrawal@varomoney.com>
diff --git a/README.rst b/README.rst
@@ -210,6 +210,32 @@ attribute.
 
 Note this does not work together with the ``default=True`` or ``sparse=True`` arguments to the mapper.
 
+Dropping columns explicitly
+*******************************
+
+Sometimes it is necessary to drop a specific column or a list of columns.
+For this purpose, the ``drop_cols`` argument of ``DataFrameMapper`` can be used.
+Its default value is ``None``, meaning no columns are explicitly dropped.
+
+    >>> mapper_df = DataFrameMapper([
+    ...     ('pet', sklearn.preprocessing.LabelBinarizer()),
+    ...     (['children'], sklearn.preprocessing.StandardScaler())
+    ... ], drop_cols=['salary'])
+
+Now running ``fit_transform`` will run transformations on 'pet' and 'children' and drop the 'salary' column:
+
+   >>> np.round(mapper_df.fit_transform(data.copy()), 1)
+   array([[ 1. ,  0. ,  0. ,  0.2],
+          [ 0. ,  1. ,  0. ,  1.9],
+          [ 0. ,  1. ,  0. , -0.6],
+          [ 0. ,  0. ,  1. , -0.6],
+          [ 1. ,  0. ,  0. , -1.5],
+          [ 0. ,  1. ,  0. , -0.6],
+          [ 1. ,  0. ,  0. ,  1. ],
+          [ 0. ,  0. ,  1. ,  0.2]])
+
+Transformations may require multiple input columns; these cases are covered in the next section.
+
 Transform Multiple Columns
 **************************
 
@@ -395,7 +421,7 @@ The stacking of the sparse features is done without ever densifying them.
 
 
 Using ``NumericalTransformer``
-****************************
+***********************************
 
 While you can use ``FunctionTransformation`` to generate arbitrary transformers, it can present serialization issues
 when pickling. Use ``NumericalTransformer`` instead, which takes the function name as a string parameter and hence
@@ -419,8 +445,15 @@ can be easily serialized.
 
 Changelog
 ---------
+2.0.1 (2020-09-07)
+******************
+
+* Added an option to explicitly drop columns.
+
+
 2.0.0 (2020-08-01)
 ******************
+
 * Deprecated support for Python < 3.6.
 * Deprecated support for old versions of scikit-learn, pandas and numpy. Please check setup.py for minimum requirement.
 * Removed CategoricalImputer, cross_val_score and GridSearchCV. All these functionality now exists as part of
@@ -430,32 +463,39 @@ Changelog
 * Added ``NumericalTransformer`` for common numerical transformations. Currently it implements log and log1p
   transformation.
 * Added prefix and suffix options. See examples above. These are usually helpful when using gen_features.
+* Added ``drop_cols`` argument to ``DataFrameMapper``. This can be used to explicitly drop columns.
 
 
 1.8.0 (2018-12-01)
 ******************
+
 * Add ``FunctionTransformer`` class (#117).
 * Fix column names derivation for dataframes with multi-index or non-string
   columns (#166).
 * Change behaviour of DataFrameMapper's fit_transform method to invoke each underlying transformers'
   native fit_transform if implemented. (#150)
 
+
 1.7.0 (2018-08-15)
 ******************
+
 * Fix issues with unicode names in ``get_names`` (#160).
 * Update to build using ``numpy==1.14`` and ``python==3.6`` (#154).
 * Add ``strategy`` and ``fill_value`` parameters to ``CategoricalImputer`` to allow imputing
   with values other than the mode (#144), (#161).
 * Preserve input data types when no transform is supplied (#138).
 
+
 1.6.0 (2017-10-28)
 ******************
+
 * Add column name to exception during fit/transform (#110).
 * Add ``gen_feature`` helper function to help generating the same transformation for multiple columns (#126).
 
 
 1.5.0 (2017-06-24)
 ******************
+
 * Allow inputting a dataframe/series per group of columns.
 * Get feature names also from ``estimator.get_feature_names()`` if present.
 * Attempt to derive feature names from individual transformers when applying a
@@ -466,6 +506,7 @@ Changelog
 
 1.4.0 (2017-05-13)
 ******************
+
 * Allow specifying a custom name (alias) for transformed columns (#83).
 * Capture output columns generated names in ``transformed_names_`` attribute (#78).
 * Add ``CategoricalImputer`` that replaces null-like values with the mode
@@ -543,3 +584,4 @@ Other contributors:
 * Timothy Sweetser (@hacktuarial)
 * Vitaley Zaretskey (@vzaretsk)
 * Zac Stewart (@zacstewart)
+* Parul Singh (@paro1234)
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '2.0.0'
+__version__ = '2.0.1'
 
 from .dataframe_mapper import DataFrameMapper  # NOQA
 from .features_generator import gen_features  # NOQA
diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py
@@ -63,15 +63,15 @@ class DataFrameMapper(BaseEstimator, TransformerMixin):
     """
 
     def __init__(self, features, default=False, sparse=False, df_out=False,
-                 input_df=False):
+                 input_df=False, drop_cols=None):
         """
         Params:
 
         features    a list of tuples with features definitions.
                     The first element is the pandas column selector. This can
                     be a string (for one column) or a list of strings.
                     The second element is an object that supports
-                    sklearn's transform interface, or a list of such objects.
+                    sklearn's transform interface, or a list of such objects
                     The third element is optional and, if present, must be
                     a dictionary with the options to apply to the
                     transformation. Example: {'alias': 'day_of_week'}
@@ -96,14 +96,16 @@ def __init__(self, features, default=False, sparse=False, df_out=False,
                     as a pandas DataFrame or Series. Otherwise pass them as a
                     numpy array. Defaults to ``False``.
 
+        drop_cols   List of columns to be dropped. Defaults to None.
+
         """
         self.features = features
-        self.built_features = None
         self.default = default
         self.built_default = None
         self.sparse = sparse
         self.df_out = df_out
         self.input_df = input_df
+        self.drop_columns = drop_cols or []
         self.transformed_names_ = []
 
         if (df_out and (sparse or default)):
@@ -144,7 +146,8 @@ def _unselected_columns(self, X):
         """
         X_columns = list(X.columns)
         return [column for column in X_columns if
-                column not in self._selected_columns]
+                column not in self._selected_columns
+                and column not in self.drop_columns]
 
     def __setstate__(self, state):
         # compatibility for older versions of sklearn-pandas
@@ -153,6 +156,7 @@ def __setstate__(self, state):
         self.default = state.get('default', False)
         self.df_out = state.get('df_out', False)
         self.input_df = state.get('input_df', False)
+        self.drop_columns = state.get('drop_cols', None)
         self.built_features = state.get('built_features', self.features)
         self.built_default = state.get('built_default', self.default)
         self.transformed_names_ = state.get('transformed_names_', [])
diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py
@@ -649,7 +649,7 @@ def test_selected_columns():
 
 def test_unselected_columns():
     """
-    selected_columns returns a list of the columns not appearing in the
+    unselected_columns returns a list of the columns not appearing in the
     features of the mapper but present in the given dataframe.
     """
     df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]})
@@ -660,6 +660,49 @@ def test_unselected_columns():
     assert 'c' in mapper._unselected_columns(df)
 
 
+def test_drop_and_default_false():
+    """
+    With default=False only the explicitly selected column survives: both
+    the unselected column and the column listed in drop_cols are discarded.
+    """
+    frame = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]})
+    features = [('a', None)]
+    dropper = DataFrameMapper(features, drop_cols=['c'], default=False)
+    # Only 'a' is mapped, so a single row with a single column is expected.
+    result = dropper.fit_transform(frame)
+    assert result.shape == (1, 1)
+    assert dropper.transformed_names_ == ['a']
+
+
+def test_drop_and_default_none():
+    """
+    If default=None, drop columns are discarded and the remaining
+    non explicitly selected columns are passed through untransformed.
+    """
+    df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7], 'c': [1, 1, 1]})
+    mapper = DataFrameMapper([
+        ('a', None)
+    ], drop_cols=['c'], default=None)
+    # 'c' exists in df, so the (3, 2) shape proves it was dropped.
+    transformed = mapper.fit_transform(df)
+    assert transformed.shape == (3, 2)
+    assert mapper.transformed_names_ == ['a', 'b']
+
+
+def test_conflicting_drop():
+    """
+    A column that is both selected and listed in drop_cols is still mapped.
+    """
+    frame = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]})
+    features = [('a', None)]
+    # 'a' appears in the features and in drop_cols at the same time.
+    dropper = DataFrameMapper(features, drop_cols=['a'], default=False)
+
+    result = dropper.fit_transform(frame)
+    assert result.shape == (3, 1)
+    assert dropper.transformed_names_ == ['a']
+
+
 def test_default_false():
     """
     If default=False, non explicitly selected columns are discarded.

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-__version__ = '2.0.0'`
	`1`	`+__version__ = '2.0.1'`
`2`	`2`
`3`	`3`	`from .dataframe_mapper import DataFrameMapper # NOQA`
`4`	`4`	`from .features_generator import gen_features # NOQA`