pull master

RJ Agrawal · RJ Agrawal · commit 22dc685756f5 · 2020-07-29T06:48:29.000-07:00
diff --git a/README.rst b/README.rst
@@ -6,8 +6,6 @@ Sklearn-pandas
     :target: https://circleci.com/gh/scikit-learn-contrib/sklearn-pandas
 
 This module provides a bridge between `Scikit-Learn <http://scikit-learn.org/stable>`__'s machine learning methods and `pandas <https://pandas.pydata.org>`__-style Data Frames.
-
-
 In particular, it provides a way to map ``DataFrame`` columns to transformations, which are later recombined into features.
 
 Installation
@@ -395,6 +393,29 @@ A ``DataFrameMapper`` will return a dense feature array by default. Setting ``sp
 The stacking of the sparse features is done without ever densifying them.
 
 
+Using Numerical Transformer
+****************************
+
+While you can use ``FunctionTransformer`` to generate arbitrary transformers, they cannot always be serialized (pickled).
+``NumericalTransformer`` takes the function name as a string parameter and hence can be easily serialized.
+
+    >>> from sklearn_pandas import NumericalTransformer
+    >>> mapper5 = DataFrameMapper([
+    ...     ('children', NumericalTransformer('log')),
+    ... ])
+    >>> mapper5.fit_transform(data)
+    array([[1.38629436],
+           [1.79175947],
+           [1.09861229],
+           [1.09861229],
+           [0.69314718],
+           [1.09861229],
+           [1.60943791],
+           [1.38629436]])
+
+
+
+
 Changelog
 ---------
 
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
@@ -3,3 +3,4 @@
 from .dataframe_mapper import DataFrameMapper  # NOQA
 from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV  # NOQA
 from .features_generator import gen_features  # NOQA
+from .transformers import NumericalTransformer # NOQA
diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py
@@ -166,6 +166,7 @@ def __setstate__(self, state):
         self.built_features = state.get('built_features', self.features)
         self.built_default = state.get('built_default', self.default)
         self.transformed_names_ = state.get('transformed_names_', [])
+        self.show_progressbar = state.get('show_progressbar', False)
 
     def _get_col_subset(self, X, cols, input_df=False):
         """
diff --git a/sklearn_pandas/transformers.py b/sklearn_pandas/transformers.py
@@ -1,5 +1,6 @@
 import numpy as np
 import pandas as pd
+from sklearn.base import TransformerMixin
 
 
 def _get_mask(X, value):
@@ -12,3 +13,35 @@ def _get_mask(X, value):
         return pd.isnull(X)
     else:
         return X == value
+
+
+class NumericalTransformer(TransformerMixin):
+    """
+    Applies a numpy function, chosen by name, to every input value.
+    Storing the name (not a callable) keeps the transformer picklable.
+    """
+    SUPPORTED_FUNCTIONS = ['log', 'log1p']
+
+    def __init__(self, func):
+        """
+        :param func: name of the function to apply to each value; must be
+            in ``SUPPORTED_FUNCTIONS``, otherwise AssertionError is raised.
+        """
+        if func not in self.SUPPORTED_FUNCTIONS:  # not assert: survives -O
+            raise AssertionError("Only following func are supported: "
+                                 f"{self.SUPPORTED_FUNCTIONS}")
+        super().__init__()
+        self.__func = func
+
+    def fit(self, X, y=None):
+        """Nothing is learned from X; returns self."""
+        return self
+
+    def transform(self, X, y=None):
+        """Returns the function applied to each value, as a numpy array."""
+        if self.__func not in self.SUPPORTED_FUNCTIONS:
+            raise ValueError(f"Invalid function name: {self.__func}")
+        values = np.asarray(X)  # whole-array ufunc call: fast, empty-safe
+        if values.dtype == object:  # e.g. a column of plain Python numbers
+            values = values.astype(float)
+        return getattr(np, self.__func)(values)
diff --git a/tests/test_transformers.py b/tests/test_transformers.py
@@ -0,0 +1,47 @@
+import tempfile
+import pytest
+import numpy as np
+from pandas import DataFrame
+import joblib
+
+from sklearn_pandas import DataFrameMapper
+from sklearn_pandas.transformers import NumericalTransformer
+
+
+@pytest.fixture
+def simple_dataset():
+    """Three positive-integer feature columns with five rows each."""
+    feat1 = [1, 2, 1, 3, 1]
+    feat2 = [1, 2, 2, 2, 3]
+    feat3 = [1, 2, 3, 4, 5]
+    return DataFrame({'feat1': feat1, 'feat2': feat2, 'feat3': feat3})
+
+
+def test_common_numerical_transformer(simple_dataset):
+    """The 'log' transformer returns the natural log of the mapped column."""
+    mapper = DataFrameMapper([
+        ('feat1', NumericalTransformer('log'))
+    ], df_out=True)
+    out_df = mapper.fit_transform(simple_dataset)
+    assert list(out_df.columns) == ['feat1']
+    expected = simple_dataset['feat1'].apply(np.log).values
+    # Compare with a tolerance: bit-exact float equality is fragile across
+    # numpy builds and platforms.
+    np.testing.assert_allclose(out_df['feat1'].values, expected)
+
+
+def test_numerical_transformer_serialization(simple_dataset):
+    """A fitted mapper gives identical output after a joblib round trip."""
+    mapper = DataFrameMapper([
+        ('feat1', NumericalTransformer('log'))
+    ])
+    mapper.fit(simple_dataset)
+
+    # Dump to the open handle rather than to f.name: reopening a
+    # NamedTemporaryFile by name fails on Windows. "with" always closes it.
+    with tempfile.TemporaryFile() as f:
+        joblib.dump(mapper, f)
+        f.seek(0)
+        restored = joblib.load(f)
+    assert np.array_equal(
+        mapper.transform(simple_dataset), restored.transform(simple_dataset))