Skip to content

Commit 22dc685

Browse files
author
RJ Agrawal
committed
pull master
2 parents 6e51661 + 661bd94 commit 22dc685

File tree

5 files changed

+105
-2
lines changed

5 files changed

+105
-2
lines changed

README.rst

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@ Sklearn-pandas
66
:target: https://circleci.com/gh/scikit-learn-contrib/sklearn-pandas
77

88
This module provides a bridge between `Scikit-Learn <http://scikit-learn.org/stable>`__'s machine learning methods and `pandas <https://pandas.pydata.org>`__-style Data Frames.
9-
10-
119
In particular, it provides a way to map ``DataFrame`` columns to transformations, which are later recombined into features.
1210

1311
Installation
@@ -395,6 +393,29 @@ A ``DataFrameMapper`` will return a dense feature array by default. Setting ``sp
395393
The stacking of the sparse features is done without ever densifying them.
396394

397395

396+
Using Numerical Transformer
397+
****************************
398+
399+
While you can use ``FunctionTransformer`` to generate arbitrary transformers, they cannot be serialized (pickled).
400+
``NumericalTransformer`` takes the function name as a string parameter and hence can be easily serialized.
401+
402+
>>> from sklearn_pandas import NumericalTransformer
403+
>>> mapper5 = DataFrameMapper([
404+
... ('children', NumericalTransformer('log')),
405+
... ])
406+
>>> mapper5.fit_transform(data)
407+
array([[1.38629436],
408+
[1.79175947],
409+
[1.09861229],
410+
[1.09861229],
411+
[0.69314718],
412+
[1.09861229],
413+
[1.60943791],
414+
[1.38629436]])
415+
416+
417+
418+
398419
Changelog
399420
---------
400421

sklearn_pandas/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@
33
from .dataframe_mapper import DataFrameMapper # NOQA
44
from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV # NOQA
55
from .features_generator import gen_features # NOQA
6+
from .transformers import NumericalTransformer # NOQA

sklearn_pandas/dataframe_mapper.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ def __setstate__(self, state):
166166
self.built_features = state.get('built_features', self.features)
167167
self.built_default = state.get('built_default', self.default)
168168
self.transformed_names_ = state.get('transformed_names_', [])
169+
self.show_progressbar = state.get('show_progressbar', False)
169170

170171
def _get_col_subset(self, X, cols, input_df=False):
171172
"""

sklearn_pandas/transformers.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import numpy as np
22
import pandas as pd
3+
from sklearn.base import TransformerMixin
34

45

56
def _get_mask(X, value):
@@ -12,3 +13,35 @@ def _get_mask(X, value):
1213
return pd.isnull(X)
1314
else:
1415
return X == value
16+
17+
18+
class NumericalTransformer(TransformerMixin):
    """
    Provides commonly used numerical transformers.

    The function to apply is selected by name (a plain string) instead of
    by passing a callable; only that name is stored on the instance.
    """
    SUPPORTED_FUNCTIONS = ['log', 'log1p']

    def __init__(self, func):
        """
        Params

        func    name of the function to apply to input columns. The function
                will be applied to each value. Supported names are defined
                in the SUPPORTED_FUNCTIONS variable. Raises AssertionError
                if the name is not supported.
        """
        # Raise explicitly rather than using an ``assert`` statement so the
        # validation is not stripped when Python runs with ``-O``. The
        # exception type is kept so existing callers see the same error.
        if func not in self.SUPPORTED_FUNCTIONS:
            raise AssertionError(
                f"Only following func are supported: {self.SUPPORTED_FUNCTIONS}"
            )
        super().__init__()
        self.__func = func

    def fit(self, X, y=None):
        """No fitting is performed; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """
        Apply the configured function element-wise to X.

        X is converted with ``np.asarray`` and passed to the NumPy ufunc
        directly (no per-element Python loop), so empty inputs are handled
        as well. ``y`` is accepted but not used.

        Raises ValueError if the stored function name is not recognised.
        """
        values = np.asarray(X)
        if self.__func == 'log1p':
            return np.log1p(values)
        elif self.__func == 'log':
            return np.log(values)

        raise ValueError(f"Invalid function name: {self.__func}")

tests/test_transformers.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import tempfile
2+
import pytest
3+
import numpy as np
4+
from pandas import DataFrame
5+
import joblib
6+
7+
from sklearn_pandas import DataFrameMapper
8+
from sklearn_pandas.transformers import NumericalTransformer
9+
10+
11+
@pytest.fixture
def simple_dataset():
    """Small three-column integer DataFrame shared by the tests below."""
    columns = {
        'feat1': [1, 2, 1, 3, 1],
        'feat2': [1, 2, 2, 2, 3],
        'feat3': [1, 2, 3, 4, 5],
    }
    return DataFrame(columns)
18+
19+
20+
def test_common_numerical_transformer(simple_dataset):
    """
    The 'log' transformer, applied through a DataFrameMapper with
    df_out=True, must yield the natural log of each value in the column.
    """
    frame = simple_dataset
    mapper = DataFrameMapper(
        [('feat1', NumericalTransformer('log'))],
        df_out=True,
    )
    result = mapper.fit_transform(frame)

    # Only the mapped column is expected in the output frame.
    assert list(result.columns) == ['feat1']

    expected = frame['feat1'].apply(np.log).values
    assert np.array_equal(expected, result.feat1.values)
31+
32+
33+
def test_numerical_transformer_serialization(simple_dataset):
    """
    Test that a fitted mapper containing a NumericalTransformer can be
    dumped and loaded with joblib, and that the loaded copy produces the
    same output as the original.
    """
    mapper = DataFrameMapper([
        ('feat1', NumericalTransformer('log'))
    ])

    df = simple_dataset
    mapper.fit(df)

    # The context manager closes (and thereby deletes) the temporary file
    # even when dump/load raises.
    with tempfile.NamedTemporaryFile(delete=True) as f:
        joblib.dump(mapper, f.name)
        restored = joblib.load(f.name)

    # The comparison result must be asserted; otherwise a mismatch between
    # the original and the restored mapper would go unnoticed.
    assert np.array_equal(mapper.transform(df), restored.transform(df))

0 commit comments

Comments
 (0)