Skip to content

Commit 7879292

Browse files
committed
fixed unit tests
1 parent fedae38 commit 7879292

File tree

4 files changed

+24
-124
lines changed

4 files changed

+24
-124
lines changed

README.rst

Lines changed: 16 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
Sklearn-pandas
33
==============
44

5-
.. image:: https://circleci.com/gh/scikit-learn-contrib/sklearn-pandas.svg?style=svg
6-
:target: https://circleci.com/gh/scikit-learn-contrib/sklearn-pandas
5+
.. image:: https://circleci.com/gh/ragrawal/sklearn-pandas.svg?style=svg
6+
:target: https://circleci.com/gh/ragrawal/sklearn-pandas
77

88
This module provides a bridge between `Scikit-Learn <http://scikit-learn.org/stable>`__'s machine learning methods and `pandas <https://pandas.pydata.org>`__-style Data Frames.
99

@@ -231,8 +231,9 @@ Multiple transformers for the same column
231231
Multiple transformers can be applied to the same column by specifying them
232232
in a list::
233233

234+
>>> from sklearn.impute import SimpleImputer
234235
>>> mapper3 = DataFrameMapper([
235-
... (['age'], [sklearn.preprocessing.Imputer(),
236+
... (['age'], [SimpleImputer(),
236237
... sklearn.preprocessing.StandardScaler()])])
237238
>>> data_3 = pd.DataFrame({'age': [1, np.nan, 3]})
238239
>>> mapper3.fit_transform(data_3)
@@ -302,7 +303,7 @@ into generator, and then use returned definition as ``features`` argument for ``
302303
... classes=[sklearn.preprocessing.LabelEncoder]
303304
... )
304305
>>> feature_def
305-
[('col1', [LabelEncoder()]), ('col2', [LabelEncoder()]), ('col3', [LabelEncoder()])]
306+
[('col1', [LabelEncoder()], {}), ('col2', [LabelEncoder()], {}), ('col3', [LabelEncoder()], {})]
306307
>>> mapper5 = DataFrameMapper(feature_def)
307308
>>> data5 = pd.DataFrame({
308309
... 'col1': ['yes', 'no', 'yes'],
@@ -318,22 +319,24 @@ If it is required to override some of transformer parameters, then a dict with '
318319
transformer parameters should be provided. For example, consider a dataset with missing values.
319320
Then the following code could be used to override the default imputing strategy:
320321

322+
>>> from sklearn.impute import SimpleImputer
323+
>>> import numpy as np
321324
>>> feature_def = gen_features(
322325
... columns=[['col1'], ['col2'], ['col3']],
323-
... classes=[{'class': sklearn.preprocessing.Imputer, 'strategy': 'most_frequent'}]
326+
... classes=[{'class': SimpleImputer, 'strategy':'most_frequent'}]
324327
... )
325328
>>> mapper6 = DataFrameMapper(feature_def)
326329
>>> data6 = pd.DataFrame({
327-
... 'col1': [None, 1, 1, 2, 3],
328-
... 'col2': [True, False, None, None, True],
329-
... 'col3': [0, 0, 0, None, None]
330+
... 'col1': [np.nan, 1, 1, 2, 3],
331+
... 'col2': [True, False, np.nan, np.nan, True],
332+
... 'col3': [0, 0, 0, np.nan, np.nan]
330333
... })
331334
>>> mapper6.fit_transform(data6)
332-
array([[1., 1., 0.],
333-
[1., 0., 0.],
334-
[1., 1., 0.],
335-
[2., 1., 0.],
336-
[3., 1., 0.]])
335+
array([[1.0, True, 0.0],
336+
[1.0, False, 0.0],
337+
[1.0, True, 0.0],
338+
[2.0, True, 0.0],
339+
[3.0, True, 0.0]], dtype=object)
337340

338341

339342
Feature selection and other supervised transformations
@@ -366,59 +369,6 @@ A ``DataFrameMapper`` will return a dense feature array by default. Setting ``sp
366369

367370
The stacking of the sparse features is done without ever densifying them.
368371

369-
Cross-Validation
370-
****************
371-
372-
Now that we can combine features from pandas DataFrames, we may want to use cross-validation to see whether our model works. ``scikit-learn<0.16.0`` provided features for cross-validation, but they expect numpy data structures and won't work with ``DataFrameMapper``.
373-
374-
To get around this, sklearn-pandas provides a wrapper on sklearn's ``cross_val_score`` function which passes a pandas DataFrame to the estimator rather than a numpy array::
375-
376-
>>> pipe = sklearn.pipeline.Pipeline([
377-
... ('featurize', mapper),
378-
... ('lm', sklearn.linear_model.LinearRegression())])
379-
>>> np.round(cross_val_score(pipe, X=data.copy(), y=data.salary, scoring='r2'), 2)
380-
array([ -1.09, -5.3 , -15.38])
381-
382-
Sklearn-pandas' ``cross_val_score`` function provides exactly the same interface as sklearn's function of the same name.
383-
384-
``CategoricalImputer``
385-
**********************
386-
387-
Since the ``scikit-learn`` ``Imputer`` transformer currently only works with
388-
numbers, ``sklearn-pandas`` provides an equivalent helper transformer that
389-
works with strings, substituting null values with the most frequent value in
390-
that column. Alternatively, you can specify a fixed value to use.
391-
392-
Example: imputing with the mode:
393-
394-
>>> from sklearn_pandas import CategoricalImputer
395-
>>> data = np.array(['a', 'b', 'b', np.nan], dtype=object)
396-
>>> imputer = CategoricalImputer()
397-
>>> imputer.fit_transform(data)
398-
array(['a', 'b', 'b', 'b'], dtype=object)
399-
400-
Example: imputing with a fixed value:
401-
402-
>>> from sklearn_pandas import CategoricalImputer
403-
>>> data = np.array(['a', 'b', 'b', np.nan], dtype=object)
404-
>>> imputer = CategoricalImputer(strategy='constant', fill_value='a')
405-
>>> imputer.fit_transform(data)
406-
array(['a', 'b', 'b', 'a'], dtype=object)
407-
408-
409-
``FunctionTransformer``
410-
***********************
411-
412-
Often one wants to apply simple transformations to data such as ``np.log``. ``FunctionTransformer`` is a simple wrapper that takes any function and applies vectorization so that it can be used as a transformer.
413-
414-
Example:
415-
416-
>>> from sklearn_pandas import FunctionTransformer
417-
>>> array = np.array([10, 100])
418-
>>> transformer = FunctionTransformer(np.log10)
419-
420-
>>> transformer.fit_transform(array)
421-
array([1., 2.])
422372

423373
Changelog
424374
---------

sklearn_pandas/dataframe_mapper.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -267,9 +267,10 @@ def get_names(self, columns, transformer, x, alias=None, prefix='',
267267
# Otherwise use the only estimator present
268268
else:
269269
names = _get_feature_names(transformer)
270+
270271
if names is not None and len(names) == num_cols:
271-
output = ['%s_%s' % (name, o) for o in names]
272-
# otherwise, return name concatenated with '_1', '_2', etc.
272+
output = [f"{name}_{o}" for o in names]
273+
# otherwise, return name concatenated with '_1', '_2', etc.
273274
else:
274275
output = [name + '_' + str(o) for o in range(num_cols)]
275276
else:
@@ -282,7 +283,6 @@ def get_names(self, columns, transformer, x, alias=None, prefix='',
282283
suffix = suffix or ''
283284
return ['{}{}{}'.format(prefix, x, suffix) for x in output]
284285

285-
286286
def get_dtypes(self, extracted):
287287
dtypes_features = [self.get_dtype(ex) for ex in extracted]
288288
return [dtype for dtype_feature in dtypes_features
@@ -329,6 +329,7 @@ def _transform(self, X, y=None, do_fit=False):
329329
alias = options.get('alias')
330330
prefix = options.get('prefix')
331331
suffix = options.get('suffix')
332+
332333
self.transformed_names_ += self.get_names(
333334
columns, transformers, Xt, alias, prefix, suffix)
334335

tests/test_dataframe_mapper.py

Lines changed: 2 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -334,8 +334,8 @@ def test_onehot_df():
334334
transformed = mapper.fit_transform(df)
335335
cols = transformed.columns
336336
assert len(cols) == 4
337-
assert cols[0] == 'target_0'
338-
assert cols[3] == 'target_3'
337+
assert cols[0] == 'target_x0_0'
338+
assert cols[3] == 'target_x0_3'
339339

340340

341341
def test_customtransform_df():
@@ -757,35 +757,6 @@ def test_list_transformers_old_unpickle(simple_dataframe):
757757
assert isinstance(transformer.steps[0][1], MockXTransformer)
758758

759759

760-
def test_default_old_unpickle(simple_dataframe):
761-
mapper = DataFrameMapper([('a', None)])
762-
# simulate the mapper was pickled before the ``default`` init argument
763-
# existed
764-
del mapper.default
765-
mapper_pickled = pickle.dumps(mapper)
766-
767-
loaded_mapper = pickle.loads(mapper_pickled)
768-
loaded_mapper.fit_transform(simple_dataframe) # doesn't fail
769-
770-
771-
def test_build_features_old_unpickle(simple_dataframe):
772-
"""
773-
Fitted mappers pickled before the built_features and built_default
774-
attributes can correctly transform
775-
"""
776-
df = simple_dataframe
777-
mapper = DataFrameMapper([('a', None)])
778-
mapper.fit(df)
779-
780-
# simulate the mapper was pickled before the attributes existed
781-
del mapper.built_features
782-
del mapper.built_default
783-
784-
mapper_pickled = pickle.dumps(mapper)
785-
loaded_mapper = pickle.loads(mapper_pickled)
786-
loaded_mapper.transform(simple_dataframe) # doesn't fail
787-
788-
789760
def test_sparse_features(simple_dataframe):
790761
"""
791762
If any of the extracted features is sparse and "sparse" argument
@@ -860,7 +831,6 @@ def iris_dataframe():
860831
}
861832
)
862833

863-
864834
@pytest.fixture
865835
def cars_dataframe():
866836
return pd.read_csv("tests/test_data/cars.csv.gz", compression='gzip')
@@ -914,27 +884,6 @@ def test_with_car_dataframe(cars_dataframe):
914884
assert scores.mean() > 0.30
915885

916886

917-
@pytest.mark.skipIf(parse_version(sklearn_version) < parse_version('0.16'))
918-
def test_direct_cross_validation(iris_dataframe):
919-
"""
920-
Starting with sklearn>=0.16.0 we no longer need CV wrappers for dataframes.
921-
See https://github.com/paulgb/sklearn-pandas/issues/11
922-
"""
923-
pipeline = Pipeline([
924-
("preprocess", DataFrameMapper([
925-
("petal length (cm)", None),
926-
("petal width (cm)", None),
927-
("sepal length (cm)", None),
928-
("sepal width (cm)", None),
929-
])),
930-
("classify", SVC(kernel='linear'))
931-
])
932-
data = iris_dataframe.drop("species", axis=1)
933-
labels = iris_dataframe["species"]
934-
scores = sklearn_cv_score(pipeline, data, labels)
935-
assert scores.mean() > 0.96
936-
assert (scores.std() * 2) < 0.04
937-
938887

939888
def test_heterogeneous_output_types_input_df():
940889
"""

tests/test_features_generator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def test_generate_features_with_default_parameters():
4747
feature_defs = gen_features(columns=columns, classes=[MockClass])
4848
assert len(feature_defs) == len(columns)
4949

50-
feature_dict = dict(feature_defs)
50+
feature_dict = dict([_[0:2] for _ in feature_defs])
5151
assert columns == sorted(feature_dict.keys())
5252

5353
# default init arguments for MockClass for clarification.
@@ -70,7 +70,7 @@ def test_generate_features_with_several_classes():
7070
]
7171
)
7272

73-
for transformers in dict(feature_defs).values():
73+
for col, transformers, params in feature_defs:
7474
assert_attributes(transformers[0], name='class', value=1)
7575
assert_attributes(transformers[1], name='mockA', value=1)
7676
assert_attributes(transformers[2], name='mockB', value=None)

0 commit comments

Comments
 (0)