Skip to content

Commit e0ef0c0

Browse files
author
RJ Agrawal
committed
fixed suggested changes
1 parent ec71b72 commit e0ef0c0

File tree

7 files changed

+61
-22
lines changed

7 files changed

+61
-22
lines changed

README.rst

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,8 @@ Sklearn-pandas
77

88
This module provides a bridge between `Scikit-Learn <http://scikit-learn.org/stable>`__'s machine learning methods and `pandas <https://pandas.pydata.org>`__-style Data Frames.
99

10-
In particular, it provides:
1110

12-
1. A way to map ``DataFrame`` columns to transformations, which are later recombined into features.
13-
2. A compatibility shim for old ``scikit-learn`` versions to cross-validate a pipeline that takes a pandas ``DataFrame`` as input. This is only needed for ``scikit-learn<0.16.0`` (see `#11 <https://github.com/paulgb/sklearn-pandas/issues/11>`__ for details). It is deprecated and will likely be dropped in ``sklearn-pandas==2.0``.
14-
3. A couple of special transformers that work well with pandas inputs: ``CategoricalImputer`` and ``FunctionTransformer``.
11+
In particular, it provides a way to map ``DataFrame`` columns to transformations, which are later recombined into features.
1512

1613
Installation
1714
------------
@@ -20,6 +17,7 @@ You can install ``sklearn-pandas`` with ``pip``::
2017

2118
# pip install sklearn-pandas
2219

20+
2321
Tests
2422
-----
2523

@@ -136,8 +134,18 @@ of the feature definition::
136134
>>> mapper_alias.transformed_names_
137135
['children_scaled']
138136

137+
Alternatively, you can specify a prefix and/or a suffix to add to the column name. For example::
138+
139139

140-
Passing Series/DataFrames to the transformers
140+
>>> mapper_alias = DataFrameMapper([
141+
... (['children'], sklearn.preprocessing.StandardScaler(), {'prefix': 'standard_scaled_'}),
142+
... (['children'], sklearn.preprocessing.StandardScaler(), {'suffix': '_raw'})
143+
... ])
144+
>>> _ = mapper_alias.fit_transform(data.copy())
145+
>>> mapper_alias.transformed_names_
146+
['standard_scaled_children', 'children_raw']
147+
148+
Passing Series/DataFrames to the transformers
141149
*********************************************
142150

143151
By default the transformers are passed a numpy array of the selected columns
@@ -338,6 +346,23 @@ Then the following code could be used to override default imputing strategy:
338346
[2.0, True, 0.0],
339347
[3.0, True, 0.0]], dtype=object)
340348

349+
You can also specify a global prefix or suffix for the generated transformed column names using the prefix and suffix
350+
parameters::
351+
352+
>>> feature_def = gen_features(
353+
... columns=['col1', 'col2', 'col3'],
354+
... classes=[sklearn.preprocessing.LabelEncoder],
355+
... prefix="lblencoder_"
356+
... )
357+
>>> mapper5 = DataFrameMapper(feature_def)
358+
>>> data5 = pd.DataFrame({
359+
... 'col1': ['yes', 'no', 'yes'],
360+
... 'col2': [True, False, False],
361+
... 'col3': ['one', 'two', 'three']
362+
... })
363+
>>> _ = mapper5.fit_transform(data5)
364+
>>> mapper5.transformed_names_
365+
['lblencoder_col1', 'lblencoder_col2', 'lblencoder_col3']
341366

342367
Feature selection and other supervised transformations
343368
******************************************************

setup.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,15 +34,19 @@ def run(self):
3434
description='Pandas integration with sklearn',
3535
maintainer='Ritesh Agrawal',
3636
maintainer_email='[email protected]',
37-
url='https://github.com/ragrawal/sklearn-pandas',
37+
url='https://github.com/scikit-learn-contrib/sklearn-pandas',
3838
packages=['sklearn_pandas'],
3939
keywords=['scikit', 'sklearn', 'pandas'],
4040
install_requires=[
4141
'scikit-learn>=0.23.0',
4242
'scipy>=1.4.1',
4343
'pandas>=1.0.5',
4444
'numpy>=1.18.1',
45-
'tqdm>=4.46.0'],
45+
'tqdm>=4.46.0'
46+
],
47+
extras_require={
48+
"progress-bar": ['tqdm>=4.46.0']
49+
},
4650
tests_require=['pytest', 'mock'],
4751
cmdclass={'test': PyTest},
4852
)

sklearn_pandas/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = '1.8.2'
1+
__version__ = '2.0.0'
22

33
from .dataframe_mapper import DataFrameMapper # NOQA
44
from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV # NOQA

sklearn_pandas/dataframe_mapper.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -276,11 +276,9 @@ def get_names(self, columns, transformer, x, alias=None, prefix='',
276276
else:
277277
output = [name]
278278

279-
if not prefix and not suffix:
279+
if prefix == suffix == "":
280280
return output
281281

282-
prefix = prefix or ''
283-
suffix = suffix or ''
284282
return ['{}{}{}'.format(prefix, x, suffix) for x in output]
285283

286284
def get_dtypes(self, extracted):
@@ -327,8 +325,8 @@ def _transform(self, X, y=None, do_fit=False):
327325
extracted.append(_handle_feature(Xt))
328326

329327
alias = options.get('alias')
330-
prefix = options.get('prefix')
331-
suffix = options.get('suffix')
328+
prefix = options.get('prefix', '')
329+
suffix = options.get('suffix', '')
332330

333331
self.transformed_names_ += self.get_names(
334332
columns, transformers, Xt, alias, prefix, suffix)

sklearn_pandas/features_generator.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
def gen_features(columns, classes=None, arguments={}):
1+
def gen_features(columns, classes=None, prefix='', suffix=''):
22
"""Generates a feature definition list which can be passed
33
into DataFrameMapper
44
@@ -25,8 +25,9 @@ def gen_features(columns, classes=None, arguments={}):
2525
2626
If None value selected, then each feature left as is.
2727
28-
arguments a dictionary of additional values such as {'prefix': 'x',
29-
'suffix': 'na'}
28+
prefix add prefix to transformed column names
29+
30+
suffix add suffix to transformed column names.
3031
3132
"""
3233
if classes is None:
@@ -37,9 +38,15 @@ def gen_features(columns, classes=None, arguments={}):
3738
for column in columns:
3839
feature_transformers = []
3940

41+
arguments = {}
42+
if prefix and prefix != "":
43+
arguments['prefix'] = prefix
44+
if suffix and suffix != "":
45+
arguments['suffix'] = suffix
46+
4047
classes = [cls for cls in classes if cls is not None]
4148
if not classes:
42-
feature_defs.append((column, None))
49+
feature_defs.append((column, None, arguments))
4350

4451
else:
4552
for definition in classes:

tests/test_features_generator.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ def test_generate_features_with_default_parameters():
4747
feature_defs = gen_features(columns=columns, classes=[MockClass])
4848
assert len(feature_defs) == len(columns)
4949

50+
for feature in feature_defs:
51+
assert feature[2] == {}
52+
5053
feature_dict = dict([_[0:2] for _ in feature_defs])
5154
assert columns == sorted(feature_dict.keys())
5255

@@ -84,9 +87,9 @@ def test_generate_features_with_none_only_transformers():
8487
feature_defs = gen_features(
8588
columns=['colA', 'colB', 'colC'], classes=[None])
8689

87-
expected = [('colA', None),
88-
('colB', None),
89-
('colC', None)]
90+
expected = [('colA', None, {}),
91+
('colB', None, {}),
92+
('colC', None, {})]
9093

9194
assert feature_defs == expected
9295

tox.ini

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ exclude =
1010
*bin/
1111

1212
[tox]
13-
envlist = {py36}-sklearn{23}-pandas{1}
13+
envlist = {py36,py37,py38}-sklearn{22,23}-pandas{105,110}
1414

1515
[testenv]
1616
deps =
@@ -20,7 +20,9 @@ deps =
2020
flake8==3.7.9
2121
numpy==1.18.1
2222
scipy==1.4.1
23-
pandas1: pandas==1.0.5
23+
pandas105: pandas==1.0.5
24+
pandas110: pandas==1.1.0
25+
sklearn22: scikit-learn==0.22.2
2426
sklearn23: scikit-learn==0.23.1
2527

2628
commands =

0 commit comments

Comments
 (0)