fixed unit tests

ragrawal · ragrawal · commit 787929277625 · 2020-07-26T23:40:02.000-07:00
diff --git a/README.rst b/README.rst
@@ -2,8 +2,8 @@
 Sklearn-pandas
 ==============
 
-.. image:: https://circleci.com/gh/scikit-learn-contrib/sklearn-pandas.svg?style=svg
-    :target: https://circleci.com/gh/scikit-learn-contrib/sklearn-pandas
+.. image:: https://circleci.com/gh/ragrawal/sklearn-pandas.svg?style=svg
+    :target: https://circleci.com/gh/ragrawal/sklearn-pandas
 
 This module provides a bridge between `Scikit-Learn <http://scikit-learn.org/stable>`__'s machine learning methods and `pandas <https://pandas.pydata.org>`__-style Data Frames.
 
@@ -231,8 +231,9 @@ Multiple transformers for the same column
 Multiple transformers can be applied to the same column by specifying them
 in a list::
 
+    >>> from sklearn.impute import SimpleImputer
     >>> mapper3 = DataFrameMapper([
-    ...     (['age'], [sklearn.preprocessing.Imputer(),
+    ...     (['age'], [SimpleImputer(),
     ...                sklearn.preprocessing.StandardScaler()])])
     >>> data_3 = pd.DataFrame({'age': [1, np.nan, 3]})
     >>> mapper3.fit_transform(data_3)
@@ -302,7 +303,7 @@ into generator, and then use returned definition as ``features`` argument for ``
     ...     classes=[sklearn.preprocessing.LabelEncoder]
     ... )
     >>> feature_def
-    [('col1', [LabelEncoder()]), ('col2', [LabelEncoder()]), ('col3', [LabelEncoder()])]
+    [('col1', [LabelEncoder()], {}), ('col2', [LabelEncoder()], {}), ('col3', [LabelEncoder()], {})]
     >>> mapper5 = DataFrameMapper(feature_def)
     >>> data5 = pd.DataFrame({
     ...     'col1': ['yes', 'no', 'yes'],
@@ -318,22 +319,24 @@ If it is required to override some of transformer parameters, then a dict with '
 transformer parameters should be provided. For example, consider a dataset with missing values.
 Then the following code could be used to override the default imputing strategy:
 
+    >>> from sklearn.impute import SimpleImputer
+    >>> import numpy as np
     >>> feature_def = gen_features(
     ...     columns=[['col1'], ['col2'], ['col3']],
-    ...     classes=[{'class': sklearn.preprocessing.Imputer, 'strategy': 'most_frequent'}]
+    ...     classes=[{'class': SimpleImputer, 'strategy':'most_frequent'}]
     ... )
     >>> mapper6 = DataFrameMapper(feature_def)
     >>> data6 = pd.DataFrame({
-    ...     'col1': [None, 1, 1, 2, 3],
-    ...     'col2': [True, False, None, None, True],
-    ...     'col3': [0, 0, 0, None, None]
+    ...     'col1': [np.nan, 1, 1, 2, 3],
+    ...     'col2': [True, False, np.nan, np.nan, True],
+    ...     'col3': [0, 0, 0, np.nan, np.nan]
     ... })
     >>> mapper6.fit_transform(data6)
-    array([[1., 1., 0.],
-           [1., 0., 0.],
-           [1., 1., 0.],
-           [2., 1., 0.],
-           [3., 1., 0.]])
+    array([[1.0, True, 0.0],
+           [1.0, False, 0.0],
+           [1.0, True, 0.0],
+           [2.0, True, 0.0],
+           [3.0, True, 0.0]], dtype=object)
 
 
 Feature selection and other supervised transformations
@@ -366,59 +369,6 @@ A ``DataFrameMapper`` will return a dense feature array by default. Setting ``sp
 
 The stacking of the sparse features is done without ever densifying them.
 
-Cross-Validation
-****************
-
-Now that we can combine features from pandas DataFrames, we may want to use cross-validation to see whether our model works. ``scikit-learn<0.16.0`` provided features for cross-validation, but they expect numpy data structures and won't work with ``DataFrameMapper``.
-
-To get around this, sklearn-pandas provides a wrapper on sklearn's ``cross_val_score`` function which passes a pandas DataFrame to the estimator rather than a numpy array::
-
-    >>> pipe = sklearn.pipeline.Pipeline([
-    ...     ('featurize', mapper),
-    ...     ('lm', sklearn.linear_model.LinearRegression())])
-    >>> np.round(cross_val_score(pipe, X=data.copy(), y=data.salary, scoring='r2'), 2)
-    array([ -1.09,  -5.3 , -15.38])
-
-Sklearn-pandas' ``cross_val_score`` function provides exactly the same interface as sklearn's function of the same name.
-
-``CategoricalImputer``
-**********************
-
-Since the ``scikit-learn``  ``Imputer`` transformer currently only works with
-numbers, ``sklearn-pandas`` provides an equivalent helper transformer that
-works with strings, substituting null values with the most frequent value in
-that column. Alternatively, you can specify a fixed value to use.
-
-Example: imputing with the mode:
-
-    >>> from sklearn_pandas import CategoricalImputer
-    >>> data = np.array(['a', 'b', 'b', np.nan], dtype=object)
-    >>> imputer = CategoricalImputer()
-    >>> imputer.fit_transform(data)
-    array(['a', 'b', 'b', 'b'], dtype=object)
-
-Example: imputing with a fixed value:
-
-    >>> from sklearn_pandas import CategoricalImputer
-    >>> data = np.array(['a', 'b', 'b', np.nan], dtype=object)
-    >>> imputer = CategoricalImputer(strategy='constant', fill_value='a')
-    >>> imputer.fit_transform(data)
-    array(['a', 'b', 'b', 'a'], dtype=object)
-
-
-``FunctionTransformer``
-***********************
-
-Often one wants to apply simple transformations to data such as ``np.log``. ``FunctionTransformer`` is a simple wrapper that takes any function and applies vectorization so that it can be used as a transformer.
-
-Example:
-
-    >>> from sklearn_pandas import FunctionTransformer
-    >>> array = np.array([10, 100])
-    >>> transformer = FunctionTransformer(np.log10)
-
-    >>> transformer.fit_transform(array)
-    array([1., 2.])
 
 Changelog
 ---------
diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py
@@ -267,9 +267,10 @@ def get_names(self, columns, transformer, x, alias=None, prefix='',
             # Otherwise use the only estimator present
             else:
                 names = _get_feature_names(transformer)
+
             if names is not None and len(names) == num_cols:
-                output = ['%s_%s' % (name, o) for o in names]
-            # otherwise, return name concatenated with '_1', '_2', etc.
+                output = [f"{name}_{o}" for o in names]
+                # otherwise (else branch below), name is suffixed with '_0', '_1', etc.
             else:
                 output = [name + '_' + str(o) for o in range(num_cols)]
         else:
@@ -282,7 +283,6 @@ def get_names(self, columns, transformer, x, alias=None, prefix='',
         suffix = suffix or ''
         return ['{}{}{}'.format(prefix, x, suffix) for x in output]
 
-
     def get_dtypes(self, extracted):
         dtypes_features = [self.get_dtype(ex) for ex in extracted]
         return [dtype for dtype_feature in dtypes_features
@@ -329,6 +329,7 @@ def _transform(self, X, y=None, do_fit=False):
             alias = options.get('alias')
             prefix = options.get('prefix')
             suffix = options.get('suffix')
+
             self.transformed_names_ += self.get_names(
                 columns, transformers, Xt, alias, prefix, suffix)
 
diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py
@@ -334,8 +334,8 @@ def test_onehot_df():
     transformed = mapper.fit_transform(df)
     cols = transformed.columns
     assert len(cols) == 4
-    assert cols[0] == 'target_0'
-    assert cols[3] == 'target_3'
+    assert cols[0] == 'target_x0_0'
+    assert cols[3] == 'target_x0_3'
 
 
 def test_customtransform_df():
@@ -757,35 +757,6 @@ def test_list_transformers_old_unpickle(simple_dataframe):
     assert isinstance(transformer.steps[0][1], MockXTransformer)
 
 
-def test_default_old_unpickle(simple_dataframe):
-    mapper = DataFrameMapper([('a', None)])
-    # simulate the mapper was pickled before the ``default`` init argument
-    # existed
-    del mapper.default
-    mapper_pickled = pickle.dumps(mapper)
-
-    loaded_mapper = pickle.loads(mapper_pickled)
-    loaded_mapper.fit_transform(simple_dataframe)  # doesn't fail
-
-
-def test_build_features_old_unpickle(simple_dataframe):
-    """
-    Fitted mappers pickled before the built_features and built_default
-    attributes can correctly transform
-    """
-    df = simple_dataframe
-    mapper = DataFrameMapper([('a', None)])
-    mapper.fit(df)
-
-    # simulate the mapper was pickled before the attributes existed
-    del mapper.built_features
-    del mapper.built_default
-
-    mapper_pickled = pickle.dumps(mapper)
-    loaded_mapper = pickle.loads(mapper_pickled)
-    loaded_mapper.transform(simple_dataframe)  # doesn't fail
-
-
 def test_sparse_features(simple_dataframe):
     """
     If any of the extracted features is sparse and "sparse" argument
@@ -860,7 +831,6 @@ def iris_dataframe():
         }
     )
 
-
 @pytest.fixture
 def cars_dataframe():
     return pd.read_csv("tests/test_data/cars.csv.gz", compression='gzip')
@@ -914,27 +884,6 @@ def test_with_car_dataframe(cars_dataframe):
     assert scores.mean() > 0.30
 
 
-@pytest.mark.skipIf(parse_version(sklearn_version) < parse_version('0.16'))
-def test_direct_cross_validation(iris_dataframe):
-    """
-    Starting with sklearn>=0.16.0 we no longer need CV wrappers for dataframes.
-    See https://github.com/paulgb/sklearn-pandas/issues/11
-    """
-    pipeline = Pipeline([
-        ("preprocess", DataFrameMapper([
-            ("petal length (cm)", None),
-            ("petal width (cm)", None),
-            ("sepal length (cm)", None),
-            ("sepal width (cm)", None),
-        ])),
-        ("classify", SVC(kernel='linear'))
-    ])
-    data = iris_dataframe.drop("species", axis=1)
-    labels = iris_dataframe["species"]
-    scores = sklearn_cv_score(pipeline, data, labels)
-    assert scores.mean() > 0.96
-    assert (scores.std() * 2) < 0.04
-
 
 def test_heterogeneous_output_types_input_df():
     """
diff --git a/tests/test_features_generator.py b/tests/test_features_generator.py
@@ -47,7 +47,7 @@ def test_generate_features_with_default_parameters():
     feature_defs = gen_features(columns=columns, classes=[MockClass])
     assert len(feature_defs) == len(columns)
 
-    feature_dict = dict(feature_defs)
+    feature_dict = dict([_[0:2] for _ in feature_defs])
     assert columns == sorted(feature_dict.keys())
 
     # default init arguments for MockClass for clarification.
@@ -70,7 +70,7 @@ def test_generate_features_with_several_classes():
         ]
     )
 
-    for transformers in dict(feature_defs).values():
+    for col, transformers, params in feature_defs:
         assert_attributes(transformers[0], name='class', value=1)
         assert_attributes(transformers[1], name='mockA', value=1)
         assert_attributes(transformers[2], name='mockB', value=None)

Original file line number	Diff line number	Diff line change
`@@ -47,7 +47,7 @@ def test_generate_features_with_default_parameters():`
`47`	`47`	`feature_defs = gen_features(columns=columns, classes=[MockClass])`
`48`	`48`	`assert len(feature_defs) == len(columns)`
`49`	`49`
`50`		`- feature_dict = dict(feature_defs)`
	`50`	`+ feature_dict = dict([_[0:2] for _ in feature_defs])`
`51`	`51`	`assert columns == sorted(feature_dict.keys())`
`52`	`52`
`53`	`53`	`# default init arguments for MockClass for clarification.`
`@@ -70,7 +70,7 @@ def test_generate_features_with_several_classes():`
`70`	`70`	`]`
`71`	`71`	`)`
`72`	`72`
`73`		`- for transformers in dict(feature_defs).values():`
	`73`	`+ for col, transformers, params in feature_defs:`
`74`	`74`	`assert_attributes(transformers[0], name='class', value=1)`
`75`	`75`	`assert_attributes(transformers[1], name='mockA', value=1)`
`76`	`76`	`assert_attributes(transformers[2], name='mockB', value=None)`