ENH: Support user categories in OneHotEncoder (#727)

TomAugspurger · web-flow · commit 68c9bab4a5c6 · 2020-08-17T13:08:56.000-05:00
* ENH: Support user categories in OneHotEncoder

  Allows for

  ```python
  ohe = OneHotEncoder(categories=[['a', 'b'], ['c', 'd']])
  ```

  Previously, we required DataFrame inputs to have CategoricalDtype columns.

  Closes #726

* Bump the minimum supported pandas version (0.23.4 -> 0.24.2)
diff --git a/ci/environment-3.6.yaml b/ci/environment-3.6.yaml
@@ -17,7 +17,7 @@ dependencies:
   - numpy ==1.17.3
   - numpydoc
   - packaging
-  - pandas =0.23.4
+  - pandas =0.24.2
   - psutil
   - pytest
   - pytest-cov
diff --git a/dask_ml/preprocessing/_encoders.py b/dask_ml/preprocessing/_encoders.py
@@ -191,19 +191,21 @@ def _fit(self, X: Union[ArrayLike, DataFrameType], handle_unknown: str = "error"
                 self.categories_.append(cats)
                 self.dtypes_.append(None)
         else:
-            if not (X.dtypes == "category").all():
-                raise ValueError("All columns must be Categorical dtype.")
-            if self.categories == "auto":
-                for col in X.columns:
-                    Xi = X[col]
-                    cats = _encode(Xi, uniques=Xi.cat.categories)
-                    self.categories_.append(cats)
-                    self.dtypes_.append(Xi.dtype)
-            else:
-                raise ValueError(
-                    "Cannot specify 'categories' with DataFrame input. "
-                    "Use a categorical dtype instead."
-                )
+            for i in range(len(X.columns)):
+                Xi = X.iloc[:, i]
+                if self.categories != "auto":
+                    categories = self.categories[i]
+                    Xi = Xi.astype(pd.CategoricalDtype(categories))
+                else:
+                    if not pd.api.types.is_categorical_dtype(Xi.dtype):
+                        raise ValueError(
+                            "All columns must be Categorical dtype when "
+                            "'categories=\"auto\"'."
+                        )
+
+                cats = _encode(Xi, uniques=Xi.cat.categories)
+                self.categories_.append(cats)
+                self.dtypes_.append(Xi.dtype)
 
         self.categories_ = dask.compute(self.categories_)[0]
 
@@ -250,23 +252,25 @@ def _transform(
         else:
             import dask.dataframe as dd
 
-            # Validate that all are categorical.
-            if not (X.dtypes == "category").all():
-                raise ValueError("Must be all categorical.")
+            X = X.copy()
 
             if not len(X.columns) == len(self.categories_):
                 raise ValueError(
                     "Number of columns ({}) does not match number "
                     "of categories_ ({})".format(len(X.columns), len(self.categories_))
                 )
 
-            for col, dtype in zip(X.columns, self.dtypes_):
-                if not (X[col].dtype == dtype):
+            for i, (col, dtype) in enumerate(zip(X.columns, self.dtypes_)):
+                Xi = X.iloc[:, i]
+                if not pd.api.types.is_categorical_dtype(Xi.dtype):
+                    Xi = Xi.astype(dtype)
+                    X[col] = Xi
+
+                if Xi.dtype != dtype:
                     raise ValueError(
-                        "Different CategoricalDtype for fit and "
-                        "transform. '{}' != {}'".format(dtype, X[col].dtype)
+                        "Different CategoricalDtype for fit and transform. "
+                        "{!r} != {!r}".format(Xi.dtype, dtype)
                     )
-
             return dd.get_dummies(X, sparse=self.sparse, dtype=self.dtype)
 
         return X
diff --git a/setup.py b/setup.py
@@ -15,7 +15,7 @@
     "distributed>=2.4.0",
     "numba",
     "numpy>=1.17.3",
-    "pandas>=0.23.4",
+    "pandas>=0.24.2",
     "scikit-learn>=0.23",
     "scipy",
     "dask-glm>=0.2.0",
diff --git a/tests/preprocessing/test_encoders.py b/tests/preprocessing/test_encoders.py
@@ -117,6 +117,31 @@ def test_onehotencoder_drop_raises():
         dask_ml.preprocessing.OneHotEncoder(drop="first")
 
 
+def test_onehotencoder_dataframe_with_categories():
+    # https://github.com/dask/dask-ml/issues/726
+    enc = dask_ml.preprocessing.OneHotEncoder(
+        categories=[["a", "b", "c"], ["a", "b"]], sparse=False  # one list per column
+    )
+    ddf = dd.from_pandas(
+        pd.DataFrame({"A": ["a", "b", "b", "a"], "B": ["a", "b", "b", "b"]}),
+        npartitions=1,
+    )
+    result = enc.fit_transform(ddf)  # object columns, no CategoricalDtype needed
+    expected = dd.from_pandas(
+        pd.DataFrame(
+            {
+                "A_a": [1, 0, 0, 1],
+                "A_b": [0, 1, 1, 0],
+                "A_c": [0, 0, 0, 0],  # "c" is declared but never occurs in A
+                "B_a": [1, 0, 0, 0],
+                "B_b": [0, 1, 1, 1],  # B is ["a", "b", "b", "b"]
+            }
+        ),
+        npartitions=1,
+    )
+    assert_estimator_equal(result, expected)  # NOTE(review): verify values are compared
+
+
 def test_handles_numpy():
     enc = dask_ml.preprocessing.OneHotEncoder()
     enc.fit(X)
@@ -132,26 +157,26 @@ def test_dataframe_requires_all_categorical(data):
     assert e.match("All columns must be Categorical dtype")
 
 
-@pytest.mark.parametrize("data", [df, ddf])
-def test_dataframe_prohibits_categories(data):
-    enc = dask_ml.preprocessing.OneHotEncoder(categories=[["a", "b"]])
-    with pytest.raises(ValueError) as e:
-        enc.fit(data)
-
-    assert e.match("Cannot specify 'categories'")
-
-
 def test_unknown_category_transform():
     df2 = ddf.copy()
     df2["A"] = ddf.A.cat.add_categories("new!")
 
     enc = dask_ml.preprocessing.OneHotEncoder()
     enc.fit(ddf)
 
-    with pytest.raises(ValueError) as e:
+    with pytest.raises(ValueError, match="Different CategoricalDtype"):
         enc.transform(df2)
 
-    assert e.match("Different CategoricalDtype for fit and transform")
+
+def test_different_shape_raises():
+    df2 = ddf.copy()
+    df2["B"] = ddf.A.cat.add_categories("new!")  # presumably ddf has no "B"; confirm
+
+    enc = dask_ml.preprocessing.OneHotEncoder()
+    enc.fit(ddf)  # fitted on the unmodified frame
+
+    with pytest.raises(ValueError, match="Number of columns"):
+        enc.transform(df2)  # expected to fail the column-count check
 
 
 @pytest.mark.skipif(not DASK_2_20_0, reason="Fixed in Dask 2.20.0")
diff --git a/tests/test_incremental_pca.py b/tests/test_incremental_pca.py
@@ -22,6 +22,7 @@
 
 @pytest.mark.parametrize("svd_solver", ["full", "auto", "randomized"])
 @pytest.mark.parametrize("batch_number", [3, 10])
+@pytest.mark.filterwarnings("ignore:invalid value:RuntimeWarning")
 def test_compare_with_sklearn(svd_solver, batch_number):
     X = iris.data
     X_da = da.from_array(X, chunks=(3, -1))
@@ -52,14 +53,15 @@ def test_compare_with_sklearn(svd_solver, batch_number):
 
 
 @pytest.mark.parametrize("svd_solver", ["full", "auto", "randomized"])
+@pytest.mark.filterwarnings("ignore:invalid value:RuntimeWarning")
 def test_incremental_pca(svd_solver):
     # Incremental PCA on dense arrays.
     X = iris.data
     X = da.from_array(X, chunks=(3, -1))
     batch_size = X.shape[0] // 3
     ipca = IncrementalPCA(n_components=2, batch_size=batch_size, svd_solver=svd_solver)
     pca = PCA(n_components=2, svd_solver=svd_solver)
-    pca.fit_transform(X)
+    pca.fit_transform(X.compute())
 
     X_transformed = ipca.fit_transform(X)
 
@@ -87,6 +89,7 @@ def test_incremental_pca(svd_solver):
         )
 
 
+@pytest.mark.filterwarnings("ignore:invalid value:RuntimeWarning")
 def test_incremental_pca_check_projection():
     # Test that the projection of data is correct.
     rng = np.random.RandomState(1999)
@@ -111,6 +114,7 @@ def test_incremental_pca_check_projection():
     assert_almost_equal(np.abs(Yt[0][0]), 1.0, 1)
 
 
+@pytest.mark.filterwarnings("ignore:invalid value:RuntimeWarning")
 def test_incremental_pca_inverse():
     # Test that the projection of data can be inverted.
     rng = np.random.RandomState(1999)
@@ -154,6 +158,7 @@ def test_incremental_pca_validation():
         IncrementalPCA(n_components=n_components).partial_fit(X)
 
 
+@pytest.mark.filterwarnings("ignore:invalid value:RuntimeWarning")
 def test_n_components_none():
     # Ensures that n_components == None is handled correctly
     rng = np.random.RandomState(1999)
@@ -173,6 +178,7 @@ def test_n_components_none():
         assert ipca.n_components_ == ipca.components_.shape[0]
 
 
+@pytest.mark.filterwarnings("ignore:invalid value:RuntimeWarning")
 def test_incremental_pca_set_params():
     # Test that components_ sign is stable over batch sizes.
     rng = np.random.RandomState(1999)
@@ -200,6 +206,7 @@ def test_incremental_pca_set_params():
     ipca.partial_fit(X)
 
 
+@pytest.mark.filterwarnings("ignore:invalid value:RuntimeWarning")
 def test_incremental_pca_num_features_change():
     # Test that changing n_components will raise an error.
     rng = np.random.RandomState(1999)
@@ -215,6 +222,7 @@ def test_incremental_pca_num_features_change():
         ipca.partial_fit(X2)
 
 
+@pytest.mark.filterwarnings("ignore:invalid value:RuntimeWarning")
 def test_incremental_pca_batch_signs():
     # Test that components_ sign is stable over batch sizes.
     rng = np.random.RandomState(1999)
@@ -232,6 +240,7 @@ def test_incremental_pca_batch_signs():
         assert_almost_equal(np.sign(i), np.sign(j), decimal=6)
 
 
+@pytest.mark.filterwarnings("ignore:invalid value:RuntimeWarning")
 def test_incremental_pca_batch_values():
     # Test that components_ values are stable over batch sizes.
     rng = np.random.RandomState(1999)
@@ -249,6 +258,7 @@ def test_incremental_pca_batch_values():
         assert_almost_equal(i, j, decimal=1)
 
 
+@pytest.mark.filterwarnings("ignore:invalid value:RuntimeWarning")
 def test_incremental_pca_batch_rank():
     # Test sample size in each batch is always larger or equal to n_components
     rng = np.random.RandomState(1999)
@@ -266,6 +276,7 @@ def test_incremental_pca_batch_rank():
         assert_allclose_dense_sparse(components_i, components_j)
 
 
+@pytest.mark.filterwarnings("ignore:invalid value:RuntimeWarning")
 def test_incremental_pca_partial_fit():
     # Test that fit and partial_fit get equivalent results.
     rng = np.random.RandomState(1999)
@@ -288,12 +299,13 @@ def test_incremental_pca_partial_fit():
 
 
 @pytest.mark.parametrize("svd_solver", ["full", "auto", "randomized"])
+@pytest.mark.filterwarnings("ignore:invalid value:RuntimeWarning")
 def test_incremental_pca_against_pca_iris(svd_solver):
     # Test that IncrementalPCA and PCA are approximate (to a sign flip).
     X = iris.data
     X = da.from_array(X, chunks=[50, -1])
 
-    Y_pca = PCA(n_components=2, svd_solver=svd_solver).fit_transform(X)
+    Y_pca = PCA(n_components=2, svd_solver=svd_solver).fit_transform(X.compute())
     Y_ipca = IncrementalPCA(
         n_components=2, batch_size=25, svd_solver=svd_solver
     ).fit_transform(X)
@@ -302,6 +314,7 @@ def test_incremental_pca_against_pca_iris(svd_solver):
 
 
 @pytest.mark.parametrize("svd_solver", ["full", "auto", "randomized"])
+@pytest.mark.filterwarnings("ignore:invalid value:RuntimeWarning")
 def test_incremental_pca_against_pca_random_data(svd_solver):
     # Test that IncrementalPCA and PCA are approximate (to a sign flip).
     rng = np.random.RandomState(1999)
@@ -310,7 +323,7 @@ def test_incremental_pca_against_pca_random_data(svd_solver):
     X = rng.randn(n_samples, n_features) + 5 * rng.rand(1, n_features)
     X = da.from_array(X, chunks=[40, -1])
 
-    Y_pca = PCA(n_components=3, svd_solver=svd_solver).fit_transform(X)
+    Y_pca = PCA(n_components=3, svd_solver=svd_solver).fit_transform(X.compute())
     Y_ipca = IncrementalPCA(
         n_components=3, batch_size=25, svd_solver=svd_solver
     ).fit_transform(X)
@@ -319,6 +332,7 @@ def test_incremental_pca_against_pca_random_data(svd_solver):
 
 
 @pytest.mark.parametrize("svd_solver", ["full", "auto", "randomized"])
+@pytest.mark.filterwarnings("ignore:invalid value:RuntimeWarning")
 def test_explained_variances(svd_solver):
     # Test that PCA and IncrementalPCA calculations match
     X = datasets.make_low_rank_matrix(
@@ -328,7 +342,7 @@ def test_explained_variances(svd_solver):
     prec = 3
     n_samples, n_features = X.shape
     for nc in [None, 99]:
-        pca = PCA(n_components=nc, svd_solver=svd_solver).fit(X)
+        pca = PCA(n_components=nc, svd_solver=svd_solver).fit(X.compute())
         ipca = IncrementalPCA(
             n_components=nc, batch_size=100, svd_solver=svd_solver
         ).fit(X)
@@ -342,6 +356,7 @@ def test_explained_variances(svd_solver):
 
 
 @pytest.mark.parametrize("svd_solver", ["full", "auto", "randomized"])
+@pytest.mark.filterwarnings("ignore:invalid value:RuntimeWarning")
 def test_singular_values(svd_solver):
     # Check that the IncrementalPCA output has the correct singular values
 
@@ -354,7 +369,7 @@ def test_singular_values(svd_solver):
     )
     X = da.from_array(X, chunks=[200, -1])
 
-    pca = PCA(n_components=10, svd_solver=svd_solver, random_state=rng).fit(X)
+    pca = PCA(n_components=10, svd_solver=svd_solver, random_state=rng).fit(X.compute())
     ipca = IncrementalPCA(n_components=10, batch_size=100, svd_solver=svd_solver).fit(X)
     assert_array_almost_equal(pca.singular_values_, ipca.singular_values_, 2)
 
@@ -389,7 +404,7 @@ def test_singular_values(svd_solver):
     pca = PCA(n_components=3, svd_solver=svd_solver, random_state=rng)
     ipca = IncrementalPCA(n_components=3, batch_size=100, svd_solver=svd_solver)
 
-    X_pca = pca.fit_transform(X)
+    X_pca = pca.fit_transform(X.compute())
     X_pca /= np.sqrt(np.sum(X_pca ** 2.0, axis=0))
     X_pca[:, 0] *= 3.142
     X_pca[:, 1] *= 2.718
@@ -403,6 +418,7 @@ def test_singular_values(svd_solver):
 
 
 @pytest.mark.parametrize("svd_solver", ["full", "auto", "randomized"])
+@pytest.mark.filterwarnings("ignore:invalid value:RuntimeWarning")
 def test_whitening(svd_solver):
     # Test that PCA and IncrementalPCA transforms match to sign flip.
     X = datasets.make_low_rank_matrix(
@@ -412,7 +428,7 @@ def test_whitening(svd_solver):
     prec = 3
     n_samples, n_features = X.shape
     for nc in [None, 9]:
-        pca = PCA(whiten=True, n_components=nc, svd_solver=svd_solver).fit(X)
+        pca = PCA(whiten=True, n_components=nc, svd_solver=svd_solver).fit(X.compute())
         ipca = IncrementalPCA(
             whiten=True, n_components=nc, batch_size=250, svd_solver=svd_solver
         ).fit(X)
@@ -427,6 +443,7 @@ def test_whitening(svd_solver):
         assert_almost_equal(Xinv_pca, Xinv_ipca, decimal=prec)
 
 
+@pytest.mark.filterwarnings("ignore:invalid value:RuntimeWarning")
 def test_incremental_pca_partial_fit_float_division():
     # Test to ensure float division is used in all versions of Python
     # (non-regression test for issue #9489)
diff --git a/tests/test_parallel_post_fit.py b/tests/test_parallel_post_fit.py
@@ -93,12 +93,12 @@ def test_transform(kind):
     base = PCA(random_state=0)
     wrap = ParallelPostFit(PCA(random_state=0))
 
-    base.fit(X, y)
-    wrap.fit(X, y)
+    base.fit(*dask.compute(X, y))
+    wrap.fit(*dask.compute(X, y))
 
     assert_estimator_equal(wrap.estimator, base)
 
-    result = base.transform(X)
+    result = base.transform(*dask.compute(X))
     expected = wrap.transform(X)
     assert_eq_ar(result, expected)