@@ -1,14 +1,20 @@
 """
 Utilities to build feature vectors from text documents.
 """
+import itertools
+
 import dask
 import dask.array as da
 import dask.bag as db
 import dask.dataframe as dd
+import distributed
 import numpy as np
 import scipy.sparse
 import sklearn.base
 import sklearn.feature_extraction.text
+from dask.delayed import Delayed
+from distributed import get_client, wait
+from sklearn.utils.validation import check_is_fitted


 class _BaseHasher(sklearn.base.BaseEstimator):
@@ -108,3 +114,188 @@ class FeatureHasher(_BaseHasher, sklearn.feature_extraction.text.FeatureHasher):
     @property
     def _hasher(self):
         return sklearn.feature_extraction.text.FeatureHasher
+
+
+class CountVectorizer(sklearn.feature_extraction.text.CountVectorizer):
+    """Convert a collection of text documents to a matrix of token counts
+
+    Notes
+    -----
+    When a vocabulary isn't provided, ``fit_transform`` requires two
+    passes over the dataset: one to learn the vocabulary and a second
+    to transform the data. In that case, consider persisting the data
+    in (distributed) memory before calling ``fit`` or ``transform`` if
+    it fits.
+
+    Additionally, this implementation benefits from having
+    an active ``dask.distributed.Client``, even on a single machine.
+    When a client is present, the learned ``vocabulary`` is persisted
+    in distributed memory, which saves some recomputation and redundant
+    communication.
+
+    See Also
+    --------
+    sklearn.feature_extraction.text.CountVectorizer
+
+    Examples
+    --------
+    The Dask-ML implementation currently requires that ``raw_documents``
+    be a :class:`dask.bag.Bag` of documents (strings).
+
+    >>> from dask_ml.feature_extraction.text import CountVectorizer
+    >>> import dask.bag as db
+    >>> from distributed import Client
+    >>> client = Client()
+    >>> corpus = [
+    ...     'This is the first document.',
+    ...     'This document is the second document.',
+    ...     'And this is the third one.',
+    ...     'Is this the first document?',
+    ... ]
+    >>> corpus = db.from_sequence(corpus, npartitions=2)
+    >>> vectorizer = CountVectorizer()
+    >>> X = vectorizer.fit_transform(corpus)
+    >>> X
+    dask.array<concatenate, shape=(nan, 9), dtype=int64, chunksize=(nan, 9), ...
+               chunktype=scipy.csr_matrix>
+    >>> X.compute().toarray()
+    array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
+           [0, 2, 0, 1, 0, 1, 1, 0, 1],
+           [1, 0, 0, 1, 1, 0, 1, 1, 1],
+           [0, 1, 1, 1, 0, 0, 1, 0, 1]])
+    >>> vectorizer.get_feature_names()
+    ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
+    """
+
+    def fit_transform(self, raw_documents, y=None):
+        params = self.get_params()
+        vocabulary = params.pop("vocabulary")
+
+        vocabulary_for_transform = vocabulary
+
+        if self.vocabulary is not None:
+            # Case 1: Just map transform.
+            fixed_vocabulary = True
+            n_features = vocabulary_length(vocabulary)
+            vocabulary_ = vocabulary
+        else:
+            fixed_vocabulary = False
+            # Case 2: learn vocabulary from the data.
+            vocabularies = raw_documents.map_partitions(_build_vocabulary, params)
+            vocabulary = vocabulary_for_transform = _merge_vocabulary(
+                *vocabularies.to_delayed()
+            )
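+            # Persist the merged vocabulary so the ``compute()`` below and
+            # the later transform tasks reuse one computed result instead of
+            # re-running the merge.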
+            vocabulary_for_transform = vocabulary_for_transform.persist()
+            vocabulary_ = vocabulary.compute()
+            n_features = len(vocabulary_)
+
+        result = raw_documents.map_partitions(
+            _count_vectorizer_transform, vocabulary_for_transform, params
+        )
+
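+        # Each partition yields one scipy CSR matrix; stitch them into a dask
+        # array whose per-block row counts are unknown until compute time.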
+        meta = scipy.sparse.eye(0, format="csr", dtype=self.dtype)
+        result = build_array(result, n_features, meta)
+
+        self.vocabulary_ = vocabulary_
+        self.fixed_vocabulary_ = fixed_vocabulary
+
+        return result
+
+    def transform(self, raw_documents):
+        params = self.get_params()
+        vocabulary = params.pop("vocabulary")
+
+        if vocabulary is None:
+            check_is_fitted(self, "vocabulary_")
+            vocabulary_for_transform = self.vocabulary_
+        else:
+            if isinstance(vocabulary, dict):
+                # Scatter the dict for the user: broadcast one copy to every
+                # worker instead of serializing it into each task. Fall back
+                # to a delayed value when no distributed client is running.
+                try:
+                    client = get_client()
+                except ValueError:
+                    vocabulary_for_transform = dask.delayed(vocabulary)
+                else:
+                    (vocabulary_for_transform,) = client.scatter(
+                        (vocabulary,), broadcast=True
+                    )
+            else:
+                vocabulary_for_transform = vocabulary
+
+        n_features = vocabulary_length(vocabulary_for_transform)
+        transformed = raw_documents.map_partitions(
+            _count_vectorizer_transform, vocabulary_for_transform, params
+        )
+        meta = scipy.sparse.eye(0, format="csr", dtype=self.dtype)
+        return build_array(transformed, n_features, meta)
+
+
+def build_array(bag, n_features, meta):
+    name = "from-bag-" + bag.name
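+    # Alias each bag partition (a single CSR matrix) to one (row, column)
+    # block of the output array: a task whose value is another task's key is
+    # treated by dask as a reference to that task's result.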
+    layer = {(name, i, 0): (k, i) for (k, i) in bag.__dask_keys__()}
+    dsk = dask.highlevelgraph.HighLevelGraph.from_collections(
+        name, layer, dependencies=[bag]
+    )
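+    # Row counts per block are not known ahead of time, hence the nan chunks.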
+    chunks = ((np.nan,) * bag.npartitions, (n_features,))
+    return da.Array(dsk, name, chunks, meta=meta)
+
+
+def vocabulary_length(vocabulary):
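+    # Get the vocabulary's length where it lives when possible: for a Future,
+    # ``len`` runs on the cluster and only the integer travels back.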
+    if isinstance(vocabulary, dict):
+        return len(vocabulary)
+    elif isinstance(vocabulary, Delayed):
+        try:
+            return len(vocabulary)
+        except TypeError:
+            return len(vocabulary.compute())
+    elif isinstance(vocabulary, distributed.Future):
+        client = get_client()
+        future = client.submit(len, vocabulary)
+        wait(future)
+        result = future.result()
+        return result
+    else:
+        raise ValueError(f"Unknown vocabulary type {type(vocabulary)}.")
+
+
+def _count_vectorizer_transform(partition, vocabulary, params):
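+    # Runs once per bag partition on the workers; dask resolves a delayed or
+    # scattered ``vocabulary`` into a plain dict before calling this.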
+    model = sklearn.feature_extraction.text.CountVectorizer(
+        vocabulary=vocabulary, **params
+    )
+    return model.transform(partition)
+
+
+def _build_vocabulary(partition, params):
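+    # Learn a partial vocabulary from one partition. Only the token set is
+    # returned; final column indices are assigned in _merge_vocabulary.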
+    model = sklearn.feature_extraction.text.CountVectorizer(**params)
+    model.fit(partition)
+    return set(model.vocabulary_)
+
+
+@dask.delayed
+def _merge_vocabulary(*vocabularies):
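+    # Sort the union of the per-partition token sets so that every run maps
+    # each token to the same, deterministic column index.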
+    vocabulary = {
+        key: i
+        for i, key in enumerate(
+            sorted(set(itertools.chain.from_iterable(vocabularies)))
+        )
+    }
+    return vocabulary