Fixed vocabulary length for remote vocabulary (#719)

TomAugspurger · web-flow · commit f97ba247dc7d · 2020-08-05T13:20:50.000-05:00
* Fixed vocabulary length for remote vocabulary. Closes #712. * Optimize fit, transform.
diff --git a/dask_ml/feature_extraction/text.py b/dask_ml/feature_extraction/text.py
@@ -186,8 +186,8 @@ def fit_transform(self, raw_documents, y=None):
             )
             vocabulary_for_transform = vocabulary_for_transform.persist()
             vocabulary_ = vocabulary.compute()
+            n_features = len(vocabulary_)
 
-        n_features = len(vocabulary_)
         result = raw_documents.map_partitions(
             _count_vectorizer_transform, vocabulary_for_transform, params
         )
@@ -206,20 +206,20 @@ def transform(self, raw_documents):
 
         if vocabulary is None:
             check_is_fitted(self, "vocabulary_")
-            vocabulary_for_transform = self.vocabulary_
-        else:
-            if isinstance(vocabulary, dict):
-                # scatter for the user
-                try:
-                    client = get_client()
-                except ValueError:
-                    vocabulary_for_transform = dask.delayed(vocabulary)
-                else:
-                    (vocabulary_for_transform,) = client.scatter(
-                        (vocabulary,), broadcast=True
-                    )
+            vocabulary = self.vocabulary_
+
+        if isinstance(vocabulary, dict):
+            # scatter for the user
+            try:
+                client = get_client()
+            except ValueError:
+                vocabulary_for_transform = dask.delayed(vocabulary)
             else:
-                vocabulary_for_transform = vocabulary
+                (vocabulary_for_transform,) = client.scatter(
+                    (vocabulary,), broadcast=True
+                )
+        else:
+            vocabulary_for_transform = vocabulary
 
         n_features = vocabulary_length(vocabulary_for_transform)
         transformed = raw_documents.map_partitions(
diff --git a/tests/feature_extraction/test_text.py b/tests/feature_extraction/test_text.py
@@ -177,3 +177,9 @@ def test_count_vectorizer_remote_vocabulary():
         assert isinstance(r2, da.Array)
         assert isinstance(r2._meta, scipy.sparse.csr_matrix)
         np.testing.assert_array_equal(r1.toarray(), r2.compute().toarray())
+
+        m = dask_ml.feature_extraction.text.CountVectorizer(
+            vocabulary=remote_vocabulary
+        )
+        m.fit_transform(b)
+        assert m.vocabulary_ is remote_vocabulary