3 changes: 3 additions & 0 deletions CHANGES.rst
@@ -87,6 +87,9 @@ Changes
- Computing the associations in :class:`TableReport` is now deterministic and can
  be controlled by the new parameter ``subsampling_seed`` of the global configuration.
  :pr:`1775` by :user:`Thomas S. <thomass-dev>`.
- The :class:`StringEncoder` now exposes the ``vocabulary`` parameter of the
  underlying :class:`TfidfVectorizer`.
  :pr:`1819` by :user:`Eloi Massoulié <emassoulie>`.

Member:
The changelog entry was added in the wrong place; it should be at the top of the file, in the proper section.

- Added ``cast_to_str`` parameter to :class:`Cleaner` to prevent unintended
  conversion of list/object-like columns to strings unless explicitly enabled.
  :pr:`1789` by :user:`PilliSiddharth`.
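A minimal usage sketch of the ``cast_to_str`` entry above; the behaviour shown is assumed from the changelog wording only and is not verified against the implementation, and the example data is illustrative:

# Hedged illustration of the Cleaner entry above: `cast_to_str` comes from the
# changelog; its exact behaviour and default value are assumptions here.
import pandas as pd
from skrub import Cleaner

df = pd.DataFrame({"tags": [["a", "b"], ["c"], []], "city": ["Paris", "Oslo", "Lima"]})

# With cast_to_str left disabled, the list column is expected to pass through
# unchanged instead of being converted to strings.
out = Cleaner(cast_to_str=False).fit_transform(df)
print(out["tags"].tolist())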
38 changes: 26 additions & 12 deletions skrub/_string_encoder.py
@@ -61,6 +61,11 @@ class StringEncoder(TransformerMixin, SingleColumnTransformer):
        Used during randomized svd. Pass an int for reproducible results across
        multiple function calls.

    vocabulary : Mapping or iterable, default=None
Member:
Suggested change:
-    vocabulary : Mapping or iterable, default=None
+    vocabulary_ : Mapping or iterable, default=None

The scikit-learn convention requires a trailing underscore on attributes that are derived from the data.

Member:
Here vocabulary is a user-provided value though; it's not derived from the data. In fact, we never define a vocabulary_ in fit_transform.
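A minimal sketch of the convention discussed in this thread, using a hypothetical estimator (not skrub code): constructor parameters are stored verbatim without a trailing underscore, while attributes set during fit get one:

# Hypothetical estimator illustrating the naming convention; not part of this PR.
from sklearn.base import BaseEstimator


class ToyEncoder(BaseEstimator):
    def __init__(self, vocabulary=None):
        # user-provided value: stored as-is, no trailing underscore
        self.vocabulary = vocabulary

    def fit(self, X, y=None):
        # set during fit: trailing underscore
        self.vocabulary_ = self.vocabulary if self.vocabulary is not None else sorted(set(X))
        return self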

        When the "tfidf" vectorizer is used, the vocabulary mapping passed to the
        underlying vectorizer. Either a Mapping (e.g., a dict) where keys are terms
        and values are indices in the feature matrix, or an iterable over terms.

    Attributes
    ----------
    input_name_ : str
@@ -131,13 +136,15 @@ def __init__(
        analyzer="char_wb",
        stop_words=None,
        random_state=None,
        vocabulary=None,
    ):
        self.n_components = n_components
        self.vectorizer = vectorizer
        self.ngram_range = ngram_range
        self.analyzer = analyzer
        self.stop_words = stop_words
        self.random_state = random_state
        self.vocabulary = vocabulary

    def fit_transform(self, X, y=None):
        """Fit the encoder and transform a column.
@@ -165,21 +172,28 @@ def fit_transform(self, X, y=None):
                ngram_range=self.ngram_range,
                analyzer=self.analyzer,
                stop_words=self.stop_words,
                vocabulary=self.vocabulary,
            )
        elif self.vectorizer == "hashing":
            if self.vocabulary is None:
Member:
I think that having something like

            if self.vocabulary is not None:
                raise ValueError(...)

and then continuing with self.vectorizer_ would be more readable.
                self.vectorizer_ = Pipeline(
                    [
                        (
                            "hashing",
                            HashingVectorizer(
                                ngram_range=self.ngram_range,
                                analyzer=self.analyzer,
                                stop_words=self.stop_words,
                            ),
                        ),
                        ("tfidf", TfidfTransformer()),
                    ]
                )
            else:
                raise ValueError(
                    "Custom vocabulary passed to StringEncoder, unsupported by "
                    "HashingVectorizer. Rerun without a 'vocabulary' parameter."
                )
        else:
            raise ValueError(
                f"Unknown vectorizer {self.vectorizer}. Options are 'tfidf' or
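A short usage sketch of the behaviour added above, assuming a pandas column and word-level analyzer settings for illustration: a custom vocabulary is forwarded to the "tfidf" vectorizer, while combining it with the "hashing" vectorizer raises the new ValueError:

# Hedged usage sketch of the change above; the analyzer settings and the pandas
# column are illustration choices, not part of the PR.
import pandas as pd

from skrub import StringEncoder

voc = {"this": 0, "is": 1, "simple": 2}
X = pd.Series(["this is simple", "this is", "simple example"], name="col")

encoder = StringEncoder(
    n_components=2, vocabulary=voc, analyzer="word", ngram_range=(1, 1)
)
embeddings = encoder.fit_transform(X)  # features restricted to the fixed vocabulary
assert encoder.vectorizer_.vocabulary_ == voc

# With the "hashing" vectorizer, a custom vocabulary is rejected.
try:
    StringEncoder(vectorizer="hashing", vocabulary=voc).fit_transform(X)
except ValueError:
    pass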
27 changes: 27 additions & 0 deletions skrub/tests/test_string_encoder.py
@@ -310,3 +310,30 @@ def test_zero_padding_in_feature_names_out(df_module, n_components, expected_col
    feature_names = encoder.get_feature_names_out()

    assert feature_names[: len(expected_columns)] == expected_columns


def test_vocabulary_parameter(df_module):
    voc = {
        "this": 5,
        "is": 1,
        "simple": 3,
        "example": 0,
        "this is": 6,
        "is simple": 2,
        "simple example": 4,
    }
    encoder = StringEncoder(vocabulary=voc)
    X = df_module.make_column("col", [f"v{idx}" for idx in range(12)])

    encoder.fit_transform(X)
    assert encoder.vectorizer_.vocabulary_ == voc


def test_vocabulary_on_hashing_vectorizer(df_module):
    voc = {
        "this": 5,
    }
    encoder = StringEncoder(vocabulary=voc, vectorizer="hashing")
    with pytest.raises(ValueError):
        X = df_module.make_column("col", [f"v{idx}" for idx in range(12)])
        encoder.fit_transform(X)