81 changes: 45 additions & 36 deletions python_scripts/03_categorical_pipeline_column_transformer.py
@@ -62,66 +62,81 @@
# In the previous sections, we saw that we need to treat data differently
# depending on their nature (i.e. numerical or categorical).
#
# Scikit-learn provides a `ColumnTransformer` class which sends specific
# columns to a specific transformer, making it easy to fit a single predictive
# model on a dataset that combines both kinds of variables together
# (heterogeneously typed tabular data).
# Skrub is a data preprocessing library built to work seamlessly with
# scikit-learn. It provides a convenient transformer called `TableVectorizer`
# that can handle both numerical and categorical variables in a single
# transformer. It selects the columns automatically based on their `dtype`.
#
# We first define the columns depending on their data type:
# It separates the columns into four groups:
# * **low cardinality categorical columns** (categorical columns with a limited
# number of unique values, one-hot encoded by default);
# * **high cardinality categorical columns** (categorical columns with a large
# number of unique values, string encoded by default);
# * **numerical columns** (untouched by default);
# * **time columns** (columns that encode time information, as present in time
# series for instance, converted to numerical features that can be used by
# learners; for more information, see the
# [documentation](https://skrub-data.org/stable/reference/generated/skrub.DatetimeEncoder.html)).
#
# * **one-hot encoding** is applied to categorical columns. Besides, we use
# `handle_unknown="ignore"` to solve the potential issues due to rare
# categories.
# The threshold to determine whether a categorical column is of low or high
# cardinality can be set using the `cardinality_threshold` parameter. We will see
# its impact later on.
#
# We apply the following transformations:
#
# * **one-hot encoding** is applied to the low cardinality categorical columns.
# Besides, we use `handle_unknown="ignore"` to solve the potential issues due
# to rare categories (see the small illustration after the next code cell).
# * **numerical scaling** is applied to the numerical features, which are
# standardized.
#
# Now, we create our `ColumnTransformer` using the helper function
# `make_column_transformer`. We specify two values: the transformer, and the
# columns. First, let's create the preprocessors for the numerical and
# categorical parts.
# Now, we create our transformer using the `TableVectorizer` class. We specify
# the transformers to use for each kind of column. First, let's create the
# preprocessors for the numerical and low cardinality categorical parts.

# %%
from sklearn.preprocessing import OneHotEncoder, StandardScaler

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
categorical_preprocessor = OneHotEncoder(
handle_unknown="ignore", sparse_output=False
)
numerical_preprocessor = StandardScaler()
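
# %% [markdown]
# As a small illustration (with made-up toy values), `handle_unknown="ignore"`
# makes the encoder output a row of zeros for a category unseen during `fit`,
# instead of raising an error:

# %%
import pandas as pd

demo = pd.DataFrame({"workclass": ["Private", "State-gov"]})
categorical_preprocessor.fit(demo)
# "Self-employed" was not seen during fit: it is encoded as all zeros
categorical_preprocessor.transform(pd.DataFrame({"workclass": ["Self-employed"]}))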

# %% [markdown]
# Now, we create the transformer and associate each of these preprocessors with
# their respective columns.

# %%
from sklearn.compose import make_column_transformer
from skrub import TableVectorizer

preprocessor = make_column_transformer(
(categorical_preprocessor, categorical_columns),
(numerical_preprocessor, numerical_columns),
vectorizer = TableVectorizer(
low_cardinality=categorical_preprocessor, numeric=numerical_preprocessor
)
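
# %% [markdown]
# To see how the dispatching works, here is a minimal sketch that applies this
# vectorizer to a small toy dataframe (values made up for illustration).
# Columns with fewer unique values than `cardinality_threshold` go to the low
# cardinality transformer, and numerical columns to the numerical one:

# %%
import pandas as pd

toy = pd.DataFrame(
    {
        # low cardinality categorical column: one-hot encoded
        "workclass": ["Private", "State-gov", "Private", "Never-worked"],
        # numerical column: standardized by the `StandardScaler`
        "age": [25, 37, 52, 61],
    }
)
vectorizer.fit_transform(toy)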

# %% [markdown]
# We can take a minute to represent graphically the structure of a
# `ColumnTransformer`:
# `TableVectorizer`:
#
# ![columntransformer diagram](../figures/api_diagram-columntransformer.svg)
#
# A `ColumnTransformer` does the following:
# `TableVectorizer` does the following:
#
# * It **splits the columns** of the original dataset based on the column names
# or indices provided. We obtain as many subsets as the number of transformers
# passed into the `ColumnTransformer`.
# * It **splits the columns** of the original dataset based on their data type
# and cardinality (number of unique values).
# * It **transforms each subset**. A specific transformer is applied to each
# subset: it internally calls `fit_transform` or `transform`. The output of
# this step is a set of transformed datasets.
# * It then **concatenates the transformed datasets** into a single dataset.

# The important thing is that `ColumnTransformer` is like any other scikit-learn
#
# The important thing is that `TableVectorizer` is like any other scikit-learn
# transformer. In particular it can be combined with a classifier in a
# `Pipeline`:

# %%
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))
model = make_pipeline(vectorizer, LogisticRegression(max_iter=500))
model

# %% [markdown]
@@ -227,23 +242,17 @@

# %%
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder
from skrub import ToCategorical

categorical_preprocessor = OrdinalEncoder(
handle_unknown="use_encoded_value", unknown_value=-1
)
# `ToCategorical` converts the string columns to the pandas categorical dtype,
# which recent versions of scikit-learn's `HistGradientBoostingClassifier` can
# handle natively.
categorical_preprocessor = ToCategorical()

preprocessor = make_column_transformer(
(categorical_preprocessor, categorical_columns),
remainder="passthrough",
)
preprocessor = TableVectorizer(low_cardinality=categorical_preprocessor)

model = make_pipeline(preprocessor, HistGradientBoostingClassifier())

# %% [markdown]
# Now that we have created our model, we can check its generalization
# performance.

# %%
# %%time
_ = model.fit(data_train, target_train)
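
# %% [markdown]
# As a quick sketch of how to estimate the generalization performance of this
# model, we can cross-validate it on the training data seen above (a brief
# illustration rather than a full evaluation):

# %%
from sklearn.model_selection import cross_validate

cv_results = cross_validate(model, data_train, target_train)
scores = cv_results["test_score"]
print(f"Mean cross-validation accuracy: {scores.mean():.3f} ± {scores.std():.3f}")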

@@ -262,8 +271,8 @@
# %% [markdown]
# In this notebook we:
#
# * used a `ColumnTransformer` to apply different preprocessing for categorical
# * used a `TableVectorizer` to apply different preprocessing for categorical
# and numerical variables;
# * used a pipeline to chain the `ColumnTransformer` preprocessing and logistic
# * used a pipeline to chain the `TableVectorizer` preprocessing and logistic
# regression fitting;
# * saw that **gradient boosting methods** can outperform **linear models**.
65 changes: 16 additions & 49 deletions python_scripts/datasets_ames_housing.py
@@ -49,20 +49,15 @@
# Let's have a quick look at the target before focusing on the data.

# %%
target.head()
from skrub import TableReport

# %% [markdown]
# We see that the target contains continuous values. It corresponds to the price
# of a house in $. We can have a look at the target distribution.

# %%
import matplotlib.pyplot as plt

target.plot.hist(bins=20, edgecolor="black")
plt.xlabel("House price in $")
_ = plt.title("Distribution of the house price \nin Ames")
TableReport(target)

# %% [markdown]
# We see that the target contains continuous values. It corresponds to the price
# of a house in $. We can have a look at the target distribution in the
# "Distributions" tab.
#
# We see that the distribution has a long tail. It means that most of the houses
# are normally distributed but a couple of houses have a higher than normal
# value. It could be critical to take this peculiarity into account when
@@ -72,7 +67,7 @@
# house prices.

# %%
data.info()
TableReport(data)

# %% [markdown]
# Looking at the dataframe's general information, we can see that 79 features are
@@ -84,24 +79,17 @@

# %%
numerical_data = data.select_dtypes("number")
numerical_data.info()
TableReport(numerical_data, max_plot_columns=40)

# %% [markdown]
# We see that the data are mainly represented with integer numbers. Let's have a
# look at the histogram for all these features.

# %%
numerical_data.hist(
bins=20, figsize=(12, 22), edgecolor="black", layout=(9, 4)
)
plt.subplots_adjust(hspace=0.8, wspace=0.8)

# %% [markdown]
# look at the histogram for all these features in the "Distributions" tab.
#
# We see that some features have high peaks at 0. It could be linked to the
# fact that this value was assigned when the criterion did not apply, for
# instance the area of the swimming pool when no swimming pool is available.
#
# We also have some feature encoding some date (for instance year).
# We also have some features encoding a date (for instance year).
#
# This information is useful and should also be considered when designing a
# predictive model.
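
# %% [markdown]
# As a quick, illustrative sketch (the exact column names depend on the dataset
# version), one way to spot these date-related columns is to match their names:

# %%
year_columns = [
    col for col in numerical_data.columns if "Yr" in col or "Year" in col
]
numerical_data[year_columns].head()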
@@ -110,34 +98,13 @@

# %%
string_data = data.select_dtypes(object)
string_data.info()
TableReport(string_data, max_plot_columns=45)

# %% [markdown]
# These features are categorical. We can make some bar plots to see the
# category count for each feature.

# %%
from math import ceil
from itertools import zip_longest

n_string_features = string_data.shape[1]
nrows, ncols = ceil(n_string_features / 4), 4

fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(14, 80))

for feature_name, ax in zip_longest(string_data, axs.ravel()):
if feature_name is None:
# do not show the axis
ax.axis("off")
continue

string_data[feature_name].value_counts().plot.barh(ax=ax)
ax.set_title(feature_name)

plt.subplots_adjust(hspace=0.2, wspace=0.8)

# %% [markdown]
# Plotting this information allows us to answer two questions:
# These features are categorical. We can analyze the bar plots in the
# "Distributions" tab to see the category count for each feature.
#
# This allows us to answer two questions:
#
# * Are there few or many categories for a given feature?
# * Are there rare categories for some features?
20 changes: 5 additions & 15 deletions python_scripts/datasets_bike_rides.py
@@ -94,10 +94,9 @@
# We can have a first look at the target distribution.

# %%
import matplotlib.pyplot as plt
from skrub import TableReport

target.plot.hist(bins=50, edgecolor="black")
plt.xlabel("Power (W)")
TableReport(target)

# %% [markdown]
# We see a peak at 0 Watts; it corresponds to whenever our cyclist does not
@@ -144,6 +143,8 @@
data_ride, target_ride = data.loc[date_first_ride], target.loc[date_first_ride]

# %%
import matplotlib.pyplot as plt

data_ride.plot()
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
_ = plt.title("Sensor values for different cyclist measurements")
@@ -163,18 +164,7 @@
# We can check the range of the different features:

# %%
axs = data_ride.hist(figsize=(10, 12), bins=50, edgecolor="black", grid=False)
# add the units to the plots
units = [
"beats per minute",
"rotations per minute",
"meters per second",
"meters per second squared",
"%",
]
for unit, ax in zip(units, axs.ravel()):
ax.set_xlabel(unit)
plt.subplots_adjust(hspace=0.6)
TableReport(data_ride)

# %% [markdown]
# From these plots, we can see some interesting information: a cyclist is
18 changes: 6 additions & 12 deletions python_scripts/datasets_blood_transfusion.py
@@ -54,17 +54,15 @@
# columns and if any missing values are present in our dataset.

# %%
data.info()
from skrub import TableReport

TableReport(data)

# %% [markdown]
# Our dataset is made of 748 samples. All features are represented with integer
# numbers and there are no missing values. We can have a look at each feature's
# distribution.

# %%
_ = data.hist(figsize=(12, 10), bins=30, edgecolor="black")

# %% [markdown]
# distribution in the "Distributions" tab.
#
# There is nothing shocking regarding the distributions. We only observe a high
# value range for the features `"Recency"`, `"Frequency"`, and `"Monetary"`. It
# means that we have a few extremely high values for these features.
@@ -76,11 +74,7 @@
target.head()

# %%
import matplotlib.pyplot as plt

target.value_counts(normalize=True).plot.barh()
plt.xlabel("Number of samples")
_ = plt.title("Class distribution")
TableReport(target)

# %% [markdown]
# We see that the target is discrete and contains two categories: whether a
29 changes: 12 additions & 17 deletions python_scripts/datasets_california_housing.py
@@ -18,7 +18,7 @@
california_housing = fetch_california_housing(as_frame=True)

# %% [markdown]
# We can have a first look at the available description
# We can have a first look at the available description.

# %%
print(california_housing.DESCR)
@@ -67,15 +67,13 @@
# * all features are numerical features encoded as floating point numbers;
# * there are no missing values.
#
# Let's have a quick look at the distribution of these features by plotting
# their histograms.
# Let's have a quick look at the distribution of these features with the
# `TableReport` class from the `skrub` package.

# %%
import matplotlib.pyplot as plt

california_housing.frame.hist(figsize=(12, 10), bins=30, edgecolor="black")
plt.subplots_adjust(hspace=0.7, wspace=0.4)
from skrub import TableReport

TableReport(california_housing.frame)

# %% [markdown]
# We can first focus on features whose distributions would be more or
@@ -95,16 +93,12 @@
# population, the range of the data is large with barely noticeable bins for
# the largest values. It means that there are a few very high values (maybe
# they could be considered as outliers?). We can see this specificity by
# looking at the
# statistics for these features:

# %%
features_of_interest = ["AveRooms", "AveBedrms", "AveOccup", "Population"]
california_housing.frame[features_of_interest].describe()

# %% [markdown]
# For each of these features, comparing the `max` and `75%` values, we can see a
# huge difference. It confirms the intuition that there are a couple of extreme
# values.
# statistics for these features: click on the corresponding columns in the
# table.
#
# For each of these features, comparing the `max` and `Median ± IQR` values, we
# can see a huge difference. It confirms the intuition that there are a couple
# of extreme values.
#
# Up to now, we discarded the longitude and latitude that carry geographical
# information. In short, the combination of these features could help us decide
@@ -115,6 +109,7 @@

# %%
import seaborn as sns
import matplotlib.pyplot as plt

sns.scatterplot(
data=california_housing.frame,