diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py
index 921950cf6..7f72e3790 100644
--- a/python_scripts/03_categorical_pipeline_column_transformer.py
+++ b/python_scripts/03_categorical_pipeline_column_transformer.py
@@ -62,27 +62,44 @@
 # In the previous sections, we saw that we need to treat data differently
 # depending on their nature (i.e. numerical or categorical).
 #
-# Scikit-learn provides a `ColumnTransformer` class which sends specific
-# columns to a specific transformer, making it easy to fit a single predictive
-# model on a dataset that combines both kinds of variables together
-# (heterogeneously typed tabular data).
+# Skrub is a data preprocessing library built to work seamlessly with
+# scikit-learn. It provides a convenient transformer called `TableVectorizer`
+# that can handle both numerical and categorical variables in a single
+# transformer. It automatically selects which transformer to apply to each
+# column based on its `dtype`.
 #
-# We first define the columns depending on their data type:
+# It separates the columns into four groups:
+# * **low cardinality categorical columns** (categorical columns with a limited
+#   number of unique values, one-hot encoded by default);
+# * **high cardinality categorical columns** (categorical columns with a large
+#   number of unique values, string encoded by default);
+# * **numerical columns** (untouched by default);
+# * **time columns** (columns that encode time information, as present in time
+#   series for instance, converted to numerical features that can be used by
+#   learners; for more information, see the
+#   [documentation](https://skrub-data.org/stable/reference/generated/skrub.DatetimeEncoder.html)).
 #
-# * **one-hot encoding** is applied to categorical columns. Besides, we use
-#   `handle_unknown="ignore"` to solve the potential issues due to rare
-#   categories.
+# The threshold to determine whether a categorical column is of low or high
+# cardinality can be set using the `cardinality_threshold` parameter. We will see
+# its impact later on.
+#
+# We apply the following transformations:
+#
+# * **one-hot encoding** is applied to the low cardinality categorical columns.
+#   Besides, we use `handle_unknown="ignore"` to solve the potential issues due
+#   to rare categories.
 # * **numerical scaling** numerical features which will be standardized.
 #
-# Now, we create our `ColumnTransfomer` using the helper function
-# `make_column_transformer`. We specify two values: the transformer, and the
-# columns. First, let's create the preprocessors for the numerical and
-# categorical parts.
+# Now, we create our transformer using the `TableVectorizer` class. We specify
+# the transformers to use for each group. First, let's create the preprocessors
+# for the numerical and low cardinality categorical parts.
 
 # %%
 from sklearn.preprocessing import OneHotEncoder, StandardScaler
 
-categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
+categorical_preprocessor = OneHotEncoder(
+    handle_unknown="ignore", sparse_output=False
+)
 numerical_preprocessor = StandardScaler()
 
 # %% [markdown]
@@ -90,30 +107,28 @@
 # their respective columns.
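A minimal sketch of the `cardinality_threshold` parameter described above (an editor aside, not part of the patch), assuming skrub is installed and `data` is the adult census dataframe loaded earlier in this notebook:

# %%
from skrub import TableVectorizer

# Roughly, categorical columns with fewer unique values than the threshold stay
# in the low cardinality group (one-hot encoded by default); the others go to
# the high cardinality encoder. `data` is assumed from earlier cells.
demo_vectorizer = TableVectorizer(cardinality_threshold=5)
demo_output = demo_vectorizer.fit_transform(data)
demo_output.shape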
 
 # %%
-from sklearn.compose import make_column_transformer
+from skrub import TableVectorizer
 
-preprocessor = make_column_transformer(
-    (categorical_preprocessor, categorical_columns),
-    (numerical_preprocessor, numerical_columns),
+vectorizer = TableVectorizer(
+    low_cardinality=categorical_preprocessor, numeric=numerical_preprocessor
 )
 
 # %% [markdown]
 # We can take a minute to represent graphically the structure of a
-# `ColumnTransformer`:
+# `TableVectorizer`:
 #
 # ![columntransformer diagram](../figures/api_diagram-columntransformer.svg)
 #
-# A `ColumnTransformer` does the following:
+# `TableVectorizer` does the following:
 #
-# * It **splits the columns** of the original dataset based on the column names
-#   or indices provided. We obtain as many subsets as the number of transformers
-#   passed into the `ColumnTransformer`.
+# * It **splits the columns** of the original dataset based on their data type
+#   and cardinality (number of unique values).
 # * It **transforms each subsets**. A specific transformer is applied to each
 #   subset: it internally calls `fit_transform` or `transform`. The output of
 #   this step is a set of transformed datasets.
 # * It then **concatenates the transformed datasets** into a single dataset.
-
-# The important thing is that `ColumnTransformer` is like any other scikit-learn
+#
+# The important thing is that `TableVectorizer` is like any other scikit-learn
 # transformer. In particular it can be combined with a classifier in a
 # `Pipeline`:
 
@@ -121,7 +136,7 @@
 from sklearn.linear_model import LogisticRegression
 from sklearn.pipeline import make_pipeline
 
-model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))
+model = make_pipeline(vectorizer, LogisticRegression(max_iter=500))
 model
 
 # %% [markdown]
@@ -227,23 +242,17 @@
 
 # %%
 from sklearn.ensemble import HistGradientBoostingClassifier
-from sklearn.preprocessing import OrdinalEncoder
+from skrub import ToCategorical
 
-categorical_preprocessor = OrdinalEncoder(
-    handle_unknown="use_encoded_value", unknown_value=-1
-)
+categorical_preprocessor = ToCategorical()
 
-preprocessor = make_column_transformer(
-    (categorical_preprocessor, categorical_columns),
-    remainder="passthrough",
-)
+preprocessor = TableVectorizer(low_cardinality=categorical_preprocessor)
 
 model = make_pipeline(preprocessor, HistGradientBoostingClassifier())
 
 # %% [markdown]
 # Now that we created our model, we can check its generalization performance.
 
-# %%
 # %%time
 _ = model.fit(data_train, target_train)
 
@@ -262,8 +271,8 @@
 # %% [markdown]
 # In this notebook we:
 #
-# * used a `ColumnTransformer` to apply different preprocessing for categorical
+# * used a `TableVectorizer` to apply different preprocessing for categorical
 #   and numerical variables;
-# * used a pipeline to chain the `ColumnTransformer` preprocessing and logistic
+# * used a pipeline to chain the `TableVectorizer` preprocessing and logistic
 #   regression fitting;
 # * saw that **gradient boosting methods** can outperform **linear models**.
diff --git a/python_scripts/datasets_ames_housing.py b/python_scripts/datasets_ames_housing.py
index c69c236b1..86c4b51d2 100644
--- a/python_scripts/datasets_ames_housing.py
+++ b/python_scripts/datasets_ames_housing.py
@@ -49,20 +49,15 @@
 # Let's have a quick look at the target before to focus on the data.
 
 # %%
-target.head()
+from skrub import TableReport
 
-# %% [markdown]
-# We see that the target contains continuous value. It corresponds to the price
-# of a house in $. We can have a look at the target distribution.
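Since `TableVectorizer` behaves like any other scikit-learn transformer, the pipelines assembled in the patch above can be evaluated in the usual way. A hedged sketch (not part of the diff), assuming the `data` and `target` objects defined earlier in the notebook:

# %%
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the preprocessing + classifier pipeline.
cv_results = cross_validate(model, data, target, cv=5)
cv_results["test_score"].mean()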
-
-# %%
-import matplotlib.pyplot as plt
-
-target.plot.hist(bins=20, edgecolor="black")
-plt.xlabel("House price in $")
-_ = plt.title("Distribution of the house price \nin Ames")
+TableReport(target)
 
 # %% [markdown]
+# We see that the target contains continuous values. It corresponds to the price
+# of a house in $. We can have a look at the target distribution in the
+# "Distributions" tab.
+#
 # We see that the distribution has a long tail. It means that most of the house
 # are normally distributed but a couple of houses have a higher than normal
 # value. It could be critical to take this peculiarity into account when
@@ -72,7 +67,7 @@
 # house prices.
 
 # %%
-data.info()
+TableReport(data)
 
 # %% [markdown]
 # Looking at the dataframe general information, we can see that 79 features are
@@ -84,24 +79,17 @@
 
 # %%
 numerical_data = data.select_dtypes("number")
-numerical_data.info()
+TableReport(numerical_data, max_plot_columns=40)
 
 # %% [markdown]
 # We see that the data are mainly represented with integer number. Let's have a
-# look at the histogram for all these features.
-
-# %%
-numerical_data.hist(
-    bins=20, figsize=(12, 22), edgecolor="black", layout=(9, 4)
-)
-plt.subplots_adjust(hspace=0.8, wspace=0.8)
-
-# %% [markdown]
+# look at the histograms for all these features in the "Distributions" tab.
+#
 # We see that some features have high picks for 0. It could be linked that this
 # value was assigned when the criterion did not apply, for instance the area of
 # the swimming pool when no swimming pools are available.
 #
-# We also have some feature encoding some date (for instance year).
+# We also have some features encoding a date (for instance year).
 #
 # These information are useful and should also be considered when designing a
 # predictive model.
@@ -110,34 +98,13 @@
 
 # %%
 string_data = data.select_dtypes(object)
-string_data.info()
+TableReport(string_data, max_plot_columns=45)
 
 # %% [markdown]
-# These features are categorical. We can make some bar plot to see categories
-# count for each feature.
-
-# %%
-from math import ceil
-from itertools import zip_longest
-
-n_string_features = string_data.shape[1]
-nrows, ncols = ceil(n_string_features / 4), 4
-
-fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(14, 80))
-
-for feature_name, ax in zip_longest(string_data, axs.ravel()):
-    if feature_name is None:
-        # do not show the axis
-        ax.axis("off")
-        continue
-
-    string_data[feature_name].value_counts().plot.barh(ax=ax)
-    ax.set_title(feature_name)
-
-plt.subplots_adjust(hspace=0.2, wspace=0.8)
-
-# %% [markdown]
-# Plotting this information allows us to answer to two questions:
+# These features are categorical. We can analyze the bar plots in the
+# "Distributions" tab to see the category counts for each feature.
+#
+# This allows us to answer two questions:
 #
 # * Is there few or many categories for a given features?
 # * Is there rare categories for some features?
diff --git a/python_scripts/datasets_bike_rides.py b/python_scripts/datasets_bike_rides.py
index 9cb2ec77a..fa1ce8019 100644
--- a/python_scripts/datasets_bike_rides.py
+++ b/python_scripts/datasets_bike_rides.py
@@ -94,10 +94,9 @@
 # We can have a first look at the target distribution.
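As an aside to the `TableReport` cells above (not part of the patch): a report can also be saved for later inspection. A minimal sketch, assuming the Ames `data` dataframe and that your skrub version provides `TableReport.write_html`:

# %%
from skrub import TableReport

# Build the report once and store it as a standalone HTML file that can be
# opened in any browser, outside of the notebook.
report = TableReport(data)
report.write_html("ames_housing_report.html")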
 
 # %%
-import matplotlib.pyplot as plt
+from skrub import TableReport
 
-target.plot.hist(bins=50, edgecolor="black")
-plt.xlabel("Power (W)")
+TableReport(target)
 
 # %% [markdown]
 # We see a pick at 0 Watts, it corresponds to whenever our cyclist does not
@@ -144,6 +143,8 @@
 data_ride, target_ride = data.loc[date_first_ride], target.loc[date_first_ride]
 
 # %%
+import matplotlib.pyplot as plt
+
 data_ride.plot()
 plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
 _ = plt.title("Sensor values for different cyclist measurements")
@@ -163,18 +164,7 @@
 # We can check the range of the different features:
 
 # %%
-axs = data_ride.hist(figsize=(10, 12), bins=50, edgecolor="black", grid=False)
-# add the units to the plots
-units = [
-    "beats per minute",
-    "rotations per minute",
-    "meters per second",
-    "meters per second squared",
-    "%",
-]
-for unit, ax in zip(units, axs.ravel()):
-    ax.set_xlabel(unit)
-plt.subplots_adjust(hspace=0.6)
+TableReport(data_ride)
 
 # %% [markdown]
 # From these plots, we can see some interesting information: a cyclist is
diff --git a/python_scripts/datasets_blood_transfusion.py b/python_scripts/datasets_blood_transfusion.py
index 1042f16f1..2f32ed005 100644
--- a/python_scripts/datasets_blood_transfusion.py
+++ b/python_scripts/datasets_blood_transfusion.py
@@ -54,17 +54,15 @@
 # columns and if any missing values are present in our dataset.
 
 # %%
-data.info()
+from skrub import TableReport
+
+TableReport(data)
 
 # %% [markdown]
 # Our dataset is made of 748 samples. All features are represented with integer
 # numbers and there is no missing values. We can have a look at each feature
-# distributions.
-
-# %%
-_ = data.hist(figsize=(12, 10), bins=30, edgecolor="black")
-
-# %% [markdown]
+# distribution in the "Distributions" tab.
+#
 # There is nothing shocking regarding the distributions. We only observe a high
 # value range for the features `"Recency"`, `"Frequency"`, and `"Monetary"`. It
 # means that we have a few extreme high values for these features.
@@ -76,11 +74,7 @@
 target.head()
 
 # %%
-import matplotlib.pyplot as plt
-
-target.value_counts(normalize=True).plot.barh()
-plt.xlabel("Number of samples")
-_ = plt.title("Class distribution")
+TableReport(target)
 
 # %% [markdown]
 # We see that the target is discrete and contains two categories: whether a
diff --git a/python_scripts/datasets_california_housing.py b/python_scripts/datasets_california_housing.py
index 16869021a..b53bc70be 100644
--- a/python_scripts/datasets_california_housing.py
+++ b/python_scripts/datasets_california_housing.py
@@ -18,7 +18,7 @@
 california_housing = fetch_california_housing(as_frame=True)
 
 # %% [markdown]
-# We can have a first look at the available description
+# We can have a first look at the available description.
 
 # %%
 print(california_housing.DESCR)
@@ -67,15 +67,13 @@
 # * all features are numerical features encoded as floating number;
 # * there is no missing values.
 #
-# Let's have a quick look at the distribution of these features by plotting
-# their histograms.
+# Let's have a quick look at the distribution of these features with the
+# `TableReport` from the skrub package.
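One more hedged aside (not part of the patch), relating to the blood transfusion target inspected above: the exact class proportions summarized by the report's bar plot can still be printed with plain pandas, which is handy when a single number is needed:

# %%
# Fraction of donors and non-donors; `target` is the blood transfusion target
# series used above.
target.value_counts(normalize=True)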
 
 # %%
-import matplotlib.pyplot as plt
-
-california_housing.frame.hist(figsize=(12, 10), bins=30, edgecolor="black")
-plt.subplots_adjust(hspace=0.7, wspace=0.4)
+from skrub import TableReport
+TableReport(california_housing.frame)
 
 
 # %% [markdown]
 # We can first focus on features for which their distributions would be more or
@@ -95,16 +93,12 @@
 # population, the range of the data is large with unnoticeable bin for the
 # largest values. It means that there are very high and few values (maybe they
 # could be considered as outliers?). We can see this specificity looking at the
-# statistics for these features:
-
-# %%
-features_of_interest = ["AveRooms", "AveBedrms", "AveOccup", "Population"]
-california_housing.frame[features_of_interest].describe()
-
-# %% [markdown]
-# For each of these features, comparing the `max` and `75%` values, we can see a
-# huge difference. It confirms the intuitions that there are a couple of extreme
-# values.
+# statistics for these features by clicking on the corresponding columns in the
+# table.
+#
+# For each of these features, comparing the `max` and `Median ± IQR` values, we
+# can see a huge difference. It confirms the intuition that there are a couple
+# of extreme values.
 #
 # Up to now, we discarded the longitude and latitude that carry geographical
 # information. In short, the combination of these features could help us decide
@@ -115,6 +109,7 @@
 
 # %%
 import seaborn as sns
+import matplotlib.pyplot as plt
 
 sns.scatterplot(
     data=california_housing.frame,