1212
1313
1414class Preprocessor (TransformerMixin ):
15+ """
16+ Preprocessor class for automated tabular feature preprocessing using scikit-learn-compatible pipelines.
17+
18+ This class provides a flexible interface for preprocessing tabular datasets containing numerical and
19+ categorical features. It automatically detects feature types, applies user-defined or default preprocessing
20+ strategies, and supports both dictionary and array-style outputs. It also supports integration with external
21+ embedding vectors.
22+
23+ Features
24+ --------
25+ - Supports a wide range of preprocessing methods for numerical and categorical features.
26+ - Automatically detects feature types (numerical vs. categorical).
27+ - Compatible with both pandas DataFrames and NumPy arrays.
28+ - Handles external embedding arrays for models that require learned representations.
29+ - Returns either a dictionary of transformed feature blocks or a single NumPy array.
30+ - Fully compatible with scikit-learn transformers and pipelines.
31+
32+ Parameters
33+ ----------
34+ feature_preprocessing : dict, optional
35+ Dictionary mapping feature names to specific preprocessing methods. Overrides global defaults.
36+ n_bins : int, default=64
37+ Number of bins used for binning-based preprocessing (e.g., for discretizers or PLE).
38+ numerical_preprocessing : str, default="ple"
39+ Preprocessing method for numerical features (e.g., "standardization", "minmax", "ple", "rbf").
40+ categorical_preprocessing : str, default="int"
41+ Preprocessing method for categorical features (e.g., "int", "ordinal", "onehot").
42+ use_decision_tree_bins : bool, default=False
43+ Whether to use decision tree binning for numerical discretization.
44+ binning_strategy : str, default="uniform"
45+ Strategy for bin placement when not using tree-based methods. Options: "uniform", "quantile".
46+ task : str, default="regression"
47+ Problem type used to guide preprocessing (e.g., "regression" or "classification").
48+ cat_cutoff : float or int, default=0.03
49+ Threshold to determine whether integer-valued features are treated as categorical.
50+ treat_all_integers_as_numerical : bool, default=False
51+ If True, treat all integer-typed columns as numerical regardless of cardinality.
52+ degree : int, default=3
53+ Degree of polynomial or spline basis functions where applicable.
54+ scaling_strategy : str, default="minmax"
55+ Strategy for feature scaling (e.g., "standardization", "minmax").
56+ n_knots : int, default=64
57+ Number of knots used in spline-based feature expansions.
58+ use_decision_tree_knots : bool, default=True
59+ Whether to use decision tree-based knot placement for spline transformations.
60+ knots_strategy : str, default="uniform"
61+ Strategy for placing knots for splines ("uniform" or "quantile").
62+ spline_implementation : str, default="sklearn"
63+ Which spline backend implementation to use (e.g., "sklearn", "custom").
64+ min_unique_vals : int, default=5
65+ Minimum number of unique values required for a feature to be treated as numerical.
66+
67+ Attributes
68+ ----------
69+ column_transformer : ColumnTransformer
70+ The internal scikit-learn column transformer that handles feature-wise preprocessing.
71+ fitted : bool
72+ Whether the preprocessor has been fitted.
73+ embeddings : bool
74+ Whether embedding vectors are expected and used in transformation.
75+ embedding_dimensions : dict
76+ Dictionary of embedding feature names to their expected dimensionality.
77+
78+ Examples
79+ --------
80+ >>> from prefab import Preprocessor
81+ >>> import pandas as pd
82+ >>> df = pd.DataFrame({
83+ ... "age": [25, 32, 47],
84+ ... "gender": ["M", "F", "F"]
85+ ... })
86+ >>> pre = Preprocessor()
87+ >>> _ = pre.fit(df)
88+ >>> out = pre.transform(df)
89+ >>> out.keys()
90+ dict_keys(['num_age', 'cat_gender'])
91+ """
92+
1593 def __init__ (
1694 self ,
1795 feature_preprocessing = None ,
@@ -31,6 +109,45 @@ def __init__(
31109 spline_implementation = "sklearn" ,
32110 min_unique_vals = 5 ,
33111 ):
112+ """
113+ Initialize the Preprocessor with various transformation options for tabular data.
114+
115+ Parameters
116+ ----------
117+ feature_preprocessing : dict, optional
118+ Dictionary specifying preprocessing methods per feature. If None, global settings are used.
119+ n_bins : int, default=64
120+ Number of bins to use for binning-based transformations.
121+ numerical_preprocessing : str, default="ple"
122+ Preprocessing strategy for numerical features.
123+ categorical_preprocessing : str, default="int"
124+ Preprocessing strategy for categorical features.
125+ use_decision_tree_bins : bool, default=False
126+ Whether to use decision tree-based binning for numerical features.
127+ binning_strategy : str, default="uniform"
128+ Strategy for determining bin edges ("uniform", "quantile").
129+ task : str, default="regression"
130+ Task type for decision tree splitting ("regression", "classification").
131+ cat_cutoff : float or int, default=0.03
132+ Threshold to determine whether integer-valued columns are treated as categorical.
133+ treat_all_integers_as_numerical : bool, default=False
134+ If True, treat all integer columns as numerical.
135+ degree : int, default=3
136+ Degree of polynomial or spline basis expansion.
137+ scaling_strategy : str, default="minmax"
138+ Scaling method for numerical data ("standardization", "minmax", etc.).
139+ n_knots : int, default=64
140+ Number of knots for spline transformations.
141+ use_decision_tree_knots : bool, default=True
142+ Use decision tree-based knot placement for splines.
143+ knots_strategy : str, default="uniform"
144+ Strategy for placing spline knots.
145+ spline_implementation : str, default="sklearn"
146+ Backend implementation to use for splines.
147+ min_unique_vals : int, default=5
148+ Minimum number of unique values required for numerical processing.
149+ """
150+
34151 self .n_bins = n_bins
35152 self .numerical_preprocessing = (
36153 numerical_preprocessing .lower ()
@@ -62,6 +179,22 @@ def __init__(
62179 self .embedding_dimensions = {}
63180
64181 def _detect_column_types (self , X ):
182+ """
183+ Detect categorical and numerical features in the input data.
184+
185+ Parameters
186+ ----------
187+ X : pandas.DataFrame, numpy.ndarray, or dict
188+ The input data to analyze.
189+
190+ Returns
191+ -------
192+ numerical_features : list of str
193+ Column names detected as numerical features.
194+ categorical_features : list of str
195+ Column names detected as categorical features.
196+ """
197+
65198 categorical_features = []
66199 numerical_features = []
67200
@@ -98,6 +231,24 @@ def _detect_column_types(self, X):
98231 return numerical_features , categorical_features
99232
100233 def fit (self , X , y = None , embeddings = None ):
234+ """
235+ Fit the preprocessor to the input data and target labels.
236+
237+ Parameters
238+ ----------
239+ X : pandas.DataFrame, numpy.ndarray, or dict
240+ The input features.
241+ y : array-like, optional
242+ Target values (used for decision tree-based methods).
243+ embeddings : np.ndarray or list of np.ndarray, optional
244+ External embedding arrays to be passed and validated.
245+
246+ Returns
247+ -------
248+ self : Preprocessor
249+ Fitted instance of the preprocessor.
250+ """
251+
101252 if isinstance (X , dict ):
102253 X = pd .DataFrame (X )
103254 elif isinstance (X , np .ndarray ):
@@ -148,6 +299,24 @@ def fit(self, X, y=None, embeddings=None):
148299 return self
149300
150301 def transform (self , X , embeddings = None , return_array = False ):
302+ """
303+ Transform the input data using the fitted column transformer.
304+
305+ Parameters
306+ ----------
307+ X : pandas.DataFrame, numpy.ndarray, or dict
308+ Input features to transform.
309+ embeddings : np.ndarray or list of np.ndarray, optional
310+ Optional external embeddings to attach to the transformation.
311+ return_array : bool, default=False
312+ If True, return a single stacked NumPy array. If False, return a dict of transformed arrays.
313+
314+ Returns
315+ -------
316+ dict or np.ndarray
317+ Transformed data. A dictionary if return_array=False, else a NumPy array.
318+ """
319+
151320 if not self .fitted :
152321 raise NotFittedError (
153322 "Preprocessor must be fitted before calling transform."
@@ -189,11 +358,56 @@ def transform(self, X, embeddings=None, return_array=False):
189358 return transformed_dict
190359
def fit_transform(self, X, y=None, embeddings=None, return_array=False):
    """
    Fit the preprocessor on ``X`` and return the transformation of the same ``X``.

    Equivalent to calling ``fit`` followed by ``transform`` on the fitted
    instance that ``fit`` returns.

    Parameters
    ----------
    X : pandas.DataFrame, numpy.ndarray, or dict
        Input features; passed to both ``fit`` and ``transform``.
    y : array-like, optional
        Target values; passed to ``fit`` only.
    embeddings : np.ndarray or list of np.ndarray, optional
        Embedding arrays; passed to both ``fit`` and ``transform``.
    return_array : bool, default=False
        Output-format flag; passed to ``transform`` unchanged.

    Returns
    -------
    dict or np.ndarray
        Whatever ``transform`` returns for the given ``return_array`` flag.
    """
    # Fit first; the object returned by fit() is the one used for transform().
    fitted_preprocessor = self.fit(X, y, embeddings=embeddings)
    return fitted_preprocessor.transform(X, embeddings, return_array)
195384
196385 def get_feature_info (self , verbose = True ):
386+ """
387+ Retrieve metadata about the transformed features.
388+
389+ Provides detailed information for each input feature, including:
390+ - preprocessing applied
391+ - output dimensionality
392+ - number of categories (for categorical features)
393+ - embedding dimensions (if any)
394+
395+ Parameters
396+ ----------
397+ verbose : bool, default=True
398+ If True, prints detailed information for each feature.
399+
400+ Returns
401+ -------
402+ tuple of dicts
403+ numerical_feature_info : dict
404+ Metadata for numerical features.
405+ categorical_feature_info : dict
406+ Metadata for categorical features.
407+ embedding_feature_info : dict
408+ Metadata for embedding features, if used.
409+ """
410+
197411 if not self .fitted :
198412 raise NotFittedError (
199413 "Preprocessor must be fitted before calling get_feature_info."
0 commit comments