Commit bd2ed62

docstrings
1 parent 3e2b37f commit bd2ed62

File tree

14 files changed: +716 −1 lines changed

README.md

Lines changed: 31 additions & 0 deletions
@@ -116,6 +116,37 @@ pretab includes both sklearn-native and custom-built transformers:

> Plus: **any `sklearn` transformer** can be passed directly with full support for hyperparameters.

### Using Transformers

Using the transformers follows the standard `sklearn.preprocessing` workflow, e.g. using PLE:

```python
import numpy as np
from pretab.transformers import PLETransformer

x = np.random.randn(100, 1)
y = np.random.randn(100, 1)

x_ple = PLETransformer(n_bins=15, task="regression").fit_transform(x, y)

assert x_ple.shape[1] == 15
```

For splines, the penalty matrices can be extracted via `.get_penalty_matrix()`:

```python
import numpy as np
from pretab.transformers import ThinPlateSplineTransformer

x = np.random.randn(100, 1)

tp = ThinPlateSplineTransformer(n_basis=15)

x_tp = tp.fit_transform(x)

assert x_tp.shape[1] == 15

penalty = tp.get_penalty_matrix()
```
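
A typical follow-up use of the extracted penalty matrix is a penalized least-squares fit. The sketch below is illustrative only and assumes `get_penalty_matrix()` returns the usual `(n_basis, n_basis)` roughness penalty `S`, so the coefficients solve `(BᵀB + λS)β = Bᵀy`:

```python
import numpy as np
from pretab.transformers import ThinPlateSplineTransformer

x = np.random.randn(200, 1)
y = np.sin(3 * x[:, 0]) + 0.1 * np.random.randn(200)

tp = ThinPlateSplineTransformer(n_basis=15)
B = tp.fit_transform(x)      # spline basis, shape (200, 15)
S = tp.get_penalty_matrix()  # assumed shape (15, 15)

lam = 1.0                    # smoothing strength (hyperparameter)
# Penalized least squares: solve (B^T B + lam * S) beta = B^T y
beta = np.linalg.solve(B.T @ B + lam * S, B.T @ y)
y_hat = B @ beta             # smoothed fit of y on x
```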

---

## 🧪 Running Tests

pretab/preprocessor.py

Lines changed: 214 additions & 0 deletions
@@ -12,6 +12,84 @@

class Preprocessor(TransformerMixin):
    """
    Preprocessor class for automated tabular feature preprocessing using scikit-learn-compatible pipelines.

    This class provides a flexible interface for preprocessing tabular datasets containing numerical and
    categorical features. It automatically detects feature types, applies user-defined or default preprocessing
    strategies, and supports both dictionary and array-style outputs. It also supports integration with external
    embedding vectors.

    Features
    --------
    - Supports a wide range of preprocessing methods for numerical and categorical features.
    - Automatically detects feature types (numerical vs. categorical).
    - Compatible with both pandas DataFrames and NumPy arrays.
    - Handles external embedding arrays for models that require learned representations.
    - Returns either a dictionary of transformed feature blocks or a single NumPy array.
    - Fully compatible with scikit-learn transformers and pipelines.

    Parameters
    ----------
    feature_preprocessing : dict, optional
        Dictionary mapping feature names to specific preprocessing methods. Overrides global defaults.
    n_bins : int, default=64
        Number of bins used for binning-based preprocessing (e.g., for discretizers or PLE).
    numerical_preprocessing : str, default="ple"
        Preprocessing method for numerical features (e.g., "standardization", "minmax", "ple", "rbf", etc.).
    categorical_preprocessing : str, default="int"
        Preprocessing method for categorical features (e.g., "int", "ordinal", "onehot").
    use_decision_tree_bins : bool, default=False
        Whether to use decision tree binning for numerical discretization.
    binning_strategy : str, default="uniform"
        Strategy for bin placement when not using tree-based methods. Options: "uniform", "quantile".
    task : str, default="regression"
        Problem type used to guide preprocessing (e.g., "regression" or "classification").
    cat_cutoff : float or int, default=0.03
        Threshold to determine whether integer-valued features are treated as categorical.
    treat_all_integers_as_numerical : bool, default=False
        If True, treat all integer-typed columns as numerical regardless of cardinality.
    degree : int, default=3
        Degree of polynomial or spline basis functions where applicable.
    scaling_strategy : str, default="minmax"
        Strategy for feature scaling (e.g., "standardization", "minmax", etc.).
    n_knots : int, default=64
        Number of knots used in spline-based feature expansions.
    use_decision_tree_knots : bool, default=True
        Whether to use decision tree-based knot placement for spline transformations.
    knots_strategy : str, default="uniform"
        Strategy for placing knots for splines ("uniform" or "quantile").
    spline_implementation : str, default="sklearn"
        Which spline backend implementation to use (e.g., "sklearn", "custom").
    min_unique_vals : int, default=5
        Minimum number of unique values required for a feature to be treated as numerical.

    Attributes
    ----------
    column_transformer : ColumnTransformer
        The internal scikit-learn column transformer that handles feature-wise preprocessing.
    fitted : bool
        Whether the preprocessor has been fitted.
    embeddings : bool
        Whether embedding vectors are expected and used in transformation.
    embedding_dimensions : dict
        Dictionary of embedding feature names to their expected dimensionality.

    Examples
    --------
    >>> from pretab import Preprocessor
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     "age": [25, 32, 47],
    ...     "gender": ["M", "F", "F"]
    ... })
    >>> pre = Preprocessor()
    >>> pre.fit(df)
    >>> out = pre.transform(df)
    >>> out.keys()
    dict_keys(['num_age', 'cat_gender'])
    """

    def __init__(
        self,
        feature_preprocessing=None,
@@ -31,6 +109,45 @@ def __init__(
        spline_implementation="sklearn",
        min_unique_vals=5,
    ):
        """
        Initialize the Preprocessor with various transformation options for tabular data.

        Parameters
        ----------
        feature_preprocessing : dict, optional
            Dictionary specifying preprocessing methods per feature. If None, global settings are used.
        n_bins : int, default=64
            Number of bins to use for binning-based transformations.
        numerical_preprocessing : str, default="ple"
            Preprocessing strategy for numerical features.
        categorical_preprocessing : str, default="int"
            Preprocessing strategy for categorical features.
        use_decision_tree_bins : bool, default=False
            Whether to use decision tree-based binning for numerical features.
        binning_strategy : str, default="uniform"
            Strategy for determining bin edges ("uniform", "quantile").
        task : str, default="regression"
            Task type for decision tree splitting ("regression", "classification").
        cat_cutoff : float or int, default=0.03
            Threshold to determine whether integer-valued columns are treated as categorical.
        treat_all_integers_as_numerical : bool, default=False
            If True, treat all integer columns as numerical.
        degree : int, default=3
            Degree of polynomial or spline basis expansion.
        scaling_strategy : str, default="minmax"
            Scaling method for numerical data ("standardization", "minmax", etc.).
        n_knots : int, default=64
            Number of knots for spline transformations.
        use_decision_tree_knots : bool, default=True
            Use decision tree-based knot placement for splines.
        knots_strategy : str, default="uniform"
            Strategy for placing spline knots.
        spline_implementation : str, default="sklearn"
            Backend implementation to use for splines.
        min_unique_vals : int, default=5
            Minimum number of unique values required for numerical processing.
        """

        self.n_bins = n_bins
        self.numerical_preprocessing = (
            numerical_preprocessing.lower()
@@ -62,6 +179,22 @@ def __init__(
        self.embedding_dimensions = {}

    def _detect_column_types(self, X):
        """
        Detects categorical and numerical features in the input data.

        Parameters
        ----------
        X : pandas.DataFrame, numpy.ndarray, or dict
            The input data to analyze.

        Returns
        -------
        numerical_features : list of str
            Column names detected as numerical features.
        categorical_features : list of str
            Column names detected as categorical features.
        """

        categorical_features = []
        numerical_features = []

@@ -98,6 +231,24 @@ def _detect_column_types(self, X):
        return numerical_features, categorical_features

    def fit(self, X, y=None, embeddings=None):
        """
        Fit the preprocessor to the input data and target labels.

        Parameters
        ----------
        X : pandas.DataFrame, numpy.ndarray, or dict
            The input features.
        y : array-like, default=None
            Target values (used for decision tree-based methods).
        embeddings : np.ndarray or list of np.ndarray, optional
            External embedding arrays to be passed and validated.

        Returns
        -------
        self : Preprocessor
            Fitted instance of the preprocessor.
        """

        if isinstance(X, dict):
            X = pd.DataFrame(X)
        elif isinstance(X, np.ndarray):
@@ -148,6 +299,24 @@ def fit(self, X, y=None, embeddings=None):
        return self

    def transform(self, X, embeddings=None, return_array=False):
        """
        Transform the input data using the fitted column transformer.

        Parameters
        ----------
        X : pandas.DataFrame, numpy.ndarray, or dict
            Input features to transform.
        embeddings : np.ndarray or list of np.ndarray, optional
            Optional external embeddings to attach to the transformation.
        return_array : bool, default=False
            If True, return a single stacked NumPy array. If False, return a dict of transformed arrays.

        Returns
        -------
        dict or np.ndarray
            Transformed data. A dictionary if return_array=False, else a NumPy array.
        """

        if not self.fitted:
            raise NotFittedError(
                "Preprocessor must be fitted before calling transform."
@@ -189,11 +358,56 @@ def transform(self, X, embeddings=None, return_array=False):
        return transformed_dict

    def fit_transform(self, X, y=None, embeddings=None, return_array=False):
        """
        Convenience method that fits the preprocessor and transforms the data.

        Parameters
        ----------
        X : pandas.DataFrame, numpy.ndarray, or dict
            Input features.
        y : array-like, optional
            Target values.
        embeddings : np.ndarray or list of np.ndarray, optional
            Optional embedding arrays.
        return_array : bool, default=False
            Whether to return a stacked NumPy array or a dictionary of arrays.

        Returns
        -------
        dict or np.ndarray
            Transformed dataset in the specified output format.
        """

        return self.fit(X, y, embeddings=embeddings).transform(
            X, embeddings, return_array
        )

    def get_feature_info(self, verbose=True):
        """
        Retrieves metadata about the transformed features.

        Provides detailed information for each input feature, including:
        - preprocessing applied
        - output dimensionality
        - number of categories (for categorical features)
        - embedding dimensions (if any)

        Parameters
        ----------
        verbose : bool, default=True
            If True, prints detailed information for each feature.

        Returns
        -------
        tuple of dicts
            numerical_feature_info : dict
                Metadata for numerical features.
            categorical_feature_info : dict
                Metadata for categorical features.
            embedding_feature_info : dict
                Metadata for embedding features, if used.
        """

        if not self.fitted:
            raise NotFittedError(
                "Preprocessor must be fitted before calling get_feature_info."

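To tie the docstrings above together, here is a minimal, hedged usage sketch. The import path, the `num_`/`cat_` key naming, and the three-dict return of `get_feature_info` follow the docstrings; the synthetic data and chosen parameters are illustrative assumptions only:

```python
import numpy as np
import pandas as pd
from pretab import Preprocessor  # import path as in the class docstring example

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "age": rng.normal(40, 10, size=200),          # continuous numerical feature
    "income": rng.normal(50_000, 8_000, size=200),
    "gender": rng.choice(["M", "F"], size=200),   # categorical feature
})
y = rng.normal(size=200)  # targets, used by decision-tree-based binning/knot placement

pre = Preprocessor(numerical_preprocessing="ple", n_bins=8, task="regression")

# Default output: a dict of per-feature blocks, e.g. 'num_age', 'num_income', 'cat_gender'
blocks = pre.fit_transform(df, y)

# Or request a single stacked NumPy array instead
X = pre.fit_transform(df, y, return_array=True)

# Metadata about the transformation applied to each feature
num_info, cat_info, emb_info = pre.get_feature_info(verbose=False)
```
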
pretab/transformers/binning/binning.py

Lines changed: 47 additions & 0 deletions
@@ -4,16 +4,63 @@
class CustomBinTransformer(TransformerMixin, BaseEstimator):
    """
    Custom binning transformer for one-dimensional numerical features.

    This transformer bins continuous values into discrete intervals, using either a fixed number of equal-width bins
    or a user-provided array of bin edges. It is compatible with scikit-learn pipelines.

    Parameters
    ----------
    bins : int or array-like
        If int, defines the number of equal-width bins. If array-like, defines the bin edges to use directly.

    Attributes
    ----------
    n_features_in_ : int
        The number of input features. Always set to 1 for this transformer.
    """

    def __init__(self, bins):
        # bins can be a scalar (number of bins) or array-like (bin edges)
        self.bins = bins

    def fit(self, X, y=None):
        """
        Fit the transformer on the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, 1)
            Input data.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            Fitted transformer.
        """
        # Fit doesn't need to do anything as we are directly using provided bins
        self.n_features_in_ = 1
        return self

    def transform(self, X):
        """
        Transform the data using the specified binning strategy.

        Parameters
        ----------
        X : array-like of shape (n_samples, 1)
            Input data to transform.

        Returns
        -------
        X_binned : ndarray of shape (n_samples, 1)
            Binned data with integer bin indices.
        """

        X = np.asarray(X)  # Ensures squeeze works and consistent input
        if X.ndim != 2 or X.shape[1] != 1:
            raise ValueError("Input must be a 2D array with shape (n_samples, 1).")
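
A short, hedged usage sketch for `CustomBinTransformer` as documented above. The `pretab.transformers` import path mirrors the README examples and is an assumption; the output is taken to be integer bin indices of shape `(n_samples, 1)`, as the `transform` docstring states:

```python
import numpy as np
from pretab.transformers import CustomBinTransformer  # assumed export path

x = np.random.randn(100, 1)  # input must be 2D with shape (n_samples, 1)

# Option 1: a fixed number of equal-width bins
binned = CustomBinTransformer(bins=5).fit_transform(x)

# Option 2: explicit bin edges
edges = np.array([-np.inf, -1.0, 0.0, 1.0, np.inf])
binned_by_edges = CustomBinTransformer(bins=edges).fit_transform(x)

print(binned.shape, binned_by_edges.shape)  # expected: (100, 1) (100, 1)
```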
