diff --git a/nanotabpfn/__init__.py b/nanotabpfn/__init__.py
index f2fb3b5..15b4c13 100644
--- a/nanotabpfn/__init__.py
+++ b/nanotabpfn/__init__.py
@@ -1 +1,2 @@
 from nanotabpfn.interface import NanoTabPFNClassifier, NanoTabPFNRegressor
+from nanotabpfn.ensembles import EnsembleClassifer
diff --git a/nanotabpfn/ensembles.py b/nanotabpfn/ensembles.py
new file mode 100644
index 0000000..d39d01e
--- /dev/null
+++ b/nanotabpfn/ensembles.py
@@ -0,0 +1,40 @@
+from typing import Any
+import numpy as np
+import torch.nn.functional as F
+
+from nanotabpfn.interface import get_feature_preprocessor
+from nanotabpfn.preprocessors import IdentityPreprocessor, Preprocessor, sample_preprocessors
+
+class EnsembleClassifer:
+    def __init__(self, classifier: Any, num_preprocessors: int = 4, preprocess_features: bool = True):
+        self.classifier = classifier
+        self.model = self.classifier.model
+        self.num_preprocessors = num_preprocessors
+        self.preprocess_features = preprocess_features
+
+    def fit(self, X_train: np.ndarray, y_train: np.ndarray):
+        """ stores X_train and y_train for later use, also computes the highest class number occurring in num_classes """
+        self.X_train = X_train
+        if self.preprocess_features:
+            self.feature_preprocessor = get_feature_preprocessor(X_train)
+            self.X_train: np.ndarray = self.feature_preprocessor.fit_transform(self.X_train)  # type:ignore
+        self.y_train = y_train
+        self.num_classes = max(set(y_train))+1
+        self.preprocessors: list[Preprocessor] = [IdentityPreprocessor()] + sample_preprocessors(self.num_preprocessors, self.X_train)
+
+    def predict(self, X_test: np.ndarray) -> np.ndarray:
+        """ calls predict_proba and picks the class with the highest probability for each datapoint """
+        predicted_probabilities = self.predict_proba(X_test)
+        return predicted_probabilities.argmax(axis=1)
+
+    def predict_proba(self, X_test: np.ndarray) -> np.ndarray:
+        if self.preprocess_features:
+            X_test = self.feature_preprocessor.transform(X_test)  # type:ignore
+        all_probabilities = []
+        for preprocessor in self.preprocessors:
+            preprocessor.fit(self.X_train)
+            X_train_preprocessed = preprocessor.transform(self.X_train)
+            X_test_preprocessed = preprocessor.transform(X_test)
+            self.classifier.fit(X_train_preprocessed, self.y_train)
+            all_probabilities.append(self.classifier.predict_proba(X_test_preprocessed))
+        return np.average(np.stack(all_probabilities, axis=0), axis=0)
\ No newline at end of file
diff --git a/nanotabpfn/evaluation.py b/nanotabpfn/evaluation.py
index b13d9e3..48f2ea6 100644
--- a/nanotabpfn/evaluation.py
+++ b/nanotabpfn/evaluation.py
@@ -8,6 +8,7 @@
 from sklearn.metrics import balanced_accuracy_score, roc_auc_score, r2_score
 from sklearn.preprocessing import LabelEncoder
 
+from nanotabpfn.ensembles import EnsembleClassifer
 from nanotabpfn.interface import NanoTabPFNRegressor, NanoTabPFNClassifier
 
 TOY_TASKS_REGRESSION = [
@@ -35,7 +36,7 @@
 @torch.no_grad()
 def get_openml_predictions(
     *,
-    model: NanoTabPFNRegressor | NanoTabPFNClassifier,
+    model: NanoTabPFNRegressor | NanoTabPFNClassifier | EnsembleClassifer,
     tasks: list[int] | str = "tabarena-v0.1",
     max_n_features: int = 500,
     max_n_samples: int = 10_000,
@@ -133,6 +134,8 @@ def get_openml_predictions(
     parser = argparse.ArgumentParser()
     parser.add_argument("-model_type", type=str, choices=["regression", "classification"], required=True,
                         help="Whether to use the regressor or classifier model")
+    parser.add_argument("-ensemble_size", type=int, default=None,
+                        help="Set the number of preprocessors to ensemble with. If None, then no ensembling is applied.")
     parser.add_argument("-checkpoint", type=str, default=None,
                         help="Path to load the model weights from. If None, default weights are used.")
     parser.add_argument("-dist_path", type=str, default=None,
@@ -150,7 +153,10 @@ def get_openml_predictions(
     args = parser.parse_args()
 
     if args.model_type == "classification":
-        model = NanoTabPFNClassifier(model=args.checkpoint, num_mem_chunks=args.num_mem_chunks)
+        if args.ensemble_size is None:
+            model = NanoTabPFNClassifier(model=args.checkpoint, num_mem_chunks=args.num_mem_chunks)
+        else:
+            model = EnsembleClassifer(NanoTabPFNClassifier(model=args.checkpoint, num_mem_chunks=args.num_mem_chunks, preprocess_features=False), num_preprocessors=args.ensemble_size)
     else:
         model = NanoTabPFNRegressor(model=args.checkpoint, dist=args.dist_path, num_mem_chunks=args.num_mem_chunks)
     model.model.eval()
diff --git a/nanotabpfn/interface.py b/nanotabpfn/interface.py
index fa1c3d7..9bb1a92 100644
--- a/nanotabpfn/interface.py
+++ b/nanotabpfn/interface.py
@@ -80,7 +80,7 @@ def get_feature_preprocessor(X: np.ndarray | pd.DataFrame) -> ColumnTransformer:
 
 class NanoTabPFNClassifier():
     """ scikit-learn like interface """
-    def __init__(self, model: NanoTabPFNModel|str|None = None, device: None|str|torch.device = None, num_mem_chunks: int = 8):
+    def __init__(self, model: NanoTabPFNModel|str|None = None, device: None|str|torch.device = None, num_mem_chunks: int = 8, preprocess_features: bool = True):
         if device is None:
             device = get_default_device()
         if model is None:
@@ -95,11 +95,14 @@ def __init__(self, model: NanoTabPFNModel|str|None = None, device: None|str|torc
         self.model = model.to(device)
         self.device = device
         self.num_mem_chunks = num_mem_chunks
+        self.preprocess_features = preprocess_features
 
     def fit(self, X_train: np.ndarray, y_train: np.ndarray):
         """ stores X_train and y_train for later use, also computes the highest class number occuring in num_classes """
-        self.feature_preprocessor = get_feature_preprocessor(X_train)
-        self.X_train = self.feature_preprocessor.fit_transform(X_train)
+        self.X_train = X_train
+        if self.preprocess_features:
+            self.feature_preprocessor = get_feature_preprocessor(self.X_train)
+            self.X_train: np.ndarray = self.feature_preprocessor.fit_transform(self.X_train)  # type:ignore
         self.y_train = y_train
         self.num_classes = max(set(y_train))+1
 
@@ -113,7 +116,9 @@ def predict_proba(self, X_test: np.ndarray) -> np.ndarray:
         creates (x,y), runs it through our PyTorch Model, cuts off the classes that
         didn't appear in the training data and applies softmax to get the probabilities
         """
-        x = np.concatenate((self.X_train, self.feature_preprocessor.transform(X_test)))
+        if self.preprocess_features:
+            X_test = self.feature_preprocessor.transform(X_test)  # type:ignore
+        x = np.concatenate((self.X_train, X_test))
         y = self.y_train
         with torch.no_grad():
             x = torch.from_numpy(x).unsqueeze(0).to(torch.float).to(self.device)  # introduce batch size 1
diff --git a/nanotabpfn/preprocessors.py b/nanotabpfn/preprocessors.py
new file mode 100644
index 0000000..8892fa3
--- /dev/null
+++ b/nanotabpfn/preprocessors.py
@@ -0,0 +1,66 @@
+from abc import ABC, abstractmethod
+from typing import Any, override
+import numpy as np
+from sklearn.preprocessing import FunctionTransformer
+
+
+class Preprocessor(ABC):
+    @abstractmethod
+    def fit(self, X: np.ndarray) -> None:
+        pass
+
+    @abstractmethod
+    def transform(self, X: np.ndarray) -> np.ndarray:
+        pass
+
+
+class IdentityPreprocessor(Preprocessor):
+    @override
+    def fit(self, X: np.ndarray) -> None:
+        pass
+
+    @override
+    def transform(self, X: np.ndarray) -> np.ndarray:
+        return X
+
+
+class SklearnPreprocessor(Preprocessor):
+    def __init__(self, transformer: Any):
+        self.transformer = transformer
+
+    @override
+    def fit(self, X: np.ndarray) -> None:
+        self.transformer.fit(X)
+
+    @override
+    def transform(self, X: np.ndarray) -> np.ndarray:
+        return np.nan_to_num(self.transformer.transform(X))
+
+
+class LogPreprocessor(SklearnPreprocessor):
+    """log1p for right-skewed, non-negative features."""
+    @override
+    def __init__(self):
+        super().__init__(FunctionTransformer(func=np.log1p, feature_names_out="one-to-one"))
+
+
+class AsinhPreprocessor(SklearnPreprocessor):
+    """Signed log-like transform: linear near 0, log for large |x|; works with negatives."""
+    @override
+    def __init__(self):
+        super().__init__(FunctionTransformer(func=np.arcsinh, feature_names_out="one-to-one"))
+
+
+def sample_preprocessors(num_preprocessors: int, X_train: np.ndarray) -> list[Preprocessor]:
+    """
+    Return a list of feature preprocessors suited to X_train, capped at num_preprocessors.
+    """
+    picks: list[Preprocessor] = []
+
+    picks.append(AsinhPreprocessor())
+
+    # For strictly non-negative data, log1p is fine
+    if np.nanmin(X_train) >= 0:
+        picks.append(LogPreprocessor())
+
+    return picks[:num_preprocessors]
\ No newline at end of file
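
Usage note: a minimal sketch of the new ensembling path introduced above. The toy data and variable names are illustrative, and the default pretrained weights are assumed to be available; mirroring evaluation.py, the wrapped classifier is constructed with preprocess_features=False so that EnsembleClassifer applies the shared feature preprocessing once for all ensemble members.

    import numpy as np
    from nanotabpfn import NanoTabPFNClassifier
    from nanotabpfn.ensembles import EnsembleClassifer

    # illustrative toy data: numeric features, integer class labels
    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 5))
    y = (X[:, 0] + X[:, 1] > 0).astype(int)
    X_train, X_test, y_train = X[:150], X[150:], y[:150]

    # wrapped classifier skips its own preprocessing; the ensemble handles it
    base = NanoTabPFNClassifier(preprocess_features=False)
    ensemble = EnsembleClassifer(base, num_preprocessors=4)
    ensemble.fit(X_train, y_train)
    proba = ensemble.predict_proba(X_test)  # probabilities averaged over the identity and sampled preprocessor views
    pred = ensemble.predict(X_test)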