Skip to content

Commit 91a2eef

Browse files
Merge pull request #4 from PriorLabs/evaluation
Evaluation
2 parents cc71764 + 9b8c0c6 commit 91a2eef

File tree

5 files changed

+237
-49
lines changed

5 files changed

+237
-49
lines changed

nanotabpfn/evaluation.py

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
import argparse
2+
3+
import numpy as np
4+
import openml
5+
import torch
6+
from openml.config import set_root_cache_directory
7+
from openml.tasks import TaskType
8+
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, r2_score
9+
from sklearn.preprocessing import LabelEncoder
10+
11+
from nanotabpfn.interface import NanoTabPFNRegressor, NanoTabPFNClassifier
12+
13+
# Small OpenML task IDs used for quick smoke-test evaluation runs.
TOY_TASKS_REGRESSION = [
    362443,  # diabetes
]

TOY_TASKS_CLASSIFICATION = [
    59,    # iris
    2382,  # wine
    9946,  # breast_cancer
]
22+
23+
@torch.no_grad()
def get_openml_predictions(
    *,
    model: NanoTabPFNRegressor | NanoTabPFNClassifier,
    tasks: list[int] | str = "tabarena-v0.1",
    max_n_features: int = 500,
    max_n_instances: int = 10_000,
    classification: bool | None = None,
    cache_directory: str | None = None,
):
    """
    Evaluates a model on a set of OpenML tasks and returns predictions.

    Retrieves datasets from OpenML, fits the model on each task's training split,
    and collects predictions on the test split.

    Args:
        model (NanoTabPFNRegressor | NanoTabPFNClassifier): A scikit-learn compatible model or classifier to be evaluated.
        tasks (list[int] | str, optional): A list of OpenML task IDs or the name of a benchmark suite.
        max_n_features (int, optional): Maximum number of features allowed for a task. Tasks exceeding this limit are skipped.
        max_n_instances (int, optional): Maximum number of instances allowed for a task. Tasks exceeding this limit are skipped.
        classification (bool | None, optional): Whether the model is a classifier (True) or regressor (False). If None, it is inferred from the model type.
        cache_directory (str | None, optional): Directory to save OpenML data. If None, default cache path is used.

    Returns:
        dict: A dictionary where keys are dataset names and values are tuples of
            (true targets, predicted labels, predicted probabilities);
            probabilities are None for regression tasks.
    """
    if classification is None:
        classification = isinstance(model, NanoTabPFNClassifier)

    if cache_directory is not None:
        set_root_cache_directory(cache_directory)

    if isinstance(tasks, str):
        benchmark_suite = openml.study.get_suite(tasks)
        task_ids = benchmark_suite.tasks
    else:
        task_ids = tasks

    dataset_predictions = {}

    for task_id in task_ids:
        task = openml.tasks.get_task(task_id, download_splits=False)

        # Only evaluate tasks matching the model type (classifier vs regressor).
        expected_task_type = (
            TaskType.SUPERVISED_CLASSIFICATION if classification
            else TaskType.SUPERVISED_REGRESSION
        )
        if task.task_type_id != expected_task_type:
            continue

        dataset = task.get_dataset(download_data=False)

        n_features = dataset.qualities["NumberOfFeatures"]
        n_instances = dataset.qualities["NumberOfInstances"]
        if n_features > max_n_features or n_instances > max_n_instances:
            continue  # skip task, too big

        # tabarena-light only provides a single fold and repeat; the loop below
        # still supports multiple folds should that ever change.
        folds = 1
        repeat = 0
        targets = []
        predictions = []
        probabilities = []
        for fold in range(folds):
            X, y, categorical_indicator, attribute_names = dataset.get_data(
                target=task.target_name, dataset_format="dataframe"
            )
            train_indices, test_indices = task.get_train_test_split_indices(
                fold=fold, repeat=repeat
            )
            X_train = X.iloc[train_indices].to_numpy()
            y_train = y.iloc[train_indices].to_numpy()
            X_test = X.iloc[test_indices].to_numpy()
            y_test = y.iloc[test_indices].to_numpy()

            if classification:
                # Map labels to 0..n_classes-1. NOTE(review): assumes the test
                # split contains no labels unseen in training, otherwise
                # LabelEncoder.transform raises — confirm for custom task lists.
                label_encoder = LabelEncoder()
                y_train = label_encoder.fit_transform(y_train)
                y_test = label_encoder.transform(y_test)
            targets.append(y_test)

            model.fit(X_train, y_train)
            # Collect fold predictions in a list; concatenated after the loop.
            predictions.append(model.predict(X_test))
            if classification:
                y_proba = model.predict_proba(X_test)
                if y_proba.shape[1] == 2:  # binary: keep positive-class column only
                    y_proba = y_proba[:, 1]
                probabilities.append(y_proba)

        y_pred = np.concatenate(predictions, axis=0)
        targets = np.concatenate(targets, axis=0)
        probabilities = np.concatenate(probabilities, axis=0) if probabilities else None
        dataset_predictions[str(dataset.name)] = (targets, y_pred, probabilities)
    return dataset_predictions
118+
119+
120+
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # The original single-dash spellings are kept for backward compatibility;
    # conventional GNU-style "--" aliases are added for each option
    # (argparse derives the dest from the first "--" option, so dests are unchanged).
    parser.add_argument("-model_type", "--model_type", type=str,
                        choices=["regression", "classification"], required=True,
                        help="Whether to use the regressor or classifier model")
    parser.add_argument("-checkpoint", "--checkpoint", type=str, default=None,
                        help="Path to load the model weights from. If None, default weights are used.")
    parser.add_argument("-dist_path", "--dist_path", type=str, default=None,
                        help="Path to load the bucket edges for the support bar distribution from. Only needed for regression.")
    parser.add_argument("-tasks", "--tasks", type=str, default="tabarena-v0.1",
                        choices=["tabarena-v0.1", "toy_tasks"], help="Which OpenML tasks to evaluate on.")
    parser.add_argument("-cache_directory", "--cache_directory", type=str, default=None,
                        help="Directory to save OpenML data. If None, default cache path is used.")
    parser.add_argument("-max_n_features", "--max_n_features", type=int, default=500,
                        help="Maximum number of features allowed for a task. Tasks exceeding this limit are skipped.")
    parser.add_argument("-max_n_instances", "--max_n_instances", type=int, default=10_000,
                        help="Maximum number of instances allowed for a task. Tasks exceeding this limit are skipped.")
    args = parser.parse_args()

    if args.model_type == "classification":
        model = NanoTabPFNClassifier(model=args.checkpoint)
    else:
        model = NanoTabPFNRegressor(model=args.checkpoint, dist=args.dist_path)
    model.model.eval()  # inference only: disable dropout/batch-norm updates

    # Map the "toy_tasks" shorthand to the task-ID lists; otherwise the string
    # is passed through as an OpenML benchmark-suite name.
    if args.tasks == "toy_tasks" and args.model_type == "regression":
        tasks = TOY_TASKS_REGRESSION
    elif args.tasks == "toy_tasks" and args.model_type == "classification":
        tasks = TOY_TASKS_CLASSIFICATION
    else:
        tasks = args.tasks

    predictions = get_openml_predictions(
        model=model, tasks=tasks, max_n_features=args.max_n_features,
        max_n_instances=args.max_n_instances,
        classification=(args.model_type == "classification"),
        cache_directory=args.cache_directory,
    )

    for dataset_name, (y_true, y_pred, y_proba) in predictions.items():
        if args.model_type == "classification":
            acc = balanced_accuracy_score(y_true, y_pred)
            auc = roc_auc_score(y_true, y_proba, multi_class='ovr')
            print(f"Dataset: {dataset_name} | ROC AUC: {auc:.4f} | Balanced Accuracy: {acc:.4f}")
        else:
            r2 = r2_score(y_true, y_pred)
            print(f"Dataset: {dataset_name} | R2: {r2:.4f}")

nanotabpfn/interface.py

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,20 @@
11
import os
2-
import requests
2+
33
import numpy as np
4+
import pandas as pd
5+
import requests
46
import torch
57
import torch.nn.functional as F
6-
8+
from numpy import ndarray
79
from pfns.bar_distribution import FullSupportBarDistribution
10+
from sklearn.compose import ColumnTransformer
11+
from sklearn.impute import SimpleImputer
12+
from sklearn.pipeline import Pipeline
13+
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer
814

9-
from nanotabpfn.utils import get_default_device
1015
from nanotabpfn.model import NanoTabPFNModel
16+
from nanotabpfn.utils import get_default_device
17+
1118

1219
def init_model_from_state_dict_file(file_path):
1320
"""
@@ -29,6 +36,39 @@ def init_model_from_state_dict_file(file_path):
2936
model.load_state_dict(torch.load(file_path, map_location='cpu'))
3037
return model
3138

39+
def get_feature_preprocessor(X: ndarray | pd.DataFrame) -> ColumnTransformer:
    """
    Build an (unfitted) ColumnTransformer that cleans a feature matrix.

    Numeric columns (including numbers stored as strings) are coerced to
    numbers and mean-imputed; all remaining columns are ordinal-encoded and
    most-frequent-imputed.

    Args:
        X: Feature matrix, used only to decide which columns are numeric.

    Returns:
        ColumnTransformer: Unfitted preprocessor; call fit/fit_transform on
        training data before use.
    """
    X = pd.DataFrame(X)
    # A column counts as numeric when every non-NaN entry parses as a number.
    # This handles numeric data stored as strings; the pandas dtype alone is
    # not trusted (see the commented alternative in the original).
    num_mask = np.array([
        X[col].notna().sum() == pd.to_numeric(X[col], errors='coerce').notna().sum()
        for col in X
    ])
    # NOTE(review): an all-NaN column is classified as numeric
    # (0 non-NaN == 0 numeric) — confirm SimpleImputer handles that as intended.

    num_transformer = Pipeline([
        # Coerce column-wise via pandas; entries that don't parse become NaN.
        # (Wrapping in pd.DataFrame also covers ndarray input, so a separate
        # "to_pandas" step is unnecessary.)
        ("to_numeric", FunctionTransformer(
            lambda x: pd.DataFrame(x).apply(pd.to_numeric, errors='coerce').to_numpy())),
        ('imputer', SimpleImputer(strategy='mean')),  # median might be better because of outliers
    ])
    cat_transformer = Pipeline([
        # Unknown categories at transform time become NaN and are then imputed.
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)),
        ('imputer', SimpleImputer(strategy='most_frequent')),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_transformer, num_mask),
            ('cat', cat_transformer, ~num_mask),
        ]
    )
    return preprocessor
70+
71+
3272
class NanoTabPFNClassifier():
3373
""" scikit-learn like interface """
3474
def __init__(self, model: NanoTabPFNModel|str|None = None, device=get_default_device()):
@@ -46,7 +86,8 @@ def __init__(self, model: NanoTabPFNModel|str|None = None, device=get_default_de
4686

4787
def fit(self, X_train: np.array, y_train: np.array):
4888
""" stores X_train and y_train for later use, also computes the highest class number occuring in num_classes """
49-
self.X_train = X_train
89+
self.feature_preprocessor = get_feature_preprocessor(X_train)
90+
self.X_train = self.feature_preprocessor.fit_transform(X_train)
5091
self.y_train = y_train
5192
self.num_classes = max(set(y_train))+1
5293

@@ -60,7 +101,7 @@ def predict_proba(self, X_test: np.array) -> np.array:
60101
creates (x,y), runs it through our PyTorch Model, cuts off the classes that didn't appear in the training data
61102
and applies softmax to get the probabilities
62103
"""
63-
x = np.concatenate((self.X_train, X_test))
104+
x = np.concatenate((self.X_train, self.feature_preprocessor.transform(X_test)))
64105
y = self.y_train
65106
with torch.no_grad():
66107
x = torch.from_numpy(x).unsqueeze(0).to(torch.float).to(self.device) # introduce batch size 1
@@ -76,7 +117,7 @@ def predict_proba(self, X_test: np.array) -> np.array:
76117
class NanoTabPFNRegressor():
77118
""" scikit-learn like interface """
78119
def __init__(self, model: NanoTabPFNModel|str|None = None, dist: FullSupportBarDistribution|str|None = None, device=get_default_device()):
79-
if model == None:
120+
if model is None:
80121
model = 'nanotabpfn_regressor.pth'
81122
dist = 'nanotabpfn_regressor_buckets.pth'
82123
if not os.path.isfile(model):
@@ -105,7 +146,8 @@ def fit(self, X_train: np.array, y_train: np.array):
105146
"""
106147
Stores X_train and y_train for later use. Computes target normalization. Builds normalized bar distribution from existing self.dist.
107148
"""
108-
self.X_train = X_train
149+
self.feature_preprocessor = get_feature_preprocessor(X_train)
150+
self.X_train = self.feature_preprocessor.fit_transform(X_train)
109151
self.y_train = y_train
110152

111153
self.y_train_mean = np.mean(self.y_train)
@@ -121,7 +163,7 @@ def predict(self, X_test: np.array) -> np.array:
121163
"""
122164
Performs in-context learning using X_train and y_train. Predicts the means of the output distributions for X_test.
123165
"""
124-
X = np.concatenate((self.X_train, X_test))
166+
X = np.concatenate((self.X_train, self.feature_preprocessor.transform(X_test)))
125167
y = self.y_train_n
126168

127169
with torch.no_grad():

pretrain_classification.py

Lines changed: 11 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,16 @@
11
import argparse
2-
import torch
3-
import numpy as np
42

3+
import torch
4+
from sklearn.metrics import accuracy_score
55
from torch import nn
6-
from functools import partial
76

87
from nanotabpfn.callbacks import ConsoleLoggerCallback
9-
from nanotabpfn.priors import PriorDumpDataLoader
8+
from nanotabpfn.evaluation import get_openml_predictions, TOY_TASKS_CLASSIFICATION
9+
from nanotabpfn.interface import NanoTabPFNClassifier
1010
from nanotabpfn.model import NanoTabPFNModel
11+
from nanotabpfn.priors import PriorDumpDataLoader
1112
from nanotabpfn.train import train
1213
from nanotabpfn.utils import get_default_device, set_randomness_seed
13-
from nanotabpfn.interface import NanoTabPFNClassifier
14-
15-
from sklearn.datasets import *
16-
from sklearn.model_selection import train_test_split
17-
from sklearn.metrics import accuracy_score, roc_auc_score
1814

1915
parser = argparse.ArgumentParser()
2016
parser.add_argument("-priordump", type=str, default="/50x3_3_100k_classification.h5", help="path to the prior dump")
@@ -55,29 +51,22 @@
5551
if ckpt:
5652
model.load_state_dict(ckpt['model'])
5753

58-
datasets = []
59-
datasets.append(train_test_split(*load_iris(return_X_y=True), test_size=0.5, random_state=42))
60-
datasets.append(train_test_split(*load_wine(return_X_y=True), test_size=0.5, random_state=42))
61-
datasets.append(train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.5, random_state=42))
62-
63-
6454
class EvaluationLoggerCallback(ConsoleLoggerCallback):
    """Console logger that also reports classification accuracy on held-out OpenML tasks."""

    def __init__(self, tasks):
        # OpenML task IDs evaluated after every epoch.
        self.tasks = tasks

    def on_epoch_end(self, epoch: int, epoch_time: float, loss: float, model, **kwargs):
        """Evaluate the current model on self.tasks and print epoch statistics."""
        classifier = NanoTabPFNClassifier(model, device)
        predictions = get_openml_predictions(model=classifier, tasks=self.tasks)
        # Mean accuracy across all evaluated datasets (keys/probabilities unused).
        scores = [accuracy_score(y_true, y_pred)
                  for (y_true, y_pred, _) in predictions.values()]
        avg_score = sum(scores) / len(scores)
        print(f'epoch {epoch:5d} | time {epoch_time:5.2f}s | mean loss {loss:5.2f} | avg accuracy {avg_score:.3f}',
              flush=True)


callbacks = [EvaluationLoggerCallback(TOY_TASKS_CLASSIFICATION)]
8170

8271
trained_model, loss = train(
8372
model=model,

pretrain_regression.py

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,16 @@
11
import argparse
2+
23
import torch
4+
from pfns.bar_distribution import FullSupportBarDistribution
5+
from sklearn.metrics import r2_score
36

47
from nanotabpfn.callbacks import ConsoleLoggerCallback
5-
from nanotabpfn.priors import PriorDumpDataLoader
8+
from nanotabpfn.evaluation import get_openml_predictions, TOY_TASKS_REGRESSION
9+
from nanotabpfn.interface import NanoTabPFNRegressor
610
from nanotabpfn.model import NanoTabPFNModel
11+
from nanotabpfn.priors import PriorDumpDataLoader
712
from nanotabpfn.train import train
813
from nanotabpfn.utils import get_default_device, set_randomness_seed, make_global_bucket_edges
9-
from nanotabpfn.interface import NanoTabPFNRegressor
10-
11-
from pfns.bar_distribution import FullSupportBarDistribution
12-
13-
from sklearn.datasets import load_diabetes
14-
from sklearn.model_selection import train_test_split
15-
from sklearn.metrics import r2_score
1614

1715
parser = argparse.ArgumentParser()
1816

@@ -66,27 +64,22 @@
6664

6765
dist = FullSupportBarDistribution(bucket_edges)
6866

69-
datasets = []
70-
datasets.append(train_test_split(*load_diabetes(return_X_y=True), test_size=0.5, random_state=42))
71-
72-
7367
class EvaluationLoggerCallback(ConsoleLoggerCallback):
    """Console logger that also reports the R^2 score on held-out OpenML tasks."""

    def __init__(self, tasks):
        # OpenML task IDs evaluated after every epoch.
        self.tasks = tasks

    def on_epoch_end(self, epoch: int, epoch_time: float, loss: float, model, **kwargs):
        """Evaluate the current model on self.tasks and print epoch statistics."""
        regressor = NanoTabPFNRegressor(model, dist, device)
        predictions = get_openml_predictions(model=regressor, tasks=self.tasks)
        # Mean R^2 across all evaluated datasets (keys/probabilities unused).
        scores = [r2_score(y_true, y_pred)
                  for (y_true, y_pred, _) in predictions.values()]
        avg_score = sum(scores) / len(scores)
        print(f'epoch {epoch:5d} | time {epoch_time:5.2f}s | mean loss {loss:5.2f} | avg r2 score {avg_score:.3f}',
              flush=True)


callbacks = [EvaluationLoggerCallback(TOY_TASKS_REGRESSION)]
9083

9184
trained_model, loss = train(
9285
model=model,

0 commit comments

Comments
 (0)