
Commit af6d3bf: Add evaluation code
1 parent cc71764

4 files changed: +238 -49 lines


nanotabpfn/evaluation.py (new file: 164 additions, 0 deletions)
import argparse

import numpy as np
import openml
import torch
from openml.config import set_root_cache_directory
from openml.tasks import TaskType
from sklearn.preprocessing import LabelEncoder

from nanotabpfn.interface import NanoTabPFNRegressor, NanoTabPFNClassifier

TOY_TASKS_REGRESSION = [
    362443,  # diabetes
]

TOY_TASKS_CLASSIFICATION = [
    59,    # iris
    2382,  # wine
    9946,  # breast_cancer
]

@torch.no_grad()
def get_openml_predictions(
    *,
    model: NanoTabPFNRegressor | NanoTabPFNClassifier,
    tasks: list[int] | str = "tabarena-v0.1",
    max_n_features=500,
    max_n_instances=10_000,
    classification: bool | None = None,
    cache_directory: str | None = None,
):
    """
    Evaluates a model on a set of OpenML tasks and returns predictions.

    Retrieves datasets from OpenML, applies preprocessing, and evaluates the given model on each task.
    Returns true targets, predicted labels, and predicted probabilities for each dataset.

    Args:
        model (NanoTabPFNRegressor | NanoTabPFNClassifier): A scikit-learn compatible regressor or classifier to be evaluated.
        tasks (list[int] | str, optional): A list of OpenML task IDs or the name of a benchmark suite.
        max_n_features (int, optional): Maximum number of features allowed for a task. Tasks exceeding this limit are skipped.
        max_n_instances (int, optional): Maximum number of instances allowed for a task. Tasks exceeding this limit are skipped.
        classification (bool | None, optional): Whether the model is a classifier (True) or regressor (False). If None, it is inferred from the model type.
        cache_directory (str | None, optional): Directory to save OpenML data. If None, the default cache path is used.
    Returns:
        dict: A dictionary where keys are dataset names and values are tuples of (true targets, predicted labels, predicted probabilities).
    """
    if classification is None:
        classification = isinstance(model, NanoTabPFNClassifier)

    if cache_directory is not None:
        set_root_cache_directory(cache_directory)

    if isinstance(tasks, str):
        benchmark_suite = openml.study.get_suite(tasks)
        task_ids = benchmark_suite.tasks
    else:
        task_ids = tasks

    dataset_predictions = {}

    for task_id in task_ids:
        task = openml.tasks.get_task(task_id, download_splits=False)

        if classification and task.task_type_id != TaskType.SUPERVISED_CLASSIFICATION:
            continue  # skip task, only classification
        if not classification and task.task_type_id != TaskType.SUPERVISED_REGRESSION:
            continue  # skip task, only regression

        dataset = task.get_dataset(download_data=False)

        n_features = dataset.qualities["NumberOfFeatures"]
        n_instances = dataset.qualities["NumberOfInstances"]
        if n_features > max_n_features or n_instances > max_n_instances:
            continue  # skip task, too big

        _, folds, _ = task.get_split_dimensions()
        tabarena_light = True
        if tabarena_light:
            folds = 1  # code supports multiple folds but tabarena_light only has one
        repeat = 0  # code only supports one repeat
        targets = []
        predictions = []
        probabilities = []
        for fold in range(folds):
            X, y, categorical_indicator, attribute_names = dataset.get_data(
                target=task.target_name, dataset_format="dataframe"
            )
            train_indices, test_indices = task.get_train_test_split_indices(
                fold=fold, repeat=repeat
            )
            X_train = X.iloc[train_indices].to_numpy()
            y_train = y.iloc[train_indices].to_numpy()
            X_test = X.iloc[test_indices].to_numpy()
            y_test = y.iloc[test_indices].to_numpy()

            if classification:
                label_encoder = LabelEncoder()
                y_train = label_encoder.fit_transform(y_train)
                y_test = label_encoder.transform(y_test)
            targets.append(y_test)

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            predictions.append(y_pred)
            if classification:
                y_proba = model.predict_proba(X_test)
                if y_proba.shape[1] == 2:  # binary classification
                    y_proba = y_proba[:, 1]
                probabilities.append(y_proba)

        y_pred = np.concatenate(predictions, axis=0)
        targets = np.concatenate(targets, axis=0)
        probabilities = np.concatenate(probabilities, axis=0) if len(probabilities) > 0 else None
        dataset_predictions[str(dataset.name)] = (targets, y_pred, probabilities)
    return dataset_predictions


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-model_type", type=str, choices=["regression", "classification"], required=True,
                        help="Whether to use the regressor or classifier model")
    parser.add_argument("-checkpoint", type=str, default=None,
                        help="Path to load the model weights from. If None, default weights are used.")
    parser.add_argument("-dist_path", type=str, default=None,
                        help="Path to load the bucket edges for the support bar distribution from. Only needed for regression.")
    parser.add_argument("-tasks", type=str, default="tabarena-v0.1",
                        choices=["tabarena-v0.1", "toy_tasks"], help="Which OpenML tasks to evaluate on.")
    parser.add_argument("-cache_directory", type=str, default=None,
                        help="Directory to save OpenML data. If None, the default cache path is used.")
    parser.add_argument("-max_n_features", type=int, default=500,
                        help="Maximum number of features allowed for a task. Tasks exceeding this limit are skipped.")
    parser.add_argument("-max_n_instances", type=int, default=10_000,
                        help="Maximum number of instances allowed for a task. Tasks exceeding this limit are skipped.")
    args = parser.parse_args()

    if args.model_type == "classification":
        model = NanoTabPFNClassifier(model=args.checkpoint)
    else:
        model = NanoTabPFNRegressor(model=args.checkpoint, dist=args.dist_path)
    model.model.eval()

    if args.tasks == "toy_tasks" and args.model_type == "regression":
        tasks = TOY_TASKS_REGRESSION
    elif args.tasks == "toy_tasks" and args.model_type == "classification":
        tasks = TOY_TASKS_CLASSIFICATION
    else:
        tasks = args.tasks

    predictions = get_openml_predictions(
        model=model, tasks=tasks, max_n_features=args.max_n_features, max_n_instances=args.max_n_instances,
        classification=(args.model_type == "classification"), cache_directory=args.cache_directory
    )

    for dataset_name, (y_true, y_pred, y_proba) in predictions.items():
        if args.model_type == "classification":
            from sklearn.metrics import roc_auc_score, balanced_accuracy_score
            acc = balanced_accuracy_score(y_true, y_pred)
            auc = roc_auc_score(y_true, y_proba, multi_class='ovr')
            print(f"Dataset: {dataset_name} | ROC AUC: {auc:.4f} | Balanced Accuracy: {acc:.4f}")
        else:
            from sklearn.metrics import r2_score
            r2 = r2_score(y_true, y_pred)
            print(f"Dataset: {dataset_name} | R2: {r2:.4f}")

nanotabpfn/interface.py (51 additions, 8 deletions)

@@ -1,13 +1,20 @@
 import os
-import requests
+
 import numpy as np
+import pandas as pd
+import requests
 import torch
 import torch.nn.functional as F
-
+from numpy import ndarray
 from pfns.bar_distribution import FullSupportBarDistribution
+from sklearn.compose import ColumnTransformer
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer
 
-from nanotabpfn.utils import get_default_device
 from nanotabpfn.model import NanoTabPFNModel
+from nanotabpfn.utils import get_default_device
+
 
 def init_model_from_state_dict_file(file_path):
     """
@@ -29,6 +36,40 @@ def init_model_from_state_dict_file(file_path):
     model.load_state_dict(torch.load(file_path, map_location='cpu'))
     return model
 
+def get_feature_preprocessor(X: ndarray | pd.DataFrame) -> ColumnTransformer:
+    """
+    fits a preprocessor that mean-imputes NaNs in numeric columns (coercing
+    string-encoded numbers) and ordinal-encodes categorical columns, imputing
+    missing categories with the most frequent value
+    """
+    X = pd.DataFrame(X)
+    num_mask = []
+    for col in X:
+        non_nan_entries = X[col].notna().sum()
+        numeric_entries = pd.to_numeric(X[col], errors='coerce').notna().sum()  # in case numeric columns are stored as strings
+        num_mask.append(non_nan_entries == numeric_entries)
+        # num_mask.append(is_numeric_dtype(X[col]))  # assumes the pandas dtype is correct
+
+    num_mask = np.array(num_mask)
+
+    num_transformer = Pipeline([
+        ("to_pandas", FunctionTransformer(lambda x: pd.DataFrame(x) if not isinstance(x, pd.DataFrame) else x)),  # ensure a DataFrame so pd.to_numeric can be applied
+        ("to_numeric", FunctionTransformer(lambda x: x.apply(pd.to_numeric, errors='coerce').to_numpy())),  # in case numeric columns are stored as strings
+        ('imputer', SimpleImputer(strategy='mean'))  # median might be better because of outliers
+    ])
+    cat_transformer = Pipeline([
+        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)),
+        ('imputer', SimpleImputer(strategy='most_frequent')),
+    ])
+
+    preprocessor = ColumnTransformer(
+        transformers=[
+            ('num', num_transformer, num_mask),
+            ('cat', cat_transformer, ~num_mask)
+        ]
+    )
+    return preprocessor
+
+
 class NanoTabPFNClassifier():
     """ scikit-learn like interface """
     def __init__(self, model: NanoTabPFNModel|str|None = None, device=get_default_device()):
@@ -46,7 +87,8 @@ def __init__(self, model: NanoTabPFNModel|str|None = None, device=get_default_de
 
     def fit(self, X_train: np.array, y_train: np.array):
         """ stores X_train and y_train for later use, also computes the highest class number occurring in num_classes """
-        self.X_train = X_train
+        self.feature_preprocessor = get_feature_preprocessor(X_train)
+        self.X_train = self.feature_preprocessor.fit_transform(X_train)
         self.y_train = y_train
         self.num_classes = max(set(y_train))+1
 
@@ -60,7 +102,7 @@ def predict_proba(self, X_test: np.array) -> np.array:
         creates (x,y), runs it through our PyTorch Model, cuts off the classes that didn't appear in the training data
         and applies softmax to get the probabilities
         """
-        x = np.concatenate((self.X_train, X_test))
+        x = np.concatenate((self.X_train, self.feature_preprocessor.transform(X_test)))
         y = self.y_train
         with torch.no_grad():
             x = torch.from_numpy(x).unsqueeze(0).to(torch.float).to(self.device)  # introduce batch size 1
@@ -76,7 +118,7 @@ def predict_proba(self, X_test: np.array) -> np.array:
 class NanoTabPFNRegressor():
     """ scikit-learn like interface """
     def __init__(self, model: NanoTabPFNModel|str|None = None, dist: FullSupportBarDistribution|str|None = None, device=get_default_device()):
-        if model == None:
+        if model is None:
             model = 'nanotabpfn_regressor.pth'
             dist = 'nanotabpfn_regressor_buckets.pth'
         if not os.path.isfile(model):
@@ -105,7 +147,8 @@ def fit(self, X_train: np.array, y_train: np.array):
         """
         Stores X_train and y_train for later use. Computes target normalization. Builds normalized bar distribution from existing self.dist.
        """
-        self.X_train = X_train
+        self.feature_preprocessor = get_feature_preprocessor(X_train)
+        self.X_train = self.feature_preprocessor.fit_transform(X_train)
         self.y_train = y_train
 
         self.y_train_mean = np.mean(self.y_train)
@@ -121,7 +164,7 @@ def predict(self, X_test: np.array) -> np.array:
         """
         Performs in-context learning using X_train and y_train. Predicts the means of the output distributions for X_test.
         """
-        X = np.concatenate((self.X_train, X_test))
+        X = np.concatenate((self.X_train, self.feature_preprocessor.transform(X_test)))
         y = self.y_train_n
 
         with torch.no_grad():
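
To make the new preprocessor's behavior concrete, a small illustrative run follows; the frame and its column names are invented for this example:

# Hypothetical demo of get_feature_preprocessor on a mixed-type frame.
import numpy as np
import pandas as pd

from nanotabpfn.interface import get_feature_preprocessor

X = pd.DataFrame({
    "age": [25.0, np.nan, 40.0],        # numeric, one missing value
    "size": ["1", "2", "3"],            # numeric but stored as strings
    "color": ["red", "blue", np.nan],   # categorical, one missing value
})
preprocessor = get_feature_preprocessor(X)
Xt = preprocessor.fit_transform(X)
# "age" is mean-imputed, "size" is coerced to numbers, and "color" is
# ordinal-encoded with its missing entry filled by the most frequent code.
print(Xt)

Note that the numeric/categorical split is decided per column by checking whether every non-NaN entry survives pd.to_numeric, which is why string-encoded numbers land in the numeric branch.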

pretrain_classification.py (11 additions, 22 deletions)

@@ -1,20 +1,16 @@
 import argparse
-import torch
-import numpy as np
 
+import torch
+from sklearn.metrics import accuracy_score
 from torch import nn
-from functools import partial
 
 from nanotabpfn.callbacks import ConsoleLoggerCallback
-from nanotabpfn.priors import PriorDumpDataLoader
+from nanotabpfn.evaluation import get_openml_predictions, TOY_TASKS_CLASSIFICATION
+from nanotabpfn.interface import NanoTabPFNClassifier
 from nanotabpfn.model import NanoTabPFNModel
+from nanotabpfn.priors import PriorDumpDataLoader
 from nanotabpfn.train import train
 from nanotabpfn.utils import get_default_device, set_randomness_seed
-from nanotabpfn.interface import NanoTabPFNClassifier
-
-from sklearn.datasets import *
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score, roc_auc_score
 
 parser = argparse.ArgumentParser()
 parser.add_argument("-priordump", type=str, default="/50x3_3_100k_classification.h5", help="path to the prior dump")
@@ -55,29 +51,22 @@
 if ckpt:
     model.load_state_dict(ckpt['model'])
 
-datasets = []
-datasets.append(train_test_split(*load_iris(return_X_y=True), test_size=0.5, random_state=42))
-datasets.append(train_test_split(*load_wine(return_X_y=True), test_size=0.5, random_state=42))
-datasets.append(train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.5, random_state=42))
-
-
 class EvaluationLoggerCallback(ConsoleLoggerCallback):
-    def __init__(self, datasets):
-        self.datasets = datasets
+    def __init__(self, tasks):
+        self.tasks = tasks
 
     def on_epoch_end(self, epoch: int, epoch_time: float, loss: float, model, **kwargs):
         classifier = NanoTabPFNClassifier(model, device)
+        predictions = get_openml_predictions(model=classifier, tasks=self.tasks)
         scores = []
-        for X_train, X_test, y_train, y_test in self.datasets:
-            classifier.fit(X_train, y_train)
-            pred = classifier.predict(X_test)
-            scores.append(accuracy_score(y_test, pred))
+        for dataset_name, (y_true, y_pred, y_proba) in predictions.items():
+            scores.append(accuracy_score(y_true, y_pred))
         avg_score = sum(scores) / len(scores)
         print(f'epoch {epoch:5d} | time {epoch_time:5.2f}s | mean loss {loss:5.2f} | avg accuracy {avg_score:.3f}',
               flush=True)
 
 
-callbacks = [EvaluationLoggerCallback(datasets)]
+callbacks = [EvaluationLoggerCallback(TOY_TASKS_CLASSIFICATION)]
 
 trained_model, loss = train(
     model=model,
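
Since get_openml_predictions also returns per-dataset probabilities, the same hook could track ROC AUC during pretraining. A sketch under the on_epoch_end signature used above; the callback name is hypothetical and it is not part of this commit:

# Hypothetical callback variant that logs ROC AUC instead of accuracy.
# Reuses device, NanoTabPFNClassifier, ConsoleLoggerCallback and
# get_openml_predictions as defined/imported in this script.
from sklearn.metrics import roc_auc_score

class AUCEvaluationLoggerCallback(ConsoleLoggerCallback):
    def __init__(self, tasks):
        self.tasks = tasks

    def on_epoch_end(self, epoch: int, epoch_time: float, loss: float, model, **kwargs):
        classifier = NanoTabPFNClassifier(model, device)
        predictions = get_openml_predictions(model=classifier, tasks=self.tasks)
        # y_proba is a 1-D score vector for binary tasks and a matrix otherwise,
        # which is what roc_auc_score with multi_class='ovr' expects.
        aucs = [roc_auc_score(y_true, y_proba, multi_class='ovr')
                for (y_true, _, y_proba) in predictions.values()]
        print(f'epoch {epoch:5d} | avg ROC AUC {sum(aucs) / len(aucs):.3f}', flush=True)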

pretrain_regression.py (12 additions, 19 deletions)

@@ -1,18 +1,16 @@
 import argparse
+
 import torch
+from pfns.bar_distribution import FullSupportBarDistribution
+from sklearn.metrics import r2_score
 
 from nanotabpfn.callbacks import ConsoleLoggerCallback
-from nanotabpfn.priors import PriorDumpDataLoader
+from nanotabpfn.evaluation import get_openml_predictions, TOY_TASKS_REGRESSION
+from nanotabpfn.interface import NanoTabPFNRegressor
 from nanotabpfn.model import NanoTabPFNModel
+from nanotabpfn.priors import PriorDumpDataLoader
 from nanotabpfn.train import train
 from nanotabpfn.utils import get_default_device, set_randomness_seed, make_global_bucket_edges
-from nanotabpfn.interface import NanoTabPFNRegressor
-
-from pfns.bar_distribution import FullSupportBarDistribution
-
-from sklearn.datasets import load_diabetes
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import r2_score
 
 parser = argparse.ArgumentParser()
 
@@ -66,27 +64,22 @@
 
 dist = FullSupportBarDistribution(bucket_edges)
 
-datasets = []
-datasets.append(train_test_split(*load_diabetes(return_X_y=True), test_size=0.5, random_state=42))
-
-
 class EvaluationLoggerCallback(ConsoleLoggerCallback):
-    def __init__(self, datasets):
-        self.datasets = datasets
+    def __init__(self, tasks):
+        self.tasks = tasks
 
     def on_epoch_end(self, epoch: int, epoch_time: float, loss: float, model, **kwargs):
        regressor = NanoTabPFNRegressor(model, dist, device)
+        predictions = get_openml_predictions(model=regressor, tasks=self.tasks)
         scores = []
-        for X_train, X_test, y_train, y_test in datasets:
-            regressor.fit(X_train, y_train)
-            pred = regressor.predict(X_test)
-            scores.append(r2_score(y_test, pred))
+        for dataset_name, (y_true, y_pred, _) in predictions.items():
+            scores.append(r2_score(y_true, y_pred))
         avg_score = sum(scores) / len(scores)
         print(f'epoch {epoch:5d} | time {epoch_time:5.2f}s | mean loss {loss:5.2f} | avg r2 score {avg_score:.3f}',
               flush=True)
 
 
-callbacks = [EvaluationLoggerCallback(datasets)]
+callbacks = [EvaluationLoggerCallback(TOY_TASKS_REGRESSION)]
 
 trained_model, loss = train(
     model=model,
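
The regressor interface accepts both a checkpoint path and a bucket-edge path, so a trained model can be evaluated standalone; a sketch with placeholder file names:

# Hypothetical standalone evaluation of a pretrained regressor.
# "checkpoint.pth" and "bucket_edges.pth" are placeholder paths.
from sklearn.metrics import r2_score

from nanotabpfn.evaluation import get_openml_predictions, TOY_TASKS_REGRESSION
from nanotabpfn.interface import NanoTabPFNRegressor

regressor = NanoTabPFNRegressor(model="checkpoint.pth", dist="bucket_edges.pth")
predictions = get_openml_predictions(model=regressor, tasks=TOY_TASKS_REGRESSION)
for name, (y_true, y_pred, _) in predictions.items():
    print(name, r2_score(y_true, y_pred))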
