1+ import argparse
2+
3+ import numpy as np
4+ import openml
5+ import torch
6+ from openml .config import set_root_cache_directory
7+ from openml .tasks import TaskType
8+ from sklearn .preprocessing import LabelEncoder
9+
10+ from nanotabpfn .interface import NanoTabPFNRegressor , NanoTabPFNClassifier
11+
# Toy OpenML task IDs for regression runs (picked when "toy_tasks" is
# requested together with the regression model type).
TOY_TASKS_REGRESSION = [362443]  # diabetes
15+
# Toy OpenML task IDs for classification runs (picked when "toy_tasks" is
# requested together with the classification model type).
# In order: iris, wine, breast_cancer.
TOY_TASKS_CLASSIFICATION = [59, 2382, 9946]
21+
@torch.no_grad()
def get_openml_predictions(
    *,
    model: NanoTabPFNRegressor | NanoTabPFNClassifier,
    tasks: list[int] | str = "tabarena-v0.1",
    max_n_features: int = 500,
    max_n_instances: int = 10_000,
    classification: bool | None = None,
    cache_directory: str | None = None,
) -> dict[str, tuple[np.ndarray, np.ndarray, np.ndarray | None]]:
    """
    Evaluate a model on a set of OpenML tasks and return its predictions.

    For every selected task the dataset is fetched from OpenML, split with the
    task's own train/test indices, the model is fitted on the training part and
    queried on the test part. Only fold 0 of repeat 0 is evaluated.

    Args:
        model (NanoTabPFNRegressor | NanoTabPFNClassifier): Estimator exposing
            ``fit``/``predict`` (and ``predict_proba`` when used for classification).
        tasks (list[int] | str, optional): A list of OpenML task IDs or the name of
            a benchmark suite whose tasks should be used.
        max_n_features (int, optional): Maximum number of features allowed for a task.
            Tasks exceeding this limit are skipped.
        max_n_instances (int, optional): Maximum number of instances allowed for a task.
            Tasks exceeding this limit are skipped.
        classification (bool | None, optional): Whether to evaluate classification tasks
            (True) or regression tasks (False). If None, it is inferred from the model
            type. Tasks of the other type are skipped.
        cache_directory (str | None, optional): Directory to save OpenML data.
            If None, the default cache path is used.

    Returns:
        dict: Maps each dataset name to a tuple
        ``(true targets, predictions, probabilities)``. For classification the
        targets are label-encoded integers; ``probabilities`` holds only the second
        column of ``predict_proba`` when there are exactly two columns, and the full
        matrix otherwise. For regression ``probabilities`` is None. If two tasks
        share a dataset name, the later one overwrites the earlier entry.
    """
    if classification is None:
        classification = isinstance(model, NanoTabPFNClassifier)

    if cache_directory is not None:
        set_root_cache_directory(cache_directory)

    # A string names a benchmark suite; anything else is already a list of task IDs.
    if isinstance(tasks, str):
        task_ids = openml.study.get_suite(tasks).tasks
    else:
        task_ids = tasks

    expected_task_type = (
        TaskType.SUPERVISED_CLASSIFICATION if classification else TaskType.SUPERVISED_REGRESSION
    )

    dataset_predictions = {}

    for task_id in task_ids:
        task = openml.tasks.get_task(task_id, download_splits=False)

        if task.task_type_id != expected_task_type:
            continue  # skip task, wrong problem type for this model

        dataset = task.get_dataset(download_data=False)

        n_features = dataset.qualities["NumberOfFeatures"]
        n_instances = dataset.qualities["NumberOfInstances"]
        if n_features > max_n_features or n_instances > max_n_instances:
            continue  # skip task, too big

        _, folds, _ = task.get_split_dimensions()
        tabarena_light = True
        if tabarena_light:
            folds = 1  # code supports multiple folds but tabarena_light only has one
        repeat = 0  # code only supports one repeat

        # The data does not depend on the fold, so load it once per task
        # instead of once per fold.
        X, y, _, _ = dataset.get_data(
            target=task.target_name, dataset_format="dataframe"
        )

        fold_targets = []
        fold_predictions = []
        fold_probabilities = []
        for fold in range(folds):
            train_indices, test_indices = task.get_train_test_split_indices(
                fold=fold, repeat=repeat
            )
            X_train = X.iloc[train_indices].to_numpy()
            y_train = y.iloc[train_indices].to_numpy()
            X_test = X.iloc[test_indices].to_numpy()
            y_test = y.iloc[test_indices].to_numpy()

            if classification:
                # Encode labels as integers; the encoder is fitted on the
                # training labels only and then applied to the test labels.
                label_encoder = LabelEncoder()
                y_train = label_encoder.fit_transform(y_train)
                y_test = label_encoder.transform(y_test)
            fold_targets.append(y_test)

            model.fit(X_train, y_train)
            fold_predictions.append(model.predict(X_test))
            if classification:
                y_proba = model.predict_proba(X_test)
                if y_proba.shape[1] == 2:  # binary classification
                    y_proba = y_proba[:, 1]
                fold_probabilities.append(y_proba)

        y_pred = np.concatenate(fold_predictions, axis=0)
        y_true = np.concatenate(fold_targets, axis=0)
        # Regression never collects probabilities, so the list stays empty.
        y_prob = np.concatenate(fold_probabilities, axis=0) if fold_probabilities else None
        dataset_predictions[str(dataset.name)] = (y_true, y_pred, y_prob)
    return dataset_predictions
117+
118+
if __name__ == "__main__":
    # Command-line entry point: build the requested model, collect its
    # predictions on the selected OpenML tasks and print per-dataset metrics.
    # Every option accepts both the original single-dash spelling and the
    # conventional double-dash spelling; the parsed attribute names are the same.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-model_type", "--model_type", type=str,
        choices=["regression", "classification"], required=True,
        help="Whether to use the regressor or classifier model",
    )
    parser.add_argument(
        "-checkpoint", "--checkpoint", type=str, default=None,
        help="Path to load the model weights from. If None, default weights are used.",
    )
    parser.add_argument(
        "-dist_path", "--dist_path", type=str, default=None,
        help="Path to load the bucket edges for the support bar distribution from. Only needed for regression.",
    )
    parser.add_argument(
        "-tasks", "--tasks", type=str, default="tabarena-v0.1",
        choices=["tabarena-v0.1", "toy_tasks"],
        help="Which OpenML tasks to evaluate on.",
    )
    parser.add_argument(
        "-cache_directory", "--cache_directory", type=str, default=None,
        help="Directory to save OpenML data. If None, default cache path is used.",
    )
    parser.add_argument(
        "-max_n_features", "--max_n_features", type=int, default=500,
        help="Maximum number of features allowed for a task. Tasks exceeding this limit are skipped.",
    )
    parser.add_argument(
        "-max_n_instances", "--max_n_instances", type=int, default=10_000,
        help="Maximum number of instances allowed for a task. Tasks exceeding this limit are skipped.",
    )
    args = parser.parse_args()

    is_classification = args.model_type == "classification"

    # Import the metrics once, for the selected mode only, rather than on
    # every iteration of the reporting loop below.
    if is_classification:
        from sklearn.metrics import balanced_accuracy_score, roc_auc_score

        model = NanoTabPFNClassifier(model=args.checkpoint)
    else:
        from sklearn.metrics import r2_score

        model = NanoTabPFNRegressor(model=args.checkpoint, dist=args.dist_path)
    model.model.eval()

    # "toy_tasks" maps to the hard-coded task list matching the model type;
    # any other value is passed through as a benchmark suite name.
    if args.tasks == "toy_tasks":
        tasks = TOY_TASKS_CLASSIFICATION if is_classification else TOY_TASKS_REGRESSION
    else:
        tasks = args.tasks

    predictions = get_openml_predictions(
        model=model,
        tasks=tasks,
        max_n_features=args.max_n_features,
        max_n_instances=args.max_n_instances,
        classification=is_classification,
        cache_directory=args.cache_directory,
    )

    for dataset_name, (y_true, y_pred, y_proba) in predictions.items():
        if is_classification:
            acc = balanced_accuracy_score(y_true, y_pred)
            auc = roc_auc_score(y_true, y_proba, multi_class='ovr')
            print(f"Dataset: {dataset_name} | ROC AUC: {auc:.4f} | Balanced Accuracy: {acc:.4f}")
        else:
            r2 = r2_score(y_true, y_pred)
            print(f"Dataset: {dataset_name} | R2: {r2:.4f}")
0 commit comments