Skip to content

Commit eb18ce0

Browse files
authored
Post ICML Refactor (part 1) (#257)
* Various improvements * Refactor website_format.py * Add patience_callback * Add logic to TabArena class * Add generic metric impact evaluation to TabArena class
1 parent cc964d9 commit eb18ce0

File tree

18 files changed

+617
-157
lines changed

18 files changed

+617
-157
lines changed

bencheval/bencheval/tabarena.py

Lines changed: 345 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
import copy
44
import os
5+
from dataclasses import dataclass
6+
from typing import Callable, Literal
7+
from collections.abc import Iterable
58
from pathlib import Path
69

710
import numpy as np
@@ -18,6 +21,32 @@
1821
IMPROVABILITY = "improvability"
1922
LOSS_RESCALED = "loss_rescaled"
2023

24+
MetricDirection = Literal["min", "max"]
25+
MetricAlignment = Literal["row", "method"]
26+
InvalidSubsetPolicy = Literal["raise", "nan", "skip"]
27+
28+
29+
@dataclass(frozen=True, slots=True)
30+
class MetricSpec:
31+
"""
32+
Defines how to (re)compute a metric from a subset of results_per_task and how to
33+
reduce it to a single scalar score for a given method.
34+
35+
- compute(): returns either
36+
* row-aligned Series (index == results_per_task.index) [alignment="row"]
37+
* method-aligned Series (index == method names) [alignment="method"]
38+
- score(): returns a float score for method_1 given the computed metric result
39+
"""
40+
name: str
41+
direction: MetricDirection
42+
alignment: MetricAlignment
43+
compute: Callable[["TabArena", pd.DataFrame], pd.Series]
44+
score: Callable[["TabArena", pd.DataFrame, pd.Series, str], float]
45+
# Methods that must be present in any subset (e.g., Elo calibration framework)
46+
required_methods: frozenset[str] = frozenset()
47+
# What to do if required methods are missing from a subset
48+
invalid_subset_policy: InvalidSubsetPolicy = "raise"
49+
2150

2251
# TODO: Should "data" be an init arg? Probably not.
2352
class TabArena:
@@ -420,6 +449,12 @@ def fillna_data(
420449

421450
return data
422451

452+
def get_task_groupby_cols(self, include_seed_col: bool = False):
    """Return the task-level groupby columns, optionally with the seed column.

    Args:
        include_seed_col: When True and ``self.seed_column`` is configured,
            append the seed column to the task groupby columns.

    Returns:
        The task groupby columns. A fresh list is built only when the seed
        column is appended, so ``self.task_groupby_columns`` is never mutated.

    NOTE(review): other new code calls ``self._get_task_groupby_cols(results=df)``
    (underscore-prefixed, different signature) — confirm which helper is the
    intended one.
    """
    cols = self.task_groupby_columns
    if include_seed_col and self.seed_column is not None:
        cols = [*cols, self.seed_column]
    return cols
457+
423458
def compute_results_per_task(self, data: pd.DataFrame, include_seed_col: bool = False) -> pd.DataFrame:
424459
groupby_cols = self.groupby_columns
425460
task_groupby_cols = self.task_groupby_columns
@@ -996,6 +1031,316 @@ def _weighted_groupby_mean(self, tasks: list[str], data: pd.DataFrame, agg_colum
9961031
column_mean.index.name = agg_column
9971032
return column_mean
9981033

1034+
def _seed_col_if_present(self, df: pd.DataFrame) -> str | None:
1035+
if self.seed_column is not None and self.seed_column in df.columns:
1036+
return self.seed_column
1037+
return None
1038+
1039+
def _score_weighted_mean_by_task(
    self,
    df: pd.DataFrame,
    *,
    value_col: str,
    sort_asc: bool,
) -> pd.Series:
    """Aggregate ``value_col`` into one value per method with equal-task weighting.

    Delegates to ``compute_weighted_mean_by_task`` so the weighting matches the
    rest of TabArena. The seed column is forwarded only when ``df`` actually
    contains it.

    Returns:
        Per-method Series of weighted means, sorted according to ``sort_asc``.
    """
    return compute_weighted_mean_by_task(
        df=df,
        value_col=value_col,
        task_col=self.task_groupby_columns,
        seed_col=self._seed_col_if_present(df),
        method_col=self.method_col,
        sort_asc=sort_asc,
    )
1059+
1060+
def score_if_remove_method(
    self,
    metric: MetricSpec,
    results_per_task: pd.DataFrame,
    *,
    method_1: str,
    method_2: str,
) -> float:
    """Score ``method_1`` after removing ``method_2`` and recomputing ``metric``.

    Convention: when ``method_1 == method_2`` the metric is evaluated on the
    unmodified ``results_per_task`` (baseline score) — a method is never
    removed from its own evaluation.

    Returns:
        The resulting score for ``method_1`` (NOT a delta). NaN when the
        subset violates ``metric.required_methods`` under a non-raising
        policy.
    """
    # The original implementation duplicated the check/compute/score sequence
    # in both branches; only the subset selection differs.
    if method_1 == method_2:
        subset = results_per_task
    else:
        subset = results_per_task.loc[results_per_task[self.method_col] != method_2].copy()
    if not self._metric_subset_ok(metric, subset):
        return float("nan")
    metric_values = metric.compute(self, subset)
    return float(metric.score(self, subset, metric_values, method_1))
1084+
1085+
def score_series_if_remove_each_method(
    self,
    metric: MetricSpec,
    results_per_task: pd.DataFrame,
    *,
    method_1: str,
) -> pd.Series:
    """For a fixed ``method_1``, score the effect of removing each method in turn.

    Returns:
        Series indexed by the removed method (``method_2``) whose values are
        the resulting score for ``method_1``. Required methods (e.g. the Elo
        calibration framework) are never proposed for removal. Sorted so the
        best removal comes first: ascending for min-metrics, descending for
        max-metrics.
    """
    all_methods = results_per_task[self.method_col].dropna().astype(str).unique().tolist()
    scores = {
        method_2: self.score_if_remove_method(
            metric,
            results_per_task,
            method_1=method_1,
            method_2=method_2,
        )
        for method_2 in all_methods
        if method_2 not in metric.required_methods
    }
    result = pd.Series(scores, name=f"{metric.name}_score_for_{method_1}_if_remove_method")
    return result.sort_values(ascending=(metric.direction == "min"))
1119+
1120+
def greedy_remove_methods_optimize_score(
    self,
    metric: MetricSpec,
    results_per_task: pd.DataFrame,
    *,
    method_1: str,
    stop_at_score: float | None = None,
) -> pd.Series:
    """Greedily remove the method whose removal most improves ``method_1``'s score.

    Each iteration recomputes the metric on the current subset, evaluates
    every admissible removal candidate, removes the best one according to
    ``metric.direction``, and records the resulting score. Iteration stops
    when the current score is NaN, when ``stop_at_score`` is reached, when no
    candidates remain, or when the subset violates the metric's required
    methods.

    Returns:
        Series indexed by the removed methods in removal order; values are
        the resulting score for ``method_1`` at that iteration (NOT deltas).
    """
    remaining = results_per_task
    removal_log: dict[str, float] = {}

    while True:
        # Baseline score on the current subset; also drives the stop checks.
        if not self._metric_subset_ok(metric, remaining):
            break
        baseline_values = metric.compute(self, remaining)
        baseline = float(metric.score(self, remaining, baseline_values, method_1))

        if pd.isna(baseline):
            break
        if stop_at_score is not None and (
            (metric.direction == "min" and baseline <= stop_at_score)
            or (metric.direction == "max" and baseline >= stop_at_score)
        ):
            break

        present = remaining[self.method_col].dropna().astype(str).unique().tolist()
        # method_1 itself and required methods (e.g. the calibration
        # framework) are never removal candidates.
        candidates = [
            m for m in present
            if m != method_1 and m not in metric.required_methods
        ]
        if not candidates:
            break

        candidate_scores: dict[str, float] = {}
        for method_2 in candidates:
            subset = remaining.loc[remaining[self.method_col] != method_2].copy()
            if not self._metric_subset_ok(metric, subset):
                # "skip" drops the candidate entirely; other non-raising
                # policies record NaN (later removed by dropna below).
                if metric.invalid_subset_policy != "skip":
                    candidate_scores[method_2] = float("nan")
                continue
            subset_values = metric.compute(self, subset)
            candidate_scores[method_2] = float(metric.score(self, subset, subset_values, method_1))

        valid = pd.Series(candidate_scores).dropna()
        if valid.empty:
            break

        # Pick the best candidate for the metric's direction.
        best = valid.idxmin() if metric.direction == "min" else valid.idxmax()
        removal_log[best] = float(valid.loc[best])

        # Commit the removal and iterate.
        remaining = remaining.loc[remaining[self.method_col] != best]

    return pd.Series(removal_log, name=f"{metric.name}_score_iter_for_{method_1}")
1192+
1193+
def greedy_score_matrix(
    self,
    metric: MetricSpec,
    results_per_task: pd.DataFrame,
    *,
    methods_1: Iterable[str] | None = None,
    stop_at_score: float | None = None,
) -> pd.DataFrame:
    """Run the greedy removal loop once per target method and tabulate results.

    Returns:
        DataFrame where rows are the removed method (method_2), columns are
        the target method (method_1), and each cell is the resulting score
        for method_1 at the iteration where method_2 was removed.
    """
    if methods_1 is None:
        methods_1 = results_per_task[self.method_col].dropna().astype(str).unique().tolist()

    columns = {
        method_1: self.greedy_remove_methods_optimize_score(
            metric,
            results_per_task,
            method_1=method_1,
            stop_at_score=stop_at_score,
        )
        for method_1 in methods_1
    }
    return pd.DataFrame(columns)
1222+
1223+
# ----------------------------
1224+
# MetricSpec factories
1225+
# ----------------------------
1226+
1227+
def metric_spec_error(self) -> MetricSpec:
    """Build the error MetricSpec (lower is better).

    Score = equal-task-weighted mean error for the target method.
    """
    def compute(self: "TabArena", df: pd.DataFrame) -> pd.Series:
        # Error is already stored per row; nothing needs recomputation.
        return df[self.error_col]

    def score(self: "TabArena", df: pd.DataFrame, values: pd.Series, method_1: str) -> float:
        frame = df[self._get_groupby_cols(df)].copy()
        frame[self.error_col] = values.to_numpy()
        per_method = self._score_weighted_mean_by_task(frame, value_col=self.error_col, sort_asc=True)
        return float(per_method.get(method_1, float("nan")))

    return MetricSpec(
        name=self.error_col,
        direction="min",
        alignment="row",
        compute=compute,
        score=score,
    )
1249+
1250+
def metric_spec_rank(self) -> MetricSpec:
    """Build the rank MetricSpec (lower is better).

    Score = equal-task-weighted mean rank for the target method.
    """
    def compute(self: "TabArena", df: pd.DataFrame) -> pd.Series:
        # NOTE(review): `_get_task_groupby_cols(results=...)` is not defined in
        # this view and differs from the public
        # `get_task_groupby_cols(include_seed_col=...)` — confirm the intended
        # helper name/signature.
        task_groupby_cols = self._get_task_groupby_cols(results=df)
        return self.compare_rank_per(df=df, task_groupby_cols=task_groupby_cols)

    def score(self: "TabArena", df: pd.DataFrame, values: pd.Series, method_1: str) -> float:
        frame = df[self._get_groupby_cols(df)].copy()
        frame[RANK] = values.to_numpy()
        per_method = self._score_weighted_mean_by_task(frame, value_col=RANK, sort_asc=True)
        return float(per_method.get(method_1, float("nan")))

    return MetricSpec(
        name=RANK,
        direction="min",
        alignment="row",
        compute=compute,
        score=score,
    )
1272+
1273+
def metric_spec_improvability(self) -> MetricSpec:
    """Build the improvability MetricSpec (lower is better; 0 is ideal).

    Score = equal-task-weighted mean improvability for the target method.
    """
    def compute(self: "TabArena", df: pd.DataFrame) -> pd.Series:
        # NOTE(review): `_get_task_groupby_cols(results=...)` is not defined in
        # this view — confirm the intended helper name/signature.
        task_groupby_cols = self._get_task_groupby_cols(results=df)
        return self.compute_improvability_per(results_per_task=df, task_groupby_cols=task_groupby_cols)

    def score(self: "TabArena", df: pd.DataFrame, values: pd.Series, method_1: str) -> float:
        frame = df[self._get_groupby_cols(df)].copy()
        frame[IMPROVABILITY] = values.to_numpy()
        per_method = self._score_weighted_mean_by_task(frame, value_col=IMPROVABILITY, sort_asc=True)
        return float(per_method.get(method_1, float("nan")))

    return MetricSpec(
        name=IMPROVABILITY,
        direction="min",
        alignment="row",
        compute=compute,
        score=score,
    )
1295+
1296+
def metric_spec_elo(self, **elo_kwargs) -> MetricSpec:
    """Build the Elo MetricSpec (higher is better).

    Score = Elo rating of the target method, recomputed on each subset via
    ``self.compute_elo``. When ``elo_kwargs`` names a
    ``calibration_framework`` method, it is marked as required so no subset
    may drop it; a subset missing it raises (policy "raise").
    """
    calibration_framework = elo_kwargs.get("calibration_framework", None)
    # Explicit None check: the original truthiness test would silently ignore
    # a falsy-but-present value instead of requiring it.
    required = (
        frozenset([calibration_framework])
        if calibration_framework is not None
        else frozenset()
    )

    def compute(self: "TabArena", df: pd.DataFrame) -> pd.Series:
        bars = self.compute_elo(
            results_per_task=df,
            include_quantiles=False,
            round_decimals=None,
            **elo_kwargs,
        )
        # Elo is naturally method-aligned: one rating per method.
        return bars["elo"]

    def score(self: "TabArena", df: pd.DataFrame, values: pd.Series, method_1: str) -> float:
        return float(values.get(method_1, float("nan")))

    return MetricSpec(
        name="elo",
        direction="max",
        alignment="method",
        compute=compute,
        score=score,
        required_methods=required,
        invalid_subset_policy="raise",
    )
1325+
1326+
def _metric_subset_ok(self, metric: MetricSpec, df: pd.DataFrame) -> bool:
1327+
"""Return True if df satisfies metric.required_methods; otherwise obey policy."""
1328+
if not metric.required_methods:
1329+
return True
1330+
present = set(df[self.method_col].dropna().astype(str).unique())
1331+
missing = set(metric.required_methods) - present
1332+
if not missing:
1333+
return True
1334+
if metric.invalid_subset_policy == "raise":
1335+
raise ValueError(
1336+
f"Metric {metric.name!r} requires methods {sorted(metric.required_methods)}, "
1337+
f"but subset is missing {sorted(missing)}."
1338+
)
1339+
if metric.invalid_subset_policy == "nan":
1340+
return False
1341+
# "skip"
1342+
return False
1343+
9991344

10001345
def get_bootstrap_result_lst(data: list, func_, rng=None, num_round: int = None, func_kwargs=None, seed: int = 0):
10011346
rows = []

examples/benchmarking/custom_tabarena_model/run_evaluate_model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from pathlib import Path
99

1010
from tabarena.nips2025_utils.end_to_end_single import EndToEndResultsSingle, EndToEndSingle
11-
from bencheval.website_format import format_leaderboard
11+
from tabarena.website.website_format import format_leaderboard
1212

1313
if __name__ == "__main__":
1414
path_raw = Path(__file__).parent / "tabarena_out" / "custom_model"

examples/benchmarking/run_quickstart_tabarena.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from tabarena.benchmark.experiment import AGModelBagExperiment, ExperimentBatchRunner
99
from tabarena.nips2025_utils.end_to_end import EndToEnd
1010
from tabarena.nips2025_utils.tabarena_context import TabArenaContext
11-
from bencheval.website_format import format_leaderboard
11+
from tabarena.website.website_format import format_leaderboard
1212

1313

1414
if __name__ == '__main__':

examples/benchmarking/run_quickstart_tabarena_custom_datasets.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from tabarena.models.utils import get_configs_generator_from_name
1313
from tabarena.nips2025_utils.compare import compare
1414
from tabarena.nips2025_utils.end_to_end import EndToEnd
15-
from bencheval.website_format import format_leaderboard
15+
from tabarena.website.website_format import format_leaderboard
1616

1717

1818
def get_custom_classification_task(task_cache_dir: str) -> UserTask:

0 commit comments

Comments
 (0)