|
2 | 2 |
|
3 | 3 | import copy |
4 | 4 | import os |
| 5 | +from dataclasses import dataclass |
| 6 | +from typing import Callable, Literal |
| 7 | +from collections.abc import Iterable |
5 | 8 | from pathlib import Path |
6 | 9 |
|
7 | 10 | import numpy as np |
|
# Metric column-name constants; IMPROVABILITY is used as a per-task value column
# elsewhere in this module (see metric_spec_improvability). LOSS_RESCALED is
# presumably a sibling metric column — usage not visible in this chunk.
IMPROVABILITY = "improvability"
LOSS_RESCALED = "loss_rescaled"
20 | 23 |
|
| 24 | +MetricDirection = Literal["min", "max"] |
| 25 | +MetricAlignment = Literal["row", "method"] |
| 26 | +InvalidSubsetPolicy = Literal["raise", "nan", "skip"] |
| 27 | + |
| 28 | + |
| 29 | +@dataclass(frozen=True, slots=True) |
| 30 | +class MetricSpec: |
| 31 | + """ |
| 32 | + Defines how to (re)compute a metric from a subset of results_per_task and how to |
| 33 | + reduce it to a single scalar score for a given method. |
| 34 | +
|
| 35 | + - compute(): returns either |
| 36 | + * row-aligned Series (index == results_per_task.index) [alignment="row"] |
| 37 | + * method-aligned Series (index == method names) [alignment="method"] |
| 38 | + - score(): returns a float score for method_1 given the computed metric result |
| 39 | + """ |
| 40 | + name: str |
| 41 | + direction: MetricDirection |
| 42 | + alignment: MetricAlignment |
| 43 | + compute: Callable[["TabArena", pd.DataFrame], pd.Series] |
| 44 | + score: Callable[["TabArena", pd.DataFrame, pd.Series, str], float] |
| 45 | + # Methods that must be present in any subset (e.g., Elo calibration framework) |
| 46 | + required_methods: frozenset[str] = frozenset() |
| 47 | + # What to do if required methods are missing from a subset |
| 48 | + invalid_subset_policy: InvalidSubsetPolicy = "raise" |
| 49 | + |
21 | 50 |
|
22 | 51 | # TODO: Should "data" be an init arg? Probably not. |
23 | 52 | class TabArena: |
@@ -420,6 +449,12 @@ def fillna_data( |
420 | 449 |
|
421 | 450 | return data |
422 | 451 |
|
| 452 | + def get_task_groupby_cols(self, include_seed_col: bool = False): |
| 453 | + task_groupby_cols = self.task_groupby_columns |
| 454 | + if include_seed_col and self.seed_column is not None: |
| 455 | + task_groupby_cols = task_groupby_cols + [self.seed_column] |
| 456 | + return task_groupby_cols |
| 457 | + |
423 | 458 | def compute_results_per_task(self, data: pd.DataFrame, include_seed_col: bool = False) -> pd.DataFrame: |
424 | 459 | groupby_cols = self.groupby_columns |
425 | 460 | task_groupby_cols = self.task_groupby_columns |
@@ -996,6 +1031,316 @@ def _weighted_groupby_mean(self, tasks: list[str], data: pd.DataFrame, agg_colum |
996 | 1031 | column_mean.index.name = agg_column |
997 | 1032 | return column_mean |
998 | 1033 |
|
| 1034 | + def _seed_col_if_present(self, df: pd.DataFrame) -> str | None: |
| 1035 | + if self.seed_column is not None and self.seed_column in df.columns: |
| 1036 | + return self.seed_column |
| 1037 | + return None |
| 1038 | + |
| 1039 | + def _score_weighted_mean_by_task( |
| 1040 | + self, |
| 1041 | + df: pd.DataFrame, |
| 1042 | + *, |
| 1043 | + value_col: str, |
| 1044 | + sort_asc: bool, |
| 1045 | + ) -> pd.Series: |
| 1046 | + """ |
| 1047 | + Returns a per-method Series of weighted means using the same equal-task weighting |
| 1048 | + logic as other parts of TabArena. |
| 1049 | + """ |
| 1050 | + seed_col = self._seed_col_if_present(df) |
| 1051 | + return compute_weighted_mean_by_task( |
| 1052 | + df=df, |
| 1053 | + value_col=value_col, |
| 1054 | + task_col=self.task_groupby_columns, |
| 1055 | + seed_col=seed_col, |
| 1056 | + method_col=self.method_col, |
| 1057 | + sort_asc=sort_asc, |
| 1058 | + ) |
| 1059 | + |
| 1060 | + def score_if_remove_method( |
| 1061 | + self, |
| 1062 | + metric: MetricSpec, |
| 1063 | + results_per_task: pd.DataFrame, |
| 1064 | + *, |
| 1065 | + method_1: str, |
| 1066 | + method_2: str, |
| 1067 | + ) -> float: |
| 1068 | + """ |
| 1069 | + Compute the scalar score for method_1 after removing method_2 and recomputing metric. |
| 1070 | + Returns the resulting score (NOT delta). |
| 1071 | + """ |
| 1072 | + # Keep your prior convention: if we remove method_1 itself, return baseline score on provided df. |
| 1073 | + if method_1 == method_2: |
| 1074 | + if not self._metric_subset_ok(metric, results_per_task): |
| 1075 | + return float("nan") |
| 1076 | + metric_values = metric.compute(self, results_per_task) |
| 1077 | + return float(metric.score(self, results_per_task, metric_values, method_1)) |
| 1078 | + |
| 1079 | + subset = results_per_task.loc[results_per_task[self.method_col] != method_2].copy() |
| 1080 | + if not self._metric_subset_ok(metric, subset): |
| 1081 | + return float("nan") |
| 1082 | + metric_values = metric.compute(self, subset) |
| 1083 | + return float(metric.score(self, subset, metric_values, method_1)) |
| 1084 | + |
| 1085 | + def score_series_if_remove_each_method( |
| 1086 | + self, |
| 1087 | + metric: MetricSpec, |
| 1088 | + results_per_task: pd.DataFrame, |
| 1089 | + *, |
| 1090 | + method_1: str, |
| 1091 | + ) -> pd.Series: |
| 1092 | + """ |
| 1093 | + For a fixed method_1, return a Series indexed by method_2 with values = resulting score |
| 1094 | + for method_1 if method_2 were removed. |
| 1095 | + """ |
| 1096 | + methods = ( |
| 1097 | + results_per_task[self.method_col] |
| 1098 | + .dropna() |
| 1099 | + .astype(str) |
| 1100 | + .unique() |
| 1101 | + .tolist() |
| 1102 | + ) |
| 1103 | + |
| 1104 | + scores: dict[str, float] = {} |
| 1105 | + for method_2 in methods: |
| 1106 | + # Never propose removing required methods (e.g., Elo calibration framework) |
| 1107 | + if method_2 in metric.required_methods: |
| 1108 | + continue |
| 1109 | + scores[method_2] = self.score_if_remove_method( |
| 1110 | + metric, |
| 1111 | + results_per_task, |
| 1112 | + method_1=method_1, |
| 1113 | + method_2=method_2, |
| 1114 | + ) |
| 1115 | + |
| 1116 | + s = pd.Series(scores, name=f"{metric.name}_score_for_{method_1}_if_remove_method") |
| 1117 | + # Sorting: for min-metrics ascending is "better"; for max-metrics descending is "better" |
| 1118 | + return s.sort_values(ascending=(metric.direction == "min")) |
| 1119 | + |
| 1120 | + def greedy_remove_methods_optimize_score( |
| 1121 | + self, |
| 1122 | + metric: MetricSpec, |
| 1123 | + results_per_task: pd.DataFrame, |
| 1124 | + *, |
| 1125 | + method_1: str, |
| 1126 | + stop_at_score: float | None = None, |
| 1127 | + ) -> pd.Series: |
| 1128 | + """ |
| 1129 | + Iteratively remove method_2 that yields the best improvement for method_1 |
| 1130 | + according to metric.direction, recomputing the metric each iteration. |
| 1131 | +
|
| 1132 | + Returns: |
| 1133 | + pd.Series indexed by removed method_2 in removal order |
| 1134 | + values = resulting score for method_1 at that iteration (NOT delta). |
| 1135 | + """ |
| 1136 | + current = results_per_task |
| 1137 | + removed_in_order: dict[str, float] = {} |
| 1138 | + |
| 1139 | + while True: |
| 1140 | + # Compute current score for method_1 (and stop checks) |
| 1141 | + if not self._metric_subset_ok(metric, current): |
| 1142 | + break |
| 1143 | + current_metric = metric.compute(self, current) |
| 1144 | + cur_score = float(metric.score(self, current, current_metric, method_1)) |
| 1145 | + |
| 1146 | + # Stop criteria |
| 1147 | + if pd.isna(cur_score): |
| 1148 | + break |
| 1149 | + if stop_at_score is not None: |
| 1150 | + if metric.direction == "min" and cur_score <= stop_at_score: |
| 1151 | + break |
| 1152 | + if metric.direction == "max" and cur_score >= stop_at_score: |
| 1153 | + break |
| 1154 | + |
| 1155 | + remaining_methods = current[self.method_col].dropna().astype(str).unique().tolist() |
| 1156 | + # Exclude method_1 and any required methods (e.g., calibration framework) |
| 1157 | + candidates = [ |
| 1158 | + m for m in remaining_methods |
| 1159 | + if m != method_1 and m not in metric.required_methods |
| 1160 | + ] |
| 1161 | + if not candidates: |
| 1162 | + break |
| 1163 | + |
| 1164 | + candidate_scores: dict[str, float] = {} |
| 1165 | + for method_2 in candidates: |
| 1166 | + subset = current.loc[current[self.method_col] != method_2].copy() |
| 1167 | + if not self._metric_subset_ok(metric, subset): |
| 1168 | + if metric.invalid_subset_policy == "skip": |
| 1169 | + continue |
| 1170 | + candidate_scores[method_2] = float("nan") |
| 1171 | + continue |
| 1172 | + subset_metric = metric.compute(self, subset) |
| 1173 | + candidate_scores[method_2] = float(metric.score(self, subset, subset_metric, method_1)) |
| 1174 | + |
| 1175 | + scores_s = pd.Series(candidate_scores).dropna() |
| 1176 | + if scores_s.empty: |
| 1177 | + break |
| 1178 | + |
| 1179 | + # Choose best candidate depending on direction |
| 1180 | + if metric.direction == "min": |
| 1181 | + best_method_2 = scores_s.idxmin() |
| 1182 | + else: |
| 1183 | + best_method_2 = scores_s.idxmax() |
| 1184 | + |
| 1185 | + best_score = float(scores_s.loc[best_method_2]) |
| 1186 | + removed_in_order[best_method_2] = best_score |
| 1187 | + |
| 1188 | + # Remove best_method_2 and continue |
| 1189 | + current = current.loc[current[self.method_col] != best_method_2] |
| 1190 | + |
| 1191 | + return pd.Series(removed_in_order, name=f"{metric.name}_score_iter_for_{method_1}") |
| 1192 | + |
| 1193 | + def greedy_score_matrix( |
| 1194 | + self, |
| 1195 | + metric: MetricSpec, |
| 1196 | + results_per_task: pd.DataFrame, |
| 1197 | + *, |
| 1198 | + methods_1: Iterable[str] | None = None, |
| 1199 | + stop_at_score: float | None = None, |
| 1200 | + ) -> pd.DataFrame: |
| 1201 | + """ |
| 1202 | + Build a DataFrame: |
| 1203 | + rows = method_2 (removed) |
| 1204 | + cols = method_1 |
| 1205 | + cell = resulting score for method_1 at the iteration when method_2 was removed |
| 1206 | + """ |
| 1207 | + if methods_1 is None: |
| 1208 | + methods_1 = ( |
| 1209 | + results_per_task[self.method_col].dropna().astype(str).unique().tolist() |
| 1210 | + ) |
| 1211 | + |
| 1212 | + col_series: dict[str, pd.Series] = {} |
| 1213 | + for method_1 in methods_1: |
| 1214 | + col_series[method_1] = self.greedy_remove_methods_optimize_score( |
| 1215 | + metric, |
| 1216 | + results_per_task, |
| 1217 | + method_1=method_1, |
| 1218 | + stop_at_score=stop_at_score, |
| 1219 | + ) |
| 1220 | + |
| 1221 | + return pd.DataFrame(col_series) |
| 1222 | + |
| 1223 | + # ---------------------------- |
| 1224 | + # MetricSpec factories |
| 1225 | + # ---------------------------- |
| 1226 | + |
| 1227 | + def metric_spec_error(self) -> MetricSpec: |
| 1228 | + """ |
| 1229 | + Lower is better. Score = weighted mean error (equal task weighting). |
| 1230 | + """ |
| 1231 | + def compute(self: "TabArena", df: pd.DataFrame) -> pd.Series: |
| 1232 | + # row-aligned; no recomputation needed |
| 1233 | + return df[self.error_col] |
| 1234 | + |
| 1235 | + def score(self: "TabArena", df: pd.DataFrame, values: pd.Series, method_1: str) -> float: |
| 1236 | + groupby_columns = self._get_groupby_cols(df) |
| 1237 | + tmp = df[groupby_columns].copy() |
| 1238 | + tmp[self.error_col] = values.to_numpy() |
| 1239 | + per_method = self._score_weighted_mean_by_task(tmp, value_col=self.error_col, sort_asc=True) |
| 1240 | + return float(per_method.get(method_1, float("nan"))) |
| 1241 | + |
| 1242 | + return MetricSpec( |
| 1243 | + name=self.error_col, |
| 1244 | + direction="min", |
| 1245 | + alignment="row", |
| 1246 | + compute=compute, |
| 1247 | + score=score, |
| 1248 | + ) |
| 1249 | + |
| 1250 | + def metric_spec_rank(self) -> MetricSpec: |
| 1251 | + """ |
| 1252 | + Lower is better. Score = weighted mean rank. |
| 1253 | + """ |
| 1254 | + def compute(self: "TabArena", df: pd.DataFrame) -> pd.Series: |
| 1255 | + task_groupby_cols = self._get_task_groupby_cols(results=df) |
| 1256 | + return self.compare_rank_per(df=df, task_groupby_cols=task_groupby_cols) |
| 1257 | + |
| 1258 | + def score(self: "TabArena", df: pd.DataFrame, values: pd.Series, method_1: str) -> float: |
| 1259 | + groupby_columns = self._get_groupby_cols(df) |
| 1260 | + tmp = df[groupby_columns].copy() |
| 1261 | + tmp[RANK] = values.to_numpy() |
| 1262 | + per_method = self._score_weighted_mean_by_task(tmp, value_col=RANK, sort_asc=True) |
| 1263 | + return float(per_method.get(method_1, float("nan"))) |
| 1264 | + |
| 1265 | + return MetricSpec( |
| 1266 | + name=RANK, |
| 1267 | + direction="min", |
| 1268 | + alignment="row", |
| 1269 | + compute=compute, |
| 1270 | + score=score, |
| 1271 | + ) |
| 1272 | + |
| 1273 | + def metric_spec_improvability(self) -> MetricSpec: |
| 1274 | + """ |
| 1275 | + Lower is better (0 is ideal). Score = weighted mean improvability. |
| 1276 | + """ |
| 1277 | + def compute(self: "TabArena", df: pd.DataFrame) -> pd.Series: |
| 1278 | + task_groupby_cols = self._get_task_groupby_cols(results=df) |
| 1279 | + return self.compute_improvability_per(results_per_task=df, task_groupby_cols=task_groupby_cols) |
| 1280 | + |
| 1281 | + def score(self: "TabArena", df: pd.DataFrame, values: pd.Series, method_1: str) -> float: |
| 1282 | + groupby_columns = self._get_groupby_cols(df) |
| 1283 | + tmp = df[groupby_columns].copy() |
| 1284 | + tmp[IMPROVABILITY] = values.to_numpy() |
| 1285 | + per_method = self._score_weighted_mean_by_task(tmp, value_col=IMPROVABILITY, sort_asc=True) |
| 1286 | + return float(per_method.get(method_1, float("nan"))) |
| 1287 | + |
| 1288 | + return MetricSpec( |
| 1289 | + name=IMPROVABILITY, |
| 1290 | + direction="min", |
| 1291 | + alignment="row", |
| 1292 | + compute=compute, |
| 1293 | + score=score, |
| 1294 | + ) |
| 1295 | + |
| 1296 | + def metric_spec_elo(self, **elo_kwargs) -> MetricSpec: |
| 1297 | + """ |
| 1298 | + Higher is better. Score = Elo value for method_1 computed on the subset. |
| 1299 | + """ |
| 1300 | + calibration_framework = elo_kwargs.get("calibration_framework", None) |
| 1301 | + required = frozenset([calibration_framework]) if calibration_framework else frozenset() |
| 1302 | + |
| 1303 | + def compute(self: "TabArena", df: pd.DataFrame) -> pd.Series: |
| 1304 | + bars = self.compute_elo( |
| 1305 | + results_per_task=df, |
| 1306 | + include_quantiles=False, |
| 1307 | + round_decimals=None, |
| 1308 | + **elo_kwargs, |
| 1309 | + ) |
| 1310 | + # method-aligned Series |
| 1311 | + return bars["elo"] |
| 1312 | + |
| 1313 | + def score(self: "TabArena", df: pd.DataFrame, values: pd.Series, method_1: str) -> float: |
| 1314 | + return float(values.get(method_1, float("nan"))) |
| 1315 | + |
| 1316 | + return MetricSpec( |
| 1317 | + name="elo", |
| 1318 | + direction="max", |
| 1319 | + alignment="method", |
| 1320 | + compute=compute, |
| 1321 | + score=score, |
| 1322 | + required_methods=required, |
| 1323 | + invalid_subset_policy="raise", |
| 1324 | + ) |
| 1325 | + |
| 1326 | + def _metric_subset_ok(self, metric: MetricSpec, df: pd.DataFrame) -> bool: |
| 1327 | + """Return True if df satisfies metric.required_methods; otherwise obey policy.""" |
| 1328 | + if not metric.required_methods: |
| 1329 | + return True |
| 1330 | + present = set(df[self.method_col].dropna().astype(str).unique()) |
| 1331 | + missing = set(metric.required_methods) - present |
| 1332 | + if not missing: |
| 1333 | + return True |
| 1334 | + if metric.invalid_subset_policy == "raise": |
| 1335 | + raise ValueError( |
| 1336 | + f"Metric {metric.name!r} requires methods {sorted(metric.required_methods)}, " |
| 1337 | + f"but subset is missing {sorted(missing)}." |
| 1338 | + ) |
| 1339 | + if metric.invalid_subset_policy == "nan": |
| 1340 | + return False |
| 1341 | + # "skip" |
| 1342 | + return False |
| 1343 | + |
999 | 1344 |
|
1000 | 1345 | def get_bootstrap_result_lst(data: list, func_, rng=None, num_round: int = None, func_kwargs=None, seed: int = 0): |
1001 | 1346 | rows = [] |
|
0 commit comments