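Clean up plot_tuning_impact code (remove commented-out dead code) and add a `verbose` flag to `eval` that gates the leaderboard printout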

Innixma · Innixma · commit d94134b68c86 · 2026-01-27T23:42:10.000Z
diff --git a/tabarena/tabarena/paper/tabarena_evaluator.py b/tabarena/tabarena/paper/tabarena_evaluator.py
@@ -233,6 +233,7 @@ def eval(
         tmp_treat_tasks_independently: bool = False,  # FIXME: Need to make a weighted elo logic
         leaderboard_kwargs: dict | None = None,
         plot_with_baselines: bool = False,
+        verbose: bool = True,
     ) -> pd.DataFrame:
         if leaderboard_kwargs is None:
             leaderboard_kwargs = {}
@@ -407,10 +408,11 @@ def eval(
 
         n_tasks = len(df_results_rank_compare[[tabarena.task_col, tabarena.seed_column]].drop_duplicates())
 
-        print(
-            f"Evaluating with {len(df_results_rank_compare[tabarena.task_col].unique())} datasets... ({n_tasks} tasks)| problem_types={self.problem_types}, folds={self.folds}")
-        with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.width", 1000):
-            print(leaderboard)
+        if verbose:
+            print(
+                f"Evaluating with {len(df_results_rank_compare[tabarena.task_col].unique())} datasets... ({n_tasks} tasks)| problem_types={self.problem_types}, folds={self.folds}")
+            with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.width", 1000):
+                print(leaderboard)
 
         # horizontal elo barplot
         self.plot_tuning_impact(
@@ -1053,12 +1055,10 @@ def plot_tuning_impact(
 
         if imputed_names is None:
             imputed_names = []
-        # imputed_names = imputed_names or ['TabPFNv2', 'TabICL']
 
         df = df.copy(deep=True)
 
         framework_col = "framework_type"
-        # framework_col = "framework_name"
 
         groupby_columns_extra = ["dataset"]
 
@@ -1073,7 +1073,6 @@ def plot_tuning_impact(
         elif use_score:
             lower_is_better = False
             df["normalized-score"] = 1 - df[metric]
-            # df_plot_w_mean_per_dataset["normalized-score"] = 1 - df_plot_w_mean_per_dataset["normalized-error"]
             metric = "normalized-score"
         else:
             metric = metric
@@ -1102,12 +1101,6 @@ def plot_tuning_impact(
             df = df[df["tune_method"].isin(plot_tune_types) | df[self.method_col].isin(baselines)]
 
         df_plot = df[df["framework_type"].isin(framework_types)]
-        # df_plot = df_plot[~df_plot["framework_type"].isin(imputed_names)]
-
-        # pd.set_option('display.max_columns', None)  # todo
-        # print(f'{df_plot.head()=}')
-
-        # df_plot_w_mean_2 = df_plot.groupby(["framework_type", "tune_method"])[metric].mean().reset_index()
 
         df_plot_w_mean_per_dataset = df_plot.groupby(["framework_type", "tune_method", *groupby_columns_extra])[
             metric].mean().reset_index()
@@ -1136,27 +1129,8 @@ def plot_tuning_impact(
         framework_type_order = list(df_plot_mean_dedupe["framework_type"].to_list())
         framework_type_order.reverse()
 
-        # change to names
-        # df_plot_w_mean_per_dataset["framework_type"] = df_plot_w_mean_per_dataset["framework_type"].map(f_map_type_name)
-
-        # sns.set_color_codes("pastel")
-        # with sns.plotting_context("notebook", font_scale=0.8, rc={
-        #     "pgf.texsystem": "pdflatex",
-        #     'font.family': 'serif',
-        #     'font.size': 10.95,
-        #     'text.usetex': True,
-        #     'pgf.rcfonts': False,
-        #     # 'legend.framealpha': 0.5,
-        #     'text.latex.preamble': r'\usepackage{times} \usepackage{amsmath} \usepackage{amsfonts} \usepackage{amssymb} \usepackage{xcolor}'
-        # }):
-
         with sns.axes_style("whitegrid"):
-            # with plt.rc_context({'font.family': 'serif', "text.usetex": True, 'font.size': 12, 'axes.labelsize': 12, 'xtick.labelsize': 12}):
-            with plt.rc_context(self.rc_context_params
-                                # | figsizes.neurips2024(height_to_width_ratio=0.8)
-                                ):
-            # with plt.rc_context(fontsizes.neurips2024() | fonts.neurips2024()):
-                # with plt.rc_context(figsizes.neurips2024(height_to_width_ratio=0.8)):
+            with plt.rc_context(self.rc_context_params):
                 colors = sns.color_palette("pastel").as_hex()
                 errcolors = sns.color_palette("deep").as_hex()
 
@@ -1178,7 +1152,6 @@ def plot_tuning_impact(
                     # figsize = None
 
                 fig, ax = plt.subplots(1, 1, figsize=figsize)
-                # fig, ax = plt.subplots(1, 1)
 
                 if use_y:
                     baseline_func = ax.axvline
@@ -1198,7 +1171,6 @@ def plot_tuning_impact(
                 to_plot = [
                     dict(
                         x=pos, y=y,
-                        # hue="tune_method",  # palette=["m", "g", "r],
                         label="Tuned + Ensembled",
                         data=df_plot_w_mean_per_dataset[df_plot_w_mean_per_dataset["tune_method"] == "tuned_ensembled"],
                         ax=ax,
@@ -1208,7 +1180,6 @@ def plot_tuning_impact(
                     ),
                     # dict(
                     #     x=x, y=y,
-                    #     # hue="tune_method",  # palette=["m", "g", "r],
                     #     label="Default (Holdout)",
                     #     data=df_plot_w_mean_per_dataset[df_plot_w_mean_per_dataset["tune_method"] == "holdout"], ax=ax,
                     #     order=framework_type_order,
@@ -1218,7 +1189,6 @@ def plot_tuning_impact(
                     # ),
                     # dict(
                     #     x=x, y=y,
-                    #     # hue="tune_method",  # palette=["m", "g", "r],
                     #     label="Tuned (Holdout)",
                     #     data=df_plot_w_mean_per_dataset[df_plot_w_mean_per_dataset["tune_method"] == "holdout_tuned"], ax=ax,
                     #     order=framework_type_order,
@@ -1228,7 +1198,6 @@ def plot_tuning_impact(
                     # ),
                     dict(
                         x=pos, y=y,
-                        # hue="tune_method",  # palette=["m", "g", "r],
                         label="Tuned",
                         data=df_plot_w_mean_per_dataset[df_plot_w_mean_per_dataset["tune_method"] == "tuned"], ax=ax,
                         order=framework_type_order,
@@ -1238,7 +1207,6 @@ def plot_tuning_impact(
                     ),
                     dict(
                         x=pos, y=y,
-                        # hue="tune_method",  # palette=["m", "g", "r],
                         label="Default",
                         data=df_plot_w_mean_per_dataset[df_plot_w_mean_per_dataset["tune_method"] == "default"], ax=ax,
                         order=framework_type_order, color=colors[0],
@@ -1248,7 +1216,6 @@ def plot_tuning_impact(
                     ),
                     dict(
                         x=pos, y=y,
-                        # hue="tune_method",  # palette=["m", "g", "r],
                         label="Tuned + Ensembled (Holdout)",
                         data=df_plot_w_mean_per_dataset[
                             df_plot_w_mean_per_dataset["tune_method"] == "holdout_tuned_ensembled"], ax=ax,
@@ -1260,34 +1227,13 @@ def plot_tuning_impact(
                     ),
                     # dict(
                     #     x=x, y=y,
-                    #     # hue="tune_method",  # palette=["m", "g", "r],
                     #     label="Best",
                     #     data=df_plot_w_mean_per_dataset[df_plot_w_mean_per_dataset["tune_method"] == "best"], ax=ax,
                     #     order=framework_type_order, color=colors[3],
                     #     width=0.55, linewidth=linewidth,
                     #     err_kws={"color": errcolors[3]},
                     #     alpha=1.0,
                     # ),
-                    # dict(
-                    #     x=x, y=y,
-                    #     # hue="tune_method",  # palette=["m", "g", "r],
-                    #     label="Tuned (4h)",
-                    #     data=df_plot_w_mean_per_dataset[df_plot_w_mean_per_dataset["tune_method"] == "tuned_4h"], ax=ax,
-                    #     order=framework_type_order,
-                    #     color=colors[4],
-                    #     width=0.5, linewidth=linewidth,
-                    #     err_kws={"color": errcolors[4]},
-                    # ),
-                    # dict(
-                    #     x=x, y=y,
-                    #     # hue="tune_method",  # palette=["m", "g", "r],
-                    #     label="Tuned + Ensembled (4h)",
-                    #     data=df_plot_w_mean_per_dataset[df_plot_w_mean_per_dataset["tune_method"] == "tuned_ensembled_4h"], ax=ax,
-                    #     order=framework_type_order, color=colors[5],
-                    #     width=0.4,
-                    #     err_kws={"color": errcolors[5]},
-                    # ),
-
                 ]
 
                 if use_score:
@@ -1301,8 +1247,6 @@ def plot_tuning_impact(
                             plot_line["width"] = 0.6 * 1.3
                         else:
                             plot_line["width"] = width * 1.3
-                        # plot_line["color"] = color
-                        # plot_line["err_kws"] = err_kws
 
                 for plot_line in to_plot:
                     boxplot = sns.barplot(**plot_line)
@@ -1313,19 +1257,9 @@ def plot_tuning_impact(
                     boxplot.set(xlabel=None, ylabel='Elo' if metric=='elo' else 'Normalized score')  # remove method in the x-axis
                 # boxplot.set_title("Effect of tuning and ensembling")
 
-                # # FIXME: (Nick) HACK, otherwise it isn't in the plot, don't know why
-                # if use_elo:
-                #     if baseline_means and "Portfolio-N200 (ensemble) (4h)" in baselines:
-                #         max_baseline_mean = max([v for k, v in baseline_means.items()])
-                #         if ylim is not None:
-                #             ylim[1] = max_baseline_mean + 50
-                #         if xlim is not None:
-                #             xlim[1] = max_baseline_mean + 50
-
                 # do this before setting x/y limits
                 for baseline_idx, (baseline, color) in enumerate(zip(baselines, baseline_colors)):
                     baseline_mean = baseline_means[baseline]
-                    # baseline_func(baseline_mean, label=baseline, color=color, linewidth=2.0, ls="--")
                     baseline_func(baseline_mean, color=color, linewidth=2.0, ls="--", zorder=-10)
 
                     if baseline == 'Portfolio-N200 (ensemble) (4h)':
@@ -1411,26 +1345,11 @@ def plot_tuning_impact(
                 else:
                     plt.xlim(-0.5, len(boxplot.get_xticklabels()) - 0.5)
 
-
-                # ax.legend(loc="upper center", ncol=5)
-                # these are not the final legend parameters, see below
                 ax.legend(loc="upper center", bbox_to_anchor=[0.5, 1.02])
 
                 # reordering the labels
                 handles, labels = ax.get_legend_handles_labels()
 
-                # this doesn't work, it also removes the hatch from the actual bars in the plot
-                # for handle in handles:
-                #     patches = []
-                #     if isinstance(handle, Patch):
-                #         patches = [handle]
-                #     elif isinstance(handle, BarContainer):
-                #         patches = handle.patches
-                #     for patch in patches:
-                #         # remove hatch from existing handles
-                #         # It can be present if one of the imputed methods is the best method, e.g., for multiclass
-                #         patch.set(hatch=None)
-
                 if has_imputed:
                     # Create a custom legend patch for "imputed"
                     imputed_patch = Patch(facecolor='gray', edgecolor='white', hatch='xx', label='Partially imputed')
@@ -1446,28 +1365,18 @@ def plot_tuning_impact(
                     labels = [labels[i] for i in valid_idxs]
                     handles = [handles[i] for i in valid_idxs]
 
-                # specify order
-                # len_baselines = len(baselines)
-                # len_baselines = 0  # we don't put them in the legend anymore
-                # num_other = len(labels) - len_baselines
-                # order = [n + len_baselines for n in range(num_other)] + [n for n in range(len_baselines)]
-                # order = [3, 4, 5, 0, 1, 2]
                 order = list(range(len(labels)))
                 order = list(reversed(order))
-                # if len(order) == 3:
-                #     order = [2, 1, 0]
 
                 # pass handle & labels lists along with order as below
-                ax.legend([handles[i] for i in order], [labels[i] for i in order], loc="lower center",
-                          ncol=(len(labels)+1)//2 if has_imputed and use_y else len(labels),
-                          bbox_to_anchor=[0.35 if use_y else 0.5, 1.05])
-
-                # if use_y:
-                #     boxplot.margins(y=0.05)
-                # else:
-                #     boxplot.margins(x=0.05)
+                ax.legend(
+                    [handles[i] for i in order],
+                    [labels[i] for i in order],
+                    loc="lower center",
+                    ncol=(len(labels)+1)//2 if has_imputed and use_y else len(labels),
+                    bbox_to_anchor=[0.35 if use_y else 0.5, 1.05],
+                )
 
-                # ax.legend(bbox_to_anchor=[0.1, 0.5], loc='center left', ncol=5)
                 plt.tight_layout()
 
                 if save_prefix: