automl · benjamc · May 19, 2025 · May 19, 2025 · May 19, 2025 · May 22, 2025
diff --git a/carps/analysis/calc_hypervolume.py b/carps/analysis/calc_hypervolume.py
@@ -11,7 +11,7 @@
 import pandas as pd
 from pymoo.indicators.hv import HV
 
-from carps.analysis.gather_data import convert_mixed_types_to_str
+# from carps.analysis.gather_data import convert_mixed_types_to_str
 
 run_id = ["task_type", "benchmark_id", "task_id", "optimizer_id", "seed"]
 
@@ -57,9 +57,17 @@ def add_reference_point(x: pd.DataFrame) -> pd.DataFrame:
     Returns:
         pd.DataFrame: Dataframe with the reference point.
     """
-    costs = x["trial_value__cost_inc"].apply(lambda x: np.array([np.array(c) for c in x])).to_list()
-    costs = np.concatenate(costs)
+    # Stack all cost vectors row-wise into a single 2-D array
+    costs = np.vstack([np.array(c) for c in x["trial_value__cost_raw"]])
+
+    # Sanity check for consistent dimensionality
+    if len(set(cost.shape[0] for cost in costs)) != 1:
+        raise ValueError("Inconsistent number of objectives in cost vectors.")
+
+    # Reference point is the per-objective maximum over all cost vectors
     reference_point = np.max(costs, axis=0)
+
+    # Set reference point per row
     x["reference_point"] = [reference_point] * len(x)
     return x
 
@@ -73,7 +81,7 @@ def calc_hv(x: pd.DataFrame) -> pd.DataFrame:
     Returns:
         pd.DataFrame: Dataframe with the hypervolume.
     """
-    F = np.concatenate(np.array([np.array(p) for p in x["trial_value__cost_inc"].to_numpy()]))
+    F = np.vstack([np.array(p) for p in x["trial_value__cost_raw"]])
 
     ind = HV(ref_point=x["reference_point"].iloc[0], pf=None, nds=False)
     x["hypervolume"] = ind(F)

diff --git a/carps/analysis/gather_data.py b/carps/analysis/gather_data.py
@@ -22,6 +22,7 @@
 from carps.utils.loggingutils import get_logger, setup_logging
 from carps.utils.task import Task
 from carps.utils.trials import TrialInfo
+from carps.analysis.calc_hypervolume import calc_hv, add_reference_point, run_id
 
 if TYPE_CHECKING:
     from carps.objective_functions.objective_function import ObjectiveFunction
@@ -389,11 +390,11 @@ def maybe_postadd_task(logs: pd.DataFrame, overwrite: bool = False) -> pd.DataFr
         task_cfg = load_task_cfg(task_id=gid, task_index=task_index)
 
         task_cfg_yaml = OmegaConf.to_yaml(task_cfg)
-        if "${seed}" in task_cfg_yaml:
-            # Add seed to config to make it resolvable
-            assert gdf["seed"].nunique() == 1  # noqa: PD101
-            seed = gdf["seed"].iloc[0]
-            task_cfg.seed = int(seed)
+        # if "${seed}" in task_cfg_yaml:
+        #     # Add seed to config to make it resolvable
+        #     assert gdf["seed"].nunique() == 1  # noqa: PD101
+        #     seed = gdf["seed"].iloc[0]
+        #     task_cfg.seed = int(seed)
         task_cfg = OmegaConf.to_container(task_cfg, resolve=False)
         task_columns = [c for c in gdf.columns if c.startswith("task.")]
         if overwrite:
@@ -440,10 +441,13 @@ def maybe_convert_cost_dtype(x: int | float | str | list) -> float | list[float]
     Returns:
         float | list[float]: Cost(s).
     """
+
     if isinstance(x, int | float):
         return float(x)
     if isinstance(x, str):
-        return eval(x)  # noqa: S307
+        x = eval(x)  # noqa: S307
+    if isinstance(x, dict):
+        x = eval(x["cost"])
     assert isinstance(x, list)
     return x
 
@@ -463,7 +467,7 @@ def maybe_convert_cost_to_so(x: float | list | np.ndarray) -> float:
         float: Single-objective cost or aggregated cost.
     """
     if isinstance(x, list | np.ndarray):
-        return np.sum(x)
+        return np.sum(x)  # TODO change to HV here
     if isinstance(x, dict):
         assert len(x.values()) == 1
         # Most likely comes from database
@@ -472,7 +476,7 @@ def maybe_convert_cost_to_so(x: float | list | np.ndarray) -> float:
         if isinstance(value, str):
             value = ast.literal_eval(value)
             if isinstance(value, list):
-                return np.sum(value)
+                return np.sum(value)  # TODO Change to HV here
         if isinstance(value, float | int):
             return value
     if isinstance(x, float):
@@ -566,7 +570,11 @@ def process_logs(logs: pd.DataFrame, keep_task_columns: list[str] | None = None)
 
     logger.debug("Handle MO costs...")
     logs["trial_value__cost_raw"] = logs["trial_value__cost"].apply(maybe_convert_cost_dtype)
-    logs["trial_value__cost"] = logs["trial_value__cost_raw"].apply(maybe_convert_cost_to_so)
+    # add_reference_point and calc_hv both read trial_value__cost_raw
+    logs = logs.groupby(by=["task_type", "task_id"]).apply(add_reference_point).reset_index(drop=True)
+    logs = logs.groupby(by=[*run_id, "n_trials"]).apply(calc_hv).reset_index(drop=True)
+    logs["trial_value__cost"] = logs["hypervolume"] #logs["trial_value__cost_raw"].apply(maybe_convert_cost_to_so)
+    print(logs.head())
     logger.debug("Determine incumbent cost...")
     logs["trial_value__cost_inc"] = logs.groupby(by=grouper_keys)["trial_value__cost"].transform("cummin")
 
@@ -613,6 +621,7 @@ def normalize_logs(logs: pd.DataFrame) -> pd.DataFrame:
     Returns:
         pd.DataFrame: Normalized logs
     """
+    grouper_keys = ["task_id", "optimizer_id", "seed"]
     logger.info("Start normalization...")
     logger.info("Normalize n_trials...")
     logs["n_trials_norm"] = logs.groupby("task_id")["n_trials"].transform(normalize)
@@ -623,7 +632,7 @@ def normalize_logs(logs: pd.DataFrame) -> pd.DataFrame:
         hv = logs.loc[ids_mo, "hypervolume"]
         logs.loc[ids_mo, "trial_value__cost"] = -hv  # higher is better
         logs["trial_value__cost"] = logs["trial_value__cost"].astype("float64")
-        logs["trial_value__cost_inc"] = logs["trial_value__cost"].transform("cummin")
+        logs["trial_value__cost_inc"] = logs.groupby(by=grouper_keys)["trial_value__cost"].transform("cummin")
     logs["trial_value__cost_norm"] = logs.groupby("task_id")["trial_value__cost"].transform(normalize)
     logger.info("Calc normalized incumbent cost...")
 

diff --git a/carps/analysis/run_autorank.py b/carps/analysis/run_autorank.py
@@ -383,6 +383,7 @@ def cd_evaluation(
         alpha=alpha,
         alpha_normality=alpha_normality,
         num_samples=len(rank_data),
+        sample_matrix=None,
         posterior_matrix=None,
         decision_matrix=None,
         rope=None,

diff --git a/carps/experimenter/create_cluster_configs.py b/carps/experimenter/create_cluster_configs.py
@@ -13,6 +13,7 @@
 from py_experimenter.experimenter import PyExperimenter
 
 from carps.utils.loggingutils import CustomEncoder
+import pickle as pckl
 
 logger = logging.getLogger("create experiments")
 
@@ -72,7 +73,7 @@ def get_experiment_definition(cfg: OmegaConf) -> dict:
     cfg_dict = OmegaConf.to_container(cfg=cfg, resolve=True)
 
     cfg_str = json.dumps(cfg_dict, cls=CustomEncoder)
-    cfg_hash = create_config_hash(cfg)
+    cfg_hash = create_config_hash_from_full_cfg(cfg)
 
     return {
         "config": cfg_str,
@@ -100,6 +101,7 @@ def fill_database(cfg: DictConfig, experimenter: PyExperimenter) -> None:
         DatabaseConnectionError: If there is an error with the database connection.
     """
     experiment_definition = get_experiment_definition(cfg)
+
 
     column_names = list(experimenter.db_connector.database_configuration.keyfields.keys())
     exists = False
@@ -124,7 +126,7 @@ def fill_database(cfg: DictConfig, experimenter: PyExperimenter) -> None:
     # experimenter.close_ssh()
 
 
-@hydra.main(config_path="../configs", config_name="base.yaml", version_base=None)  # type: ignore[misc]
+@hydra.main(config_path="../configs", config_name="base.yaml", version_base=None, save_as_pckl=True, folder_path="configs_pckl")  # type: ignore[misc]
 def main(cfg: DictConfig) -> None:
     """Store experiment config in database.
 
@@ -134,23 +136,31 @@ def main(cfg: DictConfig) -> None:
         Global configuration.
 
     """
-    fill_database(cfg, experimenter)
+    if save_as_pckl:
+        experiment_definition = get_experiment_definition(cfg)
+        files = list(Path(folder_path).glob("*.pkl"))
+
+        if experiment_definition['config_hash'] not in files:
+            with open(f"{folder_path}{experiment_definition['config_hash']}.pkl", "wb") as f:
+                pckl.dump(experiment_definition, f)
+    else: 
+        experiment_configuration_file_path = Path(__file__).parent / "py_experimenter.yaml"
+
+        database_credential_file_path = Path(__file__).parent / "credentials.yaml"
+        if database_credential_file_path is not None and not database_credential_file_path.exists():
+            database_credential_file_path = None  # type: ignore[assignment]
+
+        experimenter = PyExperimenter(
+            experiment_configuration_file_path=experiment_configuration_file_path,
+            name="carps",
+            database_credential_file_path=database_credential_file_path,
+            log_level=logging.INFO,
+            use_ssh_tunnel=OmegaConf.load(experiment_configuration_file_path).PY_EXPERIMENTER.Database.use_ssh_tunnel,
+            use_codecarbon=False
+        )
+        fill_database(cfg, experimenter)
 
 
 if __name__ == "__main__":
     # TODO make experiment_configuration_file_path and database_credential_file_path a commandline arg
-    experiment_configuration_file_path = Path(__file__).parent / "py_experimenter.yaml"
-
-    database_credential_file_path = Path(__file__).parent / "credentials.yaml"
-    if database_credential_file_path is not None and not database_credential_file_path.exists():
-        database_credential_file_path = None  # type: ignore[assignment]
-
-    experimenter = PyExperimenter(
-        experiment_configuration_file_path=experiment_configuration_file_path,
-        name="carps",
-        database_credential_file_path=database_credential_file_path,
-        log_level=logging.INFO,
-        use_ssh_tunnel=OmegaConf.load(experiment_configuration_file_path).PY_EXPERIMENTER.Database.use_ssh_tunnel,
-    )
-
     main()
diff --git a/carps/experimenter/database/download_results.py b/carps/experimenter/database/download_results.py
@@ -18,6 +18,7 @@ def main(
     pyexperimenter_configuration_file_path: str | None = None,
     database_credential_file_path: str | Path | None = None,
     outdir: str | Path | None = None,
+    codecarbon: bool = False
 ) -> None:
     """Download results from the database and save them to outdir.
 
@@ -49,6 +50,7 @@ def main(
         database_credential_file_path=database_credential_file_path,
         log_file="logs/reset_experiments.log",
         use_ssh_tunnel=OmegaConf.load(experiment_configuration_file_path).PY_EXPERIMENTER.Database.use_ssh_tunnel,
+        use_codecarbon=codecarbon
     )
 
     experiment_config_table = experimenter.get_table()
@@ -64,12 +66,14 @@ def main(
             logger.info(f"\tFrom them, found {n_errored} errored runs of type {task_type}. ❌")
     trajectory_table = experimenter.get_logtable("trajectory")
     trials_table = experimenter.get_logtable("trials")
-    codecarbon_table = experimenter.get_codecarbon_table()
+    if codecarbon:
+        codecarbon_table = experimenter.get_codecarbon_table()
 
     experiment_config_table.to_parquet(outdir / "experiment_config.parquet", index=False)
     trajectory_table.to_parquet(outdir / "trajectory.parquet", index=False)
     trials_table.to_parquet(outdir / "trials.parquet", index=False)
-    codecarbon_table.to_parquet(outdir / "codecarbon.parquet", index=False)
+    if codecarbon:
+        codecarbon_table.to_parquet(outdir / "codecarbon.parquet", index=False)
     logger.info(
         "Downloaded results from the database. "
         f"Saved to '{outdir}'. "

diff --git a/carps/experimenter/database/process_logs.py b/carps/experimenter/database/process_logs.py
@@ -63,6 +63,7 @@ def add_metadata(
         "config",
         "config_hash",
         "name",
+        "n_trials"
     ]
     metadata_columns = [c for c in experiment_config_table.columns if c not in ignore_columns]
 
@@ -100,9 +101,9 @@ def process_single_run_from_database(
     if logs_from_one_run["experiment_id"].nunique() != 1:  # noqa: PD101
         raise ValueError("Multiple values for `experiment_id` found in the logs. Something is suspicious.")
     experiment_id = logs_from_one_run["experiment_id"].iloc[0]
-    logs_from_one_run = process_logs(logs_from_one_run)
-    if only_incumbents:
-        logs_from_one_run = filter_non_incumbent_entries(logs=logs_from_one_run)
+    # logs_from_one_run = process_logs(logs_from_one_run)
+    # if only_incumbents:
+    #     logs_from_one_run = filter_non_incumbent_entries(logs=logs_from_one_run)
     return add_metadata(
         logs_from_one_run=logs_from_one_run,
         experiment_id=experiment_id,
@@ -144,7 +145,7 @@ def process_logs_from_database(
     experiment_config_table_filename: str = "experiment_config.parquet",
     output_filename: str = "processed_logs.parquet",
     results_dir: str = "experimenter/results",
-    only_incumbents: bool = True,  # noqa: FBT001, FBT002
+    only_incumbents: bool = False,  # noqa: FBT001, FBT002
 ) -> pd.DataFrame:
     """Process logs from the database with multiprocessing for speed-up.
 
@@ -178,12 +179,6 @@ def process_logs_from_database(
         only_incumbents=only_incumbents,
     )
 
-    # Set up multiprocessing pool to process the logs
-    # with Pool() as pool:
-    #     # Wrap pool.imap_unordered with tqdm to show the progress bar
-    #     result = list(tqdm(
-    #           pool.imap_unordered(
-    #           process_experiment_partial, experiment_ids), total=len(experiment_ids), desc="Processing experiments"))
     logger.info(f"Start processing {len(experiment_ids)} experiments... This might take a while...")
     result = [
         process_experiment_partial(experiment_id)
@@ -192,9 +187,9 @@ def process_logs_from_database(
 
     # Combine the results into a single DataFrame
     processed_logs = pd.concat(result, ignore_index=True).reset_index(drop=True)
-    processed_logs.to_parquet(output_filename, index=False)
+    processed_logs = process_logs(processed_logs)
+    processed_logs.to_parquet(output_filename, index=False, engine="fastparquet")
     logger.info(f"Processed logs saved to {output_filename} 💌.")
-    return processed_logs
 
 
 if __name__ == "__main__":

diff --git a/carps/experimenter/py_experimenter.yaml b/carps/experimenter/py_experimenter.yaml
@@ -2,7 +2,7 @@ PY_EXPERIMENTER:
   n_jobs: 1
 
   Database:
-    use_ssh_tunnel: true
+    use_ssh_tunnel: false
     provider: mysql
     database: smacbenchmarking
     table:
@@ -21,7 +21,7 @@ PY_EXPERIMENTER:
         task_type:
           type: VARCHAR(50)
         optimizer_id:
-          type: VARCHAR(50)
+          type: VARCHAR(200)
         optimizer_container_id:
           type: VARCHAR(50)
         seed:
@@ -54,6 +54,7 @@ PY_EXPERIMENTER:
         trial_value__additional_info: JSON
       trajectory:
         n_trials: INT
+        n_function_calls: INT
         trial_info__config: JSON
         trial_info__instance: INT
         trial_info__seed: INT

diff --git a/carps/experimenter/scrape_results_to_db.py b/carps/experimenter/scrape_results_to_db.py
@@ -35,6 +35,7 @@
     database_credential_file_path=database_credential_file_path,
     log_level=logging.INFO,
     use_ssh_tunnel=OmegaConf.load(experiment_configuration_file_path).PY_EXPERIMENTER.Database.use_ssh_tunnel,
+    use_codecarbon=False
 )