From f6696638b6b209f94275432a44930992f01421ac Mon Sep 17 00:00:00 2001
From: Daphne12345
Date: Mon, 19 May 2025 12:36:53 +0200
Subject: [PATCH 01/12] Fallback in case the database connection is not good

---
 carps/loggers/database_logger.py | 34 ++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/carps/loggers/database_logger.py b/carps/loggers/database_logger.py
index b6cddd748..54b031123 100644
--- a/carps/loggers/database_logger.py
+++ b/carps/loggers/database_logger.py
@@ -8,6 +8,7 @@
 from carps.loggers.abstract_logger import AbstractLogger
 from carps.utils.loggingutils import CustomEncoder, get_logger, setup_logging
+import time

 if TYPE_CHECKING:
     from py_experimenter.result_processor import ResultProcessor
@@ -98,12 +99,20 @@ def log_trial(
         table_name: str, default "trials"
             The name of the table to log the trial to.
         """
-        info = convert_trial_info(trial_info, trial_value)
-        info["n_trials"] = n_trials
-        info["n_function_calls"] = n_function_calls if n_function_calls else n_trials
-
-        if self.result_processor:
-            self.result_processor.process_logs({table_name: info})
+        for i in range(5):
+            try:
+                info = convert_trial_info(trial_info, trial_value)
+                info["n_trials"] = n_trials
+                info["n_function_calls"] = n_function_calls if n_function_calls else n_trials
+
+                if self.result_processor:
+                    logger.info(f"Logging trial to {table_name}: {info}")
+                    self.result_processor.process_logs({table_name: info})
+                break
+            except Exception as e:
+                if i == 4:
+                    raise e
+                time.sleep(10)

     def log_incumbent(self, n_trials: int | float, incumbent: Incumbent, n_function_calls: int | None = None) -> None:
         """Log the incumbent.
@@ -142,5 +151,14 @@ def log_arbitrary(self, data: dict, entity: str) -> None:
         entity : str
             The entity to log the data to. This is the table name in the database.
         """
-        if self.result_processor:
-            self.result_processor.process_logs({entity: data})
+
+        for i in range(5):
+            try:
+                if self.result_processor:
+                    self.result_processor.process_logs({entity: data})
+                break
+            except Exception as e:
+                if i == 4:
+                    raise e
+                time.sleep(10)
+

From 91a60745e06ffe49a50aeb3aab2eb366f3ced12b Mon Sep 17 00:00:00 2001
From: Daphne12345
Date: Mon, 19 May 2025 15:03:11 +0200
Subject: [PATCH 02/12] calculate hypervolume

---
 carps/analysis/calc_hypervolume.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/carps/analysis/calc_hypervolume.py b/carps/analysis/calc_hypervolume.py
index 22ad04126..f5784b6c1 100644
--- a/carps/analysis/calc_hypervolume.py
+++ b/carps/analysis/calc_hypervolume.py
@@ -11,7 +11,7 @@
 import pandas as pd
 from pymoo.indicators.hv import HV

-from carps.analysis.gather_data import convert_mixed_types_to_str
+# from carps.analysis.gather_data import convert_mixed_types_to_str

 run_id = ["task_type", "benchmark_id", "task_id", "optimizer_id", "seed"]

@@ -57,9 +57,17 @@ def add_reference_point(x: pd.DataFrame) -> pd.DataFrame:

     Returns:
         pd.DataFrame: Dataframe with the reference point.
""" - costs = x["trial_value__cost_inc"].apply(lambda x: np.array([np.array(c) for c in x])).to_list() - costs = np.concatenate(costs) + # Flatten and stack all cost vectors + costs = np.vstack([np.array(c) for c in x["trial_value__cost_raw"]]) + + # Sanity check for consistent dimensionality + if len(set(cost.shape[0] for cost in costs)) != 1: + raise ValueError("Inconsistent number of objectives in cost vectors.") + + # Reference point is max across all objectives reference_point = np.max(costs, axis=0) + + # Set reference point per row x["reference_point"] = [reference_point] * len(x) return x @@ -73,7 +81,7 @@ def calc_hv(x: pd.DataFrame) -> pd.DataFrame: Returns: pd.DataFrame: Dataframe with the hypervolume. """ - F = np.concatenate(np.array([np.array(p) for p in x["trial_value__cost_inc"].to_numpy()])) + F = np.vstack([np.array(p) for p in x["trial_value__cost_raw"]]) ind = HV(ref_point=x["reference_point"].iloc[0], pf=None, nds=False) x["hypervolume"] = ind(F) From 8bc78991646ad5aabc18e707a288e17073810fe4 Mon Sep 17 00:00:00 2001 From: Daphne12345 Date: Mon, 19 May 2025 15:06:50 +0200 Subject: [PATCH 03/12] update gather data hypervolume --- carps/analysis/gather_data.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/carps/analysis/gather_data.py b/carps/analysis/gather_data.py index 193a2ba71..838dc1679 100644 --- a/carps/analysis/gather_data.py +++ b/carps/analysis/gather_data.py @@ -22,6 +22,7 @@ from carps.utils.loggingutils import get_logger, setup_logging from carps.utils.task import Task from carps.utils.trials import TrialInfo +from carps.analysis.calc_hypervolume import calc_hv, add_reference_point, run_id if TYPE_CHECKING: from carps.objective_functions.objective_function import ObjectiveFunction @@ -389,11 +390,11 @@ def maybe_postadd_task(logs: pd.DataFrame, overwrite: bool = False) -> pd.DataFr task_cfg = load_task_cfg(task_id=gid, task_index=task_index) task_cfg_yaml = OmegaConf.to_yaml(task_cfg) - if "${seed}" in task_cfg_yaml: - # Add seed to config to make it resolvable - assert gdf["seed"].nunique() == 1 # noqa: PD101 - seed = gdf["seed"].iloc[0] - task_cfg.seed = int(seed) + # if "${seed}" in task_cfg_yaml: + # # Add seed to config to make it resolvable + # assert gdf["seed"].nunique() == 1 # noqa: PD101 + # seed = gdf["seed"].iloc[0] + # task_cfg.seed = int(seed) task_cfg = OmegaConf.to_container(task_cfg, resolve=False) task_columns = [c for c in gdf.columns if c.startswith("task.")] if overwrite: @@ -440,10 +441,13 @@ def maybe_convert_cost_dtype(x: int | float | str | list) -> float | list[float] Returns: float | list[float]: Cost(s). """ + if isinstance(x, int | float): return float(x) if isinstance(x, str): - return eval(x) # noqa: S307 + x = eval(x) # noqa: S307 + if isinstance(x, dict): + x = eval(x["cost"]) assert isinstance(x, list) return x @@ -463,7 +467,7 @@ def maybe_convert_cost_to_so(x: float | list | np.ndarray) -> float: float: Single-objective cost or aggregated cost. 
""" if isinstance(x, list | np.ndarray): - return np.sum(x) + return np.sum(x) # TODO change to HV here if isinstance(x, dict): assert len(x.values()) == 1 # Most likely comes from database @@ -472,7 +476,7 @@ def maybe_convert_cost_to_so(x: float | list | np.ndarray) -> float: if isinstance(value, str): value = ast.literal_eval(value) if isinstance(value, list): - return np.sum(value) + return np.sum(value) # TODO Change to HV here if isinstance(value, float | int): return value if isinstance(x, float): @@ -566,7 +570,11 @@ def process_logs(logs: pd.DataFrame, keep_task_columns: list[str] | None = None) logger.debug("Handle MO costs...") logs["trial_value__cost_raw"] = logs["trial_value__cost"].apply(maybe_convert_cost_dtype) - logs["trial_value__cost"] = logs["trial_value__cost_raw"].apply(maybe_convert_cost_to_so) + # trial_value__cost_raw for add_reference_point and to calc_hv + logs = logs.groupby(by=["task_type", "task_id"]).apply(add_reference_point).reset_index(drop=True) + logs = logs.groupby(by=[*run_id, "n_trials"]).apply(calc_hv).reset_index(drop=True) + logs["trial_value__cost"] = logs["hypervolume"] #logs["trial_value__cost_raw"].apply(maybe_convert_cost_to_so) + print(logs.head()) logger.debug("Determine incumbent cost...") logs["trial_value__cost_inc"] = logs.groupby(by=grouper_keys)["trial_value__cost"].transform("cummin") @@ -613,6 +621,7 @@ def normalize_logs(logs: pd.DataFrame) -> pd.DataFrame: Returns: pd.DataFrame: Normalized logs """ + grouper_keys = ["task_id", "optimizer_id", "seed"] logger.info("Start normalization...") logger.info("Normalize n_trials...") logs["n_trials_norm"] = logs.groupby("task_id")["n_trials"].transform(normalize) @@ -623,7 +632,7 @@ def normalize_logs(logs: pd.DataFrame) -> pd.DataFrame: hv = logs.loc[ids_mo, "hypervolume"] logs.loc[ids_mo, "trial_value__cost"] = -hv # higher is better logs["trial_value__cost"] = logs["trial_value__cost"].astype("float64") - logs["trial_value__cost_inc"] = logs["trial_value__cost"].transform("cummin") + logs["trial_value__cost_inc"] = logs.groupby(by=grouper_keys)["trial_value__cost"].transform("cummin") logs["trial_value__cost_norm"] = logs.groupby("task_id")["trial_value__cost"].transform(normalize) logger.info("Calc normalized incumbent cost...") From eb0d40ce3d6139488a11bee3eedd880bf6ddd3bb Mon Sep 17 00:00:00 2001 From: Daphne12345 Date: Thu, 22 May 2025 16:12:26 +0200 Subject: [PATCH 04/12] smaple matrix in autorank --- carps/analysis/run_autorank.py | 1 + 1 file changed, 1 insertion(+) diff --git a/carps/analysis/run_autorank.py b/carps/analysis/run_autorank.py index 45c3f401a..0471c348b 100644 --- a/carps/analysis/run_autorank.py +++ b/carps/analysis/run_autorank.py @@ -383,6 +383,7 @@ def cd_evaluation( alpha=alpha, alpha_normality=alpha_normality, num_samples=len(rank_data), + sample_matrix=None, posterior_matrix=None, decision_matrix=None, rope=None, From f77b6b88f8cb65364351e2302bd462820ccb9891 Mon Sep 17 00:00:00 2001 From: Daphne12345 Date: Thu, 22 May 2025 16:15:28 +0200 Subject: [PATCH 05/12] make codecarbon optional --- carps/experimenter/database/download_results.py | 8 ++++++-- carps/experimenter/scrape_results_to_db.py | 1 + carps/run_from_db.py | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/carps/experimenter/database/download_results.py b/carps/experimenter/database/download_results.py index 60a285182..54c25f2fe 100644 --- a/carps/experimenter/database/download_results.py +++ b/carps/experimenter/database/download_results.py @@ -18,6 +18,7 @@ def main( 
pyexperimenter_configuration_file_path: str | None = None, database_credential_file_path: str | Path | None = None, outdir: str | Path | None = None, + codecarbon: bool = False ) -> None: """Download results from the database and save them to outdir. @@ -49,6 +50,7 @@ def main( database_credential_file_path=database_credential_file_path, log_file="logs/reset_experiments.log", use_ssh_tunnel=OmegaConf.load(experiment_configuration_file_path).PY_EXPERIMENTER.Database.use_ssh_tunnel, + use_codecarbon=codecarbon ) experiment_config_table = experimenter.get_table() @@ -64,12 +66,14 @@ def main( logger.info(f"\tFrom them, found {n_errored} errored runs of type {task_type}. ❌") trajectory_table = experimenter.get_logtable("trajectory") trials_table = experimenter.get_logtable("trials") - codecarbon_table = experimenter.get_codecarbon_table() + if codecarbon: + codecarbon_table = experimenter.get_codecarbon_table() experiment_config_table.to_parquet(outdir / "experiment_config.parquet", index=False) trajectory_table.to_parquet(outdir / "trajectory.parquet", index=False) trials_table.to_parquet(outdir / "trials.parquet", index=False) - codecarbon_table.to_parquet(outdir / "codecarbon.parquet", index=False) + if codecarbon: + codecarbon_table.to_parquet(outdir / "codecarbon.parquet", index=False) logger.info( "Downloaded results from the database. " f"Saved to '{outdir}'. " diff --git a/carps/experimenter/scrape_results_to_db.py b/carps/experimenter/scrape_results_to_db.py index 902963e63..0cce9ef28 100644 --- a/carps/experimenter/scrape_results_to_db.py +++ b/carps/experimenter/scrape_results_to_db.py @@ -35,6 +35,7 @@ database_credential_file_path=database_credential_file_path, log_level=logging.INFO, use_ssh_tunnel=OmegaConf.load(experiment_configuration_file_path).PY_EXPERIMENTER.Database.use_ssh_tunnel, + use_codecarbon=False ) diff --git a/carps/run_from_db.py b/carps/run_from_db.py index ddca3774c..d70daf285 100644 --- a/carps/run_from_db.py +++ b/carps/run_from_db.py @@ -95,6 +95,7 @@ def main( database_credential_file_path=database_credential_file_path, log_file=f"logs/{slurm_job_id}.log", use_ssh_tunnel=OmegaConf.load(experiment_configuration_file_path).PY_EXPERIMENTER.Database.use_ssh_tunnel, + use_codecarbon=False ) experimenter.execute(py_experimenter_evaluate, max_experiments=1) From 222184b777c79c107e48c68065ce4cf9407c6ff5 Mon Sep 17 00:00:00 2001 From: Daphne12345 Date: Thu, 22 May 2025 16:19:54 +0200 Subject: [PATCH 06/12] Possibility to save configs to pickle and upload them together to the DB --- carps/experimenter/create_cluster_configs.py | 44 ++++++---- carps/experimenter/write_to_db.py | 85 ++++++++++++++++++++ 2 files changed, 112 insertions(+), 17 deletions(-) create mode 100644 carps/experimenter/write_to_db.py diff --git a/carps/experimenter/create_cluster_configs.py b/carps/experimenter/create_cluster_configs.py index 07f064ac6..77d6f3b15 100644 --- a/carps/experimenter/create_cluster_configs.py +++ b/carps/experimenter/create_cluster_configs.py @@ -13,6 +13,7 @@ from py_experimenter.experimenter import PyExperimenter from carps.utils.loggingutils import CustomEncoder +import pickle as pckl logger = logging.getLogger("create experiments") @@ -72,7 +73,7 @@ def get_experiment_definition(cfg: OmegaConf) -> dict: cfg_dict = OmegaConf.to_container(cfg=cfg, resolve=True) cfg_str = json.dumps(cfg_dict, cls=CustomEncoder) - cfg_hash = create_config_hash(cfg) + cfg_hash = create_config_hash_from_full_cfg(cfg) return { "config": cfg_str, @@ -100,6 +101,7 @@ def fill_database(cfg: 
DictConfig, experimenter: PyExperimenter) -> None:
         DatabaseConnectionError: If there is an error with the database connection.
     """
     experiment_definition = get_experiment_definition(cfg)
+
     column_names = list(experimenter.db_connector.database_configuration.keyfields.keys())
     exists = False
@@ -124,7 +126,7 @@ def fill_database(cfg: DictConfig, experimenter: PyExperimenter) -> None:
     # experimenter.close_ssh()


+# Module-level switches for the pickle fallback. hydra.main() does not accept
+# custom keyword arguments, so these cannot be passed through the decorator.
+SAVE_AS_PCKL = True
+FOLDER_PATH = Path("configs_pckl")
+
+
 @hydra.main(config_path="../configs", config_name="base.yaml", version_base=None)  # type: ignore[misc]
 def main(cfg: DictConfig) -> None:
     """Store experiment config in database.

     Args:
         cfg : DictConfig
             Global configuration.
     """
-    fill_database(cfg, experimenter)
+    if SAVE_AS_PCKL:
+        experiment_definition = get_experiment_definition(cfg)
+        # Compare against the hashes of already dumped configs (the file stems),
+        # not against the Path objects themselves.
+        existing_hashes = {f.stem for f in FOLDER_PATH.glob("*.pkl")}
+
+        if experiment_definition["config_hash"] not in existing_hashes:
+            FOLDER_PATH.mkdir(parents=True, exist_ok=True)
+            with open(FOLDER_PATH / f"{experiment_definition['config_hash']}.pkl", "wb") as f:
+                pckl.dump(experiment_definition, f)
+    else:
+        experiment_configuration_file_path = Path(__file__).parent / "py_experimenter.yaml"
+
+        database_credential_file_path = Path(__file__).parent / "credentials.yaml"
+        if database_credential_file_path is not None and not database_credential_file_path.exists():
+            database_credential_file_path = None  # type: ignore[assignment]
+
+        experimenter = PyExperimenter(
+            experiment_configuration_file_path=experiment_configuration_file_path,
+            name="carps",
+            database_credential_file_path=database_credential_file_path,
+            log_level=logging.INFO,
+            use_ssh_tunnel=OmegaConf.load(experiment_configuration_file_path).PY_EXPERIMENTER.Database.use_ssh_tunnel,
+            use_codecarbon=False,
+        )
+        fill_database(cfg, experimenter)


 if __name__ == "__main__":
     # TODO make experiment_configuration_file_path and database_credential_file_path a commandline arg
-    experiment_configuration_file_path = Path(__file__).parent / "py_experimenter.yaml"
-
-    database_credential_file_path = Path(__file__).parent / "credentials.yaml"
-    if database_credential_file_path is not None and not database_credential_file_path.exists():
-        database_credential_file_path = None  # type: ignore[assignment]
-
-    experimenter = PyExperimenter(
-        experiment_configuration_file_path=experiment_configuration_file_path,
-        name="carps",
-        database_credential_file_path=database_credential_file_path,
-        log_level=logging.INFO,
-        use_ssh_tunnel=OmegaConf.load(experiment_configuration_file_path).PY_EXPERIMENTER.Database.use_ssh_tunnel,
-    )
-
     main()
diff --git a/carps/experimenter/write_to_db.py b/carps/experimenter/write_to_db.py
new file mode 100644
index 000000000..c4896c8c6
--- /dev/null
+++ b/carps/experimenter/write_to_db.py
@@ -0,0 +1,85 @@
+from __future__ import annotations
+from omegaconf import OmegaConf
+from py_experimenter.experimenter import PyExperimenter
+from pathlib import Path
+import logging
+from multiprocessing import Pool
+from tqdm import tqdm
+import numpy as np
+from concurrent.futures import ThreadPoolExecutor
+import pickle as pckl
+from hydra.core.utils import setup_globals
+
+
+setup_globals()
+
+
+experiment_identifiers = ["optimizer_id", "task_id", "seed", "benchmark_id", "n_trials", "time_budget"]
+
+
+def check_existance_by_keys(experiment_definition: dict, existing_rows: list, identifier_keys: list[str]) -> bool:
+    """Check existence of an experiment in the database by the identifier keys.
+ + Args: + experiment_definition (dict): Experiment definition. + existing_rows (list): List of existing rows in the database. + identifier_keys (list[str]): List of keys to check for existance. + + Returns: + bool: True if the experiment exists, False otherwise. + """ + return any(all(experiment_definition[k] == e[k] for k in identifier_keys) for e in existing_rows) + + + +folder_path = Path("configs_pckl") +pkl_files = list(folder_path.glob("*.pkl")) +print('length of pkl_files', len(pkl_files)) + +def load_pickle(file_path): + with open(file_path, 'rb') as f: + return pckl.load(f) + +with ThreadPoolExecutor() as executor: + exp_defs = list(executor.map(load_pickle, pkl_files)) + + +# CONNECT TO DATABASE and get existing experiments +experiment_configuration_file_path = "carps/experimenter/py_experimenter copy.yaml" +database_credential_file_path = "carps/experimenter/credentials.yaml" + +experimenter = PyExperimenter( + experiment_configuration_file_path=experiment_configuration_file_path, + name="carps", + database_credential_file_path=database_credential_file_path, + log_level=logging.INFO, + use_ssh_tunnel=OmegaConf.load(experiment_configuration_file_path).PY_EXPERIMENTER.Database.use_ssh_tunnel, + use_codecarbon=False +) + + +column_names = list(experimenter.db_connector.database_configuration.keyfields.keys()) +existing_rows = experimenter.db_connector._get_existing_rows(column_names) + +# Check if experiments exists +print("Checking if experiments already exist...") +rows_exist = [ + check_existance_by_keys(exp_def, existing_rows, experiment_identifiers) + for exp_def in tqdm(exp_defs, total=len(exp_defs)) +] + + +print(f"This number of experiments already exists: {np.sum(rows_exist)}") + +experiments_to_add = [exp_def for exp_def, exists in zip(exp_defs, rows_exist, strict=True) if not exists] +print( + f"number of existing rows {len(existing_rows)}, previous length: " + f"{len(exp_defs)}, length now {len(experiments_to_add)}" +) + + +BATCH_SIZE = 5000 +for i in range(0, len(experiments_to_add), BATCH_SIZE): + batch = experiments_to_add[i:i+BATCH_SIZE] + experimenter.fill_table_with_rows(batch) + From f770a9ebea35648bbad7abbe0946d1efc371bb6a Mon Sep 17 00:00:00 2001 From: Daphne12345 Date: Thu, 22 May 2025 16:20:24 +0200 Subject: [PATCH 07/12] larger optimizer_id --- carps/experimenter/py_experimenter.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/carps/experimenter/py_experimenter.yaml b/carps/experimenter/py_experimenter.yaml index 8509494c0..774bba8b2 100644 --- a/carps/experimenter/py_experimenter.yaml +++ b/carps/experimenter/py_experimenter.yaml @@ -2,7 +2,7 @@ PY_EXPERIMENTER: n_jobs: 1 Database: - use_ssh_tunnel: true + use_ssh_tunnel: false provider: mysql database: smacbenchmarking table: @@ -21,7 +21,7 @@ PY_EXPERIMENTER: task_type: type: VARCHAR(50) optimizer_id: - type: VARCHAR(50) + type: VARCHAR(200) optimizer_container_id: type: VARCHAR(50) seed: @@ -54,6 +54,7 @@ PY_EXPERIMENTER: trial_value__additional_info: JSON trajectory: n_trials: INT + n_function_calls: INT trial_info__config: JSON trial_info__instance: INT trial_info__seed: INT From 86b3a421d026d116d859e2c40179faa57dfd0c6c Mon Sep 17 00:00:00 2001 From: Daphne12345 Date: Thu, 22 May 2025 16:22:16 +0200 Subject: [PATCH 08/12] MO process_logs --- carps/experimenter/database/process_logs.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/carps/experimenter/database/process_logs.py b/carps/experimenter/database/process_logs.py index 
651361aed..b66be3fba 100644
--- a/carps/experimenter/database/process_logs.py
+++ b/carps/experimenter/database/process_logs.py
@@ -63,6 +63,7 @@ def add_metadata(
         "config",
         "config_hash",
         "name",
+        "n_trials",
     ]
     metadata_columns = [c for c in experiment_config_table.columns if c not in ignore_columns]
@@ -100,9 +101,9 @@ def process_single_run_from_database(
     if logs_from_one_run["experiment_id"].nunique() != 1:  # noqa: PD101
         raise ValueError("Multiple values for `experiment_id` found in the logs. Something is suspicious.")
     experiment_id = logs_from_one_run["experiment_id"].iloc[0]
-    logs_from_one_run = process_logs(logs_from_one_run)
-    if only_incumbents:
-        logs_from_one_run = filter_non_incumbent_entries(logs=logs_from_one_run)
+    # logs_from_one_run = process_logs(logs_from_one_run)
+    # if only_incumbents:
+    #     logs_from_one_run = filter_non_incumbent_entries(logs=logs_from_one_run)
     return add_metadata(
         logs_from_one_run=logs_from_one_run,
         experiment_id=experiment_id,
@@ -144,7 +145,7 @@ def process_logs_from_database(
     experiment_config_table_filename: str = "experiment_config.parquet",
     output_filename: str = "processed_logs.parquet",
     results_dir: str = "experimenter/results",
-    only_incumbents: bool = True,  # noqa: FBT001, FBT002
+    only_incumbents: bool = False,  # noqa: FBT001, FBT002
 ) -> pd.DataFrame:
     """Process logs from the database with multiprocessing for speed-up.
@@ -192,9 +193,9 @@ def process_logs_from_database(
     # Combine the results into a single DataFrame
     processed_logs = pd.concat(result, ignore_index=True).reset_index(drop=True)
-    processed_logs.to_parquet(output_filename, index=False)
+    processed_logs = process_logs(processed_logs)
+    processed_logs.to_parquet(output_filename, index=False, engine="fastparquet")
     logger.info(f"Processed logs saved to {output_filename} 💌.")
     return processed_logs


 if __name__ == "__main__":

From 400f7b786a668cafce76f2beec16dc355a0cac19 Mon Sep 17 00:00:00 2001
From: Daphne12345
Date: Thu, 22 May 2025 16:22:26 +0200
Subject: [PATCH 09/12] MO process logs

---
 carps/experimenter/database/process_logs.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/carps/experimenter/database/process_logs.py b/carps/experimenter/database/process_logs.py
index b66be3fba..3136ec0d2 100644
--- a/carps/experimenter/database/process_logs.py
+++ b/carps/experimenter/database/process_logs.py
@@ -179,12 +179,6 @@ def process_logs_from_database(
         only_incumbents=only_incumbents,
     )

-    # Set up multiprocessing pool to process the logs
-    # with Pool() as pool:
-    #     # Wrap pool.imap_unordered with tqdm to show the progress bar
-    #     result = list(tqdm(
-    #         pool.imap_unordered(
-    #             process_experiment_partial, experiment_ids), total=len(experiment_ids), desc="Processing experiments"))
     logger.info(f"Start processing {len(experiment_ids)} experiments... This might take a while...")
     result = [
         process_experiment_partial(experiment_id)

From bbfd62a646c37e63507f0fab2b8db45ccad5cdf6 Mon Sep 17 00:00:00 2001
From: Daphne Theodorakopoulos <56087728+daphne12345@users.noreply.github.com>
Date: Thu, 22 May 2025 16:33:33 +0200
Subject: [PATCH 10/12] Update write_to_db.py

---
 carps/experimenter/write_to_db.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/carps/experimenter/write_to_db.py b/carps/experimenter/write_to_db.py
index c4896c8c6..964b841cc 100644
--- a/carps/experimenter/write_to_db.py
+++ b/carps/experimenter/write_to_db.py
@@ -45,7 +45,7 @@ def load_pickle(file_path):


 # CONNECT TO DATABASE and get existing experiments
-experiment_configuration_file_path = "carps/experimenter/py_experimenter copy.yaml"
+experiment_configuration_file_path = "carps/experimenter/py_experimenter.yaml"
 database_credential_file_path = "carps/experimenter/credentials.yaml"

 experimenter = PyExperimenter(

From a390ab7ed3db0450a34028b7b56fa89826c51951 Mon Sep 17 00:00:00 2001
From: Daphne12345
Date: Thu, 5 Jun 2025 16:43:25 +0200
Subject: [PATCH 11/12] calculate hypervolume based on the current Pareto
 front, normalize objectives beforehand

---
 carps/analysis/calc_hypervolume.py | 47 ++++++++++++++++++++++++++++--
 carps/analysis/gather_data.py      |  7 +++--
 2 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/carps/analysis/calc_hypervolume.py b/carps/analysis/calc_hypervolume.py
index f5784b6c1..ad9fc956f 100644
--- a/carps/analysis/calc_hypervolume.py
+++ b/carps/analysis/calc_hypervolume.py
@@ -45,6 +45,38 @@ def gather_trajectory(x: pd.DataFrame) -> pd.DataFrame:
         data.append(D)
     return pd.DataFrame(data)

+def get_pareto_front(costs: np.ndarray) -> np.ndarray:
+    """Return all Pareto-optimal rows from the given array. Assumes minimization."""
+    is_efficient = np.ones(len(costs), dtype=bool)
+    for i, c in enumerate(costs):
+        if is_efficient[i]:
+            # Keep points that are strictly better in at least one objective, or identical to c.
+            is_efficient[is_efficient] = np.any(costs[is_efficient] < c, axis=1) | np.all(costs[is_efficient] == c, axis=1)
+            is_efficient[i] = True
+    return costs[is_efficient]
+
+
+def add_running_pareto_front(group: pd.DataFrame) -> pd.DataFrame:
+    """Add the Pareto front of all costs up to the current trial to the group.
+
+    Args:
+        group (pd.DataFrame): Trials of a single run, with "n_trials" and
+            "trial_value__cost_normalized" columns.
+
+    Returns:
+        pd.DataFrame: The group with an added "pareto_front" column.
+    """
+    group = group.sort_values("n_trials").reset_index(drop=True)
+    costs = np.stack(group["trial_value__cost_normalized"].to_numpy())
+    pareto_fronts = []
+
+    for i in range(len(group)):
+        current_costs = costs[:i+1]
+        front = get_pareto_front(current_costs)
+        pareto_fronts.append(tuple(map(tuple, front)))
+
+    group["pareto_front"] = pareto_fronts
+    return group
+
+
 def add_reference_point(x: pd.DataFrame) -> pd.DataFrame:
     """Add reference point to the dataframe.
@@ -65,12 +97,21 @@ def add_reference_point(x: pd.DataFrame) -> pd.DataFrame:
         raise ValueError("Inconsistent number of objectives in cost vectors.")

     # Reference point is max across all objectives
-    reference_point = np.max(costs, axis=0)
+    reference_point = np.max(costs, axis=0) + 1e-4

     # Set reference point per row
     x["reference_point"] = [reference_point] * len(x)
     return x


+def normalize_objectives(x: pd.DataFrame) -> pd.DataFrame:
+    """Min-max normalize each objective to [0, 1] within the group."""
+    costs = np.vstack(x["trial_value__cost_raw"])
+    min_vals, max_vals = costs.min(0), costs.max(0)
+    denom = np.where(max_vals - min_vals == 0, 1, max_vals - min_vals)
+    normalized = (costs - min_vals) / denom
+    x["trial_value__cost_normalized"] = list(normalized)
+    return x
+
+
 def calc_hv(x: pd.DataFrame) -> pd.DataFrame:
     """Calculate hypervolume per trajectory step.
@@ -81,9 +122,9 @@ def calc_hv(x: pd.DataFrame) -> pd.DataFrame: Returns: pd.DataFrame: Dataframe with the hypervolume. """ - F = np.vstack([np.array(p) for p in x["trial_value__cost_raw"]]) + F = np.vstack([np.array(p) for p in x["pareto_front"]]) - ind = HV(ref_point=x["reference_point"].iloc[0], pf=None, nds=False) + ind = HV(ref_point=[1.000001]*F.shape[1], pf=None, nds=False) x["hypervolume"] = ind(F) return x diff --git a/carps/analysis/gather_data.py b/carps/analysis/gather_data.py index 838dc1679..01f779cff 100644 --- a/carps/analysis/gather_data.py +++ b/carps/analysis/gather_data.py @@ -22,7 +22,7 @@ from carps.utils.loggingutils import get_logger, setup_logging from carps.utils.task import Task from carps.utils.trials import TrialInfo -from carps.analysis.calc_hypervolume import calc_hv, add_reference_point, run_id +from carps.analysis.calc_hypervolume import calc_hv, add_reference_point, run_id, add_running_pareto_front, normalize_objectives if TYPE_CHECKING: from carps.objective_functions.objective_function import ObjectiveFunction @@ -571,7 +571,8 @@ def process_logs(logs: pd.DataFrame, keep_task_columns: list[str] | None = None) logger.debug("Handle MO costs...") logs["trial_value__cost_raw"] = logs["trial_value__cost"].apply(maybe_convert_cost_dtype) # trial_value__cost_raw for add_reference_point and to calc_hv - logs = logs.groupby(by=["task_type", "task_id"]).apply(add_reference_point).reset_index(drop=True) + logs = logs.groupby(by=["task_type", "task_id"]).apply(normalize_objectives).reset_index(drop=True) + logs = logs.groupby(by=[*run_id]).apply(add_running_pareto_front).reset_index(drop=True) logs = logs.groupby(by=[*run_id, "n_trials"]).apply(calc_hv).reset_index(drop=True) logs["trial_value__cost"] = logs["hypervolume"] #logs["trial_value__cost_raw"].apply(maybe_convert_cost_to_so) print(logs.head()) @@ -805,7 +806,7 @@ def rename_legacy(logs: pd.DataFrame) -> pd.DataFrame: # NOTE(eddiebergman): Use `n_processes=None` as default, which uses `os.cpu_count()` in `Pool` def filelogs_to_df( - rundir: str | list[str], log_fn: str = "trial_logs.jsonl", n_processes: int | None = None + rundir: str | list[str] = "results/", log_fn: str = "trial_logs.jsonl", n_processes: int | None = None ) -> tuple[pd.DataFrame, pd.DataFrame]: """Load logs from file and preprocess. 
From fb2b7831e732b643cde030b9e6a9c765be032205 Mon Sep 17 00:00:00 2001 From: Daphne12345 Date: Tue, 1 Jul 2025 10:35:48 +0200 Subject: [PATCH 12/12] mo with pareto front --- carps/analysis/calc_hypervolume.py | 2 +- carps/analysis/gather_data.py | 4 ++-- carps/analysis/run_autorank.py | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/carps/analysis/calc_hypervolume.py b/carps/analysis/calc_hypervolume.py index 4082fea2f..a0912d308 100644 --- a/carps/analysis/calc_hypervolume.py +++ b/carps/analysis/calc_hypervolume.py @@ -215,7 +215,7 @@ def add_hypervolume_to_df(logs: pd.DataFrame, on_key: str = "trial_value__cost") """ tqdm.pandas(desc="Calc hypervolume...") ids_mo = get_ids_mo(logs) - add_reference_point_partial = partial(add_reference_point, on_key=on_key) + add_reference_point_partial = partial(add_reference_point) mo_cols = ["hypervolume", "reference_point"] for mo_col in mo_cols: if mo_col not in logs.columns: diff --git a/carps/analysis/gather_data.py b/carps/analysis/gather_data.py index 382045193..f5510fe67 100644 --- a/carps/analysis/gather_data.py +++ b/carps/analysis/gather_data.py @@ -612,9 +612,9 @@ def normalize_logs(logs: pd.DataFrame) -> pd.DataFrame: logs["trial_value__cost_raw"] = logs["trial_value__cost"].apply(maybe_convert_cost_dtype) else: logs["trial_value__cost_raw"] = logs["trial_value__cost_raw"].apply(maybe_convert_cost_dtype) - logs = add_hypervolume_to_df(logs, on_key="trial_value__cost_raw") + # logs = add_hypervolume_to_df(logs, on_key="trial_value__cost_raw") # IDs have changed, so we need to recalculate - ids_mo = get_ids_mo(logs) + # ids_mo = get_ids_mo(logs) hv = logs.loc[ids_mo, "hypervolume"] logs.loc[ids_mo, "trial_value__cost"] = -hv # higher is better logs["trial_value__cost"] = logs["trial_value__cost"].astype("float64") diff --git a/carps/analysis/run_autorank.py b/carps/analysis/run_autorank.py index 13ace7d9e..5dcc84b57 100644 --- a/carps/analysis/run_autorank.py +++ b/carps/analysis/run_autorank.py @@ -444,8 +444,7 @@ def cd_evaluation( rope_mode=None, effect_size=res.effect_size, force_mode=None, - sample_matrix=None, - plot_order=None, + # plot_order=None, ) is_significant = True if result.pvalue >= result.alpha: