Brings tuning changes on LaMBO2 (#55)

miguelgondu · samuelstanton · web-flow · commit dcbe4ed35c86 · 2024-10-28T16:51:02.000+01:00
* add lambo ehrlich example, tune params

* lambo tweaks

* add geometric recency

* Adds beignet as a dependency

* Removes the hardcoded GPU acceleration

* Runs the lambo test for less time

* Makes it even smaller for testing

* Updates the example's readme

---------

Co-authored-by: Samuel Stanton <ss13641@nyu.edu>
diff --git a/.gitignore b/.gitignore
@@ -11,8 +11,10 @@ __pycache__/
 
 .tox/
 build/
+oracle/
 
 examples/05_optimizing_qed_using_latent_space_bo/VAESelfies_TINY-CID-SELFIES-20_latent_dim_2.pt
+examples/06_running_lambo2_on_rasp/directed_evolution_rasp*.npz
 examples/07_optimizing_logp_on_chembl/VAE_CHEMBL.pt
 examples/08_pymoo_nsga_ii_on_foldx/history.json
 examples/09_replicating_nsga_ii_of_lambo_by_hand/pdbs
diff --git a/examples/06_running_lambo2_on_rasp/readme.md b/examples/06_running_lambo2_on_rasp/readme.md
@@ -1,4 +1,6 @@
-This folder includes an example in which we optimize the thermal stability of red fluorescent proteins (RFPs) using `LaMBO2`.
+This folder includes an example in which we use `LaMBO2` to optimize the thermal stability of red fluorescent proteins (RFPs), as measured by an additive version of RaSP.
+
+As a prerequisite, [we encourage you to set up `poli` for RaSP](https://machinelearninglifescience.github.io/poli-docs/using_poli/objective_repository/RaSP.html).
 
 It includes the following assets:
 - Several PDB files for these RFPs, based on the Pareto front found by [LaMBO](https://arxiv.org/abs/2203.12742).
@@ -10,7 +12,6 @@ We recommend running it inside the environment of `LaMBO2`, which you can find i
 
 ```bash
 # From the root of the poli-baselines directory
-conda env create --file src/poli_baselines/solvers/bayesian_optimization/lambo2/environment.lambo2.yml
-conda activate poli__lambo2
+pip install -e .[lambo2]
 python examples/06_running_lambo2_on_rasp/run.py
 ```
diff --git a/examples/06_running_lambo2_on_rasp/run.py b/examples/06_running_lambo2_on_rasp/run.py
@@ -3,6 +3,7 @@
 
 import matplotlib.pyplot as plt
 import numpy as np
+import torch
 from poli.objective_repository import RaspProblemFactory
 
 from poli_baselines.solvers.bayesian_optimization.lambo2 import LaMBO2
@@ -33,20 +34,25 @@ def run_with_default_hyperparameters():
     observer.x_s.append(x0.reshape(-1, 1))
     observer.y_s.append(y0)
 
+    torch.set_float32_matmul_precision("medium")
     lambo2 = LaMBO2(
         black_box=black_box,
         x0=x0,
         y0=y0,
+        overrides=["max_epochs=2"],
+        max_epochs_for_retraining=8,
     )
-
-    lambo2.solve(max_iter=10)
+    lambo2.solve(max_iter=32)
 
     fig, (ax1, ax2) = plt.subplots(1, 2)
     plot_best_y(observer, ax1)
     plot_best_y(observer, ax2, start_from=x0.shape[0])
     ax1.axvline(x0.shape[0], color="red")
     plt.show()
 
+    print("Best starting obj value: ", np.max(y0))
+    print("Best final obj value: ", np.max(lambo2.history_for_training["y"]))
+
     black_box.terminate()
 
 
@@ -107,5 +113,5 @@ def run_with_modified_hyperparameters():
 
 
 if __name__ == "__main__":
-    # run_with_default_hyperparameters()
-    run_with_modified_hyperparameters()
+    run_with_default_hyperparameters()
+    # run_with_modified_hyperparameters()
diff --git a/examples/07_running_lambo2_on_ehrlich/run.py b/examples/07_running_lambo2_on_ehrlich/run.py
@@ -0,0 +1,72 @@
+import sys
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+from poli.objective_repository import EhrlichProblemFactory
+
+from poli_baselines.solvers.bayesian_optimization.lambo2 import LaMBO2
+from poli_baselines.solvers.simple.genetic_algorithm import FixedLengthGeneticAlgorithm
+
+THIS_DIR = Path(__file__).resolve().parent
+sys.path.append(str(THIS_DIR))
+
+from simple_observer import SimpleObserver, plot_best_y  # noqa: E402
+
+
+def run_with_default_hyperparameters():
+    problem = EhrlichProblemFactory().create(
+        sequence_length=32,
+        motif_length=4,
+        n_motifs=2,
+        quantization=4,
+        return_value_on_unfeasible=-1.0,
+    )
+    black_box = problem.black_box
+    x0 = problem.x0
+    random_seqs = np.array(
+        [list(black_box._sample_random_sequence()) for _ in range(127)]
+    )
+    x0 = np.concatenate([problem.x0, random_seqs], axis=0)
+    y0 = black_box(x0)
+
+    observer = SimpleObserver()
+    black_box.set_observer(observer)
+
+    # arr = np.load(THIS_DIR / "rasp_seed_data.npz")
+    # x0 = arr["x0"]
+    # y0 = arr["y0"]
+
+    observer.x_s.append(x0.reshape(-1, 1))
+    observer.y_s.append(y0)
+
+    presolver = FixedLengthGeneticAlgorithm(
+        black_box=black_box, x0=x0, y0=y0, population_size=128, prob_of_mutation=0.4
+    )
+    presolver.solve(max_iter=1)
+    presolver_x = np.array(presolver.history["x"])
+    presolver_x = presolver_x.reshape(presolver_x.shape[0], -1)
+
+    # import pdb; pdb.set_trace()
+    torch.set_float32_matmul_precision("medium")
+    lambo2 = LaMBO2(
+        black_box=black_box,
+        x0=presolver_x,  # inconsistent API
+        overrides=["max_epochs=2"],
+        max_epochs_for_retraining=8,
+    )
+
+    lambo2.solve(max_iter=32)
+
+    fig, (ax1, ax2) = plt.subplots(1, 2)
+    plot_best_y(observer, ax1)
+    plot_best_y(observer, ax2, start_from=x0.shape[0])
+    ax1.axvline(x0.shape[0], color="red")
+    plt.show()
+
+    black_box.terminate()
+
+
+if __name__ == "__main__":
+    run_with_default_hyperparameters()
diff --git a/examples/07_running_lambo2_on_ehrlich/simple_observer.py b/examples/07_running_lambo2_on_ehrlich/simple_observer.py
@@ -0,0 +1,27 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+from poli.core.black_box_information import BlackBoxInformation
+from poli.core.util.abstract_observer import AbstractObserver
+
+
+class SimpleObserver(AbstractObserver):
+    def __init__(self) -> None:
+        self.x_s = []
+        self.y_s = []
+        super().__init__()
+
+    def initialize_observer(
+        self, problem_setup_info: BlackBoxInformation, caller_info: object, seed: int
+    ) -> object: ...
+
+    def observe(self, x: np.ndarray, y: np.ndarray, context=None) -> None:
+        self.x_s.append(x)
+        self.y_s.append(y)
+
+
+def plot_best_y(obs: SimpleObserver, ax: plt.Axes, start_from: int = 0):
+    best_y = np.maximum.accumulate(np.vstack(obs.y_s).flatten())
+    ax.plot(best_y.flatten()[start_from:])
+    ax.set_xlabel("Number of evaluations")
+    ax.set_ylabel("Best value found")
diff --git a/pyproject.toml b/pyproject.toml
@@ -77,7 +77,7 @@ bounce = [
     "pytest>=7.3.1,<7.4.0",
     "bounce @ git+https://github.com/miguelgondu/bounce.git@main"
 ]
-lambo2 = ["pytorch-cortex"]
+lambo2 = ["pytorch-cortex", "beignet"]
 
 [project.urls]
 Homepage = "https://github.com/MachineLearningLifeScience/poli-baselines"
diff --git a/src/poli_baselines/solvers/bayesian_optimization/lambo2/environment.lambo2.small.yml b/src/poli_baselines/solvers/bayesian_optimization/lambo2/environment.lambo2.small.yml
@@ -7,6 +7,7 @@ dependencies:
   - pip:
       - --extra-index-url https://download.pytorch.org/whl/cpu
       - numpy<2
+      - beignet
       - "git+https://github.com/prescient-design/cortex.git"
       - "git+https://github.com/MachineLearningLifeScience/poli.git"
       - "git+https://github.com/MachineLearningLifeScience/poli-baselines.git@main"
diff --git a/src/poli_baselines/solvers/bayesian_optimization/lambo2/environment.lambo2.yml b/src/poli_baselines/solvers/bayesian_optimization/lambo2/environment.lambo2.yml
@@ -6,6 +6,7 @@ dependencies:
   - pip
   - pip:
       - numpy<2
+      - beignet
       - "git+https://github.com/prescient-design/cortex.git"
       - "git+https://github.com/MachineLearningLifeScience/poli.git"
       - "git+https://github.com/MachineLearningLifeScience/poli-baselines.git@main"
diff --git a/src/poli_baselines/solvers/bayesian_optimization/lambo2/hydra_configs/branches/protein_constraint.yaml b/src/poli_baselines/solvers/bayesian_optimization/lambo2/hydra_configs/branches/protein_constraint.yaml
@@ -0,0 +1,6 @@
+protein_constraint:
+  _target_: cortex.model.branch.Conv1dBranch
+  out_dim: 8
+  channel_dim: ${feature_dim}
+  num_blocks: 1
+  kernel_size: ${kernel_size}
diff --git a/src/poli_baselines/solvers/bayesian_optimization/lambo2/hydra_configs/branches/protein_property.yaml b/src/poli_baselines/solvers/bayesian_optimization/lambo2/hydra_configs/branches/protein_property.yaml
@@ -2,5 +2,5 @@ protein_property:
   _target_: cortex.model.branch.Conv1dBranch
   out_dim: 8
   channel_dim: ${feature_dim}
-  num_blocks: 0
+  num_blocks: 1
   kernel_size: ${kernel_size}
diff --git a/src/poli_baselines/solvers/bayesian_optimization/lambo2/hydra_configs/generic_training.yaml b/src/poli_baselines/solvers/bayesian_optimization/lambo2/hydra_configs/generic_training.yaml
@@ -2,25 +2,31 @@ defaults:
   - tree: sequence_model_conservative
   - roots: [protein_seq]
   - trunk: sum_trunk
+  # - branches: [protein_property, protein_generation, protein_constraint]
   - branches: [protein_property, protein_generation]
   - tasks:
+    # - generic_constraint
     - generic_task
     - protein_seq
   - guidance_objective: generic_task
   - optim: lambo_conservative
 
-feature_dim: 32
+feature_dim: 128
 kernel_size: 3
-batch_size: 32
+batch_size: 128
 max_epochs: 1
 data_dir: ./.cache
 wandb_mode: offline
 random_seed: 42
 num_steps: 1
-num_samples: 16
+num_samples: ${batch_size}
 allow_length_change: false
+accelerator: cpu
 
 trainer:
   _target_: lightning.Trainer
   max_epochs: ${max_epochs}
-  num_sanity_val_steps: 1
+  num_sanity_val_steps: 0
+  accelerator: ${accelerator}
+  devices: 1
+  precision: 16
diff --git a/src/poli_baselines/solvers/bayesian_optimization/lambo2/hydra_configs/guidance_objective/generic_task.yaml b/src/poli_baselines/solvers/bayesian_optimization/lambo2/hydra_configs/guidance_objective/generic_task.yaml
@@ -4,6 +4,8 @@ static_kwargs:
   objectives:
     - generic_task
   constraints: null
+    # generic_task:
+      # - generic_constraint
   scaling: null
 runtime_kwargs:
   _target_: cortex.acquisition.get_graph_nei_runtime_kwargs
diff --git a/src/poli_baselines/solvers/bayesian_optimization/lambo2/hydra_configs/optim/lambo_conservative.yaml b/src/poli_baselines/solvers/bayesian_optimization/lambo2/hydra_configs/optim/lambo_conservative.yaml
@@ -12,7 +12,7 @@ num_mutations_per_step: 2
 # As few as you can get away with.
 # It depends on the number of mutations you want
 # to make per step.
-max_guidance_updates: 1
+max_guidance_updates: 8
 
 # Good to leave it still.
 # (It's the gradient descent param.)
diff --git a/src/poli_baselines/solvers/bayesian_optimization/lambo2/hydra_configs/tasks/generic_constraint.yaml b/src/poli_baselines/solvers/bayesian_optimization/lambo2/hydra_configs/tasks/generic_constraint.yaml
@@ -0,0 +1,21 @@
+protein_constraint:
+  generic_constraint:
+    _target_: cortex.task.ClassificationTask
+    input_map:
+      protein_seq: ['tokenized_seq']
+    class_col: is_feasible
+    num_classes: 2
+    corrupt_train_inputs: true
+    root_key: protein_seq
+    ensemble_size: 8
+    data_module:
+      _target_: cortex.data.data_module.TaskDataModule
+      _recursive_: false
+      lengths: [1.0, 0.0]
+      balance_train_partition:
+        - ${tasks.protein_constraint.generic_constraint.class_col}
+        - recency
+      batch_size: ${batch_size}
+      dataset_config:
+        _target_: cortex.data.dataset.NumpyDataset
+        train: ???
diff --git a/src/poli_baselines/solvers/bayesian_optimization/lambo2/hydra_configs/tasks/generic_task.yaml b/src/poli_baselines/solvers/bayesian_optimization/lambo2/hydra_configs/tasks/generic_task.yaml
@@ -4,11 +4,16 @@ protein_property:
     input_map:
       protein_seq: ['tokenized_seq']
     outcome_cols: ['generic_task']
+    corrupt_train_inputs: true
     root_key: protein_seq
-    ensemble_size: 1
+    nominal_label_var: 0.01
+    ensemble_size: 8
     data_module:
       _target_: cortex.data.data_module.TaskDataModule
       _recursive_: false
+      lengths: [1.0, 0.0]
+      balance_train_partition:
+        - recency
       batch_size: ${batch_size}
       dataset_config:
         _target_: cortex.data.dataset.NumpyDataset
diff --git a/src/poli_baselines/solvers/bayesian_optimization/lambo2/hydra_configs/tasks/protein_seq.yaml b/src/poli_baselines/solvers/bayesian_optimization/lambo2/hydra_configs/tasks/protein_seq.yaml
@@ -12,7 +12,7 @@ protein_generation:
       _recursive_: false
       batch_size: ${batch_size}
       balance_train_partition: null
-      drop_last: true
+      drop_last: false
       lengths: [1.0, 0.0]
       train_on_everything: false
       num_workers: 1
diff --git a/src/poli_baselines/solvers/bayesian_optimization/lambo2/hydra_configs/tree/sequence_model_conservative.yaml b/src/poli_baselines/solvers/bayesian_optimization/lambo2/hydra_configs/tree/sequence_model_conservative.yaml
@@ -8,9 +8,10 @@ fit_cfg:
     _target_: torch.optim.Adam
     lr: 5e-3
     weight_decay: 0.
-    betas: [0.99, 0.999]
+    betas: [0.9, 0.99]
     fused: false
   lr_scheduler:
     _target_: transformers.get_cosine_schedule_with_warmup
-    num_warmup_steps: 1
+    num_warmup_steps: 2
     num_training_steps: ${max_epochs}
+    # _target_: transformers.get_constant_schedule
diff --git a/src/poli_baselines/solvers/bayesian_optimization/lambo2/solver.py b/src/poli_baselines/solvers/bayesian_optimization/lambo2/solver.py
diff --git a/src/poli_baselines/tests/solvers/bayesian_optimization/test_lambo2.py b/src/poli_baselines/tests/solvers/bayesian_optimization/test_lambo2.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -77,7 +77,7 @@ bounce = [` |
| 77 | 77 | `"pytest>=7.3.1,<7.4.0",` |
| 78 | 78 | `"bounce @ git+https://github.com/miguelgondu/bounce.git@main"` |
| 79 | 79 | `]` |
| 80 | | `-lambo2 = ["pytorch-cortex"]` |
| | 80 | `+lambo2 = ["pytorch-cortex", "beignet"]` |
| 81 | 81 | |
| 82 | 82 | `[project.urls]` |
| 83 | 83 | `Homepage = "https://github.com/MachineLearningLifeScience/poli-baselines"` |