Adapts one of the tutorials as instructions to create a new solver

miguelgondu · miguelgondu · commit d14554a0a505 · 2024-01-23T13:56:17.000+01:00
diff --git a/docs/protein-optimization/_autosummary/poli.tests.registry.proteins.test_foldx.rst b/docs/protein-optimization/_autosummary/poli.tests.registry.proteins.test_foldx.rst
@@ -21,6 +21,7 @@
       test_registering_foldx_sasa
       test_registering_foldx_stability
       test_registering_foldx_stability_and_sasa
+      test_registering_foldx_stability_and_sasa_with_verbose_output
    
    
 
diff --git a/docs/protein-optimization/_toc.yml b/docs/protein-optimization/_toc.yml
@@ -35,7 +35,6 @@ parts:
     chapters:
       - file: using_poli_baselines/random_mutations.md
       - file: using_poli_baselines/cma_es.md
-      - file: using_poli_baselines/nsga_2.md
       - file: using_poli_baselines/bayesian_optimization.md
       - file: using_poli_baselines/line_bayesian_optimization.md
   - caption: "Observers"
diff --git a/docs/protein-optimization/contributing/a_new_problem.md b/docs/protein-optimization/contributing/a_new_problem.md
@@ -61,17 +61,24 @@ class YourBlackBox(AbstractBlackBox):
     def __init__(
         self,
         info: ProblemSetupInformation,
+        your_arg: str,
+        your_second_arg: List[float],
+        your_kwarg: str=...,
         batch_size: int = None,
         parallelize: bool = False,
-        num_workers: int = None
+        num_workers: int = None,
+        evaluation_budget: int = float("inf")
     ):
         super().__init__(
             info=info,
             batch_size=batch_size,
             parallelize=parallelize,
             num_workers=num_workers,
+            evaluation_budget=evaluation_budget,
         )
 
+        #... your manipulation of args and kwargs.
+
     # The only method you have to define
     def _black_box(self, x: np.ndarray, context: dict = None) -> np.ndarray:
         return ...
@@ -95,14 +102,16 @@ class YourProblemFactory(AbstractProblemFactory):
     def create(
         self,
         seed: int = None,
+        your_arg: str = ...,
+        your_second_arg: List[float] = ...,
+        your_kwarg: str = ...,
         batch_size: int = None,
         parallelize: bool = False,
         num_workers: int = None,
-        your_keyword_1: str = ...,
-        your_keyword_2: int = ...,
-        your_keyword_3: List[float] = ...,
+        evaluation_budget: int = float("inf"),
+        your_second_arg: List[float] = ...,
     ) -> Tuple[AbstractBlackBox, np.ndarray, np.ndarray]:
-        # Manipulate keywords you might need at creation time...
+        # Manipulate args and kwargs you might need at creation time...
         ...
         
         # Getting the problem information
@@ -111,12 +120,16 @@ class YourProblemFactory(AbstractProblemFactory):
         # Creating your black box function
         f = YourBlackBox(
             info=problem_info,
+            your_arg=your_arg,
+            your_second_arg=your_second_arg,
+            your_kwarg=your_kwarg,
             batch_size=batch_size,
             parallelize=parallelize,
             num_workers=num_workers,
+            evaluation_budget=evaluation_budget,
         )
         
-        # Your first input (an np.array[str])
+        # Your first input (an np.array[str] of shape [b, L] or [b,])
         x0 = ...
 
         return f, x0, f(x0)
@@ -146,7 +159,7 @@ It is important that name of your problem should be the name of the folder it's
 
 :::{warning}
 
-`poli` is experimental. The input kwargs to the abstract black box
+`poli` is under active development. The input kwargs to the abstract black box
 and to the create method are under active development. Your IDE should
 tell you automatically, though!
 
@@ -165,7 +178,7 @@ dependencies:
   - pip
   - pip:
     - numpy
-    - "git+https://github.com/MachineLearningLifeScience/poli.git@master"
+    - "git+https://github.com/MachineLearningLifeScience/poli.git@dev"
     - YOUR OTHER DEPENDENCIES
 ```
 
@@ -191,11 +204,11 @@ dependencies:
   - pip:
     - numpy
     - click
-    - "git+https://github.com/MachineLearningLifeScience/poli.git@master"
+    - "git+https://github.com/MachineLearningLifeScience/poli.git@dev"
 
 ```
 
-It installs an `openjdk` that will be added to the path when the environment is active. Moreover, you can also hack your way around installing conda and creating conda environments inside Colab.
+It installs an `openjdk` that will be added to the path when the environment is active. Moreover, [`conda` is also installable in Google Colab, allowing you to use `poli` there](https://colab.research.google.com/drive/1-IISCebWYfu0QhuCJ11wOag8aKOiPtls?usp=sharing).
 
 :::
 
@@ -208,18 +221,11 @@ If you
 
 then you should be set!
 
-You can test that your problem is registerable by creating a fresh environment that includes poli, and running
+You can test that your problem is registerable by running
 
 ```bash
 $ python -c "from poli.core.registry import get_problems; print(get_problems())"
-[...]  # A list, without your problem in it.
-```
-
-Your problem is not registered yet, so don't fret. You can check _if_ you can register it by running
-
-```bash
-$ python -c "from poli.core.registry import get_problems; print(get_problems(include_repository=True))"
-[..., "your_problem", ...]   # If all goes well, you should see "your_problem" here.
+[..., "your_problem", ...]  # A list with your problem in it
 ```
 
 If you can find your problem in this list, then you're set! You should be able to run
@@ -230,18 +236,20 @@ from poli import objective_factory
 problem_info, f, x0, y0, _ = objective_factory.create(
     name="your_problem",
     ...,
-    your_keyword_1=...,      # <-- Keywords you (maybe) needed
-    your_keyword_2=...       # <-- at your_factory.create(...)
-                        # For now, only string kwargs are
-                        # supported. 
+    your_arg=...,         # <-- Keywords you (maybe) needed
+    your_second_arg=...,  # <-- at your_factory.create(...)
+    your_kwarg=...,       # <--
+                            # For now, only certain types are
+                            # supported: str, int, bool, float,
+                            # None, and lists thereof.
 )
 ```
 
-`poli` will ask you to confirm that you want to register your problem (you can force the registration by passing `force_register=True` to `objective_factory.create`).
-
 ## (Optional) Making your problem be available if dependencies are met
 
-At this point, you can run your objective function in an isolated process (which will literally import the factory and the black box function from the `register.py` you wrote). A better alternative is to get direct access to the object itself. Having access to the actual class makes your life easy, especially when it comes to using debugging tools like the ones in VSCode.
+At this point, you can run your objective function in an isolated process (which will literally import the factory and the black box function from the `register.py` you wrote).
+
+A better alternative is to get direct access to the object itself. Having access to the actual class makes your life easy, especially when it comes to using debugging tools like the ones in VSCode.
 
 If you want to make your problem available if it can be imported, take a look at `src/poli/objective_repository/__init__.py`. Add a block like this one at the end of it:
 
@@ -263,4 +271,4 @@ except ImportError:  # Maybe you'll need to check for other errors.
 
 ## Submitting a pull request
 
-If you want to share your problem with us, feel free to create a pull request in our repository: https://github.com/MachineLearningLifeScience/poli
+If you want to share your problem with us, feel free to create a pull request in our repository following the instructions in our `CONTRIBUTING.md`: https://github.com/MachineLearningLifeScience/poli
diff --git a/docs/protein-optimization/contributing/a_new_solver.md b/docs/protein-optimization/contributing/a_new_solver.md
@@ -1,3 +1,135 @@
-# Adding a new black box optimization algorithm
+# Adding a new optimizer to `poli-baselines`
 
-[TODO: write] For now, check [the chapter on creating solvers](../using_poli/the_basics/defining_a_problem_solver.md).
+The main use-case for `poli_baselines` is **defining optimizers for objective functions**.
+
+The main design objective of `poli` is for it to be almost trivial to **query** complicated black box objective functions; likewise, the design objective of `poli_baselines` is to allow developers of black-box optimization algorithms to test them on said objective functions.  
+
+This chapter explains how to define a "solver", or a black-box optimization algorithm.
+
+:::{note}
+
+By default, all our optimizers **maximize**.
+
+:::
+
+## An abstract problem solver
+
+All problem solvers in `poli_baselines` inherit from an `AbstractSolver`, which is implemented as follows:
+
+```python
+# poli_baselines/core/abstract_solver.py
+class AbstractSolver:
+    def __init__(
+        self,
+        black_box: AbstractBlackBox,
+        x0: np.ndarray,
+        y0: np.ndarray,
+    ):
+        self.black_box = black_box
+        self.x0 = x0
+        self.y0 = y0
+
+        self.history = {
+            "x": [x0_i.reshape(1, -1) for x0_i in x0],
+            "y": [y0_i.reshape(1, -1) for y0_i in y0],
+        }
+
+        self.iteration = 0
+```
+
+i.e. the minimal ingredients required to instantiate a solver are a black-box function defined through `poli`, the initial design `x0`, and its evaluation `y0`.
+
+**The only abstract method required** is a `next_candidate() -> np.ndarray`, which uses the `self.history` to propose a new candidate. Using this method, the abstract solver implements a `.solve(max_iter: int)` as follows:
+
+```python
+# poli_baselines/core/abstract_solver.py
+class AbstractSolver:
+    ...
+
+    def next_candidate(self) -> np.ndarray:
+        """
+        Returns the next candidate solution
+        after checking the history.
+        """
+        raise NotImplementedError(
+            "This method is abstract, and should be implemented by a subclass."
+        )
+
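+    def solve(self, max_iter: int = 100, break_at_performance: float = None, verbose: bool = False, pre_step_callbacks=None, post_step_callbacks=None):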
+        """
+        Runs the solver for the given number of iterations.
+        """
+        for i in range(max_iter):
+            # Call the pre-step callbacks
+            if pre_step_callbacks is not None:
+                for callback in pre_step_callbacks:
+                    callback(self)
+
+            # Take a step, which in turn updates the local history.
+            _, y = self.step()
+
+            # Call the post-step callbacks
+            if post_step_callbacks is not None:
+                for callback in post_step_callbacks:
+                    callback(self)
+
+            if verbose:
+                print(f"Iteration {i}: {y}, best so far: {self.get_best_performance()}")
+
+            if break_at_performance is not None:
+                if y >= break_at_performance:
+                    break
+```
+
+## An example: `RandomMutation`
+
+Leveraging the fact that we are usually working with discrete sequences, we can implement the simplest version of an optimizer: one that takes the best performing sequence, and randomly mutates one of its positions.
+
+The following is an implementation of exactly this:
+
+```python
+# poli_baselines/solvers/simple/random_mutation.py (almost)
+class RandomMutation(AbstractSolver):
+    def __init__(
+        self,
+        black_box: AbstractBlackBox,
+        x0: np.ndarray,
+        y0: np.ndarray,
+    ):
+        super().__init__(black_box, x0, y0)
+        self.alphabet = black_box.info.alphabet
+        self.alphabet_size = len(self.alphabet)
+
+    def next_candidate(self) -> np.ndarray:
+        """
+        Returns the next candidate solution
+        after checking the history.
+
+        In this case, the RandomMutation solver
+        simply returns a random mutation of the
+        best performing solution so far.
+        """
+        # Get the best performing solution so far
+        best_x = self.history["x"][np.argmax(self.history["y"])]
+
+        # Perform a random mutation
+        # (Assuming that x is always [1, L] in shape)
+        next_x = best_x.copy()
+        pos = np.random.randint(0, len(next_x.flatten()))
+        mutant = np.random.choice(self.alphabet)
+        next_x[0][pos] = mutant
+
+        return next_x
+```
+
+Pretty lean! Notice how **the `next_candidate` method could perform all sorts of complicated logic** like latent space Bayesian Optimization, evolutionary algorithms... Moreover, the conda environment where you do the optimization has nothing to do with the environment where the objective function was defined: `poli` is set up in such a way that you can query the objective functions without having to worry!
+
+:::{note}
+Our implementation of `RandomMutation` is slightly different, since we allow users to query e.g. integer indices instead of strings.
+
+[Take a look at the exact implementation on `poli_baselines/solvers/simple/random_mutation.py`](https://github.com/MachineLearningLifeScience/poli-baselines/blob/main/src/poli_baselines/solvers/simple/random_mutation.py).
+:::
+
+## Submitting a pull request
+
+If you want to share your solver with us, feel free to create a pull request in our repository following the instructions in our `CONTRIBUTING.md`: https://github.com/MachineLearningLifeScience/poli-baselines
diff --git a/docs/protein-optimization/index.md b/docs/protein-optimization/index.md
@@ -5,10 +5,20 @@ This page contains documentation on how to use `poli`, a library of discrete obj
 A core feature of `poli` is isolating calls to complicated objective functions which might, for example, depend on simulators, binaries, and highly specific package requirements.
 Our promise is: if you can run your objective function reliably in a `conda` environment, then you can register it and call it from other projects and environments without having to worry about re-installing all the dependencies.
 
-## Get started!
+## Getting started
 
 A good place to start is the next chapter! [Go to Getting Started](./getting_started/getting_started.md).
 
+To install `poli` and `poli-baselines`, we recommend creating a fresh conda environment:
+
+```bash
+conda create -n poli-base python=3.9
+conda activate poli-base
+pip install git+https://github.com/MachineLearningLifeScience/poli.git@dev
+pip install git+https://github.com/MachineLearningLifeScience/poli-baselines.git@main
+```
+
+`poli` also [runs on colab](https://colab.research.google.com/drive/1-IISCebWYfu0QhuCJ11wOag8aKOiPtls?usp=sharing).
 
 ## Black-box objective functions
 
@@ -121,12 +131,6 @@ On top of `poli`, we provide `poli-baselines`, a collection of **black-box optim
 Optimizing a discrete sequence by performing random mutations
 :::
 
-:::{grid-item-card} Discrete NSGA-2
-:link: ./using_poli_baselines/nsga_2.html
-:columns: 6
-A Genetic algorithm for multi-objective optimization of discrete sequences
-:::
-
 :::{grid-item-card} CMA-ES
 :link: ./using_poli_baselines/cma_es.html
 :columns: 6