diff --git a/CHANGELOG.md b/CHANGELOG.md index b50e2b5..409b0a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,11 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased ### Added - Notebook on transfer learning -- Notebook on chemical encodings +- ~~Notebook on chemical encodings~~ See **Removed** - CODEOWNERS file - Notebook on reaction optimization - CI tests for execution of notebooks and changes in CHANGELOG - Basic description and infrastructure +- Jupyter notebook version of the reaction optimization notebook ### Changed - Update from BayBE version 0.13.0 to 0.14.2 + +### Removed +- Notebook on chemical encodings was merged with notebook on reaction optimization diff --git a/notebooks/Chemical_Encodings.py b/notebooks/Chemical_Encodings.py deleted file mode 100644 index 85a5048..0000000 --- a/notebooks/Chemical_Encodings.py +++ /dev/null @@ -1,253 +0,0 @@ -import marimo - -__generated_with = "0.19.7" -app = marimo.App(width="medium", app_title="Chemical encodings") - - -@app.cell -def _(): - import marimo as mo - import warnings - - warnings.filterwarnings("ignore") - return (mo,) - - -@app.cell(hide_code=True) -def _(mo): - mo.md(""" - # Chemical encodings - - This file contains some examples related to chemical encodings. - - Note that this example assumes some basic familiarity with using `BayBE`, and that it does not explain all of the core concepts. If you are interested in those, we recommend to first check out the `Reation_Optimization` example. - """) - return - - -@app.cell(hide_code=True) -def _(mo): - mo.md(""" - ## Setup - - We begin this example by defining a suitable recommender. We use a `TwoPhaseMetaRecommender` equipped with a `BotorchRecommender`. 
We also use use a specific kernel that is suited particularly well for chemical problems - the [`EDBOKernel`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.surrogates.gaussian_process.presets.edbo.EDBOKernelFactory.html#baybe.surrogates.gaussian_process.presets.edbo.EDBOKernelFactory). - """) - return - - -@app.cell -def _(): - from baybe.surrogates.gaussian_process.presets.edbo import EDBOKernelFactory - from baybe.recommenders import TwoPhaseMetaRecommender, BotorchRecommender - from baybe.surrogates import GaussianProcessSurrogate - - recommender = TwoPhaseMetaRecommender( - recommender=BotorchRecommender( - surrogate_model=GaussianProcessSurrogate( - kernel_or_factory=EDBOKernelFactory() - ) - ) - ) - return (recommender,) - - -@app.cell(hide_code=True) -def _(mo): - mo.md(""" - This examples uses the same basic example like the `Reaction_Optimization` notebook. However, we now use different encodings of the chemical parameters. The encoding that should be used can be described by the `encoding` field. We investigate three different encodings here and create one campaign per chemical encoding. - """) - return - - -@app.cell -def _(): - import pandas as pd - - from utils import create_dict_from_columns - - df = pd.read_csv("data/shields.csv") - - substances = { - "bases": create_dict_from_columns(df, "Base_Name", "Base_SMILES"), - "ligands": create_dict_from_columns(df, "Ligand_Name", "Ligand_SMILES"), - "solvents": create_dict_from_columns(df, "Solvent_Name", "Solvent_SMILES"), - } - return df, substances - - -@app.cell(hide_code=True) -def _(mo): - mo.md(""" - Now that we have the data collected, we can create different campaigns that we want to compare against each other. To facilitate the usage of `BayBE`'s simulation capabilities, we collect the different campaigns in a `dict`. 
- """) - return - - -@app.cell -def _(df, recommender, substances): - from baybe import Campaign - from baybe.parameters import ( - CategoricalParameter, - NumericalDiscreteParameter, - SubstanceParameter, - ) - from baybe.parameters import CategoricalParameter - from baybe.searchspace import SearchSpace - from baybe.targets import NumericalTarget - - objective = NumericalTarget(name="yield").to_objective() - - substance_encodings = ["MORDRED", "ECFP", "RDKIT2DDESCRIPTORS"] - scenarios = { - encoding: Campaign( - searchspace=SearchSpace.from_product( - parameters=[ - SubstanceParameter( - name="Solvent_Name", - data=substances["solvents"], - encoding=encoding, - ), - SubstanceParameter( - name="Base_Name", data=substances["bases"], encoding=encoding - ), - SubstanceParameter( - name="Ligand_Name", - data=substances["ligands"], - encoding=encoding, - ), - NumericalDiscreteParameter( - values=df["Concentration"].unique(), name="Concentration" - ), - NumericalDiscreteParameter( - values=df["Temp_C"].unique(), name="Temp_C" - ), - ] - ), - objective=objective, - recommender=recommender, - ) - for encoding in substance_encodings - } - return ( - Campaign, - CategoricalParameter, - NumericalDiscreteParameter, - SearchSpace, - objective, - scenarios, - ) - - -@app.cell(hide_code=True) -def _(mo): - mo.md(""" - Of course, we also want to compare the campaigns using the chemical encodings to other campaigns not using the special encoding. 
- """) - return - - -@app.cell -def _( - Campaign, - CategoricalParameter, - NumericalDiscreteParameter, - SearchSpace, - objective, - scenarios, - substances, -): - ohe_parameters = [ - CategoricalParameter( - name="Solvent_Name", values=substances["solvents"], encoding="OHE" - ), - CategoricalParameter( - name="Base_Name", values=substances["bases"], encoding="OHE" - ), - CategoricalParameter( - name="Ligand_Name", values=substances["ligands"], encoding="OHE" - ), - NumericalDiscreteParameter(name="Temp_C", values=[90, 105, 120]), - NumericalDiscreteParameter(name="Concentration", values=[0.057, 0.1, 0.153]), - ] - campaign_ohe = Campaign( - searchspace=SearchSpace.from_product(parameters=ohe_parameters), - objective=objective, - ) - scenarios["OHE"] = campaign_ohe - return - - -@app.cell(hide_code=True) -def _(mo): - mo.md(r""" - ## Using `BayBE`'s [simulation capabilities](https://emdgroup.github.io/baybe/0.14.2/userguide/simulation.html) - - `BayBE` offers multiple functionalities to “simulate” experimental campaigns with a given lookup mechanism. `BayBE`’s simulation package enables a wide range of use cases and can even be used for “oracle predictions”. This is made possible through the flexible use of lookup mechanisms, which act as the loop-closing element of an optimization loop. - - Lookups can be provided in a variety of ways, by using fixed data sets, analytical functions, or any other form of black-box callable. In all cases, their role is the same: to retrieve target values for parameter configurations suggested by the recommendation engine. - - In our case, we can directly use the data that we stored in the `df` dataframe and do the simulation. 
- """) - return - - -@app.cell -def _(df, scenarios): - from baybe.simulation import simulate_scenarios - - BATCH_SIZE = 2 - N_DOE_ITERATIONS = 10 # Change to ~15 for better plots - N_MC_ITERATIONS = 15 # Change to ~25 for better plots - - results = simulate_scenarios( - scenarios, - df, - batch_size=BATCH_SIZE, - n_doe_iterations=N_DOE_ITERATIONS, - n_mc_iterations=N_MC_ITERATIONS, - ) - - results.rename( - columns={ - "Scenario": "Substance encoding", - "Num_Experiments": "Number of experiments", - "yield_CumBest": "Running best yield", - }, - inplace=True, - ) - return (results,) - - -@app.cell(hide_code=True) -def _(mo): - mo.md(""" - We now visualize the results using the `backtest_plot` utility. This utility "averages" the individual Monte Carlo iterations and shows the mean and a confidence interval. It can also be used to give visual guidance on the performance of individual scenarios. - """) - return - - -@app.cell -def _(mo, results): - from utils import backtest_plot - import matplotlib.pyplot as plt - - backtest_plot( - df=results, - x="Number of experiments", - y="Running best yield", - hue="Substance encoding", - indicator_y=90, - indicator_labels=["MORDRED"], - ) - mo.mpl.interactive(plt.gcf()) - return (plt,) - - -@app.cell -def _(plt): - plt.close() - return - - -if __name__ == "__main__": - app.run() diff --git a/notebooks/Reaction_Optimization.ipynb b/notebooks/Reaction_Optimization.ipynb new file mode 100644 index 0000000..3c98002 --- /dev/null +++ b/notebooks/Reaction_Optimization.ipynb @@ -0,0 +1,775 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "MJUe", + "metadata": { + "marimo": { + "config": { + "hide_code": true + } + } + }, + "source": [ + "# Using `BayBE` to optimize Reaction Conditions\n", + "\n", + "This notebook contains an example on how to use BayBE for the optimization of reaction conditions. 
It is inspired by the corresponding notebook developed by Pat Walters as part of his [Practical Cheminformatics Tutorial](https://github.com/PatWalters/practical_cheminformatics_tutorials). This notebook assumes basic familiarity with the core concepts of Bayesian Optimization. The intention of this notebook is *not* to introduce and explain all aspects of Bayesian Optimization, but to focus on the usage of `BayBE`.\n", + "\n", + "In drug discovery, we frequently encounter situations where we need to modify a set of reaction conditions to optimize the yield. This notebook shows how to use `BayBE` to model and optimize such a campaign.\n", + "In particular, it demonstrates the power and usefulness of `BayBE`'s chemical encodings. If parameters in a process to be optimized are chemicals, this feature enables `BayBE` to automatically use meaningful chemical descriptors, automatically leveraging chemical knowledge for the optimization process.\n", + "\n", + "**Caution**\n", + "This notebook was developed for `BayBE` version 0.14.2. Although we do our best in keeping our breaking changes minimal and support outdated versions for a long time, this notebook might not be immediately applicable for other `BayBE` versions. If you install `BayBE` via the instructions in this repository, version 0.14.2 will thus be installed." + ] + }, + { + "cell_type": "markdown", + "id": "4d8fa07a", + "metadata": {}, + "source": [ + "## Installation\n", + "\n", + "To install `BayBE` in AWS SageMaker, make sure that you have the `conda_python3` kernel activated. Then, run the following cell to install all required packages.\n", + "Note that this might take some minutes." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2cd7003e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%mamba install h5py\n",
+    "%pip install baybe[chem,simulation]==0.14.2 seaborn"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "vblA",
+   "metadata": {
+    "marimo": {
+     "config": {
+      "hide_code": true
+     }
+    }
+   },
+   "source": [
+    "## Introduction\n",
+    "\n",
+    "In this notebook, we consider a reaction described in the supporting material of a 2020 paper by [Shields et al.](https://www.nature.com/articles/s41586-021-03213-y), in which the following reaction should be optimized:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bkHC",
+   "metadata": {
+    "marimo": {
+     "config": {
+      "hide_code": true
+     }
+    },
+    "vscode": {
+     "languageId": "bat"
+    }
+   },
+   "source": [
+    "![Optimized Reaction](../images/reaction.svg)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "lEQa",
+   "metadata": {
+    "marimo": {
+     "config": {
+      "hide_code": true
+     }
+    }
+   },
+   "source": [
+    "We can vary 5 different parameters in this experiment:\n",
+    "\n",
+    "1. **Ligand**: We are given a list of 12 different ligands that we can choose from.\n",
+    "2. **Base:** We have 4 different bases available for our experiment.\n",
+    "3. **Solvent**: We can use one of 4 available solvents.\n",
+    "4. **Concentration:** We can choose from one of 3 available concentrations.\n",
+    "5. **Temperature:** We can choose from one of 3 available temperatures.\n",
+    "\n",
+    "Consequently, this means that we have **1728** different potential experiments that we could run. Fortunately, Shields and coworkers have investigated all 1728 combinations and provided a table with the conditions and corresponding yields. Note that only 18 out of the 1728 potential experiments have a yield within the top 10 percent!"
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "PKri", + "metadata": { + "marimo": { + "config": { + "hide_code": true + } + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from utils import create_dict_from_columns\n", + "\n", + "df = pd.read_csv(\"data/shields.csv\")\n", + "\n", + "# Extract SMILES data for all chemical substances\n", + "solvents_dict = create_dict_from_columns(df, \"Solvent_Name\", \"Solvent_SMILES\")\n", + "ligands_dict = create_dict_from_columns(df, \"Ligand_Name\", \"Ligand_SMILES\")\n", + "bases_dict = create_dict_from_columns(df, \"Base_Name\", \"Base_SMILES\")\n", + "\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "Xref", + "metadata": { + "marimo": { + "config": { + "hide_code": true + } + } + }, + "source": [ + "Our goal is to identify one of the top candidates, that is, one of the 18 experiments with a yield larger than 90 using only a few experiments.\n", + "\n", + "We will begin by identifying 10 initial reaction conditions. In practice, we would then run experiments to evaluate these conditions and record the corresponding reaction yields. However, in this case, we will look up the yields in a table. With the conditions and yields in hand, we can build a Bayesian model and use this model to select another 5 reaction conditions. We will then look up the yields for the 5 conditions and use this information to update the model. We will repeat this process through 5 rounds of optimization and examine the reaction yields for each optimization cycle." + ] + }, + { + "cell_type": "markdown", + "id": "SFPL", + "metadata": { + "marimo": { + "config": { + "hide_code": true + } + } + }, + "source": [ + "## Overview\n", + "\n", + "Setting up an experimentation campaign with `BayBE` requires us to set up the main components individually. In this notebook, we will set up the following components one after another.\n", + "\n", + "1. 
[**Parameters**](https://emdgroup.github.io/baybe/0.14.2/userguide/parameters.html): In our setting, a _parameter_ is something that we can control directly. An example of this is which ligand to choose, or at which of the available temperatures to run the experiment. Each of the 5 parameters described earlier will correspond to exactly one of `BayBE`'s `Parameter`s.\n", + "2. [**Search space**](https://emdgroup.github.io/baybe/0.14.2/userguide/searchspace.html): The search space defines the combination of parameters to be searched. It thus contains all possible experiments that we could conduct. The search space is typically defined using the function `SearchSpace.from_product`, which creates a search space as the Cartesian product of the parameters.\n", + "3. [**Target**](https://emdgroup.github.io/baybe/0.14.2/userguide/targets.html): The target is the quantity we are optimizing. In the case of reaction optimization, this is typically the yield. `BayBE` can optimize a single parameter or multiple parameters at once. In this notebook, we'll focus on single parameter optimization, where we are only optimizing the yield, and we hence stick to single target optimization.\n", + "4. [**Recommender**](https://emdgroup.github.io/baybe/0.14.2/userguide/recommenders.html): The recommender selects the next set of experiments to be performed. In this case, we use the default [`TwoPhaseMetaRecommender`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.recommenders.meta.sequential.TwoPhaseMetaRecommender.html). This recommender behaves differently depending on whether it has experimental data. At the beginning of an optimization process, we typically don't have experimental data and want to find a diverse set of conditions to gather some initial data. If the `TwoPhaseMetaRecommender` has no data available, it uses random sampling to select a set of initial experiments. 
If the recommender has data, it uses the [`BotorchRecommender`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.recommenders.pure.bayesian.botorch.BotorchRecommender.html), a Bayesian optimizer that balances exploration and exploitation when selecting sets of reaction conditions.\n", + "5. [**Campaign**](https://emdgroup.github.io/baybe/0.14.2/userguide/campaigns.html): In `BayBE`, the search space, objective, and recommender are combined into a `Campaign` object. The `Campaign` has two important methods: `recommend`, which recommends the next set of experiments, and `add_measurements`, which adds a set of experiments and updates the underlying Bayesian model." + ] + }, + { + "cell_type": "markdown", + "id": "BYtC", + "metadata": { + "marimo": { + "config": { + "hide_code": true + } + } + }, + "source": [ + "## Defining the [`Parameters`](https://emdgroup.github.io/baybe/0.14.2/userguide/parameters.html)\n", + "\n", + "In this section, we introduce two different parameter types: The [`SubstanceParameter`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.parameters.substance.SubstanceParameter.html) and the [`NumericalDiscreteParameter`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.parameters.numerical.NumericalDiscreteParameter.html).\n", + "\n", + "The `SubstanceParameter` is specifically designed for chemical substances and can automatically use meaningful chemical descriptors. It takes a `name` field and a `data` dictionary mapping substance names to their SMILES representations. One can also choose a specific chemical `encoding` such as MORDRED, ECFP, or RDKIT2DDESCRIPTORS.\n", + "\n", + "In this tutorial, we model all three chemical parameters (ligand, solvent, and base) as `SubstanceParameter`s to leverage chemical knowledge in the optimization process. Since we have access to the SMILES data, we extract the mappings from the data and create the corresponding `SubstanceParameter`s." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "RGSE", + "metadata": {}, + "outputs": [], + "source": [ + "from baybe.parameters import SubstanceParameter\n", + "\n", + "ligand = SubstanceParameter(\n", + " name=\"Ligand_Name\",\n", + " data=ligands_dict,\n", + " encoding=\"MORDRED\"\n", + ")\n", + "solvent = SubstanceParameter(\n", + " name=\"Solvent_Name\",\n", + " data=solvents_dict,\n", + " encoding=\"MORDRED\"\n", + ")\n", + "base = SubstanceParameter(\n", + " name=\"Base_Name\",\n", + " data=bases_dict,\n", + " encoding=\"MORDRED\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "Kclp", + "metadata": { + "marimo": { + "config": { + "hide_code": true + } + } + }, + "source": [ + "The `NumericalDiscreteParameter` is another `DiscreteParameter` and is intended to be used for parameters that have numerical values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "emfo", + "metadata": {}, + "outputs": [], + "source": [ + "from baybe.parameters import NumericalDiscreteParameter\n", + "\n", + "concentration = NumericalDiscreteParameter(\n", + " values=df[\"Concentration\"].unique(), name=\"Concentration\"\n", + ")\n", + "temperature = NumericalDiscreteParameter(\n", + " values=df[\"Temp_C\"].unique(), name=\"Temp_C\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "Hstk", + "metadata": { + "marimo": { + "config": { + "hide_code": true + } + } + }, + "source": [ + "## Define the [`SearchSpace`](https://emdgroup.github.io/baybe/0.14.2/userguide/searchspace.html)\n", + "\n", + "The parameters are now combined into a [`SearchSpace`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.searchspace.html) object. Using the [`SearchSpace.from_product`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.searchspace.core.SearchSpace.html#baybe.searchspace.core.SearchSpace.from_product) constructor, we construct the cartesian product of the parameters that we defined previously." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nWHF", + "metadata": {}, + "outputs": [], + "source": [ + "from baybe.searchspace import SearchSpace\n", + "\n", + "parameters = [ligand, solvent, base, concentration, temperature]\n", + "searchspace = SearchSpace.from_product(parameters=parameters)" + ] + }, + { + "cell_type": "markdown", + "id": "iLit", + "metadata": { + "marimo": { + "config": { + "hide_code": true + } + } + }, + "source": [ + "## Define the [`Target`](https://emdgroup.github.io/baybe/0.14.2/userguide/targets.html) & objective\n", + "\n", + "In this example, we want to maximize the yield of the reaction. Since we are only optimizing a single objective, we use the [`SingleTargetObjective`](https://emdgroup.github.io/baybe/0.14.2/userguide/objectives.html#singletargetobjective) which assumes a maximization of the target as default." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ZHCJ", + "metadata": {}, + "outputs": [], + "source": [ + "from baybe.targets import NumericalTarget\n", + "from baybe.objectives import SingleTargetObjective\n", + "\n", + "target = NumericalTarget(name=\"yield\")\n", + "objective = SingleTargetObjective(target=target)" + ] + }, + { + "cell_type": "markdown", + "id": "ROlb", + "metadata": { + "marimo": { + "config": { + "hide_code": true + } + } + }, + "source": [ + "## Define the [`Recommender`](https://emdgroup.github.io/baybe/0.14.2/userguide/recommenders.html)\n", + "\n", + "The [`Recommender`](https://emdgroup.github.io/baybe/0.14.2/userguide/recommenders.html) selects the next set of experiments to try.\n", + "There are many different recommenders offered by `BayBE`, and a lot of ways of combining them. 
For this example, we use a [`TwoPhaseMetaRecommender`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.recommenders.meta.sequential.TwoPhaseMetaRecommender.html) equipped with a [`BotorchRecommender`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.recommenders.pure.bayesian.botorch.BotorchRecommender.html) that uses the [`EDBOKernel`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.surrogates.gaussian_process.presets.edbo.EDBOKernelFactory.html). The EDBO kernel is particularly well-suited for chemical optimization problems as it was specifically designed to handle chemical descriptors effectively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "qnkX", + "metadata": {}, + "outputs": [], + "source": [ + "from baybe.surrogates.gaussian_process.presets.edbo import EDBOKernelFactory\n", + "from baybe.recommenders import TwoPhaseMetaRecommender, BotorchRecommender\n", + "from baybe.surrogates import GaussianProcessSurrogate\n", + "\n", + "recommender = TwoPhaseMetaRecommender(\n", + " recommender=BotorchRecommender(\n", + " surrogate_model=GaussianProcessSurrogate(\n", + " kernel_or_factory=EDBOKernelFactory()\n", + " )\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "TqIu", + "metadata": { + "marimo": { + "config": { + "hide_code": true + } + } + }, + "source": [ + "## Define the [`Campaign`](https://emdgroup.github.io/baybe/0.14.2/userguide/campaigns.html)\n", + "\n", + "Now, we combine all of the individual pieces into one of the core concepts of `BayBE` - the `campaign` object. This object is responsible for organizing and managing an experimental campaign." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "Vxnm", + "metadata": {}, + "outputs": [], + "source": [ + "from baybe.campaign import Campaign\n", + "\n", + "campaign = Campaign(\n", + " searchspace=searchspace, objective=objective, recommender=recommender\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "DnEU", + "metadata": { + "marimo": { + "config": { + "hide_code": true + } + } + }, + "source": [ + "## Starting [the recommendation loop](https://emdgroup.github.io/baybe/0.14.2/userguide/getting_recommendations.html)\n", + "\n", + "Now that the `campaign` is defined, we can ask it for recommendations. So far, we haven't done any experiments. As such, the `campaign` will use random sampling to select a diverse set of initial experiments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ulZA", + "metadata": {}, + "outputs": [], + "source": [ + "initial_rec = campaign.recommend(batch_size=10)\n", + "initial_rec" + ] + }, + { + "cell_type": "markdown", + "id": "ecfG", + "metadata": { + "marimo": { + "config": { + "hide_code": true + } + } + }, + "source": [ + "At this point, we would typically perform a set of experiments using the 10 recommendations provided by the `campaign`. In this tutorial, we simply grab the yield from the data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "Pvdt", + "metadata": { + "marimo": { + "config": { + "hide_code": true + } + } + }, + "outputs": [], + "source": [ + "merge_columns = [\n", + " \"Ligand_Name\",\n", + " \"Solvent_Name\",\n", + " \"Base_Name\",\n", + " \"Concentration\",\n", + " \"Temp_C\",\n", + "]\n", + "initial_rec_results = pd.merge(\n", + " initial_rec, df[merge_columns + [\"yield\"]], on=merge_columns, how=\"left\"\n", + ")\n", + "\n", + "initial_rec_results" + ] + }, + { + "cell_type": "markdown", + "id": "ZBYS", + "metadata": { + "marimo": { + "config": { + "hide_code": true + } + } + }, + "source": [ + "Now that we've performed experiments, we need to add the data from the experiments to the Campaign. We do this with the [`add_measurements`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.campaign.Campaign.html#baybe.campaign.Campaign.add_measurements) method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aLJB", + "metadata": {}, + "outputs": [], + "source": [ + "campaign.add_measurements(initial_rec_results)" + ] + }, + { + "cell_type": "markdown", + "id": "nHfw", + "metadata": { + "marimo": { + "config": { + "hide_code": true + } + } + }, + "source": [ + "Now let's simulate what we would do in practice.\n", + "\n", + "1. Ask the `campaign` for another set of 5 recommendations. Now that we've added measurements, the Campaign uses the underlying Bayesian model to select the next set of reaction conditions.\n", + "2. Next we will look up the yield for that set of conditions and use the yield data to update the Bayesian model.\n", + "\n", + "We'll repeat this process 5 times and examine the distribution of yields at each iteration." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "xXTn",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for _ in range(10):\n",
+    "    rec = campaign.recommend(5)\n",
+    "    rec_results = rec.merge(\n",
+    "        df[merge_columns + [\"yield\"]], on=merge_columns, how=\"left\"\n",
+    "    )\n",
+    "    campaign.add_measurements(rec_results)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "AjVT",
+   "metadata": {
+    "marimo": {
+     "config": {
+      "hide_code": true
+     }
+    }
+   },
+   "source": [
+    "Now, let's have a look at the results which are stored in the `campaign` object and compare them to the optimal value. Note how much `marimo` helps here with the inspection!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "pHFh",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "measurements = campaign.measurements\n",
+    "measurements"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "NCOB",
+   "metadata": {
+    "marimo": {
+     "config": {
+      "hide_code": true
+     }
+    }
+   },
+   "source": [
+    "As we can see, we found a very good candidate, and only needed to evaluate a fraction of the search space!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "aqbW",
+   "metadata": {
+    "marimo": {
+     "config": {
+      "hide_code": true
+     }
+    }
+   },
+   "source": [
+    "## Investigating Different Chemical Encodings\n",
+    "\n",
+    "Now that we've seen how to use `BayBE` for reaction optimization with chemical encodings, let's investigate how different chemical encodings affect the optimization performance. BayBE supports multiple chemical encodings for `SubstanceParameter`s, each capturing different aspects of molecular structure.\n",
+    "\n",
+    "We'll compare three different chemical encodings as well as the One-Hot-Encoding by running simulated optimization campaigns and visualizing their performance."
+ ] + }, + { + "cell_type": "markdown", + "id": "TRpd", + "metadata": { + "marimo": { + "config": { + "hide_code": true + } + } + }, + "source": [ + "We create multiple campaigns, one for each encoding we want to compare. Each campaign will use `SubstanceParameter`s for all three chemical parameters (solvent, ligand, and base) with the specified encoding:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "TXez", + "metadata": {}, + "outputs": [], + "source": [ + "substance_encodings = [\"MORDRED\", \"ECFP\", \"RDKIT2DDESCRIPTORS\"]\n", + "scenarios = {\n", + " encoding: Campaign(\n", + " searchspace=SearchSpace.from_product(\n", + " parameters=[\n", + " SubstanceParameter(\n", + " name=\"Solvent_Name\",\n", + " data=solvents_dict,\n", + " encoding=encoding,\n", + " ),\n", + " SubstanceParameter(\n", + " name=\"Base_Name\",\n", + " data=bases_dict,\n", + " encoding=encoding,\n", + " ),\n", + " SubstanceParameter(\n", + " name=\"Ligand_Name\",\n", + " data=ligands_dict,\n", + " encoding=encoding,\n", + " ),\n", + " NumericalDiscreteParameter(\n", + " values=df[\"Concentration\"].unique(), name=\"Concentration\"\n", + " ),\n", + " NumericalDiscreteParameter(\n", + " values=df[\"Temp_C\"].unique(), name=\"Temp_C\"\n", + " ),\n", + " ]\n", + " ),\n", + " objective=objective,\n", + " recommender=recommender,\n", + " )\n", + " for encoding in substance_encodings\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "dNNg", + "metadata": { + "marimo": { + "config": { + "hide_code": true + } + } + }, + "source": [ + "We also want to compare the campaigns using chemical encodings to a baseline campaign that uses One-Hot Encoding (OHE) with `CategoricalParameter`s for all chemical parameters:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "yCnT", + "metadata": {}, + "outputs": [], + "source": [ + "from baybe.parameters import CategoricalParameter\n", + "\n", + "ohe_parameters = [\n", + " CategoricalParameter(\n", + " 
name=\"Solvent_Name\", values=df[\"Solvent_Name\"].unique(), encoding=\"OHE\"\n", + " ),\n", + " CategoricalParameter(\n", + " name=\"Base_Name\", values=df[\"Base_Name\"].unique(), encoding=\"OHE\"\n", + " ),\n", + " CategoricalParameter(\n", + " name=\"Ligand_Name\", values=df[\"Ligand_Name\"].unique(), encoding=\"OHE\"\n", + " ),\n", + " NumericalDiscreteParameter(\n", + " name=\"Temp_C\", values=df[\"Temp_C\"].unique()\n", + " ),\n", + " NumericalDiscreteParameter(\n", + " name=\"Concentration\", values=df[\"Concentration\"].unique()\n", + " ),\n", + "]\n", + "campaign_ohe = Campaign(\n", + " searchspace=SearchSpace.from_product(parameters=ohe_parameters),\n", + " objective=objective,\n", + " recommender=recommender,\n", + ")\n", + "scenarios[\"OHE\"] = campaign_ohe" + ] + }, + { + "cell_type": "markdown", + "id": "wlCL", + "metadata": { + "marimo": { + "config": { + "hide_code": true + } + } + }, + "source": [ + "### Using BayBE's [simulation capabilities](https://emdgroup.github.io/baybe/0.14.2/userguide/simulation.html)\n", + "\n", + "BayBE offers powerful simulation capabilities that allow us to compare different optimization strategies without running actual experiments. 
The simulation uses a lookup mechanism to retrieve target values from our dataset, effectively simulating multiple optimization campaigns with different random seeds (Monte Carlo iterations).\n", + "\n", + "Let's run the simulation with multiple Monte Carlo iterations to get statistically meaningful results:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "kqZH", + "metadata": {}, + "outputs": [], + "source": [ + "from baybe.simulation import simulate_scenarios\n", + "\n", + "BATCH_SIZE = 2\n", + "N_DOE_ITERATIONS = 5 # Change to ~20 for better plots\n", + "N_MC_ITERATIONS = 10 # Change to ~30 for better plots\n", + "\n", + "results = simulate_scenarios(\n", + " scenarios,\n", + " df,\n", + " batch_size=BATCH_SIZE,\n", + " n_doe_iterations=N_DOE_ITERATIONS,\n", + " n_mc_iterations=N_MC_ITERATIONS,\n", + ")\n", + "\n", + "results.rename(\n", + " columns={\n", + " \"Scenario\": \"Substance encoding\",\n", + " \"Num_Experiments\": \"Number of experiments\",\n", + " \"yield_CumBest\": \"Running best yield\",\n", + " },\n", + " inplace=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "wAgl", + "metadata": { + "marimo": { + "config": { + "hide_code": true + } + } + }, + "source": [ + "### Visualizing the Results\n", + "\n", + "Now let's visualize the results using the `backtest_plot` utility. This plot shows:\n", + "- The mean performance across Monte Carlo iterations (solid line)\n", + "- Confidence intervals (shaded regions)\n", + "- A horizontal guideline at 90% yield (our target threshold)\n", + "- Vertical guidelines showing when the `MORDRED` and `OHE` encoding reach the target.\n", + "\n", + "The plot shows that using the `SubstanceParameter` and hence equipping `BayBE` with chemical knowledge significantly improves the performance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "rEll", + "metadata": {}, + "outputs": [], + "source": [ + "from utils import backtest_plot\n", + "\n", + "backtest_plot(\n", + " df=results,\n", + " x=\"Number of experiments\",\n", + " y=\"Running best yield\",\n", + " hue=\"Substance encoding\",\n", + " indicator_y=90,\n", + " indicator_labels=[\"MORDRED\", \"OHE\"],\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "baybe-resources (3.13.3)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + }, + "marimo": { + "app_config": { + "app_title": "Reaction Optimization", + "width": "full" + }, + "marimo_version": "0.19.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/Reaction_Optimization.py b/notebooks/Reaction_Optimization.py index 94cb548..412ea05 100644 --- a/notebooks/Reaction_Optimization.py +++ b/notebooks/Reaction_Optimization.py @@ -16,13 +16,17 @@ def _(): @app.cell(hide_code=True) def _(mo): mo.md(r""" - # Using BayBE to optimize Reaction Conditions + # Using `BayBE` to optimize Reaction Conditions - This notebook contains an example on how to use BayBE for the optimization of reaction conditions. It is inspired by the corresponding notebook developed by Pat Walters as part of his [Practical Cheminformatics Tutorial](https://github.com/PatWalters/practical_cheminformatics_tutorials). + This notebook contains an example on how to use BayBE for the optimization of reaction conditions. It is inspired by the corresponding notebook developed by Pat Walters as part of his [Practical Cheminformatics Tutorial](https://github.com/PatWalters/practical_cheminformatics_tutorials). 
This notebook assumes basic familiarity with the core concepts of Bayesian Optimization. The intention of this notebook is *not* to introduce and explain all aspects of Bayesian Optimization, but to focus on the usage of `BayBE`. - In drug discovery, we frequently encounter situations where we need to modify a set of reaction conditions to optimize the yield. This notebook shows how to use BayBE to model and optimize such a campaign. + In drug discovery, we frequently encounter situations where we need to modify a set of reaction conditions to optimize the yield. This notebook shows how to use `BayBE` to model and optimize such a campaign. + In particular, it demonstrates the power and usefulness of `BayBE`'s chemical encodings. If parameters in a process to be optimized are chemicals, this feature enables `BayBE` to automatically use meaningful chemical descriptors, thereby leveraging chemical knowledge for the optimization process. - **Note:** We assume basic familiarity with the core concepts of Bayesian Optimization. The intention of this notebook is *not* to introduce and explain all aspects of Bayesian Optimization, but to focus on the usage of BayBE. + + /// caution + This notebook was developed for `BayBE` version 0.14.2. Although we do our best at keeping our breaking changes minimal and support outdated versions for a long time, this notebook might not be immediately applicable for other `BayBE` versions. If you install `BayBE` via the instructions in this repository, version 0.14.2 will thus be installed. + /// """) return @@ -33,7 +37,6 @@ def _(mo): ## Introduction In this notebook, we consider a reaction described in the supporting material of a 2020 paper by [Shields et al.](https://www.nature.com/articles/s41586-021-03213-y), in which the following reaction should be optimized: - ` """) return @@ -57,9 +60,7 @@ def _(mo): 4. **Concentration:** We can choose from one of 3 available concentrations. 5. 
**Temperature:** We can chose from one of 3 available temperatures. - Consequently, this means that we have **1728** different potential experiments that we could run. However, we sould like to identify the optimal conditions with only a small number of experiments. Fortunately, Shields and coworkers have investigated all 1728 combinations and provided a table with the conditions and corresponding yields. - - Note that only 18 out of the 1728 potential experiments have a yield within the top 10 percent! + Consequently, this means that we have **1728** different potential experiments that we could run. Fortunately, Shields and coworkers have investigated all 1728 combinations and provided a table with the conditions and corresponding yields. Note that only 18 out of the 1728 potential experiments have a yield within the top 10 percent! """) return @@ -67,15 +68,24 @@ def _(mo): @app.cell(hide_code=True) def _(): import pandas as pd + from utils import create_dict_from_columns df = pd.read_csv("data/shields.csv") + + # Extract SMILES data for all chemical substances + solvents_dict = create_dict_from_columns(df, "Solvent_Name", "Solvent_SMILES") + ligands_dict = create_dict_from_columns(df, "Ligand_Name", "Ligand_SMILES") + bases_dict = create_dict_from_columns(df, "Base_Name", "Base_SMILES") + df - return df, pd + return bases_dict, df, ligands_dict, pd, solvents_dict @app.cell(hide_code=True) def _(mo): mo.md(r""" + Our goal is to identify one of the top candidates, that is, one of the 18 experiments with a yield larger than 90 using only a few experiments. + We will begin by identifying 10 initial reaction conditions. In practice, we would then run experiments to evaluate these conditions and record the corresponding reaction yields. However, in this case, we will look up the yields in a table. With the conditions and yields in hand, we can build a Bayesian model and use this model to select another 5 reaction conditions. 
We will then look up the yields for the 5 conditions and use this information to update the model. We will repeat this process through 5 rounds of optimization and examine the reaction yields for each optimization cycle. """) return @@ -88,11 +98,11 @@ def _(mo): Setting up an experimentation campaign with `BayBE` requires us to set up the main components individually. In this notebook, we will set up the following components one after another. - 1. [**Parameters**](https://emdgroup.github.io/baybe/0.14.2/userguide/parameters.html): We start by defining the reaction parameters to be optimized. Each of the 5 parameters described earlier will correspond to exactly one of BayBE's `Parameter`s. - 2. [**Search space**](https://emdgroup.github.io/baybe/0.14.2/userguide/searchspace.html): The search space defines the combination of parameters to be searched. The search space is typically defined using the function `Searchspace.from_product`, which creates a search space as the Cartesian product of the parameters. - 3. [**Target**](https://emdgroup.github.io/baybe/0.14.2/userguide/targets.html): The target is the quantity we are optimizing. In the case of reaction optimization, this is typically the yield. `BayBE` can optimize a single parameter or multiple parameters at once. In this notebook, we'll focus on single parameter optimization, where we are only optimizing the yield, and hence stick to single target optimization. - 4. [**Recommender**](https://emdgroup.github.io/baybe/0.14.2/userguide/recommenders.html): The recommender selects the next set of experiments to be performed. In this case, we use the [`TwoPhaseMetaRecommender`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.recommenders.meta.sequential.TwoPhaseMetaRecommender.html). This recommender behaves differently depending on whether it has experimental data. 
At the beginning of an optimization process, we typically don't have experimental data and want to find a diverse set of conditions to gather some initial data. If the `TwoPhaseMetaRecommender` has no data available, it uses Farthest Point Sampling to select a diverse set of initial conditions. If the recommender has data, it uses the [`BotorchRecommender`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.recommenders.pure.bayesian.botorch.BotorchRecommender.html), a Bayesian optimizer that balances exploration and exploitation when selecting sets of reaction conditions. - 5. [**Campaign**](https://emdgroup.github.io/baybe/0.14.2/userguide/campaigns.html): In `BayBE`, the search space, objective, and recommender are combined into an `campaign` object. The Campaign has two important methods: `recommend`, which recommends the next set of experiments, and `add_measurements', which adds a set of experiments and updates the underlying Bayesian model. + 1. [**Parameters**](https://emdgroup.github.io/baybe/0.14.2/userguide/parameters.html): In our setting, a _parameter_ is something that we can control directly. An example of this is which ligand to choose, or at which of the available temperatures to run the experiment. Each of the 5 parameters described earlier will correspond to exactly one of `BayBE`'s `Parameter`s. + 2. [**Search space**](https://emdgroup.github.io/baybe/0.14.2/userguide/searchspace.html): The search space defines the combination of parameters to be searched. It thus contains all possible experiments that we could conduct. The search space is typically defined using the function `SearchSpace.from_product`, which creates a search space as the Cartesian product of the parameters. + 3. [**Target**](https://emdgroup.github.io/baybe/0.14.2/userguide/targets.html): The target is the quantity we are optimizing. In the case of reaction optimization, this is typically the yield. `BayBE` can optimize a single parameter or multiple parameters at once. 
In this notebook, we'll focus on single parameter optimization, where we are only optimizing the yield, and we hence stick to single target optimization. + 4. [**Recommender**](https://emdgroup.github.io/baybe/0.14.2/userguide/recommenders.html): The recommender selects the next set of experiments to be performed. In this case, we use the default [`TwoPhaseMetaRecommender`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.recommenders.meta.sequential.TwoPhaseMetaRecommender.html). This recommender behaves differently depending on whether it has experimental data. At the beginning of an optimization process, we typically don't have experimental data and want to find a diverse set of conditions to gather some initial data. If the `TwoPhaseMetaRecommender` has no data available, it uses random sampling to select a set of initial experiments. If the recommender has data, it uses the [`BotorchRecommender`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.recommenders.pure.bayesian.botorch.BotorchRecommender.html), a Bayesian optimizer that balances exploration and exploitation when selecting sets of reaction conditions. + 5. [**Campaign**](https://emdgroup.github.io/baybe/0.14.2/userguide/campaigns.html): In `BayBE`, the search space, objective, and recommender are combined into a `Campaign` object. The `Campaign` has two important methods: `recommend`, which recommends the next set of experiments, and `add_measurements`, which adds a set of experiments and updates the underlying Bayesian model. 
""") return @@ -102,26 +112,35 @@ def _(mo): mo.md(""" ## Defining the [`Parameters`](https://emdgroup.github.io/baybe/0.14.2/userguide/parameters.html) - In this section, we introduce two different parameter types: The [`CategoricalParameter`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.parameters.categorical.CategoricalParameter.html) and the [`NumericalDiscreteParameter`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.parameters.numerical.NumericalDiscreteParameter.html). - + In this section, we introduce two different parameter types: The [`SubstanceParameter`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.parameters.substance.SubstanceParameter.html) and the [`NumericalDiscreteParameter`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.parameters.numerical.NumericalDiscreteParameter.html). - The `CategoricalParameter` has a `name` field as well as a `values` field. The `name` is used to describe the parameter, while the `values` are the collection of values that the parameter can take. In addition, one can choose a specific `encoding`. For the sake of this tutorial, we use the `One-Hot-Encoding`, `BayBE`'s default choice for `CategoricalParameter`s. + The `SubstanceParameter` is specifically designed for chemical substances and can automatically use meaningful chemical descriptors. It takes a `name` field and a `data` dictionary mapping substance names to their SMILES representations. One can also choose a specific chemical `encoding` such as MORDRED, ECFP, or RDKIT2DDESCRIPTORS. - In this tutorial, we model the three different chemical parameters, that is, the solvent, the ligand, and the base as `CategoricalParameters`. Since we have access to the data, we extract the values for the parameters from there, and create the corresponding `CategoricalParameters`. 
+ In this tutorial, we model all three chemical parameters (ligand, solvent, and base) as `SubstanceParameter`s to leverage chemical knowledge in the optimization process. Since we have access to the SMILES data, we extract the mappings from the data and create the corresponding `SubstanceParameter`s. """) return @app.cell -def _(df): - from baybe.parameters import CategoricalParameter +def _(bases_dict, ligands_dict, solvents_dict): + from baybe.parameters import SubstanceParameter - ligand = CategoricalParameter(values=df["Ligand_Name"].unique(), name="Ligand_Name") - solvent = CategoricalParameter( - values=df["Solvent_Name"].unique(), name="Solvent_Name" + ligand = SubstanceParameter( + name="Ligand_Name", + data=ligands_dict, + encoding="MORDRED" + ) + solvent = SubstanceParameter( + name="Solvent_Name", + data=solvents_dict, + encoding="MORDRED" ) - base = CategoricalParameter(values=df["Base_Name"].unique(), name="Base_Name") - return base, ligand, solvent + base = SubstanceParameter( + name="Base_Name", + data=bases_dict, + encoding="MORDRED" + ) + return SubstanceParameter, base, ligand, solvent @app.cell(hide_code=True) @@ -142,7 +161,7 @@ def _(df): temperature = NumericalDiscreteParameter( values=df["Temp_C"].unique(), name="Temp_C" ) - return concentration, temperature + return NumericalDiscreteParameter, concentration, temperature @app.cell(hide_code=True) @@ -161,7 +180,7 @@ def _(base, concentration, ligand, solvent, temperature): parameters = [ligand, solvent, base, concentration, temperature] searchspace = SearchSpace.from_product(parameters=parameters) - return (searchspace,) + return SearchSpace, searchspace @app.cell(hide_code=True) @@ -169,7 +188,7 @@ def _(mo): mo.md(r""" ## Define the [`Target`](https://emdgroup.github.io/baybe/0.14.2/userguide/targets.html) & objective - In this example, we want to maximize the yield of the reaction. 
Since we are only optimizing a single objective, we use the [`SingleTargetObjective`](https://emdgroup.github.io/baybe/0.14.2/userguide/objectives.html#singletargetobjective) and set the `mode` to `"MAX"`. + In this example, we want to maximize the yield of the reaction. Since we are only optimizing a single objective, we use the [`SingleTargetObjective`](https://emdgroup.github.io/baybe/0.14.2/userguide/objectives.html#singletargetobjective) which assumes a maximization of the target by default. """) return @@ -190,21 +209,23 @@ def _(mo): ## Define the [`Recommender`](https://emdgroup.github.io/baybe/0.14.2/userguide/recommenders.html) The [`Recommender`](https://emdgroup.github.io/baybe/0.14.2/userguide/recommenders.html) selects the next set of experiments to try. - There are many different recommenders offered by `BayBE`, and a lot of ways of combining them. For the sake of this example, we do not use the default initial recommender, but use the [`FPSRecommender`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.recommenders.pure.nonpredictive.sampling.FPSRecommender.html#baybe.recommenders.pure.nonpredictive.sampling.FPSRecommender). This choice ensures that if there is no data available, the recommender uses farthest point sampling to select a diverse set of conditions. If data is available, the [`BotorchRecommender`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.recommenders.pure.bayesian.botorch.BotorchRecommender.html#baybe.recommenders.pure.bayesian.botorch.BotorchRecommender) is used to balance exploration and exploitation and select the next set of reaction conditions. + There are many different recommenders offered by `BayBE`, and a lot of ways of combining them. 
For this example, we use a [`TwoPhaseMetaRecommender`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.recommenders.meta.sequential.TwoPhaseMetaRecommender.html) equipped with a [`BotorchRecommender`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.recommenders.pure.bayesian.botorch.BotorchRecommender.html) that uses the [`EDBOKernel`](https://emdgroup.github.io/baybe/0.14.2/_autosummary/baybe.surrogates.gaussian_process.presets.edbo.EDBOKernelFactory.html). The EDBO kernel is particularly well-suited for chemical optimization problems as it was specifically designed to handle chemical descriptors effectively. """) return @app.cell def _(): - from baybe.recommenders import ( - TwoPhaseMetaRecommender, - FPSRecommender, - BotorchRecommender, - ) + from baybe.surrogates.gaussian_process.presets.edbo import EDBOKernelFactory + from baybe.recommenders import TwoPhaseMetaRecommender, BotorchRecommender + from baybe.surrogates import GaussianProcessSurrogate recommender = TwoPhaseMetaRecommender( - initial_recommender=FPSRecommender(), recommender=BotorchRecommender() + recommender=BotorchRecommender( + surrogate_model=GaussianProcessSurrogate( + kernel_or_factory=EDBOKernelFactory() + ) + ) ) return (recommender,) @@ -226,15 +247,15 @@ def _(objective, recommender, searchspace): campaign = Campaign( searchspace=searchspace, objective=objective, recommender=recommender ) - return (campaign,) + return Campaign, campaign @app.cell(hide_code=True) def _(mo): mo.md(r""" - ## Starting [`the recommendation loop`](https://emdgroup.github.io/baybe/0.14.2/userguide/getting_recommendations.html) + ## Starting [the recommendation loop](https://emdgroup.github.io/baybe/0.14.2/userguide/getting_recommendations.html) - Now that the `campaign` is defined, we can ask it for recommendations. So far, we haven't done any experiments. As such, the `campaign` will use the farthest point sampling to select a diverse set of initial experiments. 
+ Now that the `campaign` is defined, we can ask it for recommendations. So far, we haven't done any experiments. As such, the `campaign` will use random sampling to select a set of initial experiments. """) return @@ -254,7 +275,7 @@ def _(mo): return -@app.cell +@app.cell(hide_code=True) def _(df, initial_rec, pd): merge_columns = [ "Ligand_Name", @@ -330,10 +351,203 @@ def _(campaign): @app.cell(hide_code=True) def _(mo): mo.md(""" - As we can see, we found a very good candidate, and only needed to evaluate a fraction of the search space! This insight concludes this basic BayBE tutorial. + As we can see, we found a very good candidate, and only needed to evaluate a fraction of the search space! + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Investigating Different Chemical Encodings + + Now that we've seen how to use `BayBE` for reaction optimization with chemical encodings, let's investigate how different chemical encodings affect the optimization performance. BayBE supports multiple chemical encodings for `SubstanceParameter`s, each capturing different aspects of molecular structure. + + We'll compare three different chemical encodings as well as the One-Hot-Encoding by running simulated optimization campaigns and visualizing their performance. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(""" + We create multiple campaigns, one for each encoding we want to compare. 
Each campaign will use `SubstanceParameter`s for all three chemical parameters (solvent, ligand, and base) with the specified encoding: + """) + return + + +@app.cell +def _( + Campaign, + NumericalDiscreteParameter, + SearchSpace, + SubstanceParameter, + bases_dict, + df, + ligands_dict, + objective, + recommender, + solvents_dict, +): + substance_encodings = ["MORDRED", "ECFP", "RDKIT2DDESCRIPTORS"] + scenarios = { + encoding: Campaign( + searchspace=SearchSpace.from_product( + parameters=[ + SubstanceParameter( + name="Solvent_Name", + data=solvents_dict, + encoding=encoding, + ), + SubstanceParameter( + name="Base_Name", + data=bases_dict, + encoding=encoding, + ), + SubstanceParameter( + name="Ligand_Name", + data=ligands_dict, + encoding=encoding, + ), + NumericalDiscreteParameter( + values=df["Concentration"].unique(), name="Concentration" + ), + NumericalDiscreteParameter( + values=df["Temp_C"].unique(), name="Temp_C" + ), + ] + ), + objective=objective, + recommender=recommender, + ) + for encoding in substance_encodings + } + return (scenarios,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(""" + We also want to compare the campaigns using chemical encodings to a baseline campaign that uses One-Hot Encoding (OHE) with `CategoricalParameter`s for all chemical parameters: """) return +@app.cell +def _( + Campaign, + NumericalDiscreteParameter, + SearchSpace, + df, + objective, + recommender, + scenarios, +): + from baybe.parameters import CategoricalParameter + + ohe_parameters = [ + CategoricalParameter( + name="Solvent_Name", values=df["Solvent_Name"].unique(), encoding="OHE" + ), + CategoricalParameter( + name="Base_Name", values=df["Base_Name"].unique(), encoding="OHE" + ), + CategoricalParameter( + name="Ligand_Name", values=df["Ligand_Name"].unique(), encoding="OHE" + ), + NumericalDiscreteParameter( + name="Temp_C", values=df["Temp_C"].unique() + ), + NumericalDiscreteParameter( + name="Concentration", values=df["Concentration"].unique() + ), + 
] + campaign_ohe = Campaign( + searchspace=SearchSpace.from_product(parameters=ohe_parameters), + objective=objective, + recommender=recommender, + ) + scenarios["OHE"] = campaign_ohe + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Using BayBE's [simulation capabilities](https://emdgroup.github.io/baybe/0.14.2/userguide/simulation.html) + + BayBE offers powerful simulation capabilities that allow us to compare different optimization strategies without running actual experiments. The simulation uses a lookup mechanism to retrieve target values from our dataset, effectively simulating multiple optimization campaigns with different random seeds (Monte Carlo iterations). + + Let's run the simulation with multiple Monte Carlo iterations to get statistically meaningful results: + """) + return + + +@app.cell +def _(df, scenarios): + from baybe.simulation import simulate_scenarios + + BATCH_SIZE = 2 + N_DOE_ITERATIONS = 5 # Change to ~20 for better plots + N_MC_ITERATIONS = 10 # Change to ~30 for better plots + + results = simulate_scenarios( + scenarios, + df, + batch_size=BATCH_SIZE, + n_doe_iterations=N_DOE_ITERATIONS, + n_mc_iterations=N_MC_ITERATIONS, + ) + + results.rename( + columns={ + "Scenario": "Substance encoding", + "Num_Experiments": "Number of experiments", + "yield_CumBest": "Running best yield", + }, + inplace=True, + ) + return (results,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(""" + ### Visualizing the Results + + Now let's visualize the results using the `backtest_plot` utility. This plot shows: + - The mean performance across Monte Carlo iterations (solid line) + - Confidence intervals (shaded regions) + - A horizontal guideline at 90% yield (our target threshold) + - Vertical guidelines showing when the `MORDRED` and `OHE` encoding reach the target. + + The plot shows that using the `SubstanceParameter` and hence equipping `BayBE` with chemical knowledge significantly improves the performance. 
+ """) + return + + +@app.cell +def _(mo, results): + from utils import backtest_plot + import matplotlib.pyplot as plt + + backtest_plot( + df=results, + x="Number of experiments", + y="Running best yield", + hue="Substance encoding", + indicator_y=90, + indicator_labels=["MORDRED", "OHE"], + ) + mo.mpl.interactive(plt.gcf()) + return (plt,) + + +@app.cell +def _(plt): + plt.close() + return + + if __name__ == "__main__": app.run()