diff --git a/docs/zh/examples/smc_reac.md b/docs/zh/examples/smc_reac.md new file mode 100644 index 000000000..d2590c2c6 --- /dev/null +++ b/docs/zh/examples/smc_reac.md @@ -0,0 +1,145 @@ +# Suzuki-Miyaura 交叉偶联反应产率预测 + +!!! note + + 1. 开始训练、评估前,请先下载数据文件[data_set.xlsx](https://paddle-org.bj.bcebos.com/paddlescience/datasets/SMCReac/data_set.xlsx),并对应修改 yaml 配置文件中的 `data_dir` 为数据文件路径。 + 2. 如果需要使用预训练模型进行评估,请先下载预训练模型[smc_reac_model.pdparams](https://paddle-org.bj.bcebos.com/paddlescience/models/smc_reac/smc_reac_model.pdparams), 并对应修改 yaml 配置文件中的 `load_model_path` 为模型参数路径。 + 3. 开始训练、评估前,请安装 `rdkit` 等,相关依赖请执行`pip install -r requirements.txt`安装。 + +=== "模型训练命令" + + ``` sh + python smc_reac.py + ``` + +=== "模型评估命令" + + ``` sh + python smc_reac.py mode=eval EVAL.load_model_path=https://paddle-org.bj.bcebos.com/paddlescience/models/smc_reac/smc_reac_model.pdparams + ``` + +## 1. 背景简介 + +Suzuki-Miyaura 交叉偶联反应表达式如下所示。 + +$$ +\mathrm{Ar{-}X} + \mathrm{Ar'{-}B(OH)_2} \xrightarrow[\text{Base}]{\mathrm{Pd}^0} \mathrm{Ar{-}Ar'} + \mathrm{HX} +$$ + +在零价钯配合物催化下,芳基或烯基硼酸或硼酸酯与氯、溴、碘代芳烃或烯烃发生交叉偶联。该反应具有反应条件温和、转化率高的优点,在材料合成、药物研发等领域具有重要作用,但存在开发周期长,试错成本高的问题。本研究通过使用高通量实验数据分析反应底物(包括亲电试剂和亲核试剂),催化配体,碱基,溶剂对偶联反应产率的影响,从而建立预测模型。 + + +## 2. Suzuki-Miyaura 交叉偶联反应产率预测模型的实现 + +本节将讲解如何基于PaddleScience代码,实现对于 Suzuki-Miyaura 交叉偶联反应产率预测模型的构建、训练、测试和评估。案例的目录结构如下。 +``` log +smc_reac/ +├──config/ +│ └── smc_reac.yaml +├── data_set.xlsx +├── requirements.txt +└── smc_reac.py +``` + +### 2.1 数据集构建和载入 + +本样例使用的数据来自参考文献[1]提供的开源数据,仅考虑试剂本身对于实验结果的影响,从中筛选了各分量均有试剂参与的部分反应数据,保存在文件 `./data_set.xlsx` 中。该工作开发了一套基于流动化学(flow chemistry)的自动化平台,该平台在氩气保护的手套箱中组装,使用改良的高效液相色谱(HPLC)系统,结合自动化取样装置,从192个储液瓶中按设定程序吸取反应组分(亲电试剂、亲核试剂、催化剂、配体和碱),并注入流动载液中。每个反应段在温控反应盘管中以设定的流速、压力、时间进行反应,反应液通过UPLC-MS进行实时检测。通过调控亲电试剂、亲核试剂、11种配体、7种碱和4种溶剂的组合,最终实现了5760个反应条件的系统性筛选。接下来以其中一条数据为例结合代码说明数据集的构建与载入流程。 + +``` +ClC=1C=C2C=CC=NC2=CC1 | CC=1C(=C2C=NN(C2=CC1)C1OCCCC1)B(O)O | C(C)(C)(C)P(C(C)(C)C)C(C)(C)C | [OH-].[Na+] | C(C)#N | 4.76 +``` +其中用SMILES依次表示亲电试剂、亲核试剂、催化配体、碱、溶剂和实验产率。 + +首先从表格文件中将实验材料信息和反应产率进行导入,并划分训练集和测试集, + +``` py linenums="27" title="examples/smc_reac/smc_reac.py" +--8<-- +examples/smc_reac/smc_reac.py:27:35 +--8<-- +``` + +应用 `rdkit.Chem.rdFingerprintGenerator` 将亲电试剂、亲核试剂、催化配体、碱和溶剂的SMILES描述转换为 Morgan 指纹。Morgan指纹是一种分子结构的向量化描述,通过局部拓扑被编码为 hash 值,映射到2048位指纹位上。用 PaddleScience 代码表示如下 + +``` py linenums="38" title="examples/smc_reac/smc_reac.py" +--8<-- +examples/smc_reac/smc_reac.py:38:66 +--8<-- +``` + +### 2.2 约束构建 + +本案例采用监督学习,按照 PaddleScience 的API结构说明,采用内置的 `SupervisedConstraint` 构建监督约束。用 PaddleScience 代码表示如下 + +``` py linenums="73" title="examples/smc_reac/smc_reac.py" +--8<-- +examples/smc_reac/smc_reac.py:73:89 +--8<-- +``` +`SupervisedConstraint` 的第二个参数表示采用均方误差 `MSELoss` 作为损失函数,第三个参数表示约束条件的名字,方便后续对其索引。 + +### 2.3 模型构建 + +本案例设计了五条独立的子网络(全连接层+ReLU激活),每条子网络分别提取对应化学物质的特征。随后,这五个特征向量通过可训练的权重参数进行加权平均,实现不同化学成分对反应产率预测影响的自适应学习。最后,将融合后的特征输入到一个全连接层进行进一步映射,输出反应产率预测值。整个网络结构体现了对反应中各组成成分信息的独立提取与有权重的融合,符合反应机理特性。用 PaddleScience 代码表示如下 + +``` py linenums="7" title="ppsci/arch/smc_reac.py" +--8<-- +ppsci/arch/smc_reac.py:7:107 +--8<-- +``` + +模型依据配置文件信息进行实例化 + +``` py linenums="91" title="examples/smc_reac/smc_reac.py" +--8<-- +examples/smc_reac/smc_reac.py:91:91 +--8<-- +``` + +参数通过配置文件进行设置如下 + +``` py linenums="35" title="examples/smc_reac/config/smc_reac.yaml" +--8<-- +examples/smc_reac/config/smc_reac.yaml:35:41 +--8<-- +``` + +### 2.4 优化器构建 + +训练器采用Adam优化器,学习率设置由配置文件给出。用 PaddleScience 代码表示如下 + +``` py linenums="93" title="examples/smc_reac/smc_reac.py" +--8<-- +examples/smc_reac/smc_reac.py:93:93 +--8<-- +``` + +### 2.5 模型训练 + +完成上述设置之后,只需要将上述实例化的对象按顺序传递给`ppsci.solver.Solver`,然后启动训练即可。用PaddleScience 代码表示如下 + +``` py linenums="95" title="examples/smc_reac/smc_reac.py" +--8<-- +examples/smc_reac/smc_reac.py:95:105 +--8<-- +``` + +## 3. 完整代码 + +``` py linenums="1" title="examples/smc_reac/smc_reac.py" +--8<-- +examples/smc_reac/smc_reac.py +--8<-- +``` + +## 4. 结果展示 + +下图展示对 Suzuki-Miyaura 交叉偶联反应产率的模型预测结果。 + +
+ ![chem.png](https://paddle-org.bj.bcebos.com/paddlescience/docs/SMCReac/chem.png){ loading=lazy } +
Suzuki-Miyaura 交叉偶联反应产率的模型预测结果
+
+ +## 5. 参考文献 + +[1] Perera D, Tucker J W, Brahmbhatt S, et al. A platform for automated nanomole-scale reaction screening and micromole-scale synthesis in flow[J]. Science, 2018, 359(6374): 429-434. diff --git a/examples/smc_reac/config/smc_reac.yaml b/examples/smc_reac/config/smc_reac.yaml new file mode 100644 index 000000000..5232ee9cb --- /dev/null +++ b/examples/smc_reac/config/smc_reac.yaml @@ -0,0 +1,56 @@ +defaults: + - ppsci_default + - TRAIN: train_default + - TRAIN/ema: ema_default + - TRAIN/swa: swa_default + - EVAL: eval_default + - INFER: infer_default + - hydra/job/config/override_dirname/exclude_keys: exclude_keys_default + - _self_ + +hydra: + run: + # dynamic output directory according to running time and override name + dir: ./outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}/${hydra.job.override_dirname} # + job: + name: ${mode} # name of logfile + chdir: false # keep current working directory unchanged + callbacks: + init_callback: # + _target_: ppsci.utils.callbacks.InitCallback # + sweep: + # output directory for multirun + dir: ${hydra.run.dir} + subdir: ./ + +# general settings +mode: train # running mode: train/eval # +seed: 42 # +output_dir: ${hydra:run.dir} # +log_freq: 20 # +use_tbd: false # +data_dir: "./data_set.xlsx" # + +# model settings +MODEL: # + input_dim : 2048 # Assuming x_train is your DataFrame + output_dim : 1 + hidden_dim : 512 + hidden_dim2 : 1024 + hidden_dim3 : 2048 + hidden_dim4 : 1024 + +# training settings +TRAIN: # + epochs: 1500 # + iters_per_epoch: 20 # + batch_size: 8 # + learning_rate: 0.0001 + pretrained_model_path: null + checkpoint_path: null + +# evaluation settings +EVAL: + pretrained_model_path: null + batch_size: 8 # + seed: 20 diff --git a/examples/smc_reac/requirements.txt b/examples/smc_reac/requirements.txt new file mode 100644 index 000000000..2b7371b34 --- /dev/null +++ b/examples/smc_reac/requirements.txt @@ -0,0 +1,3 @@ +openpyxl +rdkit +scikit-learn diff --git a/examples/smc_reac/smc_reac.py b/examples/smc_reac/smc_reac.py new file mode 100644 index 000000000..30356e617 --- /dev/null +++ b/examples/smc_reac/smc_reac.py @@ -0,0 +1,182 @@ +import os + +import hydra +import matplotlib.pyplot as plt +import numpy as np +import paddle +import pandas as pd +import rdkit.Chem as Chem +from omegaconf import DictConfig +from rdkit.Chem import rdFingerprintGenerator +from sklearn.model_selection import train_test_split + +import ppsci + +paddle.set_device("gpu:5") +os.environ["HYDRA_FULL_ERROR"] = "1" +os.environ["KMP_DUPLICATE_LIB_OK"] = "True" +plt.rcParams["axes.unicode_minus"] = False +plt.rcParams["font.sans-serif"] = ["DejaVu Sans"] + +x_train = None +x_test = None +y_train = None +y_test = None + + +def load_data(cfg: DictConfig): + data_dir = cfg.data_dir + dataset = pd.read_excel(data_dir, skiprows=1) + x = dataset.iloc[:, 1:6] + y = dataset.iloc[:, 6] + x_train, x_test, y_train, y_test = train_test_split( + x, y, test_size=0.2, random_state=42 + ) + return x_train, x_test, y_train, y_test + + +def data_processed(x, y): + x = build_dataset(x) + y = paddle.to_tensor(y.to_numpy(dtype=np.float32)) + y = paddle.unsqueeze(y, axis=1) + return x, y + + +def build_dataset(data): + r1 = paddle.to_tensor(np.array(cal_print(data.iloc[:, 0])), dtype=paddle.float32) + r2 = paddle.to_tensor(np.array(cal_print(data.iloc[:, 1])), dtype=paddle.float32) + ligand = paddle.to_tensor( + np.array(cal_print(data.iloc[:, 2])), dtype=paddle.float32 + ) + base = paddle.to_tensor(np.array(cal_print(data.iloc[:, 3])), dtype=paddle.float32) + solvent = paddle.to_tensor( + np.array(cal_print(data.iloc[:, 4])), dtype=paddle.float32 + ) + return paddle.concat([r1, r2, ligand, base, solvent], axis=1) + + +def cal_print(smiles): + vectors = [] + for smi in smiles: + mol = Chem.MolFromSmiles(smi) + generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048) + fp = generator.GetFingerprint(mol) + _input = np.array(list(map(float, fp.ToBitString()))) + vectors.append(_input) + return vectors + + +def train(cfg: DictConfig): + global x_train, y_train + x_train, y_train = data_processed(x_train, y_train) + + # build supervised constraint + sup = ppsci.constraint.SupervisedConstraint( + dataloader_cfg={ + "dataset": { + "input": {"v": x_train}, + "label": {"u": y_train}, + # "weight": {"W": param}, + "name": "IterableNamedArrayDataset", + }, + "batch_size": cfg.TRAIN.batch_size, + }, + loss=ppsci.loss.MSELoss("mean"), + name="sup", + ) + constraint = { + "sup": sup, + } + + model = ppsci.arch.SuzukiMiyauraModel(**cfg.MODEL) + + optimizer = ppsci.optimizer.optimizer.Adam(cfg.TRAIN.learning_rate)(model) + + # Build solver + solver = ppsci.solver.Solver( + model, + constraint=constraint, + optimizer=optimizer, + epochs=cfg.TRAIN.epochs, + eval_during_train=False, + iters_per_epoch=cfg.TRAIN.iters_per_epoch, + cfg=cfg, + ) + solver.train() + + +def evaluate(cfg: DictConfig): + global x_test, y_test + + x_test, y_test = data_processed(x_test, y_test) + + test_validator = ppsci.validate.SupervisedValidator( + dataloader_cfg={ + "dataset": { + "input": {"v": x_test}, + "label": {"u": y_test}, + "name": "IterableNamedArrayDataset", + }, + "batch_size": cfg.EVAL.batch_size, + "shuffle": False, + }, + loss=ppsci.loss.MSELoss("mean"), + metric={ + "MAE": ppsci.metric.MAE(), + "RMSE": ppsci.metric.RMSE(), + "R2": ppsci.metric.R2Score(), + }, + name="test_eval", + ) + validators = {"test_eval": test_validator} + + model = ppsci.arch.SuzukiMiyauraModel(**cfg.MODEL) + solver = ppsci.solver.Solver( + model, + validator=validators, + cfg=cfg, + ) + + loss_val, metric_dict = solver.eval() + + ypred = model({"v": x_test})["u"].numpy() + ytrue = y_test.numpy() + + mae = metric_dict["MAE"]["u"] + rmse = metric_dict["RMSE"]["u"] + r2 = metric_dict["R2"]["u"] + + plt.figure() + plt.scatter(ytrue, ypred, s=15, color="royalblue", marker="s", linewidth=1) + plt.plot([ytrue.min(), ytrue.max()], [ytrue.min(), ytrue.max()], "r-", lw=1) + plt.legend(title="R²={:.3f}\n\nMAE={:.3f}".format(r2, mae)) + plt.xlabel("Test Yield(%)") + plt.ylabel("Predicted Yield(%)") + save_path = "smc_reac.png" + plt.savefig(save_path) + print(f"Image saved to: {save_path}") + plt.show() + + print("Evaluation metrics:") + print(f"Loss: {loss_val:.4f}") + print(f"MAE : {mae:.4f}") + print(f"RMSE: {rmse:.4f}") + print(f"R2 : {r2:.4f}") + + +@hydra.main(version_base=None, config_path="./config", config_name="smc_reac.yaml") +def main(cfg: DictConfig): + global x_train, x_test, y_train, y_test + + x_train, x_test, y_train, y_test = load_data(cfg) + + if cfg.mode == "train": + train(cfg) + elif cfg.mode == "eval": + evaluate(cfg) + else: + raise ValueError(f"cfg.mode should in ['train', 'eval'], but got '{cfg.mode}'") + + +if __name__ == "__main__": + main() diff --git a/mkdocs.yml b/mkdocs.yml index 2734d2a4d..3f8d57fd0 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -116,6 +116,7 @@ nav: - UNetFormer: zh/examples/unetformer.md - WGAN_GP: zh/examples/wgan_gp.md - 化学科学(AI for Chemistry): + - SMC Reac: zh/examples/smc_reac.md - Moflow: zh/examples/moflow.md - IFM: zh/examples/ifm.md diff --git a/ppsci/arch/__init__.py b/ppsci/arch/__init__.py index d0d99db16..d3deacf68 100644 --- a/ppsci/arch/__init__.py +++ b/ppsci/arch/__init__.py @@ -21,6 +21,7 @@ from ppsci.arch.amgnet import AMGNet # isort:skip from ppsci.arch.base import Arch # isort:skip from ppsci.arch.cfdgcn import CFDGCN # isort:skip +from ppsci.arch.smc_reac import SuzukiMiyauraModel # isort:skip from ppsci.arch.chip_deeponets import ChipDeepONets # isort:skip from ppsci.arch.crystalgraphconvnet import CrystalGraphConvNet # isort:skip from ppsci.arch.cuboid_transformer import CuboidTransformer # isort:skip @@ -73,6 +74,7 @@ "AutoEncoder", "build_model", "CFDGCN", + "SuzukiMiyauraModel", "ChipDeepONets", "CrystalGraphConvNet", "CuboidTransformer", diff --git a/ppsci/arch/smc_reac.py b/ppsci/arch/smc_reac.py new file mode 100644 index 000000000..8664877ec --- /dev/null +++ b/ppsci/arch/smc_reac.py @@ -0,0 +1,107 @@ +import paddle +from paddle import nn + +from ppsci.arch import base + + +class SuzukiMiyauraModel(base.Arch): + def __init__( + self, input_dim, hidden_dim, hidden_dim2, hidden_dim3, hidden_dim4, output_dim + ): + super().__init__() + + self.r1_fc = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim2), + nn.ReLU(), + nn.Linear(hidden_dim2, hidden_dim3), + ) + + self.r2_fc = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim2), + nn.ReLU(), + nn.Linear(hidden_dim2, hidden_dim3), + ) + + self.ligand_fc = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim2), + nn.ReLU(), + nn.Linear(hidden_dim2, hidden_dim3), + ) + + self.base_fc = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim2), + nn.ReLU(), + nn.Linear(hidden_dim2, hidden_dim3), + ) + + self.solvent_fc = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim2), + nn.ReLU(), + nn.Linear(hidden_dim2, hidden_dim3), + nn.ReLU(), + ) + + self.weights = paddle.create_parameter( + shape=[5], + dtype="float32", + default_initializer=paddle.nn.initializer.Assign( + paddle.to_tensor([0.2, 0.2, 0.2, 0.2, 0.2]) + ), + ) + + self.fc_combined = nn.Sequential( + nn.Linear(hidden_dim3, hidden_dim4), + nn.ReLU(), + nn.Linear(hidden_dim4, output_dim), + ) + + def weighted_average(self, features, weights): + + weights = weights.clone().detach() + + weighted_sum = sum(f * w for f, w in zip(features, weights)) + + total_weight = weights.sum() + + return weighted_sum / total_weight + + def forward(self, x): + x = self.concat_to_tensor(x, ("v"), axis=-1) + + input_splits = paddle.split(x, num_or_sections=5, axis=1) + + r1_input, r2_input, ligand_input, base_input, solvent_input = input_splits + + r1_features = self.r1_fc(r1_input) + + r2_features = self.r2_fc(r2_input) + + ligand_features = self.ligand_fc(ligand_input) + + base_features = self.base_fc(base_input) + + solvent_features = self.solvent_fc(solvent_input) + + features = [ + r1_features, + r2_features, + ligand_features, + base_features, + solvent_features, + ] + + combined_features = self.weighted_average(features, self.weights) + + output = self.fc_combined(combined_features) + output = self.split_to_dict(output, ("u"), axis=-1) + return output