Merge pull request #27 from AstraZeneca/deepsynergy

benedekrozemberczki · web-flow · commit 732660a57d51 · 2022-01-14T09:42:14.000Z
DeepSynergy and EPGCN-DS
diff --git a/README.md b/README.md
@@ -42,7 +42,7 @@ Our framework solves the so called [drug pair scoring task](https://arxiv.org/ab
 
 **Case Study Tutorials**
 
-We provide in-depth case study tutorials in the [Documentation](https://chemicalx.readthedocs.io/en/latest/), each covers an aspect of ChemicalX’s functionality.
+We provide in-depth, case-study-style tutorials in the [Documentation](https://chemicalx.readthedocs.io/en/latest/); each covers an aspect of ChemicalX’s functionality.
 
 --------------------------------------------------------------------------------
 
@@ -59,18 +59,9 @@ If you find *ChemicalX* and the new datasets useful in your research, please con
 }
 ```
 
---------------------------------------------------------------------------------
-
-**A simple example**
-
-```python
-
-```
---------------------------------------------------------------------------------
-
 **Methods Included**
 
-In detail, the following temporal graph neural networks were implemented.
+In detail, the following drug pair scoring models were implemented.
 
 **2017**
 
@@ -112,13 +103,6 @@ In detail, the following temporal graph neural networks were implemented.
 
 --------------------------------------------------------------------------------
 
-**Auxiliary Layers**
-
-
-
---------------------------------------------------------------------------------
-
-
 Head over to our [documentation](https://chemicalx.readthedocs.io) to find out more about installation, creation of datasets and a full list of implemented methods and available datasets.
 For a quick start, check out the [examples](https://chemicalx.readthedocs.io) in the `examples/` directory.
 
diff --git a/chemicalx/models/deepsynergy.py b/chemicalx/models/deepsynergy.py
@@ -19,10 +19,10 @@ def __init__(
         self,
         context_channels: int,
         drug_channels: int,
-        input_hidden_channels: int,
-        middle_hidden_channels: int,
-        final_hidden_channels: int,
-        dropout_rate: float,
+        input_hidden_channels: int = 32,
+        middle_hidden_channels: int = 32,
+        final_hidden_channels: int = 32,
+        dropout_rate: float = 0.5,
     ):
         super(DeepSynergy, self).__init__()
         self.encoder = torch.nn.Linear(drug_channels + drug_channels + context_channels, input_hidden_channels)
@@ -37,7 +37,16 @@ def forward(
         drug_features_left: torch.FloatTensor,
         drug_features_right: torch.FloatTensor,
     ) -> torch.FloatTensor:
+        """
+        A forward pass of the DeepSynergy model.
 
+        Args:
+            context_features (torch.FloatTensor): A matrix of biological context features.
+            drug_features_left (torch.FloatTensor): A matrix of head drug features.
+            drug_features_right (torch.FloatTensor): A matrix of tail drug features.
+        Returns:
+            hidden (torch.FloatTensor): A column vector of predicted synergy scores.
+        """
         hidden = torch.cat([context_features, drug_features_left, drug_features_right], dim=1)
         hidden = self.encoder(hidden)
         hidden = F.relu(hidden)
diff --git a/chemicalx/models/epgcnds.py b/chemicalx/models/epgcnds.py
@@ -1,12 +1,49 @@
-from .base import Model
+import torch
+import torch.nn.functional as F
+from torchdrug.data import PackedGraph
+from torchdrug.layers import MeanReadout
+from torchdrug.models import GraphConvolutionalNetwork
 
-__all__ = [
-    "EPGCNDS",
-]
 
+class EPGCNDS(torch.nn.Module):
+    r"""The EPGCN-DS model from the `"Structure-Based Drug-Drug Interaction Detection
+    via Expressive Graph Convolutional Networks and Deep Sets" <https://ojs.aaai.org/index.php/AAAI/article/view/7236>`_ paper.
 
-class EPGCNDS(Model):
-    """An implementation of the EPGCNDS model.
-
-    .. seealso:: https://github.com/AstraZeneca/chemicalx/issues/22
+    Args:
+        in_channels (int): The number of molecular features.
+        hidden_channels (int): The number of graph convolutional filters.
+        out_channels (int): The number of hidden layer neurons in the last layer.
     """
+
+    def __init__(self, in_channels: int, hidden_channels: int = 32, out_channels: int = 16):
+        super(EPGCNDS, self).__init__()
+        self.graph_convolution_in = GraphConvolutionalNetwork(in_channels, hidden_channels)
+        self.graph_convolution_out = GraphConvolutionalNetwork(hidden_channels, out_channels)
+        self.mean_readout = MeanReadout()
+        self.final = torch.nn.Linear(out_channels, 1)
+
+    def forward(self, molecules_left: PackedGraph, molecules_right: PackedGraph) -> torch.FloatTensor:
+        """
+        A forward pass of the EPGCN-DS model.
+
+        Args:
+            molecules_left (PackedGraph): Batched molecules for the left side drugs.
+            molecules_right (PackedGraph): Batched molecules for the right side drugs.
+        Returns:
+            hidden (torch.FloatTensor): A column vector of predicted synergy scores.
+        """
+        features_left = self.graph_convolution_in(molecules_left, molecules_left.data_dict["node_feature"])[
+            "node_feature"
+        ]
+        features_right = self.graph_convolution_in(molecules_right, molecules_right.data_dict["node_feature"])[
+            "node_feature"
+        ]
+
+        features_left = self.graph_convolution_out(molecules_left, features_left)["node_feature"]
+        features_right = self.graph_convolution_out(molecules_right, features_right)["node_feature"]
+
+        features_left = self.mean_readout(molecules_left, features_left)
+        features_right = self.mean_readout(molecules_right, features_right)
+        hidden = features_left + features_right
+        hidden = torch.sigmoid(self.final(hidden))
+        return hidden
diff --git a/examples/deepsynergy_examples.py b/examples/deepsynergy_examples.py
@@ -0,0 +1,54 @@
+import torch
+import pandas as pd
+from tqdm import tqdm
+from sklearn.metrics import roc_auc_score
+from chemicalx.models import DeepSynergy
+from chemicalx.data import DatasetLoader, BatchGenerator
+
+loader = DatasetLoader("drugcombdb")
+
+drug_feature_set = loader.get_drug_features()
+context_feature_set = loader.get_context_features()
+labeled_triples = loader.get_labeled_triples()
+
+train_triples, test_triples = labeled_triples.train_test_split()
+
+generator = BatchGenerator(
+    batch_size=5120, context_features=True, drug_features=True, drug_molecules=False, labels=True
+)
+
+generator.set_data(context_feature_set, drug_feature_set, train_triples)
+
+model = DeepSynergy(context_channels=112, drug_channels=256)
+
+optimizer = torch.optim.Adam(model.parameters())
+
+model.train()
+
+loss = torch.nn.BCELoss()
+
+for epoch in tqdm(range(100)):
+    for batch in generator:
+        optimizer.zero_grad()
+
+        prediction = model(batch.context_features, batch.drug_features_left, batch.drug_features_right)
+
+        loss_value = loss(prediction, batch.labels)
+        loss_value.backward()
+        optimizer.step()
+
+model.eval()
+
+generator.set_labeled_triples(test_triples)
+
+predictions = []
+for batch in generator:
+    prediction = model(batch.context_features, batch.drug_features_left, batch.drug_features_right)
+    prediction = prediction.detach().cpu().numpy()
+    identifiers = batch.identifiers
+    identifiers["prediction"] = prediction
+    predictions.append(identifiers)
+
+predictions = pd.concat(predictions)
+au_roc = roc_auc_score(predictions["label"], predictions["prediction"])
+print(f"AUROC : {au_roc:.4f}")
diff --git a/examples/epgcnds_examples.py b/examples/epgcnds_examples.py
@@ -0,0 +1,51 @@
+import torch
+import pandas as pd
+from tqdm import tqdm
+from sklearn.metrics import roc_auc_score
+from chemicalx.data import DatasetLoader, BatchGenerator
+from chemicalx.models import EPGCNDS
+
+loader = DatasetLoader("drugcombdb")
+
+drug_feature_set = loader.get_drug_features()
+context_feature_set = loader.get_context_features()
+labeled_triples = loader.get_labeled_triples()
+
+
+generator = BatchGenerator(batch_size=1024, context_features=True, drug_features=True, drug_molecules=True, labels=True)
+
+train_triples, test_triples = labeled_triples.train_test_split()
+
+generator.set_data(context_feature_set, drug_feature_set, train_triples)
+
+
+model = EPGCNDS(69)
+
+model.train()
+
+optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=10 ** -7)
+
+loss = torch.nn.BCELoss()
+
+for epoch in range(20):
+    for batch in tqdm(generator):
+        optimizer.zero_grad()
+        prediction = model(batch.drug_molecules_left, batch.drug_molecules_right)
+        output = loss(prediction, batch.labels)
+        output.backward()
+        optimizer.step()
+
+model.eval()
+generator.set_labeled_triples(test_triples)
+
+predictions = []
+for batch in tqdm(generator):
+    prediction = model(batch.drug_molecules_left, batch.drug_molecules_right)
+    prediction = prediction.detach().cpu().numpy()
+    identifiers = batch.identifiers
+    identifiers["prediction"] = prediction
+    predictions.append(identifiers)
+
+predictions = pd.concat(predictions)
+au_roc = roc_auc_score(predictions["label"], predictions["prediction"])
+print(f"AUROC : {au_roc:.4f}")
diff --git a/tests/unit/test_models.py b/tests/unit/test_models.py
@@ -26,8 +26,9 @@ def setUp(self):
         drug_feature_set = loader.get_drug_features()
         context_feature_set = loader.get_context_features()
         labeled_triples = loader.get_labeled_triples()
+        labeled_triples, _ = labeled_triples.train_test_split(train_size=0.005)
         self.generator = BatchGenerator(
-            batch_size=5120, context_features=True, drug_features=True, drug_molecules=True, labels=True
+            batch_size=32, context_features=True, drug_features=True, drug_molecules=True, labels=True
         )
         self.generator.set_data(context_feature_set, drug_feature_set, labeled_triples)
 
@@ -40,8 +41,19 @@ def test_DPDDI(self):
         assert model.x == 2
 
     def test_EPGCNDS(self):
-        model = EPGCNDS(x=2)
-        assert model.x == 2
+
+        model = EPGCNDS(in_channels=69)
+
+        optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
+        model.train()
+        loss = torch.nn.BCELoss()
+        for batch in self.generator:
+            optimizer.zero_grad()
+            prediction = model(batch.drug_molecules_left, batch.drug_molecules_right)
+            output = loss(prediction, batch.labels)
+            output.backward()
+            optimizer.step()
+            assert prediction.shape[0] == batch.labels.shape[0]
 
     def test_GCNBMP(self):
         model = GCNBMP(x=2)