Batching possible in link prediction? #3151
Unanswered
paulilioaica asked this question in Q&A
Replies: 1 comment, 4 replies
I think your code looks correct, so this is likely a problem with your data. You should check that it holds: for example, every edge_index should stay within the number of nodes of its own graph.
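A quick way to run that kind of check, sketched here as a suggestion rather than something taken from the original reply (MUTAG stands in for your own dataset):

```python
from torch_geometric.datasets import TUDataset

dataset = TUDataset('/tmp/TU', name='MUTAG')
for i, data in enumerate(dataset):
    # Every edge endpoint must stay below the node count of its own graph.
    assert data.edge_index.max().item() < data.num_nodes, f'graph {i}: edge index out of range'
    # RandomLinkSplit(is_undirected=True) expects undirected input graphs.
    assert data.is_undirected(), f'graph {i}: graph is not undirected'
```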
The following code works for me:

```python
import torch
from torch_geometric.nn import SAGEConv
import torch_geometric.transforms as T
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader
from torch_geometric.utils import batched_negative_sampling

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# RandomLinkSplit turns every graph into a (train, val, test) triple of Data
# objects; negatives for the training split are sampled on the fly below.
transform = T.RandomLinkSplit(num_val=0.05, num_test=0.1, is_undirected=True,
                              add_negative_train_samples=False)
dataset = TUDataset('/tmp/TU', name='MUTAG', transform=transform)

# Unzip the per-graph triples into three tuples of graphs and batch each split.
train_dataset, val_dataset, test_dataset = zip(*dataset)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128)
test_loader = DataLoader(test_dataset, batch_size=128)


class GNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def encode(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)

    def decode(self, z, edge_label_index):
        # Score a candidate edge by the dot product of its endpoint embeddings.
        return (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)


model = GNN(dataset.num_features, 128, 64).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
criterion = torch.nn.BCEWithLogitsLoss()


def train():
    model.train()
    loss_epoch = 0
    for data in train_loader:
        data = data.to(device)  # keep the batch on the same device as the model
        optimizer.zero_grad()
        z = model.encode(data.x, data.edge_index)

        # Sample negative edges per graph, respecting graph boundaries in the
        # batch via the `batch` assignment vector.
        neg_edge_index = batched_negative_sampling(
            data.edge_index, batch=data.batch,
            num_neg_samples=data.edge_label_index.size(1), method='sparse')

        edge_label_index = torch.cat(
            [data.edge_label_index, neg_edge_index],
            dim=-1,
        )
        edge_label = torch.cat([
            data.edge_label,
            data.edge_label.new_zeros(neg_edge_index.size(1)),
        ], dim=0)

        out = model.decode(z, edge_label_index).view(-1)
        loss = criterion(out, edge_label)
        loss.backward()
        loss_epoch += loss.item()
        optimizer.step()
    return loss_epoch / len(train_loader)


for epoch in range(1, 101):
    loss = train()
    print(loss)
```
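The reply above only reports the training loss. If you also want to evaluate the held-out splits under the same batching scheme, a sketch along the following lines should work, continuing from the snippet above; the `test()` helper and the use of scikit-learn's `roc_auc_score` are my own additions, not part of the original answer:

```python
from sklearn.metrics import roc_auc_score

@torch.no_grad()
def test(loader):
    model.eval()
    aucs = []
    for data in loader:
        data = data.to(device)
        z = model.encode(data.x, data.edge_index)
        # The val/test splits produced by RandomLinkSplit already carry both
        # positive and negative edges in edge_label_index / edge_label,
        # so no extra negative sampling is needed here.
        out = model.decode(z, data.edge_label_index).view(-1).sigmoid()
        aucs.append(roc_auc_score(data.edge_label.cpu().numpy(),
                                  out.cpu().numpy()))
    # Mean of per-batch AUCs; collect all predictions first if you need an
    # exact dataset-level score.
    return sum(aucs) / len(aucs)

print(f'val AUC: {test(val_loader):.4f}, test AUC: {test(test_loader):.4f}')
```

Calling `test(val_loader)` inside the epoch loop gives a validation curve during training as well.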
The original question, from paulilioaica:
I am following the notebook example for link prediction, and as far as I have seen, all of the link-prediction examples focus on a single graph via
data = dataset[0]
(the sketch just below shows that pattern). I am unsure whether training link prediction on graph batches is possible.
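For reference, the single-graph pattern those examples use looks roughly like this; the Planetoid/Cora dataset here is an assumption about which notebook is meant, based on PyG's official link-prediction example:

```python
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid

# With RandomLinkSplit applied as a transform, dataset[0] yields a single
# graph already split into train/val/test edge sets.
transform = T.RandomLinkSplit(num_val=0.05, num_test=0.1, is_undirected=True,
                              add_negative_train_samples=False)
dataset = Planetoid('/tmp/Planetoid', name='Cora', transform=transform)
train_data, val_data, test_data = dataset[0]
```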
I have tried the following:
where the dataloaders are created by
This yields the following error:
I thought this was related to an error in graph creation where the edge index exceeded the number of nodes, but I have checked this and I am sure that is not the issue.
Any suggestions?
Thanks