MPNN model has a main memory leak, why? #9357

tails1234 · 2024-05-24T19:45:57Z

tails1234
May 24, 2024

I am training the model using CUDA, but with each epoch the used main memory and swap memory keep increasing. I tried gc.collect(), but it did not resolve the issue. I don't know if the leak is happening inside the model class or somewhere else: the first epoch starts with 5 GB used, and by around the fifth epoch 32 GB of memory is used, after which the program is killed. I am using Python 3.10, torch 2.0.1+cu118, and torch-geometric 2.3.1. I've read elsewhere that tensors held inside lists are sometimes not released; perhaps this happens inside the torch.cat function? I used the edge convolution tutorial in the docs for reference, by the way. Here is the code:

class MPNN(MessagePassing):
    """Message-passing layer that refreshes node and edge features together.

    A forward pass returns a pair ``(node_out, edge_out)``:

    * ``node_out`` comes from ``propagate``: per-edge messages are built by
      :meth:`message`, aggregated with ``aggr`` and then passed through
      :meth:`update`.
    * ``edge_out`` comes from ``edge_updater`` and is a residual update of
      ``edge_attr`` computed by :meth:`edge_update`.

    Both outputs are computed from the *input* ``x``; the edge update does
    not see the freshly propagated node features.

    The parameter names of ``message``, ``update`` and ``edge_update``
    (``x_i``, ``x_j``, ``edge_attr``, ``x``) are part of the contract with
    the base class, which resolves them by name, so they must not be renamed.

    Args:
        Vertex_in_channels: Size of each node feature vector (also the
            output size of the node update).
        edge_in_channels: Size of each edge feature vector (also the output
            size of the edge update).
        aggr: Aggregation scheme forwarded to ``MessagePassing``.
    """

    def __init__(self, Vertex_in_channels, edge_in_channels, aggr):
        # Initialise the base class before touching any attribute: setting
        # attributes on an ``nn.Module`` before ``__init__`` has run is
        # fragile, and the base class records ``aggr`` itself.
        super().__init__(aggr=aggr)

        node_dim = Vertex_in_channels
        edge_dim = edge_in_channels

        # Maps [x_j - x_i, edge_attr] -> message of node size.
        self.linearMessage = nn.Sequential(
            nn.Linear(node_dim + edge_dim, node_dim),
            nn.ReLU(inplace=True),
        )
        # Maps (aggregated messages + x) -> new node features.
        self.linearNode = nn.Sequential(
            nn.Linear(node_dim, node_dim),
            nn.ReLU(inplace=True),
        )
        # Maps [x_i, edge_attr, x_j] -> edge-feature residual.
        self.LinearEdge = nn.Sequential(
            nn.Linear(2 * node_dim + edge_dim, edge_dim),
            nn.ReLU(inplace=True),
        )
        self.reset_parameters()

    def forward(self, x, edge_index, edge_attr):
        """Return ``(updated_nodes, updated_edges)`` for one layer."""
        # Node update first, then edge update, matching the original
        # evaluation order; both read the same input ``x``.
        node_out = self.propagate(edge_index, x=x, edge_attr=edge_attr)
        edge_out = self.edge_updater(
            edge_index=edge_index, edge_attr=edge_attr, x=x
        )
        return node_out, edge_out

    def message(self, x_i, x_j, edge_attr):
        """Build the per-edge message from the endpoint difference and edge features."""
        return self.linearMessage(torch.cat([x_j - x_i, edge_attr], dim=-1))

    def update(self, aggr_out, x):
        """Combine aggregated messages with the node's own features (skip sum)."""
        return self.linearNode(aggr_out + x)

    def edge_update(self, edge_attr, x_i, x_j):
        """Residual edge update from both endpoints and the current edge features."""
        return edge_attr + self.LinearEdge(
            torch.cat([x_i, edge_attr, x_j], dim=-1)
        )

    def reset_parameters(self):
        """Re-initialise the base class state and all three sub-networks."""
        super().reset_parameters()
        reset(self.linearMessage)
        reset(self.linearNode)
        reset(self.LinearEdge)

class edgeClassifier(nn.Module):
    """Edge classifier: 1-D conv edge encoder, three MPNN layers, MLP head.

    Pipeline of :meth:`forward`:

    1. ``edge_1D_conv`` encodes each edge's raw feature sequence and flattens
       the result to ``edge_op_channels`` values per edge.
    2. Three ``MPNN`` layers successively refine node and edge features.
    3. For every edge, the features of both endpoints, the refined edge
       features and ``metaFeatures`` are concatenated and fed to an MLP that
       emits ``classifier_output_channels`` (2) values.

    Args:
        vertexFeatures: Number of features per node.
        edgeFeatures: Length of the raw per-edge feature sequence.
        aggr: Aggregation name; stored on the instance (see note in
            ``__init__`` about how it is currently used).
        device: Device label; only stored, never used to move tensors here.
    """

    def __init__(self, vertexFeatures, edgeFeatures, aggr, device='cuda'):
        super().__init__()
        self.device = device
        # The attribute was originally misspelled; keep the old name as an
        # alias so code that reads ``model.deivce`` continues to work.
        self.deivce = device
        self.aggr = aggr
        self.vertexFeatures = vertexFeatures
        self.edgeFeatures = edgeFeatures

        # Conv1d(in_channels=1) implies edge_attr is shaped
        # (num_edges, 1, edgeFeatures). The two stride-2 max-pools each halve
        # the length (rounding up); the final conv has 10 channels, which
        # Flatten merges with the remaining length.
        self.edge_1D_conv = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=50, kernel_size=3, stride=1, padding=1, bias=True),
            nn.MaxPool1d(kernel_size=3, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=50, out_channels=100, kernel_size=3, stride=1, padding=1, bias=True),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=100, out_channels=50, kernel_size=3, stride=1, padding=1, bias=True),
            nn.MaxPool1d(kernel_size=3, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=50, out_channels=20, kernel_size=3, stride=1, padding=1, bias=True),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=20, out_channels=10, kernel_size=3, stride=1, padding=1, bias=True),
            nn.Flatten(start_dim=1, end_dim=2),
        )

        # Flattened encoder width: ceil(L / 4) positions * 10 channels.
        self.edge_op_channels = ceil(self.edgeFeatures / (2 ** 2)) * 10

        # NOTE(review): the layers are hard-coded to 'max' although an
        # ``aggr`` argument is accepted and stored. Left unchanged to keep
        # existing results identical -- confirm whether ``aggr`` should be
        # forwarded here instead.
        self.mpnn_1 = MPNN(self.vertexFeatures, self.edge_op_channels, 'max')
        self.mpnn_2 = MPNN(self.vertexFeatures, self.edge_op_channels, 'max')
        self.mpnn_3 = MPNN(self.vertexFeatures, self.edge_op_channels, 'max')

        # Two endpoint vectors + edge vector + 2 extra columns; the "+ 2"
        # presumably matches the width of ``metaFeatures`` -- TODO confirm.
        self.classifier_input_channels = (
            2 * self.vertexFeatures + self.edge_op_channels + 2
        )
        self.classifier_hidden_channels = floor(self.classifier_input_channels / 2)
        self.classifier_hidden_channels_2 = floor(self.classifier_hidden_channels / 2)
        self.classifier_output_channels = 2
        self.classifier = nn.Sequential(
            nn.Linear(self.classifier_input_channels, self.classifier_hidden_channels),
            nn.ReLU(inplace=True),
            nn.Linear(self.classifier_hidden_channels, self.classifier_hidden_channels_2),
            nn.ReLU(inplace=True),
            nn.Linear(self.classifier_hidden_channels_2, self.classifier_output_channels),
        )

    def forward(self, x, edge_attr, edge_index, orgEdgeList, metaFeatures, isEval: bool = False):
        """Score every edge in ``edge_index``.

        Args:
            x: Node feature matrix.
            edge_attr: Raw per-edge sequences fed to the 1-D conv encoder.
            edge_index: Edge endpoints; row 0 and row 1 index into the node
                features.
            orgEdgeList: Unused; kept so existing call sites stay valid.
            metaFeatures: Extra per-edge columns concatenated before the MLP.
            isEval: When true, also return the final node embeddings.

        Returns:
            ``out`` (per-edge classifier output), or ``(out, x_e)`` when
            ``isEval`` is true.
        """
        edge_out = self.edge_1D_conv(edge_attr)

        x_e, e_e = self.mpnn_1(x, edge_index, edge_out)
        x_e, e_e = self.mpnn_2(x_e, edge_index, e_e)
        x_e, e_e = self.mpnn_3(x_e, edge_index, e_e)

        first_endpoint = x_e[edge_index[0]]
        second_endpoint = x_e[edge_index[1]]
        out = self.classifier(
            torch.cat([first_endpoint, e_e, metaFeatures, second_endpoint], dim=-1)
        )

        # The former per-call gc.collect() was removed: it was tab-indented
        # (a TabError in this space-indented body) and a full collection on
        # every forward pass only adds overhead without releasing tensors
        # that are still referenced elsewhere (e.g. by the training loop).
        if isEval:
            return out, x_e
        return out

rusty1s · 2024-05-27T07:40:41Z

rusty1s
May 27, 2024
Maintainer

I don't see any obvious memory leak inside your model. I suspect the issue is within your training loop, i.e., the PyTorch computation graph may not be correctly freed.

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

MPNN model has a main memory leak, why? #9357

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{editor}}'s edit

{{editor}}'s edit

Uh oh!

Replies: 1 comment

Uh oh!

{{title}}

Uh oh!

Select a reply

Uh oh!

MPNN model has a main memory leak, why? #9357

Uh oh!

Uh oh!

tails1234 May 24, 2024

Replies: 1 comment

Uh oh!

rusty1s May 27, 2024 Maintainer

tails1234
May 24, 2024

rusty1s
May 27, 2024
Maintainer