Gets stuck in deadlock while training #8355

@aliceinland

πŸ› Bug

During the training phase, at a certain epoch, training gets stuck. No errors are reported and, when the process is killed by force, the only information given is "wait_for_tstate_lock".

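To get more detail than "wait_for_tstate_lock" the next time it hangs, a stack dump of every thread can be requested on demand with Python's standard faulthandler module; a minimal sketch (my own addition, not part of the original script):

import faulthandler
import signal

# Dump the Python stack of all threads in this process when it receives SIGUSR1,
# e.g. `kill -USR1 <pid>` from a shell on the node while the training is stuck.
faulthandler.register(signal.SIGUSR1, all_threads=True)
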
To Reproduce

import numpy as np
import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.plugins import DDPPlugin

# calculate_map and FashionDatasetModule are project-specific helpers (not shown here)

class MaskRCNN(pl.LightningModule):
    
    def __init__(self, hidden_layers=256, n_classes=47, learning_rate = 1e-2):
        super().__init__()
        
        self.hidden_layers = hidden_layers
        self.n_classes = n_classes
        self.learning_rate = learning_rate
        
        self.model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True, progress = False)
        in_features = self.model.roi_heads.box_predictor.cls_score.in_features
        # replace the pre-trained head with a new one
        self.model.roi_heads.box_predictor = FastRCNNPredictor(in_features, self.n_classes)
        in_features_mask = self.model.roi_heads.mask_predictor.conv5_mask.in_channels
        self.model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, self.hidden_layers, self.n_classes)
    
    def forward(self, x, y):
        return self.model(x, y)
    
    
    '''def on_after_backward(self):
    # example to inspect gradient information in tensorboard
        if self.trainer.global_step % 25 == 0:  # don't make the tf file huge
            params = self.state_dict()
            for k, v in params.items():
                print(v)
                print(type(v))
                grads = v
                name = k
                self.logger.experiment.add_histogram(tag=name, values=grads, global_step=self.trainer.global_step)'''

        
    def training_step(self, batch, batch_idx):

        #images, targets = batch
        images = batch[0]
        targets = batch[1]
        loss_dict = self.model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        
        self.log('Training general loss:', losses)
        for key, value in loss_dict.items():
            #self.logger.experiment.add_scalar("{}".format(key), value, self.current_epoch)
            self.log(key, value)
            
        return losses
    
    def validation_step(self, batch, batch_idx):
        
        #batch = self.batch_to_device(batch, self.trainer.model.device)
        images, targets = batch
        # run the model once in train mode to get the loss dict, then in eval mode to get predictions
        self.model.train()
        loss_dict = self.model(images, targets)
        self.model.eval()
        prediction = self.model(images)
        prediction = np.array(prediction)
        
        val_losses = sum(loss for loss in loss_dict.values())
        self.log("General_Validation_Loss", val_losses)
        
        # use self.device to move things to the right device
        for key, value in loss_dict.items():
            self.log(key, value)
        for k in range(0, len(prediction)):
            #print("Boxes {}".format(prediction[k]['boxes']))
            if len(prediction[k]['boxes']) > 0 and len(prediction[k]['masks']) > 0:
                nms_res = torchvision.ops.nms(prediction[k]['boxes'], prediction[k]['scores'], 0.3)
                #keep only the elements in the nms_res
                prediction[k]['masks'] = torch.index_select(prediction[k]['masks'], 0, nms_res)
                prediction[k]['boxes'] = torch.index_select(prediction[k]['boxes'], 0, nms_res)
                prediction[k]['labels'] = torch.index_select(prediction[k]['labels'], 0, nms_res)
                prediction[k]['scores'] = torch.index_select(prediction[k]['scores'], 0, nms_res)
                #mAP computation
                #print((targets[k]['boxes']).shape)
                #mAP = calculate_map(targets[0]['boxes'], prediction[0]['boxes'], prediction[0]['scores'], thresh = 0.1)
                mAP = calculate_map(targets[k]['boxes'], prediction[k]['boxes'], prediction[k]['scores'], thresh = 0.1)
                #mAP50 = calculate_map(targets[0]['boxes'], prediction[0]['boxes'], prediction[0]['scores'])
                mAP50 = calculate_map(targets[k]['boxes'], prediction[k]['boxes'], prediction[k]['scores'])
                #mAP75 = calculate_map(targets[0]['boxes'], prediction[0]['boxes'], prediction[0]['scores'], thresh = 0.75)
                mAP75 = calculate_map(targets[k]['boxes'], prediction[k]['boxes'], prediction[k]['scores'], thresh = 0.75)

                self.log("mAP_boxes", mAP)
                self.log("mAP50_boxes", mAP50)
                self.log("mAP75_boxes", mAP75)
        return val_losses
     
     
    def configure_optimizers(self):
        optim = torch.optim.SGD(self.parameters(), lr=self.learning_rate, momentum=0.9, weight_decay=0.0001)
        #optim = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optim

checkpoint_callback = ModelCheckpoint(
    monitor='General_Validation_Loss',
    dirpath='./checkpoints_pruned',
    filename='imaterialist_maskrcnn224x224',
    save_top_k=3,
    mode='min'
)

tb_logger = pl.loggers.TensorBoardLogger('logs/pruned_imaterialist', name='general-224x224', default_hp_metric=False)
lr_monitor = LearningRateMonitor(logging_interval='epoch')
#n_classes = 46+1
n_classes = 19
model = MaskRCNN(hidden_layers=256, n_classes=n_classes, learning_rate=0.005)
data_module = FashionDatasetModule("./dataset/train_splitted", "./dataset/val_splitted",
                                   "./dataset/train_pruned.csv", "./dataset/val_pruned.csv", 224, 224)
trainer = pl.Trainer(max_epochs=250,
                     gpus=8,
                     accelerator='ddp',
                     plugins=DDPPlugin(find_unused_parameters=False),
                     default_root_dir='./checkpoints_pruned',
                     logger=tb_logger,
                     callbacks=[checkpoint_callback, lr_monitor])
trainer.fit(model, data_module)
  
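One thing in validation_step that might be worth ruling out under DDP (an assumption on my side, not a confirmed cause of the hang): the mAP metrics are only logged for batches that contain detections, so different ranks can end up logging different sets of keys. A minimal sketch of the loop body that logs them unconditionally with a 0.0 fallback (the NMS filtering from the original loop is omitted for brevity):

            # Sketch only: make every rank log the same metric keys on every validation batch
            mAP = mAP50 = mAP75 = torch.tensor(0.0, device=self.device)
            if len(prediction[k]['boxes']) > 0 and len(prediction[k]['masks']) > 0:
                mAP = calculate_map(targets[k]['boxes'], prediction[k]['boxes'], prediction[k]['scores'], thresh=0.1)
                mAP50 = calculate_map(targets[k]['boxes'], prediction[k]['boxes'], prediction[k]['scores'])
                mAP75 = calculate_map(targets[k]['boxes'], prediction[k]['boxes'], prediction[k]['scores'], thresh=0.75)
            self.log("mAP_boxes", mAP)
            self.log("mAP50_boxes", mAP50)
            self.log("mAP75_boxes", mAP75)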

Expected behavior

Training should run through all epochs (up to max_epochs) without hanging.

Environment

  • CUDA:
    • available: True
    • version: 11.2
  • Packages:
    • numpy: 1.19.2
    • pyTorch_debug: False
    • pyTorch_version: 1.8.0a0+52ea372
    • pytorch-lightning: 1.3.8
    • tqdm: 4.53.0
  • System:
    • OS: Linux
    • python: 3.8.5
  • How you installed PyTorch (conda, pip, source): pip

Additional context

To try to solve this problem, I removed the LR scheduler, gradient clipping, and early stopping, and restarted the Docker container, but with no results.

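A possible next step to narrow this down is a stripped-down run that takes DDP and most of the data out of the picture; the exact knob values below are illustrative:

import os

os.environ["NCCL_DEBUG"] = "INFO"  # more verbose NCCL output for the 8-GPU DDP runs

# Single-GPU trainer with short epochs: if this never hangs, the distributed
# setup (DDP / NCCL) becomes the main suspect rather than the model or data.
debug_trainer = pl.Trainer(
    max_epochs=5,
    gpus=1,
    limit_train_batches=50,
    limit_val_batches=10,
    logger=tb_logger,
)
debug_trainer.fit(model, data_module)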

Labels

bug, help wanted, priority: 1, waiting on author
