Gets stuck in deadlock while training #8355

@aliceinland

πŸ› Bug

During the training phase, at a certain epoch, training gets stuck. No errors are reported and, when the process is killed by force, the only information given is "wait_for_tstate_lock".

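To get more detail than "wait_for_tstate_lock" the next time it hangs, a stack dump of every thread can be requested on demand with Python's standard faulthandler module; a minimal sketch (my own addition, not part of the original script):

import faulthandler
import signal

# Dump the Python stack of all threads in this process when it receives SIGUSR1,
# e.g. `kill -USR1 <pid>` from a shell on the node while the training is stuck.
faulthandler.register(signal.SIGUSR1, all_threads=True)
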
To Reproduce

import numpy as np
import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.plugins import DDPPlugin

# calculate_map and FashionDatasetModule are project-specific helpers (not shown here)

class MaskRCNN(pl.LightningModule):
    
    def __init__(self, hidden_layers=256, n_classes=47, learning_rate = 1e-2):
        super().__init__()
        
        self.hidden_layers = hidden_layers
        self.n_classes = n_classes
        self.learning_rate = learning_rate
        
        self.model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True, progress = False)
        in_features = self.model.roi_heads.box_predictor.cls_score.in_features
        # replace the pre-trained head with a new one
        self.model.roi_heads.box_predictor = FastRCNNPredictor(in_features, self.n_classes)
        in_features_mask = self.model.roi_heads.mask_predictor.conv5_mask.in_channels
        self.model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, self.hidden_layers, self.n_classes)
    
    def forward(self, x, y):
        return self.model(x, y)
    
    
    '''def on_after_backward(self):
    # example to inspect gradient information in tensorboard
        if self.trainer.global_step % 25 == 0:  # don't make the tf file huge
            params = self.state_dict()
            for k, v in params.items():
                print(v)
                print(type(v))
                grads = v
                name = k
                self.logger.experiment.add_histogram(tag=name, values=grads, global_step=self.trainer.global_step)'''

        
    def training_step(self, batch, batch_idx):

        #images, targets = batch
        images = batch[0]
        targets = batch[1]
        loss_dict = self.model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        
        self.log('Training general loss:', losses)
        for key, value in loss_dict.items():
            #self.logger.experiment.add_scalar("{}".format(key), value, self.current_epoch)
            self.log(key, value)
            
        return losses
    
    def validation_step(self, batch, batch_idx):
        
        #batch = self.batch_to_device(batch, self.trainer.model.device)
        images, targets = batch
        # run the model once in train mode to get the loss dict, then in eval mode to get predictions
        self.model.train()
        loss_dict = self.model(images, targets)
        self.model.eval()
        prediction = self.model(images)
        prediction = np.array(prediction)
        
        val_losses = sum(loss for loss in loss_dict.values())
        self.log("General_Validation_Loss", val_losses)
        
        # use self.device to move things to the right device
        for key, value in loss_dict.items():
            self.log(key, value)
        for k in range(0, len(prediction)):
            #print("Boxes {}".format(prediction[k]['boxes']))
            if len(prediction[k]['boxes']) > 0 and len(prediction[k]['masks']) > 0:
                nms_res = torchvision.ops.nms(prediction[k]['boxes'], prediction[k]['scores'], 0.3)
                #keep only the elements in the nms_res
                prediction[k]['masks'] = torch.index_select(prediction[k]['masks'], 0, nms_res)
                prediction[k]['boxes'] = torch.index_select(prediction[k]['boxes'], 0, nms_res)
                prediction[k]['labels'] = torch.index_select(prediction[k]['labels'], 0, nms_res)
                prediction[k]['scores'] = torch.index_select(prediction[k]['scores'], 0, nms_res)
                #mAP computation
                #print((targets[k]['boxes']).shape)
                #mAP = calculate_map(targets[0]['boxes'], prediction[0]['boxes'], prediction[0]['scores'], thresh = 0.1)
                mAP = calculate_map(targets[k]['boxes'], prediction[k]['boxes'], prediction[k]['scores'], thresh = 0.1)
                #mAP50 = calculate_map(targets[0]['boxes'], prediction[0]['boxes'], prediction[0]['scores'])
                mAP50 = calculate_map(targets[k]['boxes'], prediction[k]['boxes'], prediction[k]['scores'])
                #mAP75 = calculate_map(targets[0]['boxes'], prediction[0]['boxes'], prediction[0]['scores'], thresh = 0.75)
                mAP75 = calculate_map(targets[k]['boxes'], prediction[k]['boxes'], prediction[k]['scores'], thresh = 0.75)

                self.log("mAP_boxes", mAP)
                self.log("mAP50_boxes", mAP50)
                self.log("mAP75_boxes", mAP75)
        return val_losses
     
     
    def configure_optimizers(self):
        optim = torch.optim.SGD(self.parameters(), lr=self.learning_rate, momentum=0.9, weight_decay=0.0001)
        #optim = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optim

checkpoint_callback = ModelCheckpoint(
    monitor='General_Validation_Loss',
    dirpath='./checkpoints_pruned',
    filename='imaterialist_maskrcnn224x224',
    save_top_k=3,
    mode='min'
)

tb_logger = pl.loggers.TensorBoardLogger('logs/pruned_imaterialist', name='general-224x224', default_hp_metric=False)
lr_monitor = LearningRateMonitor(logging_interval='epoch')
#n_classes = 46+1
n_classes = 19
model = MaskRCNN(hidden_layers=256, n_classes=n_classes, learning_rate=0.005)
data_module = FashionDatasetModule("./dataset/train_splitted", "./dataset/val_splitted",
                                   "./dataset/train_pruned.csv", "./dataset/val_pruned.csv", 224, 224)
trainer = pl.Trainer(max_epochs=250,
                     gpus=8,
                     accelerator='ddp',
                     plugins=DDPPlugin(find_unused_parameters=False),
                     default_root_dir='./checkpoints_pruned',
                     logger=tb_logger,
                     callbacks=[checkpoint_callback, lr_monitor])
trainer.fit(model, data_module)
  
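One thing in validation_step that might be worth ruling out under DDP (an assumption on my side, not a confirmed cause of the hang): the mAP metrics are only logged for batches that contain detections, so different ranks can end up logging different sets of keys. A minimal sketch of the loop body that logs them unconditionally with a 0.0 fallback (the NMS filtering from the original loop is omitted for brevity):

            # Sketch only: make every rank log the same metric keys on every validation batch
            mAP = mAP50 = mAP75 = torch.tensor(0.0, device=self.device)
            if len(prediction[k]['boxes']) > 0 and len(prediction[k]['masks']) > 0:
                mAP = calculate_map(targets[k]['boxes'], prediction[k]['boxes'], prediction[k]['scores'], thresh=0.1)
                mAP50 = calculate_map(targets[k]['boxes'], prediction[k]['boxes'], prediction[k]['scores'])
                mAP75 = calculate_map(targets[k]['boxes'], prediction[k]['boxes'], prediction[k]['scores'], thresh=0.75)
            self.log("mAP_boxes", mAP)
            self.log("mAP50_boxes", mAP50)
            self.log("mAP75_boxes", mAP75)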

Expected behavior

Training should run through all epochs (up to max_epochs) without hanging.

Environment

  • CUDA:
    • available: True
    • version: 11.2
  • Packages:
    • numpy: 1.19.2
    • pyTorch_debug: False
    • pyTorch_version: 1.8.0a0+52ea372
    • pytorch-lightning: 1.3.8
    • tqdm: 4.53.0
  • System:
    • OS: Linux
    • python: 3.8.5
  • How you installed PyTorch (conda, pip, source): pip

Additional context

To try to solve this problem, I removed the LR scheduler, gradient clipping, and early stopping, and restarted the Docker container, but with no results.

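A possible next step to narrow this down is a stripped-down run that takes DDP and most of the data out of the picture; the exact knob values below are illustrative:

import os

os.environ["NCCL_DEBUG"] = "INFO"  # more verbose NCCL output for the 8-GPU DDP runs

# Single-GPU trainer with short epochs: if this never hangs, the distributed
# setup (DDP / NCCL) becomes the main suspect rather than the model or data.
debug_trainer = pl.Trainer(
    max_epochs=5,
    gpus=1,
    limit_train_batches=50,
    limit_val_batches=10,
    logger=tb_logger,
)
debug_trainer.fit(model, data_module)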

Labels

bug, help wanted, priority: 1, waiting on author
