🐛 Bug
During the training phase, at a certain epoch, training gets stuck. No error is reported and, when force-killing the process, the only information shown is `wait_for_tstate_lock`.
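`wait_for_tstate_lock` is only where `threading.Thread.join()` blocks, so it says nothing about what is actually stuck. A minimal diagnostic sketch (not part of the original script; the 600 s interval is an arbitrary choice) that dumps every thread's traceback so the hanging frame becomes visible:

```python
# Diagnostic sketch (assumption: pasted at the top of the training script).
import faulthandler
import signal
import sys

faulthandler.enable()  # also dump tracebacks on fatal signals
# Print every thread's traceback to stderr every 600 s (arbitrary interval;
# pick something longer than a normal epoch).
faulthandler.dump_traceback_later(600, repeat=True, file=sys.stderr)
# On Unix, `kill -USR1 <pid>` triggers an on-demand dump as well.
faulthandler.register(signal.SIGUSR1, all_threads=True)
```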
To Reproduce
```python
import numpy as np
import torch
import torchvision
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.plugins import DDPPlugin
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

# FashionDatasetModule and calculate_map are project-specific helpers, not shown here.


class MaskRCNN(pl.LightningModule):
    def __init__(self, hidden_layers=256, n_classes=47, learning_rate=1e-2):
        super().__init__()
        self.hidden_layers = hidden_layers
        self.n_classes = n_classes
        self.learning_rate = learning_rate
        self.model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True, progress=False)
        in_features = self.model.roi_heads.box_predictor.cls_score.in_features
        # replace the pre-trained head with a new one
        self.model.roi_heads.box_predictor = FastRCNNPredictor(in_features, self.n_classes)
        in_features_mask = self.model.roi_heads.mask_predictor.conv5_mask.in_channels
        self.model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, self.hidden_layers, self.n_classes)

    def forward(self, x, y):
        return self.model(x, y)

    '''def on_after_backward(self):
        # example to inspect gradient information in tensorboard
        if self.trainer.global_step % 25 == 0:  # don't make the tf file huge
            params = self.state_dict()
            for k, v in params.items():
                grads = v
                name = k
                self.logger.experiment.add_histogram(tag=name, values=grads, global_step=self.trainer.global_step)'''

    def training_step(self, batch, batch_idx):
        images = batch[0]
        targets = batch[1]
        loss_dict = self.model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        self.log('Training general loss', losses)
        for key, value in loss_dict.items():
            self.log(key, value)
        return losses

    def validation_step(self, batch, batch_idx):
        images, targets = batch
        # torchvision's Mask R-CNN only returns losses in train mode, so switch
        # modes to get both the loss dict and the predictions
        self.model.train()
        loss_dict = self.model(images, targets)
        self.model.eval()
        prediction = self.model(images)
        prediction = np.array(prediction)
        val_losses = sum(loss for loss in loss_dict.values())
        self.log("General_Validation_Loss", val_losses)
        # self.device to move things around
        for key, value in loss_dict.items():
            self.log(key, value)
        for k in range(len(prediction)):
            if len(prediction[k]['boxes']) > 0 and len(prediction[k]['masks']) > 0:
                nms_res = torchvision.ops.nms(prediction[k]['boxes'], prediction[k]['scores'], 0.3)
                # keep only the elements surviving NMS
                prediction[k]['masks'] = torch.index_select(prediction[k]['masks'], 0, nms_res)
                prediction[k]['boxes'] = torch.index_select(prediction[k]['boxes'], 0, nms_res)
                prediction[k]['labels'] = torch.index_select(prediction[k]['labels'], 0, nms_res)
                prediction[k]['scores'] = torch.index_select(prediction[k]['scores'], 0, nms_res)
                # mAP computation
                mAP = calculate_map(targets[k]['boxes'], prediction[k]['boxes'], prediction[k]['scores'], thresh=0.1)
                mAP50 = calculate_map(targets[k]['boxes'], prediction[k]['boxes'], prediction[k]['scores'])
                mAP75 = calculate_map(targets[k]['boxes'], prediction[k]['boxes'], prediction[k]['scores'], thresh=0.75)
                self.log("mAP_boxes", mAP)
                self.log("mAP50_boxes", mAP50)
                self.log("mAP75_boxes", mAP75)
        return val_losses

    def configure_optimizers(self):
        optim = torch.optim.SGD(self.parameters(), lr=self.learning_rate, momentum=0.9, weight_decay=0.0001)
        return optim


checkpoint_callback = ModelCheckpoint(
    monitor='General_Validation_Loss',
    dirpath='./checkpoints_pruned',
    filename='imaterialist_maskrcnn224x224',
    save_top_k=3,
    mode='min'
)
tb_logger = pl.loggers.TensorBoardLogger('logs/pruned_imaterialist', name='general-224x224', default_hp_metric=False)
lr_monitor = LearningRateMonitor(logging_interval='epoch')
# n_classes = 46+1
n_classes = 19
model = MaskRCNN(hidden_layers=256, n_classes=n_classes, learning_rate=0.005)
data_module = FashionDatasetModule("./dataset/train_splitted", "./dataset/val_splitted", "./dataset/train_pruned.csv", "./dataset/val_pruned.csv", 224, 224)
trainer = pl.Trainer(max_epochs=250, gpus=8, accelerator='ddp', plugins=DDPPlugin(find_unused_parameters=False), default_root_dir='./checkpoints_pruned', logger=tb_logger, callbacks=[checkpoint_callback, lr_monitor])
trainer.fit(model, data_module)
```
Expected behavior
Training runs through all 250 epochs without hanging.
Environment
- CUDA:
- available: True
- version: 11.2
- Packages:
- numpy: 1.19.2
- pyTorch_debug: False
- pyTorch_version: 1.8.0a0+52ea372
- pytorch-lightning: 1.3.8
- tqdm: 4.53.0
- System:
- OS: Linux
- python: 3.8.5
- How you installed PyTorch (conda, pip, source): pip
Additional context
To try to solve this problem I removed the LR scheduler, gradient clipping, and early stopping, and restarted the Docker container, but with no results.
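Since the run uses 8-GPU DDP, a mismatch or stall in NCCL collectives between ranks is one plausible cause of a silent hang. A speculative debugging aid (an assumption, not a confirmed fix) is to turn on NCCL logging before the Trainer is created, so a stalled collective shows up in the logs:

```python
# Speculative DDP debugging aid (assumption: set before pl.Trainer(...) is
# constructed; the env vars are inherited by the spawned DDP workers).
import os

os.environ["NCCL_DEBUG"] = "INFO"         # verbose NCCL logging
os.environ["NCCL_DEBUG_SUBSYS"] = "COLL"  # focus on collective operations
```

Attaching `py-spy dump --pid <PID>` to the hung worker process is an external alternative that prints the Python stack without modifying the script.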