Why does the distributed training get stuck here and doesn't move. #15266

Struggle-Forever · 2022-10-24T06:10:32Z

Struggle-Forever
Oct 24, 2022

I have requested two GPUs on slurm cluster for distributed training, but the program does not move?

When I use only one GPU, the model is trained normally.

        tb_logger = pl_loggers.TensorBoardLogger(save_dir=save_log_path)
        checkpoint_callback = ModelCheckpoint(monitor='val_f1',
                                              dirpath="{}".format(save_log_path),
                                              filename='best_{}'.format(index_times),
                                              save_top_k=1,
                                              mode='max',
                                              save_weights_only=True,
                                              save_last=False)

        early_stop_callback = EarlyStopping(monitor="val_f1", min_delta=0.00, patience=args.patience,
                                            verbose=False, mode="max")

        trainer = Trainer(devices="auto", accelerator="auto",
                          logger=tb_logger, log_every_n_steps=50,
                          flush_logs_every_n_steps=50, callbacks=[checkpoint_callback, early_stop_callback],
                          max_epochs=args.epochs)
        trainer.fit(model)
        result = trainer.test(model, ckpt_path='best')

class BERT_baseline(pl.LightningModule):
    def __init__(self, args, lr):
        super().__init__()
        self.save_hyperparameters()
        self.bert = AutoModel.from_pretrained(pretrained_model_name_or_path=args.path)
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=args.path)
        self.CE_loss = nn.CrossEntropyLoss()
        self.lr = lr
        self.args = args
        self.metric = load_metric("seqeval")
        self.label_name = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
        self.num_labels = len(self.label_name)
        self.fc = nn.Sequential(nn.Linear(self.bert.config.hidden_size, self.num_labels))

    def forward(self, inputs=None, labels=None, label_mask=None):
        bert_out = self.bert(**inputs)[0]  # batch * seq-length * hidden
        logits = self.fc(bert_out)  # batch_size *  seq_length * num_lables
        if labels is not None:  # 训练
            # Only keep active parts of the loss
            active_loss = label_mask.view(-1) == 1
            active_logits = logits.view(-1, self.num_labels)[active_loss]  # 预测的结果
            active_labels = labels.view(-1)[active_loss]  # 真实的结果
            loss = self.CE_loss(active_logits, active_labels)
            return loss
        else:  # 测试集
            return bert_out, logits

    def training_step(self, batch, batch_idx):
        bert_inputs = batch[0]
        real_labels = batch[1]
        label_mask = batch[2]
        loss = self(inputs=bert_inputs, labels=real_labels, label_mask=label_mask)
        loss_metirc = {"loss": loss}
        self.log_dict(loss_metirc, on_step=True, prog_bar=True, logger=True, on_epoch=True)
        return loss_metirc

    def training_step_end(self, training_step_outputs):  # 一个batch结束
        pass

    def training_epoch_end(self, training_step_outputs):  # 一个epoch
        pass

    def share_val_step(self, batch):
        bert_inputs = batch[0]
        real_labels = batch[1]  # batch * seq
        hidden, logits = self(inputs=bert_inputs)
        y_pred_label = logits.argmax(dim=-1)
        result_dict = self.compute_metrics(labels=real_labels, predictions=y_pred_label)
        return result_dict['f1'], result_dict['precision'], result_dict['recall']

    def validation_step(self, batch, batch_idx):
        f1, precision,  recall = self.share_val_step(batch)
        metrics = {'val_f1': f1, 'val_precision': precision, 'val_recall': recall}
        self.log_dict(metrics, prog_bar=True, logger=True, on_epoch=True)

    def validation_step_end(self, val_step_outputs):
        pass

    def validation_epoch_end(self, validation_step_outputs):
        pass

    def test_step(self, batch, batch_idx, dataloader_idx):
        f1, precision,  recall = self.share_val_step(batch)
        metrics = {'f1': f1, 'precision': precision, 'recall': recall}
        self.log_dict(metrics, prog_bar=True, logger=True, on_epoch=True)

    def test_step_end(self, output_results):
        pass

    def test_epoch_end(self, test_step_outputs):
        pass

    def configure_optimizers(self):
        if self.args.freeze != -1:
            no_grad = ["embeddings"] + ["layer." + str(layer_i) + "." for layer_i in range(12)
                                        if layer_i < self.args.freeze]
            for n, p in self.bert.named_parameters():
                p.requires_grad = False if any(nd in n for nd in no_grad) else True

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [  # 将权重衰减应用于除了'偏移'这些之外的所有参数
            # n是模型每层名字，p是每层参数，model是模型名字
            {'params': [p for n, p in self.bert.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': self.args.weight_decay},
            {'params': [p for n, p in self.bert.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.lr, weight_decay=self.args.weight_decay)
        # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9, last_epoch=-1)
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=self.args.warmup_steps,
                                                    num_training_steps=self.args.total_steps)
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]

    def train_dataloader(self):
        train_data = Ner_Dataset(language=self.args.source, desc='train')
        self.print('======================== 训练集有：{}====================='.format(len(train_data)))
        return DataLoader(train_data, batch_size=self.args.batch_size,
                          shuffle=True, collate_fn=self.collate)

    def val_dataloader(self):
        val_data = Ner_Dataset(language=self.args.source, desc='valid')
        self.print('======================== 验证集有：{}====================='.format(len(val_data)))
        return DataLoader(val_data, batch_size=self.args.batch_size,
                          shuffle=False, collate_fn=self.collate)

    def test_dataloader(self):
        data_list = [Ner_Dataset(language=one, desc='test') for one in self.args.target_list]
        return [DataLoader(one, batch_size=self.args.batch_size, shuffle=False, collate_fn=self.collate)
                for one in data_list]

Answered by Struggle-Forever

Oct 24, 2022

When I switch the communication method from ‘nccl’ to 'gloo', it works. I don't know what the problem is, but I hope I can help you.

        ddp = DDPStrategy(process_group_backend='gloo')
        trainer = Trainer(devices="auto", accelerator="auto", strategy=ddp,
                          logger=tb_logger, log_every_n_steps=50,
                          flush_logs_every_n_steps=50, callbacks=[checkpoint_callback, early_stop_callback],
                          max_epochs=args.epochs)

View full answer

Struggle-Forever · 2022-10-24T06:43:49Z

Struggle-Forever
Oct 24, 2022
Author

I guess it deadlocked when creating the missing folder?

0 replies

Struggle-Forever · 2022-10-24T13:11:51Z

Struggle-Forever
Oct 24, 2022
Author

When I switch the communication method from ‘nccl’ to 'gloo', it works. I don't know what the problem is, but I hope I can help you.

        ddp = DDPStrategy(process_group_backend='gloo')
        trainer = Trainer(devices="auto", accelerator="auto", strategy=ddp,
                          logger=tb_logger, log_every_n_steps=50,
                          flush_logs_every_n_steps=50, callbacks=[checkpoint_callback, early_stop_callback],
                          max_epochs=args.epochs)

0 replies

gpucce · 2022-11-01T07:41:21Z

gpucce
Nov 1, 2022

I am having the same issue, did you find what what was the problem?

1 reply

Struggle-Forever Nov 1, 2022
Author

In the answer above, I gave my answer. This is a problem caused by communication. You can reapply the GPU or modify the communication method.

htlee6 · 2022-11-11T07:49:25Z

htlee6
Nov 11, 2022

Similar issues, but a little bit different. My training could start successfully no matter using nccl or gloo as the backend, however, it always got stuck halfway. Like always at the start of an epoch.

Start distribution successfully

Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
----------------------------------------------------------------------------------------------------
distributed_backend=gloo
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------

Training...
[2022-11-11 15:42:53,934][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0
Training...
[2022-11-11 15:42:53,934][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 1
[2022-11-11 15:42:53,934][torch.distributed.distributed_c10d][INFO] - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes.
[2022-11-11 15:42:53,934][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes.

Got stuck


Validation DataLoader 0:  88%|████████▊ | 14/16 [00:00<00:00, 35.46it/s]�[A
Epoch 12:  97%|█████████▋| 61/63 [00:02<00:00, 27.79it/s, loss=0.581, v_num=17]

Validation DataLoader 0:  94%|█████████▍| 15/16 [00:00<00:00, 35.44it/s]�[A
Epoch 12:  98%|█████████▊| 62/63 [00:02<00:00, 27.89it/s, loss=0.581, v_num=17]

Validation DataLoader 0: 100%|██████████| 16/16 [00:00<00:00, 36.09it/s]�[A
Epoch 12: 100%|██████████| 63/63 [00:02<00:00, 28.08it/s, loss=0.581, v_num=17]
Epoch 12: 100%|██████████| 63/63 [00:02<00:00, 28.07it/s, loss=0.581, v_num=17]

                                                                        �[A
Epoch 12: 100%|██████████| 63/63 [00:02<00:00, 28.05it/s, loss=0.581, v_num=17]
Epoch 12:   0%|          | 0/63 [00:00<?, ?it/s, loss=0.581, v_num=17]
Epoch 13:   0%|          | 0/63 [00:00<?, ?it/s, loss=0.581, v_num=17]

4 replies

Struggle-Forever Nov 11, 2022
Author

Sorry, I don't meet this problem.

htlee6 Nov 11, 2022

I accidentally solved this issue, I forced sync_dist=True in log() method. But be aware that according to the document, this may lead a significant communication overhead.

Saying your LightningModule like this:

class YourLightningModule(pl.LightningModule):
...
    def training_step(self, data):
        your_loss = func_compute_loss(data)
        self.log("loss", your_loss, sync_dist=True)
...

@Struggle-Forever thank you all the same!

gpucce Nov 11, 2022

Another solution that worked for me is setting the environment variable NCCL_P2P_LEVEL to a value different than "SYS" or "PHB" as documented here although that made it run slower than gloo for me. Though this may apply to the original issue and not to the missing sync_dist=True case.

nighting0le01 Jul 27, 2023

Hey Mine gets stuck at the start Start distribution successfully

and doesn't move past it . how to get it to run

Why does the distributed training get stuck here and doesn't move. #15266

Uh oh!

Struggle-Forever Oct 24, 2022

Replies: 4 comments · 5 replies

Uh oh!

Struggle-Forever Oct 24, 2022 Author

Uh oh!

Struggle-Forever Oct 24, 2022 Author

Uh oh!

gpucce Nov 1, 2022

Uh oh!

Struggle-Forever Nov 1, 2022 Author

Uh oh!

htlee6 Nov 11, 2022

Start distribution successfully

Got stuck

Uh oh!

Struggle-Forever Nov 11, 2022 Author

Uh oh!

Uh oh!

htlee6 Nov 11, 2022

Uh oh!

Uh oh!

gpucce Nov 11, 2022

Uh oh!

nighting0le01 Jul 27, 2023

Struggle-Forever
Oct 24, 2022

Replies: 4 comments 5 replies

Struggle-Forever
Oct 24, 2022
Author

Struggle-Forever
Oct 24, 2022
Author

gpucce
Nov 1, 2022

Struggle-Forever Nov 1, 2022
Author

htlee6
Nov 11, 2022

Struggle-Forever Nov 11, 2022
Author