`torch.distributed.elastic.multiprocessing.errors.ChildFailedError` #6312

ahxmeds · 2023-04-06T08:58:28Z

ahxmeds
Apr 6, 2023

I am trying to train a UNet model using DistributedDataParallel on a Microsoft Azure Standard NC24s v3 Virtual Machine with 24 vCPUs and 4 GPUs (16 GiB per GPU) on the dataset from this challenge. My code given below is roughly based on the tutorial here.

from monai.transforms import (
    AsDiscrete,
    Compose,
)
import monai
from monai.inferers import sliding_window_inference
from monai.data import CacheDataset, DataLoader, decollate_batch
import torch
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
import torch.nn as nn
import time
import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist # import init_process_group, destroy_process_group
import os
from ddp_initialize_model_data import (
    get_train_valid_data_in_dict_format, 
    get_train_transforms, 
    get_valid_transforms, 
    get_model, 
    get_loss_function,
    get_optimizer, 
    get_scheduler,
    get_metric
)
# Enable cuDNN's benchmark mode for the whole process.
# NOTE(review): per the PyTorch docs this lets cuDNN time several convolution
# algorithms and keep the fastest; it presumably pays off here because the
# training crops have a fixed spatial size -- confirm for validation inputs.
torch.backends.cudnn.benchmark = True
#%%
def ddp_setup():
    """Initialise the default process group for this process.

    Uses the NCCL backend and the ``env://`` init method, so the rendezvous
    settings are read from environment variables supplied by the launcher.
    """
    dist.init_process_group(init_method="env://", backend="nccl")

def convert_to_4digits(str_num):
    """Left-pad a numeric string with zeros to a width of at least four.

    Args:
        str_num: The number as a string, e.g. ``"7"``.

    Returns:
        The zero-padded string (``"7"`` -> ``"0007"``). Strings that are
        already four or more characters long are returned unchanged.
    """
    # str.zfill replaces the hand-written length ladder. Unlike the ladder it
    # also pads an empty string to "0000" and keeps a leading sign in front of
    # the zeros ("-1" -> "-001").
    return str_num.zfill(4)

#%%
def load_train_objects():
    """Create every object the training loop consumes.

    All factories come from ``ddp_initialize_model_data``. The data split is
    requested for ``fold=0`` and the training transforms for a spatial size of
    ``(128, 128, 128)``.

    Returns:
        A 9-tuple ``(train_data, valid_data, train_transforms,
        valid_transforms, model, loss_function, optimizer, scheduler,
        metric)``.
    """
    train_split, valid_split = get_train_valid_data_in_dict_format(fold=0)
    train_tf = get_train_transforms(spatialsize=(128, 128, 128))
    valid_tf = get_valid_transforms()

    net = get_model()
    # get_optimizer takes the model and get_scheduler takes the optimizer,
    # so these three must be created in this order.
    opt = get_optimizer(net)
    criterion = get_loss_function()
    lr_schedule = get_scheduler(opt)

    return (
        train_split,
        valid_split,
        train_tf,
        valid_tf,
        net,
        criterion,
        opt,
        lr_schedule,
        get_metric(),
    )


def prepare_dataset(data, transforms, *, cache_rate=1, num_workers=24):
    """Wrap ``data`` in a MONAI ``CacheDataset``.

    Args:
        data: Forwarded to ``CacheDataset`` as ``data``.
        transforms: Forwarded to ``CacheDataset`` as ``transform``.
        cache_rate: Forwarded to ``CacheDataset`` as ``cache_rate``. The
            default of 1 matches the previously hard-coded value.
        num_workers: Forwarded to ``CacheDataset`` as ``num_workers``. The
            default of 24 matches the previously hard-coded value.

    Returns:
        The constructed ``CacheDataset``.
    """
    # NOTE(review): every torchrun process calls this for itself, so with
    # cache_rate=1 each process builds its own cache of the whole dataset.
    # Pass a lower cache_rate (or 0) if host memory is limited.
    return CacheDataset(
        data=data,
        transform=transforms,
        cache_rate=cache_rate,
        num_workers=num_workers,
    )


def main_worker(save_models_dir="."):
    """Train and periodically validate the model in one DDP process.

    Intended to run once per process under a launcher that provides the
    ``env://`` rendezvous variables. The function joins the process group,
    builds datasets and loaders, wraps the model in
    ``DistributedDataParallel``, trains for 10 epochs, validates every 2nd
    epoch with sliding-window inference, and has global rank 0 save the model
    weights after each validation round.

    Args:
        save_models_dir: Directory that receives the ``model_ep=XXXX.pth``
            checkpoints. Defaults to the current directory, which is where
            checkpoints were written before this parameter was honoured.
    """
    ddp_setup()
    try:
        global_rank = dist.get_rank()
        # The GPU index must be the rank *within this node*. The global rank
        # only coincides with it on a single-node run, so prefer LOCAL_RANK
        # (exported by torchrun) and fall back to the global rank otherwise.
        local_rank = int(os.environ.get("LOCAL_RANK", global_rank))

        # get all training and validation objects
        (
            train_data,
            valid_data,
            train_transforms,
            valid_transforms,
            model,
            loss_function,
            optimizer,
            scheduler,
            metric,
        ) = load_train_objects()

        # get dataset of object-type CacheDataset
        # NOTE(review): each process builds its own cache here; with the
        # default cache_rate this multiplies host-memory use by the number of
        # processes and is the suspected cause of the SIGKILL (-9) exits.
        train_dataset = prepare_dataset(train_data, train_transforms)
        valid_dataset = prepare_dataset(valid_data, valid_transforms)

        # DistributedSampler gives each process its own share of the indices
        train_sampler = DistributedSampler(dataset=train_dataset, shuffle=True)
        valid_sampler = DistributedSampler(dataset=valid_dataset, shuffle=False)

        # shuffle stays False on the loaders because the samplers own the order
        train_dataloader = DataLoader(
            train_dataset,
            batch_size=8,
            pin_memory=True,
            shuffle=False,
            sampler=train_sampler,
            num_workers=16,
        )
        valid_dataloader = DataLoader(
            valid_dataset,
            batch_size=1,
            pin_memory=True,
            shuffle=False,
            sampler=valid_sampler,
            num_workers=16,
        )

        # initialize the GPU device
        device = torch.device(f"cuda:{local_rank}")
        torch.cuda.set_device(device)

        # wrap the model around DDP wrapper
        model = model.to(device)
        model = DDP(model, device_ids=[local_rank])

        max_epochs = 10
        val_interval = 2
        post_pred = Compose([AsDiscrete(argmax=True, to_onehot=3)])
        post_label = Compose([AsDiscrete(to_onehot=3)])

        # Only the saving rank needs the output directory to exist.
        if global_rank == 0 and save_models_dir:
            os.makedirs(save_models_dir, exist_ok=True)

        epoch_loss_values = []
        metric_values = []
        for epoch in range(max_epochs):
            model.train()
            epoch_loss = 0
            step = 0
            # re-seed the sampler so every epoch uses a different shuffle
            train_sampler.set_epoch(epoch)
            for batch_data in train_dataloader:
                step += 1
                inputs, labels = (
                    batch_data['CTPT'].to(device),
                    batch_data['GT'].to(device),
                )
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = loss_function(outputs, labels)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
                # Report the running mean: the label says "average loss", but
                # the raw accumulated sum used to be printed here.
                # (len(train_dataloader) is the number of batches per epoch.)
                print(f"[GPU:{local_rank}]: epoch {epoch + 1}/{max_epochs}: average loss: {epoch_loss / step:.4f}: num_samples:{len(train_dataloader)}")
            # max(...) guards against a loader that yielded no batches
            epoch_loss /= max(step, 1)
            epoch_loss_values.append(epoch_loss)

            # steps forward the learning-rate scheduler once per epoch
            scheduler.step()

            if (epoch + 1) % val_interval == 0:
                model.eval()
                with torch.no_grad():
                    for val_data in valid_dataloader:
                        val_inputs, val_labels = (
                            val_data['CTPT'].to(device),
                            val_data['GT'].to(device),
                        )
                        roi_size = (128, 128, 128)
                        sw_batch_size = 4
                        val_outputs = sliding_window_inference(
                            val_inputs, roi_size, sw_batch_size, model)
                        val_outputs = [post_pred(i) for i in decollate_batch(val_outputs)]
                        val_labels = [post_label(i) for i in decollate_batch(val_labels)]
                        # compute metric for the current iteration
                        metric(y_pred=val_outputs, y=val_labels)

                    # aggregate the metric accumulated over this validation round
                    metric_value = metric.aggregate().item()
                    metric_values.append(metric_value)
                    # reset the status for the next validation round
                    metric.reset()
                    print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}")

                    # a single process writes the checkpoint
                    if global_rank == 0:
                        print("SAVING MODELLLLLLLLL!!!!!!!!!!!!!!!!!")
                        savepath = os.path.join(
                            save_models_dir,
                            "model_ep=" + convert_to_4digits(str(int(epoch + 1))) + ".pth",
                        )
                        # model.module is the network underneath the DDP wrapper
                        torch.save(model.module.state_dict(), savepath)

            print(f"[GPU:{local_rank}]: Epoch: {epoch+1} completed, train losses so far: {epoch_loss_values}")
            print(f"[GPU:{local_rank}]: Epoch: {epoch+1} completed, valid metrics so far: {metric_values}")
    finally:
        # Leave the process group even when training raises.
        dist.destroy_process_group()

def main(save_models_dir="."):
    """Script entry point: set the OpenMP thread count and start the worker.

    Args:
        save_models_dir: Directory handed to ``main_worker``. Previously
            ``main_worker`` was called without its required argument, which
            raised ``TypeError`` before any training started.
    """
    # NOTE(review): torch is already imported when this assignment runs, and
    # the launcher log shows torchrun exporting OMP_NUM_THREADS=1 for each
    # process. Confirm this override actually takes effect; if it does not,
    # torch.set_num_threads(6) may be needed instead.
    os.environ['OMP_NUM_THREADS'] = '6'
    main_worker(save_models_dir)

# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__": 
    main()

I run this code in a VS Code terminal (Linux) with torchrun using the following command:

torchrun --standalone --nproc_per_node=4  trainddp_seg3d_ctpt_randomcrop.py

The code runs and trains perfectly when I use a small dataset (e.g., n=16), but gives the following error when I use the whole training (n=419) and validation (n=105) datasets.

WARNING:torch.distributed.run:
*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
*****************************************
2023-04-06 08:21:05,267 - Added key: store_based_barrier_key:1 to store for rank: 2
2023-04-06 08:21:05,267 - Added key: store_based_barrier_key:1 to store for rank: 3
2023-04-06 08:21:06,255 - Added key: store_based_barrier_key:1 to store for rank: 1
2023-04-06 08:21:06,262 - Added key: store_based_barrier_key:1 to store for rank: 0
2023-04-06 08:21:06,263 - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes.
2023-04-06 08:21:06,266 - Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes.
2023-04-06 08:21:06,272 - Rank 3: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes.
2023-04-06 08:21:06,272 - Rank 2: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes.
Loading dataset:  41%|██████████████████████████████████████▏                                                      | 172/419 [06:47<14:20,  3.48s/it]WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 356810 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 356811 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 356812 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -9) local_rank: 3 (pid: 356813) of binary: /anaconda/envs/dlml/bin/python
Traceback (most recent call last):
  File "/anaconda/envs/dlml/bin/torchrun", line 33, in <module>
    sys.exit(load_entry_point('torch==1.11.0', 'console_scripts', 'torchrun')())
  File "/anaconda/envs/dlml/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
    return f(*args, **kwargs)
  File "/anaconda/envs/dlml/lib/python3.8/site-packages/torch/distributed/run.py", line 724, in main
    run(args)
  File "/anaconda/envs/dlml/lib/python3.8/site-packages/torch/distributed/run.py", line 715, in run
    elastic_launch(
  File "/anaconda/envs/dlml/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/anaconda/envs/dlml/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
=======================================================
trainddp_seg3d_ctpt_randomcrop.py FAILED
-------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
-------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2023-04-06_08:27:57
  host      : qurit2rg5769-vm.internal.cloudapp.net
  rank      : 3 (local_rank: 3)
  exitcode  : -9 (pid: 356813)
  error_file: <N/A>
  traceback : Signal 9 (SIGKILL) received by PID 356813
=======================================================

I tried setting the OMP_NUM_THREADS variable to values from 1 to 6, but that didn't help. Please let me know what the issue could be, as the error log doesn't seem very explanatory to me.

Edit: For all 4 processes, the Loading dataset progress bar fails at either 171/419, 172/419, or 186/419.

KumoLiu · 2023-04-06T09:56:20Z

KumoLiu
Apr 6, 2023
Maintainer

Hi @ahxmeds, since the error happens while the images are being loaded, I guess it is due to OOM (running out of memory). I would suggest setting cache_rate=0.3, or even 0, and trying again.

Hope it can help you, thanks!

2 replies

ahxmeds Apr 7, 2023
Author

Hi @KumoLiu
Thanks for pointing that out. This did work with loading the CacheDataset (I set the cache_rate=0.3), but now the same error occurs after training for 10 epochs.

WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 111996 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 111997 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 111998 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -9) local_rank: 3 (pid: 111999) of binary: /anaconda/envs/dlml/bin/python
Traceback (most recent call last):
  File "/anaconda/envs/dlml/bin/torchrun", line 33, in <module>
    sys.exit(load_entry_point('torch==1.11.0', 'console_scripts', 'torchrun')())
  File "/anaconda/envs/dlml/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
    return f(*args, **kwargs)
  File "/anaconda/envs/dlml/lib/python3.8/site-packages/torch/distributed/run.py", line 724, in main
    run(args)
  File "/anaconda/envs/dlml/lib/python3.8/site-packages/torch/distributed/run.py", line 715, in run
    elastic_launch(
  File "/anaconda/envs/dlml/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/anaconda/envs/dlml/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
=======================================================
trainddp_seg3d_ctpt_randomcrop.py FAILED
-------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
-------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2023-04-07_02:41:50
  host      : qurit2rg5769-vm.internal.cloudapp.net
  rank      : 3 (local_rank: 3)
  exitcode  : -9 (pid: 111999)
  error_file: <N/A>
  traceback : Signal 9 (SIGKILL) received by PID 111999
=======================================================

KumoLiu Apr 7, 2023
Maintainer

Hi @ahxmeds, it may still be due to OOM. Could you please try cache_rate=0, and also check whether your memory usage keeps increasing during training by using nvidia-smi -l?
Hope it can help you, thanks!

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

`torch.distributed.elastic.multiprocessing.errors.ChildFailedError` #6312

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{editor}}'s edit

{{editor}}'s edit

Uh oh!

Replies: 1 comment 2 replies

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Select a reply

Uh oh!

torch.distributed.elastic.multiprocessing.errors.ChildFailedError #6312

Uh oh!

Uh oh!

ahxmeds Apr 6, 2023

Replies: 1 comment · 2 replies

Uh oh!

KumoLiu Apr 6, 2023 Maintainer

Uh oh!

ahxmeds Apr 7, 2023 Author

Uh oh!

KumoLiu Apr 7, 2023 Maintainer

`torch.distributed.elastic.multiprocessing.errors.ChildFailedError` #6312

ahxmeds
Apr 6, 2023

Replies: 1 comment 2 replies

KumoLiu
Apr 6, 2023
Maintainer

ahxmeds Apr 7, 2023
Author

KumoLiu Apr 7, 2023
Maintainer