RuntimeError: unable to open shared memory object </torch_xxxx_xxxxxxxxxx> in read-write mode #2661
Replies: 4 comments 9 replies
-
Hi @hsiangyuzhao, I think you may be facing a common PyTorch problem, refer to: Thanks in advance. |
Beta Was this translation helpful? Give feedback.
-
data loading and augmentation:
import os
import torch
import torchio as tio
import numpy as np
from monai.data import CacheDataset, Dataset, SmartCacheDataset, PersistentDataset
from monai.transforms import LoadImaged, ScaleIntensityRangePercentilesd, SpatialPadd, RandSpatialCropd, \
Compose, RandScaleIntensityd, RandShiftIntensityd, Rand3DElasticd, \
RandAxisFlipd, ToTensord, MapLabelValued, RandGaussianNoised, AddChanneld, NormalizeIntensityd, EnsureTyped
from sklearn.model_selection import train_test_split
from scipy.ndimage import measurements
class SubjectReader:
    """Build MONAI datasets from a directory of per-subject folders.

    Paths are assembled as ``<image_root>/<name>/<name>_<modality>.nii.gz``
    for the modalities ``t1``, ``t2``, ``t1ce`` and ``flair``, plus
    ``<name>_seg.nii.gz`` for the label of training subjects.
    """

    # Image modalities loaded for every subject.
    _IMAGE_KEYS = ('t1', 't2', 't1ce', 'flair')
    # BraTS training set contains label
    _TRAINING_KEYS = _IMAGE_KEYS + ('label',)
    # Label values found on disk and the contiguous values they are mapped to.
    _ORIG_LABELS = (0, 1, 2, 4)
    _TARGET_LABELS = (0, 1, 2, 3)

    def __init__(self, image_root, training_size):
        """Record the data root and crop size, and list the subjects.

        Args:
            image_root: directory containing one sub-directory per subject.
            training_size: ``roi_size`` handed to ``RandSpatialCropd`` in the
                training transform.
        """
        self.image_dir = image_root
        self.training_size = training_size
        # os.listdir returns entries in arbitrary order, which would make the
        # split in get_dataset differ between machines even with a fixed
        # random_state; sorting makes it reproducible. Non-directory entries
        # are dropped because every subject path is built as
        # <root>/<name>/<name>_*.nii.gz and so needs <name> to be a folder.
        self.subject_list = sorted(
            entry for entry in os.listdir(self.image_dir)
            if os.path.isdir(os.path.join(self.image_dir, entry))
        )

    def get_subjects(self, subject_list, is_training=True):
        """Return one dict of file paths per subject name.

        Args:
            subject_list: iterable of subject folder names.
            is_training: when true, each dict also carries a ``'label'`` path.

        Returns:
            A list of dicts keyed by modality (plus ``'label'`` when
            ``is_training``) and ``'name'``.
        """
        print('Subject path: {}'.format(self.image_dir))
        suffixes = [(key, '_{}.nii.gz'.format(key)) for key in self._IMAGE_KEYS]
        if is_training:
            suffixes.append(('label', '_seg.nii.gz'))
        subjects = []
        for subject_name in subject_list:
            subject = {
                key: os.path.join(self.image_dir, subject_name, subject_name + suffix)
                for key, suffix in suffixes
            }
            subject['name'] = subject_name
            subjects.append(subject)
        print('Subjects prepared. Number of subjects: {}'.format(len(subjects)))
        return subjects

    def get_dataset(self, test_size, random_state):
        """Split the subjects and return ``(trainset, valset)``.

        Args:
            test_size: forwarded to ``train_test_split``.
            random_state: forwarded to ``train_test_split``.
        """
        print('Prepare train & val dataset. Test size: {}; Random state: {}'.format(test_size, random_state))
        train_transform = self.get_training_transform()
        val_transform = self.get_evaluation_transform(inference=False)
        train_subject_list, val_subject_list = train_test_split(self.subject_list,
                                                                test_size=test_size,
                                                                random_state=random_state)
        print('Train subjects: {}'.format(train_subject_list))
        print('Val subjects: {}'.format(val_subject_list))
        train_subjects = self.get_subjects(train_subject_list)
        val_subjects = self.get_subjects(val_subject_list)
        trainset = Dataset(data=train_subjects, transform=train_transform)
        valset = Dataset(data=val_subjects, transform=val_transform)
        print('Dataset prepared. Trainset length: {}; Valset length: {}'.format(len(trainset), len(valset)))
        return trainset, valset

    def get_testset(self):
        """Return a label-free dataset over every subject, using the inference transform."""
        inference_transform = self.get_evaluation_transform(inference=True)
        subjects = self.get_subjects(self.subject_list, is_training=False)
        testset = Dataset(data=subjects, transform=inference_transform)
        print('BraTS Validation dataset prepared. Length: {}'.format(len(testset)))
        return testset

    def get_trainset(self):
        """Return a dataset over every subject (with labels), using the training transform."""
        train_transform = self.get_training_transform()
        subjects = self.get_subjects(self.subject_list)
        trainset = Dataset(data=subjects, transform=train_transform)
        # trainset = PersistentDataset(data=subjects, transform=train_transform, cache_dir='./data_cache')
        print('BraTS Training dataset prepared. Length: {}'.format(len(trainset)))
        return trainset

    @staticmethod
    def get_evaluation_transform(inference=False):
        """Return the deterministic preprocessing pipeline.

        Args:
            inference: when true, only the image keys are processed and no
                label mapping is applied; otherwise the ``'label'`` key is
                loaded, padded and remapped as well.

        Returns:
            A ``Compose`` of load, add-channel, pad, (label mapping),
            intensity normalisation and tensor conversion, in that order.
        """
        image_keys = SubjectReader._IMAGE_KEYS
        keys = image_keys if inference else SubjectReader._TRAINING_KEYS
        # NOTE(review): AddChanneld is, as far as I know, deprecated in recent
        # MONAI releases in favour of EnsureChannelFirstd - confirm against
        # the installed version before upgrading.
        steps = [
            LoadImaged(keys=keys),
            AddChanneld(keys=keys),
            SpatialPadd(keys=keys, spatial_size=(240, 240, 160)),
        ]
        if not inference:
            steps.append(MapLabelValued(keys='label',
                                        orig_labels=SubjectReader._ORIG_LABELS,
                                        target_labels=SubjectReader._TARGET_LABELS))
        # Normalisation is applied to the images only, never to the label.
        steps.append(NormalizeIntensityd(keys=image_keys))
        steps.append(ToTensord(keys=keys))
        return Compose(steps)

    def get_training_transform(self):
        """Return the training pipeline: load, crop to ``self.training_size``, augment."""
        training_keys = self._TRAINING_KEYS
        image_keys = self._IMAGE_KEYS
        augment = Compose([
            LoadImaged(keys=training_keys),
            AddChanneld(keys=training_keys),
            MapLabelValued(keys='label', orig_labels=self._ORIG_LABELS, target_labels=self._TARGET_LABELS),
            RandSpatialCropd(keys=training_keys, roi_size=self.training_size, random_size=False),
            NormalizeIntensityd(keys=image_keys),
            RandAxisFlipd(keys=training_keys, prob=0.5),
            # Intensity augmentations touch the images only.
            RandScaleIntensityd(keys=image_keys, factors=0.1, prob=0.5),
            RandShiftIntensityd(keys=image_keys, offsets=0.1, prob=0.5),
            # NOTE(review): mode='nearest' is applied to every key, images
            # included; presumably chosen so the label stays integer-valued -
            # confirm whether the images should use an interpolating mode.
            Rand3DElasticd(keys=training_keys, prob=0.5, mode='nearest',
                           sigma_range=(1, 20), magnitude_range=(0.3, 2.3),
                           rotate_range=(np.pi / 6, np.pi / 6, np.pi / 6),
                           shear_range=(0.1, 0.1, 0.1),
                           scale_range=(0.2, 0.2, 0.2)),
            # RandGaussianNoised(keys=image_keys, prob=0.5),
            EnsureTyped(keys=training_keys)
        ])
        return augment


# training (core code)
import os
import yaml
import tqdm
import argparse
import numpy as np
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
from monai.data import list_data_collate
from monai.inferers import sliding_window_inference
from monai.losses import DiceCELoss
from monai.metrics import DiceMetric, HausdorffDistanceMetric
from monai.transforms import Activations, AsDiscrete, Compose
from monai.networks.utils import one_hot
from models.networks import UNet, AttentionUNet, EnhancedUNet, CascadedUNet, PriorAttentionNet
from utils.data_pipeline import SubjectReader, overlap_labels, ETThresholdSuppression, RemoveMinorConnectedComponents
from utils.iterator import MetricMeter, set_random_seed, CosineAnnealingWithWarmUp
import torch.multiprocessing
torch.multiprocessing.set_sharing_strategy('file_system')
def parse_args():
....
def main():
import torch.multiprocessing
torch.multiprocessing.set_sharing_strategy('file_system')
args = parse_args()
....
subject_reader = SubjectReader(data_dir, training_size=patch_size)
if not use_trainset:
trainset, valset = subject_reader.get_dataset(test_size=0.2, random_state=dataseed)
val_loader = DataLoader(valset, batch_size=1, shuffle=False, num_workers=num_workers)
else:
trainset = subject_reader.get_trainset()
train_loader = DataLoader(trainset, batch_size=batch_size * num_gpu, shuffle=True,
num_workers=num_workers, collate_fn=list_data_collate, multiprocessing_context='spawn')
# train_loader = DataLoader(trainset, batch_size=batch_size * num_gpu, shuffle=True, num_workers=num_workers)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# define network
model = model_dict[model_name](num_classes=4, input_channels=4, use_deconv=use_deconv,
channels=(32, 64, 128, 256, 320), strides=(1, 2, 2, 2, 2),
leaky=use_leaky, norm=norm_type).to(device)
....
for epoch in range(start_epoch + 1, num_epochs):
print("Epoch {}/{}".format(epoch + 1, num_epochs))
model.train()
epoch_binary_loss = 0
epoch_multi_loss = 0
if verbose:
loader = tqdm.tqdm(train_loader)
else:
loader = train_loader
for step, batch_data in enumerate(loader):
# load data separately and then concatenate in channel dimension
# modalities must be augmented separately for better performance
inputs_t1 = batch_data['t1'].to(device)
inputs_t2 = batch_data['t2'].to(device)
inputs_t1ce = batch_data['t1ce'].to(device)
inputs_flair = batch_data['flair'].to(device)
inputs = torch.cat([inputs_t1, inputs_t2, inputs_t1ce, inputs_flair], dim=1)
# load targets
targets = one_hot(batch_data['label'].to(device), num_classes=4)
if optimize_overlap:
targets = overlap_labels(targets)
optimizer.zero_grad()
if is_mixed:
# automatic mixed precision
with autocast():
outputs = model(inputs)
# calculate losses
multi_loss = model.get_multi_loss(criterion, outputs, targets, is_ds=is_ds)
# backward
scaler.scale(multi_loss).backward()
scaler.step(optimizer)
scaler.update()
else:
outputs = model(inputs)
multi_loss = model.get_multi_loss(criterion, outputs, targets, is_ds=is_ds)
multi_loss.backward()
optimizer.step()
.... |
Beta Was this translation helpful? Give feedback.
-
Could you please help share some comments on this ticket? I am on PTO for a family issue today. Thanks in advance. |
Beta Was this translation helpful? Give feedback.
-
Hi @hsiangyuzhao, you mention this problem occurs when you use MONAI transforms, and does not occur when you don't use them. Does it happen on the first iteration, or does it happen randomly at some point during training? Can you comment out transforms and then gradually add them back in until you find the culprit? |
Beta Was this translation helpful? Give feedback.
Uh oh!
There was an error while loading. Please reload this page.
-
Hi there, I am using MONAI for the BraTS 2020 segmentation challenge. Recently I have run into the bugs listed below. They happen on both machines I have used, one with an Intel 9900K CPU and an NVIDIA RTX 2080Ti GPU and the other with an Intel Xeon CPU and an NVIDIA RTX A6000 48G GPU. I think the bug is not related to the hardware, but I have no idea about the cause. I have been stuck here for a week now; would anyone be able to help me out? Thanks a lot.
The bug is always related to "shared memory object" but the traceback is not exactly the same when the bug happens.
Error message:
Sometimes:
And sometimes the bug happens but the Python process gets stuck here (it does not exit on the error and has to be stopped with Ctrl + C)
It seems to be related to "list_data_collate" in monai/utils, but I don't know why.
Beta Was this translation helpful? Give feedback.
All reactions