Training dies with multiprocessing error #16241
-
Hey, my training always dies after exactly 24 epochs with the same multiprocessing error. Any idea how to investigate this? Or does it look like a core PyTorch problem that is better asked in their forum?

Traceback (most recent call last):
File "/home/riesgroup/deploy/mac-ries26/decode/decode/neuralfitter/train/train.py", line 81, in train
trainer.fit(
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 603, in fit
call._call_and_handle_interrupt(
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 645, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1098, in _run
results = self._run_stage()
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1177, in _run_stage
self._run_train()
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1200, in _run_train
self.fit_loop.run()
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 267, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
self.on_advance_end()
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 251, in on_advance_end
self._run_validation()
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 310, in _run_validation
self.val_loop.run()
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py", line 152, in advance
dl_outputs = self.epoch_loop.run(self._data_fetcher, dl_max_batches, kwargs)
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 121, in advance
batch = next(data_fetcher)
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/pytorch_lightning/utilities/fetching.py", line 184, in __next__
return self.fetching_function()
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/pytorch_lightning/utilities/fetching.py", line 265, in fetching_function
self._fetch_next_batch(self.dataloader_iter)
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/pytorch_lightning/utilities/fetching.py", line 280, in _fetch_next_batch
batch = next(iterator)
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 628, in __next__
data = self._next_data()
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1316, in _next_data
idx, data = self._get_data()
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1282, in _get_data
success, data = self._try_get_data()
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1120, in _try_get_data
data = self._data_queue.get(timeout=timeout)
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/multiprocessing/queues.py", line 122, in get
return _ForkingPickler.loads(res)
File "/home/riesgroup/mambaforge/envs/decode_dev_lighting186/lib/python3.10/site-packages/torch/multiprocessing/reductions.py", line 322, in rebuild_storage_filename
storage = torch.UntypedStorage._new_shared_filename_cpu(manager, handle, size)
RuntimeError: unable to mmap 160 bytes from file </torch_1282240_2656777504_296>: Cannot allocate memory (12)
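Not a definitive diagnosis, but a place to start: the failure goes through `rebuild_storage_filename`, which PyTorch uses with the `file_system` sharing strategy, so one plausible hypothesis is that the shared-memory filesystem backing those tensors fills up over time. A minimal diagnostic sketch (assumes Linux, where `/dev/shm` is the backing store; adjust for other platforms):

```python
# Hedged sketch for investigating the mmap / "Cannot allocate memory (12)" failure.
# Assumes the file_system sharing strategy, as suggested by
# rebuild_storage_filename appearing in the traceback above.
import shutil

import torch.multiprocessing as mp

# 1) Confirm which sharing strategy the DataLoader workers actually use.
print("sharing strategy:", mp.get_sharing_strategy())

# 2) Check whether /dev/shm is close to full; errno 12 (ENOMEM) from mmap
#    often means the shared-memory filesystem has been exhausted, e.g. by
#    leaked /torch_* files from crashed workers.
usage = shutil.disk_usage("/dev/shm")
print(f"/dev/shm: {usage.used / 2**20:.0f} MiB used of {usage.total / 2**20:.0f} MiB")

# 3) As an experiment, switch to the file_descriptor strategy before creating
#    the DataLoader (note: this can instead hit the open-file ulimit).
mp.set_sharing_strategy("file_descriptor")
```

Running the training with `num_workers=0` on the DataLoader is another quick way to rule multiprocessing in or out, at the cost of slower epochs.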
Replies: 1 comment
-
Custom collate_fn error. Unrelated to Lightning.
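Since the offending collate_fn itself was not posted, the following is purely an illustrative sketch of a well-behaved custom collate_fn (padding variable-length samples to a common length); the relevant point is that collate_fn runs inside the DataLoader worker process, so a bug there can surface as an opaque multiprocessing error like the one above.

```python
# Hypothetical example only; the original poster's collate_fn was not shared.
# A well-behaved collate_fn builds the batch from fresh tensors and keeps no
# references to per-sample data between calls.
from typing import List, Tuple

import torch


def pad_collate(batch: List[Tuple[torch.Tensor, int]]) -> Tuple[torch.Tensor, torch.Tensor]:
    """Pad variable-length 1D samples to the longest one in the batch."""
    seqs, labels = zip(*batch)
    max_len = max(s.size(0) for s in seqs)
    padded = torch.zeros(len(seqs), max_len, dtype=seqs[0].dtype)
    for i, s in enumerate(seqs):
        padded[i, : s.size(0)] = s
    return padded, torch.tensor(labels)
```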