-
Notifications
You must be signed in to change notification settings - Fork 3.6k
Open
Labels
Labels: bug — Something isn't working · precision: amp — Automatic Mixed Precision · ver: 2.5.x
Description
Bug description
I used Lightning for VAE training with DDP enabled and precision 16-mixed, but every time I train to epoch=3 it crashes; at epoch=0, 1, 2 it is fine. I troubleshot the problem, and it works fine if I use a precision of 32. Here is my training code:
import time
import argparse
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
import lightning as L
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import EarlyStopping
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger
from dataset import MultiVariateDataset
from models.IceVAE import IceVAE
from configs import ICE_CONFIGS, SIZE_CONFIGS
# --- Reproducibility ---------------------------------------------------------
L.seed_everything(42)

# --- Command-line interface --------------------------------------------------
# Each entry is (flag, kwargs) passed straight to add_argument.
_CLI_SPEC = [
    (
        "--task",
        dict(
            type=str,
            default="25km_525",
            choices=list(ICE_CONFIGS.keys()),
            help="The task to run.",
        ),
    ),
    (
        "--size",
        dict(
            type=str,
            default="448*304",
            choices=list(SIZE_CONFIGS.keys()),
            help="The area (width*height) of the data.",
        ),
    ),
    (
        "--ckpt_dir",
        dict(
            type=str,
            default="/home/ubuntu/Oscar/IceDiffusion/checkpoints/vae",
            help="The path to the checkpoint directory.",
        ),
    ),
    (
        "--gpus",
        dict(
            type=str,
            default="0",
            help="Specify the GPU device IDs, e.g., '0,1,2' for using GPU 0, 1, 2 (default: '0')",
        ),
    ),
]
parser = argparse.ArgumentParser()
for _flag, _kwargs in _CLI_SPEC:
    parser.add_argument(_flag, **_kwargs)
args = parser.parse_args()

# Resolve the per-task config and the GPU ids to train on.
config = ICE_CONFIGS[args.task]
gpu_ids = list(map(int, args.gpus.split(",")))

# --- Datasets and Dataloaders ------------------------------------------------
train_dataset = MultiVariateDataset(
    config.full_data_path,
    config.input_length,
    config.pred_length,
    19790101,  # presumably the start date as YYYYMMDD — confirm against MultiVariateDataset
    20231231,  # presumably the end date as YYYYMMDD — confirm against MultiVariateDataset
    config.max_values_path,
    config.min_values_path,
)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=config.batch_size,
    shuffle=True,
    num_workers=config.num_workers,
)
class MyLightningModule(L.LightningModule):
    """Lightning wrapper around IceVAE.

    Trains the VAE with an L1 reconstruction loss plus a small
    KL-divergence regulariser on the latent distribution.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.model = IceVAE(
            in_channels=config.num_channels,
            out_channels=config.num_channels,
            dim=config.dim,
            z_dim=config.z_dim,
            dim_mult=config.dim_mult,
            num_res_blocks=config.num_res_blocks,
            attn_scales=config.attn_scales,
            temperal_downsample=config.temperal_downsample,
            dropout=config.dropout,
        )
        # Persist the config with the checkpoint for later reload.
        self.save_hyperparameters(config)

    def forward(self, inputs):
        """Run the VAE; returns (reconstruction, mu, log_var)."""
        recon, mu, log_var = self.model(inputs)
        return recon, mu, log_var

    def _calculate_metrics(self, x_recon, inputs, mu, log_var):
        """Build the per-batch metrics dict: total, L1, and KL losses."""
        # L1 reconstruction loss.
        recon_loss = F.l1_loss(x_recon, inputs)
        # KL divergence of N(mu, sigma^2) against the standard normal prior,
        # summed over all elements.
        kl = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
        # The tiny weight keeps the KL term from dominating reconstruction.
        total = recon_loss + 1e-5 * kl
        return {"loss": total, "l1_loss": recon_loss, "kl_loss": kl}

    def training_step(self, batch):
        inputs, targets = batch  # targets unused: the VAE reconstructs its input
        x_recon, mu, log_var = self.model(inputs)
        metrics = self._calculate_metrics(x_recon, inputs, mu, log_var)
        self.log_dict(metrics, prog_bar=True, logger=True, on_step=True)
        return metrics["loss"]

    def configure_optimizers(self):
        # NOTE(review): relies on the module-level `train_dataloader` for the
        # per-epoch step count — confirm this matches the dataloader the
        # Trainer actually uses.
        optimizer = AdamW(self.model.parameters(), lr=self.config.lr)
        scheduler = OneCycleLR(
            optimizer,
            max_lr=self.config.lr,
            epochs=self.config.num_epochs,
            steps_per_epoch=len(train_dataloader),
        )
        # Step the schedule after every optimizer step, not every epoch.
        lr_config = {"scheduler": scheduler, "interval": "step"}
        return {"optimizer": optimizer, "lr_scheduler": lr_config}
# Initialize model
model = MyLightningModule(config)

# Compute the run timestamp ONCE so the CSV logger version and the checkpoint
# filename are guaranteed to carry the same suffix (calling time.strftime twice
# can yield different values if a second boundary is crossed in between).
# Using a variable also avoids nesting double quotes inside an f-string, which
# is a syntax error before Python 3.12 (PEP 701).
run_name = f"{args.task}_{time.strftime('%Y_%m_%d_%H_%M_%S', time.localtime())}"

logger = CSVLogger(
    save_dir=config.log_path,
    name=args.task,  # same value the original f"{args.task}" produced
    version=run_name,
)
callbacks = [
    # Stop when the logged "loss" metric stops improving.
    EarlyStopping(monitor="loss", patience=config.patience),
    # Keep the checkpoint with the best "loss".
    ModelCheckpoint(
        monitor="loss",
        dirpath=f"{args.ckpt_dir}/{args.task}",
        filename=run_name,
    ),
]

# Allow TF32 matmuls for extra throughput on Ampere+ GPUs.
torch.set_float32_matmul_precision("high")
trainer = Trainer(
    accelerator="cuda",
    strategy="ddp",
    devices=gpu_ids,
    precision="32",
    logger=logger,
    callbacks=callbacks,
    max_epochs=config.num_epochs,
)

# Train model
trainer.fit(model, train_dataloader)
Here's what was reported as an error:
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/ubuntu/Oscar/IceDiffusion/train_vae.py", line 165, in <module>
[rank0]: trainer.fit(model, train_dataloader)
[rank0]: ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/trainer.py", line 561, in fit
[rank0]: call._call_and_handle_interrupt(
[rank0]: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
[rank0]: self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: )
[rank0]: ^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/call.py", line 47, in _call_and_handle_interrupt
[rank0]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
[rank0]: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 105, in launch
[rank0]: return function(*args, **kwargs)
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/trainer.py", line 599, in _fit_impl
[rank0]: self._run(model, ckpt_path=ckpt_path)
[rank0]: ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/trainer.py", line 1012, in _run
[rank0]: results = self._run_stage()
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/trainer.py", line 1056, in _run_stage
[rank0]: self.fit_loop.run()
[rank0]: ~~~~~~~~~~~~~~~~~^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/fit_loop.py", line 216, in run
[rank0]: self.advance()
[rank0]: ~~~~~~~~~~~~^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/fit_loop.py", line 455, in advance
[rank0]: self.epoch_loop.run(self._data_fetcher)
[rank0]: ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 150, in run
[rank0]: self.advance(data_fetcher)
[rank0]: ~~~~~~~~~~~~^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 320, in advance
[rank0]: batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 192, in run
[rank0]: self._optimizer_step(batch_idx, closure)
[rank0]: ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 270, in _optimizer_step
[rank0]: call._call_lightning_module_hook(
[rank0]: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
[rank0]: trainer,
[rank0]: ^^^^^^^^
[rank0]: ...<4 lines>...
[rank0]: train_step_and_backward_closure,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: )
[rank0]: ^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/call.py", line 176, in _call_lightning_module_hook
[rank0]: output = fn(*args, **kwargs)
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/core/module.py", line 1302, in optimizer_step
[rank0]: optimizer.step(closure=optimizer_closure)
[rank0]: ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/core/optimizer.py", line 154, in step
[rank0]: step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/strategies/ddp.py", line 270, in optimizer_step
[rank0]: optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/strategies/strategy.py", line 239, in optimizer_step
[rank0]: return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
[rank0]: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/plugins/precision/amp.py", line 79, in optimizer_step
[rank0]: closure_result = closure()
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 146, in __call__
[rank0]: self._result = self.closure(*args, **kwargs)
[rank0]: ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
[rank0]: return func(*args, **kwargs)
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 140, in closure
[rank0]: self._backward_fn(step_output.closure_loss)
[rank0]: ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 241, in backward_fn
[rank0]: call._call_strategy_hook(self.trainer, "backward", loss, optimizer)
[rank0]: ~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/call.py", line 328, in _call_strategy_hook
[rank0]: output = fn(*args, **kwargs)
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/strategies/strategy.py", line 213, in backward
[rank0]: self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs)
[rank0]: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/plugins/precision/precision.py", line 73, in backward
[rank0]: model.backward(tensor, *args, **kwargs)
[rank0]: ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/core/module.py", line 1097, in backward
[rank0]: loss.backward(*args, **kwargs)
[rank0]: ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/torch/_tensor.py", line 648, in backward
[rank0]: torch.autograd.backward(
[rank0]: ~~~~~~~~~~~~~~~~~~~~~~~^
[rank0]: self, gradient, retain_graph, create_graph, inputs=inputs
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: )
[rank0]: ^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/torch/autograd/__init__.py", line 353, in backward
[rank0]: _engine_run_backward(
[rank0]: ~~~~~~~~~~~~~~~~~~~~^
[rank0]: tensors,
[rank0]: ^^^^^^^^
[rank0]: ...<5 lines>...
[rank0]: accumulate_grad=True,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^
[rank0]: )
[rank0]: ^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/torch/autograd/graph.py", line 824, in _engine_run_backward
[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
[rank0]: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: t_outputs, *args, **kwargs
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: ) # Calls into the C++ engine to run the backward pass
[rank0]: ^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/torch/utils/data/_utils/signal_handling.py", line 73, in handler
[rank0]: _error_if_any_worker_fails()
[rank0]: ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
[rank0]: RuntimeError: DataLoader worker (pid 478468) is killed by signal: Aborted.
What version are you seeing the problem on?
master
Reproduced in studio
No response
How to reproduce the bug
python train_vae.py --task 25km_525 --size 448*304 --gpus 0,2,6
Error messages and logs
# Error messages and logs here please
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/ubuntu/Oscar/IceDiffusion/train_vae.py", line 165, in <module>
[rank0]: trainer.fit(model, train_dataloader)
[rank0]: ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/trainer.py", line 561, in fit
[rank0]: call._call_and_handle_interrupt(
[rank0]: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
[rank0]: self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: )
[rank0]: ^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/call.py", line 47, in _call_and_handle_interrupt
[rank0]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
[rank0]: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 105, in launch
[rank0]: return function(*args, **kwargs)
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/trainer.py", line 599, in _fit_impl
[rank0]: self._run(model, ckpt_path=ckpt_path)
[rank0]: ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/trainer.py", line 1012, in _run
[rank0]: results = self._run_stage()
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/trainer.py", line 1056, in _run_stage
[rank0]: self.fit_loop.run()
[rank0]: ~~~~~~~~~~~~~~~~~^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/fit_loop.py", line 216, in run
[rank0]: self.advance()
[rank0]: ~~~~~~~~~~~~^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/fit_loop.py", line 455, in advance
[rank0]: self.epoch_loop.run(self._data_fetcher)
[rank0]: ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 150, in run
[rank0]: self.advance(data_fetcher)
[rank0]: ~~~~~~~~~~~~^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 320, in advance
[rank0]: batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 192, in run
[rank0]: self._optimizer_step(batch_idx, closure)
[rank0]: ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 270, in _optimizer_step
[rank0]: call._call_lightning_module_hook(
[rank0]: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
[rank0]: trainer,
[rank0]: ^^^^^^^^
[rank0]: ...<4 lines>...
[rank0]: train_step_and_backward_closure,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: )
[rank0]: ^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/call.py", line 176, in _call_lightning_module_hook
[rank0]: output = fn(*args, **kwargs)
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/core/module.py", line 1302, in optimizer_step
[rank0]: optimizer.step(closure=optimizer_closure)
[rank0]: ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/core/optimizer.py", line 154, in step
[rank0]: step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/strategies/ddp.py", line 270, in optimizer_step
[rank0]: optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/strategies/strategy.py", line 239, in optimizer_step
[rank0]: return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
[rank0]: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/plugins/precision/amp.py", line 79, in optimizer_step
[rank0]: closure_result = closure()
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 146, in __call__
[rank0]: self._result = self.closure(*args, **kwargs)
[rank0]: ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
[rank0]: return func(*args, **kwargs)
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 140, in closure
[rank0]: self._backward_fn(step_output.closure_loss)
[rank0]: ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 241, in backward_fn
[rank0]: call._call_strategy_hook(self.trainer, "backward", loss, optimizer)
[rank0]: ~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/call.py", line 328, in _call_strategy_hook
[rank0]: output = fn(*args, **kwargs)
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/strategies/strategy.py", line 213, in backward
[rank0]: self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs)
[rank0]: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/plugins/precision/precision.py", line 73, in backward
[rank0]: model.backward(tensor, *args, **kwargs)
[rank0]: ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/core/module.py", line 1097, in backward
[rank0]: loss.backward(*args, **kwargs)
[rank0]: ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/torch/_tensor.py", line 648, in backward
[rank0]: torch.autograd.backward(
[rank0]: ~~~~~~~~~~~~~~~~~~~~~~~^
[rank0]: self, gradient, retain_graph, create_graph, inputs=inputs
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: )
[rank0]: ^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/torch/autograd/__init__.py", line 353, in backward
[rank0]: _engine_run_backward(
[rank0]: ~~~~~~~~~~~~~~~~~~~~^
[rank0]: tensors,
[rank0]: ^^^^^^^^
[rank0]: ...<5 lines>...
[rank0]: accumulate_grad=True,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^
[rank0]: )
[rank0]: ^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/torch/autograd/graph.py", line 824, in _engine_run_backward
[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
[rank0]: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: t_outputs, *args, **kwargs
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: ) # Calls into the C++ engine to run the backward pass
[rank0]: ^
[rank0]: File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/torch/utils/data/_utils/signal_handling.py", line 73, in handler
[rank0]: _error_if_any_worker_fails()
[rank0]: ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
[rank0]: RuntimeError: DataLoader worker (pid 478468) is killed by signal: Aborted.
Environment
Current environment
#- PyTorch Lightning Version (e.g., 2.5.0): 2.5.1.post0
#- PyTorch Version (e.g., 2.5): 2.7.0
#- Python version (e.g., 3.12): 3.13
#- OS (e.g., Linux): Ubuntu 24.04
#- CUDA/cuDNN version: 12.6
#- GPU models and configuration: L40*8
#- How you installed Lightning(`conda`, `pip`, source): pip
More info
No response
Metadata
Metadata
Assignees
Labels
Labels: bug — Something isn't working · precision: amp — Automatic Mixed Precision · ver: 2.5.x