
Program crashes due to the use of AMP #20826

@jialiangZ

Bug description

I used Lightning to train a VAE with DDP and precision="16-mixed", but the run crashes every time it reaches epoch 3; epochs 0, 1, and 2 complete fine. I troubleshot the problem and found that everything works with a precision of 32 (the Trainer in the script below is currently set to precision="32"; switching it to "16-mixed" reproduces the crash). Here is my training code:

import time
import argparse
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR

import lightning as L
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import EarlyStopping
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger

from dataset import MultiVariateDataset
from models.IceVAE import IceVAE
from configs import ICE_CONFIGS, SIZE_CONFIGS


L.seed_everything(42)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--task",
    type=str,
    default="25km_525",
    choices=list(ICE_CONFIGS.keys()),
    help="The task to run.",
)
parser.add_argument(
    "--size",
    type=str,
    default="448*304",
    choices=list(SIZE_CONFIGS.keys()),
    help="The area (width*height) of the data.",
)
parser.add_argument(
    "--ckpt_dir",
    type=str,
    default="/home/ubuntu/Oscar/IceDiffusion/checkpoints/vae",
    help="The path to the checkpoint directory.",
)
parser.add_argument(
    "--gpus",
    type=str,
    default="0",
    help="Specify the GPU device IDs, e.g., '0,1,2' for using GPU 0, 1, 2 (default: '0')",
)

args = parser.parse_args()

config = ICE_CONFIGS[args.task]
gpu_ids = [int(gpu_id) for gpu_id in args.gpus.split(",")]

# Datasets and Dataloaders
train_dataset = MultiVariateDataset(
    config.full_data_path,
    config.input_length,
    config.pred_length,
    19790101,
    20231231,
    config.max_values_path,
    config.min_values_path,
)

train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=config.batch_size,
    shuffle=True,
    num_workers=config.num_workers,
)


class MyLightningModule(L.LightningModule):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.model = IceVAE(
            in_channels=config.num_channels,
            out_channels=config.num_channels,
            dim=config.dim,
            z_dim=config.z_dim,
            dim_mult=config.dim_mult,
            num_res_blocks=config.num_res_blocks,
            attn_scales=config.attn_scales,
            temperal_downsample=config.temperal_downsample,
            dropout=config.dropout,
        )
        self.save_hyperparameters(config)

    def forward(self, inputs):
        x_recon, mu, log_var = self.model(inputs)
        return x_recon, mu, log_var

    def _calculate_metrics(self, x_recon, inputs, mu, log_var):
        # L1 reconstruction loss
        l1_loss = F.l1_loss(x_recon, inputs)

        # KL divergence loss
        kl_loss = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())

        loss = l1_loss + 1e-5 * kl_loss

        metrics = {
            "loss": loss,
            "l1_loss": l1_loss,
            "kl_loss": kl_loss,
        }
        return metrics

    def training_step(self, batch):
        inputs, targets = batch
        x_recon, mu, log_var = self.model(inputs)
        metrics = self._calculate_metrics(x_recon, inputs, mu, log_var)
        self.log_dict(metrics, prog_bar=True, logger=True, on_step=True)
        return metrics["loss"]

    def configure_optimizers(self):
        optimizer = AdamW(self.model.parameters(), lr=self.config.lr)
        scheduler = OneCycleLR(
            optimizer,
            max_lr=self.config.lr,
            epochs=self.config.num_epochs,
            steps_per_epoch=len(train_dataloader),
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "interval": "step",  # Update the learning rate after every optimizer step
            },
        }


# Initialize model
model = MyLightningModule(config)

logger = CSVLogger(
    save_dir=config.log_path,
    name=f"{args.task}",
    version=f"{args.task}_{time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())}",
)

callbacks = [
    EarlyStopping(monitor="loss", patience=config.patience),
    ModelCheckpoint(
        monitor="loss",
        dirpath=f"{args.ckpt_dir}/{args.task}",
        filename=f"{args.task}_{time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())}",
    ),
]

torch.set_float32_matmul_precision("high")
trainer = Trainer(
    accelerator="cuda",
    strategy="ddp",
    devices=gpu_ids,
    precision="32",
    logger=logger,
    callbacks=callbacks,
    max_epochs=config.num_epochs,
)

# Train model
trainer.fit(model, train_dataloader)
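
One hypothesis, suggested only by the fact that full precision works and not confirmed by the traceback below, is that the unreduced KL sum overflows in float16 under autocast. A minimal sketch of computing the VAE losses in float32 regardless of the autocast dtype (vae_losses_fp32 is a hypothetical helper, not part of the original module):

import torch
import torch.nn.functional as F

def vae_losses_fp32(x_recon, inputs, mu, log_var):
    # Cast to float32 before the loss math so that log_var.exp() and the large
    # unreduced sum cannot overflow when the forward pass ran under float16 autocast.
    x_recon, inputs = x_recon.float(), inputs.float()
    mu, log_var = mu.float(), log_var.float()

    l1_loss = F.l1_loss(x_recon, inputs)
    kl_loss = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
    loss = l1_loss + 1e-5 * kl_loss
    return {"loss": loss, "l1_loss": l1_loss, "kl_loss": kl_loss}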

Here's what was reported as an error:

[rank0]: Traceback (most recent call last):
[rank0]:   File "/home/ubuntu/Oscar/IceDiffusion/train_vae.py", line 165, in <module>
[rank0]:     trainer.fit(model, train_dataloader)
[rank0]:     ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/trainer.py", line 561, in fit
[rank0]:     call._call_and_handle_interrupt(
[rank0]:     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
[rank0]:         self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
[rank0]:         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:     )
[rank0]:     ^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/call.py", line 47, in _call_and_handle_interrupt
[rank0]:     return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
[rank0]:            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 105, in launch
[rank0]:     return function(*args, **kwargs)
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/trainer.py", line 599, in _fit_impl
[rank0]:     self._run(model, ckpt_path=ckpt_path)
[rank0]:     ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/trainer.py", line 1012, in _run
[rank0]:     results = self._run_stage()
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/trainer.py", line 1056, in _run_stage
[rank0]:     self.fit_loop.run()
[rank0]:     ~~~~~~~~~~~~~~~~~^^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/fit_loop.py", line 216, in run
[rank0]:     self.advance()
[rank0]:     ~~~~~~~~~~~~^^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/fit_loop.py", line 455, in advance
[rank0]:     self.epoch_loop.run(self._data_fetcher)
[rank0]:     ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 150, in run
[rank0]:     self.advance(data_fetcher)
[rank0]:     ~~~~~~~~~~~~^^^^^^^^^^^^^^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 320, in advance
[rank0]:     batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 192, in run
[rank0]:     self._optimizer_step(batch_idx, closure)
[rank0]:     ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 270, in _optimizer_step
[rank0]:     call._call_lightning_module_hook(
[rank0]:     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
[rank0]:         trainer,
[rank0]:         ^^^^^^^^
[rank0]:     ...<4 lines>...
[rank0]:         train_step_and_backward_closure,
[rank0]:         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:     )
[rank0]:     ^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/call.py", line 176, in _call_lightning_module_hook
[rank0]:     output = fn(*args, **kwargs)
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/core/module.py", line 1302, in optimizer_step
[rank0]:     optimizer.step(closure=optimizer_closure)
[rank0]:     ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/core/optimizer.py", line 154, in step
[rank0]:     step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/strategies/ddp.py", line 270, in optimizer_step
[rank0]:     optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/strategies/strategy.py", line 239, in optimizer_step
[rank0]:     return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
[rank0]:            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/plugins/precision/amp.py", line 79, in optimizer_step
[rank0]:     closure_result = closure()
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 146, in __call__
[rank0]:     self._result = self.closure(*args, **kwargs)
[rank0]:                    ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
[rank0]:     return func(*args, **kwargs)
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 140, in closure
[rank0]:     self._backward_fn(step_output.closure_loss)
[rank0]:     ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 241, in backward_fn
[rank0]:     call._call_strategy_hook(self.trainer, "backward", loss, optimizer)
[rank0]:     ~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/trainer/call.py", line 328, in _call_strategy_hook
[rank0]:     output = fn(*args, **kwargs)
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/strategies/strategy.py", line 213, in backward
[rank0]:     self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs)
[rank0]:     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/plugins/precision/precision.py", line 73, in backward
[rank0]:     model.backward(tensor, *args, **kwargs)
[rank0]:     ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/lightning/pytorch/core/module.py", line 1097, in backward
[rank0]:     loss.backward(*args, **kwargs)
[rank0]:     ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/torch/_tensor.py", line 648, in backward
[rank0]:     torch.autograd.backward(
[rank0]:     ~~~~~~~~~~~~~~~~~~~~~~~^
[rank0]:         self, gradient, retain_graph, create_graph, inputs=inputs
[rank0]:         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:     )
[rank0]:     ^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/torch/autograd/__init__.py", line 353, in backward
[rank0]:     _engine_run_backward(
[rank0]:     ~~~~~~~~~~~~~~~~~~~~^
[rank0]:         tensors,
[rank0]:         ^^^^^^^^
[rank0]:     ...<5 lines>...
[rank0]:         accumulate_grad=True,
[rank0]:         ^^^^^^^^^^^^^^^^^^^^^
[rank0]:     )
[rank0]:     ^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/torch/autograd/graph.py", line 824, in _engine_run_backward
[rank0]:     return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
[rank0]:            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:         t_outputs, *args, **kwargs
[rank0]:         ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:     )  # Calls into the C++ engine to run the backward pass
[rank0]:     ^
[rank0]:   File "/home/ubuntu/anaconda3/envs/IceDiffusion/lib/python3.13/site-packages/torch/utils/data/_utils/signal_handling.py", line 73, in handler
[rank0]:     _error_if_any_worker_fails()
[rank0]:     ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
[rank0]: RuntimeError: DataLoader worker (pid 478468) is killed by signal: Aborted.
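
The last line is the most informative part: the traceback ends in torch's DataLoader signal handler, so the SIGABRT that kills the run may originate in a worker process, and the message above may be hiding the underlying error. A small diagnostic sketch, reusing the names from the training script above; the flags are standard Lightning/PyTorch options and are shown only as one way to get a cleaner traceback, not as a fix:

# Diagnostic run: no worker processes, a single device, and autograd anomaly
# detection, so the underlying failure surfaces in the main process.
debug_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=config.batch_size,
    shuffle=True,
    num_workers=0,  # load data in the main process instead of forked workers
)

debug_trainer = Trainer(
    accelerator="cuda",
    devices=[gpu_ids[0]],  # single device to rule out DDP interactions
    precision="16-mixed",
    detect_anomaly=True,  # report the backward op that first produces NaN/Inf
    max_epochs=config.num_epochs,
)

debug_trainer.fit(MyLightningModule(config), debug_dataloader)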

What version are you seeing the problem on?

master

Reproduced in studio

No response

How to reproduce the bug

python train_vae.py --task 25km_525 --size 448*304 --gpus 0,2,6

Error messages and logs

(The traceback is identical to the one shown in the description above.)

Environment

Current environment

- PyTorch Lightning Version: 2.5.1.post0
- PyTorch Version: 2.7.0
- Python version: 3.13
- OS: Ubuntu 24.04
- CUDA/cuDNN version: 12.6
- GPU models and configuration: L40*8
- How you installed Lightning (conda, pip, source): pip

More info

No response

cc @justusschock @lantiga
