Bug description
Because `MLFlowLogger` does not set `Trainer.log_dir` (its `save_dir` property returns `None` for non-local tracking URIs such as `sqlite:///`), the `FinetuningScheduler` callback cannot resolve `trainer.log_dir` and fails to run. At the moment a dummy logger has to be added just to populate `trainer.log_dir`; a sketch of that workaround follows the quoted property below.
pytorch-lightning/src/lightning/pytorch/trainer/trainer.py
Lines 1213 to 1235 in 1f5add3
```python
@property
def log_dir(self) -> Optional[str]:
    """The directory for the current experiment. Use this to save images to, etc...

    .. note:: You must call this on all processes. Failing to do so will cause your program to stall forever.

    .. code-block:: python

        def training_step(self, batch, batch_idx):
            img = ...
            save_img(img, self.trainer.log_dir)
    """
    if len(self.loggers) > 0:
        if not isinstance(self.loggers[0], (TensorBoardLogger, CSVLogger)):
            dirpath = self.loggers[0].save_dir
        else:
            dirpath = self.loggers[0].log_dir
    else:
        dirpath = self.default_root_dir

    dirpath = self.strategy.broadcast(dirpath)
    return dirpath
```
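A minimal sketch of the workaround mentioned above, assuming the root cause is that `loggers[0].save_dir` is `None` for a non-local MLflow tracking URI: place a lightweight `CSVLogger` first in the logger list so `Trainer.log_dir` resolves to a real directory. This is a stopgap, not a proposed fix.

```python
# Workaround sketch (not an official fix): register a "dummy" CSVLogger
# alongside MLFlowLogger. Trainer.log_dir only inspects loggers[0], so the
# CSVLogger must come first for log_dir to resolve to its directory.
import lightning.pytorch as L
from lightning.pytorch.loggers import CSVLogger, MLFlowLogger

csv_logger = CSVLogger(save_dir="lightning_logs")  # supplies a usable log_dir
mlflow_logger = MLFlowLogger(
    tracking_uri="sqlite:///test.db",
    experiment_name="test",
)

trainer = L.Trainer(
    max_epochs=1,
    logger=[csv_logger, mlflow_logger],  # dummy logger first
)
print(trainer.log_dir)  # now a path under lightning_logs/ instead of None
```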
How to reproduce the bug
```python
import torch
from finetuning_scheduler import FinetuningScheduler
from pytorch_lightning.loggers import MLFlowLogger
import lightning.pytorch as L
from torch.utils.data import DataLoader, Dataset


# Thanks to #20605 for the small classes!
class RandomDataset(Dataset):
    def __init__(self, size):
        self.data = torch.randn(size, 10)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], torch.tensor(0)  # Dummy target


class MinimalDataModule(L.LightningDataModule):
    def train_dataloader(self):
        return DataLoader(RandomDataset(100), batch_size=10)

    # when removing the val_dataloader method completely, the error is not raised
    def val_dataloader(self):
        return []


class MinimalModel(L.LightningModule):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(10, 1)

    def forward(self, x):
        return self.linear(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        loss = torch.nn.functional.mse_loss(self(x), y.float().unsqueeze(1))
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        loss = torch.nn.functional.mse_loss(self(x), y.float().unsqueeze(1))
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.01)


mlflow_logger = MLFlowLogger(
    tracking_uri="sqlite:///test.db",
    experiment_name="test",
    run_name="ft-test",
)

ft_callback = FinetuningScheduler(gen_ft_sched_only=True)

# Initialise the trainer
trainer = L.Trainer(
    accelerator="auto",
    max_epochs=1,
    logger=[mlflow_logger],
    callbacks=[ft_callback],
    check_val_every_n_epoch=1,
)

model = MinimalModel()
data = MinimalDataModule()
trainer.fit(model, datamodule=data)
```
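As a quick check (assuming a single-device run, where `strategy.broadcast` is a pass-through), the missing directory can also be observed directly on the trainer before `fit()` is called:

```python
# With MLFlowLogger as the only logger and a sqlite:/// tracking URI,
# Trainer.log_dir falls back to loggers[0].save_dir, which is None here,
# so FinetuningScheduler's assertion on trainer.log_dir fails during fit().
print(trainer.log_dir)  # expected: None
```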
What version are you seeing the problem on?
v2.5
Error messages and logs
File "/Users/user/anaconda3/envs/l2cfd/lib/python3.12/site-packages/finetuning_scheduler/fts_supporters.py", line 1333, in gen_or_load_sched
assert self.pl_module.trainer.log_dir is not None
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError
Environment
- PyTorch Lightning Version: 2.5.0
- PyTorch Version: 2.6
- Python version: 3.12.9
- OS: macOS
- CUDA/cuDNN version: N/A
- GPU models and configuration: N/A
- How you installed Lightning (`conda`, `pip`, source): pip
More info
No response