how to save the last epoch only? #7153
-
"monitor (Optional[str]) – quantity to monitor. By default it is None which saves a checkpoint only for the last epoch." I also tried another way: setting 'save_last' to True. However, this requires setting a monitor. And if I set save_top_k to 0, it saves nothing; if I set it to 1, it saves 2 models, the best one and the last one. But I just want to save the last one. Is this a bug, or did I do something wrong? Is there a way to save the model at epochs I assign myself, such as the last 3 epochs?
Beta Was this translation helpful? Give feedback.
Replies: 1 comment 1 reply
-
Hey! Have a look at this example:
import os
import torch
from torch.utils.data import Dataset
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
class RandomDataset(Dataset):
    """Map-style dataset of ``length`` random vectors with ``size`` features each."""

    def __init__(self, size, length):
        """Draw all samples up front so that indexing is repeatable.

        Args:
            size: Number of features in each sample.
            length: Number of samples in the dataset.
        """
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        """Return the sample stored at ``index``."""
        return self.data[index]

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.len
class BoringModel(LightningModule):
    """Minimal LightningModule: a single linear layer (32 -> 2).

    The "loss" is simply the sum of the layer's outputs; the model exists only
    to drive the training loop in this example, not to learn anything useful.
    """

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        """Apply the linear layer to ``x``."""
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        """Compute the training loss for one batch and log it as ``train_loss``."""
        loss = self(batch).sum()
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss for one batch and log it as ``valid_loss``."""
        loss = self(batch).sum()
        # logger=False is passed explicitly; the Trainer in this example is
        # also created with logger=False.
        self.log("valid_loss", loss, logger=False)
        return {"x": loss}

    def configure_optimizers(self):
        """Return a plain SGD optimizer over the linear layer's parameters."""
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)
def run():
    """Train ``BoringModel`` for five short epochs with a ``ModelCheckpoint`` callback.

    The callback is created with ``monitor=None`` and writes to
    ``./checkpoints``. Each epoch is limited to one training batch and one
    validation batch, so the run finishes quickly.
    """
    train_data = torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=2, num_workers=0)
    val_data = torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=2, num_workers=0)

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=os.getcwd(),
        limit_train_batches=1,
        limit_val_batches=1,
        num_sanity_val_steps=0,
        max_epochs=5,  # this will save a checkpoint at epoch index 4 (last epoch)
        # NOTE(review): newer Lightning releases replace `weights_summary` with
        # `enable_model_summary` - confirm against the installed version.
        weights_summary=None,
        logger=False,
        callbacks=[ModelCheckpoint(dirpath="./checkpoints", monitor=None)],
    )
    # The dataloaders are passed positionally: the `train_dataloader=` keyword
    # was renamed in later Lightning releases, while the positional order
    # (model, train, val) is accepted before and after the rename.
    trainer.fit(model, train_data, val_data)
if __name__ == '__main__':
    run()

I'm choosing `monitor=None` for the `ModelCheckpoint` callback and `max_epochs=5` for the `Trainer`. That's all: it saves only one checkpoint, named epoch=4-step=4.ckpt, which corresponds to the last epoch being run.
Beta Was this translation helpful? Give feedback.
Hey! Have a look at this example: