Monitored metric from ModelCheckpoint does not match metric from validation_epoch_end #8083

Bunoviske · 2021-06-22T18:51:18Z

Bunoviske
Jun 22, 2021

Hello! Using the code below, I get different metric results for "val_accuracy". The monitored metric in ModelCheckpoint callback does some reduce function different from simple average (as done in val_epoch_end).

class BIGRU(LightningModule):

    def __init__(self, transformer, rnn_hiddenSize, numClasses, lr=None, weight_decay=None):
        super().__init__()

        self.lr = lr if lr is not None else 0.01
        self.weight_decay = weight_decay if weight_decay is not None else 0.1
        self.loss_func = torch.nn.CrossEntropyLoss()
        #self.torchDevice = device
        print("Running on", self.device)

        # metrics
        self.metricsMethods = BaseMetrics(["text", "signature"], [])
        self.modelMetrics = {'accuracy': self.metricsMethods.getAccuracy, 'f1score': self.metricsMethods.get_f1Score}

        # define encoder transformer model
        self.encoderTokenizer = AutoTokenizer.from_pretrained(transformer)
        self.encoderModel = AutoModel.from_pretrained(transformer)

        # define rnn
        encoder_embedding_size = self.encoderModel.config.hidden_size
        self.rnn = nn.GRU(encoder_embedding_size, rnn_hiddenSize, batch_first=True, bidirectional=True)

        # define fully connected layers
        self.classify = torch.nn.Linear(rnn_hiddenSize*2, numClasses)
        torch.nn.init.xavier_uniform_(self.classify.weight)

    def forward(self, sentences):

        # embedding
        encodedSentences = self.encoderTokenizer(sentences, truncation=True, padding=True, return_tensors="pt")
        encodedSentences = {key:value.to(self.device) for (key,value) in encodedSentences.items()} # convert tensors to device
        with torch.no_grad():
            encoder_output = self.encoderModel(**encodedSentences)
        sentences_embeddings = self.encoderMeanPooling(encoder_output, encodedSentences['attention_mask'])

        # rnn
        sentences_embeddings = sentences_embeddings.unsqueeze(dim=0)
        outputRnn, _ = self.rnn(sentences_embeddings)

        # classify
        outputRnn = outputRnn.view(len(sentences), -1) #probably does not work with batch size > 1. You should flat before giving this to the loss function
        classification = self.classify(outputRnn)
        return classification
    
    #Mean Pooling - Take attention mask into account for correct averaging
    def encoderMeanPooling(self, model_output, attention_mask):
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def configure_optimizers(self):
        optim = torch.optim.AdamW(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        return optim

    
    def training_step(self, batch, batch_idx):
        x, y = self._preprocessBatch(batch)
        preds = self(x)
        loss = self.loss_func(preds, y)
        
        metrics = self._calculateMetrics(preds, y, phase="train")
        metrics["loss"] = loss
        self.log_dict(metrics, on_step=True, on_epoch=True)

        return metrics

    def validation_step(self, batch, batch_idx):
        x, y = self._preprocessBatch(batch)
        preds = self(x)
        loss = self.loss_func(preds, y)
        
        metrics = self._calculateMetrics(preds, y, phase="val")
        metrics["val_loss"] = loss
        self.log_dict(metrics)  

        return metrics

    def validation_epoch_end(self, outputs):
        metricsAvg = {}
        for key in outputs[0].keys():
            metricsAvg[key] = torch.stack([x[key].float() for x in outputs]).mean()

        print(metricsAvg)
        return metricsAvg

    def _preprocessBatch(self, batch):
        x,y = batch

        # fix dataloader items (dataloader transform list of string in list of tuples)
        x = [item[0] for item in x]
        y = torch.tensor(y)

        return x, y

    def _calculateMetrics(self, predictions, labels, phase):
        metrics = {}

        bestProbabilities, bestPredictions  = self._getBestPredictions(predictions)
        for metricName, functionToCall in self.modelMetrics.items():
            metricValue = torch.tensor(functionToCall(bestPredictions, labels.cpu()))
            metrics[phase + '_' + metricName] = metricValue
        
        return metrics

    def _getBestPredictions(self, predictions: torch.Tensor) -> Tuple[np.ndarray,np.ndarray]:
        probabilities = torch.softmax(predictions, 1)
        bestProbabilities, bestPredictions = torch.max(probabilities, 1)
        bestProbabilities = bestProbabilities.detach().cpu().numpy()
        bestPredictions = bestPredictions.detach().cpu().numpy()

        return bestProbabilities, bestPredictions

wandb_logger = WandbLogger(project="emailPreprocessing", log_model=False)
wandb_logger.watch(model, log='all', log_freq=100)
#wandb_logger.log_hyperparams(hparams)

# monitored metric is different from val_epoch_end. Maybe because torch lightning applies weighted average on each batch (like they do for the loss)
checkpoint_callback = ModelCheckpoint(monitor='val_accuracy', mode="max", verbose=True)
early_stop_callback = EarlyStopping(monitor='val_loss', mode="min", patience=5, verbose=True)

gpus = 0
epochs = 3
grad_accumulation_batches = 16

trainer = Trainer(gpus=gpus,
            max_epochs=epochs, 
            accumulate_grad_batches=grad_accumulation_batches,
            # gradient_clip_val=1.0,
            # amp_backend='apex',
            # amp_level='02',
            precision=32,
            logger=wandb_logger,
            checkpoint_callback=True,
            callbacks=[early_stop_callback, checkpoint_callback])

trainer.fit(model, trainDataloader, testDataloader)

RESULTS

Val_epoch_end print: {'val_accuracy': tensor(0.8599), 'val_f1score': tensor(0.8599), 'val_loss': tensor(0.3385)}

ModelCheckpoint verbose: Epoch 1, global step 9: val_accuracy reached 0.96450 (best 0.96450), saving model to c:\wandb\run-20210621_183543-13eb6ofv\files\emailPreprocessing\13eb6ofv\checkpoints\epoch=1-step=9.ckpt" as top 1

In this epoch I showed, the difference was really huge. Most of time, they are similar, but never equal. Besides that, when I log these results to WandB, the logging is exactly the same as ModelCheckpoint verbose.

As discussed here, I imagine that some weighted average over my batch samples is being applied, but since I expect variable sequences as input in the batch, this average is not helping me to get the actual best model.

Any clarification regarding this result difference is appreciated! Thanks!

justusschock · 2021-06-23T07:31:43Z

justusschock
Jun 23, 2021
Maintainer

Hi, Do you by chance use the torchmetrics package? It wasn't exactly clear to me, where your metrics are coming from.

In general, averaging metrics like Accuracy and F1 Score is not accurate accumulation over batches, which is why in torchmetrics each Metric has a separate way of accumulation that may most likely differ from simple averaging. This may result in the different values of the automatically aggregated metrics and your manually averaged ones.

3 replies

Bunoviske Jun 23, 2021
Author

Hello! Sorry, that part is missing in the code. I am using sklearn metrics, so in the end it is also simple averaging.

Question 1: I never heard about this other type of accumulation. Can you give me more details? Is it the same as discussed here?

Question 2: Since my model uses RNN architecture, I expect always variable length sequences as input and always batch_size equal to 1. Do you think that pytorchLightning interprets sequences as batches with different lengths? Then the accumulation is compromised by this weighted average coming from each sequence?

Thanks!

justusschock Jun 23, 2021
Maintainer

Hi,

First of all: try to avoid sklearn metrics if possible, since numpy conversions are slow :) Give torchmetrics a try. It really integrates nice with lightning.

Second:
Look at the Accuracy example of update (which just computes the stat scores) and compute (which then computes the actual accuracy). The compute is not equal to just averaging values of single batches.

Third: For RNN you probably want to use truncated backprop through time, right? You then have to set LightningModule.truncated_bptt_steps and also additionally have a batchsize dimension. Otherwise lightning will indeed interpret it as batches

Bunoviske Jun 23, 2021
Author

Nice! I will study that, thanks for your support

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Monitored metric from ModelCheckpoint does not match metric from validation_epoch_end #8083

Uh oh!

{{title}}

Uh oh!

Replies: 1 comment 3 replies

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Select a reply

Uh oh!

Monitored metric from ModelCheckpoint does not match metric from validation_epoch_end #8083

Uh oh!

Bunoviske Jun 22, 2021

Replies: 1 comment · 3 replies

Uh oh!

justusschock Jun 23, 2021 Maintainer

Uh oh!

Bunoviske Jun 23, 2021 Author

Uh oh!

justusschock Jun 23, 2021 Maintainer

Uh oh!

Bunoviske Jun 23, 2021 Author

Bunoviske
Jun 22, 2021

Replies: 1 comment 3 replies

justusschock
Jun 23, 2021
Maintainer

Bunoviske Jun 23, 2021
Author

justusschock Jun 23, 2021
Maintainer

Bunoviske Jun 23, 2021
Author