Accuracy/loss not improving when training Hugging Face Transformers BERT #15044
Summary
Loss does not decrease, and accuracy/F1 score do not improve, while training Hugging Face Transformers BertForSequenceClassification with PyTorch Lightning.

Issue
Hello PTL team. Previously, I trained a Hugging Face BERT model with my own trainer code. To improve code quality and build an MLOps system, I am now trying to train Hugging Face's BertForSequenceClassification with PyTorch Lightning. However, when I do, the loss, accuracy, and even the F1 score do not improve during the training phase. My assumption is that there is a bug in the optimizer or the back-propagation in my code, but I can't find any problem. My question is: what is wrong with my code?

Package versions
python==3.7

Code snippet
import os
import json
import csv
import tqdm
from typing import *
import pandas as pd
import torch
from torch import nn
import pytorch_lightning as pl
import torchmetrics.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import BertForSequenceClassification, BertTokenizer, InputExample, InputFeatures, AutoConfig, get_linear_schedule_with_warmup
def get_input_example(guid: int, text_a: str, label: str) -> Tuple[InputExample, str]:
input_example = InputExample(
guid=guid,
text_a=text_a,
text_b=None,
label=label
)
return input_example, label
def add_examples(
texts_or_text_and_labels: Union[List[str], str],
text_index: int,
label_index: int,
label_dict: Dict[str, int] = None,
remove_top: bool = False
) -> Tuple[List[InputExample], Dict[str, int]]:
examples = list()
labels = list()
tmp = []
if isinstance(texts_or_text_and_labels, str):
with open(texts_or_text_and_labels, 'r') as f:
if texts_or_text_and_labels.endswith('csv'):
delimiter = ','
            elif texts_or_text_and_labels.endswith('tsv'):
                delimiter = '\t'
            else:
                raise ValueError('Expected a .csv or .tsv file')
reader = csv.reader(f, delimiter=delimiter, quotechar='"')
for idx, line in enumerate(tqdm.tqdm(reader)):
                # Skip the header row when requested.
                if remove_top and idx == 0:
                    continue
                tmp.append(line)
texts_or_text_and_labels = tmp
for line in tqdm.tqdm(texts_or_text_and_labels):
text_a = line[text_index]
label = line[label_index]
input_example, label = get_input_example(guid=line[0], text_a=text_a, label=label)
examples.append(input_example)
if label_dict is None:
labels.append(label)
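    # NOTE: enumerating an unordered set() below makes the label indices
    # non-deterministic across runs and processes; see the update at the
    # bottom of this thread.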
if label_dict is None:
label_dict = {i: idx for idx, i in enumerate(list(set(labels)))}
return examples, label_dict
class BertDataset(Dataset):
def __init__(self, examples, tokenizer, label_dict, max_length):
self.examples = examples
self.tokenizer = tokenizer
self.label_dict = label_dict
self.max_length = max_length
def _truncate_seq_pair(self, tokens_a, tokens_b, max_length) -> None:
"""
Truncates a sequence pair in place to the maximum length.
This is a simple heuristic which will always truncate the longer sequence
one token at a time. This makes more sense than truncating an equal percent
of tokens from each, since if one sequence is very short then each token
that's truncated likely contains more information than a longer sequence.
"""
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length - 3:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def __len__(self):
return len(self.examples)
def __getitem__(
self,
idx: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
def _tokens_and_segment_id(token_a: List[str], token_b: List[str] = None) -> Tuple[Any, List[int]]:
            tokens = ['[CLS]'] + token_a + ['[SEP]']  # See Section 1-1 in /docs/Appendix.md
            token_type_ids = [0] * len(tokens)  # for more information on lines 138-145.
if token_b:
tokens += token_b + ['[SEP]']
token_type_ids += [1] * (len(token_b) + 1)
return tokens, token_type_ids
text_a = self.examples[idx].text_a
text_b = self.examples[idx].text_b
label = self.examples[idx].label
# Convert texts into tokens
tokens_a = self.tokenizer.tokenize(text_a)
tokens_b = None
if text_b:
tokens_b = self.tokenizer.tokenize(text_b)
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with '- 3'
self._truncate_seq_pair(tokens_a, tokens_b, self.max_length)
else:
if len(tokens_a) > self.max_length - 2:
tokens_a = tokens_a[:(self.max_length - 2)]
tokens, token_type_ids = _tokens_and_segment_id(tokens_a, tokens_b)
input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
label_ids = self.label_dict[label]
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
attention_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
padding = [0] * (self.max_length - len(input_ids))
input_ids += padding
attention_mask += padding
token_type_ids += padding
assert len(input_ids) == self.max_length
assert len(attention_mask) == self.max_length
assert len(token_type_ids) == self.max_length
input_ids = torch.tensor(input_ids, dtype=torch.long)
attention_mask = torch.tensor(attention_mask, dtype=torch.long)
token_type_ids = torch.tensor(token_type_ids, dtype=torch.long)
labels = torch.tensor(label_ids, dtype=torch.long)
return input_ids, attention_mask, token_type_ids, labels
class BertAccTestModel(pl.LightningModule):
def __init__(self, num_classes: int):
super().__init__()
self.save_hyperparameters()
self.config = AutoConfig.from_pretrained('klue/bert-base', num_labels=num_classes)
self.model = BertForSequenceClassification.from_pretrained('klue/bert-base', config=self.config)
self.tokenizer = BertTokenizer.from_pretrained('klue/bert-base')
self.num_classes = num_classes
def forward(self, init_ids, input_mask, segment_ids) -> SequenceClassifierOutput:
outputs = self.model(init_ids, input_mask, segment_ids)
return outputs
def info(self, dictionary: dict) -> None:
r"""
Logging information from dictionary.
Args:
dictionary (dict): dictionary contains information.
"""
for key, value in dictionary.items():
self.log(key, value, prog_bar=True, sync_dist=True)
    def training_step(self, batch: Tuple, batch_idx: int) -> torch.Tensor:
init_ids, input_mask, segment_ids, label_ids = batch
outputs = self.model(init_ids, input_mask, segment_ids, labels=label_ids)
top1_acc = F.accuracy(outputs.logits, label_ids)
top1_f1 = F.f1_score(outputs.logits, label_ids, average='macro', num_classes=self.num_classes)
loss = outputs.loss
self.info({
'train_loss': loss,
'train_acc': top1_acc,
'train_f1': top1_f1,
})
return loss
    def validation_step(self, batch: Tuple, batch_idx: int) -> torch.Tensor:
init_ids, input_mask, segment_ids, label_ids = batch
outputs = self.model(init_ids, input_mask, segment_ids, labels=label_ids)
top1_acc = F.accuracy(outputs.logits, label_ids)
top1_f1 = F.f1_score(outputs.logits, label_ids, average='macro', num_classes=self.num_classes)
loss = outputs.loss
self.info({
'val_loss': loss,
'val_acc': top1_acc,
'val_f1': top1_f1,
})
return loss
    def test_step(self, batch: Tuple, batch_idx: int) -> torch.Tensor:
init_ids, input_mask, segment_ids, label_ids = batch
outputs = self.model(init_ids, input_mask, segment_ids, labels=label_ids)
top1_acc = F.accuracy(outputs.logits, label_ids)
top1_f1 = F.f1_score(outputs.logits, label_ids, average='macro', num_classes=self.num_classes)
loss = outputs.loss
self.info({
'test_loss': loss,
'test_acc': top1_acc,
'test_f1': top1_f1,
})
return loss
    def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> Any:
        init_ids, input_mask, segment_ids, label_ids = batch
        # forward() takes only the three input tensors; labels are not needed for prediction.
        outputs = self(init_ids, input_mask, segment_ids)
        # argmax over the class dimension so a prediction is returned for every example in the batch.
        return torch.argmax(outputs.logits, dim=-1)
def configure_optimizers(self):
model = self.model
no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                # Apply weight decay to everything except biases and LayerNorm weights
                # (the usual BERT fine-tuning convention; 0.01 is the commonly used value).
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.01,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
optim = torch.optim.AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
optim,
num_warmup_steps=500,
num_training_steps=self.trainer.estimated_stepping_batches,
)
scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
return [optim], [scheduler]
def main():
    root = os.getcwd()
print(root)
tokenizer = BertTokenizer.from_pretrained('klue/bert-base', do_lower_case=False)
train_examples, label_dict = add_examples(os.path.join(root, 'data/training_merged_1d.tsv'), text_index=2, label_index=7, remove_top=True)
eval_examples, _ = add_examples(os.path.join(root, 'data/validation.tsv'), text_index=2, label_index=7, label_dict=label_dict)
test_examples, _ = add_examples(os.path.join(root, 'data/test.tsv'), text_index=2, label_index=7, label_dict=label_dict, remove_top=True)
train_dataset = BertDataset(train_examples, tokenizer, max_length=256, label_dict=label_dict)
eval_dataset = BertDataset(eval_examples, tokenizer, max_length=256, label_dict=label_dict)
test_dataset = BertDataset(test_examples, tokenizer, max_length=256, label_dict=label_dict)
print(len(train_dataset), len(eval_dataset), len(test_dataset))
trn_dataloader = DataLoader(train_dataset, batch_size=64, num_workers=4, sampler=RandomSampler(train_dataset))
eval_dataloader = DataLoader(eval_dataset, batch_size=64, num_workers=4, sampler=SequentialSampler(eval_dataset))
test_dataloader = DataLoader(test_dataset, batch_size=4, num_workers=4, sampler=SequentialSampler(test_dataset))
inference_label_dict = {v: k for k, v in label_dict.items()}
    model = BertAccTestModel(num_classes=len(label_dict))
checkpoint_callback = pl.callbacks.ModelCheckpoint(
save_last=True,
save_weights_only=True,
monitor='val_f1',
mode='max',
dirpath=os.path.join(root, 'weights'),
filename='pytorch_model'
)
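    # NOTE: with strategy='ddp', Lightning launches one process per GPU, and each
    # process re-executes this script (including add_examples above), so any
    # non-deterministic setup in main() can diverge across ranks.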
trainer = pl.Trainer(gpus=4, max_epochs=10, accelerator='cuda', strategy='ddp', precision=32, callbacks=[checkpoint_callback])
trainer.fit(model, train_dataloaders=trn_dataloader, val_dataloaders=eval_dataloader)
trainer.test(model, test_dataloader)
    with open(os.path.join(root, 'weights', 'labels.dict'), 'w') as f:
        json.dump(inference_label_dict, f)
if __name__ == '__main__':
    main()
Replies: 1 comment
Updates
I found that DDP was the problem: accuracy and loss improve when I train the model on a single GPU. The add_examples code was the culprit, because it builds the label dictionary by enumerating an unordered set(), so the label indices are not fixed, and each DDP process can end up with a different label-to-index mapping.
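A minimal sketch of the fix, assuming the labels are plain strings: sorting the label set before enumerating it makes the mapping deterministic, so every DDP process builds the same label-to-index dictionary.

# Hypothetical fix for the label mapping in add_examples.
# set() iteration order depends on Python's per-process string-hash
# randomization, so each DDP process can see the labels in a different order.
# Sorting the labels first makes the index assignment identical everywhere.
if label_dict is None:
    label_dict = {label: idx for idx, label in enumerate(sorted(set(labels)))}

Alternatively, fixing PYTHONHASHSEED in every process would make the set() ordering reproducible, but sorting is simpler and does not depend on the environment.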