-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathevaluate_model.py
More file actions
91 lines (77 loc) · 3.18 KB
/
evaluate_model.py
File metadata and controls
91 lines (77 loc) · 3.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# evaluate_model.py
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_from_disk
from sklearn.metrics import accuracy_score, f1_score
import os
def compute_metrics(pred):
    """Compute evaluation metrics from a Trainer ``EvalPrediction``.

    Takes the raw logits in ``pred.predictions``, reduces them to class ids
    via argmax over the last axis, and scores them against ``pred.label_ids``.

    Returns:
        dict with ``"accuracy"`` and weighted ``"f1"`` (weighted average
        handles class imbalance across sentiment labels).
    """
    true_labels = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(true_labels, predicted),
        "f1": f1_score(true_labels, predicted, average="weighted"),
    }
def _load_tokenizer_and_model(model_dir):
    """Load the fine-tuned tokenizer and model saved by train_model.py.

    Returns:
        (tokenizer, model) on success, or None if the directory is missing,
        empty, or either artifact fails to load (errors are printed, not raised,
        so the script exits gracefully).
    """
    print(f"Loading tokenizer from {model_dir}...")
    if not os.path.exists(model_dir) or not os.listdir(model_dir):
        print(f"Error: Model directory {model_dir} is empty or does not exist.")
        print("Please ensure the model has been trained and saved correctly using train_model.py.")
        return None
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_dir)
    except Exception as e:  # broad on purpose: surface any HF loading failure to the user
        print(f"Error loading tokenizer: {e}")
        return None
    print(f"Loading model from {model_dir}...")
    try:
        model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    except Exception as e:
        print(f"Error loading model: {e}")
        return None
    return tokenizer, model


def _load_test_dataset(test_data_dir):
    """Load the tokenized test split produced by prepare_dataset.py.

    Normalizes the HF datasets column name 'label' to the 'labels' name the
    Trainer expects, and verifies the tokenized columns are present.

    Returns:
        the dataset on success, or None on any failure (error is printed).
    """
    print(f"Loading test dataset from {test_data_dir}...")
    if not os.path.exists(test_data_dir):
        print(f"Error: Test data directory {test_data_dir} does not exist.")
        print("Please ensure the dataset has been processed using prepare_dataset.py.")
        return None
    try:
        test_dataset = load_from_disk(test_data_dir)
        # Ensure columns are correctly named for the Trainer
        if 'label' in test_dataset.column_names and 'labels' not in test_dataset.column_names:
            test_dataset = test_dataset.rename_column("label", "labels")
        required_cols = {'input_ids', 'attention_mask', 'labels'}
        if not required_cols.issubset(test_dataset.column_names):
            print(f"Error: Test dataset is missing required columns. Found: {test_dataset.column_names}. Required: {required_cols}")
            return None
    except Exception as e:
        print(f"Error loading test dataset: {e}")
        return None
    return test_dataset


def main():
    """Evaluate the fine-tuned sentiment model on the held-out test split.

    Loads model/tokenizer from ``sentiment_bert_model`` and the tokenized test
    set from ``processed_data/test``, runs Trainer.evaluate(), prints the
    metrics, and persists them under ``./eval_results``.
    """
    model_dir = "sentiment_bert_model"
    test_data_dir = "processed_data/test"
    results_dir = "./eval_results"  # Directory to save evaluation results/logs

    artifacts = _load_tokenizer_and_model(model_dir)
    if artifacts is None:
        return
    tokenizer, model = artifacts

    test_dataset = _load_test_dataset(test_data_dir)
    if test_dataset is None:
        return

    print("Defining training arguments for evaluation...")
    # Ensure the output directory for evaluation results exists
    os.makedirs(results_dir, exist_ok=True)
    training_args = TrainingArguments(
        output_dir=results_dir,
        per_device_eval_batch_size=8,
        do_train=False,
        do_eval=True,
        report_to="none"  # Disable reporting to wandb/tensorboard if not configured
    )

    print("Initializing Trainer for evaluation...")
    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

    print("Starting evaluation...")
    try:
        eval_results = trainer.evaluate()
        print("Evaluation Results:")
        for key, value in eval_results.items():
            print(f"  {key}: {value}")
        # Fix: results_dir was created "to save evaluation results" but nothing
        # was ever written there — persist the metrics as eval_results.json.
        trainer.save_metrics("eval", eval_results)
    except Exception as e:
        print(f"Error during evaluation: {e}")


if __name__ == "__main__":
    main()