# -*- coding: utf-8 -*-
import sys

import numpy as np
import evaluate
from datasets import load_dataset, ClassLabel
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers.utils import logging

from config import my_cache_dir, my_output_dir
from best_hp import hyperparameter

# Silence progress and info messages from transformers
logging.set_verbosity_error()
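
# Usage:
#   python textclassification.py <task> <model_checkpoint> [<model_checkpoint> ...]
# where <task> is one of 'informativeness', 'topic' or 'credibility', e.g.:
#   python textclassification.py topic bert-base-german-cased Twitter/twhin-bert-base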

# Append the scores of one cross-validation fold to the running results.
# cv_number is a classification_report dict (output_dict=True) of a single fold.
def sum_cv_scores(cv_results, cv_number):
    for item in cv_number.keys():
        if item != 'accuracy':
            for score in cv_number[item].keys():
                cv_results[item][score].append(cv_number[item][score])
    cv_results['accuracy'].append(cv_number['accuracy'])
    return cv_results

# Average the collected scores over all cross-validation folds
def mean_cv_scores(cv_results):
    for item in cv_results.keys():
        if item != 'accuracy':
            for score in cv_results[item].keys():
                cv_results[item][score] = sum(cv_results[item][score]) / len(cv_results[item][score])
    cv_results['accuracy'] = sum(cv_results['accuracy']) / len(cv_results['accuracy'])
    return cv_results
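
# For example (hypothetical values): after two folds
#   cv_results['weighted avg']['f1-score'] == [0.81, 0.79]
# and mean_cv_scores reduces each such list to its mean, here 0.80.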

# Metrics reported during evaluation: weighted precision/recall/F1 plus accuracy
def compute_metrics(eval_pred):
    metric1 = evaluate.load("precision")
    metric2 = evaluate.load("recall")
    metric3 = evaluate.load("f1")
    metric4 = evaluate.load("accuracy")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    precision = metric1.compute(predictions=predictions, references=labels, average="weighted")["precision"]
    recall = metric2.compute(predictions=predictions, references=labels, average="weighted")["recall"]
    f1 = metric3.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    accuracy = metric4.compute(predictions=predictions, references=labels)["accuracy"]
    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}

# Encode the string label of the current task as an integer id
# (relies on the module-level `task` and `ClassLabels` set in __main__)
def map_label2id(example):
    example['labels'] = ClassLabels.str2int(example[task])
    return example

# Tokenize the tweet text; relies on the tokenizer set per checkpoint in __main__
def tokenize_data(example):
    return tokenizer(example['text'], padding='max_length', truncation=True)

# Empty result structure mirroring sklearn's classification_report(output_dict=True)
def initialize_scores_dict(labels):
    d = {}
    for label in labels:
        d[label] = {'precision': [], 'recall': [], 'f1-score': [], 'support': []}
    d.update({'accuracy': [],
              'macro avg': {'precision': [], 'recall': [], 'f1-score': [], 'support': []},
              'weighted avg': {'precision': [], 'recall': [], 'f1-score': [], 'support': []}})
    return d

if __name__ == "__main__":
    task = sys.argv[1]
    # Check the task name
    if task not in ['informativeness', 'topic', 'credibility']:
        print("{} is an unknown task. Please choose between ['informativeness', 'topic', 'credibility']".format(task))
        sys.exit(1)
    model_list = sys.argv[2:]
    # Collect the label set of the chosen task from the dataset
    dataset = load_dataset("json", data_files="data/Twitter_COVID19.json", field="Twitter_COVID-19_Dataset")
    labels = sorted(set(dataset["train"][task]))
    num_lab = len(labels)
    # Cast to ClassLabel for the string<->id mapping
    ClassLabels = ClassLabel(num_classes=num_lab, names=labels)
    scores_dict = {}
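    # ClassLabel provides a stable two-way mapping between label strings and ids,
    # e.g. with labels ['A', 'B']: ClassLabels.str2int('B') == 1 and
    # ClassLabels.int2str(1) == 'B'.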
    # Train each model passed on the command line
    for checkpoint in model_list:
        print("\n\n\n****************************Training of {}****************************\n".format(checkpoint))
        # 5-fold cross-validation
        fold = 5
        # Tuned hyperparameters from best_hp are available only for the checkpoints
        # listed below; all other models fall back to the defaults.
        if checkpoint not in ["bert-base-german-cased", "bert-base-multilingual-cased", "bert-base-multilingual-uncased", "Twitter/twhin-bert-base", "Twitter/twhin-bert-large"]:
            hyperparameters = {'learning_rate': 1e-5, 'num_train_epochs': 3, 'seed': 42, 'per_device_train_batch_size': 8, 'weight_decay': 0, 'adam_epsilon': 1e-8, 'gradient_accumulation_steps': 1}
        else:
            hyperparameters = hyperparameter[task][checkpoint]['hyperparameters']
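        # best_hp.hyperparameter is assumed to be nested as
        # hyperparameter[task][checkpoint]['hyperparameters'], holding the same
        # keys as the default dict above.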
        # Results are written to the "results" folder
        file_name = "results/{}_{}.txt".format(task, checkpoint.split("/")[-1])
        w = open(file_name, "w+", encoding="utf-8")
        w.write("{}\n\n".format(checkpoint))
        # Accumulator for the per-fold cross-validation scores
        cv_results = initialize_scores_dict(labels)
        # Run the cross-validation folds
        for n in range(0, fold):
            print("\n****************************Cross-validation number {}****************************\n".format(n+1))
            # Load the precomputed train/validation split of this fold
            dataset_train = load_dataset("json", data_files="data/train_cv"+str(n))['train']
            dataset_valid = load_dataset("json", data_files="data/val_cv"+str(n))['train']
            tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=my_cache_dir)
            dataset_train = dataset_train.map(tokenize_data, batched=True)
            dataset_valid = dataset_valid.map(tokenize_data, batched=True)
            # Keep the gold string labels for the classification report
            valid_labels = dataset_valid[task]
            remove_columns = ['title', 'tokens', 'named_entity_recognition', 'relations', 'informativeness', 'topic', 'credibility']
            dataset_train = dataset_train.map(map_label2id, remove_columns=remove_columns)
            dataset_valid = dataset_valid.map(map_label2id, remove_columns=remove_columns)
            model = AutoModelForSequenceClassification.from_pretrained(
                checkpoint, num_labels=num_lab, cache_dir=my_cache_dir
            )
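            # A fresh model is initialized from the pretrained checkpoint in every
            # fold so that no fine-tuned weights leak between folds.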
            training_args = TrainingArguments(
                learning_rate=hyperparameters["learning_rate"],
                output_dir=my_output_dir,
                overwrite_output_dir=True,
                per_device_train_batch_size=hyperparameters["per_device_train_batch_size"],
                num_train_epochs=hyperparameters["num_train_epochs"],
                seed=hyperparameters["seed"],
                adam_epsilon=hyperparameters["adam_epsilon"],
                weight_decay=hyperparameters["weight_decay"],
                gradient_accumulation_steps=hyperparameters["gradient_accumulation_steps"],
                evaluation_strategy="steps",
                eval_steps=500,
                # optim="adamw_torch" would silence the FutureWarning about the
                # deprecated transformers AdamW implementation
            )
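            # Note: the effective batch size per device is
            # per_device_train_batch_size * gradient_accumulation_steps.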
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=dataset_train,
                eval_dataset=dataset_valid,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics,
            )
            trainer.train()
            # Classification report for the validation set
            predicted_results = trainer.predict(dataset_valid)
            predicted_labels = predicted_results.predictions.argmax(-1)  # highest-scoring class id
            predicted_labels = predicted_labels.flatten().tolist()  # flatten to a 1D list
            predicted_labels = [ClassLabels.int2str(l) for l in predicted_labels]  # back to strings for readability
            cv_results = sum_cv_scores(cv_results, classification_report(valid_labels, predicted_labels, zero_division=0, digits=4, output_dict=True))
            results = classification_report(valid_labels, predicted_labels, zero_division=0, digits=4)
            print(results)
            # Write the per-fold report to file
            w.write("**************************************************Cross-validation number {}**************************************************\n\n".format(n+1))
            w.write(results)
            w.write("\n")
        # Average the per-fold scores and append a summary table
        cv_results = mean_cv_scores(cv_results)
        w.write("************************************************Results of 5-fold cross-validation************************************************\n\n")
        w.write('%20s%12s%12s%12s%12s\n\n' % ('', 'precision', 'recall', 'f1-score', 'support'))
        w.write('%20s%*.*f\n' % ('accuracy', 36, 4, cv_results['accuracy']))
        for key, value in cv_results.items():
            if key != 'accuracy':
                string = ""
                for key2, value2 in cv_results[key].items():
                    if key2 != 'support':
                        string += '%*.*f' % (12, 4, value2)
                    else:
                        string += '%*.*f' % (12, 2, value2)
                w.write('%20s%s\n' % (key, string))
        w.close()
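        # The column layout above matches the text output of sklearn's
        # classification_report, so the averaged table lines up with the
        # per-fold reports written earlier.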
        scores_dict[checkpoint] = cv_results["weighted avg"]
    # Persist the weighted-average scores of all models as an importable Python module
    scores = open("results/{}_scores_dict.py".format(task), "w", encoding="utf-8")
    scores.write("{} = {}\n".format(task, scores_dict))
    scores.close()
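
# The generated results/<task>_scores_dict.py contains a single assignment and
# can be imported elsewhere, e.g. (hypothetical values):
#   topic = {'bert-base-german-cased': {'precision': 0.85, 'recall': 0.84,
#            'f1-score': 0.84, 'support': 250.0}}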