Twitter_COVID19/tokenclassification.py at main · elenanereiss/Twitter_COVID19 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# -*- coding: utf-8 -*-

import sys
from datasets import load_dataset, load_metric, ClassLabel, Sequence
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import evaluate
from sklearn.metrics import classification_report
from seqeval.metrics import accuracy_score, classification_report
import numpy as np
import json
from config import my_cache_dir, my_output_dir
from best_hp import hyperparameter


# collect all scores from cross validation
def sum_cv_scores(cv_results, cv_number):
    """Append the scores of one cross-validation fold to the accumulator.

    Args:
        cv_results: nested dict ``{row: {score_name: [values, ...]}}`` that
            collects one value per fold. It is modified in place.
        cv_number: nested dict ``{row: {score_name: value}}`` holding the
            scores of a single fold. Rows or scores that are not present in
            ``cv_results`` are ignored.

    Returns:
        The same ``cv_results`` object, after appending.

    Raises:
        KeyError: if ``cv_number`` lacks a row/score that ``cv_results`` tracks.
    """
    for row_name, collected in cv_results.items():
        for score_name, values in collected.items():
            # Look the fold value up per score so that a row without any
            # tracked score never touches cv_number at all.
            values.append(cv_number[row_name][score_name])
    return cv_results


# calculate mean of scores for cross validation
def mean_cv_scores(cv_results):
    """Replace every collected list of fold scores by its arithmetic mean.

    Args:
        cv_results: nested dict ``{row: {score_name: [values, ...]}}``.
            It is modified in place: each list is overwritten by a number.

    Returns:
        The same ``cv_results`` object, now ``{row: {score_name: mean}}``.

    Raises:
        ZeroDivisionError: if one of the lists is empty.
    """
    for collected in cv_results.values():
        # Re-assigning an existing key does not resize the dict, so it is
        # safe to do while iterating over its items.
        for score_name, values in collected.items():
            collected[score_name] = sum(values) / len(values)
    return cv_results


# for sklearn classification_report we need to decode and encode labels to int32
# the precision, recall and f1 from huggingface use the micro average -> bad score for unbalanced data
def compute_metrics(p):
    """Compute weighted precision/recall/f1 and accuracy for one evaluation.

    Reads the module-level ``label_list`` to map label ids back to tag strings.

    Args:
        p: a pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2, so it presumably has the shape
            (sequences, tokens, classes) - TODO confirm against the Trainer.
            ``labels`` holds label ids, with -100 marking ignored positions.

    Returns:
        dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Drop the ignored positions (label id -100) and decode ids to tag strings
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    # classification_report from seqeval (https://github.com/chakki-works/seqeval);
    # the weighted average is used for precision, recall and f1
    report = classification_report(
        y_true=true_labels,
        y_pred=true_predictions,
        suffix=False,
        output_dict=True,
        scheme=None,
        mode=None,
        sample_weight=None,
        zero_division="warn",
        digits=4,
    )
    weighted = report['weighted avg']
    accuracy = accuracy_score(true_labels, true_predictions)
    return {
        "precision": weighted["precision"],
        "recall": weighted["recall"],
        "f1": weighted["f1-score"],
        "accuracy": accuracy,
    }


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to tokens.

    Reads the module-level ``tokenizer`` and ``label_all_tokens``.

    Args:
        examples: batch with the columns ``"tokens"`` (words of each sentence)
            and ``"named_entity_recognition"`` (one label per word).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        holding one list of label ids per sentence.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(word_ids, word_labels):
        # Map the per-word labels of one sentence onto its tokens.
        aligned = []
        previous_word = None
        for current_word in word_ids:
            if current_word is None:
                # Special tokens have no word id; -100 makes the loss function ignore them.
                aligned.append(-100)
            elif current_word != previous_word or label_all_tokens:
                # First token of a word, or a later token while label_all_tokens is set.
                aligned.append(word_labels[current_word])
            else:
                # Later token of a word while label_all_tokens is off.
                aligned.append(-100)
            previous_word = current_word
        return aligned

    tokenized_inputs["labels"] = [
        align(tokenized_inputs.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["named_entity_recognition"])
    ]
    return tokenized_inputs


if __name__ == "__main__":
    # Usage: python tokenclassification.py <checkpoint> [<checkpoint> ...]
    #
    # For every checkpoint given on the command line a model is fine-tuned for NER
    # with 5-fold cross-validation; the per-fold reports and the averaged scores are
    # written to results/ner_<checkpoint>.txt, and the weighted averages of all
    # checkpoints to results/ner_scores_dict.py.
    #
    # NOTE: this code deliberately stays at module level instead of a main() function:
    # compute_metrics and tokenize_and_align_labels read `label_list`, `tokenizer` and
    # `label_all_tokens` as module globals.
    import os

    model_list = sys.argv[1:]

    # save scores for all models in dict
    scores_dict = {}

    # 5 fold cross-validation
    fold = 5

    label_list = ['B-ATTRIBUTE', 'B-DISEASE', 'B-LOCATION', 'B-LOCATION_BODY', 'B-MEASURE', 'B-MORTALITY', 'B-ORGANIZATION', 'B-PERSON', 'B-SYMPTOM', 'B-TIME', 'I-ATTRIBUTE', 'I-DISEASE', 'I-LOCATION', 'I-LOCATION_BODY', 'I-MEASURE', 'I-MORTALITY', 'I-ORGANIZATION', 'I-PERSON', 'I-SYMPTOM', 'I-TIME', 'O']
    num_lab = len(label_list)  # named_entity_recognition

    # every token of a word gets the word's label (see tokenize_and_align_labels)
    label_all_tokens = True

    # rows of the classification report: one per entity type (in label_list order)
    # followed by the three averages
    entity_types = [name[2:] for name in label_list if name.startswith("B-")]
    report_rows = entity_types + ['micro avg', 'macro avg', 'weighted avg']
    score_names = ('precision', 'recall', 'f1-score', 'support')

    # dataset columns that are dropped after tokenization
    remove_columns = ['title', 'tokens', 'named_entity_recognition', 'relations', 'informativeness', 'topic', 'credibility']

    # checkpoints for which tuned hyperparameters are looked up in best_hp
    tuned_checkpoints = ["bert-base-german-cased", "bert-base-multilingual-cased", "bert-base-multilingual-uncased", "Twitter/twhin-bert-base", "Twitter/twhin-bert-large"]

    # make sure the output folder exists before anything is written to it
    os.makedirs("results", exist_ok=True)

    # start training for each model
    for checkpoint in model_list:
        print("****************************Training of {}****************************".format(checkpoint))

        # for scores from cross-validation: one empty list per report row and score
        cv_results = {row: {score: [] for score in score_names} for row in report_rows}

        # default hyperparameters for other models
        if checkpoint not in tuned_checkpoints:
            hyperparameters = {'learning_rate': 1e-5, 'num_train_epochs': 8, 'seed': 42, 'per_device_train_batch_size': 8, 'weight_decay': 0, 'adam_epsilon': 1e-8, 'gradient_accumulation_steps': 1}
        else:
            hyperparameters = hyperparameter["ner"][checkpoint]['hyperparameters']

        # save results to folder results; the context manager closes the file even
        # if training fails half-way
        file_name = "results/ner_{}.txt".format(checkpoint.split("/")[-1])
        with open(file_name, "w", encoding="utf-8") as w:
            w.write("{}\n\n".format(checkpoint))

            # tokenizer and collator do not depend on the fold, so load them once per checkpoint
            tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=my_cache_dir)
            data_collator = DataCollatorForTokenClassification(tokenizer)

            # start cross-validation
            for n in range(fold):
                print("****************************Cross-validation number {}****************************".format(n+1))
                # split dataset
                dataset_train = load_dataset("json", data_files="data/train_cv"+str(n))['train']
                dataset_valid = load_dataset("json", data_files="data/val_cv"+str(n))['train']

                # Cast to ClassLabel
                label_feature = Sequence(ClassLabel(names=label_list))
                dataset_train = dataset_train.cast_column("named_entity_recognition", label_feature)
                dataset_valid = dataset_valid.cast_column("named_entity_recognition", label_feature)

                dataset_train = dataset_train.map(tokenize_and_align_labels, batched=True, remove_columns=remove_columns)
                dataset_valid = dataset_valid.map(tokenize_and_align_labels, batched=True, remove_columns=remove_columns)

                # a fresh model per fold so that no weights leak between folds
                model = AutoModelForTokenClassification.from_pretrained(
                    checkpoint, num_labels=num_lab, cache_dir=my_cache_dir
                )
                # `optim` is left at the library default; passing optim="adamw_torch"
                # silences the AdamW deprecation FutureWarning of transformers.
                training_args = TrainingArguments(
                    learning_rate=hyperparameters["learning_rate"],
                    output_dir=my_output_dir,
                    overwrite_output_dir=True,
                    per_device_train_batch_size=hyperparameters["per_device_train_batch_size"],
                    num_train_epochs=hyperparameters["num_train_epochs"],
                    seed=hyperparameters["seed"],
                    adam_epsilon=hyperparameters["adam_epsilon"],
                    weight_decay=hyperparameters["weight_decay"],
                    gradient_accumulation_steps=hyperparameters["gradient_accumulation_steps"],
                    evaluation_strategy="steps",
                    eval_steps=500,
                )

                trainer = Trainer(
                    model=model,
                    args=training_args,
                    train_dataset=dataset_train,
                    eval_dataset=dataset_valid,
                    data_collator=data_collator,
                    tokenizer=tokenizer,
                    compute_metrics=compute_metrics,
                )

                trainer.train()

                # NOTE(review): the returned metrics are not used; the call is kept only
                # for whatever logging Trainer.evaluate performs - drop it if that output
                # is not needed, trainer.predict below runs over the same data again.
                trainer.evaluate()

                # Classification report for validation set
                predictions, labels, _ = trainer.predict(dataset_valid)
                predictions = np.argmax(predictions, axis=2)

                # Remove ignored index (special tokens)
                true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
                true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

                # the same report twice: as text for printing/writing, as dict for averaging
                report_kwargs = dict(y_true=true_labels, y_pred=true_predictions, suffix=False, scheme=None, mode=None, sample_weight=None, zero_division=0, digits=4)
                results = classification_report(output_dict=False, **report_kwargs)
                print(results)

                results_dict = classification_report(output_dict=True, **report_kwargs)
                cv_results = sum_cv_scores(cv_results, results_dict)

                # Write results to file
                w.write("**************************************************Cross-validation number {}**************************************************\n\n".format(n+1))
                w.write(results)
                w.write("\n")

            cv_results = mean_cv_scores(cv_results)
            w.write("************************************************Results of {} fold cross-validation************************************************\n\n".format(fold))
            w.write('%20s%12s%12s%12s%12s\n\n' % ('', 'precision', 'recall', 'f1-score', 'support'))
            for row, row_scores in cv_results.items():
                # support is written with 2 decimals, all other scores with 4
                cells = "".join(
                    ('%12.2f' if score == 'support' else '%12.4f') % value
                    for score, value in row_scores.items()
                )
                w.write('%20s%s\n' % (row, cells))

        scores_dict[checkpoint] = cv_results["weighted avg"]

    # weighted averages of all trained checkpoints, stored as an importable python dict
    with open("results/ner_scores_dict.py", "w", encoding="utf-8") as scores:
        scores.write("ner = {}\n".format(scores_dict))