# Twitter_COVID19/llm_tokenclassification.py (repository: elenanereiss/Twitter_COVID19)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
import sys
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from datetime import datetime as t
import json
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.model_selection import StratifiedShuffleSplit
#from sklearn.metrics import f1_score, classification_report, accuracy_score
from seqeval.metrics import accuracy_score, classification_report
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model


# https://github.com/adidror005/youtube-videos/blob/main/LLAMA_3_Fine_Tuning_for_Sequence_Classification_Actual_Video.ipynb
from datasets import DatasetDict, load_dataset, ClassLabel, Sequence
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback
)


#from transformers.utils import logging
#logging.set_verbosity_error()
from config import my_cache_dir, my_output_dir
from best_hp import hyperparameter

import warnings
warnings.filterwarnings('ignore')

def main():
    class CustomTrainer(Trainer):
        """Trainer variant that can apply per-class weights to the cross-entropy loss.

        Args:
            class_weights: Optional tensor (or plain sequence) holding one weight
                per label id. When omitted, an unweighted cross entropy is used.
            *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
        """

        def __init__(self, *args, class_weights=None, **kwargs):
            super().__init__(*args, **kwargs)
            if class_weights is not None:
                # as_tensor accepts tensors as well as lists/arrays; the detached
                # float32 copy is moved to the training device once, up front.
                self.class_weights = (
                    torch.as_tensor(class_weights, dtype=torch.float32)
                    .clone()
                    .detach()
                    .to(self.args.device)
                )
            else:
                self.class_weights = None

        def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
            """Compute the (optionally class-weighted) cross-entropy loss for a batch.

            Args:
                model: Model called as ``model(**inputs)``; its output must expose ``logits``.
                inputs: Batch dict; the ``"labels"`` entry is removed before the forward pass.
                return_outputs: When true, return ``(loss, outputs)`` instead of only the loss.
                num_items_in_batch: Accepted for compatibility with newer
                    ``transformers`` Trainer signatures; not used here.

            Returns:
                The loss tensor, or a ``(loss, outputs)`` tuple if ``return_outputs`` is set.
            """
            # cross_entropy needs integer class targets
            labels = inputs.pop("labels").long()

            outputs = model(**inputs)
            logits = outputs.get('logits')

            # F.cross_entropy expects the class dimension at index 1. Token
            # classification logits carry it last (one score vector per token),
            # so logits are flattened to (N, num_labels) and labels to (N,).
            # The float32 cast keeps the logits' dtype equal to the class weights'.
            flat_logits = logits.reshape(-1, logits.size(-1)).float()
            flat_labels = labels.reshape(-1).to(flat_logits.device)

            weight = self.class_weights
            if weight is not None:
                weight = weight.to(flat_logits.device)

            # Positions labelled -100 are skipped by cross_entropy's default ignore_index.
            loss = F.cross_entropy(flat_logits, flat_labels, weight=weight)

            return (loss, outputs) if return_outputs else loss

    def llama_preprocessing_function(examples):
        """Tokenize pre-split sentences and align word-level NER labels to the sub-tokens.

        Reads ``tokenizer`` and ``label_all_tokens`` from the enclosing scope.

        Args:
            examples: Batch with a ``"tokens"`` column (lists of words) and a
                ``"named_entity_recognition"`` column (one label per word).

        Returns:
            The tokenizer output with an added ``"labels"`` entry holding one
            label id per sub-token.
        """
        encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

        def align(batch_index, word_labels):
            # Map the labels of one sentence onto its sub-tokens.
            aligned = []
            last_word = None
            for word_idx in encoded.word_ids(batch_index=batch_index):
                if word_idx is None:
                    # Special tokens have no word id; -100 makes the loss ignore them.
                    aligned.append(-100)
                elif word_idx != last_word or label_all_tokens:
                    # First sub-token of a word, or a continuation piece when
                    # every piece should carry the word's label.
                    aligned.append(word_labels[word_idx])
                else:
                    # Continuation piece that is excluded from the loss.
                    aligned.append(-100)
                last_word = word_idx
            return aligned

        encoded["labels"] = [
            align(batch_index, word_labels)
            for batch_index, word_labels in enumerate(examples["named_entity_recognition"])
        ]
        return encoded


    # seqeval's classification_report works on tag strings, so label ids are decoded to their names first.
    # The precision, recall and F1 from Hugging Face use micro averaging, which gives poor scores on unbalanced data.
    def compute_metrics(p):
        """Return weighted-average precision, recall and F1 for an evaluation pass.

        Reads ``label_names`` from the enclosing scope to decode label ids.

        Args:
            p: Pair ``(predictions, labels)``; predictions hold per-token class
                scores (argmax is taken over axis 2) and labels hold label ids,
                with -100 marking positions to ignore.

        Returns:
            Dict with the keys ``"precision"``, ``"recall"`` and ``"f1"``.
        """
        scores, gold_ids = p
        predicted_ids = np.argmax(scores, axis=2)

        predicted_tags = []
        gold_tags = []
        for predicted_row, gold_row in zip(predicted_ids, gold_ids):
            # Drop ignored positions (label id -100) before decoding to tag names.
            kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
            predicted_tags.append([label_names[pred] for pred, _ in kept])
            gold_tags.append([label_names[gold] for _, gold in kept])

        # classification_report from seqeval (https://github.com/chakki-works/seqeval);
        # the weighted average is used for precision, recall and f1.
        report = classification_report(
            y_true=gold_tags,
            y_pred=predicted_tags,
            suffix=False,
            output_dict=True,
            scheme=None,
            mode=None,
            sample_weight=None,
            zero_division="warn",
            digits=4,
        )
        weighted = report['weighted avg']
        return {
            "precision": weighted["precision"],
            "recall": weighted["recall"],
            "f1": weighted["f1-score"],
        }


    def initialize_scores_dict(label_names):
        """Build the empty accumulator used to collect per-fold scores.

        Args:
            label_names: Iterable of tag names such as ``"O"``, ``"B-PER"``, ``"I-PER"``.

        Returns:
            Dict with one entry per entity type (tag name without its leading
            ``B-``/``I-`` prefix, ``"O"`` excluded) plus ``'micro avg'``,
            ``'macro avg'`` and ``'weighted avg'``. Each entry maps
            ``'precision'``, ``'recall'``, ``'f1-score'`` and ``'support'`` to
            its own empty list.
        """
        def new_entry():
            # A fresh dict with fresh lists, so no two entries share state.
            return {'precision': [], 'recall': [], 'f1-score': [], 'support': []}

        entity_types = set()
        for label in label_names:
            if label == "O":
                continue
            # Strip only a leading prefix: a plain str.replace would also eat
            # "B-"/"I-" inside the type name (e.g. "B-ANTI-VAX" -> "ANTVAX").
            if label.startswith(("B-", "I-")):
                label = label[2:]
            entity_types.add(label)

        # Sorted so the row order of the final report is the same on every run.
        d = {entity: new_entry() for entity in sorted(entity_types)}
        for average in ('micro avg', 'macro avg', 'weighted avg'):
            d[average] = new_entry()
        return d

    # collect all scores from cross validation
    def sum_cv_scores(cv_results, cv_number):
        """Append the scores of one cross-validation fold to the accumulator.

        Args:
            cv_results: Accumulator mapping entry name -> score name -> list of
                per-fold values; it is updated in place.
            cv_number: Report of a single fold, mapping entry name -> score
                name -> value.

        Returns:
            The same ``cv_results`` object.
        """
        for item, collected in cv_results.items():
            fold_scores = cv_number.get(item)
            if fold_scores is None:
                # This fold's report has no row for the entry (e.g. an entity
                # type that does not occur in the fold); skip it instead of
                # raising KeyError and losing the whole run.
                continue
            for score, values in collected.items():
                if score in fold_scores:
                    values.append(fold_scores[score])
        return cv_results


    # calculate mean of scores for cross validation
    def mean_cv_scores(cv_results):
        """Replace every list of per-fold scores by its arithmetic mean, in place.

        Args:
            cv_results: Accumulator mapping entry name -> score name -> list of
                per-fold values.

        Returns:
            The same ``cv_results`` object, now mapping each score name to a
            single number. A score without any collected value becomes NaN
            instead of raising ZeroDivisionError.
        """
        for scores in cv_results.values():
            # Only existing keys are reassigned, so iterating while updating is safe.
            for score, values in scores.items():
                scores[score] = sum(values) / len(values) if values else float("nan")
        return cv_results

    # Input model name (first command-line argument)
    if len(sys.argv) < 2:
        sys.exit("Usage: python {} <checkpoint>".format(sys.argv[0]))
    checkpoint = sys.argv[1]

    print(checkpoint)

    # Local import: only needed for the per-fold memory cleanup below.
    import gc

    # The result files are written below "results/"; create the folder up front
    # so opening them cannot fail on a missing directory.
    os.makedirs("results", exist_ok=True)

    # The tokenizer depends only on the checkpoint, so it is loaded once instead
    # of once per fold. llama_preprocessing_function reads `tokenizer` and
    # `label_all_tokens` from this scope.
    # NOTE(review): my_cache_dir / my_output_dir are imported from config at the
    # top of the file, but the literal paths below are what is actually used —
    # confirm which one is intended.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, add_prefix_space=True, cache_dir="/cache_dir")

    # reuse the EOS token as padding token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token

    # every sub-token of a word gets the word's label (see llama_preprocessing_function)
    label_all_tokens = True

    for task in ['named_entity_recognition']:
        print(task)
        scores_dict = {}
        # collect labels from dataset; compute_metrics reads `label_names` from this scope
        dataset = load_dataset("json", data_files="data/Twitter_COVID19.json", field="Twitter_COVID-19_Dataset")
        label_names = sorted({label for sentence in dataset["train"][task] for label in sentence})
        num_lab = len(label_names)

        # 5 fold cross-validation
        fold = 5

        # for scores from cross-validation
        cv_results = initialize_scores_dict(label_names)

        # save results to folder results; the context manager closes the file
        # even when a fold raises
        file_name = "results/{}_{}.txt".format(task, checkpoint.split("/")[-1])
        with open(file_name, "w+", encoding="utf-8") as w:
            w.write("{}\n\n".format(checkpoint))

            # start cross-validation
            for n in range(0, fold):
                print("\n****************************Cross-validation number {}****************************\n".format(n+1))
                # load the pre-split fold files
                dataset_train = load_dataset("json", data_files="data/train_cv"+str(n))
                dataset_valid = load_dataset("json", data_files="data/val_cv"+str(n))
                dataset_train = dataset_train['train']
                dataset_valid = dataset_valid['train']

                # Cast to ClassLabel, assign labels to codes
                dataset_train = dataset_train.cast_column("named_entity_recognition", Sequence(ClassLabel(names=label_names)))
                dataset_valid = dataset_valid.cast_column("named_entity_recognition", Sequence(ClassLabel(names=label_names)))

                # all original columns are dropped after tokenization
                col_to_delete = [*dataset_train.features]

                # Shuffle the training dataset
                dataset_train_shuffled = dataset_train.shuffle(seed=42)  # Using a seed for reproducibility

                # Huggingface dataset object (the validation fold doubles as test split)
                hf_dataset = DatasetDict({
                    'train': dataset_train_shuffled,
                    'val': dataset_valid,
                    'test': dataset_valid
                })

                # calculate class weights based on inverse value counts
                # (only consumed when CustomTrainer(..., class_weights=class_weights)
                # is used instead of the plain Trainer below)
                target = [label for sentence in dataset_train[task] for label in sentence]
                df = pd.DataFrame({'target': target})
                class_weights = (1/df.target.value_counts(normalize=True).sort_index()).tolist()
                class_weights = torch.tensor(class_weights)
                class_weights = class_weights/class_weights.sum()

                quantization_config = BitsAndBytesConfig(
                    load_in_4bit = True, # enable 4-bit quantization
                    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
                    bnb_4bit_use_double_quant = True, # quantize quantized weights
                    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
                )

                lora_config = LoraConfig(
                    r = 16, # the dimension of the low-rank matrices
                    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
                    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
                    lora_dropout = 0.05, # dropout probability of the LoRA layers
                    bias = 'none', # whether to train bias weights, set to 'none' for attention layers
                    # the model is an AutoModelForTokenClassification, so the PEFT
                    # task type has to be token (not sequence) classification
                    task_type = 'TOKEN_CLS'
                )

                # a fresh model is loaded for every fold so folds do not share weights
                model = AutoModelForTokenClassification.from_pretrained(
                    checkpoint,
                    quantization_config=quantization_config,
                    num_labels=num_lab,
                    cache_dir="/cache_dir"
                )

                model = prepare_model_for_kbit_training(model)
                model = get_peft_model(model, lora_config)

                model.config.pad_token_id = tokenizer.pad_token_id
                model.config.use_cache = False
                model.config.pretraining_tp = 1

                tokenized_datasets = hf_dataset.map(llama_preprocessing_function, batched=True, remove_columns=col_to_delete)
                tokenized_datasets.set_format("torch")

                collate_fn = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

                training_args = TrainingArguments(
                    output_dir = "/output",
                    learning_rate = 4e-5,
                    per_device_train_batch_size = 8,
                    per_device_eval_batch_size = 8,
                    num_train_epochs = 20,
                    eval_steps=50,
                    save_steps=50,
                    metric_for_best_model="eval_loss",
                    weight_decay = 0.01,
                    eval_strategy="steps",
                    load_best_model_at_end = True
                )

                trainer = Trainer(
                    model = model,
                    args = training_args,
                    train_dataset = tokenized_datasets['train'],
                    eval_dataset = tokenized_datasets['val'],
                    processing_class = tokenizer, # arg processing_class new for transformers 4.46
                    data_collator = collate_fn,
                    compute_metrics = compute_metrics,
                    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
                )

                trainer.train()

                # Evaluation pass on the validation set (return value not needed here)
                trainer.evaluate()

                predictions, labels, _ = trainer.predict(tokenized_datasets['val'])
                predictions = np.argmax(predictions, axis=2)

                # Remove ignored index (special tokens)
                predicted_labels = [[label_names[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
                true_labels = [[label_names[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

                # Human-readable report for console and result file
                report_text = classification_report(y_true=true_labels, y_pred=predicted_labels, suffix=False, output_dict=False, scheme=None, mode=None, sample_weight=None, zero_division=0, digits=4)
                print(report_text)

                # Same report as a dict, collected for the cross-validation average
                results_dict = classification_report(y_true=true_labels, y_pred=predicted_labels, suffix=False, output_dict=True, scheme=None, mode=None, sample_weight=None, zero_division=0, digits=4)
                cv_results = sum_cv_scores(cv_results, results_dict)

                # Write results to file
                w.write("**************************************************Cross-validation number {}**************************************************\n\n".format(n+1))
                w.write(report_text)
                w.write("\n")

                # Drop this fold's model before the next one is loaded, so two
                # models are not held in memory at the same time.
                del trainer, model
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            cv_results = mean_cv_scores(cv_results)
            w.write("************************************************Results of 5 fold cross-validation************************************************\n\n")
            w.write('%20s%12s%12s%12s%12s\n\n' % ('', 'precision', 'recall', 'f1-score', 'support'))
            for key, value in cv_results.items():
                # support is printed with 2 decimals, every other score with 4
                cells = "".join(
                    ('%12.2f' if key2 == 'support' else '%12.4f') % value2
                    for key2, value2 in value.items()
                )
                w.write('%20s%s\n' % (key, cells))

        scores_dict[checkpoint] = cv_results["weighted avg"]
        with open("results/llm_{}_scores_dict.py".format(task), "w", encoding="utf-8") as scores:
            scores.write("{} = {}\n".format(task, scores_dict))

if __name__ == "__main__":
    main()