diff --git a/all_glosses/data_selection.py b/all_glosses/data_selection.py
new file mode 100644
index 0000000..d927527
--- /dev/null
+++ b/all_glosses/data_selection.py
@@ -0,0 +1,32 @@
+import os
+
+import pandas as pd
+
+
+def read(text_info, mms_info):
+    data_list = []
+    (text_directory, text_encoding) = text_info
+    print("text_directory: ", text_directory)
+    (mms_directory, mms_encoding) = mms_info
+    for filenumber in os.listdir(text_directory):
+        f = os.path.join(mms_directory, filenumber + ".mms")
+        try:
+            # na_filter=False keeps empty dom/ndom glosses as "" instead of NaN
+            df = pd.read_csv(f, na_filter=False, encoding=mms_encoding)
+        except FileNotFoundError as e:
+            print(f"WARNING: Text file exists while mms file does not, skipping: {e}")
+            continue
+
+        text_address = os.path.join(text_directory, filenumber, "gebaerdler.Text_Deutsch.annotation~")
+        with open(text_address, encoding=text_encoding) as file:
+            lines = file.readlines()
+        # The third semicolon-separated field of each annotation line is the German text
+        text_line = " ".join(line.replace("\n", "").split(";")[2] for line in lines)
+        glosses = df["maingloss"] + "_" + df["domgloss"] + "_" + df["ndomgloss"]
+        gloss_line = " ".join(glosses.tolist())
+        data_dict = {"file_ID": filenumber, "text": text_line, "gloss": gloss_line}
+        data_list.append(data_dict)
+    return data_list
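+
+# Illustrative record shape (values are hypothetical, not from the corpus):
+#   {"file_ID": "123", "text": "Zug nach ...", "gloss": "ZUG_A_B NACH_C_D ..."}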
diff --git a/all_glosses/datasets.py b/all_glosses/datasets.py
new file mode 100644
index 0000000..e9b955a
--- /dev/null
+++ b/all_glosses/datasets.py
@@ -0,0 +1,83 @@
+import torch
+from torch.utils.data import Dataset
+from transformers import AutoTokenizer
+
+from . import data_selection
+
+mms_directories = [
+    ("mms-subset91", 'latin-1'),
+    ("modified/location/mms", 'utf-8'),
+    ("modified/platform/mms", 'utf-8'),
+    ("modified/time/mms", 'utf-8'),
+    ("modified/train_name/mms", 'utf-8'),
+]
+text_directories = [
+    ("annotations_full/annotations", 'latin-1'),
+    ("modified/location/text", 'utf-8'),
+    ("modified/platform/text", 'utf-8'),
+    ("modified/time/text", 'utf-8'),
+    ("modified/train_name/text", 'utf-8'),
+]
+
+checkpoint = 'facebook/nllb-200-distilled-600M'  # NLLB checkpoint
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+def read():
+    data_list_only_original = []
+    data_list_only_modified = []
+    for i, text_info in enumerate(text_directories):
+        mms_info = mms_directories[i]
+        data_list_one = data_selection.read(text_info, mms_info)
+        if i == 0:
+            data_list_only_original += data_list_one
+        else:
+            data_list_only_modified += data_list_one
+
+    data_list_full = data_list_only_original + data_list_only_modified
+
+    return (data_list_only_original, data_list_only_modified, data_list_full)
+
+
+class SignLanguageDataset(Dataset):
+    def __init__(self, data_list, tokenizer, max_length=512):
+        self.data_list = data_list
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+        self.vocab_size = len(tokenizer)
+
+    def __len__(self):
+        return len(self.data_list)
+
+    def __getitem__(self, idx):
+        data = self.data_list[idx]
+        file_Id = data['file_ID']
+        text_tokens = self.tokenizer.encode(data['text'], add_special_tokens=True)
+        text_tokens = torch.tensor(text_tokens)
+
+        gloss_tokens = self.tokenizer.encode(data['gloss'].lower(), add_special_tokens=True)
+        gloss_tokens = torch.tensor(gloss_tokens)
+
+        return file_Id, text_tokens, gloss_tokens
+
+
+def collate_fn(batch):
+    file_Id, text_tokens, gloss_tokens = zip(*batch)
+    padding_value = tokenizer.pad_token_id  # for NLLB the padding token id is 1
+
+    text_tokens_padded = torch.nn.utils.rnn.pad_sequence(text_tokens, batch_first=True, padding_value=padding_value)
+    gloss_tokens_padded = torch.nn.utils.rnn.pad_sequence(gloss_tokens, batch_first=True, padding_value=padding_value)
+
+    # Ensure text and gloss batches share the same sequence length
+    max_len = max(text_tokens_padded.size(1), gloss_tokens_padded.size(1))
+
+    text_tokens_padded = torch.nn.functional.pad(text_tokens_padded, (0, max_len - text_tokens_padded.size(1)), value=padding_value)
+    gloss_tokens_padded = torch.nn.functional.pad(gloss_tokens_padded, (0, max_len - gloss_tokens_padded.size(1)), value=padding_value)
+
+    return file_Id, text_tokens_padded, gloss_tokens_padded
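+
+# Usage sketch (assumes the definitions above; shapes depend on the tokenizer):
+#   original, modified, full = read()
+#   ds = SignLanguageDataset(original, tokenizer)
+#   loader = torch.utils.data.DataLoader(ds, batch_size=8, collate_fn=collate_fn)
+#   ids, text_batch, gloss_batch = next(iter(loader))
+#   # text_batch and gloss_batch are LongTensors of shape (batch, max_len)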
diff --git a/all_glosses/nllb.py b/all_glosses/nllb.py
new file mode 100644
index 0000000..ee90cbc
--- /dev/null
+++ b/all_glosses/nllb.py
@@ -0,0 +1,287 @@
+import torch
+import numpy as np
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import os
+from sacrebleu.metrics import BLEU
+from nltk.translate.bleu_score import sentence_bleu
+from . import datasets
+from pathlib import Path
+from sklearn.model_selection import KFold
+from torch.utils.data import DataLoader
+import time
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+checkpoint = 'facebook/nllb-200-distilled-600M'  # NLLB checkpoint
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+def train(fold, ds, augment):
+
+    if not augment:
+        augment_dir = "original_data"
+    else:
+        augment_dir = "aug_data"
+    save_folder = os.path.join("/ds/videos/AVASAG/allgloss_tg/", augment_dir, "nllb")
+    save_file_path = os.path.join(save_folder, "result")
+    Path(save_folder).mkdir(parents=True, exist_ok=True)
+
+    (original, modified, full) = ds
+    dataset = original
+
+    # Split the dataset into 10 folds
+    kf = KFold(n_splits=10, shuffle=True, random_state=42)
+    folds = list(kf.split(dataset))
+
+    # Split the dataset into train and test sets based on the current fold
+    train_indices = folds[fold][0]
+    test_indices = folds[fold][1]
+    train_data = [dataset[idx] for idx in train_indices]
+    test_data = [dataset[idx] for idx in test_indices]
+
+    # Augment the training data if augment=True
+    if augment:
+        train_data = augment_data(train_data, modified)
+
+    train_dataset = datasets.SignLanguageDataset(train_data, tokenizer)
+    train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=datasets.collate_fn)
+
+    test_dataset = datasets.SignLanguageDataset(test_data, tokenizer)
+    test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=datasets.collate_fn)
+
+    NUM_EPOCHS = 1000
+    loss_history = []
+
+    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)
+    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
+
+    train_log = open(save_file_path + f"_fold_{fold}_train_log.txt", 'w')
+
+    best_epoch = 0
+
+    for epoch in range(1, NUM_EPOCHS + 1):
+        start_time = time.time()
+        train_loss = train_epoch(model, train_dataloader, optimizer, tokenizer)
+
+        end_time = time.time()
+        log = "Epoch: " + str(epoch) + ", Train loss: " + str(train_loss) + " Epoch duration " + str(end_time - start_time) + "\n"
+        train_log.write(log)
+        # Save a checkpoint whenever the training loss reaches a new minimum
+        if epoch <= 1 or train_loss < min(loss_history):
+            best_model_path = save_file_path + f"_fold_{fold}_best_model.pt"
+            torch.save(model.state_dict(), best_model_path)
+            log = "min so far is at epoch: " + str(epoch) + "\n"
+            train_log.write(log)
+            best_epoch = epoch
+
+        loss_history.append(train_loss)
+
+    log = "best epoch is: " + str(best_epoch)
+    train_log.write(log)
+    train_log.close()
+
+    torch.save(model.state_dict(), save_file_path + f"_fold_{fold}_last_model.pt")
+
+    return test_dataloader, save_file_path
+
+def extract_glosses(glosses):
+    # Each token is "MAIN_DOM_NDOM"; missing parts are simply absent
+    main_glosses, dom_glosses, ndom_glosses = [], [], []
+    for gloss in glosses.split():
+        glosses_split = gloss.split("_")
+        if len(glosses_split) > 0:
+            main_glosses.append(glosses_split[0])
+        if len(glosses_split) > 1:
+            dom_glosses.append(glosses_split[1])
+        if len(glosses_split) > 2:
+            ndom_glosses.append(glosses_split[2])
+    return " ".join(main_glosses), " ".join(dom_glosses), " ".join(ndom_glosses)
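+
+# Example with hypothetical glosses: extract_glosses("ZUG_A_B MORGEN_C")
+# returns ("ZUG MORGEN", "A C", "B").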
+
+
+def count_length_comparisons(hypotheses, ground_truths):
+    # P = predicted length, T = ground-truth length (in whitespace tokens)
+    counts = {
+        'num_P_T': sum(len(h.split()) > len(g.split()) for h, g in zip(hypotheses, ground_truths)),
+        'num_T_P': sum(len(h.split()) < len(g.split()) for h, g in zip(hypotheses, ground_truths)),
+        'num_e': sum(len(h.split()) == len(g.split()) for h, g in zip(hypotheses, ground_truths))
+    }
+    return counts
+
+def save_results(fold, model_type, save_file_path, counts, bleus, ground_truths, hypotheses):
+    with open(save_file_path + f"_fold_{fold}_{model_type}_outputs.txt", "w") as f:
+        # Write BLEU scores for each gloss type
+        f.write("BLEU Scores:\n")
+        for gloss_type, score in bleus.items():
+            f.write(f"{gloss_type}: {score}\n")
+
+        f.write("\nLength Comparison Counts:\n")
+        # Write counts for each gloss type
+        for gloss_type, count_dict in counts.items():
+            f.write(f"{gloss_type}:\n")
+            f.write(f"  P>T: {count_dict['num_P_T']}\n")
+            f.write(f"  T>P: {count_dict['num_T_P']}\n")
+            f.write(f"  Equal: {count_dict['num_e']}\n")
+
+        f.write("\nGround Truth and Predicted Texts:\n")
+        # Write ground truth and predictions for each sample
+        for i in range(len(ground_truths['maingloss'])):
+            f.write(f"\nSample {i+1}:\n")
+            f.write(f"Ground Truth (maingloss): {ground_truths['maingloss'][i]}\n")
+            f.write(f"Predicted (maingloss): {hypotheses['maingloss'][i]}\n")
+            f.write(f"Ground Truth (domgloss): {ground_truths['domgloss'][i]}\n")
+            f.write(f"Predicted (domgloss): {hypotheses['domgloss'][i]}\n")
+            f.write(f"Ground Truth (ndomgloss): {ground_truths['ndomgloss'][i]}\n")
+            f.write(f"Predicted (ndomgloss): {hypotheses['ndomgloss'][i]}\n")
+
+def calculate_bleu(hypotheses, references):
+    scores = []
+    for hyp, ref in zip(hypotheses, references):
+        ref = [ref.split()]
+        hyp = hyp.split()
+        score = sentence_bleu(ref, hyp, weights=(1, 0, 0, 0))  # BLEU-1
+        scores.append(score)
+    return sum(scores) / len(scores) if scores else 0.0  # Average BLEU-1
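+
+# Sanity check (hand-computed): calculate_bleu(["a b c"], ["a b d"]) gives 2/3,
+# since two of three unigrams match and equal lengths mean no brevity penalty.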
+
+
+def evaluate(fold, model_type, model_name, test_dataloader, save_file_path):
+    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)
+    model.load_state_dict(torch.load(save_file_path + f"_fold_{fold}_{model_type}_{model_name}"))
+
+    ground_truths = {
+        'maingloss': [],
+        'domgloss': [],
+        'ndomgloss': []
+    }
+    hypotheses = {
+        'maingloss': [],
+        'domgloss': [],
+        'ndomgloss': []
+    }
+
+    model.eval()
+    with torch.no_grad():
+        for batch in test_dataloader:
+            file_Id, text_tokens_padded, gloss_tokens_padded = batch
+            text_tokens_padded = text_tokens_padded.to(device)
+            gloss_tokens_padded = gloss_tokens_padded.to(device)
+
+            pred = model.generate(input_ids=text_tokens_padded, max_length=gloss_tokens_padded.size(1))
+
+            for i in range(text_tokens_padded.size(0)):
+                gt_glosses = tokenizer.decode(gloss_tokens_padded[i], skip_special_tokens=True)
+                input_text = tokenizer.decode(text_tokens_padded[i], skip_special_tokens=True)
+                text_predicted = tokenizer.decode(pred[i], skip_special_tokens=True)
+
+                main_glosses, dom_glosses, ndom_glosses = extract_glosses(gt_glosses)
+                main_glosses_pred, dom_glosses_pred, ndom_glosses_pred = extract_glosses(text_predicted)
+
+                if fold == 9:  # only for printing
+                    print("file_Id", file_Id)
+                    print(f"\nSample {len(ground_truths['maingloss']) + 1}:")
+                    print(f"Input Text: {input_text}")
+
+                    print(f"ground_truth_maingloss: {main_glosses}")
+                    print(f"ground_truth_domgloss: {dom_glosses}")
+                    print(f"ground_truth_ndomgloss: {ndom_glosses}")
+
+                    print(f"main_glosses_pred: {main_glosses_pred}")
+                    print(f"dom_glosses_pred: {dom_glosses_pred}")
+                    print(f"ndom_glosses_pred: {ndom_glosses_pred}")
+
+                ground_truths['maingloss'].append(main_glosses)
+                ground_truths['domgloss'].append(dom_glosses)
+                ground_truths['ndomgloss'].append(ndom_glosses)
+
+                hypotheses['maingloss'].append(main_glosses_pred)
+                hypotheses['domgloss'].append(dom_glosses_pred)
+                hypotheses['ndomgloss'].append(ndom_glosses_pred)
+
+    # Calculate BLEU scores: corpus-level BLEU for the main glosses,
+    # average sentence-level BLEU-1 for the sparser dom/ndom glosses
+    bleu = BLEU()
+    bleus = {
+        'maingloss': bleu.corpus_score(hypotheses['maingloss'], [ground_truths['maingloss']]),
+        'domgloss': calculate_bleu(hypotheses['domgloss'], ground_truths['domgloss']),
+        'ndomgloss': calculate_bleu(hypotheses['ndomgloss'], ground_truths['ndomgloss'])
+    }
+
+    # Count length comparisons for each gloss type
+    counts = {key: count_length_comparisons(hypotheses[key], ground_truths[key]) for key in hypotheses}
+
+    # Save results to file
+    save_results(fold, model_type, save_file_path, counts, bleus, ground_truths, hypotheses)
+
+    return bleus['maingloss'].score, bleus['domgloss'], bleus['ndomgloss']
+
+
+def augment_data(train_data, sentences):
+    augmented_train_data = train_data.copy()
+    augmented_train_data.extend(sentences)
+    return augmented_train_data
+
+def train_epoch(model, train_dataloader, optimizer, tokenizer):
+    model.train()
+    total_loss = 0
+    for batch_idx, batch in enumerate(train_dataloader):
+        file_Id, text_tokens_padded, gloss_tokens_padded = batch
+        text_tokens_padded = text_tokens_padded.to(device)
+        gloss_tokens_padded = gloss_tokens_padded.to(device)
+        input_attention_mask = (text_tokens_padded != tokenizer.pad_token_id).to(device)
+
+        optimizer.zero_grad()
+
+        output_final = model(input_ids=text_tokens_padded, attention_mask=input_attention_mask, labels=gloss_tokens_padded)
+        loss = output_final.loss
+        total_loss += loss.item()
+        loss.backward()
+        optimizer.step()
+
+    avg_train_loss = total_loss / len(train_dataloader)
+    return avg_train_loss
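+
+# Note: the labels above keep the tokenizer's pad id, so padded positions count
+# towards the loss. A possible refinement (not applied here) is to mask them:
+#   labels = gloss_tokens_padded.masked_fill(gloss_tokens_padded == tokenizer.pad_token_id, -100)
+# since Hugging Face models ignore label id -100 in the cross-entropy loss.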
+
+if __name__ == "__main__":
+    original_scores = {'best': {'maingloss': [], 'domgloss': [], 'ndomgloss': []},
+                       'last': {'maingloss': [], 'domgloss': [], 'ndomgloss': []}}
+
+    augmented_scores = {'best': {'maingloss': [], 'domgloss': [], 'ndomgloss': []},
+                        'last': {'maingloss': [], 'domgloss': [], 'ndomgloss': []}}
+
+    ds = datasets.read()
+
+    for fold in range(10):
+        print(f"Current fold {fold}:")
+        print("Original data:")
+        test_dataloader, save_file_path = train(fold, ds, augment=False)
+        print("Augmented data:")
+        test_dataloader_1, save_file_path_1 = train(fold, ds, augment=True)
+        assert save_file_path != save_file_path_1
+        for model_type in ['best', 'last']:
+            print(f"{model_type.capitalize()} model:")
+            original_maingloss, original_domgloss, original_ndomgloss = evaluate(fold, model_type, "model.pt", test_dataloader, save_file_path)
+            original_scores[model_type]['maingloss'].append(original_maingloss)
+            original_scores[model_type]['domgloss'].append(original_domgloss)
+            original_scores[model_type]['ndomgloss'].append(original_ndomgloss)
+
+            aug_maingloss, aug_domgloss, aug_ndomgloss = evaluate(fold, model_type, "model.pt", test_dataloader_1, save_file_path_1)
+            augmented_scores[model_type]['maingloss'].append(aug_maingloss)
+            augmented_scores[model_type]['domgloss'].append(aug_domgloss)
+            augmented_scores[model_type]['ndomgloss'].append(aug_ndomgloss)
+
+    avg_original_scores = {model_type: {gloss: np.mean(original_scores[model_type][gloss]) for gloss in original_scores[model_type]} for model_type in original_scores}
+    avg_augmented_scores = {model_type: {gloss: np.mean(augmented_scores[model_type][gloss]) for gloss in augmented_scores[model_type]} for model_type in augmented_scores}
+
+    for model_type in ['best', 'last']:
+        for gloss in ['maingloss', 'domgloss', 'ndomgloss']:
+            print(f" BLEU score on original data for each fold {model_type}_model {gloss}: {original_scores[model_type][gloss]}")
+            print(f" BLEU score on augmented data for each fold {model_type}_model {gloss}: {augmented_scores[model_type][gloss]}")
+            print(f" Average BLEU score on original data for {model_type}_model {gloss}: {avg_original_scores[model_type][gloss]}")
+            print(f" Average BLEU score on augmented data for {model_type}_model {gloss}: {avg_augmented_scores[model_type][gloss]}")
diff --git a/llama/data_selection.py b/llama/data_selection.py
new file mode 100644
index 0000000..6d9655b
--- /dev/null
+++ b/llama/data_selection.py
@@ -0,0 +1,87 @@
+import json
+import os
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+from .utils import Translation
+
+features_names = ["maingloss"]
+mms_directories = [
+    ("mms-subset91", 'latin-1'),
+    ("modified/location/mms", 'utf-8'),
+    ("modified/platform/mms", 'utf-8'),
+    ("modified/time/mms", 'utf-8'),
+    ("modified/train_name/mms", 'utf-8'),
+]
+text_directories = [
+    ("annotations_full/annotations", 'latin-1'),
+    ("modified/location/text", 'utf-8'),
+    ("modified/platform/text", 'utf-8'),
+    ("modified/time/text", 'utf-8'),
+    ("modified/train_name/text", 'utf-8'),
+]
+
+def read(text_info, mms_info, translation):
+    data_list = []
+    (text_directory, text_encoding) = text_info
+    print("text_directory: ", text_directory)
+    (mms_directory, mms_encoding) = mms_info
+    for filenumber in os.listdir(text_directory):
+        f = os.path.join(mms_directory, filenumber + ".mms")
+        try:
+            df = pd.read_csv(f, encoding=mms_encoding)
+        except FileNotFoundError as e:
+            print(f"WARNING: Text file exists while mms file does not, skipping: {e}")
+            continue
+
+        text_address = os.path.join(text_directory, filenumber, "gebaerdler.Text_Deutsch.annotation~")
+        with open(text_address, encoding=text_encoding) as file:
+            lines = file.readlines()
+        text_line = " ".join(line.replace("\n", "").split(";")[2] for line in lines)
+        for feature in features_names:  # currently only "maingloss"
+            gloss_line = " ".join(df[feature].tolist())
+            if translation == Translation.TextToGloss:
+                combined_line = f"{text_line} ###> {gloss_line}"  # text to gloss
+            elif translation == Translation.GlossToText:
+                combined_line = f"{gloss_line} ###> {text_line}"  # gloss to text
+            else:
+                raise ValueError("Invalid translation")
+            data_list.append({"text": combined_line})
+    return data_list
+
+def create_datasets(translation):
+    data_list_only_original = []
+    data_list_only_modified = []
+    for i, text_info in enumerate(text_directories):
+        mms_info = mms_directories[i]
+        data_list_one = read(text_info, mms_info, translation)
+        if i == 0:
+            data_list_only_original += data_list_one
+        else:
+            data_list_only_modified += data_list_one
+
+    data_list_full = data_list_only_original + data_list_only_modified
+
+    # 80% train, ~13% validation, ~7% test
+    train_data, temp_data = train_test_split(data_list_full, test_size=0.2, random_state=42)
+    val_data, test_data = train_test_split(temp_data, test_size=1/3, random_state=42)
+
+    if translation == Translation.TextToGloss:
+        translation_dir = "t2g_llama"
+    elif translation == Translation.GlossToText:
+        translation_dir = "g2t_llama"
+    else:
+        raise ValueError("Invalid translation")
+    with open(f"train_data_{translation_dir}.json", "w") as f:
+        json.dump(train_data, f)
+
+    with open(f"val_data_{translation_dir}.json", "w") as f:
+        json.dump(val_data, f)
+
+    with open(f"test_data_{translation_dir}.json", "w") as f:
+        json.dump(test_data, f)
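+
+# The "###>" marker separates prompt and target; a TextToGloss entry looks like
+#   {"text": "<German sentence> ###> <gloss sequence>"}  (illustrative).
+# inference.py splits on "###>" to recover the prompt and the reference.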
open(f"val_data_{translation_dir}.json", "w") as f: + json.dump(val_data, f) + + with open(f"test_data_{translation_dir}.json", "w") as f: + json.dump(test_data, f) diff --git a/llama/fine_tune.py b/llama/fine_tune.py new file mode 100644 index 0000000..da6c3c4 --- /dev/null +++ b/llama/fine_tune.py @@ -0,0 +1,148 @@ +import torch +import torch.nn as nn +import numpy as np +import transformers +from transformers import AutoTokenizer, AutoModelForCausalLM +import pickle +import os +from sacrebleu.metrics import BLEU +from .data_selection import * +from pathlib import Path +from torch.utils.data import DataLoader +import time +from enum import Enum, verify, UNIQUE +from transformers import BitsAndBytesConfig +from huggingface_hub import login +from datasets import Dataset, load_dataset +from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model +from trl import SFTTrainer + +hf_access_token = os.getenv("HF_ACCESS_TOKEN") +assert hf_access_token is not None, "You need to set the Hugging Face access token environment variable: export HF_ACCESS_TOKEN=hf_TODO" + +login(token = hf_access_token) + +def training(translation): + + create_datasets(translation) + + if translation == Translation.TextToGloss: + translation_dir = "t2g_llama" + elif translation == Translation.GlossToText: + translation_dir = "g2t_llama" + else: + raise ValueError("Invalid translation") + + + with open(f"train_data_{translation_dir}.json", "r") as f: + train_data = json.load(f) + + with open(f"val_data_{translation_dir}.json", "r") as f: + val_data = json.load(f) + + train_dataset = Dataset.from_list(train_data) + val_dataset = Dataset.from_list(val_data) + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + torch.cuda.empty_cache() + cache_dir = "/ds/videos/AVASAG/cache" + model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct" + + tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_access_token, cache_dir=cache_dir, add_eos_token=True) + # Set padding token + tokenizer.pad_token = tokenizer.eos_token + tokenizer.padding_side = "right" + + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16 + ) + + save_folder = os.path.join("/ds/videos/AVASAG/llama_finetune/", translation_dir) + sft_model_name = os.path.join(save_folder, "llama-31-it-8b-sft") + merged_model_name=os.path.join(save_folder, "llama-31-it-8b-sft-merged") + + model = AutoModelForCausalLM.from_pretrained( + model_id, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=bnb_config, token=hf_access_token, cache_dir=cache_dir) + + model = prepare_model_for_kbit_training(model) + + modules = ["down_proj","up_proj","gate_proj"] + + lora_config = LoraConfig( + r=64, + lora_alpha=32, + target_modules=modules, + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM" + ) + + model = get_peft_model(model, lora_config) + + trainable, total = model.get_nb_trainable_parameters() + print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%") + + tokenizer.pad_token = tokenizer.eos_token + torch.cuda.empty_cache() + + trainer = SFTTrainer( + model=model, + train_dataset=train_dataset, + eval_dataset=val_dataset, + dataset_text_field="text", + peft_config=lora_config, + args=transformers.TrainingArguments( + report_to=[], # Disable logging + per_device_train_batch_size=1, + gradient_accumulation_steps=4, + warmup_ratio=0.03, + max_steps=1000, + learning_rate=2e-5, + 
+
+    model = get_peft_model(model, lora_config)
+
+    trainable, total = model.get_nb_trainable_parameters()
+    print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")
+
+    torch.cuda.empty_cache()
+
+    trainer = SFTTrainer(
+        model=model,
+        train_dataset=train_dataset,
+        eval_dataset=val_dataset,
+        dataset_text_field="text",
+        peft_config=lora_config,
+        args=transformers.TrainingArguments(
+            report_to=[],  # Disable logging
+            per_device_train_batch_size=1,
+            gradient_accumulation_steps=4,
+            warmup_ratio=0.03,
+            max_steps=1000,
+            learning_rate=2e-5,
+            logging_steps=1,
+            output_dir=f"/ds/videos/AVASAG/llama_finetune/outputs_{translation_dir}",
+            optim="paged_adamw_8bit",
+            save_strategy="epoch",
+            ddp_find_unused_parameters=False,
+        ),
+        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
+    )
+    model.config.use_cache = False
+    trainer.train()
+
+    trainer.model.save_pretrained(sft_model_name)
+
+    base_model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        low_cpu_mem_usage=True,
+        return_dict=True,
+        torch_dtype=torch.float16,
+        device_map="auto",
+    )
+    merged_model = PeftModel.from_pretrained(base_model, sft_model_name)
+    merged_model = merged_model.merge_and_unload()
+
+    merged_model.save_pretrained(merged_model_name, safe_serialization=True)
+    tokenizer.save_pretrained(merged_model_name)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: python fine_tune.py [--textTogloss|--glossTotext]")
+        sys.exit(1)
+
+    if sys.argv[1] == "--textTogloss":
+        print("Translating from Text to Gloss")
+        translation = Translation.TextToGloss
+    elif sys.argv[1] == "--glossTotext":
+        print("Translating from Gloss to Text")
+        translation = Translation.GlossToText
+    else:
+        print("You have to specify either --textTogloss or --glossTotext as an argument.")
+        sys.exit(1)
+
+    training(translation)
diff --git a/llama/inference.py b/llama/inference.py
new file mode 100644
index 0000000..30216a8
--- /dev/null
+++ b/llama/inference.py
@@ -0,0 +1,112 @@
+import json
+import os
+import sys
+
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+from sacrebleu.metrics import BLEU
+
+from .utils import Translation
+
+
+def evaluation(translation):
+
+    if translation == Translation.TextToGloss:
+        translation_dir = "t2g_llama"
+    elif translation == Translation.GlossToText:
+        translation_dir = "g2t_llama"
+    else:
+        raise ValueError("Invalid translation")
+
+    folder_path = os.path.join("/ds/videos/AVASAG/llama_finetune/", translation_dir)
+    merged_model_name = os.path.join(folder_path, "llama-31-it-8b-sft-merged")
+
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16
+    )
+
+    model_finetune = AutoModelForCausalLM.from_pretrained(
+        merged_model_name,
+        local_files_only=True,
+        quantization_config=bnb_config,
+        device_map="auto"
+    )
+    tokenizer_finetune = AutoTokenizer.from_pretrained(
+        merged_model_name,
+        local_files_only=True,
+        add_eos_token=True)
+
+    with open(f'test_data_{translation_dir}.json', 'r') as f:
+        test_data = json.load(f)
+
+    # Initialize BLEU metric
+    bleu = BLEU()
+    references = []
+    predictions = []
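+
+    # Note: max_new_tokens is capped at the reference length in the loop below,
+    # so hypotheses can never run longer than the reference; keep this in mind
+    # when comparing BLEU scores across setups.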
+
+    # Loop through the test data and generate translations
+    for entry in test_data:
+        # Extract the text before and after ###>
+        my_text = entry["text"].split("###>")[0].strip()
+        prompt = my_text + " ###>"
+        assert entry["text"].startswith(prompt), f"Prompt not found in the text: {entry['text']}"
+        reference = entry["text"].split("###>")[1].strip()
+        print("Input is:", my_text)
+        print("Ground truth is:", reference)
+
+        # Tokenize and generate the translation
+        tokenized_input = tokenizer_finetune(prompt, return_tensors="pt")
+        input_ids = tokenized_input["input_ids"].cuda()
+        attention_mask = tokenized_input["attention_mask"].cuda()
+        reference_length = len(tokenizer_finetune(reference)["input_ids"])  # number of tokens in the reference
+
+        # Generate the translation using the model
+        generation_output = model_finetune.generate(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            num_beams=6,
+            return_dict_in_generate=True,
+            output_scores=True,
+            max_new_tokens=reference_length
+        )
+
+        # Decode the generated output
+        for seq in generation_output.sequences:
+            output = tokenizer_finetune.decode(seq, skip_special_tokens=True).split("###>")[1].strip()
+            predictions.append(output)
+            print("Generated output:", output)
+            print("\n")
+
+        # Append the reference to the references list
+        references.append([reference])
+
+    # Calculate BLEU score
+    bleu_score = bleu.corpus_score(predictions, references)
+
+    # Print the BLEU score
+    print(f"BLEU Score: {bleu_score.score}")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: python inference.py [--textTogloss|--glossTotext]")
+        sys.exit(1)
+
+    if sys.argv[1] == "--textTogloss":
+        print("Translating from Text to Gloss")
+        translation = Translation.TextToGloss
+    elif sys.argv[1] == "--glossTotext":
+        print("Translating from Gloss to Text")
+        translation = Translation.GlossToText
+    else:
+        print("You have to specify either --textTogloss or --glossTotext as an argument.")
+        sys.exit(1)
+
+    evaluation(translation)