
Commit 2db1b16

Fine-tuning the Llama 3.1 model to perform text-to-gloss and gloss-to-text translation
1 parent 43ac3ac commit 2db1b16

File tree

3 files changed: +368 -0 lines changed

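
All three added files import Translation from llama/utils.py, which is not part of this commit. As a reading aid, here is a minimal assumed sketch of that enum, reconstructed from how it is used in the files below (the member names are confirmed by the code; the values are placeholders):

# llama/utils.py is not included in this commit; minimal assumed sketch only.
from enum import Enum

class Translation(Enum):
    TextToGloss = "text_to_gloss"   # placeholder value; only the member name is known
    GlossToText = "gloss_to_text"   # placeholder value; only the member name is known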

llama/data_selection.py

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
import math
import torch
import torch.nn as nn
from collections import Counter
from torch import Tensor
import io
import time
import os
import pandas as pd
import json
from datetime import datetime
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from .utils import Translation

features_names = ["maingloss"]
mms_directories = [
    ("mms-subset91", 'latin-1'),
    ("modified/location/mms", 'utf-8'),
    ("modified/platform/mms", 'utf-8'),
    ("modified/time/mms", 'utf-8'),
    ("modified/train_name/mms", 'utf-8'),
]
text_directories = [
    ("annotations_full/annotations", 'latin-1'),
    ("modified/location/text", 'utf-8'),
    ("modified/platform/text", 'utf-8'),
    ("modified/time/text", 'utf-8'),
    ("modified/train_name/text", 'utf-8'),
]

def read(text_info, mms_info, translation):
    """Pair each text annotation with its gloss (.mms) file and build combined prompt lines."""
    data_list = []
    (text_directory, text_encoding) = text_info
    print("text_directory: ", text_directory)
    (mms_directory, mms_encoding) = mms_info
    for filenumber in os.listdir(text_directory):
        f = os.path.join(mms_directory, filenumber + ".mms")
        try:
            df = pd.read_csv(f, encoding=mms_encoding)
        except FileNotFoundError as e:
            print(f"WARNING: Text file exists while mms file does not, skipping: {e}")
            continue

        text_address = os.path.join(text_directory, filenumber, "gebaerdler.Text_Deutsch.annotation~")
        with open(text_address, encoding=text_encoding) as file:
            lines = file.readlines()
        # The German sentence is the third ';'-separated field of each annotation line.
        text_line = ""
        for i, text_data in enumerate(lines):
            if i > 0:
                text_line = text_line + " " + text_data.replace("\n", "").split(";")[2]
            else:
                text_line = text_line + text_data.replace("\n", "").split(";")[2]
        for feature in features_names:
            gloss_line = " ".join(df[feature].tolist())
            if translation == Translation.TextToGloss:
                combined_line = f"{text_line} ###> {gloss_line}"  # text to gloss
            elif translation == Translation.GlossToText:
                combined_line = f"{gloss_line} ###> {text_line}"  # gloss to text
            else:
                raise ValueError("Invalid translation")
            data_list.append({"text": combined_line})
    return data_list

def create_datasets(translation):
    data_list_only_original = []
    data_list_only_modified = []
    for i, text_info in enumerate(text_directories):
        mms_info = mms_directories[i]
        data_list_one = read(text_info, mms_info, translation)
        if i <= 0:
            data_list_only_original += data_list_one
        else:
            data_list_only_modified += data_list_one

    data_list_full = data_list_only_original + data_list_only_modified

    # 80% train; the remaining 20% is split 2:1 into validation and test.
    train_data, temp_data = train_test_split(data_list_full, test_size=0.2, random_state=42)
    val_data, test_data = train_test_split(temp_data, test_size=1/3, random_state=42)

    if translation == Translation.TextToGloss:
        translation_dir = "t2g_llama"
    elif translation == Translation.GlossToText:
        translation_dir = "g2t_llama"
    else:
        raise ValueError("Invalid translation")

    with open(f"train_data_{translation_dir}.json", "w") as f:
        json.dump(train_data, f)

    with open(f"val_data_{translation_dir}.json", "w") as f:
        json.dump(val_data, f)

    with open(f"test_data_{translation_dir}.json", "w") as f:
        json.dump(test_data, f)
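
Each entry written to the train/val/test JSON files is a single object whose "text" field joins source and target with the ###> separator. A minimal illustration of the text-to-gloss format; the sentences below are invented for illustration only, the real ones come from the annotation files:

# Hypothetical entries as they might appear in train_data_t2g_llama.json
# (invented examples; not taken from the dataset).
[
    {"text": "Der Zug nach Berlin faehrt von Gleis 3. ###> ZUG BERLIN GLEIS 3 ABFAHREN"},
    {"text": "Der Zug hat zehn Minuten Verspaetung. ###> ZUG ZEHN MINUTE VERSPAETUNG"}
]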

llama/fine_tune.py

Lines changed: 148 additions & 0 deletions
@@ -0,0 +1,148 @@
import torch
import torch.nn as nn
import numpy as np
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import pickle
import os
import json
from sacrebleu.metrics import BLEU
from .data_selection import *
from pathlib import Path
from torch.utils.data import DataLoader
import time
from enum import Enum, verify, UNIQUE
from transformers import BitsAndBytesConfig
from huggingface_hub import login
from datasets import Dataset, load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer

hf_access_token = os.getenv("HF_ACCESS_TOKEN")
assert hf_access_token is not None, "You need to set the Hugging Face access token environment variable: export HF_ACCESS_TOKEN=hf_TODO"

login(token=hf_access_token)

def training(translation):
    create_datasets(translation)

    if translation == Translation.TextToGloss:
        translation_dir = "t2g_llama"
    elif translation == Translation.GlossToText:
        translation_dir = "g2t_llama"
    else:
        raise ValueError("Invalid translation")

    with open(f"train_data_{translation_dir}.json", "r") as f:
        train_data = json.load(f)

    with open(f"val_data_{translation_dir}.json", "r") as f:
        val_data = json.load(f)

    train_dataset = Dataset.from_list(train_data)
    val_dataset = Dataset.from_list(val_data)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.cuda.empty_cache()
    cache_dir = "/ds/videos/AVASAG/cache"
    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

    tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_access_token, cache_dir=cache_dir, add_eos_token=True)
    # Set padding token
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # 4-bit NF4 quantization so the 8B model fits on a single GPU (QLoRA-style setup).
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    save_folder = os.path.join("/ds/videos/AVASAG/llama_finetune/", translation_dir)
    sft_model_name = os.path.join(save_folder, "llama-31-it-8b-sft")
    merged_model_name = os.path.join(save_folder, "llama-31-it-8b-sft-merged")

    model = AutoModelForCausalLM.from_pretrained(
        model_id, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=bnb_config, token=hf_access_token, cache_dir=cache_dir)

    model = prepare_model_for_kbit_training(model)

    # LoRA adapters are attached to the MLP projections only.
    modules = ["down_proj", "up_proj", "gate_proj"]

    lora_config = LoraConfig(
        r=64,
        lora_alpha=32,
        target_modules=modules,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )

    model = get_peft_model(model, lora_config)

    trainable, total = model.get_nb_trainable_parameters()
    print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

    tokenizer.pad_token = tokenizer.eos_token
    torch.cuda.empty_cache()

    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        dataset_text_field="text",
        peft_config=lora_config,
        args=transformers.TrainingArguments(
            report_to=[],  # Disable logging
            per_device_train_batch_size=1,
            gradient_accumulation_steps=4,
            warmup_ratio=0.03,
            max_steps=1000,
            learning_rate=2e-5,
            logging_steps=1,
            output_dir=f"/ds/videos/AVASAG/llama_finetune/outputs_{translation_dir}",
            optim="paged_adamw_8bit",
            save_strategy="epoch",
            ddp_find_unused_parameters=False,
        ),
        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )
    model.config.use_cache = False
    trainer.train()

    trainer.model.save_pretrained(sft_model_name)

    # Reload the base model in half precision and merge the LoRA adapter into it.
    base_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        low_cpu_mem_usage=True,
        return_dict=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    merged_model = PeftModel.from_pretrained(base_model, sft_model_name)
    merged_model = merged_model.merge_and_unload()

    merged_model.save_pretrained(merged_model_name, safe_serialization=True)
    tokenizer.save_pretrained(merged_model_name)


if __name__ == "__main__":
    import sys

    if len(sys.argv) != 2:
        print("Usage: python fine_tune.py [--textTogloss|--glossTotext]")
        sys.exit(1)

    if sys.argv[1] == "--textTogloss":
        print("Translating from Text to Gloss")
        translation = Translation.TextToGloss
    elif sys.argv[1] == "--glossTotext":
        print("Translating from Gloss to Text")
        translation = Translation.GlossToText
    else:
        print("You have to specify either --textTogloss or --glossTotext as an argument.")
        sys.exit(1)

    training(translation)
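
For orientation, the training arguments above imply a small effective batch size; a quick back-of-the-envelope check under the assumption of a single-GPU run:

# Effective batch size implied by the TrainingArguments in fine_tune.py (single GPU assumed).
per_device_train_batch_size = 1
gradient_accumulation_steps = 4
max_steps = 1000

effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps   # 4 examples per optimizer step
examples_processed = effective_batch_size * max_steps                              # 4000 examples over the whole run
print(effective_batch_size, examples_processed)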

llama/inference.py

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
import torch
import torch.nn as nn
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import pickle
import os
from sacrebleu.metrics import BLEU
from pathlib import Path
from torch.utils.data import DataLoader
import time
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
import bitsandbytes as bnb
import transformers
import json
import pandas as pd
from datasets import Dataset, load_dataset
from .utils import Translation


def evaluation(translation):
    if translation == Translation.TextToGloss:
        translation_dir = "t2g_llama"
    elif translation == Translation.GlossToText:
        translation_dir = "g2t_llama"
    else:
        raise ValueError("Invalid translation")

    folder_path = os.path.join("/ds/videos/AVASAG/llama_finetune/", translation_dir)
    merged_model_name = os.path.join(folder_path, "llama-31-it-8b-sft-merged")
    cache_dir = "/ds/videos/AVASAG/cache"
    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    # Load the merged (base + LoRA) checkpoint produced by fine_tune.py.
    model_finetune = AutoModelForCausalLM.from_pretrained(
        merged_model_name,
        local_files_only=True,
        quantization_config=bnb_config,
        device_map="auto"
    )
    tokenizer_finetune = AutoTokenizer.from_pretrained(
        merged_model_name,
        local_files_only=True,
        add_eos_token=True)

    with open(f'test_data_{translation_dir}.json', 'r') as f:
        test_data = json.load(f)

    # Initialize BLEU metric
    bleu = BLEU()
    references = []
    predictions = []

    # Loop through the test data and generate translations
    for entry in test_data:
        # Extract the source (before ###>) and the reference target (after ###>)
        my_text = entry["text"].split("###>")[0].strip()
        prompt = my_text + " ###>"
        assert entry["text"].startswith(prompt), f"Prompt not found in the text: {entry['text']}"
        reference = entry["text"].split("###>")[1].strip()
        print("Input is:", my_text)
        print("Ground truth is:", reference)

        # Tokenize and generate the translation
        tokenized_input = tokenizer_finetune(prompt, return_tensors="pt")
        input_ids = tokenized_input["input_ids"].cuda()
        attention_mask = tokenized_input["attention_mask"].cuda()
        reference_length = len(tokenizer_finetune(reference)["input_ids"])  # Number of tokens in the reference

        # Generate the translation using beam search, capped at the reference length
        generation_output = model_finetune.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            num_beams=6,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=reference_length
        )

        # Decode the generated output and keep only the part after the separator
        for seq in generation_output.sequences:
            output = tokenizer_finetune.decode(seq, skip_special_tokens=True).split("###>")[1].strip()
            predictions.append(output)
            print("Generated output:", output)
            print("\n")

        # Append the reference to the references list
        references.append([reference])

    # Calculate BLEU score
    bleu_score = bleu.corpus_score(predictions, references)

    # Print the BLEU score
    print(f"BLEU Score: {bleu_score.score}")


if __name__ == "__main__":
    import sys

    if len(sys.argv) != 2:
        print("Usage: python inference.py [--textTogloss|--glossTotext]")
        sys.exit(1)

    if sys.argv[1] == "--textTogloss":
        print("Translating from Text to Gloss")
        translation = Translation.TextToGloss
    elif sys.argv[1] == "--glossTotext":
        print("Translating from Gloss to Text")
        translation = Translation.GlossToText
    else:
        print("You have to specify either --textTogloss or --glossTotext as an argument.")
        sys.exit(1)

    evaluation(translation)
