diff --git a/12-training/2-unsloth-finetune/cerebrium.toml b/12-training/2-unsloth-finetune/cerebrium.toml new file mode 100644 index 00000000..17c0b036 --- /dev/null +++ b/12-training/2-unsloth-finetune/cerebrium.toml @@ -0,0 +1,27 @@ +[cerebrium.deployment] +name = "cipher-finetune-8b" +python_version = "3.10" +docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04" +include = ['main.py', 'cerebrium.toml'] +exclude = ['.*'] + +[cerebrium.hardware] +region = "us-east-1" +compute = "AMPERE_A10" +cpu = 2 +memory = 24.0 +gpu_count = 1 + +[cerebrium.scaling] +min_replicas = 0 +max_replicas = 1 +response_grace_period = 7200 # 2 hours + +[cerebrium.dependencies.pip] +unsloth = "latest" +trl = "latest" +peft = "latest" +accelerate = "latest" +bitsandbytes = "latest" +datasets = "latest" +sentencepiece = "latest" \ No newline at end of file diff --git a/12-training/2-unsloth-finetune/evalLLM.py b/12-training/2-unsloth-finetune/evalLLM.py new file mode 100644 index 00000000..5ce72a1e --- /dev/null +++ b/12-training/2-unsloth-finetune/evalLLM.py @@ -0,0 +1,274 @@ +import csv +import os +from concurrent.futures import ThreadPoolExecutor, as_completed +from decimal import InvalidOperation +from pathlib import Path +from typing import List, Optional, Tuple + +from dotenv import load_dotenv +from openai import OpenAI + +import string +import secrets + +char_pool = string.ascii_letters + string.digits + string.punctuation + +def getRandomString(length) -> str: + return ''.join(secrets.choice(char_pool) for _ in range(length)) + +def positional_mirror_cipher(text): + alphabet = string.ascii_lowercase + mirror = alphabet[::-1] + result = [] + + for i, char in enumerate(text): + if char.lower() in alphabet: + is_upper = char.isupper() + idx = alphabet.index(char.lower()) + + if i % 2 == 0: # Even: Mirror + new_char = mirror[idx] + else: # Odd: Shift +3 + new_char = alphabet[(idx + 3) % 26] + + result.append(new_char.upper() if is_upper else new_char) + else: + result.append(char) + + return "".join(result) + +def generateCipher(length: int) -> Tuple[str, str]: + plain_text = getRandomString(length) + cipher_text = positional_mirror_cipher(plain_text) + return plain_text, cipher_text + + +class LLMResponseError(Exception): + pass + +class LLMSolver: + def __init__(self, model_name: str = "gpt-5-nano"): + load_dotenv() + + openai_api_key = os.environ.get("OPENAI_API_KEY") + + if not openai_api_key: + raise EnvironmentError("Missing OPENAI_API_KEY in environment variables") + + self.client = OpenAI(api_key=openai_api_key) + self.model_name = model_name + + print(f"Using model: {model_name}") + + def generate_response(self, prompt: str) -> str: + try: + response = self.client.responses.create( + model=self.model_name, + input=prompt, + ) + + output_text = response.output_text.strip() + print(f"Generated raw response: {output_text}") + + return output_text + + except (InvalidOperation, AttributeError, KeyError) as parse_error: + raise LLMResponseError( + "LLM response could not be interpreted as a number." + ) from parse_error + except Exception as api_error: + raise LLMResponseError(f"Error during API call: {api_error}") from api_error + + +def _write_result_rows( + writer: csv.writer, + length: int, + results: List[Tuple[int, str, str, str, bool]], +) -> None: + for _, expression, truthAns, llmAns, is_correct in results: + writer.writerow( + [ + length, + expression, + truthAns, + llmAns, + is_correct, + ] + ) + +def evaluate( + length: int, + iterations: int = 100, + csv_path: str = "llm_results.csv", + writer: Optional[csv.writer] = None, +) -> Tuple[int, int, int]: + llm = LLMSolver() + + csv_full_path: Optional[Path] = None + if writer is None: + csv_full_path = Path(csv_path).expanduser().resolve() + csv_full_path.parent.mkdir(parents=True, exist_ok=True) + + samples: List[Tuple[int, str, str]] = [] + for i in range(1, iterations+1): + plain, cipher = generateCipher(length) + samples.append((i, plain, cipher)) + + samples_lookup = { + run_index: (expression, truthAns) + for run_index, expression, truthAns in samples + } + + def _solve_sample( + run_index: int, expression: str, truthAns: str + ) -> Tuple[int, str, str, str, bool]: + prompt = ( + "Consider the following cipher. The Rules. Consider (0) indexing the entire string.\n" + "Even Index ($0, 2, 4...$): Replace with the 'mirror' of the alphabet ($a \\to z, A \\to Z, b \\to y, B \\to Y, c \\to x$, etc.).\n" + "Odd Index ($1, 3, 5...$): Shift forward by 3 (a \\to d, A \\to D, b \\to e, B \\to E, z \\to c$).\n" + "All other characters: Leave unchanged.\n" + f"Please cipher the following:\n{expression} \nOutput ONLY the final ciphered text with no additional commentary or punctuation." + ) + + llmAns = llm.generate_response(prompt) + is_correct = llmAns == truthAns + print(f"Found LLM answer: {llmAns} (Correct: {is_correct})") + return run_index, expression, truthAns, llmAns, is_correct + + results: List[Tuple[int, str, str, str, bool]] = [] + max_workers = min(10, iterations) + processed = 0 + skipped_errors = 0 + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + future_map = { + executor.submit(_solve_sample, run_index, expression, truthAns): run_index + for run_index, expression, truthAns in samples + } + + for future in as_completed(future_map): + run_index = future_map[future] + try: + ( + _, + expression, + truthAns, + llmAns, + is_correct, + ) = future.result() + except LLMResponseError as error: + expression, truthAns = samples_lookup[run_index] + snippet = ( + f"{expression[:60]}..." + if len(expression) > 60 + else expression + ) + processed += 1 + skipped_errors += 1 + print( + f"[{processed}/{iterations}] LLM error for run #{run_index}: {error}. " + f"Skipping expression: {snippet}" + ) + continue + except Exception as error: + expression, truthAns = samples_lookup[run_index] + snippet = ( + f"{expression[:60]}..." + if len(expression) > 60 + else expression + ) + processed += 1 + skipped_errors += 1 + print( + f"[{processed}/{iterations}] Unexpected error for run #{run_index}: {error}. " + f"Skipping expression: {snippet}" + ) + continue + + results.append((run_index, expression, truthAns, llmAns, is_correct)) + processed += 1 + + print( + f"[{processed}/{iterations}] " + f"Truth={truthAns} " + f"LLM={llmAns} " + f"({'Correct' if is_correct else 'Incorrect'})" + ) + + results.sort(key=lambda item: item[0]) + correct_count = sum(int(item[4]) for item in results) + attempts = len(results) + + if writer is None: + assert csv_full_path is not None + with csv_full_path.open("w", newline="") as csvfile: + file_writer = csv.writer(csvfile) + file_writer.writerow( + ["length", "original_text", "true_cipher", "model_cipher", "is_correct"] + ) + _write_result_rows(file_writer, length, results) + else: + _write_result_rows(writer, length, results) + + if attempts: + accuracy = (correct_count / attempts) * 100 + print( + f"LLM accuracy for length {length}: {accuracy:.2f}% " + f"({correct_count}/{attempts}) with {skipped_errors} skipped." + ) + else: + print( + f"LLM produced no successful runs for length {length}. " + f"Skipped {skipped_errors} attempts." + ) + + return correct_count, attempts, skipped_errors + + +def run_length_sweep( + mn: int = 4, + mx: int = 10, + iterations_per_length: int = 2, + csv_path: str = "llm_results.csv", +) -> None: + lengths = [i for i in range(mn, mx + 1)] + csv_full_path = Path(csv_path).expanduser().resolve() + csv_full_path.parent.mkdir(parents=True, exist_ok=True) + + overall_correct = 0 + overall_attempts = 0 + overall_skipped = 0 + + with csv_full_path.open("w", newline="") as csvfile: + writer = csv.writer(csvfile) + writer.writerow( + ["length", "original_text", "true_cipher", "model_cipher", "is_correct"] + ) + + for length in lengths: + print(f"\n=== Evaluating expressions with length {length} ===") + correct, attempts, skipped = evaluate( + length=length, + iterations=iterations_per_length, + writer=writer, + ) + overall_correct += correct + overall_attempts += attempts + overall_skipped += skipped + + if overall_attempts: + overall_accuracy = (overall_correct / overall_attempts) * 100 + print( + f"\nOverall accuracy across lengths {lengths[0]}-{lengths[-1]}: " + f"{overall_accuracy:.2f}% " + f"({overall_correct}/{overall_attempts}) with {overall_skipped} skipped." + ) + else: + print( + f"\nNo successful evaluations recorded across lengths {lengths[0]}-{lengths[-1]}. " + f"Skipped {overall_skipped} attempts." + ) + + +if __name__ == "__main__": + run_length_sweep(1, 3, 3, "llm_results.csv") # from length 1 to 6, 200 iterations each \ No newline at end of file diff --git a/12-training/2-unsloth-finetune/generate.py b/12-training/2-unsloth-finetune/generate.py new file mode 100644 index 00000000..0ab6ad71 --- /dev/null +++ b/12-training/2-unsloth-finetune/generate.py @@ -0,0 +1,97 @@ +import string +import secrets +import json +import random + +def positional_mirror_cipher_with_logic(text): + alphabet = string.ascii_lowercase + mirror = alphabet[::-1] + + analysis_steps = [] + final_result = [] + + for i, char in enumerate(text): + if char.lower() in alphabet: + is_upper = char.isupper() + idx = alphabet.index(char.lower()) + + if i % 2 == 0: + # Even: Mirror (a -> z) + new_char = mirror[idx] + transformation = f"{char}({i}:even)->{new_char.upper() if is_upper else new_char}" + else: + # Odd: Shift +3 (a -> d) + new_char = alphabet[(idx + 3) % 26] + transformation = f"{char}({i}:odd)->{new_char.upper() if is_upper else new_char}" + + res_char = new_char.upper() if is_upper else new_char + final_result.append(res_char) + analysis_steps.append(transformation) + else: + # Other (Spaces, Digits, Punctuation) + final_result.append(char) + analysis_steps.append(f"{char}({i}:sym)->{char}") + + analysis_str = ", ".join(analysis_steps) + cipher_text = "".join(final_result) + return analysis_str, cipher_text + +def getRandomString(length) -> str: + categories = [ + string.ascii_lowercase, + string.ascii_uppercase, + string.digits, + string.punctuation + " " + ] + # distribution for each of the above categories + weights = [70, 15, 5, 10] + + result = [] + for _ in range(length): + chosen_category = random.choices(categories, weights=weights, k=1)[0] + result.append(secrets.choice(chosen_category)) + + return "".join(result) + +def create_trainset(filename="./results/dataset.jsonl", num_samples=20000): + instruction = ( + "Apply the positional mirror-shift cipher to the input text using a GLOBAL index. " + "The index starts at 0 for the first character and increments for EVERY character (including spaces and symbols). " + "Rules: If the global index is even, mirror the letter ($a \\to z, A \\to Z, b \\to y, B \\to Y, c \\to x$, etc.). " + "If the global index is odd, shift the letter forward by 3 (a \\to d, A \\to D, b \\to e, B \\to E, z \\to c$). " + "Non-alphabetic characters do not change but still consume an index count." + ) + + with open(filename, "w") as f: + for _ in range(num_samples): + length = secrets.randbelow(5) + 5 + plain_text = getRandomString(length) + analysis, cipher = positional_mirror_cipher_with_logic(plain_text) + + response = f"Analysis: {analysis}\nFinal Cipher: {cipher}" + + record = { + "text": f"### Instruction:\n{instruction}\n\n### Input:\n{plain_text}\n\n### Response:\n{response}" + } + f.write(json.dumps(record) + "\n") + + print(f"Successfully generated {num_samples} samples with the requested character distribution.") + +def create_testset(filename="./results/testset.json", num_samples=200): + with open(filename, "w") as f: + plain_texts = [] + ciphered_texts = [] + for _ in range(num_samples): + length = secrets.randbelow(5) + 5 # length between 5 and 9 + plain_text = getRandomString(length) + _, cipher = positional_mirror_cipher_with_logic(plain_text) + plain_texts.append(plain_text) + ciphered_texts.append(cipher) + + json.dump({"plain_texts": plain_texts, + "ciphered_texts": ciphered_texts}, f) + print(f"Successfully generated {num_samples} test samples.") + +if __name__ == "__main__": + create_trainset() + # create_testset() \ No newline at end of file diff --git a/12-training/2-unsloth-finetune/main.py b/12-training/2-unsloth-finetune/main.py new file mode 100644 index 00000000..da0a7365 --- /dev/null +++ b/12-training/2-unsloth-finetune/main.py @@ -0,0 +1,176 @@ + +STORAGE_PATH = "/persistent-storage" +DATASET_FILE = f"{STORAGE_PATH}/dataset.jsonl" +ADAPTER_PATH = f"{STORAGE_PATH}/cipher-lora-adapter" +CHECKPOINT_DIR = f"{STORAGE_PATH}/checkpoints" + +MODEL_NAME = "unsloth/llama-3.1-8b-instruct-bnb-4bit" +MAX_SEQ_LENGTH = 2048 + +def train(mx_steps: int = 100): + # lazy import + import os + import torch + from unsloth import FastLanguageModel + from datasets import load_dataset + from trl import SFTTrainer + from transformers import TrainingArguments + + try: + mx_steps = int(mx_steps) + except (TypeError, ValueError) as e: + raise ValueError("Invalid parameter type") from e + + # we use 4-bit quantization + model, tokenizer = FastLanguageModel.from_pretrained( + model_name = MODEL_NAME, + max_seq_length = MAX_SEQ_LENGTH, + load_in_4bit = True, + ) + + # PEFT / LoRA setup + model = FastLanguageModel.get_peft_model( + model, + r = 16, + target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"], + lora_alpha = 16, + lora_dropout = 0, + bias = "none", + ) + + dataset = load_dataset("json", data_files=DATASET_FILE, split="train") + + trainer = SFTTrainer( + model = model, + tokenizer = tokenizer, + train_dataset = dataset, + dataset_text_field = "text", + max_seq_length = MAX_SEQ_LENGTH, + args = TrainingArguments( + per_device_train_batch_size = 1, + gradient_accumulation_steps = 4, # simulates larger batch size w/o memory spike + gradient_checkpointing = True, + warmup_steps = 5, + max_steps = int(mx_steps), + learning_rate = 2e-4, + fp16 = not torch.cuda.is_bf16_supported(), + bf16 = torch.cuda.is_bf16_supported(), + logging_steps = 1, + output_dir = CHECKPOINT_DIR, + save_strategy = "steps", + save_steps = 50, + save_total_limit = 1, + ), + ) + + resume_from = None + if os.path.exists(CHECKPOINT_DIR) and os.listdir(CHECKPOINT_DIR): + print("Resuming from latest checkpoint...") + resume_from = True + + trainer.train(resume_from_checkpoint=resume_from) + + model.save_pretrained(ADAPTER_PATH) + tokenizer.save_pretrained(ADAPTER_PATH) + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + print(f"Weights saved to {ADAPTER_PATH}") + + return {"status": "success", "message": "Training complete or checkpoint saved."} + + + +def evaluate(input_filename: str = "testset.json", output_filename: str = "finetune_results.csv"): + # lazy import + import os + import json + import csv + from unsloth import FastLanguageModel + + input_path = os.path.join(STORAGE_PATH, input_filename) + output_path = os.path.join(STORAGE_PATH, output_filename) + + if not os.path.exists(input_path): + return {"error": f"File {input_filename} not found in {STORAGE_PATH}"} + + with open(input_path, 'r') as f: + data = json.load(f) + + raw_list = data['plain_texts'] + cipher_truth_list = data['ciphered_texts'] + + load_path = ADAPTER_PATH if os.path.exists(ADAPTER_PATH) else MODEL_NAME + + # 4-bit quantization + model, tokenizer = FastLanguageModel.from_pretrained( + model_name = load_path, + max_seq_length = 2048, + load_in_4bit = True, + ) + FastLanguageModel.for_inference(model) + + tokenizer.padding_side = "left" + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + instruction = ( + "Apply the positional mirror-shift cipher to the input text using a GLOBAL index. " + "The index starts at 0 for the first character and increments for EVERY character (including spaces and symbols). " + "Rules: If the global index is even, mirror the letter ($a \\to z, A \\to Z, b \\to y, B \\to Y, c \\to x$, etc.). " + "If the global index is odd, shift the letter forward by 3 (a \\to d, A \\to D, b \\to e, B \\to E, z \\to c$). " + "Non-alphabetic characters do not change but still consume an index count." + ) + + batch_size = 8 + results_for_csv = [] + + print(f"Starting evaluation on {len(raw_list)} items...") + + # batch inference + for i in range(0, len(raw_list), batch_size): + print(f"Processing batch {len(raw_list)//(i+batch_size)}") + chunk_input = raw_list[i : i + batch_size] + chunk_truth = cipher_truth_list[i : i + batch_size] + + prompts = [f"### Instruction:\n{instruction}\n\n### Input:\n{text}\n\n### Response:\n" for text in chunk_input] + + inputs = tokenizer(prompts, return_tensors="pt", padding=True).to("cuda") + + outputs = model.generate(**inputs, max_new_tokens=256, use_cache=True) + decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True) + + for j, full_text in enumerate(decoded): + response_section = full_text.split("### Response:\n")[-1].strip() + + if "Final Cipher:" in response_section: + model_cipher = response_section.split("Final Cipher:")[-1].strip() + else: + model_cipher = response_section.split("\n")[-1].strip() + + truth = chunk_truth[j] + is_correct = (model_cipher == truth) + + results_for_csv.append({ + "length": len(chunk_input[j]), + "original_text": chunk_input[j], + "true_cipher": truth, + "model_cipher": model_cipher, + "is_correct": is_correct + }) + + keys = ["length", "original_text", "true_cipher", "model_cipher", "is_correct"] + with open(output_path, 'w', newline='', encoding='utf-8') as f: + dict_writer = csv.DictWriter(f, fieldnames=keys) + dict_writer.writeheader() + dict_writer.writerows(results_for_csv) + + accuracy = sum(1 for x in results_for_csv if x['is_correct']) / len(results_for_csv) * 100 + print(f"Evaluation complete. Accuracy: {accuracy:.2f}%") + + return { + "status": "completed", + "accuracy": f"{accuracy:.2f}%", + "output_file": output_path + } diff --git a/12-training/2-unsloth-finetune/plot.py b/12-training/2-unsloth-finetune/plot.py new file mode 100644 index 00000000..bec41cbc --- /dev/null +++ b/12-training/2-unsloth-finetune/plot.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +import argparse +from pathlib import Path + +import pandas as pd +from matplotlib import pyplot as plt + +LLM_CSV = Path("./results/llm_results.csv") +FINETUNE_CSV = Path("./results/finetune_results.csv") + + +def load_results(csv_path: Path, expected_columns: set[str]) -> pd.DataFrame: + if not csv_path.exists(): + raise FileNotFoundError(f"CSV file not found: {csv_path}") + + df = pd.read_csv(csv_path) + missing = expected_columns.difference(df.columns) + if missing: + raise ValueError(f"File {csv_path.name} missing columns: {sorted(missing)}") + + if not pd.api.types.is_numeric_dtype(df["is_correct"]): + df["is_correct"] = df["is_correct"].astype(bool) + + return df + + +def compute_accuracy(df: pd.DataFrame) -> pd.DataFrame: + """Groups by length and calculates the accuracy percentage.""" + grouped = df.groupby("length") + accuracy = grouped["is_correct"].mean().reset_index() + accuracy.rename(columns={"is_correct": "accuracy"}, inplace=True) + accuracy["accuracy_percentage"] = accuracy["accuracy"] * 100 + return accuracy.sort_values("length") + + +def plot_comparison( + llm_acc: pd.DataFrame, + ft_acc: pd.DataFrame, + output_path: Path | None = None +) -> None: + """Plots both LLM and Fine-tune accuracy on the same graph.""" + fig, ax = plt.subplots(figsize=(10, 6)) + + ax.plot( + llm_acc["length"], + llm_acc["accuracy_percentage"], + marker="o", + label="LLM (Baseline)", + color="tab:blue", + alpha=0.8 + ) + + ax.plot( + ft_acc["length"], + ft_acc["accuracy_percentage"], + marker="s", + label="Fine-tuned Model", + color="tab:orange", + alpha=0.8 + ) + + ax.set_xlabel("Sequence Length") + ax.set_ylabel("Accuracy (%)") + ax.set_title("Cipher Accuracy Comparison: LLM vs. Fine-tuned") + ax.grid(True, which="both", linestyle="--", linewidth=0.5, alpha=0.7) + + all_lengths = sorted(list(set(llm_acc["length"]) | set(ft_acc["length"]))) + ax.set_xticks(all_lengths) + ax.get_xaxis().set_major_formatter(plt.ScalarFormatter()) + + ax.set_ylim(0, 105) + ax.legend() + + fig.tight_layout() + + if output_path: + output_path.parent.mkdir(parents=True, exist_ok=True) + fig.savefig(output_path) + print(f"Saved comparison plot to {output_path}") + else: + plt.show() + + plt.close(fig) + +def main() -> None: + + cols = {"length", "original_text", "true_cipher", "model_cipher", "is_correct"} + + try: + df_llm = load_results(LLM_CSV, cols) + df_ft = load_results(FINETUNE_CSV, cols) + + acc_llm = compute_accuracy(df_llm) + acc_ft = compute_accuracy(df_ft) + + plot_comparison(acc_llm, acc_ft, None) + + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/12-training/2-unsloth-finetune/requirements.txt b/12-training/2-unsloth-finetune/requirements.txt new file mode 100644 index 00000000..0fe082ff --- /dev/null +++ b/12-training/2-unsloth-finetune/requirements.txt @@ -0,0 +1,31 @@ +annotated-types==0.7.0 +anyio==4.12.1 +certifi==2026.1.4 +contourpy==1.3.3 +cycler==0.12.1 +distro==1.9.0 +fonttools==4.61.1 +h11==0.16.0 +httpcore==1.0.9 +httpx==0.28.1 +idna==3.11 +jiter==0.12.0 +kiwisolver==1.4.9 +matplotlib==3.10.8 +numpy==2.4.0 +openai==2.14.0 +packaging==25.0 +pandas==2.3.3 +pillow==12.1.0 +pydantic==2.12.5 +pydantic_core==2.41.5 +pyparsing==3.3.1 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +pytz==2025.2 +six==1.17.0 +sniffio==1.3.1 +tqdm==4.67.1 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +tzdata==2025.3