-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy patheval.py
More file actions
100 lines (72 loc) · 3.54 KB
/
eval.py
File metadata and controls
100 lines (72 loc) · 3.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# Modal-based Evaluation Script
import modal
from modal import Image, Stub, gpu
import torch
from tqdm import tqdm
from huggingface_hub import login
import argparse
from evaluator import format_prompt, save_info_to_json, get_info_from_json
import os, json
# Authenticate with the Hugging Face Hub using the token from the environment.
login(os.environ["HF_TOKEN"])

# Container image for the Modal workers: slim Debian with Python 3.11, the
# ML stack pre-installed, git/gcc available, and a global git identity
# configured for any repository operations run inside the container.
_image = (
    Image.debian_slim(python_version="3.11")
    .pip_install(
        ["transformers", "datasets", "huggingface_hub", "torch", "tqdm", "psutil", "sentencepiece"]
    )
    .apt_install("git")
    .apt_install("gcc")
    .run_commands(
        "git config --global user.name ksgk-fangyuan",
        "git config --global user.email fangyuan.yu18@gmail.com",
    )
)

stub = modal.Stub(image=_image)
@stub.function(gpu=modal.gpu.A100(size="40GB"),
               secrets=[modal.Secret.from_name("ksgk-secret")],
               timeout=2400)
def evaluate_perplexity(model_id, dataset_name):
    """Remotely compute the average completion loss for ``model_id``.

    Loads the model/tokenizer, runs every example of the ``test`` split of
    ``dataset_name`` through the model, and averages the cross-entropy of
    the completion tokens (prompt tokens are excluded from the loss).

    Args:
        model_id: Hugging Face model repo id to evaluate.
        dataset_name: Hugging Face dataset repo id with ``prompt`` /
            ``completion`` columns and a ``test`` split.

    Returns:
        Mean per-example cross-entropy as a CPU torch scalar tensor.

    Raises:
        ValueError: if the test split contains no examples.
    """
    from huggingface_hub import login
    from datasets import load_dataset
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Authenticate inside the remote container. Do NOT print the secret
    # token itself — that would leak the credential into the job logs.
    login(os.environ["HF_TOKEN"])

    model = AutoModelForCausalLM.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    dataset = load_dataset(dataset_name, split="test")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # CPU fallback
    model.to(device)
    model.eval()

    perplexities = []
    for data in tqdm(dataset, desc="Evaluating Perplexity on the Instruction-Tuning Dataset"):
        prompt = data["prompt"]
        completion = data["completion"] + tokenizer.eos_token
        target_sequence, query_sequence = format_prompt(prompt, completion)
        # Token count of the prompt part; only tokens after it are scored.
        query_sequence_length = tokenizer.encode(query_sequence, return_tensors="pt").shape[1]
        sequence_ids = tokenizer.encode(target_sequence, return_tensors="pt").to(device)
        with torch.no_grad():
            sequence_logits = model(sequence_ids).logits
        # Logits at position t predict token t+1, hence the -1 shift.
        target_logits = sequence_logits[:, (query_sequence_length - 1):-1]
        target_ids = sequence_ids[:, query_sequence_length:].view(-1)
        loss = torch.nn.functional.cross_entropy(
            target_logits.reshape(-1, target_logits.size(-1)), target_ids, reduction="none"
        )
        # NOTE(review): this is the mean negative log-likelihood (log-perplexity),
        # not exp(NLL); kept as-is so previously recorded values stay comparable.
        # Move to CPU so the result serializes cleanly back to the caller.
        perplexity = loss.mean().cpu()
        perplexities.append(perplexity)

    del model
    del tokenizer

    if not perplexities:
        raise ValueError(f"No examples found in the test split of {dataset_name!r}")
    return sum(perplexities) / len(perplexities)
@stub.local_entrypoint()
def main(model_id = "HuggingFaceH4/zephyr-7b-beta",
         dataset_name = "Ksgk-fy/alignment-sft-test01"):
    """Run the remote perplexity evaluation for ``model_id`` on
    ``dataset_name`` and record the result in ./merge_info/merge_info.json,
    skipping models that already have an entry there."""
    info_path = './merge_info/merge_info.json'
    # Guard: bail out early when this model was already evaluated.
    if os.path.exists(info_path):
        previous_runs = get_info_from_json()
        already_done = any(entry["Model ID"] == model_id for entry in previous_runs)
        if already_done:
            print(f"Evaluation already done for model {model_id}. Skipping evaluation.")
            return

    avg_perplexity = evaluate_perplexity.remote(model_id, dataset_name)
    print("Average Perplexity:", avg_perplexity)

    record = {"Model ID": model_id, "Dataset Name": dataset_name, "Average Perplexity": avg_perplexity.item()}
    os.makedirs("./merge_info", exist_ok=True)
    # Persist the evaluation record as JSON.
    save_info_to_json(record, "./merge_info/merge_info.json")