
Commit 60c752b
feat: add hard script
1 parent: c1422e2

1 file changed: 128 additions, 0 deletions

analysis/bcb_hard.py
@@ -0,0 +1,128 @@
import json
import numpy as np
from tqdm import tqdm
from ast import literal_eval
from glob import glob
from sentence_transformers import SentenceTransformer, util
from datasets import load_dataset, Dataset, Features, Value, Sequence

from utils import *  # provides model_info used by read_task_perf


def embed_sentences(data, col_name, id_name, model, save_path, push_to_hub=False):
    # Encode the target column in parallel across all available devices,
    # then shut the worker pool down so the processes do not leak.
    pool = model.start_multi_process_pool()
    embeddings = model.encode_multi_process(data[col_name], pool=pool)
    model.stop_multi_process_pool(pool)
    qids = data[id_name]
    features = Features({id_name: Value(dtype='string'), 'embeddings': Sequence(Value('float32'))})
    embed_dict = {
        id_name: qids,
        "embeddings": embeddings,
    }
    embed_ds = Dataset.from_dict(embed_dict, features=features)
    if push_to_hub:
        embed_ds.push_to_hub(f"bigcode/{save_path}")
    else:
        embed_ds.save_to_disk(save_path)
    return embed_ds


def get_top_docs(query_embs, doc_emb, docs):
    # all-mpnet-base-v2 outputs unit-normalized embeddings, so this dot
    # product is cosine similarity; argmax picks the best task per query.
    scores = np.dot(query_embs, doc_emb.T)
    top_doc_indices = np.argmax(scores, axis=1)
    top_scores = scores[np.arange(len(scores)), top_doc_indices]
    results = [(i, docs[doc_idx], score) for i, (doc_idx, score) in tqdm(enumerate(zip(top_doc_indices, top_scores)))]
    return results
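
# get_top_docs materializes a len(query_embs) x len(docs) score matrix; with
# millions of Stack Exchange questions that may not fit in memory. A possible
# chunked variant (a sketch; chunk_size is an assumed knob, not in the original):
def get_top_docs_chunked(query_embs, doc_emb, docs, chunk_size=100_000):
    results = []
    for start in tqdm(range(0, len(query_embs), chunk_size)):
        chunk_scores = np.dot(query_embs[start:start + chunk_size], doc_emb.T)
        top_idx = np.argmax(chunk_scores, axis=1)
        top_scores = chunk_scores[np.arange(len(chunk_scores)), top_idx]
        results.extend((start + i, docs[d], s) for i, (d, s) in enumerate(zip(top_idx, top_scores)))
    return results
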
def filter_top_k_percent(results, k_percent):
    # Keep results whose score is strictly above the (100 - k)th percentile,
    # i.e. roughly the top k% of matches. results holds (qid, task_id, score).
    all_scores = [score for _, _, score in results]
    threshold = np.percentile(all_scores, 100 - k_percent)
    filtered_results = [(i, doc, score) for i, doc, score in results if score > threshold]
    return filtered_results


def filter_top_threshold(results, threshold):
    filtered_results = [(i, doc, score) for i, doc, score in results if score > threshold]
    return filtered_results

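# Hypothetical usage on get_top_docs output — keep roughly the top k% of
# scores, or everything above a fixed similarity threshold:
#   strong = filter_top_k_percent(retrieval_results, k_percent=5)
#   strong = filter_top_threshold(retrieval_results, threshold=0.7)
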
def read_task_perf(top_tid, task="complete"):
    # Score every model in model_info on the hard task subset, reading the
    # per-task pass/fail status from its sanitized eval-results file.
    model_results = dict()
    for model, info in model_info.items():
        if task == "instruct" and (not info["prompted"] or info["name"] in ["Granite-Code-3B-Instruct", "Granite-Code-8B-Instruct"]):
            continue
        task_perf = {f"BigCodeBench/{task_id}": 0 for task_id in range(1140)}
        model = model.replace("/", "--")
        if info["link"].startswith("https://huggingface.co/"):
            model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--")
        try:
            if info["prompted"]:
                # Prefer calibrated results for prompted models, falling back
                # to the uncalibrated file when none exist.
                files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
                if files:
                    file = files[0]
                else:
                    file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
            else:
                file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
        except IndexError:
            # No eval-results file for this model; skip it.
            continue
        with open(file, "r") as f:
            data = json.load(f)
        for task_id, perfs in data["eval"].items():
            status = 1 if perfs[0]["status"] == "pass" else 0
            task_perf[task_id] = status
        # Mean pass rate restricted to the hard subset.
        model_results[info["name"]] = np.mean([status for tid, status in task_perf.items() if tid in top_tid])
    return sorted(model_results.items(), key=lambda x: x[1], reverse=True)

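# Assumed shape of each *_eval_results.json file, inferred from the reads above:
#   {"eval": {"BigCodeBench/0": [{"status": "pass", ...}, ...], ...}}
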
if __name__ == "__main__":
    bcb = load_dataset("bigcode/bigcodebench", trust_remote_code=True, split="v0.1.0_hf")
    se = load_dataset("bigcode/stack-exchange-preferences-20230914-clean-anonymization", trust_remote_code=True, split="train")
    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

    se_embed = embed_sentences(se, "question", "qid", model, "stack-exchange-embeddings-20230914", push_to_hub=True)
    bcb_embed = embed_sentences(bcb, "complete_prompt", "task_id", model, "bigcodebench-doc-embeddings", push_to_hub=True)

    solve_rate = load_dataset("bigcode/bigcodebench-solve-rate", trust_remote_code=True, split="complete")

    query_embs = np.array(se_embed["embeddings"])
    doc_emb = np.array(bcb_embed["embeddings"])
    docs = bcb_embed["task_id"]
    retrieval_results = get_top_docs(query_embs, doc_emb, docs)

    Dataset.from_dict({
        "qid": [i for i, _, _ in retrieval_results],
        "tid": [doc for _, doc, _ in retrieval_results],
        "score": [score for _, _, score in retrieval_results],
    }).push_to_hub("bigcode/se_bcb_results")

    # Note: the second load overwrites the first, so only the instruct
    # retrieval results are used below; keep whichever line matches the task.
    retrieval_ds = load_dataset("bigcode/se_bcb_results", trust_remote_code=True, split="train")
    retrieval_ds = load_dataset("bigcode/se_bcb_instruct_results", trust_remote_code=True, split="train")

    # For each task, keep the single best-matching Stack Exchange question,
    # ignoring matches at or below a similarity of 0.7.
    top_results = dict()
    for sample in tqdm(retrieval_ds):
        i, doc, score = sample["qid"], sample["tid"], sample["score"]
        if score > 0.7 and (doc not in top_results or score > top_results[doc][2]):
            top_results[doc] = (i, doc, score)

    top_id = {task_id: (qid, score) for qid, task_id, score in top_results.values()}

    # Hard-subset filters: more than two libraries, a canonical solution
    # longer than 426 characters, and a solve rate below 50%.
    lib_filter = {sample["task_id"] for sample in bcb if len(literal_eval(sample["libs"])) > 2}
    length_filter = {sample["task_id"] for sample in bcb if len(sample["canonical_solution"]) > 426}
    rate_filter = {task["task_id"]: task["solve_rate"] for task in solve_rate if task["solve_rate"] < 50}

    top_tid = top_id.keys() & length_filter & rate_filter.keys() & lib_filter
    # hard_results = read_task_perf(top_tid)

    hard_bcb = bcb.filter(lambda x: x["task_id"] in top_tid)
    hard_bcb_tid = hard_bcb["task_id"]
    # "qid" here is the row index into `se` (it came from enumerate in
    # get_top_docs), so positional select() is the right lookup.
    se_qid = [top_id[_id][0] for _id in hard_bcb_tid]
    se_q = se.select(se_qid)
    se_scores = [top_id[_id][1] for _id in hard_bcb_tid]
    hard_bcb = hard_bcb.add_column("qid", se_qid)
    hard_bcb = hard_bcb.add_column("question", se_q["question"])
    hard_bcb = hard_bcb.add_column("score", se_scores)
    hard_bcb.push_to_hub("bigcode/bigcodebench-hard")
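
# Assumed invocation (repo layout is not shown in this commit): run from the
# repository root so the results/ globs and the local `utils` module resolve,
# with a logged-in Hugging Face account that can push to the bigcode org:
#   python analysis/bcb_hard.py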
