import pickle
import json
import numpy as np
from tqdm import tqdm
from ast import literal_eval
from glob import glob
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
from datasets import load_dataset, Dataset, Features, Value, Sequence

from utils import *

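# NOTE: `model_info` (used in read_task_perf below) is assumed to be provided by utils.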
def embed_sentences(data, col_name, id_name, model, save_path, push_to_hub=False):
    pool = model.start_multi_process_pool()
    embeddings = model.encode_multi_process(data[col_name], pool=pool)
    # release the worker processes once encoding is done
    model.stop_multi_process_pool(pool)
    qids = data[id_name]
    features = Features({id_name: Value(dtype='string'), 'embeddings': Sequence(Value('float32'))})
    embed_dict = {
        id_name: qids,
        "embeddings": embeddings,
    }
    embed_ds = Dataset.from_dict(embed_dict, features=features)
    if push_to_hub:
        embed_ds.push_to_hub(f"bigcode/{save_path}")
    else:
        embed_ds.save_to_disk(save_path)
    return embed_ds


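# For each query embedding, score every document by dot product and keep the
# single best-matching document and its score.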
def get_top_docs(query_embs, doc_emb, docs):
    scores = np.dot(query_embs, doc_emb.T)
    top_doc_indices = np.argmax(scores, axis=1)
    top_scores = scores[np.arange(len(scores)), top_doc_indices]
    results = [(i, docs[doc_idx], score) for i, (doc_idx, score) in tqdm(enumerate(zip(top_doc_indices, top_scores)))]

    return results


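# Optional helpers (not used in the pipeline below) for trimming retrieval results,
# either by keeping the top k percent of scores or by applying a fixed threshold.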
def filter_top_k_percent(results, k_percent):
    all_scores = [score for _, _, score in results]
    threshold = np.percentile(all_scores, 100 - k_percent)
    filtered_results = [(i, doc, score) for i, doc, score in results if score > threshold]
    return filtered_results


def filter_top_threshold(results, threshold):
    filtered_results = [(i, doc, score) for i, doc, score in results if score > threshold]
    return filtered_results


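# Compute each model's mean pass rate over the selected task ids, reading the
# sanitized (calibrated, if available) eval result JSONs under results/, and
# return the models sorted by score.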
def read_task_perf(top_tid, task="complete"):
    model_results = dict()
    result_files = []
    for model, info in model_info.items():
        if task == "instruct" and (not info["prompted"] or info["name"] in ["Granite-Code-3B-Instruct", "Granite-Code-8B-Instruct"]):
            continue
        task_perf = {f"BigCodeBench/{task_id}": 0 for task_id in range(1140)}
        model = model.replace("/", "--")
        if info["link"].startswith("https://huggingface.co/"):
            model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--")
        try:
            if info["prompted"]:
                files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
                if files:
                    file = files[0]
                else:
                    file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
            else:
                file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
        except:
            # no matching result file for this model
            continue
        with open(file, "r") as f:
            data = json.load(f)
        for task_id, perfs in data["eval"].items():
            status = 1 if perfs[0]["status"] == "pass" else 0
            task_perf[task_id] = status
        model_results[info["name"]] = np.mean([status for tid, status in task_perf.items() if tid in top_tid])
    return sorted(model_results.items(), key=lambda x: x[1], reverse=True)


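# Pipeline: embed Stack Exchange questions and BigCodeBench prompts, retrieve the
# closest task per question, filter to hard tasks, and publish BigCodeBench-Hard.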
if __name__ == "__main__":
    bcb = load_dataset("bigcode/bigcodebench", trust_remote_code=True, split="v0.1.0_hf")
    se = load_dataset("bigcode/stack-exchange-preferences-20230914-clean-anonymization", trust_remote_code=True, split="train")
    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

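    # Embed both corpora and push the embeddings to the Hub (pushing to the
    # bigcode org requires write access).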
    se_embed = embed_sentences(se, "question", "qid", model, "stack-exchange-embeddings-20230914", push_to_hub=True)
    bcb_embed = embed_sentences(bcb, "complete_prompt", "task_id", model, "bigcodebench-doc-embeddings", push_to_hub=True)

    solve_rate = load_dataset("bigcode/bigcodebench-solve-rate", trust_remote_code=True, split="complete")

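    # For each Stack Exchange question, find the most similar BigCodeBench task
    # by embedding dot product.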
    query_embs = np.array(se_embed["embeddings"])
    doc_emb = np.array(bcb_embed["embeddings"])
    docs = bcb_embed["task_id"]
    retrieval_results = get_top_docs(query_embs, doc_emb, docs)

    Dataset.from_dict({
        "qid": [i for i, _, _ in retrieval_results],
        "tid": [doc for _, doc, _ in retrieval_results],
        "score": [score for _, _, score in retrieval_results],
    }).push_to_hub("bigcode/se_bcb_results")

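    # Reload the retrieval results from the Hub; note that the second load
    # overwrites the first, so the instruct-variant results are used below.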
    retrieval_ds = load_dataset("bigcode/se_bcb_results", trust_remote_code=True, split="train")
    retrieval_ds = load_dataset("bigcode/se_bcb_instruct_results", trust_remote_code=True, split="train")

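    # Keep, for each BigCodeBench task, the best-matching question with a
    # similarity score above 0.7.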
    top_results = dict()
    for sample in tqdm(retrieval_ds):
        i, doc, score = sample["qid"], sample["tid"], sample["score"]
        if score > 0.7:
            if doc not in top_results:
                top_results[doc] = (i, doc, score)
            else:
                if score > top_results[doc][2]:
                    top_results[doc] = (i, doc, score)

    top_id = {task_id: (qid, score) for qid, task_id, score in top_results.values()}

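    # Hardness filters: tasks using more than 2 libraries, canonical solutions
    # longer than 426 characters, and a solve rate below 50%.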
    lib_filter = {sample["task_id"] for sample in bcb if len(literal_eval(sample["libs"])) > 2}
    length_filter = {sample["task_id"] for sample in bcb if len(sample["canonical_solution"]) > 426}
    rate_filter = {task["task_id"]: task["solve_rate"] for task in solve_rate if task["solve_rate"] < 50}

    top_tid = top_id.keys() & length_filter & rate_filter.keys() & lib_filter
    # hard_results = read_task_perf(top_tid)

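    # Assemble BigCodeBench-Hard: keep the selected tasks and attach the matched
    # question id, question text, and similarity score.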
    hard_bcb = bcb.filter(lambda x: x["task_id"] in top_tid)
    hard_bcb_tid = hard_bcb["task_id"]
    se_qid = [top_id[_id][0] for _id in hard_bcb_tid]
    se_q = se.select(se_qid)
    se_scores = [top_id[_id][1] for _id in hard_bcb_tid]
    hard_bcb = hard_bcb.add_column("qid", se_qid)
    hard_bcb = hard_bcb.add_column("question", se_q["question"])
    hard_bcb = hard_bcb.add_column("score", se_scores)
    hard_bcb.push_to_hub("bigcode/bigcodebench-hard")