Skip to content

Commit ce5cea9

Browse files
committed
LCBv5Official
1 parent 08aaf4a commit ce5cea9

File tree

2 files changed

+648
-0
lines changed

2 files changed

+648
-0
lines changed
Lines changed: 369 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,369 @@
1+
import copy
2+
import logging
3+
import os
4+
import re
5+
from collections import defaultdict
6+
from concurrent.futures import ThreadPoolExecutor, as_completed
7+
from typing import Any, Dict, List, Optional
8+
9+
import numpy as np
10+
from datasets import Dataset, concatenate_datasets, load_dataset
11+
from lm_eval.api.instance import Instance
12+
from lm_eval.api.model import LM
13+
14+
from eval.task import BaseBenchmark
15+
from huggingface_hub import hf_hub_download
16+
17+
from .livecodebench_utils import lcb_run, map_to_example, post_process_code, translate_private_test_cases
18+
19+
HF_HUB_CACHE = os.environ.get("HF_HUB_CACHE")
20+
if not HF_HUB_CACHE:
21+
print(
22+
"WARNING: HF_HUB_CACHE environment variable is not set, using default cache directory ~/.cache/huggingface/hub for LiveCodeBenchv5 benchmark"
23+
)
24+
25+
26+
def has_code(response):
    """Extract every fenced code block from *response*.

    Matches triple-backtick fences with an optional language tag after the
    opening fence; the fence contents may span multiple lines.  Returns the
    list of code strings found (empty when the response has no fenced code).
    """
    fence_pattern = re.compile(r"```(?:[a-zA-Z]*)\n(.*?)```", re.DOTALL)
    return fence_pattern.findall(response)
31+
32+
33+
# Calculate mean and standard error for all metrics
def calc_stats(values):
    """Return ``(mean, standard_error)`` of *values*.

    The standard error uses the sample standard deviation (``ddof=1``),
    so at least two values are needed for a finite result.
    """
    arr = np.asarray(values, dtype=float)
    sample_mean = arr.mean()
    std_err = arr.std(ddof=1) / np.sqrt(arr.size)
    return sample_mean, std_err
38+
39+
40+
41+
def filter_by_contest_date(example):
    """Keep only problems whose contest month is 2024-08 through 2025-01.

    Compares the ``YYYY-MM`` prefix of the example's ``contest_date`` field
    against the month window used by this LiveCodeBench v5 subset.
    """
    allowed_months = {
        "2024-08", "2024-09", "2024-10",
        "2024-11", "2024-12", "2025-01",
    }
    return example["contest_date"][:7] in allowed_months
44+
45+
46+
class LiveCodeBenchV5OfficialBenchmark(BaseBenchmark):
    """
    LiveCodeBench v5 (official subset) benchmark for evaluating code generation.

    Loads the ``livecodebench/code_generation_lite`` "release_v5" split,
    restricted to contests from 2024-08 through 2025-01, prompts the model for
    Python solutions, extracts fenced code blocks from each response, and
    executes the last code block against the problem's test cases.  Every
    question is attempted ``n_repeat`` times; accuracy is reported as the mean
    (with standard error) across repeats.
    """

    def __init__(
        self,
        debug: bool = False,
        seed: List[int] = [0, 1234, 1234, 1234],
        max_tokens: int = 32768,
        logger: Optional[logging.Logger] = None,
        system_instruction: Optional[str] = None,
    ):
        """
        Initialize LiveCodeBenchV5 benchmark.

        Args:
            debug: If set, only evaluate on the first 10 examples
            seed: Random seeds for reproducibility, one per sampling stream.
                Default is [0, 1234, 1234, 1234] for lm-eval-harness.
                NOTE(review): mutable default list — it is only read (never
                mutated) in this class, so sharing across instances is benign.
            max_tokens: Maximum number of new tokens to generate per response
            logger: Optional logger instance
            system_instruction: Optional system instruction for the model
        """
        super().__init__(logger=logger, system_instruction=system_instruction)
        self.debug = debug
        self.max_new_tokens = max_tokens
        self.seed = seed
        # Number of independent generation/evaluation repeats per question.
        self.n_repeat = 3

    def generate_responses(self, model: LM) -> Optional[Dict[str, Any]]:
        """
        Generate solution completions using the provided model.

        Each question is generated ``n_repeat`` times; the seed list is
        offset by the repeat index so repeats use distinct seeds.

        Args:
            model: Language model

        Returns:
            Dictionary with key ``"examples"`` — each example augmented with
            ``model_outputs`` (raw text per repeat) and ``model_answers``
            (extracted code blocks per repeat) — or None for non-primary ranks.
        """
        examples = self.load_questions()
        if self.debug:
            examples = examples[:10]

        all_outputs = []

        for i in range(self.n_repeat):
            all_instances = []
            # Offset every base seed by the repeat index so each repeat
            # samples a different stream.
            seed = [s + i for s in self.seed]

            for idx, example in enumerate(examples):
                # stdin-style problems ask for a function that reads stdin and
                # is invoked after definition; functional problems ask for the
                # function body only.
                if example["is_stdin"]:
                    prompt_text = (
                        "Generate an executable Python function generated from the given prompt. The function should take stdin as input and print the output. Simply call the function after the definition."
                        + example["prompt"]
                    )
                else:
                    prompt_text = (
                        "Generate an executable Python function generated from the given prompt. Return the function body without invoking it at the final solution."
                        + example["prompt"]
                    )
                messages = [{"role": "user", "content": prompt_text}]

                templated_messages = self._prepare_messages(messages, model)

                instance = Instance(
                    "generate_until",
                    example,
                    (
                        templated_messages,
                        {
                            "do_sample": False,
                            "max_new_tokens": self.max_new_tokens,
                            "temperature": 0.7,
                            "seed": seed,
                        },
                    ),
                    idx,
                )
                instance.repeat_idx = i
                all_instances.append(instance)

            # Generate model responses
            self.logger.info("Generating responses for LiveCodeBenchV5...")
            outputs = self.compute(model, all_instances)
            all_outputs.append(outputs)

        # Return None early for non-primary ranks
        if model.rank != 0:
            return None

        examples_list = []

        # Transpose: zip(*all_outputs) groups the n_repeat outputs belonging
        # to each example together.
        for example, outputs in zip(examples, zip(*all_outputs)):
            example["model_outputs"] = list(outputs)
            # Extracted fenced code blocks (possibly an empty list) per repeat.
            example["model_answers"] = [has_code(o) for o in outputs]
            examples_list.append(example)

        return {"examples": examples_list}

    @staticmethod
    def check_correctness(problem: Dict, completion: str, timeout: float, is_extracted: bool = False) -> bool:
        """
        Evaluates the functional correctness of a completion by running the test
        suite provided in the problem.

        Args:
            problem: LiveCodeBench example carrying the test cases.
            completion: Candidate solution source code.
            timeout: Per-run timeout (seconds) forwarded to ``lcb_run``.
            is_extracted: Whether the completion is a bare function body
                (used for non-stdin problems).

        Returns:
            True only when the test suite is non-empty and every test passed.
        """
        result_list = lcb_run(problem, completion, timeout, is_extracted)
        # The first element of each per-test result tuple is the pass flag.
        details = [r[0] for r in result_list]
        all_passed = all(details)

        result = ""
        if result_list and all_passed:
            result = "passed"

        return result == "passed"

    def evaluate_single_example(self, example):
        """Helper function to evaluate a single example.

        Runs the *last* extracted code block from ``example["model_answer"]``
        against the problem's tests.  Never raises: all failures are reported
        via a result dict with keys ``content`` / ``difficulty`` /
        ``correctness`` / ``reason``.
        """
        try:
            response_entry = {
                "content": example["model_answer"],
                "difficulty": example["difficulty"],
                "correctness": None,
                "reason": None,
            }

            code_filter_result = example["model_answer"]

            # No fenced code block was found in the model output.
            if not code_filter_result or len(code_filter_result) == 0:
                response_entry["correctness"] = False
                response_entry["reason"] = "Does not contain code component."
                return response_entry

            try:
                # Grade only the last code block in the response.
                last_code = code_filter_result[-1]
                # Deep copy so test execution cannot mutate the shared example.
                problem_to_check = copy.deepcopy(example)

                # Add debugging
                self.logger.debug(f"Evaluating {example['difficulty']} problem...")

                # Add timeout handling
                curr_res = self.check_correctness(
                    problem=problem_to_check,
                    completion=post_process_code(last_code),
                    timeout=6,
                    is_extracted=not problem_to_check["is_stdin"],
                )

                # Log the result
                self.logger.debug(f"Result for {example['difficulty']}: {curr_res}")

                response_entry["correctness"] = curr_res
                response_entry["reason"] = "" if curr_res else "Code is incorrect."

            except Exception as e:
                self.logger.error(f"Error evaluating {example['difficulty']} example: {str(e)}")
                response_entry["correctness"] = False
                response_entry["reason"] = f"Evaluation error: {str(e)}"

            return response_entry

        except Exception as outer_e:
            # Last-resort guard so one bad example cannot kill a worker thread.
            self.logger.error(f"Outer error in evaluate_single_example: {str(outer_e)}")
            return {
                "content": example.get("model_answer"),
                "difficulty": example.get("difficulty"),
                "correctness": False,
                "reason": f"Critical error: {str(outer_e)}",
            }

    def evaluate_responses(self, responses: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
        """Evaluate the generated solution completions in parallel using threads.

        Args:
            responses: Output of :meth:`generate_responses`; ``None`` on
                non-primary ranks.

        Returns:
            Aggregated metrics (overall and per-difficulty accuracy averaged
            across repeats, raw per-repeat metrics, and ``run_stats`` for
            ``precomputed_hf_lm.py`` compatibility), or ``None`` when
            ``responses`` is ``None``.
        """
        # Handle None result from non-primary ranks
        if responses is None:
            return None

        self.logger.info(f"Evaluating {len(responses['examples'])} examples...")
        self.logger.warning(f"Expect some output leaks from the code / test execution into stdout")

        # First, organize completions by repeat index
        examples_by_repeat = defaultdict(list)
        for example in responses["examples"]:
            for i, (output, answers) in enumerate(zip(example["model_outputs"], example["model_answers"])):
                # Create a copy of the original example and update with the specific completion
                example_copy = example.copy()  # Make a shallow copy of the example
                example_copy["model_answer"] = answers
                example_copy["model_output"] = output
                # Remove the lists of all outputs/answers to avoid confusion
                example_copy.pop("model_outputs", None)
                example_copy.pop("model_answers", None)
                examples_by_repeat[i].append(example_copy)

        # Evaluate each set of completions separately
        all_metrics = []
        run_stats = []
        num_questions = len(responses["examples"])

        for repeat_idx, examples in examples_by_repeat.items():
            # Use ThreadPoolExecutor with limited concurrency
            results = []
            with ThreadPoolExecutor(max_workers=32) as executor:
                future_to_example = {}
                for i, example in enumerate(examples):
                    future = executor.submit(self.evaluate_single_example, example)
                    future_to_example[future] = (i, example)

                # Collect results as they complete; index by submission order
                # so results stay aligned with `examples`.
                results = [None] * len(examples)
                for future in as_completed(future_to_example):
                    idx, example = future_to_example[future]
                    try:
                        result = future.result()
                        results[idx] = (result, example)
                    except Exception as e:
                        self.logger.error(f"Future error for example {idx}: {str(e)}")
                        results[idx] = (
                            {
                                "content": example["model_answer"],
                                "difficulty": example["difficulty"],
                                "correctness": False,
                                "reason": f"Future error: {str(e)}",
                            },
                            example,
                        )

            # Calculate metrics for this repeat
            total_correct = sum(1 for result, _ in results if result["correctness"])
            total_finish = len(results)

            per_difficulty_correct = defaultdict(int)
            per_difficulty_total = defaultdict(int)

            # Booleans sum as 0/1 when accumulated into the int defaultdict.
            for result, example in results:
                per_difficulty_correct[example["difficulty"]] += result["correctness"]
                per_difficulty_total[example["difficulty"]] += 1

            metrics = {
                "total_correct": total_correct,
                "total_finish": total_finish,
                "accuracy": total_correct / total_finish,
                "per_difficulty_correct": dict(per_difficulty_correct),
                "per_difficulty_total": dict(per_difficulty_total),
            }

            # Add per-difficulty accuracies
            for difficulty in per_difficulty_correct.keys():
                metrics[f"accuracy_{difficulty}"] = (
                    per_difficulty_correct[difficulty] / per_difficulty_total[difficulty]
                )

            all_metrics.append(metrics)

            # Add to run_stats for precomputed_hf_lm.py compatibility
            run_stats.append(
                {
                    "repetition": repeat_idx + 1,
                    "num_total": total_finish,
                    "num_solved": total_correct,
                    "accuracy": total_correct / total_finish,
                }
            )

        final_metrics = {}

        # Calculate stats for overall accuracy
        acc_values = [m["accuracy"] for m in all_metrics]
        mean_acc, stderr_acc = calc_stats(acc_values)
        final_metrics["accuracy_avg"] = mean_acc
        final_metrics["accuracy_std_err"] = stderr_acc
        self.logger.info(f"Overall accuracy: {mean_acc:.2%} ± {stderr_acc:.2%}")

        # Calculate stats for each difficulty level
        # NOTE(review): difficulty keys come from the first repeat only —
        # assumes every repeat covers the same difficulty set (true when all
        # repeats evaluate all questions).
        difficulties = all_metrics[0]["per_difficulty_correct"].keys()
        for diff in difficulties:
            acc_values = [m[f"accuracy_{diff}"] for m in all_metrics]
            mean_acc, stderr_acc = calc_stats(acc_values)
            final_metrics[f"accuracy_{diff}_avg"] = mean_acc
            final_metrics[f"accuracy_{diff}_std_err"] = stderr_acc

        # Log results
        for diff in difficulties:
            mean = final_metrics[f"accuracy_{diff}_avg"]
            stderr = final_metrics[f"accuracy_{diff}_std_err"]
            self.logger.info(f"Accuracy {diff}: {mean:.2%} ± {stderr:.2%}")

        # Include raw results and examples in final metrics
        final_metrics["raw_metrics"] = all_metrics
        final_metrics["examples"] = [result for result, _ in results]  # Include last run's examples

        # Add compatibility with precomputed_hf_lm.py
        solved_avg = np.mean([result["num_solved"] for result in run_stats])
        final_metrics.update(
            {
                "num_total": num_questions,
                "solved_avg": solved_avg,
                "run_stats": run_stats,
                "num_repeat": self.n_repeat,
            }
        )

        return final_metrics

    def load_questions(self) -> Dataset:
        """Load LiveCodeBenchV5 questions from source.

        Downloads the ``code_generation_lite`` "release_v5" test split,
        filters it to the 2024-08..2025-01 contest window, decodes the
        encoded private test cases, and maps each row into the example
        schema used by this benchmark.
        """
        self.logger.info("Loading LiveCodeBenchV5 questions from source and converting to dataset...")
        cpu_count = os.cpu_count()
        lcb_codegen = load_dataset("livecodebench/code_generation_lite", version_tag="release_v5", cache_dir="./")['test']
        ds = lcb_codegen.filter(filter_by_contest_date)
        processed_shards = []
        # Process in 4 shards — presumably to bound peak memory while
        # decoding the large private test cases; TODO confirm.
        num_shards = 4
        for i in range(num_shards):
            shard = ds.shard(num_shards=num_shards, index=i)
            # Private test cases are stored encoded; decode them in parallel.
            shard = shard.map(
                lambda example: {"private_test_cases": translate_private_test_cases(example["private_test_cases"])},
                num_proc=cpu_count,
            )
            shard = shard.map(map_to_example, remove_columns=ds.column_names)
            processed_shards.append(shard)
        ds = concatenate_datasets(processed_shards)
        return ds

0 commit comments

Comments
 (0)