Skip to content

Commit bbaf0cc

Browse files
committed
feat: add failed tasks
1 parent 7561435 commit bbaf0cc

File tree

1 file changed

+7
-0
lines changed

1 file changed

+7
-0
lines changed

bigcodebench/evaluate.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -134,6 +134,7 @@ def evaluate(flags):
134134
expected_time = {task_id: None for task_id in problems}
135135

136136
gt_pass_rate = np.mean([1 if v is not None else 0 for k, v in expected_time.items() if k in problems])
137+
failed_tasks = [k for k, v in expected_time.items() if v is None and k in problems]
137138

138139
if os.path.isfile(result_path):
139140
print(f"Load from previous results from {result_path}")
@@ -150,6 +151,9 @@ def evaluate(flags):
150151
cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red")
151152
return
152153

154+
if len(failed_tasks) > 0:
155+
cprint(f"Failed tasks: {failed_tasks}", "red")
156+
153157
results = {
154158
"date": datetime.now().strftime("%Y-%m-%d %H:%M"),
155159
"eval": {},
@@ -259,6 +263,9 @@ def stucking_checker():
259263
cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}", "green")
260264
else:
261265
cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red")
266+
267+
if len(failed_tasks) > 0:
268+
cprint(f"Failed tasks: {failed_tasks}", "red")
262269

263270
for k, v in pass_at_k.items():
264271
cprint(f"{k}:\t{v:.3f}", "green")

0 commit comments

Comments (0)