|
11 | 11 | parser = argparse.ArgumentParser() |
12 | 12 | parser.add_argument("--class_path", type=str, default="llava_1_5_7b_hf") |
13 | 13 | parser.add_argument("--task_id", type=str, default="japanese-heron-bench") |
14 | | -parser.add_argument("--openai_model_id", type=str, default="gpt-4o-mini-2024-07-18") |
| 14 | +parser.add_argument("--judge_model", type=str, default="gpt-4o-mini-2024-07-18") |
15 | 15 | parser.add_argument("--batch_size_for_evaluation", type=int, default=10) |
16 | 16 | parser.add_argument("--overwrite", action="store_true") |
17 | 17 | parser.add_argument("--result_dir", type=str, default="result") |
|
22 | 22 | parser.add_argument("--top_p", type=float, default=1.0) |
23 | 23 | parser.add_argument("--do_sample", action="store_true", default=False) |
24 | 24 | parser.add_argument("--use_cache", action="store_true", default=True) |
| 25 | +parser.add_argument( |
| 26 | + "--max_dataset_len", |
| 27 | + type=int, |
| 28 | + default=None, |
| 29 | + help="max data size for evaluation. If None, use all data. Else, use the first n data.", |
| 30 | +) |
| 31 | +parser.add_argument( |
| 32 | + "--metrics", |
| 33 | + type=str, |
| 34 | + default="llm_as_a_judge_heron_bench", |
| 35 | + help="metrics to evaluate. You can specify multiple metrics separated by comma (e.g. --metrics exact_match,rougel).", |
| 36 | +) |
25 | 37 |
|
26 | 38 | args = parser.parse_args() |
27 | 39 |
|
|
36 | 48 |
|
37 | 49 | class_path = args.class_path |
38 | 50 | task_id = args.task_id |
39 | | -openai_model_id = args.openai_model_id |
40 | 51 |
|
41 | 52 | module = importlib.import_module(class_path) |
42 | 53 | model_id = module.VLM.model_id.replace("/", "-") |
43 | 54 |
|
44 | | -task = eval_mm.api.registry.get_task(task_id) |
45 | | -dataset = task.dataset |
| 55 | +task_config = eval_mm.api.task.TaskConfig( |
| 56 | + max_dataset_len=args.max_dataset_len, |
| 57 | + judge_model=args.judge_model, |
| 58 | + batch_size_for_evaluation=args.batch_size_for_evaluation, |
| 59 | +) |
| 60 | +task = eval_mm.api.registry.get_task_cls(task_id)(task_config) |
46 | 61 |
|
47 | 62 | # save the predictions to jsonl file |
48 | 63 | os.makedirs(args.result_dir, exist_ok=True) |
|
57 | 72 |
|
58 | 73 | prediction_result_file_path = os.path.join(prediction_result_dir, f"{model_id}.jsonl") |
59 | 74 |
|
60 | | - |
61 | 75 | # if prediction is already done, load the prediction |
62 | 76 | if os.path.exists(prediction_result_file_path) and not args.overwrite: |
63 | 77 | with open(prediction_result_file_path, "r") as f: |
64 | 78 | preds = [json.loads(line) for line in f] |
| 79 | + assert ( |
| 80 | + len(preds) == len(task.dataset) |
| 81 | + ), f"Prediction result length is not equal to the dataset length. Prediction result length: {len(preds)}, Dataset length: {len(task.dataset)}" |
65 | 82 | print(f"Prediction result loaded from {prediction_result_file_path}") |
66 | 83 | else: |
67 | 84 | model = module.VLM() |
68 | 85 | preds = [] |
69 | | - for doc in tqdm(dataset): |
| 86 | + print(task.dataset) |
| 87 | + for doc in tqdm(task.dataset): |
70 | 88 | # print("doc", doc) |
71 | 89 | image = task.doc_to_visual(doc) |
72 | 90 | text = task.doc_to_text(doc) |
|
90 | 108 | exit() |
91 | 109 | print("Evaluation start") |
92 | 110 | # evaluate the predictions |
93 | | -metrics, eval_results = task.compute_metrics( |
94 | | - preds, model_id=openai_model_id, batch_size=args.batch_size_for_evaluation |
95 | | -) |
| 111 | + |
| 112 | +metrics = args.metrics.split(",") |
| 113 | + |
| 114 | +scores_for_each_metric = {} |
| 115 | + |
| 116 | +for metric in metrics: |
| 117 | + scores_for_each_metric[metric] = task.calc_scores(preds, metric) |
| 118 | + print(f"Scores for {metric}: {scores_for_each_metric[metric]}") |
| 119 | + |
| 120 | +calculated_metrics = {} |
| 121 | + |
| 122 | +for metric in metrics: |
| 123 | + calculated_metrics[metric] = task.gather_scores( |
| 124 | + scores_for_each_metric[metric], metric |
| 125 | + ) |
| 126 | + print(f"{metric}: {calculated_metrics[metric]}") |
96 | 127 |
|
97 | 128 |
|
98 | | -results = task.format_result(preds, eval_results) |
99 | 129 | with open(os.path.join(prediction_result_file_path), "w") as f: |
100 | | - for result in results: |
101 | | - f.write(json.dumps(result, ensure_ascii=False) + "\n") |
| 130 | + for i, pred in enumerate(preds): |
| 131 | + question_id = pred["question_id"] |
| 132 | + text = pred["text"] |
| 133 | + answer = task.doc_to_answer(task.dataset[i]) |
| 134 | + content = {"question_id": question_id, "text": text, "answer": answer} |
| 135 | + for metric in metrics: |
| 136 | + content[metric] = scores_for_each_metric[metric][i] |
| 137 | + f.write(json.dumps(content, ensure_ascii=False) + "\n") |
102 | 138 | print(f"Prediction result saved to {prediction_result_file_path}") |
103 | 139 |
|
104 | 140 | eval_result_file_path = os.path.join(evaluation_result_dir, f"{model_id}.jsonl") |
105 | 141 | with open(eval_result_file_path, "w") as f: |
106 | | - f.write(json.dumps(metrics, ensure_ascii=False) + "\n") |
107 | | - |
108 | | -print(f"Metrics: {metrics}") |
109 | | -print(f"Evaluation result example: {eval_results[0]}") |
| 142 | + f.write(json.dumps(calculated_metrics, ensure_ascii=False) + "\n") |
| 143 | +print(f"Evaluation result saved to {eval_result_file_path}") |
0 commit comments