
Commit 31d5fce

Add V*Bench dataset (#165)
Adds the V*Bench dataset from the paper "V*: Guided Visual Search as a Core Mechanism in Multimodal LLMs" (https://arxiv.org/abs/2312.14135).
1 parent 6830fca · commit 31d5fce

File tree: 3 files changed, +166 -0 lines changed

eureka_ml_insights/prompt_templates/vstar_bench_templates/extract_answer.jinja

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
We asked a large language model the following question:

BEGIN INITIAL QUESTION

Question: {{initial_prompt}}

END INITIAL QUESTION

The model gave the following answer:

BEGIN RAW MODEL ANSWER

Raw Model answer: {{model_output_raw}}

END RAW MODEL ANSWER

Please extract the final answer from the raw model answer according to the specified format. Do not mention any other details; only extract the letter of the option corresponding to the final answer. If you are not able to find a final answer, output None.
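
For reference, this template can be rendered outside the pipeline with the jinja2 package. Below is a minimal sketch, assuming jinja2 is installed and that the working directory is the repo root; the sample question and answer values are invented for illustration:

from jinja2 import Template

# Load the answer-extraction template added in this commit.
with open("eureka_ml_insights/prompt_templates/vstar_bench_templates/extract_answer.jinja") as f:
    template = Template(f.read())

# The two variables match the placeholders in the template above;
# the sample values are made up for illustration.
prompt = template.render(
    initial_prompt="What color is the person's jacket? (A) red (B) blue",
    model_output_raw="Looking closely, the jacket is blue, so the answer is (B).",
)
print(prompt)  # the full extraction prompt to send to the judge model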

eureka_ml_insights/user_configs/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -97,6 +97,7 @@
     SPATIAL_MAP_REPORTING_PIPELINE,
     SPATIAL_MAP_TEXTONLY_PIPELINE,
 )
+from .vstar_bench import VSTAR_BENCH_PIPELINE
 
 __all__ = [
     LLM_EXTRACTION_SUBPIPELINE_MIXIN,
@@ -175,4 +176,5 @@
     NPHARD_SAT_HYBRIDEXTRACT_PIPELINE,
     FLICKR30K_PIPELINE,
     NOCAPS_PIPELINE,
+    VSTAR_BENCH_PIPELINE,
 ]
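
Once exported here, the pipeline can be selected by class name; the module docstring in the file below notes that the class name is passed to the main.py script. A hypothetical invocation follows; the exact flag names are assumptions about main.py's argument parser, and <your_model_config> is a placeholder:

python main.py --exp_config VSTAR_BENCH_PIPELINE --model_config <your_model_config> --exp_logdir vstar_run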
eureka_ml_insights/user_configs/vstar_bench.py

Lines changed: 147 additions & 0 deletions
@@ -0,0 +1,147 @@
import os

from eureka_ml_insights.configs.experiment_config import ExperimentConfig
from eureka_ml_insights.core import EvalReporting, Inference, PromptProcessing, DataProcessing

from eureka_ml_insights.data_utils import (
    HFDataReader,
    MMDataLoader,
    DataLoader,
    DataReader,
    SequenceTransform,
    ColumnRename,
    AddColumn,
)
from eureka_ml_insights.metrics import CountAggregator, SubstringExistsMatch

from eureka_ml_insights.configs import (
    AggregatorConfig,
    DataProcessingConfig,
    DataSetConfig,
    EvalReportingConfig,
    InferenceConfig,
    MetricConfig,
    ModelConfig,
    PipelineConfig,
    PromptProcessingConfig,
)
from eureka_ml_insights.configs.model_configs import OAI_GPT4_1106_PREVIEW_CONFIG as PERSONAL_GPT4O

"""This file contains example user-defined configuration classes for the V*Bench task.
To define a new configuration, create a class that directly or indirectly inherits from
UserDefinedConfig and implements the user_init method.
You can inherit from one of the existing user-defined classes below and override the
necessary attributes to reduce the amount of code you need to write.

The user-defined configuration classes define your desired *pipeline*, which can include
any number of *components*. Find *component* options in the core module.

Pass the name of the class to the main.py script to run the pipeline.
"""


class VSTAR_BENCH_PIPELINE(ExperimentConfig):
    """Defines an eval pipeline with inference and metric report components
    on the V*Bench dataset."""

    def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None) -> PipelineConfig:
        # Download V*Bench from HuggingFace.
        self.data_processing_comp = PromptProcessingConfig(
            component_type=PromptProcessing,
            data_reader_config=DataSetConfig(
                HFDataReader,
                {
                    "path": "tmlabonte/vstar_bench",
                    "split": "test",
                },
            ),
            output_dir=os.path.join(self.log_dir, "data_processing_output"),
        )

        # Perform inference with the desired model on V*Bench.
        self.inference_comp = InferenceConfig(
            component_type=Inference,
            model_config=model_config,
            data_loader_config=DataSetConfig(
                MMDataLoader,
                {
                    "path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl"),
                },
            ),
            output_dir=os.path.join(self.log_dir, "inference_result"),
            resume_from=resume_from,
            max_concurrent=8,
        )

        # Prepare the inference result for LLM answer extraction.
        self.preeval_data_post_processing_comp = PromptProcessingConfig(
            component_type=PromptProcessing,
            data_reader_config=DataSetConfig(
                DataReader,
                {
                    "path": os.path.join(self.inference_comp.output_dir, "inference_result.jsonl"),
                    "format": ".jsonl",
                    "transform": SequenceTransform(
                        [
                            ColumnRename(name_mapping={
                                "prompt": "initial_prompt",
                                "model_output": "model_output_raw",
                            }),
                            AddColumn(column_name="prompt"),
                        ]
                    ),
                },
            ),
            prompt_template_path=os.path.join(
                os.path.dirname(__file__),
                "../prompt_templates/vstar_bench_templates/extract_answer.jinja",
            ),
            output_dir=os.path.join(self.log_dir, "preeval_data_post_processing_output"),
        )

        # Extract the final answer using an LLM judge.
        self.llm_answer_extract_comp = InferenceConfig(
            component_type=Inference,
            model_config=PERSONAL_GPT4O,
            data_loader_config=DataSetConfig(
                DataLoader,
                {"path": os.path.join(self.preeval_data_post_processing_comp.output_dir, "transformed_data.jsonl")},
            ),
            output_dir=os.path.join(self.log_dir, "llm_answer_extract_inference_result"),
            max_concurrent=8,
        )

        # Evaluate the extracted answer.
        self.evalreporting_comp = EvalReportingConfig(
            component_type=EvalReporting,
            data_reader_config=DataSetConfig(
                DataReader,
                {
                    "path": os.path.join(self.llm_answer_extract_comp.output_dir, "inference_result.jsonl"),
                    "format": ".jsonl",
                },
            ),
            metric_config=MetricConfig(SubstringExistsMatch),
            aggregator_configs=[
                AggregatorConfig(
                    CountAggregator, {"column_names": ["SubstringExistsMatch_result"], "normalize": True}
                ),
                AggregatorConfig(
                    CountAggregator,
                    {"column_names": ["SubstringExistsMatch_result"], "group_by": "category", "normalize": True},
                ),
            ],
            output_dir=os.path.join(self.log_dir, "eval_report"),
        )

        # Configure the pipeline.
        return PipelineConfig(
            [
                self.data_processing_comp,
                self.inference_comp,
                self.preeval_data_post_processing_comp,
                self.llm_answer_extract_comp,
                self.evalreporting_comp,
            ],
            self.log_dir,
        )
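
For intuition about the reporting stage, here is a small self-contained sketch of what the SubstringExistsMatch metric and the two CountAggregator configurations above are assumed to compute. The row shape and field names are illustrative assumptions, not the library's actual internals:

# Illustrative sketch only: approximates the assumed semantics of
# SubstringExistsMatch plus normalized CountAggregators on toy rows
# shaped like the post-extraction inference results.
from collections import Counter

rows = [
    {"model_output": "B", "ground_truth": "B", "category": "direct_attributes"},
    {"model_output": "A", "ground_truth": "C", "category": "relative_position"},
    {"model_output": "C", "ground_truth": "C", "category": "relative_position"},
]

# SubstringExistsMatch (assumed): pass when the ground truth appears as a
# substring of the extracted answer.
for row in rows:
    row["SubstringExistsMatch_result"] = row["ground_truth"] in row["model_output"]

# CountAggregator with normalize=True (assumed): fraction of each result value.
overall = Counter(row["SubstringExistsMatch_result"] for row in rows)
total = sum(overall.values())
print({k: v / total for k, v in overall.items()})
# e.g. {True: 0.667, False: 0.333}

# The second aggregator additionally groups by "category" before normalizing,
# yielding per-category accuracy.
by_category = {}
for row in rows:
    by_category.setdefault(row["category"], Counter())[row["SubstringExistsMatch_result"]] += 1
for category, counts in by_category.items():
    n = sum(counts.values())
    print(category, {k: v / n for k, v in counts.items()})
# e.g. direct_attributes {True: 1.0}; relative_position {False: 0.5, True: 0.5}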
