
Commit 31d5fce

Add V*Bench dataset (#165)
Adds the V*Bench dataset from the paper "V*: Guided Visual Search as a Core Mechanism in Multimodal LLMs" (https://arxiv.org/abs/2312.14135).
1 parent 6830fca · commit 31d5fce

File tree: 3 files changed, +166 -0 lines changed

eureka_ml_insights/prompt_templates/vstar_bench_templates/extract_answer.jinja

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
We asked a large language model the following question:

BEGIN INITIAL QUESTION

Question: {{initial_prompt}}

END INITIAL QUESTION

The model gave the following answer:

BEGIN RAW MODEL ANSWER

Raw Model answer: {{model_output_raw}}

END RAW MODEL ANSWER

Please extract the final answer from the raw model answer according to the specified format. Do not mention any other details; only extract the letter of the option corresponding to the final answer. If you are not able to find a final answer, output None.
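
For reference, this template can be rendered outside the pipeline with the jinja2 package. Below is a minimal sketch, assuming jinja2 is installed and that the working directory is the repo root; the sample question and answer values are invented for illustration:

from jinja2 import Template

# Load the answer-extraction template added in this commit.
with open("eureka_ml_insights/prompt_templates/vstar_bench_templates/extract_answer.jinja") as f:
    template = Template(f.read())

# The two variables match the placeholders in the template above;
# the sample values are made up for illustration.
prompt = template.render(
    initial_prompt="What color is the person's jacket? (A) red (B) blue",
    model_output_raw="Looking closely, the jacket is blue, so the answer is (B).",
)
print(prompt)  # the full extraction prompt to send to the judge model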

eureka_ml_insights/user_configs/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -97,6 +97,7 @@
     SPATIAL_MAP_REPORTING_PIPELINE,
     SPATIAL_MAP_TEXTONLY_PIPELINE,
 )
+from .vstar_bench import VSTAR_BENCH_PIPELINE
 
 __all__ = [
     LLM_EXTRACTION_SUBPIPELINE_MIXIN,
@@ -175,4 +176,5 @@
     NPHARD_SAT_HYBRIDEXTRACT_PIPELINE,
     FLICKR30K_PIPELINE,
     NOCAPS_PIPELINE,
+    VSTAR_BENCH_PIPELINE,
 ]
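
Once exported here, the pipeline can be selected by class name; the module docstring in the file below notes that the class name is passed to the main.py script. A hypothetical invocation follows; the exact flag names are assumptions about main.py's argument parser, and <your_model_config> is a placeholder:

python main.py --exp_config VSTAR_BENCH_PIPELINE --model_config <your_model_config> --exp_logdir vstar_run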
eureka_ml_insights/user_configs/vstar_bench.py

Lines changed: 147 additions & 0 deletions
@@ -0,0 +1,147 @@
import os

from eureka_ml_insights.configs.experiment_config import ExperimentConfig
from eureka_ml_insights.core import EvalReporting, Inference, PromptProcessing, DataProcessing

from eureka_ml_insights.data_utils import (
    HFDataReader,
    MMDataLoader,
    DataLoader,
    DataReader,
    SequenceTransform,
    ColumnRename,
    AddColumn,
)
from eureka_ml_insights.metrics import CountAggregator, SubstringExistsMatch

from eureka_ml_insights.configs import (
    AggregatorConfig,
    DataProcessingConfig,
    DataSetConfig,
    EvalReportingConfig,
    InferenceConfig,
    MetricConfig,
    ModelConfig,
    PipelineConfig,
    PromptProcessingConfig,
)
from eureka_ml_insights.configs.model_configs import OAI_GPT4_1106_PREVIEW_CONFIG as PERSONAL_GPT4O

"""This file contains example user-defined configuration classes for the V*Bench task.
To define a new configuration, create a class that directly or indirectly inherits from
UserDefinedConfig and implements the user_init method.
You can inherit from one of the existing user-defined classes below and override the
necessary attributes to reduce the amount of code you need to write.

The user-defined configuration classes define your desired *pipeline*, which can include
any number of *components*. Find *component* options in the core module.

Pass the name of the class to the main.py script to run the pipeline.
"""


class VSTAR_BENCH_PIPELINE(ExperimentConfig):
    """Defines an eval pipeline with inference and metric report components
    on the V*Bench dataset."""

    def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None) -> PipelineConfig:
        # Download V*Bench from HuggingFace.
        self.data_processing_comp = PromptProcessingConfig(
            component_type=PromptProcessing,
            data_reader_config=DataSetConfig(
                HFDataReader,
                {
                    "path": "tmlabonte/vstar_bench",
                    "split": "test",
                },
            ),
            output_dir=os.path.join(self.log_dir, "data_processing_output"),
        )

        # Perform inference with the desired model on V*Bench.
        self.inference_comp = InferenceConfig(
            component_type=Inference,
            model_config=model_config,
            data_loader_config=DataSetConfig(
                MMDataLoader,
                {
                    "path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl"),
                },
            ),
            output_dir=os.path.join(self.log_dir, "inference_result"),
            resume_from=resume_from,
            max_concurrent=8,
        )

        # Prepare the inference result for LLM answer extraction.
        self.preeval_data_post_processing_comp = PromptProcessingConfig(
            component_type=PromptProcessing,
            data_reader_config=DataSetConfig(
                DataReader,
                {
                    "path": os.path.join(self.inference_comp.output_dir, "inference_result.jsonl"),
                    "format": ".jsonl",
                    "transform": SequenceTransform(
                        [
                            ColumnRename(name_mapping={
                                "prompt": "initial_prompt",
                                "model_output": "model_output_raw",
                            }),
                            AddColumn(column_name="prompt"),
                        ]
                    ),
                },
            ),
            prompt_template_path=os.path.join(
                os.path.dirname(__file__),
                "../prompt_templates/vstar_bench_templates/extract_answer.jinja",
            ),
            output_dir=os.path.join(self.log_dir, "preeval_data_post_processing_output"),
        )

        # Extract the final answer using an LLM judge.
        self.llm_answer_extract_comp = InferenceConfig(
            component_type=Inference,
            model_config=PERSONAL_GPT4O,
            data_loader_config=DataSetConfig(
                DataLoader,
                {"path": os.path.join(self.preeval_data_post_processing_comp.output_dir, "transformed_data.jsonl")},
            ),
            output_dir=os.path.join(self.log_dir, "llm_answer_extract_inference_result"),
            max_concurrent=8,
        )

        # Evaluate the extracted answer.
        self.evalreporting_comp = EvalReportingConfig(
            component_type=EvalReporting,
            data_reader_config=DataSetConfig(
                DataReader,
                {
                    "path": os.path.join(self.llm_answer_extract_comp.output_dir, "inference_result.jsonl"),
                    "format": ".jsonl",
                },
            ),
            metric_config=MetricConfig(SubstringExistsMatch),
            aggregator_configs=[
                AggregatorConfig(
                    CountAggregator, {"column_names": ["SubstringExistsMatch_result"], "normalize": True}
                ),
                AggregatorConfig(
                    CountAggregator,
                    {"column_names": ["SubstringExistsMatch_result"], "group_by": "category", "normalize": True},
                ),
            ],
            output_dir=os.path.join(self.log_dir, "eval_report"),
        )

        # Configure the pipeline.
        return PipelineConfig(
            [
                self.data_processing_comp,
                self.inference_comp,
                self.preeval_data_post_processing_comp,
                self.llm_answer_extract_comp,
                self.evalreporting_comp,
            ],
            self.log_dir,
        )
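
For intuition about the reporting stage, here is a small self-contained sketch of what the SubstringExistsMatch metric and the two CountAggregator configurations above are assumed to compute. The row shape and field names are illustrative assumptions, not the library's actual internals:

# Illustrative sketch only: approximates the assumed semantics of
# SubstringExistsMatch plus normalized CountAggregators on toy rows
# shaped like the post-extraction inference results.
from collections import Counter

rows = [
    {"model_output": "B", "ground_truth": "B", "category": "direct_attributes"},
    {"model_output": "A", "ground_truth": "C", "category": "relative_position"},
    {"model_output": "C", "ground_truth": "C", "category": "relative_position"},
]

# SubstringExistsMatch (assumed): pass when the ground truth appears as a
# substring of the extracted answer.
for row in rows:
    row["SubstringExistsMatch_result"] = row["ground_truth"] in row["model_output"]

# CountAggregator with normalize=True (assumed): fraction of each result value.
overall = Counter(row["SubstringExistsMatch_result"] for row in rows)
total = sum(overall.values())
print({k: v / total for k, v in overall.items()})
# e.g. {True: 0.667, False: 0.333}

# The second aggregator additionally groups by "category" before normalizing,
# yielding per-category accuracy.
by_category = {}
for row in rows:
    by_category.setdefault(row["category"], Counter())[row["SubstringExistsMatch_result"]] += 1
for category, counts in by_category.items():
    n = sum(counts.values())
    print(category, {k: v / n for k, v in counts.items()})
# e.g. direct_attributes {True: 1.0}; relative_position {False: 0.5, True: 0.5}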
