
Commit f948f57

michaelharrisonmai and Michael Harrison authored
adding vision caption evals (#160)
Adding Flickr30K and NoCaps evals. Each of these datasets contains images and 5-10 sample captions per image. The eval asks an LLM judge to score a new caption from 0 to 5, given the sample captions.

Co-authored-by: Michael Harrison <[email protected]>
1 parent 93e2f85 commit f948f57

File tree

eureka_ml_insights/prompt_templates/flickr30k_templates/scoring_prompt.jinja
eureka_ml_insights/prompt_templates/nocaps_templates/scoring_prompt.jinja
eureka_ml_insights/user_configs/__init__.py
eureka_ml_insights/user_configs/flickr30k.py
eureka_ml_insights/user_configs/nocaps.py

5 files changed: +302 −0 lines changed
eureka_ml_insights/prompt_templates/flickr30k_templates/scoring_prompt.jinja

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
Your task is to evaluate a student's ability to generate a high quality caption for an image. You are not provided with the image itself, but you are given example captions which were deemed to accurately describe the image. With this information, rate the student's caption on a scale of 0 to 5. You may think about how to score the student, then write your final score in the form SCORE: <your score here>

EXAMPLE CAPTIONS: {{ caption }}

STUDENT CAPTION: {{ response }}
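
For illustration, a minimal sketch of how this template gets rendered, assuming the standard jinja2 API; the caption list and student response below are invented, and the template string is abbreviated to the two placeholder lines:

# Abbreviated template: only the placeholder lines from the file above.
from jinja2 import Template

template = Template("EXAMPLE CAPTIONS: {{ caption }}\n\nSTUDENT CAPTION: {{ response }}")

# Invented example values; in the pipeline, "caption" holds the dataset's
# reference captions and "response" holds the evaluated model's caption.
print(template.render(
    caption=["Two dogs run across a grassy field.", "A pair of dogs playing outdoors."],
    response="Two dogs chase each other on the grass.",
))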
eureka_ml_insights/prompt_templates/nocaps_templates/scoring_prompt.jinja

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
Your task is to evaluate a student's ability to generate a high quality caption for an image. You are not provided with the image itself, but you are given example captions which were deemed to accurately describe the image. With this information, rate the student's caption on a scale of 0 to 5. You may think about how to score the student, then write your final score in the form SCORE: <your score here>

EXAMPLE CAPTIONS: {{ annotations_captions }}

STUDENT CAPTION: {{ response }}

This template is identical to the Flickr30K one except that the example captions come from the NoCaps annotations_captions column.

eureka_ml_insights/user_configs/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -13,6 +13,7 @@
 from .dna import DNA_PIPELINE
 from .drop import Drop_Experiment_Pipeline
 from .flenqa import FlenQA_Experiment_Pipeline
+from .flickr30k import FLICKR30K_PIPELINE
 from .geometer import GEOMETER_PIPELINE
 from .gpqa import GPQA_Experiment_Pipeline, GPQA_PIPELINE_5Run
 from .gsm8k import GSM8K_MUTATED_PIPELINE, GSM8K_PIPELINE, GSMSYMBOLIC_PIPELINE
@@ -62,6 +63,7 @@
 from .mathvision import MATHVISION_PIPELINE
 from .mathvista import MATHVISTA_PIPELINE
 from .mmmu import MMMU_BASELINE_PIPELINE
+from .nocaps import NOCAPS_PIPELINE
 from .nondeterminism import (
     Geo_Nondeterminism,
     IFEval_Nondeterminism,
@@ -169,4 +171,6 @@
     NPHARD_SAT_PIPELINE,
     NPHARD_SAT_PIPELINE_MULTIPLE_RUNS,
     NPHARD_SAT_HYBRIDEXTRACT_PIPELINE,
+    FLICKR30K_PIPELINE,
+    NOCAPS_PIPELINE,
 ]
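
After this registration, both pipeline configs can be imported from the package (names taken directly from the diff above); how the framework's entry point then consumes them is not shown in this commit:

# Both pipelines are now exposed at the package level.
from eureka_ml_insights.user_configs import FLICKR30K_PIPELINE, NOCAPS_PIPELINE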
eureka_ml_insights/user_configs/flickr30k.py

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
"""This file contains an implementation of the flickr30k eval."""

import os
from typing import Any

from eureka_ml_insights.core import (
    EvalReporting,
    Inference,
    PromptProcessing,
)
from eureka_ml_insights.data_utils import (
    AddColumnAndData,
    AddColumn,
    CopyColumn,
    ColumnRename,
    DataReader,
    HFDataReader,
    MapStringsTransform,
    MMDataLoader,
    SamplerTransform,
    SequenceTransform,
)

from eureka_ml_insights.configs import (
    AggregatorConfig,
    DataSetConfig,
    EvalReportingConfig,
    InferenceConfig,
    ModelConfig,
    PipelineConfig,
    PromptProcessingConfig,
)

from eureka_ml_insights.metrics.reports import AverageAggregator, ValueFilteredAggregator
from eureka_ml_insights.configs import ExperimentConfig

# Judge model; note the alias points at a GPT-4 1106 preview config.
from eureka_ml_insights.configs.model_configs import OAI_GPT4_1106_PREVIEW_CONFIG as PERSONAL_GPT4O


class FLICKR30K_PIPELINE(ExperimentConfig):
    def configure_pipeline(
        self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
    ) -> PipelineConfig:
        # Configure the data processing component.
        self.data_processing_comp = PromptProcessingConfig(
            component_type=PromptProcessing,
            data_reader_config=DataSetConfig(
                HFDataReader,
                {
                    "path": "nlphuji/flickr30k",
                    "split": "test",
                    "transform": SequenceTransform(
                        [
                            AddColumnAndData(
                                column_name="prompt",
                                data="Write a brief caption to summarize the contents of the image.",
                            ),
                            # SamplerTransform(sample_count=200, random_seed=1234),
                        ]
                    ),
                },
            ),
            output_dir=os.path.join(self.log_dir, "data_processing_output"),
        )

        # Configure the inference component.
        self.inference_comp = InferenceConfig(
            component_type=Inference,
            model_config=model_config,
            data_loader_config=DataSetConfig(
                MMDataLoader,
                {"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl")},
            ),
            output_dir=os.path.join(self.log_dir, "inference_result"),
            resume_from=resume_from,
        )

        # Eval data pre-processing component: wrap the model's caption in the
        # judge's scoring prompt.
        self.eval_data_pre_processing = PromptProcessingConfig(
            component_type=PromptProcessing,
            data_reader_config=DataSetConfig(
                DataReader,
                {
                    "path": os.path.join(self.inference_comp.output_dir, "inference_result.jsonl"),
                    "format": ".jsonl",
                    "transform": SequenceTransform([ColumnRename(name_mapping={"model_output": "response"})]),
                },
            ),
            prompt_template_path=os.path.join(
                os.path.dirname(__file__), "../prompt_templates/flickr30k_templates/scoring_prompt.jinja"
            ),
            output_dir=os.path.join(self.log_dir, "eval_data_pre_processing_output"),
        )

        # Eval inference component (LLM scoring).
        self.eval_inference_comp = InferenceConfig(
            component_type=Inference,
            model_config=PERSONAL_GPT4O,
            data_loader_config=DataSetConfig(
                MMDataLoader,
                {
                    "path": os.path.join(self.eval_data_pre_processing.output_dir, "transformed_data.jsonl"),
                    "load_images": False,
                },
            ),
            output_dir=os.path.join(self.log_dir, "eval_inference_result"),
        )

        # Eval reporting component: extract the numeric score and average it.
        self.evalreporting_comp = EvalReportingConfig(
            component_type=EvalReporting,
            data_reader_config=DataSetConfig(
                DataReader,
                {
                    "path": os.path.join(self.eval_inference_comp.output_dir, "inference_result.jsonl"),
                    "format": ".jsonl",
                    "transform": SequenceTransform(
                        [
                            AddColumn(column_name="score"),
                            CopyColumn(column_name_src="model_output", column_name_dst="score"),
                            # Keep the first character after "SCORE: " (scores are
                            # single digits on the 0-5 scale); "-1" marks rows
                            # where the judge emitted no parsable score.
                            MapStringsTransform(
                                columns=["score"],
                                mapping=lambda x: x.split("SCORE: ")[-1][0]
                                if isinstance(x, str) and x.find("SCORE: ") != -1
                                else "-1",
                            ),
                        ]
                    ),
                },
            ),
            aggregator_configs=[
                AggregatorConfig(
                    ValueFilteredAggregator,
                    {
                        "agg_class": AverageAggregator,
                        "value": "-1",
                        "column_names": ["score"],
                        "filename_base": "Flickr30K_Score",
                        "ignore_non_numeric": True,
                    },
                ),
            ],
            output_dir=os.path.join(self.log_dir, "eval_report"),
        )

        return PipelineConfig(
            [
                self.data_processing_comp,
                self.inference_comp,
                self.eval_data_pre_processing,
                self.eval_inference_comp,
                self.evalreporting_comp,
            ],
            self.log_dir,
        )
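
The one nontrivial step above is the MapStringsTransform lambda that extracts the judge's score from free-form output. A standalone sketch of that exact mapping, with invented judge outputs:

# Same logic as the MapStringsTransform mapping above.
def extract_score(x):
    if isinstance(x, str) and x.find("SCORE: ") != -1:
        # Keep only the first character after the marker; scores are single
        # digits on the 0-5 scale, so one character suffices.
        return x.split("SCORE: ")[-1][0]
    return "-1"  # sentinel for unparsable judge output

assert extract_score("The caption is accurate and specific. SCORE: 4") == "4"
assert extract_score("SCORE: 5") == "5"
assert extract_score("I cannot rate this caption.") == "-1"

The "-1" sentinel is what the ValueFilteredAggregator drops before averaging.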
eureka_ml_insights/user_configs/nocaps.py

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
"""This file contains an implementation of the NoCaps eval."""

import os
from typing import Any

from eureka_ml_insights.core import (
    EvalReporting,
    Inference,
    PromptProcessing,
)
from eureka_ml_insights.data_utils import (
    AddColumnAndData,
    AddColumn,
    CopyColumn,
    ColumnRename,
    DataReader,
    HFDataReader,
    MapStringsTransform,
    MMDataLoader,
    SamplerTransform,
    SequenceTransform,
)

from eureka_ml_insights.configs import (
    AggregatorConfig,
    DataSetConfig,
    EvalReportingConfig,
    InferenceConfig,
    ModelConfig,
    PipelineConfig,
    PromptProcessingConfig,
)

from eureka_ml_insights.metrics.reports import AverageAggregator, ValueFilteredAggregator
from eureka_ml_insights.configs import ExperimentConfig

# Judge model; note the alias points at a GPT-4 1106 preview config.
from eureka_ml_insights.configs.model_configs import OAI_GPT4_1106_PREVIEW_CONFIG as PERSONAL_GPT4O


class NOCAPS_PIPELINE(ExperimentConfig):
    def configure_pipeline(
        self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
    ) -> PipelineConfig:
        # Configure the data processing component.
        self.data_processing_comp = PromptProcessingConfig(
            component_type=PromptProcessing,
            data_reader_config=DataSetConfig(
                HFDataReader,
                {
                    "path": "HuggingFaceM4/NoCaps",
                    "split": "validation",
                    "transform": SequenceTransform(
                        [
                            AddColumnAndData(
                                column_name="prompt",
                                data="Write a brief caption to summarize the contents of the image.",
                            ),
                            # SamplerTransform(sample_count=200, random_seed=1234),
                        ]
                    ),
                },
            ),
            output_dir=os.path.join(self.log_dir, "data_processing_output"),
        )

        # Configure the inference component.
        self.inference_comp = InferenceConfig(
            component_type=Inference,
            model_config=model_config,
            data_loader_config=DataSetConfig(
                MMDataLoader,
                {"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl")},
            ),
            output_dir=os.path.join(self.log_dir, "inference_result"),
            resume_from=resume_from,
        )

        # Eval data pre-processing component: wrap the model's caption in the
        # judge's scoring prompt.
        self.eval_data_pre_processing = PromptProcessingConfig(
            component_type=PromptProcessing,
            data_reader_config=DataSetConfig(
                DataReader,
                {
                    "path": os.path.join(self.inference_comp.output_dir, "inference_result.jsonl"),
                    "format": ".jsonl",
                    "transform": SequenceTransform([ColumnRename(name_mapping={"model_output": "response"})]),
                },
            ),
            prompt_template_path=os.path.join(
                os.path.dirname(__file__), "../prompt_templates/nocaps_templates/scoring_prompt.jinja"
            ),
            output_dir=os.path.join(self.log_dir, "eval_data_pre_processing_output"),
        )

        # Eval inference component (LLM scoring).
        self.eval_inference_comp = InferenceConfig(
            component_type=Inference,
            model_config=PERSONAL_GPT4O,
            data_loader_config=DataSetConfig(
                MMDataLoader,
                {
                    "path": os.path.join(self.eval_data_pre_processing.output_dir, "transformed_data.jsonl"),
                    "load_images": False,
                },
            ),
            output_dir=os.path.join(self.log_dir, "eval_inference_result"),
        )

        # Eval reporting component: extract the numeric score and average it.
        self.evalreporting_comp = EvalReportingConfig(
            component_type=EvalReporting,
            data_reader_config=DataSetConfig(
                DataReader,
                {
                    "path": os.path.join(self.eval_inference_comp.output_dir, "inference_result.jsonl"),
                    "format": ".jsonl",
                    "transform": SequenceTransform(
                        [
                            AddColumn(column_name="score"),
                            CopyColumn(column_name_src="model_output", column_name_dst="score"),
                            # Keep the first character after "SCORE: " (scores are
                            # single digits on the 0-5 scale); "-1" marks rows
                            # where the judge emitted no parsable score.
                            MapStringsTransform(
                                columns=["score"],
                                mapping=lambda x: x.split("SCORE: ")[-1][0]
                                if isinstance(x, str) and x.find("SCORE: ") != -1
                                else "-1",
                            ),
                        ]
                    ),
                },
            ),
            aggregator_configs=[
                AggregatorConfig(
                    ValueFilteredAggregator,
                    {
                        "agg_class": AverageAggregator,
                        "value": "-1",
                        "column_names": ["score"],
                        "filename_base": "NoCaps_Score",
                        "ignore_non_numeric": True,
                    },
                ),
            ],
            output_dir=os.path.join(self.log_dir, "eval_report"),
        )

        return PipelineConfig(
            [
                self.data_processing_comp,
                self.inference_comp,
                self.eval_data_pre_processing,
                self.eval_inference_comp,
                self.evalreporting_comp,
            ],
            self.log_dir,
        )
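
NOCAPS_PIPELINE mirrors FLICKR30K_PIPELINE except for the dataset path and split, the caption column, the template path, and the report filename. In both files, aggregation is delegated to ValueFilteredAggregator wrapping AverageAggregator; a plain-Python sketch of the assumed semantics (drop the "-1" sentinel and non-numeric strings, then average), with invented scores:

# Assumed behavior of ValueFilteredAggregator(agg_class=AverageAggregator,
# value="-1", ignore_non_numeric=True); the scores below are invented.
scores = ["4", "5", "-1", "3", "N/A"]
valid = [float(s) for s in scores if s != "-1" and s.isdigit()]
print(sum(valid) / len(valid))  # 4.0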
