
Commit f948f57

michaelharrisonmai and Michael Harrison authored
adding vision caption evals (#160)
Adding Flickr30K and NoCaps evals. Each of these datasets contains images and 5-10 sample captions per image. The eval asks an LLM judge to score a new caption from 0 to 5, given the sample captions.

Co-authored-by: Michael Harrison <[email protected]>
1 parent 93e2f85 commit f948f57

File tree

eureka_ml_insights/prompt_templates/flickr30k_templates/scoring_prompt.jinja
eureka_ml_insights/prompt_templates/nocaps_templates/scoring_prompt.jinja
eureka_ml_insights/user_configs/__init__.py
eureka_ml_insights/user_configs/flickr30k.py
eureka_ml_insights/user_configs/nocaps.py

5 files changed: +302 −0 lines changed
eureka_ml_insights/prompt_templates/flickr30k_templates/scoring_prompt.jinja

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
Your task is to evaluate a student's ability to generate a high quality caption for an image. You are not provided with the image itself, but you are given example captions which were deemed to accurately describe the image. With this information, rate the student's caption on a scale of 0 to 5. You may think about how to score the student, then write your final score in the form SCORE: <your score here>

EXAMPLE CAPTIONS: {{ caption }}

STUDENT CAPTION: {{ response }}
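
For illustration, a minimal sketch of how this template gets rendered, assuming the standard jinja2 API; the caption list and student response below are invented, and the template string is abbreviated to the two placeholder lines:

# Abbreviated template: only the placeholder lines from the file above.
from jinja2 import Template

template = Template("EXAMPLE CAPTIONS: {{ caption }}\n\nSTUDENT CAPTION: {{ response }}")

# Invented example values; in the pipeline, "caption" holds the dataset's
# reference captions and "response" holds the evaluated model's caption.
print(template.render(
    caption=["Two dogs run across a grassy field.", "A pair of dogs playing outdoors."],
    response="Two dogs chase each other on the grass.",
))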
eureka_ml_insights/prompt_templates/nocaps_templates/scoring_prompt.jinja

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
Your task is to evaluate a student's ability to generate a high quality caption for an image. You are not provided with the image itself, but you are given example captions which were deemed to accurately describe the image. With this information, rate the student's caption on a scale of 0 to 5. You may think about how to score the student, then write your final score in the form SCORE: <your score here>

EXAMPLE CAPTIONS: {{ annotations_captions }}

STUDENT CAPTION: {{ response }}

This template is identical to the Flickr30K one except that the example captions come from the NoCaps annotations_captions column.

eureka_ml_insights/user_configs/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -13,6 +13,7 @@
 from .dna import DNA_PIPELINE
 from .drop import Drop_Experiment_Pipeline
 from .flenqa import FlenQA_Experiment_Pipeline
+from .flickr30k import FLICKR30K_PIPELINE
 from .geometer import GEOMETER_PIPELINE
 from .gpqa import GPQA_Experiment_Pipeline, GPQA_PIPELINE_5Run
 from .gsm8k import GSM8K_MUTATED_PIPELINE, GSM8K_PIPELINE, GSMSYMBOLIC_PIPELINE
@@ -62,6 +63,7 @@
 from .mathvision import MATHVISION_PIPELINE
 from .mathvista import MATHVISTA_PIPELINE
 from .mmmu import MMMU_BASELINE_PIPELINE
+from .nocaps import NOCAPS_PIPELINE
 from .nondeterminism import (
     Geo_Nondeterminism,
     IFEval_Nondeterminism,
@@ -169,4 +171,6 @@
     NPHARD_SAT_PIPELINE,
     NPHARD_SAT_PIPELINE_MULTIPLE_RUNS,
     NPHARD_SAT_HYBRIDEXTRACT_PIPELINE,
+    FLICKR30K_PIPELINE,
+    NOCAPS_PIPELINE,
 ]
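
After this registration, both pipeline configs can be imported from the package (names taken directly from the diff above); how the framework's entry point then consumes them is not shown in this commit:

# Both pipelines are now exposed at the package level.
from eureka_ml_insights.user_configs import FLICKR30K_PIPELINE, NOCAPS_PIPELINE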
eureka_ml_insights/user_configs/flickr30k.py

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
"""This file contains an implementation of the flickr30k eval."""

import os
from typing import Any

from eureka_ml_insights.core import (
    EvalReporting,
    Inference,
    PromptProcessing,
)
from eureka_ml_insights.data_utils import (
    AddColumnAndData,
    AddColumn,
    CopyColumn,
    ColumnRename,
    DataReader,
    HFDataReader,
    MapStringsTransform,
    MMDataLoader,
    SamplerTransform,
    SequenceTransform,
)

from eureka_ml_insights.configs import (
    AggregatorConfig,
    DataSetConfig,
    EvalReportingConfig,
    InferenceConfig,
    ModelConfig,
    PipelineConfig,
    PromptProcessingConfig,
)

from eureka_ml_insights.metrics.reports import AverageAggregator, ValueFilteredAggregator
from eureka_ml_insights.configs import ExperimentConfig

# Judge model; note the alias points at a GPT-4 1106 preview config.
from eureka_ml_insights.configs.model_configs import OAI_GPT4_1106_PREVIEW_CONFIG as PERSONAL_GPT4O


class FLICKR30K_PIPELINE(ExperimentConfig):
    def configure_pipeline(
        self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
    ) -> PipelineConfig:
        # Configure the data processing component.
        self.data_processing_comp = PromptProcessingConfig(
            component_type=PromptProcessing,
            data_reader_config=DataSetConfig(
                HFDataReader,
                {
                    "path": "nlphuji/flickr30k",
                    "split": "test",
                    "transform": SequenceTransform(
                        [
                            AddColumnAndData(
                                column_name="prompt",
                                data="Write a brief caption to summarize the contents of the image.",
                            ),
                            # SamplerTransform(sample_count=200, random_seed=1234),
                        ]
                    ),
                },
            ),
            output_dir=os.path.join(self.log_dir, "data_processing_output"),
        )

        # Configure the inference component.
        self.inference_comp = InferenceConfig(
            component_type=Inference,
            model_config=model_config,
            data_loader_config=DataSetConfig(
                MMDataLoader,
                {"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl")},
            ),
            output_dir=os.path.join(self.log_dir, "inference_result"),
            resume_from=resume_from,
        )

        # Eval data pre-processing component: wrap the model's caption in the
        # judge's scoring prompt.
        self.eval_data_pre_processing = PromptProcessingConfig(
            component_type=PromptProcessing,
            data_reader_config=DataSetConfig(
                DataReader,
                {
                    "path": os.path.join(self.inference_comp.output_dir, "inference_result.jsonl"),
                    "format": ".jsonl",
                    "transform": SequenceTransform([ColumnRename(name_mapping={"model_output": "response"})]),
                },
            ),
            prompt_template_path=os.path.join(
                os.path.dirname(__file__), "../prompt_templates/flickr30k_templates/scoring_prompt.jinja"
            ),
            output_dir=os.path.join(self.log_dir, "eval_data_pre_processing_output"),
        )

        # Eval inference component (LLM scoring).
        self.eval_inference_comp = InferenceConfig(
            component_type=Inference,
            model_config=PERSONAL_GPT4O,
            data_loader_config=DataSetConfig(
                MMDataLoader,
                {
                    "path": os.path.join(self.eval_data_pre_processing.output_dir, "transformed_data.jsonl"),
                    "load_images": False,
                },
            ),
            output_dir=os.path.join(self.log_dir, "eval_inference_result"),
        )

        # Eval reporting component: extract the numeric score and average it.
        self.evalreporting_comp = EvalReportingConfig(
            component_type=EvalReporting,
            data_reader_config=DataSetConfig(
                DataReader,
                {
                    "path": os.path.join(self.eval_inference_comp.output_dir, "inference_result.jsonl"),
                    "format": ".jsonl",
                    "transform": SequenceTransform(
                        [
                            AddColumn(column_name="score"),
                            CopyColumn(column_name_src="model_output", column_name_dst="score"),
                            # Keep the first character after "SCORE: " (scores are
                            # single digits on the 0-5 scale); "-1" marks rows
                            # where the judge emitted no parsable score.
                            MapStringsTransform(
                                columns=["score"],
                                mapping=lambda x: x.split("SCORE: ")[-1][0]
                                if isinstance(x, str) and x.find("SCORE: ") != -1
                                else "-1",
                            ),
                        ]
                    ),
                },
            ),
            aggregator_configs=[
                AggregatorConfig(
                    ValueFilteredAggregator,
                    {
                        "agg_class": AverageAggregator,
                        "value": "-1",
                        "column_names": ["score"],
                        "filename_base": "Flickr30K_Score",
                        "ignore_non_numeric": True,
                    },
                ),
            ],
            output_dir=os.path.join(self.log_dir, "eval_report"),
        )

        return PipelineConfig(
            [
                self.data_processing_comp,
                self.inference_comp,
                self.eval_data_pre_processing,
                self.eval_inference_comp,
                self.evalreporting_comp,
            ],
            self.log_dir,
        )
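
The one nontrivial step above is the MapStringsTransform lambda that extracts the judge's score from free-form output. A standalone sketch of that exact mapping, with invented judge outputs:

# Same logic as the MapStringsTransform mapping above.
def extract_score(x):
    if isinstance(x, str) and x.find("SCORE: ") != -1:
        # Keep only the first character after the marker; scores are single
        # digits on the 0-5 scale, so one character suffices.
        return x.split("SCORE: ")[-1][0]
    return "-1"  # sentinel for unparsable judge output

assert extract_score("The caption is accurate and specific. SCORE: 4") == "4"
assert extract_score("SCORE: 5") == "5"
assert extract_score("I cannot rate this caption.") == "-1"

The "-1" sentinel is what the ValueFilteredAggregator drops before averaging.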
eureka_ml_insights/user_configs/nocaps.py

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
"""This file contains an implementation of the NoCaps eval."""

import os
from typing import Any

from eureka_ml_insights.core import (
    EvalReporting,
    Inference,
    PromptProcessing,
)
from eureka_ml_insights.data_utils import (
    AddColumnAndData,
    AddColumn,
    CopyColumn,
    ColumnRename,
    DataReader,
    HFDataReader,
    MapStringsTransform,
    MMDataLoader,
    SamplerTransform,
    SequenceTransform,
)

from eureka_ml_insights.configs import (
    AggregatorConfig,
    DataSetConfig,
    EvalReportingConfig,
    InferenceConfig,
    ModelConfig,
    PipelineConfig,
    PromptProcessingConfig,
)

from eureka_ml_insights.metrics.reports import AverageAggregator, ValueFilteredAggregator
from eureka_ml_insights.configs import ExperimentConfig

# Judge model; note the alias points at a GPT-4 1106 preview config.
from eureka_ml_insights.configs.model_configs import OAI_GPT4_1106_PREVIEW_CONFIG as PERSONAL_GPT4O


class NOCAPS_PIPELINE(ExperimentConfig):
    def configure_pipeline(
        self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
    ) -> PipelineConfig:
        # Configure the data processing component.
        self.data_processing_comp = PromptProcessingConfig(
            component_type=PromptProcessing,
            data_reader_config=DataSetConfig(
                HFDataReader,
                {
                    "path": "HuggingFaceM4/NoCaps",
                    "split": "validation",
                    "transform": SequenceTransform(
                        [
                            AddColumnAndData(
                                column_name="prompt",
                                data="Write a brief caption to summarize the contents of the image.",
                            ),
                            # SamplerTransform(sample_count=200, random_seed=1234),
                        ]
                    ),
                },
            ),
            output_dir=os.path.join(self.log_dir, "data_processing_output"),
        )

        # Configure the inference component.
        self.inference_comp = InferenceConfig(
            component_type=Inference,
            model_config=model_config,
            data_loader_config=DataSetConfig(
                MMDataLoader,
                {"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl")},
            ),
            output_dir=os.path.join(self.log_dir, "inference_result"),
            resume_from=resume_from,
        )

        # Eval data pre-processing component: wrap the model's caption in the
        # judge's scoring prompt.
        self.eval_data_pre_processing = PromptProcessingConfig(
            component_type=PromptProcessing,
            data_reader_config=DataSetConfig(
                DataReader,
                {
                    "path": os.path.join(self.inference_comp.output_dir, "inference_result.jsonl"),
                    "format": ".jsonl",
                    "transform": SequenceTransform([ColumnRename(name_mapping={"model_output": "response"})]),
                },
            ),
            prompt_template_path=os.path.join(
                os.path.dirname(__file__), "../prompt_templates/nocaps_templates/scoring_prompt.jinja"
            ),
            output_dir=os.path.join(self.log_dir, "eval_data_pre_processing_output"),
        )

        # Eval inference component (LLM scoring).
        self.eval_inference_comp = InferenceConfig(
            component_type=Inference,
            model_config=PERSONAL_GPT4O,
            data_loader_config=DataSetConfig(
                MMDataLoader,
                {
                    "path": os.path.join(self.eval_data_pre_processing.output_dir, "transformed_data.jsonl"),
                    "load_images": False,
                },
            ),
            output_dir=os.path.join(self.log_dir, "eval_inference_result"),
        )

        # Eval reporting component: extract the numeric score and average it.
        self.evalreporting_comp = EvalReportingConfig(
            component_type=EvalReporting,
            data_reader_config=DataSetConfig(
                DataReader,
                {
                    "path": os.path.join(self.eval_inference_comp.output_dir, "inference_result.jsonl"),
                    "format": ".jsonl",
                    "transform": SequenceTransform(
                        [
                            AddColumn(column_name="score"),
                            CopyColumn(column_name_src="model_output", column_name_dst="score"),
                            # Keep the first character after "SCORE: " (scores are
                            # single digits on the 0-5 scale); "-1" marks rows
                            # where the judge emitted no parsable score.
                            MapStringsTransform(
                                columns=["score"],
                                mapping=lambda x: x.split("SCORE: ")[-1][0]
                                if isinstance(x, str) and x.find("SCORE: ") != -1
                                else "-1",
                            ),
                        ]
                    ),
                },
            ),
            aggregator_configs=[
                AggregatorConfig(
                    ValueFilteredAggregator,
                    {
                        "agg_class": AverageAggregator,
                        "value": "-1",
                        "column_names": ["score"],
                        "filename_base": "NoCaps_Score",
                        "ignore_non_numeric": True,
                    },
                ),
            ],
            output_dir=os.path.join(self.log_dir, "eval_report"),
        )

        return PipelineConfig(
            [
                self.data_processing_comp,
                self.inference_comp,
                self.eval_data_pre_processing,
                self.eval_inference_comp,
                self.evalreporting_comp,
            ],
            self.log_dir,
        )
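
NOCAPS_PIPELINE mirrors FLICKR30K_PIPELINE except for the dataset path and split, the caption column, the template path, and the report filename. In both files, aggregation is delegated to ValueFilteredAggregator wrapping AverageAggregator; a plain-Python sketch of the assumed semantics (drop the "-1" sentinel and non-numeric strings, then average), with invented scores:

# Assumed behavior of ValueFilteredAggregator(agg_class=AverageAggregator,
# value="-1", ignore_non_numeric=True); the scores below are invented.
scores = ["4", "5", "-1", "3", "N/A"]
valid = [float(s) for s in scores if s != "-1" and s.isdigit()]
print(sum(valid) / len(valid))  # 4.0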
