2 changes: 2 additions & 0 deletions mteb/benchmarks/benchmarks/__init__.py
@@ -14,6 +14,7 @@
FA_MTEB,
FA_MTEB_2,
JINA_VDR,
REAL_MM_RAG,
LONG_EMBED,
MIEB_ENG,
MIEB_IMG,
@@ -112,4 +113,5 @@
"RTEB_ENGLISH",
"RTEB_FRENCH",
"RTEB_GERMAN",
"REAL_MM_RAG",
]
23 changes: 23 additions & 0 deletions mteb/benchmarks/benchmarks/benchmarks.py
@@ -2295,3 +2295,26 @@
year = {2025},
}""",
)

REAL_MM_RAG = Benchmark(
name="REAL_MM_RAG",
display_name="IBM Visual Document Retrieval",
tasks=get_tasks(
tasks=[
"RealMMRagFinReportRetrieval",
"RealMMRagFinSlidesRetrieval",
"RealMMRagTechReportRetrieval",
"RealMMRagTechSlidesRetrieval",
],
),
description="Realistic and multi-modal document retrieval benchmark.",
Contributor: This description is too short. Why should I prefer this over another VDR benchmark?
reference="https://arxiv.org/abs/2502.12342",
citation=r"""
@article{wasserman2025real,
title={REAL-MM-RAG: A Real-World Multi-Modal Retrieval Benchmark},
author={Wasserman, Navve and Pony, Roi and Naparstek, Oshri and Goldfarb, Adi Raz and Schwartz, Eli and Barzelay, Udi and Karlinsky, Leonid},
journal={arXiv preprint arXiv:2502.12342},
year={2025}
}
""",
)
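
Once merged, the new benchmark should be selectable by name through mteb's public API. A minimal sketch (nothing here beyond what the Benchmark definition above registers):

import mteb

# Look up the newly registered benchmark and list its tasks.
benchmark = mteb.get_benchmark("REAL_MM_RAG")
print([task.metadata.name for task in benchmark.tasks])
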
1 change: 1 addition & 0 deletions mteb/tasks/Image/Any2AnyRetrieval/__init__.py
@@ -46,6 +46,7 @@
from .eng.VQA2IT2TRetrieval import *
from .eng.WebQAT2ITRetrieval import *
from .eng.WebQAT2TRetrieval import *
from .eng.RealMMRagBenchRetrieval import *
from .multilingual.JinaVDRBenchRetrieval import *
from .multilingual.MIRACLVisionRetrieval import *
from .multilingual.VdrMultilingualRetrieval import *
303 changes: 303 additions & 0 deletions mteb/tasks/Image/Any2AnyRetrieval/eng/RealMMRagBenchRetrieval.py
Contributor: @Samoed, should we snake-case the filename? (Easier to merge with v2.)

Member: Agreed, that would be better.

Member: Since the v2 release, you need to move your tasks into the retrieval/eng/ folder.

@@ -0,0 +1,303 @@
from __future__ import annotations

from datasets import load_dataset

from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
from mteb.abstasks.TaskMetadata import TaskMetadata
Member: This also would be:

Suggested change:
-from __future__ import annotations
-from datasets import load_dataset
-from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
-from mteb.abstasks.TaskMetadata import TaskMetadata
+from datasets import load_dataset
+from mteb.abstasks.retrieval import AbsTaskRetrieval
+from mteb.abstasks.task_metadata import TaskMetadata



def _load_data(
Member: You can reupload your tasks using task.push_dataset_to_hub() to use our format.

path: str,
splits: list[str],
cache_dir: str | None = None,
revision: str | None = None,
text_col: str = "query",
):
corpus = {}
queries = {}
relevant_docs = {}


for split in splits:
query_ds = load_dataset(
path,
"queries",
split=split,
cache_dir=cache_dir,
revision=revision,
)
query_ds = query_ds.map(
lambda x: {
"id": f"query-{split}-{x['query-id']}",
"text": x[text_col],
"image": None,
"modality": "text",
Member: You shouldn't add columns filled with None, and you don't need the modality column.

},
remove_columns=["query-id", "query"],
)
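
Following the reviewer's note above, the query mapping could drop the None-filled image column and the modality column entirely. A hypothetical helper illustrating that shape (the helper name is an assumption, not part of this PR):

from datasets import Dataset

def format_queries(query_ds: Dataset, split: str, text_col: str = "query") -> Dataset:
    # Keep only the id and the query text; no placeholder columns.
    return query_ds.map(
        lambda x: {"id": f"query-{split}-{x['query-id']}", "text": x[text_col]},
        remove_columns=["query-id", "query"],
    )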

corpus_ds = load_dataset(
path,
"corpus",
split=split,
cache_dir=cache_dir,
revision=revision,
)
corpus_ds = corpus_ds.map(
lambda x: {
"id": f"corpus-{split}-{x['corpus-id']}",
"text": None,
"modality": "image",
},
remove_columns=["corpus-id"],
)

qrels_ds = load_dataset(
path,
"qrels",
split=split,
cache_dir=cache_dir,
revision=revision,
)

queries[split] = query_ds
corpus[split] = corpus_ds
relevant_docs[split] = {}
for row in qrels_ds:
qid = f"query-{split}-{row['query-id']}"
did = f"corpus-{split}-{row['corpus-id']}"
if qid not in relevant_docs[split]:
relevant_docs[split][qid] = {}
relevant_docs[split][qid][did] = int(row["score"])

return corpus, queries, relevant_docs
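
As the reviewer notes above, the dataset can also be reuploaded in the repo's canonical format instead of using a custom loader. A minimal sketch, assuming push_dataset_to_hub() accepts a target repository id (check the AbsTask docs for the exact signature; the repo id below is hypothetical):

import mteb

task = mteb.get_task("RealMMRagFinReportRetrieval")
task.load_data()
task.push_dataset_to_hub("your-org/REAL-MM-RAG_FinReport")  # hypothetical target repo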


class RealMMRagFinReportRetrieval(AbsTaskAny2AnyRetrieval):
metadata = TaskMetadata(
name="RealMMRagFinReportRetrieval",
description="Retrieve associated pages according to questions.",
Contributor: This description is too vague: it should be clear from the description what queries and corpus it contains, as well as the retrieval goal. Please fix this for all tasks.

reference="https://arxiv.org/abs/2502.12342",
dataset={
"path": "ibm-research/REAL-MM-RAG_FinReport_BEIR",
"revision": "e66ef8cc883d823483db7b5b71065eb7c1dae12c",
},
type="DocumentUnderstanding",
category="t2i",
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_5",
date=("2025-01-01", "2025-07-01"),
domains=["Academic"],
task_subtypes=["Image Text Retrieval"],
license="cdla-sharing-1.0",
annotations_creators="derived",
dialect=[],
modalities=["text", "image"],
sample_creation="found",
bibtex_citation=r"""
@article{wasserman2025real,
title={REAL-MM-RAG: A Real-World Multi-Modal Retrieval Benchmark},
author={Wasserman, Navve and Pony, Roi and Naparstek, Oshri and Goldfarb, Adi Raz and Schwartz, Eli and Barzelay, Udi and Karlinsky, Leonid},
journal={arXiv preprint arXiv:2502.12342},
year={2025}
}
""",
prompt={"query": "Find a screenshot that is relevant to the user's question."},
descriptive_stats={
"n_samples": None,
"avg_character_length": {
"test": {
"average_document_length": 141.5,
"num_documents": 19,
"num_queries": 853,
"average_relevant_docs_per_query": 1.0,
}
},
},
Member: We don't have descriptive_stats in task metadata. You need to use task.calculate_descriptive_statistics().
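
A minimal sketch of that workflow, assuming the method name quoted in the review comment:

import mteb

# Compute the stats at evaluation time instead of hard-coding them in TaskMetadata.
task = mteb.get_task("RealMMRagFinReportRetrieval")
stats = task.calculate_descriptive_statistics()
print(stats)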

Member: Suggested change:
-prompt={"query": "Find a screenshot that is relevant to the user's question."},
-descriptive_stats={
-"n_samples": None,
-"avg_character_length": {
-"test": {
-"average_document_length": 141.5,
-"num_documents": 19,
-"num_queries": 853,
-"average_relevant_docs_per_query": 1.0,
-}
-},
-},
+prompt={"query": "Find a screenshot that is relevant to the user's question."},

)

def load_data(self, **kwargs):
if self.data_loaded:
return

self.corpus, self.queries, self.relevant_docs = _load_data(
path=self.metadata_dict["dataset"]["path"],
splits=self.metadata_dict["eval_splits"],
cache_dir=kwargs.get("cache_dir", None),
revision=self.metadata_dict["dataset"]["revision"],
)

self.data_loaded = True


class RealMMRagFinSlidesRetrieval(AbsTaskAny2AnyRetrieval):
metadata = TaskMetadata(
name="RealMMRagFinSlidesRetrieval",
description="Retrieve associated pages according to questions.",
reference="https://arxiv.org/abs/2502.12342",
dataset={
"path": "ibm-research/REAL-MM-RAG_FinSlides_BEIR",
"revision": "main",
},
type="DocumentUnderstanding",
category="t2i",
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_5",
date=("2025-01-01", "2025-07-01"),
domains=["Academic"],
task_subtypes=["Image Text Retrieval"],
license="cdla-sharing-1.0",
annotations_creators="derived",
dialect=[],
modalities=["text", "image"],
sample_creation="found",
bibtex_citation=r"""
@article{wasserman2025real,
title={REAL-MM-RAG: A Real-World Multi-Modal Retrieval Benchmark},
author={Wasserman, Navve and Pony, Roi and Naparstek, Oshri and Goldfarb, Adi Raz and Schwartz, Eli and Barzelay, Udi and Karlinsky, Leonid},
journal={arXiv preprint arXiv:2502.12342},
year={2025}
}
""",
prompt={"query": "Find a screenshot that is relevant to the user's question."},
descriptive_stats={
"n_samples": None,
"avg_character_length": {
"test": {
"average_document_length": 35,
"num_documents": 65,
"num_queries": 1052,
"average_relevant_docs_per_query": 1.0,
}
},
},
)

def load_data(self, **kwargs):
if self.data_loaded:
return

self.corpus, self.queries, self.relevant_docs = _load_data(
path=self.metadata_dict["dataset"]["path"],
splits=self.metadata_dict["eval_splits"],
cache_dir=kwargs.get("cache_dir", None),
revision=self.metadata_dict["dataset"]["revision"],
)
self.data_loaded = True


class RealMMRagTechReportRetrieval(AbsTaskAny2AnyRetrieval):
metadata = TaskMetadata(
name="RealMMRagTechReportRetrieval",
description="Retrieve associated pages according to questions.",
reference="https://arxiv.org/abs/2502.12342",
dataset={
"path": "ibm-research/REAL-MM-RAG_TechReport_BEIR",
"revision": "main",
},
type="DocumentUnderstanding",
category="t2i",
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_5",
date=("2025-01-01", "2025-07-01"),
domains=["Academic"],
task_subtypes=["Image Text Retrieval"],
license="cdla-sharing-1.0",
annotations_creators="derived",
dialect=[],
modalities=["text", "image"],
sample_creation="found",
bibtex_citation=r"""
@article{wasserman2025real,
title={REAL-MM-RAG: A Real-World Multi-Modal Retrieval Benchmark},
author={Wasserman, Navve and Pony, Roi and Naparstek, Oshri and Goldfarb, Adi Raz and Schwartz, Eli and Barzelay, Udi and Karlinsky, Leonid},
journal={arXiv preprint arXiv:2502.12342},
year={2025}
}
""",
prompt={"query": "Find a screenshot that is relevant to the user's question."},
descriptive_stats={
"n_samples": None,
"avg_character_length": {
"test": {
"average_document_length": 98.5,
"num_documents": 17,
"num_queries": 1294,
"average_relevant_docs_per_query": 1.0,
}
},
},
)

def load_data(self, **kwargs):
if self.data_loaded:
return

self.corpus, self.queries, self.relevant_docs = _load_data(
path=self.metadata_dict["dataset"]["path"],
splits=self.metadata_dict["eval_splits"],
cache_dir=kwargs.get("cache_dir", None),
revision=self.metadata_dict["dataset"]["revision"],
)
self.data_loaded = True


class RealMMRagTechSlidesRetrieval(AbsTaskAny2AnyRetrieval):
metadata = TaskMetadata(
name="RealMMRagTechSlidesRetrieval",
description="Retrieve associated pages according to questions.",
reference="https://arxiv.org/abs/2502.12342",
dataset={
"path": "ibm-research/REAL-MM-RAG_TechSlides_BEIR",
"revision": "main",
},
type="DocumentUnderstanding",
category="t2i",
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_5",
date=("2025-01-01", "2025-07-01"),
domains=["Academic"],
task_subtypes=["Image Text Retrieval"],
license="cdla-sharing-1.0",
annotations_creators="derived",
dialect=[],
modalities=["text", "image"],
sample_creation="found",
bibtex_citation=r"""
@article{wasserman2025real,
title={REAL-MM-RAG: A Real-World Multi-Modal Retrieval Benchmark},
author={Wasserman, Navve and Pony, Roi and Naparstek, Oshri and Goldfarb, Adi Raz and Schwartz, Eli and Barzelay, Udi and Karlinsky, Leonid},
journal={arXiv preprint arXiv:2502.12342},
year={2025}
}
""",
prompt={"query": "Find a screenshot that is relevant to the user's question."},
descriptive_stats={
"n_samples": None,
"avg_character_length": {
"test": {
"average_document_length": 31.7,
"num_documents": 62,
"num_queries": 1354,
"average_relevant_docs_per_query": 1.0,
}
},
},
)

def load_data(self, **kwargs):
if self.data_loaded:
return

self.corpus, self.queries, self.relevant_docs = _load_data(
path=self.metadata_dict["dataset"]["path"],
splits=self.metadata_dict["eval_splits"],
cache_dir=kwargs.get("cache_dir", None),
revision=self.metadata_dict["dataset"]["revision"],
)
self.data_loaded = True
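
With the tasks registered, any of them can be run like other mteb image-retrieval tasks. A hedged usage sketch; the model name below is illustrative only, and any text+image embedding model registered in mteb would do:

import mteb

tasks = mteb.get_tasks(tasks=["RealMMRagFinReportRetrieval"])
model = mteb.get_model("openai/clip-vit-base-patch32")  # illustrative model choice
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model)
print(results)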

