Skip to content
4 changes: 4 additions & 0 deletions mteb/benchmarks/benchmarks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
BEIR_NL,
BRIGHT,
BRIGHT_LONG,
BRIGHT_SUBSETS,
BRIGHT_SUBSETS_LONG,
BUILT_MTEB,
C_MTEB,
CHEMTEB,
Expand Down Expand Up @@ -62,6 +64,8 @@
"BEIR_NL",
"BRIGHT",
"BRIGHT_LONG",
"BRIGHT_SUBSETS",
"BRIGHT_SUBSETS_LONG",
"BUILT_MTEB",
"CHEMTEB",
"CODE_RAG",
Expand Down
70 changes: 68 additions & 2 deletions mteb/benchmarks/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1198,6 +1198,42 @@
""",
)

# Per-domain decomposition of BRIGHT: one standalone retrieval task per subject
# area, so models can be scored on individual domains rather than the aggregate.
BRIGHT_SUBSETS = Benchmark(
    name="BRIGHT (subsets)",
    display_name="Reasoning Retrieval (subsets)",
    description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval (Individual Subsets).
This benchmark contains individual subset tasks for each domain in the BRIGHT benchmark,
allowing for domain-specific evaluation. The subsets include: biology, earth science, economics,
psychology, robotics, stackoverflow, sustainable living, pony, leetcode, aops, theoremqa_theorems,
and theoremqa_questions.
""",
    reference="https://brightbenchmark.github.io/",
    # Task order mirrors the domain order listed in the description above.
    tasks=get_tasks(
        tasks=[
            "BrightBiologyRetrieval",
            "BrightEarthScienceRetrieval",
            "BrightEconomicsRetrieval",
            "BrightPsychologyRetrieval",
            "BrightRoboticsRetrieval",
            "BrightStackoverflowRetrieval",
            "BrightSustainableLivingRetrieval",
            "BrightPonyRetrieval",
            "BrightLeetcodeRetrieval",
            "BrightAopsRetrieval",
            "BrightTheoremQATheoremsRetrieval",
            "BrightTheoremQAQuestionsRetrieval",
        ],
    ),
    citation=r"""
@article{su2024bright,
author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others},
journal = {arXiv preprint arXiv:2407.12883},
title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},
year = {2024},
}
""",
)

BRIGHT_LONG = Benchmark(
name="BRIGHT (long)",
tasks=MTEBTasks(
Expand Down Expand Up @@ -1227,6 +1263,37 @@
""",
)

# Long-document counterpart of BRIGHT_SUBSETS: one task per domain, restricted
# to the eight domains for which BRIGHT provides long-context corpora.
BRIGHT_SUBSETS_LONG = Benchmark(
    name="BRIGHT (long subsets)",
    display_name="Reasoning Retrieval (long subsets)",
    description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval (Long Individual Subsets).
This benchmark contains individual subset tasks for each domain in the BRIGHT benchmark with long documents,
allowing for domain-specific evaluation with longer context. The subsets include: biology, earth science,
economics, psychology, robotics, stackoverflow, sustainable living, and pony.
""",
    reference="https://brightbenchmark.github.io/",
    # Task order mirrors the domain order listed in the description above.
    tasks=get_tasks(
        tasks=[
            "BrightBiologyLongRetrieval",
            "BrightEarthScienceLongRetrieval",
            "BrightEconomicsLongRetrieval",
            "BrightPsychologyLongRetrieval",
            "BrightRoboticsLongRetrieval",
            "BrightStackoverflowLongRetrieval",
            "BrightSustainableLivingLongRetrieval",
            "BrightPonyLongRetrieval",
        ],
    ),
    citation=r"""
@article{su2024bright,
author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others},
journal = {arXiv preprint arXiv:2407.12883},
title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},
year = {2024},
}
""",
)

CODE_RAG = Benchmark(
name="CodeRAG",
tasks=get_tasks(
Expand Down Expand Up @@ -1619,8 +1686,7 @@
"TRECCOVID-NL",
],
),
description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated "
"translation.",
description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated translation.",
reference="https://arxiv.org/abs/2412.08329",
contacts=["nikolay-banar"],
citation=r"""
Expand Down
44 changes: 44 additions & 0 deletions mteb/tasks/retrieval/eng/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,30 @@
from .blink_it2i_retrieval import BLINKIT2IRetrieval
from .blink_it2t_retrieval import BLINKIT2TRetrieval
from .bright_retrieval import BrightLongRetrieval, BrightRetrieval
from .bright_subset_long_retrieval import (
BrightBiologyLongRetrieval,
BrightEarthScienceLongRetrieval,
BrightEconomicsLongRetrieval,
BrightPonyLongRetrieval,
BrightPsychologyLongRetrieval,
BrightRoboticsLongRetrieval,
BrightStackoverflowLongRetrieval,
BrightSustainableLivingLongRetrieval,
)
from .bright_subsets_retrieval import (
BrightAopsRetrieval,
BrightBiologyRetrieval,
BrightEarthScienceRetrieval,
BrightEconomicsRetrieval,
BrightLeetcodeRetrieval,
BrightPonyRetrieval,
BrightPsychologyRetrieval,
BrightRoboticsRetrieval,
BrightStackoverflowRetrieval,
BrightSustainableLivingRetrieval,
BrightTheoremQAQuestionsRetrieval,
BrightTheoremQATheoremsRetrieval,
)
from .built_bench_retrieval import BuiltBenchRetrieval
from .chat_doctor_retrieval import ChatDoctorRetrieval
from .chem_hotpot_qa_retrieval import ChemHotpotQARetrieval
Expand Down Expand Up @@ -226,8 +250,28 @@
"BarExamQARetrieval",
"BillSumCARetrieval",
"BillSumUSRetrieval",
"BrightAopsRetrieval",
"BrightBiologyLongRetrieval",
"BrightBiologyRetrieval",
"BrightEarthScienceLongRetrieval",
"BrightEarthScienceRetrieval",
"BrightEconomicsLongRetrieval",
"BrightEconomicsRetrieval",
"BrightLeetcodeRetrieval",
"BrightLongRetrieval",
"BrightPonyLongRetrieval",
"BrightPonyRetrieval",
"BrightPsychologyLongRetrieval",
"BrightPsychologyRetrieval",
"BrightRetrieval",
"BrightRoboticsLongRetrieval",
"BrightRoboticsRetrieval",
"BrightStackoverflowLongRetrieval",
"BrightStackoverflowRetrieval",
"BrightSustainableLivingLongRetrieval",
"BrightSustainableLivingRetrieval",
"BrightTheoremQAQuestionsRetrieval",
"BrightTheoremQATheoremsRetrieval",
"BuiltBenchRetrieval",
"CIRRIT2IRetrieval",
"CQADupstackAndroidRetrieval",
Expand Down
Loading
Loading