Skip to content

Commit 3ed620f

Browse files
Committed: refactor: split BRIGHT benchmark into individual subset tasks
1 parent 0f61c9f commit 3ed620f

File tree

5 files changed

+978
-2
lines changed

5 files changed

+978
-2
lines changed

mteb/benchmarks/benchmarks/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
BEIR_NL,
77
BRIGHT,
88
BRIGHT_LONG,
9+
BRIGHT_LONG_SUBSETS,
10+
BRIGHT_SUBSETS,
911
BUILT_MTEB,
1012
C_MTEB,
1113
CHEMTEB,
@@ -86,6 +88,8 @@
8688
"LONG_EMBED",
8789
"BRIGHT",
8890
"BRIGHT_LONG",
91+
"BRIGHT_SUBSETS",
92+
"BRIGHT_LONG_SUBSETS",
8993
"CODE_RAG",
9094
"BEIR",
9195
"NANOBEIR",

mteb/benchmarks/benchmarks/benchmarks.py

Lines changed: 68 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1184,6 +1184,42 @@
11841184
""",
11851185
)
11861186

1187+
BRIGHT_SUBSETS = Benchmark(
1188+
name="BRIGHT (subsets)",
1189+
display_name="Reasoning Retrieval (subsets)",
1190+
tasks=get_tasks(
1191+
tasks=[
1192+
"BrightBiologyRetrieval",
1193+
"BrightEarthScienceRetrieval",
1194+
"BrightEconomicsRetrieval",
1195+
"BrightPsychologyRetrieval",
1196+
"BrightRoboticsRetrieval",
1197+
"BrightStackoverflowRetrieval",
1198+
"BrightSustainableLivingRetrieval",
1199+
"BrightPonyRetrieval",
1200+
"BrightLeetcodeRetrieval",
1201+
"BrightAopsRetrieval",
1202+
"BrightTheoremQATheoremsRetrieval",
1203+
"BrightTheoremQAQuestionsRetrieval",
1204+
],
1205+
),
1206+
description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval (Individual Subsets).
1207+
This benchmark contains individual subset tasks for each domain in the BRIGHT benchmark,
1208+
allowing for domain-specific evaluation. The subsets include: biology, earth science, economics,
1209+
psychology, robotics, stackoverflow, sustainable living, pony, leetcode, aops, theoremqa_theorems,
1210+
and theoremqa_questions.
1211+
""",
1212+
reference="https://brightbenchmark.github.io/",
1213+
citation=r"""
1214+
@article{su2024bright,
1215+
author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others},
1216+
journal = {arXiv preprint arXiv:2407.12883},
1217+
title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},
1218+
year = {2024},
1219+
}
1220+
""",
1221+
)
1222+
11871223
BRIGHT_LONG = Benchmark(
11881224
name="BRIGHT (long)",
11891225
tasks=MTEBTasks(
@@ -1213,6 +1249,37 @@
12131249
""",
12141250
)
12151251

1252+
BRIGHT_LONG_SUBSETS = Benchmark(
1253+
name="BRIGHT (long subsets)",
1254+
display_name="Reasoning Retrieval (long subsets)",
1255+
tasks=get_tasks(
1256+
tasks=[
1257+
"BrightBiologyLongRetrieval",
1258+
"BrightEarthScienceLongRetrieval",
1259+
"BrightEconomicsLongRetrieval",
1260+
"BrightPsychologyLongRetrieval",
1261+
"BrightRoboticsLongRetrieval",
1262+
"BrightStackoverflowLongRetrieval",
1263+
"BrightSustainableLivingLongRetrieval",
1264+
"BrightPonyLongRetrieval",
1265+
],
1266+
),
1267+
description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval (Long Individual Subsets).
1268+
This benchmark contains individual subset tasks for each domain in the BRIGHT benchmark with long documents,
1269+
allowing for domain-specific evaluation with longer context. The subsets include: biology, earth science,
1270+
economics, psychology, robotics, stackoverflow, sustainable living, and pony.
1271+
""",
1272+
reference="https://brightbenchmark.github.io/",
1273+
citation=r"""
1274+
@article{su2024bright,
1275+
author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others},
1276+
journal = {arXiv preprint arXiv:2407.12883},
1277+
title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},
1278+
year = {2024},
1279+
}
1280+
""",
1281+
)
1282+
12161283
CODE_RAG = Benchmark(
12171284
name="CodeRAG",
12181285
tasks=get_tasks(
@@ -1603,8 +1670,7 @@
16031670
"TRECCOVID-NL",
16041671
],
16051672
),
1606-
description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated "
1607-
"translation.",
1673+
description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated translation.",
16081674
reference="https://arxiv.org/abs/2412.08329",
16091675
contacts=["nikolay-banar"],
16101676
citation=r"""

mteb/tasks/Retrieval/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@
4242
from .eng.BarExamQARetrieval import *
4343
from .eng.BillSumCARetrieval import *
4444
from .eng.BillSumUSRetrieval import *
45+
from .eng.BrightLongSubsetsRetrieval import *
4546
from .eng.BrightRetrieval import *
47+
from .eng.BrightSubsetsRetrieval import *
4648
from .eng.BuiltBenchRetrieval import *
4749
from .eng.ChatDoctorRetrieval import *
4850
from .eng.ChemHotpotQARetrieval import *

0 commit comments

Comments (0)