
Commit 826990a

refactor: split BRIGHT benchmark into individual subset tasks
1 parent 0f61c9f commit 826990a

5 files changed: +987 −9 lines changed


mteb/benchmarks/benchmarks/__init__.py

Lines changed: 4 additions & 0 deletions

@@ -6,6 +6,8 @@
     BEIR_NL,
     BRIGHT,
     BRIGHT_LONG,
+    BRIGHT_LONG_SUBSETS,
+    BRIGHT_SUBSETS,
     BUILT_MTEB,
     C_MTEB,
     CHEMTEB,
@@ -86,6 +88,8 @@
     "LONG_EMBED",
     "BRIGHT",
     "BRIGHT_LONG",
+    "BRIGHT_SUBSETS",
+    "BRIGHT_LONG_SUBSETS",
     "CODE_RAG",
     "BEIR",
     "NANOBEIR",

mteb/benchmarks/benchmarks/benchmarks.py

Lines changed: 77 additions & 9 deletions

@@ -1165,7 +1165,11 @@
 BRIGHT = Benchmark(
     name="BRIGHT",
     display_name="Reasoning Retrieval",
-    tasks=get_tasks(tasks=["BrightRetrieval"], eval_splits=["standard"]),
+    tasks=get_tasks(
+        tasks=[
+            "BrightRetrieval",
+        ],
+    ),
     description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.
     BRIGHT is the first text retrieval
     benchmark that requires intensive reasoning to retrieve relevant documents with
@@ -1184,14 +1188,48 @@
     """,
 )
 
+BRIGHT_SUBSETS = Benchmark(
+    name="BRIGHT (subsets)",
+    display_name="Reasoning Retrieval (subsets)",
+    tasks=get_tasks(
+        tasks=[
+            "BrightBiologyRetrieval",
+            "BrightEarthScienceRetrieval",
+            "BrightEconomicsRetrieval",
+            "BrightPsychologyRetrieval",
+            "BrightRoboticsRetrieval",
+            "BrightStackoverflowRetrieval",
+            "BrightSustainableLivingRetrieval",
+            "BrightPonyRetrieval",
+            "BrightLeetcodeRetrieval",
+            "BrightAopsRetrieval",
+            "BrightTheoremQATheoremsRetrieval",
+            "BrightTheoremQAQuestionsRetrieval",
+        ],
+    ),
+    description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval (Individual Subsets).
+    This benchmark contains individual subset tasks for each domain in the BRIGHT benchmark,
+    allowing for domain-specific evaluation. The subsets include: biology, earth science, economics,
+    psychology, robotics, stackoverflow, sustainable living, pony, leetcode, aops, theoremqa_theorems,
+    and theoremqa_questions.
+    """,
+    reference="https://brightbenchmark.github.io/",
+    citation=r"""
+@article{su2024bright,
+  author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others},
+  journal = {arXiv preprint arXiv:2407.12883},
+  title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},
+  year = {2024},
+}
+""",
+)
+
 BRIGHT_LONG = Benchmark(
     name="BRIGHT (long)",
-    tasks=MTEBTasks(
-        (
-            get_task(
-                "BrightLongRetrieval",
-            ),
-        )
+    tasks=get_tasks(
+        tasks=[
+            "BrightLongRetrieval",
+        ],
     ),
     description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.
     BRIGHT is the first text retrieval
@@ -1213,6 +1251,37 @@
     """,
 )
 
+BRIGHT_LONG_SUBSETS = Benchmark(
+    name="BRIGHT (long subsets)",
+    display_name="Reasoning Retrieval (long subsets)",
+    tasks=get_tasks(
+        tasks=[
+            "BrightBiologyLongRetrieval",
+            "BrightEarthScienceLongRetrieval",
+            "BrightEconomicsLongRetrieval",
+            "BrightPsychologyLongRetrieval",
+            "BrightRoboticsLongRetrieval",
+            "BrightStackoverflowLongRetrieval",
+            "BrightSustainableLivingLongRetrieval",
+            "BrightPonyLongRetrieval",
+        ],
+    ),
+    description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval (Long Individual Subsets).
+    This benchmark contains individual subset tasks for each domain in the BRIGHT benchmark with long documents,
+    allowing for domain-specific evaluation with longer context. The subsets include: biology, earth science,
+    economics, psychology, robotics, stackoverflow, sustainable living, and pony.
+    """,
+    reference="https://brightbenchmark.github.io/",
+    citation=r"""
+@article{su2024bright,
+  author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others},
+  journal = {arXiv preprint arXiv:2407.12883},
+  title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},
+  year = {2024},
+}
+""",
+)
+
 CODE_RAG = Benchmark(
     name="CodeRAG",
     tasks=get_tasks(
@@ -1603,8 +1672,7 @@
             "TRECCOVID-NL",
         ],
     ),
-    description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated "
-    "translation.",
+    description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated translation.",
     reference="https://arxiv.org/abs/2412.08329",
     contacts=["nikolay-banar"],
     citation=r"""

mteb/tasks/Retrieval/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -42,7 +42,9 @@
 from .eng.BarExamQARetrieval import *
 from .eng.BillSumCARetrieval import *
 from .eng.BillSumUSRetrieval import *
+from .eng.BrightLongSubsetsRetrieval import *
 from .eng.BrightRetrieval import *
+from .eng.BrightSubsetsRetrieval import *
 from .eng.BuiltBenchRetrieval import *
 from .eng.ChatDoctorRetrieval import *
 from .eng.ChemHotpotQARetrieval import *
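
With the new task modules imported here, each BRIGHT domain is also addressable as a standalone task, which is the point of the split: a model can be scored on one domain without running the other eleven. A sketch under the same assumptions as above (placeholder model, standard mteb calls):

# Sketch: run a single BRIGHT domain in isolation. The task name comes from
# the benchmark definition in this commit; the model is a placeholder.
import mteb

tasks = mteb.get_tasks(tasks=["BrightBiologyRetrieval"])
model = mteb.get_model("intfloat/e5-base-v2")
mteb.MTEB(tasks=tasks).run(model, output_folder="results/bright-biology")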
