|
1165 | 1165 | BRIGHT = Benchmark( |
1166 | 1166 | name="BRIGHT", |
1167 | 1167 | display_name="Reasoning Retrieval", |
1168 | | - tasks=get_tasks(tasks=["BrightRetrieval"], eval_splits=["standard"]), |
| 1168 | + tasks=get_tasks( |
| 1169 | + tasks=[ |
| 1170 | + "BrightRetrieval", |
| 1171 | + ], |
| 1172 | + ), |
1169 | 1173 | description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval. |
1170 | 1174 | BRIGHT is the first text retrieval |
1171 | 1175 | benchmark that requires intensive reasoning to retrieve relevant documents with |
|
1184 | 1188 | """, |
1185 | 1189 | ) |
1186 | 1190 |
|
| 1191 | +BRIGHT_SUBSETS = Benchmark( |
| 1192 | + name="BRIGHT (subsets)", |
| 1193 | + display_name="Reasoning Retrieval (subsets)", |
| 1194 | + tasks=get_tasks( |
| 1195 | + tasks=[ |
| 1196 | + "BrightBiologyRetrieval", |
| 1197 | + "BrightEarthScienceRetrieval", |
| 1198 | + "BrightEconomicsRetrieval", |
| 1199 | + "BrightPsychologyRetrieval", |
| 1200 | + "BrightRoboticsRetrieval", |
| 1201 | + "BrightStackoverflowRetrieval", |
| 1202 | + "BrightSustainableLivingRetrieval", |
| 1203 | + "BrightPonyRetrieval", |
| 1204 | + "BrightLeetcodeRetrieval", |
| 1205 | + "BrightAopsRetrieval", |
| 1206 | + "BrightTheoremQATheoremsRetrieval", |
| 1207 | + "BrightTheoremQAQuestionsRetrieval", |
| 1208 | + ], |
| 1209 | + ), |
| 1210 | + description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval (Individual Subsets). |
| 1211 | + This benchmark contains an individual task for each domain in the BRIGHT benchmark,
| 1212 | + allowing domain-specific evaluation. The subsets are: biology, earth science, economics,
| 1213 | + psychology, robotics, stackoverflow, sustainable living, pony, leetcode, aops, theoremqa_theorems,
| 1214 | + and theoremqa_questions.
| 1215 | + """, |
| 1216 | + reference="https://brightbenchmark.github.io/", |
| 1217 | + citation=r""" |
| 1218 | +@article{su2024bright, |
| 1219 | + author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others}, |
| 1220 | + journal = {arXiv preprint arXiv:2407.12883}, |
| 1221 | + title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, |
| 1222 | + year = {2024}, |
| 1223 | +} |
| 1224 | +""", |
| 1225 | +) |
| 1226 | + |
1187 | 1227 | BRIGHT_LONG = Benchmark( |
1188 | 1228 | name="BRIGHT (long)", |
1189 | | - tasks=MTEBTasks( |
1190 | | - ( |
1191 | | - get_task( |
1192 | | - "BrightLongRetrieval", |
1193 | | - ), |
1194 | | - ) |
| 1229 | + tasks=get_tasks( |
| 1230 | + tasks=[ |
| 1231 | + "BrightLongRetrieval", |
| 1232 | + ], |
1195 | 1233 | ), |
1196 | 1234 | description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval. |
1197 | 1235 | BRIGHT is the first text retrieval |
|
1213 | 1251 | """, |
1214 | 1252 | ) |
1215 | 1253 |
|
| 1254 | +BRIGHT_LONG_SUBSETS = Benchmark( |
| 1255 | + name="BRIGHT (long subsets)", |
| 1256 | + display_name="Reasoning Retrieval (long subsets)", |
| 1257 | + tasks=get_tasks( |
| 1258 | + tasks=[ |
| 1259 | + "BrightBiologyLongRetrieval", |
| 1260 | + "BrightEarthScienceLongRetrieval", |
| 1261 | + "BrightEconomicsLongRetrieval", |
| 1262 | + "BrightPsychologyLongRetrieval", |
| 1263 | + "BrightRoboticsLongRetrieval", |
| 1264 | + "BrightStackoverflowLongRetrieval", |
| 1265 | + "BrightSustainableLivingLongRetrieval", |
| 1266 | + "BrightPonyLongRetrieval", |
| 1267 | + ], |
| 1268 | + ), |
| 1269 | + description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval (Long Individual Subsets). |
| 1270 | + This benchmark contains an individual task for each domain in the BRIGHT benchmark, built on
| 1271 | + the long-document variant of the corpus, allowing domain-specific evaluation over longer contexts.
| 1272 | + The subsets are: biology, earth science, economics, psychology, robotics, stackoverflow, sustainable living, and pony.
| 1273 | + """, |
| 1274 | + reference="https://brightbenchmark.github.io/", |
| 1275 | + citation=r""" |
| 1276 | +@article{su2024bright, |
| 1277 | + author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others}, |
| 1278 | + journal = {arXiv preprint arXiv:2407.12883}, |
| 1279 | + title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, |
| 1280 | + year = {2024}, |
| 1281 | +} |
| 1282 | +""", |
| 1283 | +) |
| 1284 | + |
1216 | 1285 | CODE_RAG = Benchmark( |
1217 | 1286 | name="CodeRAG", |
1218 | 1287 | tasks=get_tasks( |
|
1603 | 1672 | "TRECCOVID-NL", |
1604 | 1673 | ], |
1605 | 1674 | ), |
1606 | | - description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated " |
1607 | | - "translation.", |
| 1675 | + description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated translation.", |
1608 | 1676 | reference="https://arxiv.org/abs/2412.08329", |
1609 | 1677 | contacts=["nikolay-banar"], |
1610 | 1678 | citation=r""" |
|
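The definitions above only register the benchmarks; running them goes through mteb's standard evaluation entry points. Below is a minimal sketch of how the new per-domain benchmark might be driven, assuming mteb's current get_benchmark / get_model / MTEB.run interface; the encoder name and output folders are placeholders, not part of this change.

import mteb

# Load the newly registered benchmark by its name; each BRIGHT domain is a
# standalone task, so scores are reported per domain rather than pooled.
# The long-document variant works the same way via "BRIGHT (long subsets)".
benchmark = mteb.get_benchmark("BRIGHT (subsets)")

# Placeholder encoder; any model wrapped for mteb would work here.
model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")

evaluation = mteb.MTEB(tasks=benchmark.tasks)
evaluation.run(model, output_folder="results/bright_subsets")

# A single domain can also be evaluated directly, which is the point of
# splitting BrightRetrieval into per-domain tasks:
biology = mteb.get_tasks(tasks=["BrightBiologyRetrieval"])
mteb.MTEB(tasks=biology).run(model, output_folder="results/bright_biology")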