embeddings-benchmark · whybe-choi · Oct 8, 2025 · Oct 20, 2025 · Oct 20, 2025 · Oct 20, 2025
diff --git a/mteb/benchmarks/benchmarks/__init__.py b/mteb/benchmarks/benchmarks/__init__.py
@@ -3,6 +3,8 @@
     BEIR_NL,
     BRIGHT,
     BRIGHT_LONG,
+    BRIGHT_SUBSETS,
+    BRIGHT_SUBSETS_LONG,
     BUILT_MTEB,
     C_MTEB,
     CHEMTEB,
@@ -62,6 +64,8 @@
     "BEIR_NL",
     "BRIGHT",
     "BRIGHT_LONG",
+    "BRIGHT_SUBSETS",
+    "BRIGHT_SUBSETS_LONG",
     "BUILT_MTEB",
     "CHEMTEB",
     "CODE_RAG",

diff --git a/mteb/benchmarks/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks/benchmarks.py
@@ -1198,6 +1198,42 @@
 """,
 )
 
+BRIGHT_SUBSETS = Benchmark(  # BRIGHT exposed as one retrieval task per domain
+    name="BRIGHT (subsets)",
+    display_name="Reasoning Retrieval (subsets)",
+    tasks=get_tasks(
+        tasks=[  # 12 entries, one per domain named in the description below
+            "BrightBiologyRetrieval",
+            "BrightEarthScienceRetrieval",
+            "BrightEconomicsRetrieval",
+            "BrightPsychologyRetrieval",
+            "BrightRoboticsRetrieval",
+            "BrightStackoverflowRetrieval",
+            "BrightSustainableLivingRetrieval",
+            "BrightPonyRetrieval",
+            "BrightLeetcodeRetrieval",
+            "BrightAopsRetrieval",
+            "BrightTheoremQATheoremsRetrieval",
+            "BrightTheoremQAQuestionsRetrieval",
+        ],
+    ),
+    description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval (Individual Subsets).
+    This benchmark contains individual subset tasks for each domain in the BRIGHT benchmark,
+    allowing for domain-specific evaluation. The subsets include: biology, earth science, economics,
+    psychology, robotics, stackoverflow, sustainable living, pony, leetcode, aops, theoremqa_theorems,
+    and theoremqa_questions.
+    """,
+    reference="https://brightbenchmark.github.io/",
+    citation=r"""
+@article{su2024bright,
+  author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others},
+  journal = {arXiv preprint arXiv:2407.12883},
+  title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},
+  year = {2024},
+}
+""",
+)
+
 BRIGHT_LONG = Benchmark(
     name="BRIGHT (long)",
     tasks=MTEBTasks(
@@ -1227,6 +1263,37 @@
 """,
 )
 
+BRIGHT_SUBSETS_LONG = Benchmark(  # long-document BRIGHT, one retrieval task per domain
+    name="BRIGHT (long subsets)",
+    display_name="Reasoning Retrieval (long subsets)",
+    tasks=get_tasks(
+        tasks=[  # 8 entries, one per domain named in the description below
+            "BrightBiologyLongRetrieval",
+            "BrightEarthScienceLongRetrieval",
+            "BrightEconomicsLongRetrieval",
+            "BrightPsychologyLongRetrieval",
+            "BrightRoboticsLongRetrieval",
+            "BrightStackoverflowLongRetrieval",
+            "BrightSustainableLivingLongRetrieval",
+            "BrightPonyLongRetrieval",
+        ],
+    ),
+    description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval (Long Individual Subsets).
+    This benchmark contains individual subset tasks for each domain in the BRIGHT benchmark with long documents,
+    allowing for domain-specific evaluation with longer context. The subsets include: biology, earth science,
+    economics, psychology, robotics, stackoverflow, sustainable living, and pony.
+    """,
+    reference="https://brightbenchmark.github.io/",
+    citation=r"""
+@article{su2024bright,
+  author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others},
+  journal = {arXiv preprint arXiv:2407.12883},
+  title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},
+  year = {2024},
+}
+""",
+)
+
 CODE_RAG = Benchmark(
     name="CodeRAG",
     tasks=get_tasks(
@@ -1619,8 +1686,7 @@
             "TRECCOVID-NL",
         ],
     ),
-    description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated "
-    "translation.",
+    description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated translation.",
     reference="https://arxiv.org/abs/2412.08329",
     contacts=["nikolay-banar"],
     citation=r"""

diff --git a/mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json
@@ -0,0 +1,30 @@
+{
+    "standard": {
+        "num_samples": 188113,
+        "number_of_characters": 141769714,
+        "documents_text_statistics": {
+            "total_text_length": 141734227,
+            "min_text_length": 58,
+            "average_text_length": 753.8974425803981,
+            "max_text_length": 7334,
+            "unique_texts": 176508
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 35487,
+            "min_text_length": 85,
+            "average_text_length": 319.7027027027027,
+            "max_text_length": 1167,
+            "unique_texts": 111
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 524,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 4.7207207207207205,
+            "max_relevant_docs_per_query": 8,
+            "unique_relevant_docs": 111
+        },
+        "top_ranked_statistics": null
+    }
+}
diff --git a/mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json
@@ -0,0 +1,30 @@
+{
+    "long": {
+        "num_samples": 627,
+        "number_of_characters": 19398082,
+        "documents_text_statistics": {
+            "total_text_length": 19344209,
+            "min_text_length": 142,
+            "average_text_length": 36916.42938931298,
+            "max_text_length": 1324201,
+            "unique_texts": 498
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 53873,
+            "min_text_length": 89,
+            "average_text_length": 523.0388349514564,
+            "max_text_length": 2195,
+            "unique_texts": 103
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 134,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 1.3009708737864079,
+            "max_relevant_docs_per_query": 4,
+            "unique_relevant_docs": 134
+        },
+        "top_ranked_statistics": null
+    }
+}
diff --git a/mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json
@@ -0,0 +1,30 @@
+{
+    "standard": {
+        "num_samples": 57462,
+        "number_of_characters": 18936054,
+        "documents_text_statistics": {
+            "total_text_length": 18882181,
+            "min_text_length": 1,
+            "average_text_length": 329.192994996426,
+            "max_text_length": 31130,
+            "unique_texts": 49434
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 53873,
+            "min_text_length": 89,
+            "average_text_length": 523.0388349514564,
+            "max_text_length": 2195,
+            "unique_texts": 103
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 374,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 3.6310679611650487,
+            "max_relevant_docs_per_query": 19,
+            "unique_relevant_docs": 374
+        },
+        "top_ranked_statistics": null
+    }
+}
diff --git a/mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json
@@ -0,0 +1,30 @@
+{
+    "long": {
+        "num_samples": 717,
+        "number_of_characters": 41696684,
+        "documents_text_statistics": {
+            "total_text_length": 41641374,
+            "min_text_length": 28,
+            "average_text_length": 69286.81198003328,
+            "max_text_length": 2627262,
+            "unique_texts": 587
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 55310,
+            "min_text_length": 83,
+            "average_text_length": 476.8103448275862,
+            "max_text_length": 1565,
+            "unique_texts": 116
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 187,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 1.6120689655172413,
+            "max_relevant_docs_per_query": 4,
+            "unique_relevant_docs": 187
+        },
+        "top_ranked_statistics": null
+    }
+}
diff --git a/mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json
@@ -0,0 +1,30 @@
+{
+    "standard": {
+        "num_samples": 121365,
+        "number_of_characters": 40478259,
+        "documents_text_statistics": {
+            "total_text_length": 40422949,
+            "min_text_length": 1,
+            "average_text_length": 333.3878959826473,
+            "max_text_length": 233622,
+            "unique_texts": 117633
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 55310,
+            "min_text_length": 83,
+            "average_text_length": 476.8103448275862,
+            "max_text_length": 1565,
+            "unique_texts": 116
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 609,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 5.25,
+            "max_relevant_docs_per_query": 23,
+            "unique_relevant_docs": 609
+        },
+        "top_ranked_statistics": null
+    }
+}
diff --git a/mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json
@@ -0,0 +1,30 @@
+{
+    "long": {
+        "num_samples": 619,
+        "number_of_characters": 19993261,
+        "documents_text_statistics": {
+            "total_text_length": 19917079,
+            "min_text_length": 43,
+            "average_text_length": 38598.99031007752,
+            "max_text_length": 429507,
+            "unique_texts": 515
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 76182,
+            "min_text_length": 164,
+            "average_text_length": 739.6310679611651,
+            "max_text_length": 2223,
+            "unique_texts": 103
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 109,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 1.058252427184466,
+            "max_relevant_docs_per_query": 3,
+            "unique_relevant_docs": 109
+        },
+        "top_ranked_statistics": null
+    }
+}
diff --git a/mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json
@@ -0,0 +1,30 @@
+{
+    "standard": {
+        "num_samples": 50323,
+        "number_of_characters": 19882579,
+        "documents_text_statistics": {
+            "total_text_length": 19806397,
+            "min_text_length": 1,
+            "average_text_length": 394.3926125049781,
+            "max_text_length": 39672,
+            "unique_texts": 40594
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 76182,
+            "min_text_length": 164,
+            "average_text_length": 739.6310679611651,
+            "max_text_length": 2223,
+            "unique_texts": 103
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 823,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 7.990291262135922,
+            "max_relevant_docs_per_query": 85,
+            "unique_relevant_docs": 823
+        },
+        "top_ranked_statistics": null
+    }
+}
diff --git a/mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json
@@ -0,0 +1,30 @@
+{
+    "standard": {
+        "num_samples": 414074,
+        "number_of_characters": 438348000,
+        "documents_text_statistics": {
+            "total_text_length": 438140779,
+            "min_text_length": 75,
+            "average_text_length": 1058.4849178125876,
+            "max_text_length": 103665,
+            "unique_texts": 413932
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 207221,
+            "min_text_length": 422,
+            "average_text_length": 1459.3028169014085,
+            "max_text_length": 3964,
+            "unique_texts": 142
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 262,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 1.8450704225352113,
+            "max_relevant_docs_per_query": 5,
+            "unique_relevant_docs": 216
+        },
+        "top_ranked_statistics": null
+    }
+}
diff --git a/mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json
@@ -0,0 +1,30 @@
+{
+    "long": {
+        "num_samples": 689,
+        "number_of_characters": 2093720,
+        "documents_text_statistics": {
+            "total_text_length": 2050155,
+            "min_text_length": 28,
+            "average_text_length": 3553.1282495667247,
+            "max_text_length": 108885,
+            "unique_texts": 577
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 43565,
+            "min_text_length": 182,
+            "average_text_length": 388.9732142857143,
+            "max_text_length": 946,
+            "unique_texts": 112
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 769,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 6.866071428571429,
+            "max_relevant_docs_per_query": 12,
+            "unique_relevant_docs": 17
+        },
+        "top_ranked_statistics": null
+    }
+}