Skip to content
4 changes: 4 additions & 0 deletions mteb/benchmarks/benchmarks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
BEIR_NL,
BRIGHT,
BRIGHT_LONG,
BRIGHT_SUBSETS,
BRIGHT_SUBSETS_LONG,
BUILT_MTEB,
C_MTEB,
CHEMTEB,
Expand Down Expand Up @@ -62,6 +64,8 @@
"BEIR_NL",
"BRIGHT",
"BRIGHT_LONG",
"BRIGHT_SUBSETS",
"BRIGHT_SUBSETS_LONG",
"BUILT_MTEB",
"CHEMTEB",
"CODE_RAG",
Expand Down
70 changes: 68 additions & 2 deletions mteb/benchmarks/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1198,6 +1198,42 @@
""",
)

# Benchmark that exposes each of the twelve BRIGHT domains as its own
# retrieval task, so results can be inspected per domain.
BRIGHT_SUBSETS = Benchmark(
    name="BRIGHT (subsets)",
    display_name="Reasoning Retrieval (subsets)",
    tasks=get_tasks(
        # Every task name has the shape ``Bright<Domain>Retrieval``; the
        # domains appear in the same order as in the description below.
        tasks=[
            f"Bright{domain}Retrieval"
            for domain in (
                "Biology",
                "EarthScience",
                "Economics",
                "Psychology",
                "Robotics",
                "Stackoverflow",
                "SustainableLiving",
                "Pony",
                "Leetcode",
                "Aops",
                "TheoremQATheorems",
                "TheoremQAQuestions",
            )
        ],
    ),
    # NOTE: the lines inside the triple-quoted literals are part of the
    # runtime strings, so they are intentionally left unindented.
    description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval (Individual Subsets).
This benchmark contains individual subset tasks for each domain in the BRIGHT benchmark,
allowing for domain-specific evaluation. The subsets include: biology, earth science, economics,
psychology, robotics, stackoverflow, sustainable living, pony, leetcode, aops, theoremqa_theorems,
and theoremqa_questions.
""",
    reference="https://brightbenchmark.github.io/",
    citation=r"""
@article{su2024bright,
author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others},
journal = {arXiv preprint arXiv:2407.12883},
title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},
year = {2024},
}
""",
)

BRIGHT_LONG = Benchmark(
name="BRIGHT (long)",
tasks=MTEBTasks(
Expand Down Expand Up @@ -1227,6 +1263,37 @@
""",
)

# Long-document counterpart of the per-domain BRIGHT benchmark: one
# ``...LongRetrieval`` task for each of the eight domains listed below.
BRIGHT_SUBSETS_LONG = Benchmark(
    name="BRIGHT (long subsets)",
    display_name="Reasoning Retrieval (long subsets)",
    tasks=get_tasks(
        # Every task name has the shape ``Bright<Domain>LongRetrieval``; the
        # domains appear in the same order as in the description below.
        tasks=[
            f"Bright{domain}LongRetrieval"
            for domain in (
                "Biology",
                "EarthScience",
                "Economics",
                "Psychology",
                "Robotics",
                "Stackoverflow",
                "SustainableLiving",
                "Pony",
            )
        ],
    ),
    # NOTE: the lines inside the triple-quoted literals are part of the
    # runtime strings, so they are intentionally left unindented.
    description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval (Long Individual Subsets).
This benchmark contains individual subset tasks for each domain in the BRIGHT benchmark with long documents,
allowing for domain-specific evaluation with longer context. The subsets include: biology, earth science,
economics, psychology, robotics, stackoverflow, sustainable living, and pony.
""",
    reference="https://brightbenchmark.github.io/",
    citation=r"""
@article{su2024bright,
author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others},
journal = {arXiv preprint arXiv:2407.12883},
title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},
year = {2024},
}
""",
)

CODE_RAG = Benchmark(
name="CodeRAG",
tasks=get_tasks(
Expand Down Expand Up @@ -1619,8 +1686,7 @@
"TRECCOVID-NL",
],
),
description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated "
"translation.",
description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated translation.",
reference="https://arxiv.org/abs/2412.08329",
contacts=["nikolay-banar"],
citation=r"""
Expand Down
30 changes: 30 additions & 0 deletions mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"standard": {
"num_samples": 188113,
"number_of_characters": 141769714,
"documents_text_statistics": {
"total_text_length": 141734227,
"min_text_length": 58,
"average_text_length": 753.8974425803981,
"max_text_length": 7334,
"unique_texts": 176508
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 35487,
"min_text_length": 85,
"average_text_length": 319.7027027027027,
"max_text_length": 1167,
"unique_texts": 111
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 524,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 4.7207207207207205,
"max_relevant_docs_per_query": 8,
"unique_relevant_docs": 111
},
"top_ranked_statistics": null
}
}
30 changes: 30 additions & 0 deletions mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"long": {
"num_samples": 627,
"number_of_characters": 19398082,
"documents_text_statistics": {
"total_text_length": 19344209,
"min_text_length": 142,
"average_text_length": 36916.42938931298,
"max_text_length": 1324201,
"unique_texts": 498
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 53873,
"min_text_length": 89,
"average_text_length": 523.0388349514564,
"max_text_length": 2195,
"unique_texts": 103
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 134,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.3009708737864079,
"max_relevant_docs_per_query": 4,
"unique_relevant_docs": 134
},
"top_ranked_statistics": null
}
}
30 changes: 30 additions & 0 deletions mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"standard": {
"num_samples": 57462,
"number_of_characters": 18936054,
"documents_text_statistics": {
"total_text_length": 18882181,
"min_text_length": 1,
"average_text_length": 329.192994996426,
"max_text_length": 31130,
"unique_texts": 49434
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 53873,
"min_text_length": 89,
"average_text_length": 523.0388349514564,
"max_text_length": 2195,
"unique_texts": 103
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 374,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 3.6310679611650487,
"max_relevant_docs_per_query": 19,
"unique_relevant_docs": 374
},
"top_ranked_statistics": null
}
}
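30 changes: 30 additions & 0 deletions mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json
Original file line number Diff line number Diff line change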
@@ -0,0 +1,30 @@
{
"long": {
"num_samples": 717,
"number_of_characters": 41696684,
"documents_text_statistics": {
"total_text_length": 41641374,
"min_text_length": 28,
"average_text_length": 69286.81198003328,
"max_text_length": 2627262,
"unique_texts": 587
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 55310,
"min_text_length": 83,
"average_text_length": 476.8103448275862,
"max_text_length": 1565,
"unique_texts": 116
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 187,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.6120689655172413,
"max_relevant_docs_per_query": 4,
"unique_relevant_docs": 187
},
"top_ranked_statistics": null
}
}
30 changes: 30 additions & 0 deletions mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"standard": {
"num_samples": 121365,
"number_of_characters": 40478259,
"documents_text_statistics": {
"total_text_length": 40422949,
"min_text_length": 1,
"average_text_length": 333.3878959826473,
"max_text_length": 233622,
"unique_texts": 117633
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 55310,
"min_text_length": 83,
"average_text_length": 476.8103448275862,
"max_text_length": 1565,
"unique_texts": 116
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 609,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 5.25,
"max_relevant_docs_per_query": 23,
"unique_relevant_docs": 609
},
"top_ranked_statistics": null
}
}
30 changes: 30 additions & 0 deletions mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"long": {
"num_samples": 619,
"number_of_characters": 19993261,
"documents_text_statistics": {
"total_text_length": 19917079,
"min_text_length": 43,
"average_text_length": 38598.99031007752,
"max_text_length": 429507,
"unique_texts": 515
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 76182,
"min_text_length": 164,
"average_text_length": 739.6310679611651,
"max_text_length": 2223,
"unique_texts": 103
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 109,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.058252427184466,
"max_relevant_docs_per_query": 3,
"unique_relevant_docs": 109
},
"top_ranked_statistics": null
}
}
30 changes: 30 additions & 0 deletions mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"standard": {
"num_samples": 50323,
"number_of_characters": 19882579,
"documents_text_statistics": {
"total_text_length": 19806397,
"min_text_length": 1,
"average_text_length": 394.3926125049781,
"max_text_length": 39672,
"unique_texts": 40594
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 76182,
"min_text_length": 164,
"average_text_length": 739.6310679611651,
"max_text_length": 2223,
"unique_texts": 103
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 823,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 7.990291262135922,
"max_relevant_docs_per_query": 85,
"unique_relevant_docs": 823
},
"top_ranked_statistics": null
}
}
30 changes: 30 additions & 0 deletions mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"standard": {
"num_samples": 414074,
"number_of_characters": 438348000,
"documents_text_statistics": {
"total_text_length": 438140779,
"min_text_length": 75,
"average_text_length": 1058.4849178125876,
"max_text_length": 103665,
"unique_texts": 413932
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 207221,
"min_text_length": 422,
"average_text_length": 1459.3028169014085,
"max_text_length": 3964,
"unique_texts": 142
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 262,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.8450704225352113,
"max_relevant_docs_per_query": 5,
"unique_relevant_docs": 216
},
"top_ranked_statistics": null
}
}
30 changes: 30 additions & 0 deletions mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"long": {
"num_samples": 689,
"number_of_characters": 2093720,
"documents_text_statistics": {
"total_text_length": 2050155,
"min_text_length": 28,
"average_text_length": 3553.1282495667247,
"max_text_length": 108885,
"unique_texts": 577
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 43565,
"min_text_length": 182,
"average_text_length": 388.9732142857143,
"max_text_length": 946,
"unique_texts": 112
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 769,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 6.866071428571429,
"max_relevant_docs_per_query": 12,
"unique_relevant_docs": 17
},
"top_ranked_statistics": null
}
}
Loading