diff --git a/mteb/benchmarks/benchmarks/__init__.py b/mteb/benchmarks/benchmarks/__init__.py index 3e16b58c82..0bb7fab61e 100644 --- a/mteb/benchmarks/benchmarks/__init__.py +++ b/mteb/benchmarks/benchmarks/__init__.py @@ -3,6 +3,8 @@ BEIR_NL, BRIGHT, BRIGHT_LONG, + BRIGHT_SUBSETS, + BRIGHT_SUBSETS_LONG, BUILT_MTEB, C_MTEB, CHEMTEB, @@ -62,6 +64,8 @@ "BEIR_NL", "BRIGHT", "BRIGHT_LONG", + "BRIGHT_SUBSETS", + "BRIGHT_SUBSETS_LONG", "BUILT_MTEB", "CHEMTEB", "CODE_RAG", diff --git a/mteb/benchmarks/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks/benchmarks.py index e29eab05b3..c59c624d05 100644 --- a/mteb/benchmarks/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks/benchmarks.py @@ -1198,6 +1198,42 @@ """, ) +BRIGHT_SUBSETS = Benchmark( + name="BRIGHT (subsets)", + display_name="Reasoning Retrieval (subsets)", + tasks=get_tasks( + tasks=[ + "BrightBiologyRetrieval", + "BrightEarthScienceRetrieval", + "BrightEconomicsRetrieval", + "BrightPsychologyRetrieval", + "BrightRoboticsRetrieval", + "BrightStackoverflowRetrieval", + "BrightSustainableLivingRetrieval", + "BrightPonyRetrieval", + "BrightLeetcodeRetrieval", + "BrightAopsRetrieval", + "BrightTheoremQATheoremsRetrieval", + "BrightTheoremQAQuestionsRetrieval", + ], + ), + description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval (Individual Subsets). + This benchmark contains individual subset tasks for each domain in the BRIGHT benchmark, + allowing for domain-specific evaluation. The subsets include: biology, earth science, economics, + psychology, robotics, stackoverflow, sustainable living, pony, leetcode, aops, theoremqa_theorems, + and theoremqa_questions. + """, + reference="https://brightbenchmark.github.io/", + citation=r""" +@article{su2024bright, + author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others}, + journal = {arXiv preprint arXiv:2407.12883}, + title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, + year = {2024}, +} +""", +) + BRIGHT_LONG = Benchmark( name="BRIGHT (long)", tasks=MTEBTasks( @@ -1227,6 +1263,37 @@ """, ) +BRIGHT_SUBSETS_LONG = Benchmark( + name="BRIGHT (long subsets)", + display_name="Reasoning Retrieval (long subsets)", + tasks=get_tasks( + tasks=[ + "BrightBiologyLongRetrieval", + "BrightEarthScienceLongRetrieval", + "BrightEconomicsLongRetrieval", + "BrightPsychologyLongRetrieval", + "BrightRoboticsLongRetrieval", + "BrightStackoverflowLongRetrieval", + "BrightSustainableLivingLongRetrieval", + "BrightPonyLongRetrieval", + ], + ), + description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval (Long Individual Subsets). + This benchmark contains individual subset tasks for each domain in the BRIGHT benchmark with long documents, + allowing for domain-specific evaluation with longer context. The subsets include: biology, earth science, + economics, psychology, robotics, stackoverflow, sustainable living, and pony. 
+ """, + reference="https://brightbenchmark.github.io/", + citation=r""" +@article{su2024bright, + author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others}, + journal = {arXiv preprint arXiv:2407.12883}, + title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, + year = {2024}, +} +""", +) + CODE_RAG = Benchmark( name="CodeRAG", tasks=get_tasks( @@ -1619,8 +1686,7 @@ "TRECCOVID-NL", ], ), - description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated " - "translation.", + description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated translation.", reference="https://arxiv.org/abs/2412.08329", contacts=["nikolay-banar"], citation=r""" diff --git a/mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json new file mode 100644 index 0000000000..ffeff980ca --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json @@ -0,0 +1,30 @@ +{ + "standard": { + "num_samples": 188113, + "number_of_characters": 141769714, + "documents_text_statistics": { + "total_text_length": 141734227, + "min_text_length": 58, + "average_text_length": 753.8974425803981, + "max_text_length": 7334, + "unique_texts": 176508 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 35487, + "min_text_length": 85, + "average_text_length": 319.7027027027027, + "max_text_length": 1167, + "unique_texts": 111 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 524, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 4.7207207207207205, + "max_relevant_docs_per_query": 8, + "unique_relevant_docs": 111 + }, + "top_ranked_statistics": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json new file mode 100644 index 0000000000..b5853bc1ee --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json @@ -0,0 +1,30 @@ +{ + "long": { + "num_samples": 627, + "number_of_characters": 19398082, + "documents_text_statistics": { + "total_text_length": 19344209, + "min_text_length": 142, + "average_text_length": 36916.42938931298, + "max_text_length": 1324201, + "unique_texts": 498 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 53873, + "min_text_length": 89, + "average_text_length": 523.0388349514564, + "max_text_length": 2195, + "unique_texts": 103 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 134, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.3009708737864079, + "max_relevant_docs_per_query": 4, + "unique_relevant_docs": 134 + }, + "top_ranked_statistics": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json new file mode 100644 index 0000000000..582502f801 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json @@ -0,0 +1,30 @@ +{ + "standard": { + "num_samples": 57462, + "number_of_characters": 18936054, + "documents_text_statistics": { + "total_text_length": 18882181, + "min_text_length": 1, + "average_text_length": 
329.192994996426, + "max_text_length": 31130, + "unique_texts": 49434 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 53873, + "min_text_length": 89, + "average_text_length": 523.0388349514564, + "max_text_length": 2195, + "unique_texts": 103 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 374, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 3.6310679611650487, + "max_relevant_docs_per_query": 19, + "unique_relevant_docs": 374 + }, + "top_ranked_statistics": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json new file mode 100644 index 0000000000..d9aa1db590 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json @@ -0,0 +1,30 @@ +{ + "long": { + "num_samples": 717, + "number_of_characters": 41696684, + "documents_text_statistics": { + "total_text_length": 41641374, + "min_text_length": 28, + "average_text_length": 69286.81198003328, + "max_text_length": 2627262, + "unique_texts": 587 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 55310, + "min_text_length": 83, + "average_text_length": 476.8103448275862, + "max_text_length": 1565, + "unique_texts": 116 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 187, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.6120689655172413, + "max_relevant_docs_per_query": 4, + "unique_relevant_docs": 187 + }, + "top_ranked_statistics": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json new file mode 100644 index 0000000000..6877be42ee --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json @@ -0,0 +1,30 @@ +{ + "standard": { + "num_samples": 121365, + "number_of_characters": 40478259, + "documents_text_statistics": { + "total_text_length": 40422949, + "min_text_length": 1, + "average_text_length": 333.3878959826473, + "max_text_length": 233622, + "unique_texts": 117633 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 55310, + "min_text_length": 83, + "average_text_length": 476.8103448275862, + "max_text_length": 1565, + "unique_texts": 116 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 609, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 5.25, + "max_relevant_docs_per_query": 23, + "unique_relevant_docs": 609 + }, + "top_ranked_statistics": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json new file mode 100644 index 0000000000..b97b56375e --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json @@ -0,0 +1,30 @@ +{ + "long": { + "num_samples": 619, + "number_of_characters": 19993261, + "documents_text_statistics": { + "total_text_length": 19917079, + "min_text_length": 43, + "average_text_length": 38598.99031007752, + "max_text_length": 429507, + "unique_texts": 515 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 76182, + "min_text_length": 164, + "average_text_length": 739.6310679611651, + "max_text_length": 2223, + 
"unique_texts": 103 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 109, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.058252427184466, + "max_relevant_docs_per_query": 3, + "unique_relevant_docs": 109 + }, + "top_ranked_statistics": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json new file mode 100644 index 0000000000..33d5af6ae3 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json @@ -0,0 +1,30 @@ +{ + "standard": { + "num_samples": 50323, + "number_of_characters": 19882579, + "documents_text_statistics": { + "total_text_length": 19806397, + "min_text_length": 1, + "average_text_length": 394.3926125049781, + "max_text_length": 39672, + "unique_texts": 40594 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 76182, + "min_text_length": 164, + "average_text_length": 739.6310679611651, + "max_text_length": 2223, + "unique_texts": 103 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 823, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 7.990291262135922, + "max_relevant_docs_per_query": 85, + "unique_relevant_docs": 823 + }, + "top_ranked_statistics": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json new file mode 100644 index 0000000000..49ff28ca24 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json @@ -0,0 +1,30 @@ +{ + "standard": { + "num_samples": 414074, + "number_of_characters": 438348000, + "documents_text_statistics": { + "total_text_length": 438140779, + "min_text_length": 75, + "average_text_length": 1058.4849178125876, + "max_text_length": 103665, + "unique_texts": 413932 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 207221, + "min_text_length": 422, + "average_text_length": 1459.3028169014085, + "max_text_length": 3964, + "unique_texts": 142 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 262, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.8450704225352113, + "max_relevant_docs_per_query": 5, + "unique_relevant_docs": 216 + }, + "top_ranked_statistics": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json new file mode 100644 index 0000000000..7f90d7b396 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json @@ -0,0 +1,30 @@ +{ + "long": { + "num_samples": 689, + "number_of_characters": 2093720, + "documents_text_statistics": { + "total_text_length": 2050155, + "min_text_length": 28, + "average_text_length": 3553.1282495667247, + "max_text_length": 108885, + "unique_texts": 577 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 43565, + "min_text_length": 182, + "average_text_length": 388.9732142857143, + "max_text_length": 946, + "unique_texts": 112 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 769, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 6.866071428571429, + "max_relevant_docs_per_query": 12, + "unique_relevant_docs": 17 + }, + "top_ranked_statistics": 
null + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json new file mode 100644 index 0000000000..f2add40090 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json @@ -0,0 +1,30 @@ +{ + "standard": { + "num_samples": 8006, + "number_of_characters": 2082980, + "documents_text_statistics": { + "total_text_length": 2039415, + "min_text_length": 5, + "average_text_length": 258.350012667849, + "max_text_length": 2583, + "unique_texts": 6183 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 43565, + "min_text_length": 182, + "average_text_length": 388.9732142857143, + "max_text_length": 946, + "unique_texts": 112 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 2519, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 22.491071428571427, + "max_relevant_docs_per_query": 32, + "unique_relevant_docs": 47 + }, + "top_ranked_statistics": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json new file mode 100644 index 0000000000..5dbc828642 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json @@ -0,0 +1,30 @@ +{ + "long": { + "num_samples": 613, + "number_of_characters": 20489389, + "documents_text_statistics": { + "total_text_length": 20419376, + "min_text_length": 23, + "average_text_length": 39881.59375, + "max_text_length": 669575, + "unique_texts": 509 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 70013, + "min_text_length": 166, + "average_text_length": 693.1980198019802, + "max_text_length": 2334, + "unique_texts": 101 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 116, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.1485148514851484, + "max_relevant_docs_per_query": 5, + "unique_relevant_docs": 113 + }, + "top_ranked_statistics": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json new file mode 100644 index 0000000000..be5b122fc8 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json @@ -0,0 +1,30 @@ +{ + "standard": { + "num_samples": 52936, + "number_of_characters": 20372421, + "documents_text_statistics": { + "total_text_length": 20302408, + "min_text_length": 3, + "average_text_length": 384.26058483959497, + "max_text_length": 226941, + "unique_texts": 43756 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 70013, + "min_text_length": 166, + "average_text_length": 693.1980198019802, + "max_text_length": 2334, + "unique_texts": 101 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 742, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 7.346534653465347, + "max_relevant_docs_per_query": 59, + "unique_relevant_docs": 738 + }, + "top_ranked_statistics": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json new file mode 100644 index 0000000000..061d53d9ca --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json @@ -0,0 +1,30 @@ +{ + 
"long": { + "num_samples": 609, + "number_of_characters": 18386897, + "documents_text_statistics": { + "total_text_length": 18166762, + "min_text_length": 117, + "average_text_length": 35761.34251968504, + "max_text_length": 3589928, + "unique_texts": 505 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 220135, + "min_text_length": 165, + "average_text_length": 2179.5544554455446, + "max_text_length": 19341, + "unique_texts": 101 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 106, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0495049504950495, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 106 + }, + "top_ranked_statistics": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json new file mode 100644 index 0000000000..c35f9d65bc --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json @@ -0,0 +1,30 @@ +{ + "standard": { + "num_samples": 62062, + "number_of_characters": 18167360, + "documents_text_statistics": { + "total_text_length": 17947225, + "min_text_length": 1, + "average_text_length": 289.6535724084505, + "max_text_length": 28637, + "unique_texts": 40431 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 220135, + "min_text_length": 165, + "average_text_length": 2179.5544554455446, + "max_text_length": 19341, + "unique_texts": 101 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 553, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 5.475247524752476, + "max_relevant_docs_per_query": 36, + "unique_relevant_docs": 553 + }, + "top_ranked_statistics": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json new file mode 100644 index 0000000000..2a16d89936 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json @@ -0,0 +1,30 @@ +{ + "long": { + "num_samples": 1975, + "number_of_characters": 184326754, + "documents_text_statistics": { + "total_text_length": 184175475, + "min_text_length": 41, + "average_text_length": 99125.65931108719, + "max_text_length": 9182738, + "unique_texts": 1846 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 151279, + "min_text_length": 185, + "average_text_length": 1292.982905982906, + "max_text_length": 12432, + "unique_texts": 117 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 129, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.1025641025641026, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 125 + }, + "top_ranked_statistics": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json new file mode 100644 index 0000000000..2945dae20e --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json @@ -0,0 +1,30 @@ +{ + "standard": { + "num_samples": 107198, + "number_of_characters": 183652816, + "documents_text_statistics": { + "total_text_length": 183501537, + "min_text_length": 1, + "average_text_length": 1713.6703710275399, + "max_text_length": 4000, + "unique_texts": 66270 + }, + 
"documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 151279, + "min_text_length": 185, + "average_text_length": 1292.982905982906, + "max_text_length": 12432, + "unique_texts": 117 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 819, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 7.0, + "max_relevant_docs_per_query": 59, + "unique_relevant_docs": 816 + }, + "top_ranked_statistics": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json new file mode 100644 index 0000000000..959565a372 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json @@ -0,0 +1,30 @@ +{ + "long": { + "num_samples": 662, + "number_of_characters": 21154322, + "documents_text_statistics": { + "total_text_length": 21080575, + "min_text_length": 30, + "average_text_length": 38051.579422382674, + "max_text_length": 5732344, + "unique_texts": 551 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 73747, + "min_text_length": 158, + "average_text_length": 682.8425925925926, + "max_text_length": 2843, + "unique_texts": 108 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 129, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.1944444444444444, + "max_relevant_docs_per_query": 5, + "unique_relevant_docs": 129 + }, + "top_ranked_statistics": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json new file mode 100644 index 0000000000..7bb2b2524b --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json @@ -0,0 +1,30 @@ +{ + "standard": { + "num_samples": 60900, + "number_of_characters": 20971763, + "documents_text_statistics": { + "total_text_length": 20898016, + "min_text_length": 1, + "average_text_length": 343.7626003421503, + "max_text_length": 158296, + "unique_texts": 50142 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 73747, + "min_text_length": 158, + "average_text_length": 682.8425925925926, + "max_text_length": 2843, + "unique_texts": 108 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 604, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 5.592592592592593, + "max_relevant_docs_per_query": 59, + "unique_relevant_docs": 604 + }, + "top_ranked_statistics": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json new file mode 100644 index 0000000000..76593b399c --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json @@ -0,0 +1,30 @@ +{ + "standard": { + "num_samples": 188207, + "number_of_characters": 141817604, + "documents_text_statistics": { + "total_text_length": 141734227, + "min_text_length": 58, + "average_text_length": 753.8974425803981, + "max_text_length": 7334, + "unique_texts": 176508 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 83377, + "min_text_length": 12, + "average_text_length": 406.7170731707317, + "max_text_length": 1255, + "unique_texts": 201 + }, + 
"queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 469, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 2.299019607843137, + "max_relevant_docs_per_query": 7, + "unique_relevant_docs": 234 + }, + "top_ranked_statistics": null + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json new file mode 100644 index 0000000000..8eb2a4eb5d --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json @@ -0,0 +1,30 @@ +{ + "standard": { + "num_samples": 23904, + "number_of_characters": 20825122, + "documents_text_statistics": { + "total_text_length": 20797224, + "min_text_length": 74, + "average_text_length": 872.4033726246906, + "max_text_length": 19104, + "unique_texts": 23839 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 27898, + "min_text_length": 13, + "average_text_length": 429.2, + "max_text_length": 1255, + "unique_texts": 65 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 126, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.9384615384615385, + "max_relevant_docs_per_query": 6, + "unique_relevant_docs": 95 + }, + "top_ranked_statistics": null + } +} diff --git a/mteb/tasks/retrieval/eng/__init__.py b/mteb/tasks/retrieval/eng/__init__.py index cfa2814d38..6675bfeec0 100644 --- a/mteb/tasks/retrieval/eng/__init__.py +++ b/mteb/tasks/retrieval/eng/__init__.py @@ -14,6 +14,30 @@ from .blink_it2i_retrieval import BLINKIT2IRetrieval from .blink_it2t_retrieval import BLINKIT2TRetrieval from .bright_retrieval import BrightLongRetrieval, BrightRetrieval +from .bright_subsets_long_retrieval import ( + BrightBiologyLongRetrieval, + BrightEarthScienceLongRetrieval, + BrightEconomicsLongRetrieval, + BrightPonyLongRetrieval, + BrightPsychologyLongRetrieval, + BrightRoboticsLongRetrieval, + BrightStackoverflowLongRetrieval, + BrightSustainableLivingLongRetrieval, +) +from .bright_subsets_retrieval import ( + BrightAopsRetrieval, + BrightBiologyRetrieval, + BrightEarthScienceRetrieval, + BrightEconomicsRetrieval, + BrightLeetcodeRetrieval, + BrightPonyRetrieval, + BrightPsychologyRetrieval, + BrightRoboticsRetrieval, + BrightStackoverflowRetrieval, + BrightSustainableLivingRetrieval, + BrightTheoremQAQuestionsRetrieval, + BrightTheoremQATheoremsRetrieval, +) from .built_bench_retrieval import BuiltBenchRetrieval from .chat_doctor_retrieval import ChatDoctorRetrieval from .chem_hotpot_qa_retrieval import ChemHotpotQARetrieval @@ -226,8 +250,28 @@ "BarExamQARetrieval", "BillSumCARetrieval", "BillSumUSRetrieval", + "BrightAopsRetrieval", + "BrightBiologyLongRetrieval", + "BrightBiologyRetrieval", + "BrightEarthScienceLongRetrieval", + "BrightEarthScienceRetrieval", + "BrightEconomicsLongRetrieval", + "BrightEconomicsRetrieval", + "BrightLeetcodeRetrieval", "BrightLongRetrieval", + "BrightPonyLongRetrieval", + "BrightPonyRetrieval", + "BrightPsychologyLongRetrieval", + "BrightPsychologyRetrieval", "BrightRetrieval", + "BrightRoboticsLongRetrieval", + "BrightRoboticsRetrieval", + "BrightStackoverflowLongRetrieval", + "BrightStackoverflowRetrieval", + "BrightSustainableLivingLongRetrieval", + "BrightSustainableLivingRetrieval", + "BrightTheoremQAQuestionsRetrieval", + "BrightTheoremQATheoremsRetrieval", "BuiltBenchRetrieval", "CIRRIT2IRetrieval", "CQADupstackAndroidRetrieval", diff --git 
a/mteb/tasks/retrieval/eng/bright_retrieval.py b/mteb/tasks/retrieval/eng/bright_retrieval.py index 82c56b8451..84a73077ee 100644 --- a/mteb/tasks/retrieval/eng/bright_retrieval.py +++ b/mteb/tasks/retrieval/eng/bright_retrieval.py @@ -104,7 +104,7 @@ class BrightRetrieval(AbsTaskRetrieval): "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", }, reference="https://huggingface.co/datasets/xlangai/BRIGHT", - description="Bright retrieval dataset.", + description="BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval", type="Retrieval", category="t2t", eval_splits=["standard"], diff --git a/mteb/tasks/retrieval/eng/bright_subsets_long_retrieval.py b/mteb/tasks/retrieval/eng/bright_subsets_long_retrieval.py new file mode 100644 index 0000000000..423605ebf1 --- /dev/null +++ b/mteb/tasks/retrieval/eng/bright_subsets_long_retrieval.py @@ -0,0 +1,431 @@ +from __future__ import annotations + +from collections import defaultdict + +import datasets + +from mteb.abstasks import AbsTaskRetrieval +from mteb.abstasks.task_metadata import TaskMetadata + + +def load_bright_long_data( + path: str, + domain: str, + eval_splits: list, + cache_dir: str | None = None, + revision: str | None = None, +): + corpus = dict.fromkeys(eval_splits) + queries = dict.fromkeys(eval_splits) + relevant_docs = dict.fromkeys(eval_splits) + top_ranked = dict.fromkeys(eval_splits) + + domain_corpus_long = datasets.load_dataset( + path, + "long_documents", + split=domain, + cache_dir=cache_dir, + revision=revision, + ) + examples = datasets.load_dataset( + path, + "examples", + split=domain, + cache_dir=cache_dir, + revision=revision, + ) + corpus["long"] = {e["id"]: {"text": e["content"]} for e in domain_corpus_long} + queries["long"] = {e["id"]: e["query"] for e in examples} + relevant_docs["long"] = defaultdict(dict) + top_ranked["long"] = defaultdict(list) + + # Get all document IDs + all_doc_ids = [e["id"] for e in domain_corpus_long] + + for e in examples: + qid = e["id"] + gold_ids_long = e["gold_ids_long"] + for gid in gold_ids_long: + relevant_docs["long"][qid].update({gid: 1}) + + # Create top_ranked: all documents except excluded_ids + excluded_ids = e.get("excluded_ids", []) + if excluded_ids and excluded_ids != ["N/A"]: + excluded_set = set(excluded_ids) + top_ranked["long"][qid] = [ + doc_id for doc_id in all_doc_ids if doc_id not in excluded_set + ] + else: + # No exclusions, use all documents + top_ranked["long"][qid] = all_doc_ids + + corpus = datasets.DatasetDict(corpus) + queries = datasets.DatasetDict(queries) + relevant_docs = datasets.DatasetDict(relevant_docs) + top_ranked = datasets.DatasetDict(top_ranked) + return corpus, queries, relevant_docs, top_ranked + + +_BIBTEX_CITATION = r""" +@misc{su2024brightrealisticchallengingbenchmark, + archiveprefix = {arXiv}, + author = {Hongjin Su and Howard Yen and Mengzhou Xia and Weijia Shi and Niklas Muennighoff and Han-yu Wang and Haisu Liu and Quan Shi and Zachary S. Siegel and Michael Tang and Ruoxi Sun and Jinsung Yoon and Sercan O. 
Arik and Danqi Chen and Tao Yu}, + eprint = {2407.12883}, + primaryclass = {cs.CL}, + title = {BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval}, + url = {https://arxiv.org/abs/2407.12883}, + year = {2024}, +} +""" + + +class BrightBiologyLongRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightBiologyLongRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. Retrieval of web documents cited in Biology StackExchange answers with long documents.", + type="Retrieval", + prompt={ + "query": "Represent this Biology post for searching relevant passages: " + }, + category="t2t", + eval_splits=["long"], + eval_langs=["eng-Latn"], + main_score="recall_at_1", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_long_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="biology", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True + + +class BrightEarthScienceLongRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightEarthScienceLongRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. Retrieval of web documents cited in Earth Science StackExchange answers with long documents.", + type="Retrieval", + prompt={ + "query": "Represent this Earth Science post for searching relevant passages: " + }, + category="t2t", + eval_splits=["long"], + eval_langs=["eng-Latn"], + main_score="recall_at_1", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_long_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="earth_science", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True + + +class BrightEconomicsLongRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightEconomicsLongRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. 
Retrieval of web documents cited in Economics StackExchange answers with long documents.", + type="Retrieval", + prompt={ + "query": "Represent this Economics post for searching relevant passages: " + }, + category="t2t", + eval_splits=["long"], + eval_langs=["eng-Latn"], + main_score="recall_at_1", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_long_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="economics", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True + + +class BrightPsychologyLongRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightPsychologyLongRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. Retrieval of web documents cited in Psychology StackExchange answers with long documents.", + type="Retrieval", + prompt={ + "query": "Represent this Psychology post for searching relevant passages: " + }, + category="t2t", + eval_splits=["long"], + eval_langs=["eng-Latn"], + main_score="recall_at_1", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_long_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="psychology", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True + + +class BrightRoboticsLongRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightRoboticsLongRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. 
Retrieval of web documents cited in Robotics StackExchange answers with long documents.", + type="Retrieval", + prompt={ + "query": "Represent this Robotics post for searching relevant passages: " + }, + category="t2t", + eval_splits=["long"], + eval_langs=["eng-Latn"], + main_score="recall_at_1", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_long_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="robotics", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True + + +class BrightStackoverflowLongRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightStackoverflowLongRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. Retrieval of web documents cited in Stack Overflow answers with long documents.", + type="Retrieval", + prompt={ + "query": "Represent this Stack Overflow post for searching relevant passages: " + }, + category="t2t", + eval_splits=["long"], + eval_langs=["eng-Latn"], + main_score="recall_at_1", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_long_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="stackoverflow", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True + + +class BrightSustainableLivingLongRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightSustainableLivingLongRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. 
Retrieval of web documents cited in Sustainable Living StackExchange answers with long documents.", + type="Retrieval", + prompt={ + "query": "Represent this Sustainable Living post for searching relevant passages: " + }, + category="t2t", + eval_splits=["long"], + eval_langs=["eng-Latn"], + main_score="recall_at_1", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_long_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="sustainable_living", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True + + +class BrightPonyLongRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightPonyLongRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. Retrieval of Pony programming language syntax documentation with long documents.", + type="Retrieval", + prompt={ + "query": "Represent this Pony question for searching relevant passages: " + }, + category="t2t", + eval_splits=["long"], + eval_langs=["eng-Latn"], + main_score="recall_at_1", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_long_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="pony", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True diff --git a/mteb/tasks/retrieval/eng/bright_subsets_retrieval.py b/mteb/tasks/retrieval/eng/bright_subsets_retrieval.py new file mode 100644 index 0000000000..460f57a933 --- /dev/null +++ b/mteb/tasks/retrieval/eng/bright_subsets_retrieval.py @@ -0,0 +1,607 @@ +from __future__ import annotations + +from collections import defaultdict + +import datasets + +from mteb.abstasks import AbsTaskRetrieval +from mteb.abstasks.task_metadata import TaskMetadata + + +def load_bright_data( + path: str, + domain: str, + eval_splits: list, + cache_dir: str | None = None, + revision: str | None = None, +): + corpus = dict.fromkeys(eval_splits) + queries = dict.fromkeys(eval_splits) + relevant_docs = dict.fromkeys(eval_splits) + top_ranked = dict.fromkeys(eval_splits) + + domain_corpus = datasets.load_dataset( + path, + "documents", + split=domain, + cache_dir=cache_dir, + revision=revision, + ) + examples = datasets.load_dataset( + path, + "examples", + split=domain, + cache_dir=cache_dir, + revision=revision, + ) + corpus["standard"] = {e["id"]: {"text": e["content"]} for e in domain_corpus} + queries["standard"] = {e["id"]: e["query"] for e in examples} + relevant_docs["standard"] = defaultdict(dict) + top_ranked["standard"] = defaultdict(list) + + #
Get all document IDs + all_doc_ids = [e["id"] for e in domain_corpus] + + for e in examples: + qid = e["id"] + gold_ids = e["gold_ids"] + for gid in gold_ids: + relevant_docs["standard"][qid].update({gid: 1}) + + # Create top_ranked: all documents except excluded_ids + excluded_ids = e.get("excluded_ids", []) + if excluded_ids and excluded_ids != ["N/A"]: + excluded_set = set(excluded_ids) + top_ranked["standard"][qid] = [ + doc_id for doc_id in all_doc_ids if doc_id not in excluded_set + ] + else: + # No exclusions, use all documents + top_ranked["standard"][qid] = all_doc_ids + + corpus = datasets.DatasetDict(corpus) + queries = datasets.DatasetDict(queries) + relevant_docs = datasets.DatasetDict(relevant_docs) + top_ranked = datasets.DatasetDict(top_ranked) + return corpus, queries, relevant_docs, top_ranked + + +_BIBTEX_CITATION = r""" +@misc{su2024brightrealisticchallengingbenchmark, + archiveprefix = {arXiv}, + author = {Hongjin Su and Howard Yen and Mengzhou Xia and Weijia Shi and Niklas Muennighoff and Han-yu Wang and Haisu Liu and Quan Shi and Zachary S. Siegel and Michael Tang and Ruoxi Sun and Jinsung Yoon and Sercan O. Arik and Danqi Chen and Tao Yu}, + eprint = {2407.12883}, + primaryclass = {cs.CL}, + title = {BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval}, + url = {https://arxiv.org/abs/2407.12883}, + year = {2024}, +} +""" + + +class BrightBiologyRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightBiologyRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. Retrieval of web documents cited in Biology StackExchange answers.", + type="Retrieval", + prompt={ + "query": "Represent this Biology post for searching relevant passages: " + }, + category="t2t", + eval_splits=["standard"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="biology", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True + + +class BrightEarthScienceRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightEarthScienceRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. 
Retrieval of web documents cited in Earth Science StackExchange answers.", + type="Retrieval", + prompt={ + "query": "Represent this Earth Science post for searching relevant passages: " + }, + category="t2t", + eval_splits=["standard"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="earth_science", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True + + +class BrightEconomicsRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightEconomicsRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. Retrieval of web documents cited in Economics StackExchange answers.", + type="Retrieval", + prompt={ + "query": "Represent this Economics post for searching relevant passages: " + }, + category="t2t", + eval_splits=["standard"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="economics", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True + + +class BrightPsychologyRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightPsychologyRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. 
Retrieval of web documents cited in Psychology StackExchange answers.", + type="Retrieval", + prompt={ + "query": "Represent this Psychology post for searching relevant passages: " + }, + category="t2t", + eval_splits=["standard"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="psychology", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True + + +class BrightRoboticsRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightRoboticsRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. Retrieval of web documents cited in Robotics StackExchange answers.", + type="Retrieval", + prompt={ + "query": "Represent this Robotics post for searching relevant passages: " + }, + category="t2t", + eval_splits=["standard"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="robotics", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True + + +class BrightStackoverflowRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightStackoverflowRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. 
Retrieval of web documents cited in Stack Overflow answers.", + type="Retrieval", + prompt={ + "query": "Represent this Stack Overflow post for searching relevant passages: " + }, + category="t2t", + eval_splits=["standard"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="stackoverflow", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True + + +class BrightSustainableLivingRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightSustainableLivingRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. Retrieval of web documents cited in Sustainable Living StackExchange answers.", + type="Retrieval", + prompt={ + "query": "Represent this Sustainable Living post for searching relevant passages: " + }, + category="t2t", + eval_splits=["standard"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="sustainable_living", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True + + +class BrightPonyRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightPonyRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. 
Retrieval of Pony programming language syntax documentation.", + type="Retrieval", + prompt={ + "query": "Represent this Pony question for searching relevant passages: " + }, + category="t2t", + eval_splits=["standard"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="pony", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True + + +class BrightLeetcodeRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightLeetcodeRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. Retrieval of similar algorithmic problems based on shared solution techniques.", + type="Retrieval", + prompt={ + "query": "Represent this Coding problem for searching relevant examples: " + }, + category="t2t", + eval_splits=["standard"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="leetcode", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True + + +class BrightAopsRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightAopsRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. 
Retrieval of similar Math Olympiad problems from Art of Problem Solving.", + type="Retrieval", + prompt={ + "query": "Represent this Math problem for searching relevant examples: " + }, + category="t2t", + eval_splits=["standard"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="aops", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True + + +class BrightTheoremQATheoremsRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightTheoremQATheoremsRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. Retrieval of theorem definitions and proofs from ProofWiki.", + type="Retrieval", + prompt={ + "query": "Represent this Math problem for searching relevant theorems: " + }, + category="t2t", + eval_splits=["standard"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="theoremqa_theorems", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True + + +class BrightTheoremQAQuestionsRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BrightTheoremQAQuestionsRetrieval", + dataset={ + "path": "xlangai/BRIGHT", + "revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb", + }, + reference="https://huggingface.co/datasets/xlangai/BRIGHT", + description="Part of the BRIGHT benchmark for reasoning-intensive retrieval. 
Retrieval of theorem definitions from ProofWiki given questions rephrased as real-world scenarios.", + type="Retrieval", + prompt={ + "query": "Represent this Math problem for searching relevant examples: " + }, + category="t2t", + eval_splits=["standard"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2024-03-01", "2024-06-01"), + domains=["Non-fiction", "Written"], + task_subtypes=["Article retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + modalities=["text"], + bibtex_citation=_BIBTEX_CITATION, + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs, self.top_ranked = ( + load_bright_data( + path=self.metadata.dataset["path"], + eval_splits=self.metadata.eval_splits, + domain="theoremqa_questions", + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + ) + self.data_loaded = True
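For context, a minimal usage sketch of the new tasks (not part of the patch). It assumes the standard mteb evaluation entry points (get_tasks, get_benchmark, MTEB.run) and a SentenceTransformer encoder; the model name and output folder are illustrative placeholders only.

import mteb
from sentence_transformers import SentenceTransformer

# Evaluate a single new subset task, e.g. the Biology split of BRIGHT.
tasks = mteb.get_tasks(tasks=["BrightBiologyRetrieval"])

# Alternatively, run every subset through the new benchmark object:
# tasks = mteb.get_benchmark("BRIGHT (subsets)").tasks

model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder model
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder="results/bright_subsets")  # illustrative output path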