|
1184 | 1184 | """, |
1185 | 1185 | ) |
1186 | 1186 |
|
| 1187 | +BRIGHT_SUBSETS = Benchmark( |
| 1188 | + name="BRIGHT (subsets)", |
| 1189 | + display_name="Reasoning Retrieval (subsets)", |
| 1190 | + tasks=get_tasks( |
| 1191 | + tasks=[ |
| 1192 | + "BrightBiologyRetrieval", |
| 1193 | + "BrightEarthScienceRetrieval", |
| 1194 | + "BrightEconomicsRetrieval", |
| 1195 | + "BrightPsychologyRetrieval", |
| 1196 | + "BrightRoboticsRetrieval", |
| 1197 | + "BrightStackoverflowRetrieval", |
| 1198 | + "BrightSustainableLivingRetrieval", |
| 1199 | + "BrightPonyRetrieval", |
| 1200 | + "BrightLeetcodeRetrieval", |
| 1201 | + "BrightAopsRetrieval", |
| 1202 | + "BrightTheoremQATheoremsRetrieval", |
| 1203 | + "BrightTheoremQAQuestionsRetrieval", |
| 1204 | + ], |
| 1205 | + ), |
| 1206 | + description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval (Individual Subsets). |
| 1207 | + This benchmark exposes each BRIGHT domain as an individual task, allowing for
| 1208 | + domain-specific evaluation. The twelve subsets are: biology, earth science, economics,
| 1209 | + psychology, robotics, stackoverflow, sustainable living, pony, leetcode, aops, theoremqa_theorems,
| 1210 | + and theoremqa_questions.
| 1211 | + """, |
| 1212 | + reference="https://brightbenchmark.github.io/", |
| 1213 | + citation=r""" |
| 1214 | +@article{su2024bright, |
| 1215 | + author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others}, |
| 1216 | + journal = {arXiv preprint arXiv:2407.12883}, |
| 1217 | + title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, |
| 1218 | + year = {2024}, |
| 1219 | +} |
| 1220 | +""", |
| 1221 | +) |
| 1222 | + |
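A minimal sketch of how the new per-domain benchmark could be run, assuming the public `mteb` entry points (`mteb.get_benchmark`, `mteb.get_model`, `mteb.MTEB`); the model name is purely illustrative:

```python
import mteb

# Resolve the benchmark registered above by its `name` field.
benchmark = mteb.get_benchmark("BRIGHT (subsets)")

# Any registered embedding model works here; this one is illustrative.
model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")

# Benchmark objects iterate over their tasks, so the evaluator accepts
# them directly; per-task scores are written under `output_folder`.
evaluation = mteb.MTEB(tasks=benchmark)
evaluation.run(model, output_folder="results")
```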
1187 | 1223 | BRIGHT_LONG = Benchmark( |
1188 | 1224 | name="BRIGHT (long)", |
1189 | 1225 | tasks=MTEBTasks( |
|
1213 | 1249 | """, |
1214 | 1250 | ) |
1215 | 1251 |
|
| 1252 | +BRIGHT_LONG_SUBSETS = Benchmark( |
| 1253 | + name="BRIGHT (long subsets)", |
| 1254 | + display_name="Reasoning Retrieval (long subsets)", |
| 1255 | + tasks=get_tasks( |
| 1256 | + tasks=[ |
| 1257 | + "BrightBiologyLongRetrieval", |
| 1258 | + "BrightEarthScienceLongRetrieval", |
| 1259 | + "BrightEconomicsLongRetrieval", |
| 1260 | + "BrightPsychologyLongRetrieval", |
| 1261 | + "BrightRoboticsLongRetrieval", |
| 1262 | + "BrightStackoverflowLongRetrieval", |
| 1263 | + "BrightSustainableLivingLongRetrieval", |
| 1264 | + "BrightPonyLongRetrieval", |
| 1265 | + ], |
| 1266 | + ), |
| 1267 | + description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval (Long Individual Subsets). |
| 1268 | + This benchmark exposes each long-document BRIGHT domain as an individual task,
| 1269 | + allowing for domain-specific evaluation with longer contexts. The eight subsets are: biology,
| 1270 | + earth science, economics, psychology, robotics, stackoverflow, sustainable living, and pony.
| 1271 | + """, |
| 1272 | + reference="https://brightbenchmark.github.io/", |
| 1273 | + citation=r""" |
| 1274 | +@article{su2024bright, |
| 1275 | + author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others}, |
| 1276 | + journal = {arXiv preprint arXiv:2407.12883}, |
| 1277 | + title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, |
| 1278 | + year = {2024}, |
| 1279 | +} |
| 1280 | +""", |
| 1281 | +) |
| 1282 | + |
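Because each long-document domain is also registered as a standalone task, a single subset can be evaluated in isolation; a sketch under the same API assumptions as above (note the long variant covers only the eight domains listed, so the coding and math subsets have no long counterpart):

```python
import mteb

# Select one long-document domain by the task name registered above.
tasks = mteb.get_tasks(tasks=["BrightBiologyLongRetrieval"])

model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")
mteb.MTEB(tasks=tasks).run(model, output_folder="results")
```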
1216 | 1283 | CODE_RAG = Benchmark( |
1217 | 1284 | name="CodeRAG", |
1218 | 1285 | tasks=get_tasks( |
|
1603 | 1670 | "TRECCOVID-NL", |
1604 | 1671 | ], |
1605 | 1672 | ), |
1606 | | - description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated " |
1607 | | - "translation.", |
| 1673 | + description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated translation.", |
1608 | 1674 | reference="https://arxiv.org/abs/2412.08329", |
1609 | 1675 | contacts=["nikolay-banar"], |
1610 | 1676 | citation=r""" |
|