|
1165 | 1165 | BRIGHT = Benchmark( |
1166 | 1166 | name="BRIGHT", |
1167 | 1167 | display_name="Reasoning Retrieval", |
1168 | | - tasks=get_tasks(tasks=["BrightRetrieval"], eval_splits=["standard"]), |
| 1168 | + tasks=get_tasks( |
| 1169 | + tasks=[ |
| 1170 | + "BrightRetrieval", |
| 1171 | + ], |
| 1172 | + ), |
1169 | 1173 | description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval. |
1170 | 1174 | BRIGHT is the first text retrieval |
1171 | 1175 | benchmark that requires intensive reasoning to retrieve relevant documents with |
|
1184 | 1188 | """, |
1185 | 1189 | ) |
1186 | 1190 |
|
| 1191 | +BRIGHT_SUBSETS = Benchmark( |
| 1192 | + name="BRIGHT (subsets)", |
| 1193 | + display_name="Reasoning Retrieval (subsets)", |
| 1194 | + tasks=get_tasks( |
| 1195 | + tasks=[ |
| 1196 | + "BrightBiologyRetrieval", |
| 1197 | + "BrightEarthScienceRetrieval", |
| 1198 | + "BrightEconomicsRetrieval", |
| 1199 | + "BrightPsychologyRetrieval", |
| 1200 | + "BrightRoboticsRetrieval", |
| 1201 | + "BrightStackoverflowRetrieval", |
| 1202 | + "BrightSustainableLivingRetrieval", |
| 1203 | + "BrightPonyRetrieval", |
| 1204 | + "BrightLeetcodeRetrieval", |
| 1205 | + "BrightAopsRetrieval", |
| 1206 | + "BrightTheoremQATheoremsRetrieval", |
| 1207 | + "BrightTheoremQAQuestionsRetrieval", |
| 1208 | + ], |
| 1209 | + ), |
| 1210 | + description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval (Individual Subsets). |
| 1211 | + This benchmark contains an individual task for each domain in the BRIGHT benchmark,
| 1212 | + allowing domain-specific evaluation. The subsets are: biology, earth science, economics,
| 1213 | + psychology, robotics, stackoverflow, sustainable living, pony, leetcode, aops, theoremqa_theorems,
| 1214 | + and theoremqa_questions.
| 1215 | + """, |
| 1216 | + reference="https://brightbenchmark.github.io/", |
| 1217 | + citation=r""" |
| 1218 | +@article{su2024bright, |
| 1219 | + author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others}, |
| 1220 | + journal = {arXiv preprint arXiv:2407.12883}, |
| 1221 | + title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, |
| 1222 | + year = {2024}, |
| 1223 | +} |
| 1224 | +""", |
| 1225 | +) |
| 1226 | + |
1187 | 1227 | BRIGHT_LONG = Benchmark( |
1188 | 1228 | name="BRIGHT (long)", |
1189 | | - tasks=MTEBTasks( |
1190 | | - ( |
1191 | | - get_task( |
1192 | | - "BrightLongRetrieval", |
1193 | | - ), |
1194 | | - ) |
| 1229 | + tasks=get_tasks( |
| 1230 | + tasks=[ |
| 1231 | + "BrightLongRetrieval", |
| 1232 | + ], |
1195 | 1233 | ), |
1196 | 1234 | description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval. |
1197 | 1235 | BRIGHT is the first text retrieval |
|
1213 | 1251 | """, |
1214 | 1252 | ) |
1215 | 1253 |
|
| 1254 | +BRIGHT_LONG_SUBSETS = Benchmark( |
| 1255 | + name="BRIGHT (long subsets)", |
| 1256 | + display_name="Reasoning Retrieval (long subsets)", |
| 1257 | + tasks=get_tasks( |
| 1258 | + tasks=[ |
| 1259 | + "BrightBiologyLongRetrieval", |
| 1260 | + "BrightEarthScienceLongRetrieval", |
| 1261 | + "BrightEconomicsLongRetrieval", |
| 1262 | + "BrightPsychologyLongRetrieval", |
| 1263 | + "BrightRoboticsLongRetrieval", |
| 1264 | + "BrightStackoverflowLongRetrieval", |
| 1265 | + "BrightSustainableLivingLongRetrieval", |
| 1266 | + "BrightPonyLongRetrieval", |
| 1267 | + ], |
| 1268 | + ), |
| 1269 | + description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval (Long Individual Subsets). |
| 1270 | + This benchmark contains an individual task for each domain in the BRIGHT benchmark, built on
| 1271 | + the long-document variant of the corpus, allowing domain-specific evaluation over longer contexts.
| 1272 | + The subsets are: biology, earth science, economics, psychology, robotics, stackoverflow, sustainable living, and pony.
| 1273 | + """, |
| 1274 | + reference="https://brightbenchmark.github.io/", |
| 1275 | + citation=r""" |
| 1276 | +@article{su2024bright, |
| 1277 | + author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others}, |
| 1278 | + journal = {arXiv preprint arXiv:2407.12883}, |
| 1279 | + title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, |
| 1280 | + year = {2024}, |
| 1281 | +} |
| 1282 | +""", |
| 1283 | +) |
| 1284 | + |
1216 | 1285 | CODE_RAG = Benchmark( |
1217 | 1286 | name="CodeRAG", |
1218 | 1287 | tasks=get_tasks( |
|
1603 | 1672 | "TRECCOVID-NL", |
1604 | 1673 | ], |
1605 | 1674 | ), |
1606 | | - description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated " |
1607 | | - "translation.", |
| 1675 | + description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated translation.", |
1608 | 1676 | reference="https://arxiv.org/abs/2412.08329", |
1609 | 1677 | contacts=["nikolay-banar"], |
1610 | 1678 | citation=r""" |
|
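The definitions above only register the benchmarks; running them goes through mteb's standard evaluation entry points. Below is a minimal sketch of how the new per-domain benchmark might be driven, assuming mteb's current get_benchmark / get_model / MTEB.run interface; the encoder name and output folders are placeholders, not part of this change.

import mteb

# Load the newly registered benchmark by its name; each BRIGHT domain is a
# standalone task, so scores are reported per domain rather than pooled.
# The long-document variant works the same way via "BRIGHT (long subsets)".
benchmark = mteb.get_benchmark("BRIGHT (subsets)")

# Placeholder encoder; any model wrapped for mteb would work here.
model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")

evaluation = mteb.MTEB(tasks=benchmark.tasks)
evaluation.run(model, output_folder="results/bright_subsets")

# A single domain can also be evaluated directly, which is the point of
# splitting BrightRetrieval into per-domain tasks:
biology = mteb.get_tasks(tasks=["BrightBiologyRetrieval"])
mteb.MTEB(tasks=biology).run(model, output_folder="results/bright_biology")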