@@ -234,9 +234,9 @@ def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
         p.join()
 
     # extract the result which is not None
-    assert (
-        len([res for res in results.values() if res is not None]) == 1
-    ), "we expect exactly 1 process to return a results dict properly"
+    assert len([res for res in results.values() if res is not None]) == 1, (
+        "we expect exactly 1 process to return a results dict properly"
+    )
     results_dict = [res for res in results.values() if res is not None][0]
     return results_dict
 
@@ -302,9 +302,9 @@ def parse_bbh(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.BBH.value, "acc_norm"
     )
-    assert (
-        len(parsed_scores["subtasks"]) == 24
-    ), "there should be 24 subtasks of bbh run"
+    assert len(parsed_scores["subtasks"]) == 24, (
+        "there should be 24 subtasks of bbh run"
+    )
     return parsed_scores
 
 
@@ -355,9 +355,9 @@ def parse_ifeval(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
             scores.append(value)
             target_metrics.remove(metric)
 
-    assert (
-        len(scores) == 2
-    ), f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
+    assert len(scores) == 2, (
+        f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
+    )
     return {
         "score": sum(scores) / 2,
     }
@@ -381,9 +381,9 @@ def parse_gpqa(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.GPQA.value, "acc_norm"
     )
-    assert (
-        len(parsed_scores["subtasks"]) == 3
-    ), f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
+    assert len(parsed_scores["subtasks"]) == 3, (
+        f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
+    )
     return parsed_scores
 
 
@@ -394,9 +394,9 @@ def parse_math_hard(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.MATH_HARD.value, "exact_match"
     )
-    assert (
-        len(parsed_scores["subtasks"]) == 7
-    ), f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
+    assert len(parsed_scores["subtasks"]) == 7, (
+        f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
+    )
     return parsed_scores
 
 
@@ -463,9 +463,9 @@ def get_scores_from_result_dicts(
         # this is just a sanity check step
         benchmarks_already_covered = set(parsed_scores.keys())
         overlapping_benchmarks = benchmarks_already_covered & benchmarks_to_parse
-        assert (
-            len(benchmarks_already_covered & benchmarks_to_parse) == 0
-        ), f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
+        assert len(benchmarks_already_covered & benchmarks_to_parse) == 0, (
+            f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
+        )
 
         # now actually add them
         for benchmark in benchmarks_to_parse: