Commit 8d2e895

tox -e fix
Signed-off-by: Ihar Hrachyshka <[email protected]>
1 parent f6b9755 commit 8d2e895
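
tox -e fix runs the fix environment defined in the project's tox configuration, which in instructlab repositories typically invokes the configured auto-fixers (for example, the Ruff formatter). Accordingly, the Python changes below are mechanical, behavior-preserving rewrites of the kind such a formatter emits; the CI workflow change additionally fixes a shell-quoting bug.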

5 files changed: +29 additions, -29 deletions


.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          pip_install=python -m pip install -c constraints-dev.txt"
+          pip_install="python -m pip install -c constraints-dev.txt"
           $pip_install --upgrade pip
           $pip_install tox tox-gh>=1.2
 
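
This one-character change fixes a real bug rather than style: without the opening double quote, the shell parses pip_install=python as a temporary variable assignment, and the dangling closing quote leaves the rest of the script unterminated, so the step fails before installing anything. With the quote restored, the full command string "python -m pip install -c constraints-dev.txt" is stored in pip_install and reused on the following lines, so every install consistently honors the constraints-dev.txt version pins.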

src/instructlab/eval/leaderboard.py

Lines changed: 18 additions & 18 deletions
@@ -234,9 +234,9 @@ def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
         p.join()
 
     # extract the result which is not None
-    assert (
-        len([res for res in results.values() if res is not None]) == 1
-    ), "we expect exactly 1 process to return a results dict properly"
+    assert len([res for res in results.values() if res is not None]) == 1, (
+        "we expect exactly 1 process to return a results dict properly"
+    )
     results_dict = [res for res in results.values() if res is not None][0]
     return results_dict
 
@@ -302,9 +302,9 @@ def parse_bbh(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.BBH.value, "acc_norm"
     )
-    assert (
-        len(parsed_scores["subtasks"]) == 24
-    ), "there should be 24 subtasks of bbh run"
+    assert len(parsed_scores["subtasks"]) == 24, (
+        "there should be 24 subtasks of bbh run"
+    )
     return parsed_scores
 
 
@@ -355,9 +355,9 @@ def parse_ifeval(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
             scores.append(value)
             target_metrics.remove(metric)
 
-    assert (
-        len(scores) == 2
-    ), f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
+    assert len(scores) == 2, (
+        f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
+    )
     return {
         "score": sum(scores) / 2,
     }
@@ -381,9 +381,9 @@ def parse_gpqa(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.GPQA.value, "acc_norm"
     )
-    assert (
-        len(parsed_scores["subtasks"]) == 3
-    ), f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
+    assert len(parsed_scores["subtasks"]) == 3, (
+        f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
+    )
     return parsed_scores
 
 
@@ -394,9 +394,9 @@ def parse_math_hard(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.MATH_HARD.value, "exact_match"
     )
-    assert (
-        len(parsed_scores["subtasks"]) == 7
-    ), f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
+    assert len(parsed_scores["subtasks"]) == 7, (
+        f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
+    )
     return parsed_scores
 
 
@@ -463,9 +463,9 @@ def get_scores_from_result_dicts(
         # this is just a sanity check step
         benchmarks_already_covered = set(parsed_scores.keys())
         overlapping_benchmarks = benchmarks_already_covered & benchmarks_to_parse
-        assert (
-            len(benchmarks_already_covered & benchmarks_to_parse) == 0
-        ), f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
+        assert len(benchmarks_already_covered & benchmarks_to_parse) == 0, (
+            f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
+        )
 
         # now actually add them
         for benchmark in benchmarks_to_parse:
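
Every hunk in this file (and in the two mt_bench files below) is the same rewrite: when an assert with a message overflows the line length, the message is now wrapped in parentheses rather than the condition, keeping the tested expression visible on the assert line itself. This matches the wrapping style that recent releases of formatters such as Black and Ruff produce. A minimal sketch, using a hypothetical scores list, showing that the two wrappings are behavior-identical:

scores = [0.25, 0.75]  # hypothetical values, for illustration only

# Old wrapping: the condition is parenthesized and the message trails the
# closing parenthesis.
assert (
    len(scores) == 2
), f"there should only be 2 values extracted in ifeval, got: {len(scores)}"

# New wrapping: the condition stays on the assert line and the message is
# parenthesized instead. Both forms raise the same AssertionError on failure.
assert len(scores) == 2, (
    f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
)

Note that neither form parenthesizes the whole condition-plus-message pair: assert (cond, msg) would test the truthiness of a two-element tuple, which is always true, silently disabling the check.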

src/instructlab/eval/mt_bench_common.py

Lines changed: 6 additions & 6 deletions
@@ -346,19 +346,19 @@ def check_data(questions, model_answers, ref_answers, models, judges):
         assert m in model_answers, f"Missing model answer for {m}"
         m_answer = model_answers[m]
         for q in questions:
-            assert (
-                q["question_id"] in m_answer
-            ), f"Missing model {m}'s answer to Question {q['question_id']}"
+            assert q["question_id"] in m_answer, (
+                f"Missing model {m}'s answer to Question {q['question_id']}"
+            )
     # check ref answers
     for jg in judges.values():
         if not jg.ref_based:
             continue
         for q in questions:
             if q["category"] not in NEED_REF_CATS:
                 continue
-            assert (
-                q["question_id"] in ref_answers[jg.model_name]
-            ), f"Missing reference answer to Question {q['question_id']} for judge {jg.model_name}"
+            assert q["question_id"] in ref_answers[jg.model_name], (
+                f"Missing reference answer to Question {q['question_id']} for judge {jg.model_name}"
+            )
 
 
 def get_model_list(answer_file):

src/instructlab/eval/mt_bench_conversation.py

Lines changed: 3 additions & 3 deletions
@@ -116,9 +116,9 @@ def dict(self):
 def register_conv_template(template: Conversation, override: bool = False):
     """Register a new conversation template."""
     if not override:
-        assert (
-            template.name not in conv_templates
-        ), f"{template.name} has been registered."
+        assert template.name not in conv_templates, (
+            f"{template.name} has been registered."
+        )
 
     conv_templates[template.name] = template
 

tests/test_mt_bench.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ def gen_qa_pairs(odd):
             {
                 "question_id": i + 1,
                 "score": 0.6,
-                "qna_file": f"category{i+1}/qna.yaml",
+                "qna_file": f"category{i + 1}/qna.yaml",
             }
         )
     return qa_pairs
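
The only change here is spacing inside an f-string replacement field ({i+1} becomes {i + 1}). Formatters long left f-string interiors untouched; newer formatter releases (for example, Ruff once its f-string formatting stabilized) normalize expressions inside the braces like ordinary code, which is presumably what produced this hunk.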
