@@ -234,9 +234,9 @@ def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
         p.join()
 
     # extract the result which is not None
-    assert (
-        len([res for res in results.values() if res is not None]) == 1
-    ), "we expect exactly 1 process to return a results dict properly"
+    assert len([res for res in results.values() if res is not None]) == 1, (
+        "we expect exactly 1 process to return a results dict properly"
+    )
     results_dict = [res for res in results.values() if res is not None][0]
     return results_dict
 
@@ -302,9 +302,9 @@ def parse_bbh(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.BBH.value, "acc_norm"
     )
-    assert (
-        len(parsed_scores["subtasks"]) == 24
-    ), "there should be 24 subtasks of bbh run"
+    assert len(parsed_scores["subtasks"]) == 24, (
+        "there should be 24 subtasks of bbh run"
+    )
     return parsed_scores
 
 
@@ -355,9 +355,9 @@ def parse_ifeval(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
             scores.append(value)
             target_metrics.remove(metric)
 
-    assert (
-        len(scores) == 2
-    ), f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
+    assert len(scores) == 2, (
+        f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
+    )
     return {
         "score": sum(scores) / 2,
     }
@@ -381,9 +381,9 @@ def parse_gpqa(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.GPQA.value, "acc_norm"
     )
-    assert (
-        len(parsed_scores["subtasks"]) == 3
-    ), f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
+    assert len(parsed_scores["subtasks"]) == 3, (
+        f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
+    )
     return parsed_scores
 
 
@@ -394,9 +394,9 @@ def parse_math_hard(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.MATH_HARD.value, "exact_match"
     )
-    assert (
-        len(parsed_scores["subtasks"]) == 7
-    ), f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
+    assert len(parsed_scores["subtasks"]) == 7, (
+        f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
+    )
     return parsed_scores
 
 
@@ -463,9 +463,9 @@ def get_scores_from_result_dicts(
         # this is just a sanity check step
         benchmarks_already_covered = set(parsed_scores.keys())
         overlapping_benchmarks = benchmarks_already_covered & benchmarks_to_parse
-        assert (
-            len(benchmarks_already_covered & benchmarks_to_parse) == 0
-        ), f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
+        assert len(benchmarks_already_covered & benchmarks_to_parse) == 0, (
+            f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
+        )
 
         # now actually add them
         for benchmark in benchmarks_to_parse: