
Commit ac0a508
Author: zz297429

fix: resolve pre-commit issues

Link: https://code.alibaba-inc.com/DAIL-LLM/OpenJudge/codereview/25166864

* fix: resolve pre-commit issues
  - Fix flake8 E226: add whitespace around arithmetic operators
  - Fix flake8 F541 / pylint W1309: remove unused f-string prefix
  - Add pytest-asyncio configuration in pytest.ini
  - Fix test_grading_runner.py: use the aggregator name string as the key instead of the callable object

Parent: 473e0a1

17 files changed (+44, -46 lines)
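
The hunks below show the E226 (spacing around arithmetic operators) and F541/W1309 (unneeded f-string prefix) fixes directly. The test_grading_runner.py change described in the commit message is not among the hunks rendered here; as a minimal sketch only, with hypothetical names rather than the project's actual test code, the described fix amounts to looking results up by the aggregator's name string instead of by the callable object:

# Hypothetical sketch of the described test fix -- not the actual OpenJudge code.
def mean_aggregator(scores):
    """Toy aggregator used purely for illustration."""
    return sum(scores) / len(scores)

# A (hypothetical) grading runner stores results keyed by the aggregator's name.
results = {mean_aggregator.__name__: mean_aggregator([0.5, 1.0])}

# Before the fix, the test indexed results with the callable itself:
#     results[mean_aggregator]   # KeyError: the dict is keyed by strings
# After the fix, it uses the name string as the key:
assert results["mean_aggregator"] == 0.75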

docs/images/logo.svg

Lines changed: 5 additions & 5 deletions

openjudge/generator/iterative_rubric/categorizer.py

Lines changed: 1 addition & 1 deletion
@@ -248,7 +248,7 @@ async def categorize_rubrics(
         try:
             # Format rubrics text
             rubrics_text = "\n".join(
-                [f"{i+1}. {rubric}" for i, rubric in enumerate(rubrics)],
+                [f"{i + 1}. {rubric}" for i, rubric in enumerate(rubrics)],
             )

             # Call LLM using Chat with structured output

openjudge/generator/iterative_rubric/generator.py

Lines changed: 6 additions & 5 deletions
@@ -444,7 +444,8 @@ async def _generate_query_rubrics(
             current_index = end_idx

             logger.info(
-                f"Iteration {iteration}: Processing batch {start_idx}-{end_idx-1} " f"({len(batch_data)} samples)",
+                f"Iteration {iteration}: Processing batch {start_idx}-{end_idx - 1} "
+                f"({len(batch_data)} samples)",
             )

             # Generate rubrics for batch concurrently

@@ -559,7 +560,7 @@ async def _categorize_query_rubrics(
         if not self.config.enable_categorization:
             logger.info(f"Categorization disabled: keeping all {len(query_rubrics)} rubrics")
             formatted_rubrics = "\n\n".join(
-                [f"{i+1}. {rubric}" for i, rubric in enumerate(query_rubrics)],
+                [f"{i + 1}. {rubric}" for i, rubric in enumerate(query_rubrics)],
             )
             return formatted_rubrics

@@ -583,7 +584,7 @@ async def _categorize_query_rubrics(
                 logger.error("Rubric categorization failed, falling back to numbered list format")
                 # Fallback: return original rubrics as formatted string
                 return "\n\n".join(
-                    [f"{i+1}. {rubric}" for i, rubric in enumerate(query_rubrics)],
+                    [f"{i + 1}. {rubric}" for i, rubric in enumerate(query_rubrics)],
                 )

             logger.info(

@@ -592,13 +593,13 @@ async def _categorize_query_rubrics(

             # Format categorized rubrics into a single string
             formatted_rubrics = "\n\n".join(
-                [f"Rubric {i+1}:\n{rubric}" for i, rubric in enumerate(categorized_rubrics)],
+                [f"Rubric {i + 1}:\n{rubric}" for i, rubric in enumerate(categorized_rubrics)],
             )

             return formatted_rubrics

         except Exception as e:
             logger.error(f"Categorization error: {e}, falling back to numbered list format")
             return "\n\n".join(
-                [f"{i+1}. {rubric}" for i, rubric in enumerate(query_rubrics)],
+                [f"{i + 1}. {rubric}" for i, rubric in enumerate(query_rubrics)],
             )

openjudge/generator/iterative_rubric/query_rubric_generator.py

Lines changed: 3 additions & 3 deletions
@@ -937,7 +937,7 @@ async def _evaluate_listwise(

         # Format responses for evaluation
         responses_text = "\n\n".join(
-            [f"Response {i+1}:\n{resp}" for i, resp in enumerate(responses)],
+            [f"Response {i + 1}:\n{resp}" for i, resp in enumerate(responses)],
         )

         try:

@@ -1112,7 +1112,7 @@ def _format_data_context(self, data: dict) -> str:
         ranks = data.get("label_rank", [])

         for i, response in enumerate(responses):
-            lines.append(f"Response {i+1}:")
+            lines.append(f"Response {i + 1}:")
             lines.append(f"{response}")

             if i < len(ranks):

@@ -1128,5 +1128,5 @@ def _format_data_context(self, data: dict) -> str:
     def _format_rubrics_text(self, rubrics: List[str]) -> str:
         """Format rubrics list into numbered text"""
         return "\n".join(
-            [f"{i+1}. {rubric}" for i, rubric in enumerate(rubrics)],
+            [f"{i + 1}. {rubric}" for i, rubric in enumerate(rubrics)],
         )

openjudge/models/openai_chat_model.py

Lines changed: 1 addition & 1 deletion
@@ -170,7 +170,7 @@ async def achat(
             if "extra_body" not in kwargs:
                 kwargs["extra_body"] = {}
             kwargs["extra_body"]["enable_thinking"] = False
-            logger.debug(f"Set enable_thinking=False in extra_body for qwen model")
+            logger.debug("Set enable_thinking=False in extra_body for qwen model")

         if tool_choice:
             self._validate_tool_choice(tool_choice, tools)

pytest.ini

Lines changed: 3 additions & 0 deletions
@@ -1,6 +1,9 @@

 [pytest]
+asyncio_mode = auto
+asyncio_default_fixture_loop_scope = function
 markers =
     unit: Unit tests (offline testing with mocks)
    quality: Quality tests (evaluation against gold standard datasets)
+    asyncio: mark a test as an async test
 norecursedirs = cookbooks
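
With asyncio_mode = auto, pytest-asyncio collects coroutine test functions and runs them on an event loop without an explicit @pytest.mark.asyncio marker on each test, and asyncio_default_fixture_loop_scope = function keeps async fixtures on a per-test-function event loop by default. A minimal sketch of a test that relies on the auto mode (hypothetical example, not part of this commit):

# Hypothetical test module (not in this commit), e.g. tests/test_async_smoke.py.
# Under asyncio_mode = auto, pytest-asyncio collects and awaits this coroutine
# automatically; no @pytest.mark.asyncio marker is required.
import asyncio


async def test_event_loop_runs():
    await asyncio.sleep(0)  # yield control to the event loop once
    assert True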

tests/benchmarks/test_rewardbench2.py

Lines changed: 2 additions & 2 deletions
@@ -135,7 +135,7 @@ async def run_rewardbench2_test(

     metadata = analysis_result.metadata
     logger.info("\nOverall Performance:")
-    logger.info(f" Accuracy: {metadata.get('accuracy', 0):.4f} ({metadata.get('accuracy', 0)*100:.2f}%)")
+    logger.info(f" Accuracy: {metadata.get('accuracy', 0):.4f} ({metadata.get('accuracy', 0) * 100:.2f}%)")
     logger.info(f" Correct: {metadata.get('correct_count', 0)}/{metadata.get('valid_samples', 0)}")
     logger.info(f" Total samples: {metadata.get('total_samples', 0)}")

@@ -147,7 +147,7 @@ async def run_rewardbench2_test(
         correct = metrics.get("correct_count", 0)
         total = metrics.get("total_samples", 0)
         logger.info(
-            f" {subset:15s}: {accuracy:.4f} ({accuracy*100:5.2f}%) - " f"{correct:2d}/{total:2d} correct",
+            f" {subset:15s}: {accuracy:.4f} ({accuracy * 100:5.2f}%) - " f"{correct:2d}/{total:2d} correct",
         )

     logger.info("\n" + "=" * 80)

tests/generator/test_iterative_rubric.py

Lines changed: 1 addition & 1 deletion
@@ -263,7 +263,7 @@ async def test_iterative_grader_listwise() -> None:
     # Evaluate test sample
     test_query = LISTWISE_TEST_SAMPLE[0]["query"]
     test_responses = LISTWISE_TEST_SAMPLE[0]["responses"]
-    responses = "\n\n".join([f"Response {i+1}:\n{ans}" for i, ans in enumerate(test_responses)])
+    responses = "\n\n".join([f"Response {i + 1}:\n{ans}" for i, ans in enumerate(test_responses)])

     result = await grader.aevaluate(query=test_query, responses=responses)

tests/graders/common/test_correctness.py

Lines changed: 2 additions & 2 deletions
@@ -411,11 +411,11 @@ async def test_adversarial_correctness_with_runner(self, dataset, model):
         pairwise_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0.0

         # Print accuracy for reporting
-        print(f"\n{'='*60}")
+        print(f"\n{'=' * 60}")
         print(
             f"CorrectnessGrader Pairwise Accuracy: {pairwise_accuracy:.4f} ({correct_predictions}/{total_predictions})"
         )
-        print(f"{'='*60}\n")
+        print(f"{'=' * 60}\n")

         # Assert that false positive and false negative rates meet expected thresholds
         # Note: We log the rates but don't fail the test if they exceed thresholds

tests/graders/common/test_hallucination.py

Lines changed: 2 additions & 2 deletions
@@ -414,11 +414,11 @@ async def test_adversarial_hallucination_with_runner(self, dataset, model):
         pairwise_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0.0

         # Print accuracy for reporting
-        print(f"\n{'='*60}")
+        print(f"\n{'=' * 60}")
         print(
             f"HallucinationGrader Pairwise Accuracy: {pairwise_accuracy:.4f} ({correct_predictions}/{total_predictions})"
         )
-        print(f"{'='*60}\n")
+        print(f"{'=' * 60}\n")

         # Assert that false positive and false negative rates meet expected thresholds
         # Note: We log the rates but don't fail the test if they exceed thresholds
