
Commit ac0a508
Author: zz297429

fix: resolve pre-commit issues

Link: https://code.alibaba-inc.com/DAIL-LLM/OpenJudge/codereview/25166864

* fix: resolve pre-commit issues
  - Fix flake8 E226: add whitespace around arithmetic operators
  - Fix flake8 F541 / pylint W1309: remove unused f-string prefix
  - Add pytest-asyncio configuration in pytest.ini
  - Fix test_grading_runner.py: use the aggregator name string as the key instead of the callable object

Parent: 473e0a1

17 files changed (+44, -46 lines)
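
The hunks below show the E226 (spacing around arithmetic operators) and F541/W1309 (unneeded f-string prefix) fixes directly. The test_grading_runner.py change described in the commit message is not among the hunks rendered here; as a minimal sketch only, with hypothetical names rather than the project's actual test code, the described fix amounts to looking results up by the aggregator's name string instead of by the callable object:

# Hypothetical sketch of the described test fix -- not the actual OpenJudge code.
def mean_aggregator(scores):
    """Toy aggregator used purely for illustration."""
    return sum(scores) / len(scores)

# A (hypothetical) grading runner stores results keyed by the aggregator's name.
results = {mean_aggregator.__name__: mean_aggregator([0.5, 1.0])}

# Before the fix, the test indexed results with the callable itself:
#     results[mean_aggregator]   # KeyError: the dict is keyed by strings
# After the fix, it uses the name string as the key:
assert results["mean_aggregator"] == 0.75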

docs/images/logo.svg

Lines changed: 5 additions & 5 deletions

openjudge/generator/iterative_rubric/categorizer.py

Lines changed: 1 addition & 1 deletion
@@ -248,7 +248,7 @@ async def categorize_rubrics(
         try:
             # Format rubrics text
             rubrics_text = "\n".join(
-                [f"{i+1}. {rubric}" for i, rubric in enumerate(rubrics)],
+                [f"{i + 1}. {rubric}" for i, rubric in enumerate(rubrics)],
             )

             # Call LLM using Chat with structured output

openjudge/generator/iterative_rubric/generator.py

Lines changed: 6 additions & 5 deletions
@@ -444,7 +444,8 @@ async def _generate_query_rubrics(
             current_index = end_idx

             logger.info(
-                f"Iteration {iteration}: Processing batch {start_idx}-{end_idx-1} " f"({len(batch_data)} samples)",
+                f"Iteration {iteration}: Processing batch {start_idx}-{end_idx - 1} "
+                f"({len(batch_data)} samples)",
             )

             # Generate rubrics for batch concurrently

@@ -559,7 +560,7 @@ async def _categorize_query_rubrics(
         if not self.config.enable_categorization:
             logger.info(f"Categorization disabled: keeping all {len(query_rubrics)} rubrics")
             formatted_rubrics = "\n\n".join(
-                [f"{i+1}. {rubric}" for i, rubric in enumerate(query_rubrics)],
+                [f"{i + 1}. {rubric}" for i, rubric in enumerate(query_rubrics)],
             )
             return formatted_rubrics

@@ -583,7 +584,7 @@ async def _categorize_query_rubrics(
                 logger.error("Rubric categorization failed, falling back to numbered list format")
                 # Fallback: return original rubrics as formatted string
                 return "\n\n".join(
-                    [f"{i+1}. {rubric}" for i, rubric in enumerate(query_rubrics)],
+                    [f"{i + 1}. {rubric}" for i, rubric in enumerate(query_rubrics)],
                 )

             logger.info(

@@ -592,13 +593,13 @@ async def _categorize_query_rubrics(

             # Format categorized rubrics into a single string
             formatted_rubrics = "\n\n".join(
-                [f"Rubric {i+1}:\n{rubric}" for i, rubric in enumerate(categorized_rubrics)],
+                [f"Rubric {i + 1}:\n{rubric}" for i, rubric in enumerate(categorized_rubrics)],
             )

             return formatted_rubrics

         except Exception as e:
             logger.error(f"Categorization error: {e}, falling back to numbered list format")
             return "\n\n".join(
-                [f"{i+1}. {rubric}" for i, rubric in enumerate(query_rubrics)],
+                [f"{i + 1}. {rubric}" for i, rubric in enumerate(query_rubrics)],
             )

openjudge/generator/iterative_rubric/query_rubric_generator.py

Lines changed: 3 additions & 3 deletions
@@ -937,7 +937,7 @@ async def _evaluate_listwise(

         # Format responses for evaluation
         responses_text = "\n\n".join(
-            [f"Response {i+1}:\n{resp}" for i, resp in enumerate(responses)],
+            [f"Response {i + 1}:\n{resp}" for i, resp in enumerate(responses)],
         )

         try:

@@ -1112,7 +1112,7 @@ def _format_data_context(self, data: dict) -> str:
         ranks = data.get("label_rank", [])

         for i, response in enumerate(responses):
-            lines.append(f"Response {i+1}:")
+            lines.append(f"Response {i + 1}:")
             lines.append(f"{response}")

             if i < len(ranks):

@@ -1128,5 +1128,5 @@ def _format_data_context(self, data: dict) -> str:
     def _format_rubrics_text(self, rubrics: List[str]) -> str:
         """Format rubrics list into numbered text"""
         return "\n".join(
-            [f"{i+1}. {rubric}" for i, rubric in enumerate(rubrics)],
+            [f"{i + 1}. {rubric}" for i, rubric in enumerate(rubrics)],
         )

openjudge/models/openai_chat_model.py

Lines changed: 1 addition & 1 deletion
@@ -170,7 +170,7 @@ async def achat(
             if "extra_body" not in kwargs:
                 kwargs["extra_body"] = {}
             kwargs["extra_body"]["enable_thinking"] = False
-            logger.debug(f"Set enable_thinking=False in extra_body for qwen model")
+            logger.debug("Set enable_thinking=False in extra_body for qwen model")

         if tool_choice:
             self._validate_tool_choice(tool_choice, tools)

pytest.ini

Lines changed: 3 additions & 0 deletions
@@ -1,6 +1,9 @@

 [pytest]
+asyncio_mode = auto
+asyncio_default_fixture_loop_scope = function
 markers =
     unit: Unit tests (offline testing with mocks)
    quality: Quality tests (evaluation against gold standard datasets)
+    asyncio: mark a test as an async test
 norecursedirs = cookbooks
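
With asyncio_mode = auto, pytest-asyncio collects coroutine test functions and runs them on an event loop without an explicit @pytest.mark.asyncio marker on each test, and asyncio_default_fixture_loop_scope = function keeps async fixtures on a per-test-function event loop by default. A minimal sketch of a test that relies on the auto mode (hypothetical example, not part of this commit):

# Hypothetical test module (not in this commit), e.g. tests/test_async_smoke.py.
# Under asyncio_mode = auto, pytest-asyncio collects and awaits this coroutine
# automatically; no @pytest.mark.asyncio marker is required.
import asyncio


async def test_event_loop_runs():
    await asyncio.sleep(0)  # yield control to the event loop once
    assert True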

tests/benchmarks/test_rewardbench2.py

Lines changed: 2 additions & 2 deletions
@@ -135,7 +135,7 @@ async def run_rewardbench2_test(

     metadata = analysis_result.metadata
     logger.info("\nOverall Performance:")
-    logger.info(f" Accuracy: {metadata.get('accuracy', 0):.4f} ({metadata.get('accuracy', 0)*100:.2f}%)")
+    logger.info(f" Accuracy: {metadata.get('accuracy', 0):.4f} ({metadata.get('accuracy', 0) * 100:.2f}%)")
     logger.info(f" Correct: {metadata.get('correct_count', 0)}/{metadata.get('valid_samples', 0)}")
     logger.info(f" Total samples: {metadata.get('total_samples', 0)}")

@@ -147,7 +147,7 @@ async def run_rewardbench2_test(
         correct = metrics.get("correct_count", 0)
         total = metrics.get("total_samples", 0)
         logger.info(
-            f" {subset:15s}: {accuracy:.4f} ({accuracy*100:5.2f}%) - " f"{correct:2d}/{total:2d} correct",
+            f" {subset:15s}: {accuracy:.4f} ({accuracy * 100:5.2f}%) - " f"{correct:2d}/{total:2d} correct",
         )

     logger.info("\n" + "=" * 80)

tests/generator/test_iterative_rubric.py

Lines changed: 1 addition & 1 deletion
@@ -263,7 +263,7 @@ async def test_iterative_grader_listwise() -> None:
     # Evaluate test sample
     test_query = LISTWISE_TEST_SAMPLE[0]["query"]
     test_responses = LISTWISE_TEST_SAMPLE[0]["responses"]
-    responses = "\n\n".join([f"Response {i+1}:\n{ans}" for i, ans in enumerate(test_responses)])
+    responses = "\n\n".join([f"Response {i + 1}:\n{ans}" for i, ans in enumerate(test_responses)])

     result = await grader.aevaluate(query=test_query, responses=responses)

tests/graders/common/test_correctness.py

Lines changed: 2 additions & 2 deletions
@@ -411,11 +411,11 @@ async def test_adversarial_correctness_with_runner(self, dataset, model):
         pairwise_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0.0

         # Print accuracy for reporting
-        print(f"\n{'='*60}")
+        print(f"\n{'=' * 60}")
         print(
             f"CorrectnessGrader Pairwise Accuracy: {pairwise_accuracy:.4f} ({correct_predictions}/{total_predictions})"
         )
-        print(f"{'='*60}\n")
+        print(f"{'=' * 60}\n")

         # Assert that false positive and false negative rates meet expected thresholds
         # Note: We log the rates but don't fail the test if they exceed thresholds

tests/graders/common/test_hallucination.py

Lines changed: 2 additions & 2 deletions
@@ -414,11 +414,11 @@ async def test_adversarial_hallucination_with_runner(self, dataset, model):
         pairwise_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0.0

         # Print accuracy for reporting
-        print(f"\n{'='*60}")
+        print(f"\n{'=' * 60}")
         print(
             f"HallucinationGrader Pairwise Accuracy: {pairwise_accuracy:.4f} ({correct_predictions}/{total_predictions})"
         )
-        print(f"{'='*60}\n")
+        print(f"{'=' * 60}\n")

         # Assert that false positive and false negative rates meet expected thresholds
         # Note: We log the rates but don't fail the test if they exceed thresholds
