From 05469959b003962cabb5488e71d82ad987989ed6 Mon Sep 17 00:00:00 2001
From: huangsen
Date: Thu, 8 Jan 2026 14:34:19 +0800
Subject: [PATCH 1/2] refactor(graders): standardize parameter naming and
 response parsing

- Change parameter name from 'answer' to 'response' to follow
  the project's parameter naming convention
- Update response parsing to use 'parsed' instead of 'metadata'
  for structured model responses
- Apply changes consistently across all affected graders and tests
---
 openjudge/graders/base_grader.py              |  2 +-
 openjudge/graders/format/length_penalty.py    |  2 +-
 openjudge/graders/function_grader.py          | 10 +++++-----
 .../graders/multimodal/image_coherence.py     |  4 ++--
 .../graders/multimodal/image_helpfulness.py   | 14 +++++++-------
 openjudge/graders/multimodal/text_to_image.py | 18 +++++++++---------
 tests/docs/test_building_graders_custom.py    |  6 +++---
 7 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/openjudge/graders/base_grader.py b/openjudge/graders/base_grader.py
index 2055da68..9fbdb220 100644
--- a/openjudge/graders/base_grader.py
+++ b/openjudge/graders/base_grader.py
@@ -119,7 +119,7 @@ async def aevaluate(self, **kwargs: Any) -> GraderScore | GraderRank | GraderErr
             ...         description="Evaluates factual accuracy of answers"
             ...     )
             ...
-            ...     async def aevaluate(self, query: str, answer: str, **kwargs):
+            ...     async def aevaluate(self, query: str, response: str, **kwargs):
             ...         # Implementation would evaluate accuracy
             ...         return GraderScore(
             ...             name=self.name,
diff --git a/openjudge/graders/format/length_penalty.py b/openjudge/graders/format/length_penalty.py
index 6981bc3d..eb8d4e89 100644
--- a/openjudge/graders/format/length_penalty.py
+++ b/openjudge/graders/format/length_penalty.py
@@ -51,7 +51,7 @@ async def aevaluate(self, response: str) -> GraderScore:
               - Otherwise: penalty = 0.0

         Args:
-            answer: The text content to evaluate for length.
+            response: The text content to evaluate for length.

         Returns:
             GraderScore: A GraderScore object containing:
diff --git a/openjudge/graders/function_grader.py b/openjudge/graders/function_grader.py
index 4d888eb6..022ed39c 100644
--- a/openjudge/graders/function_grader.py
+++ b/openjudge/graders/function_grader.py
@@ -51,10 +51,10 @@ def __init__(
                 pointwise mode) or a GraderRank (for listwise mode).

                For pointwise mode, typical signature:
-                ```async def my_func(query: str, answer: str, **kwargs) -> GraderScore:```
+                ```async def my_func(query: str, response: str, **kwargs) -> GraderScore:```

                For listwise mode, typical signature:
-                ```async def my_func(query: str, answer_1: str, answer_2: str, **kwargs) -> GraderRank:```
+                ```async def my_func(query: str, responses: List[str], **kwargs) -> GraderRank:```
            name: The name of the grader. Used for identification and logging.
            mode: The grader mode. Either POINTWISE (individual sample evaluation)
                or LISTWISE (joint evaluation of multiple samples).
@@ -104,7 +104,7 @@ async def aevaluate(self, **kwargs: Any) -> GraderScore | GraderRank:

        Example:
            >>> # Example for pointwise function grader
-            >>> def accuracy_function(query: str, answer: str) -> GraderScore:
+            >>> def accuracy_function(query: str, response: str) -> GraderScore:
            ...     # Simple accuracy function - checks if answer contains key facts
            ...     if "Paris" in answer and "capital" in answer.lower():
            ...         return GraderScore(name=self.name,
@@ -189,9 +189,9 @@ def wrap(cls, func: Callable) -> Callable:

        Example:
            >>> @FunctionGrader.wrap
-            >>> def my_accuracy_function(query: str, answer: str) -> GraderScore:
+            >>> def my_accuracy_function(query: str, response: str) -> GraderScore:
            >>>     # Custom accuracy evaluation logic
-            >>>     score = calculate_accuracy(query, answer)
+            >>>     score = calculate_accuracy(query, response)
            >>>     return GraderScore(name="accuracy", score=score, reason="Custom calculation")
            >>>
            >>> # Create the grader instance
diff --git a/openjudge/graders/multimodal/image_coherence.py b/openjudge/graders/multimodal/image_coherence.py
index 5efefaff..ae95a495 100644
--- a/openjudge/graders/multimodal/image_coherence.py
+++ b/openjudge/graders/multimodal/image_coherence.py
@@ -234,8 +234,8 @@ async def _aevaluate_single_image(
                messages=[{"role": "user", "content": content}],
                structured_model=GraderScoreCallback,
            )
-            score = chat_response.metadata["score"]
-            reason = chat_response.metadata["reason"]
+            score = chat_response.parsed["score"]
+            reason = chat_response.parsed["reason"]

            return score, reason
        except Exception as e:
diff --git a/openjudge/graders/multimodal/image_helpfulness.py b/openjudge/graders/multimodal/image_helpfulness.py
index 9d586f7a..effd61c5 100644
--- a/openjudge/graders/multimodal/image_helpfulness.py
+++ b/openjudge/graders/multimodal/image_helpfulness.py
@@ -234,20 +234,20 @@ async def _aevaluate_single_image(
            if hasattr(chat_response, "__aiter__"):
                # This is a streaming response, we need to collect it first
                collected_content = []
-                metadata = {}
+                parsed = {}
                async for chunk in chat_response:
                    if chunk.content:
                        collected_content.extend(chunk.content)
-                    if chunk.metadata:
-                        metadata.update(chunk.metadata)
+                    if chunk.parsed:
+                        parsed.update(chunk.parsed)

                # Extract score and reason from metadata
-                score = metadata.get("score", 0.0)
-                reason = metadata.get("reason", "")
+                score = parsed.get("score", 0.0)
+                reason = parsed.get("reason", "")
            else:
                # Non-streaming response
-                score = chat_response.metadata["score"]
-                reason = chat_response.metadata["reason"]
+                score = chat_response.parsed["score"]
+                reason = chat_response.parsed["reason"]

            return score, reason
        except Exception as e:
diff --git a/openjudge/graders/multimodal/text_to_image.py b/openjudge/graders/multimodal/text_to_image.py
index 476dd869..39662b7c 100644
--- a/openjudge/graders/multimodal/text_to_image.py
+++ b/openjudge/graders/multimodal/text_to_image.py
@@ -270,21 +270,21 @@ async def _aevaluate_semantic_consistency(
            if hasattr(chat_response, "__aiter__"):
                # This is a streaming response, we need to collect it first
                collected_content = []
-                metadata = {}
+                parsed = {}
                async for chunk in chat_response:
                    if chunk.content:
                        collected_content.extend(chunk.content)
-                    if chunk.metadata:
-                        metadata.update(chunk.metadata)
+                    if chunk.parsed:
+                        parsed.update(chunk.parsed)

                # Extract score and reason from metadata
-                score = metadata.get("score", 0.0)
-                reason = metadata.get("reason", "")
+                score = parsed.get("score", 0.0)
+                reason = parsed.get("reason", "")
            else:
                # Non-streaming response
-                score = chat_response.metadata["score"]
+                score = chat_response.parsed["score"]
                score = score if isinstance(score, list) else [score]
-                reason = chat_response.metadata["reason"]
+                reason = chat_response.parsed["reason"]

            return score, reason
        except Exception as e:
@@ -305,9 +305,9 @@ async def _aevaluate_perceptual_quality(
                messages=[{"role": "user", "content": content}],
                structured_model=GraderScoreCallback,
            )
-            score = chat_response.metadata["score"]
+            score = chat_response.parsed["score"]
            score = score[:2] if isinstance(score, list) else [score, score]
-            reason = chat_response.metadata["reason"]
+            reason = chat_response.parsed["reason"]

            return score, reason
        except Exception as e:
diff --git a/tests/docs/test_building_graders_custom.py b/tests/docs/test_building_graders_custom.py
index ee6dd0e7..9f8dec09 100644
--- a/tests/docs/test_building_graders_custom.py
+++ b/tests/docs/test_building_graders_custom.py
@@ -197,16 +197,16 @@ class TestRuleBasedGraders:
     async def test_length_evaluator(self):
         """Test pointwise length check from line 294-313"""

-        async def length_evaluator(query: str, answer: str) -> GraderScore:
+        async def length_evaluator(query: str, response: str) -> GraderScore:
             """Evaluate response length."""
-            length = len(answer)
+            length = len(response)
             score = min(length / 100.0, 1.0)  # Normalize to 0-1
             return GraderScore(
                 name="length_grader", score=score, reason=f"Length: {length} chars (target: 100+)"
             )

         grader = FunctionGrader(func=length_evaluator, name="length_check", mode="pointwise")
-        result = await grader.aevaluate(query="Test query", answer="Short")
+        result = await grader.aevaluate(query="Test query", response="Short")

         assert 0.0 <= result.score <= 1.0
         assert "Length:" in result.reason

From 7f3b1fadea1dd325b6bc4757ae2af2940df89f7f Mon Sep 17 00:00:00 2001
From: Sen Huang <48879559+ployts@users.noreply.github.com>
Date: Thu, 8 Jan 2026 14:46:58 +0800
Subject: [PATCH 2/2] Update openjudge/graders/function_grader.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 openjudge/graders/function_grader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openjudge/graders/function_grader.py b/openjudge/graders/function_grader.py
index 022ed39c..12af35c7 100644
--- a/openjudge/graders/function_grader.py
+++ b/openjudge/graders/function_grader.py
@@ -106,7 +106,7 @@ async def aevaluate(self, **kwargs: Any) -> GraderScore | GraderRank:
            >>> # Example for pointwise function grader
            >>> def accuracy_function(query: str, response: str) -> GraderScore:
            ...     # Simple accuracy function - checks if answer contains key facts
-            ...     if "Paris" in answer and "capital" in answer.lower():
+            ...     if "Paris" in response and "capital" in response.lower():
            ...         return GraderScore(name=self.name,
            ...                            score=1.0,
            ...                            reason="Correctly identifies Paris as capital")
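
For downstream callers, the net effect of this two-patch series is that grader callables and `aevaluate` invocations take `response=` (previously `answer=`), and structured model output is read from `chat_response.parsed` (previously `chat_response.metadata`). Below is a minimal sketch of the updated calling convention, adapted from the revised test in this series; the import paths and the `asyncio` driver are assumptions for illustration, not part of this change.

```python
# Minimal usage sketch of the post-refactor convention (parameter names mirror
# the updated test in this series; import paths below are assumed and may differ).
import asyncio

from openjudge.graders.function_grader import FunctionGrader  # assumed import path
from openjudge.graders.base_grader import GraderScore  # assumed import path


async def length_evaluator(query: str, response: str) -> GraderScore:
    """Score a response by length, normalized to the range 0-1."""
    length = len(response)
    return GraderScore(
        name="length_grader",
        score=min(length / 100.0, 1.0),
        reason=f"Length: {length} chars (target: 100+)",
    )


async def main() -> None:
    grader = FunctionGrader(func=length_evaluator, name="length_check", mode="pointwise")
    # The keyword is now `response=`, not `answer=`.
    result = await grader.aevaluate(query="Test query", response="Short")
    print(result.score, result.reason)


asyncio.run(main())
```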