
Commit d9cdc75

refactor(graders): standardize parameter naming and response parsing (#44)
* refactor(graders): standardize parameter naming and response parsing

  - Change parameter name from 'answer' to 'response' to follow the project's parameter naming convention
  - Update response parsing to use 'parsed' instead of 'metadata' for structured model responses
  - Apply changes consistently across all affected graders and tests

* Update openjudge/graders/function_grader.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 582e28c commit d9cdc75

7 files changed: +29, -29 lines changed

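Before the per-file diffs, a minimal sketch of what the new convention looks like for a pointwise grader. The import paths and the `paris_check` function below are illustrative assumptions, not code from this commit; only the `query`/`response` parameter names, the `GraderScore(name=..., score=..., reason=...)` shape, and the pointwise `FunctionGrader` usage mirror the changes shown here.

```python
# Illustrative sketch only: module paths for FunctionGrader and GraderScore
# are assumed from the file layout in this commit and may need adjusting.
from openjudge.graders.function_grader import FunctionGrader
from openjudge.graders.base_grader import GraderScore  # assumed location of GraderScore

async def paris_check(query: str, response: str, **kwargs) -> GraderScore:
    """Pointwise grader: the evaluated text now arrives as `response`, not `answer`."""
    ok = "Paris" in response
    return GraderScore(
        name="paris_check",
        score=1.0 if ok else 0.0,
        reason="mentions Paris" if ok else "does not mention Paris",
    )

grader = FunctionGrader(func=paris_check, name="paris_check", mode="pointwise")
# result = await grader.aevaluate(query="What is the capital of France?",
#                                 response="Paris is the capital of France.")
```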

openjudge/graders/base_grader.py

Lines changed: 1 addition & 1 deletion
@@ -119,7 +119,7 @@ async def aevaluate(self, **kwargs: Any) -> GraderScore | GraderRank | GraderErr
 ...         description="Evaluates factual accuracy of answers"
 ...     )
 ...
-...     async def aevaluate(self, query: str, answer: str, **kwargs):
+...     async def aevaluate(self, query: str, response: str, **kwargs):
 ...         # Implementation would evaluate accuracy
 ...         return GraderScore(
 ...             name=self.name,

openjudge/graders/format/length_penalty.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ async def aevaluate(self, response: str) -> GraderScore:
             - Otherwise: penalty = 0.0

         Args:
-            answer: The text content to evaluate for length.
+            response: The text content to evaluate for length.

         Returns:
             GraderScore: A GraderScore object containing:

openjudge/graders/function_grader.py

Lines changed: 6 additions & 6 deletions
@@ -51,10 +51,10 @@ def __init__(
         pointwise mode) or a GraderRank (for listwise mode).

         For pointwise mode, typical signature:
-        ```async def my_func(query: str, answer: str, **kwargs) -> GraderScore:```
+        ```async def my_func(query: str, response: str, **kwargs) -> GraderScore:```

         For listwise mode, typical signature:
-        ```async def my_func(query: str, answer_1: str, answer_2: str, **kwargs) -> GraderRank:```
+        ```async def my_func(query: str, responses: List[str], **kwargs) -> GraderRank:```
     name: The name of the grader. Used for identification and logging.
     mode: The grader mode. Either POINTWISE (individual sample evaluation)
         or LISTWISE (joint evaluation of multiple samples).

@@ -104,9 +104,9 @@ async def aevaluate(self, **kwargs: Any) -> GraderScore | GraderRank:

     Example:
         >>> # Example for pointwise function grader
-        >>> def accuracy_function(query: str, answer: str) -> GraderScore:
+        >>> def accuracy_function(query: str, response: str) -> GraderScore:
         ...     # Simple accuracy function - checks if answer contains key facts
-        ...     if "Paris" in answer and "capital" in answer.lower():
+        ...     if "Paris" in response and "capital" in response.lower():
         ...         return GraderScore(name=self.name,
         ...                            score=1.0,
         ...                            reason="Correctly identifies Paris as capital")

@@ -189,9 +189,9 @@ def wrap(cls, func: Callable) -> Callable:

     Example:
         >>> @FunctionGrader.wrap
-        >>> def my_accuracy_function(query: str, answer: str) -> GraderScore:
+        >>> def my_accuracy_function(query: str, response: str) -> GraderScore:
         >>>     # Custom accuracy evaluation logic
-        >>>     score = calculate_accuracy(query, answer)
+        >>>     score = calculate_accuracy(query, response)
         >>>     return GraderScore(name="accuracy", score=score, reason="Custom calculation")
         >>>
         >>> # Create the grader instance
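The listwise docstring now advertises a single `responses: List[str]` parameter instead of numbered `answer_1`, `answer_2` arguments. As a hedged sketch of a listwise function in that shape: the `GraderRank` import path and its constructor fields (`rank`, `reason`) are assumptions, not taken from this diff.

```python
# Illustrative listwise grader; GraderRank's location and field names are assumed.
from typing import List

from openjudge.graders.base_grader import GraderRank  # assumed location of GraderRank

async def rank_by_length(query: str, responses: List[str], **kwargs) -> GraderRank:
    """Toy listwise grader: rank candidate responses, longest first."""
    order = sorted(range(len(responses)), key=lambda i: len(responses[i]), reverse=True)
    rank = [0] * len(responses)
    for position, idx in enumerate(order, start=1):
        rank[idx] = position  # 1-based rank of responses[idx]
    return GraderRank(name="length_rank", rank=rank, reason="ranked by response length")

# grader = FunctionGrader(func=rank_by_length, name="length_rank", mode="listwise")
# result = await grader.aevaluate(query="...", responses=["short", "a longer candidate"])
```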

openjudge/graders/multimodal/image_coherence.py

Lines changed: 2 additions & 2 deletions
@@ -234,8 +234,8 @@ async def _aevaluate_single_image(
         messages=[{"role": "user", "content": content}],
         structured_model=GraderScoreCallback,
     )
-    score = chat_response.metadata["score"]
-    reason = chat_response.metadata["reason"]
+    score = chat_response.parsed["score"]
+    reason = chat_response.parsed["reason"]
     return score, reason

 except Exception as e:

openjudge/graders/multimodal/image_helpfulness.py

Lines changed: 7 additions & 7 deletions
@@ -234,20 +234,20 @@ async def _aevaluate_single_image(
     if hasattr(chat_response, "__aiter__"):
         # This is a streaming response, we need to collect it first
         collected_content = []
-        metadata = {}
+        parsed = {}
         async for chunk in chat_response:
             if chunk.content:
                 collected_content.extend(chunk.content)
-            if chunk.metadata:
-                metadata.update(chunk.metadata)
+            if chunk.parsed:
+                parsed.update(chunk.parsed)

         # Extract score and reason from metadata
-        score = metadata.get("score", 0.0)
-        reason = metadata.get("reason", "")
+        score = parsed.get("score", 0.0)
+        reason = parsed.get("reason", "")
     else:
         # Non-streaming response
-        score = chat_response.metadata["score"]
-        reason = chat_response.metadata["reason"]
+        score = chat_response.parsed["score"]
+        reason = chat_response.parsed["reason"]
     return score, reason

 except Exception as e:
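This hunk and the text_to_image hunks below repeat the same pattern: merge `parsed` fragments while streaming, or read `parsed` directly from a non-streaming response. A small helper capturing that pattern as a sketch; only the `chunk.content`, `chunk.parsed`, and `chat_response.parsed` attributes come from the diff, the helper itself is hypothetical.

```python
from typing import Any, Tuple

async def extract_score_and_reason(chat_response: Any) -> Tuple[float, str]:
    """Hypothetical helper mirroring the extraction pattern shown in the diff."""
    if hasattr(chat_response, "__aiter__"):
        # Streaming response: accumulate structured fragments from each chunk.
        parsed: dict = {}
        async for chunk in chat_response:
            if chunk.parsed:
                parsed.update(chunk.parsed)
        return parsed.get("score", 0.0), parsed.get("reason", "")
    # Non-streaming response: the structured output is already complete.
    return chat_response.parsed["score"], chat_response.parsed["reason"]
```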

openjudge/graders/multimodal/text_to_image.py

Lines changed: 9 additions & 9 deletions
@@ -270,21 +270,21 @@ async def _aevaluate_semantic_consistency(
     if hasattr(chat_response, "__aiter__"):
         # This is a streaming response, we need to collect it first
         collected_content = []
-        metadata = {}
+        parsed = {}
         async for chunk in chat_response:
             if chunk.content:
                 collected_content.extend(chunk.content)
-            if chunk.metadata:
-                metadata.update(chunk.metadata)
+            if chunk.parsed:
+                parsed.update(chunk.parsed)

         # Extract score and reason from metadata
-        score = metadata.get("score", 0.0)
-        reason = metadata.get("reason", "")
+        score = parsed.get("score", 0.0)
+        reason = parsed.get("reason", "")
     else:
         # Non-streaming response
-        score = chat_response.metadata["score"]
+        score = chat_response.parsed["score"]
         score = score if isinstance(score, list) else [score]
-        reason = chat_response.metadata["reason"]
+        reason = chat_response.parsed["reason"]
     return score, reason

 except Exception as e:

@@ -305,9 +305,9 @@ async def _aevaluate_perceptual_quality(
         messages=[{"role": "user", "content": content}],
         structured_model=GraderScoreCallback,
     )
-    score = chat_response.metadata["score"]
+    score = chat_response.parsed["score"]
     score = score[:2] if isinstance(score, list) else [score, score]
-    reason = chat_response.metadata["reason"]
+    reason = chat_response.parsed["reason"]
     return score, reason

 except Exception as e:

tests/docs/test_building_graders_custom.py

Lines changed: 3 additions & 3 deletions
@@ -197,16 +197,16 @@ class TestRuleBasedGraders:
     async def test_length_evaluator(self):
         """Test pointwise length check from line 294-313"""

-        async def length_evaluator(query: str, answer: str) -> GraderScore:
+        async def length_evaluator(query: str, response: str) -> GraderScore:
             """Evaluate response length."""
-            length = len(answer)
+            length = len(response)
             score = min(length / 100.0, 1.0)  # Normalize to 0-1

             return GraderScore(name="length_grader", score=score, reason=f"Length: {length} chars (target: 100+)")

         grader = FunctionGrader(func=length_evaluator, name="length_check", mode="pointwise")

-        result = await grader.aevaluate(query="Test query", answer="Short")
+        result = await grader.aevaluate(query="Test query", response="Short")
         assert 0.0 <= result.score <= 1.0
         assert "Length:" in result.reason