
Commit d9cdc75

refactor(graders): standardize parameter naming and response parsing (#44)
* refactor(graders): standardize parameter naming and response parsing

  - Change parameter name from 'answer' to 'response' to follow the project's parameter naming convention
  - Update response parsing to use 'parsed' instead of 'metadata' for structured model responses
  - Apply changes consistently across all affected graders and tests

* Update openjudge/graders/function_grader.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 582e28c commit d9cdc75

7 files changed: +29, -29 lines changed

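Before the per-file diffs, a minimal sketch of what the new convention looks like for a pointwise grader. The import paths and the `paris_check` function below are illustrative assumptions, not code from this commit; only the `query`/`response` parameter names, the `GraderScore(name=..., score=..., reason=...)` shape, and the pointwise `FunctionGrader` usage mirror the changes shown here.

```python
# Illustrative sketch only: module paths for FunctionGrader and GraderScore
# are assumed from the file layout in this commit and may need adjusting.
from openjudge.graders.function_grader import FunctionGrader
from openjudge.graders.base_grader import GraderScore  # assumed location of GraderScore

async def paris_check(query: str, response: str, **kwargs) -> GraderScore:
    """Pointwise grader: the evaluated text now arrives as `response`, not `answer`."""
    ok = "Paris" in response
    return GraderScore(
        name="paris_check",
        score=1.0 if ok else 0.0,
        reason="mentions Paris" if ok else "does not mention Paris",
    )

grader = FunctionGrader(func=paris_check, name="paris_check", mode="pointwise")
# result = await grader.aevaluate(query="What is the capital of France?",
#                                 response="Paris is the capital of France.")
```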

openjudge/graders/base_grader.py

Lines changed: 1 addition & 1 deletion
@@ -119,7 +119,7 @@ async def aevaluate(self, **kwargs: Any) -> GraderScore | GraderRank | GraderErr
 ...         description="Evaluates factual accuracy of answers"
 ...     )
 ...
-...     async def aevaluate(self, query: str, answer: str, **kwargs):
+...     async def aevaluate(self, query: str, response: str, **kwargs):
 ...         # Implementation would evaluate accuracy
 ...         return GraderScore(
 ...             name=self.name,

openjudge/graders/format/length_penalty.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ async def aevaluate(self, response: str) -> GraderScore:
             - Otherwise: penalty = 0.0

         Args:
-            answer: The text content to evaluate for length.
+            response: The text content to evaluate for length.

         Returns:
             GraderScore: A GraderScore object containing:

openjudge/graders/function_grader.py

Lines changed: 6 additions & 6 deletions
@@ -51,10 +51,10 @@ def __init__(
         pointwise mode) or a GraderRank (for listwise mode).

         For pointwise mode, typical signature:
-        ```async def my_func(query: str, answer: str, **kwargs) -> GraderScore:```
+        ```async def my_func(query: str, response: str, **kwargs) -> GraderScore:```

         For listwise mode, typical signature:
-        ```async def my_func(query: str, answer_1: str, answer_2: str, **kwargs) -> GraderRank:```
+        ```async def my_func(query: str, responses: List[str], **kwargs) -> GraderRank:```
     name: The name of the grader. Used for identification and logging.
     mode: The grader mode. Either POINTWISE (individual sample evaluation)
         or LISTWISE (joint evaluation of multiple samples).

@@ -104,9 +104,9 @@ async def aevaluate(self, **kwargs: Any) -> GraderScore | GraderRank:

     Example:
         >>> # Example for pointwise function grader
-        >>> def accuracy_function(query: str, answer: str) -> GraderScore:
+        >>> def accuracy_function(query: str, response: str) -> GraderScore:
         ...     # Simple accuracy function - checks if answer contains key facts
-        ...     if "Paris" in answer and "capital" in answer.lower():
+        ...     if "Paris" in response and "capital" in response.lower():
         ...         return GraderScore(name=self.name,
         ...                            score=1.0,
         ...                            reason="Correctly identifies Paris as capital")

@@ -189,9 +189,9 @@ def wrap(cls, func: Callable) -> Callable:

     Example:
         >>> @FunctionGrader.wrap
-        >>> def my_accuracy_function(query: str, answer: str) -> GraderScore:
+        >>> def my_accuracy_function(query: str, response: str) -> GraderScore:
         >>>     # Custom accuracy evaluation logic
-        >>>     score = calculate_accuracy(query, answer)
+        >>>     score = calculate_accuracy(query, response)
         >>>     return GraderScore(name="accuracy", score=score, reason="Custom calculation")
         >>>
         >>> # Create the grader instance
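The listwise docstring now advertises a single `responses: List[str]` parameter instead of numbered `answer_1`, `answer_2` arguments. As a hedged sketch of a listwise function in that shape: the `GraderRank` import path and its constructor fields (`rank`, `reason`) are assumptions, not taken from this diff.

```python
# Illustrative listwise grader; GraderRank's location and field names are assumed.
from typing import List

from openjudge.graders.base_grader import GraderRank  # assumed location of GraderRank

async def rank_by_length(query: str, responses: List[str], **kwargs) -> GraderRank:
    """Toy listwise grader: rank candidate responses, longest first."""
    order = sorted(range(len(responses)), key=lambda i: len(responses[i]), reverse=True)
    rank = [0] * len(responses)
    for position, idx in enumerate(order, start=1):
        rank[idx] = position  # 1-based rank of responses[idx]
    return GraderRank(name="length_rank", rank=rank, reason="ranked by response length")

# grader = FunctionGrader(func=rank_by_length, name="length_rank", mode="listwise")
# result = await grader.aevaluate(query="...", responses=["short", "a longer candidate"])
```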

openjudge/graders/multimodal/image_coherence.py

Lines changed: 2 additions & 2 deletions
@@ -234,8 +234,8 @@ async def _aevaluate_single_image(
         messages=[{"role": "user", "content": content}],
         structured_model=GraderScoreCallback,
     )
-    score = chat_response.metadata["score"]
-    reason = chat_response.metadata["reason"]
+    score = chat_response.parsed["score"]
+    reason = chat_response.parsed["reason"]
     return score, reason

 except Exception as e:

openjudge/graders/multimodal/image_helpfulness.py

Lines changed: 7 additions & 7 deletions
@@ -234,20 +234,20 @@ async def _aevaluate_single_image(
     if hasattr(chat_response, "__aiter__"):
         # This is a streaming response, we need to collect it first
         collected_content = []
-        metadata = {}
+        parsed = {}
         async for chunk in chat_response:
             if chunk.content:
                 collected_content.extend(chunk.content)
-            if chunk.metadata:
-                metadata.update(chunk.metadata)
+            if chunk.parsed:
+                parsed.update(chunk.parsed)

         # Extract score and reason from metadata
-        score = metadata.get("score", 0.0)
-        reason = metadata.get("reason", "")
+        score = parsed.get("score", 0.0)
+        reason = parsed.get("reason", "")
     else:
         # Non-streaming response
-        score = chat_response.metadata["score"]
-        reason = chat_response.metadata["reason"]
+        score = chat_response.parsed["score"]
+        reason = chat_response.parsed["reason"]
     return score, reason

 except Exception as e:
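This hunk and the text_to_image hunks below repeat the same pattern: merge `parsed` fragments while streaming, or read `parsed` directly from a non-streaming response. A small helper capturing that pattern as a sketch; only the `chunk.content`, `chunk.parsed`, and `chat_response.parsed` attributes come from the diff, the helper itself is hypothetical.

```python
from typing import Any, Tuple

async def extract_score_and_reason(chat_response: Any) -> Tuple[float, str]:
    """Hypothetical helper mirroring the extraction pattern shown in the diff."""
    if hasattr(chat_response, "__aiter__"):
        # Streaming response: accumulate structured fragments from each chunk.
        parsed: dict = {}
        async for chunk in chat_response:
            if chunk.parsed:
                parsed.update(chunk.parsed)
        return parsed.get("score", 0.0), parsed.get("reason", "")
    # Non-streaming response: the structured output is already complete.
    return chat_response.parsed["score"], chat_response.parsed["reason"]
```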

openjudge/graders/multimodal/text_to_image.py

Lines changed: 9 additions & 9 deletions
@@ -270,21 +270,21 @@ async def _aevaluate_semantic_consistency(
     if hasattr(chat_response, "__aiter__"):
         # This is a streaming response, we need to collect it first
         collected_content = []
-        metadata = {}
+        parsed = {}
         async for chunk in chat_response:
             if chunk.content:
                 collected_content.extend(chunk.content)
-            if chunk.metadata:
-                metadata.update(chunk.metadata)
+            if chunk.parsed:
+                parsed.update(chunk.parsed)

         # Extract score and reason from metadata
-        score = metadata.get("score", 0.0)
-        reason = metadata.get("reason", "")
+        score = parsed.get("score", 0.0)
+        reason = parsed.get("reason", "")
     else:
         # Non-streaming response
-        score = chat_response.metadata["score"]
+        score = chat_response.parsed["score"]
         score = score if isinstance(score, list) else [score]
-        reason = chat_response.metadata["reason"]
+        reason = chat_response.parsed["reason"]
     return score, reason

 except Exception as e:

@@ -305,9 +305,9 @@ async def _aevaluate_perceptual_quality(
         messages=[{"role": "user", "content": content}],
         structured_model=GraderScoreCallback,
     )
-    score = chat_response.metadata["score"]
+    score = chat_response.parsed["score"]
     score = score[:2] if isinstance(score, list) else [score, score]
-    reason = chat_response.metadata["reason"]
+    reason = chat_response.parsed["reason"]
     return score, reason

 except Exception as e:

tests/docs/test_building_graders_custom.py

Lines changed: 3 additions & 3 deletions
@@ -197,16 +197,16 @@ class TestRuleBasedGraders:
     async def test_length_evaluator(self):
         """Test pointwise length check from line 294-313"""

-        async def length_evaluator(query: str, answer: str) -> GraderScore:
+        async def length_evaluator(query: str, response: str) -> GraderScore:
             """Evaluate response length."""
-            length = len(answer)
+            length = len(response)
             score = min(length / 100.0, 1.0)  # Normalize to 0-1

             return GraderScore(name="length_grader", score=score, reason=f"Length: {length} chars (target: 100+)")

         grader = FunctionGrader(func=length_evaluator, name="length_check", mode="pointwise")

-        result = await grader.aevaluate(query="Test query", answer="Short")
+        result = await grader.aevaluate(query="Test query", response="Short")
         assert 0.0 <= result.score <= 1.0
         assert "Length:" in result.reason