openjudge/graders/agent/action/action_alignment.py (1 addition & 1 deletion)
@@ -165,7 +165,7 @@ class ActionAlignmentGrader(LLMGrader):
>>> result = asyncio.run(grader.aevaluate(
... plan="I will open drawer 1 to find the key.",
... action="open drawer 1"
-... )
+... ))
>>> print(f"Score: {result.score}") # Expected: 1.0
"""

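The doctest pattern these docstrings converge on is `asyncio.run(grader.aevaluate(...))`, which is what the extra closing parenthesis above completes. A minimal runnable sketch of that pattern, using a stand-in grader rather than the real `ActionAlignmentGrader` (the stand-in's scoring logic is invented purely for illustration):

```python
import asyncio
from dataclasses import dataclass


@dataclass
class FakeResult:
    """Minimal stand-in for a grader result; real results carry more fields."""
    score: float


class FakeGrader:
    """Stand-in grader; the real graders call an LLM inside aevaluate()."""

    async def aevaluate(self, plan: str, action: str) -> FakeResult:
        # Toy heuristic: the action is "aligned" if it appears verbatim in the plan.
        return FakeResult(score=1.0 if action in plan else 0.0)


grader = FakeGrader()
# asyncio.run() drives the coroutine from synchronous code, which is why the
# docstrings use it instead of a bare `await` (invalid outside async functions).
result = asyncio.run(grader.aevaluate(
    plan="I will open drawer 1 to find the key.",
    action="open drawer 1",
))
print(f"Score: {result.score}")  # 1.0
```
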
openjudge/graders/agent/action/action_loop.py (3 additions & 2 deletions)
@@ -21,10 +21,11 @@ class ActionLoopDetectionGrader(BaseGrader):
all pairs of actions for similarity and penalizing based on the proportion
of similar action pairs found.
Example:
+>>> import asyncio
>>> grader = ActionLoopDetectionGrader(similarity_threshold=1.0)
->>> result = await grader.aevaluate(
+>>> result = asyncio.run(grader.aevaluate(
... messages=[...],
-... )
+... ))
>>> print(f"Loop detection score: {result.score}")
"""

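The docstring above describes loop detection as comparing every pair of actions and penalising by the proportion of similar pairs. A rough sketch of that idea (the real `ActionLoopDetectionGrader` may use a different similarity measure and penalty curve, and it extracts actions from `messages` rather than taking plain strings):

```python
from difflib import SequenceMatcher
from itertools import combinations


def loop_detection_score(actions: list[str], similarity_threshold: float = 1.0) -> float:
    """1.0 minus the fraction of action pairs whose similarity meets the threshold.

    Illustrative only; not the grader's actual implementation.
    """
    pairs = list(combinations(actions, 2))
    if not pairs:
        return 1.0
    similar = sum(
        1 for a, b in pairs
        if SequenceMatcher(None, a, b).ratio() >= similarity_threshold
    )
    return 1.0 - similar / len(pairs)


# One exact repeat out of three pairs -> score drops to ~0.67.
print(loop_detection_score(["open drawer 1", "open drawer 1", "take key 2"]))
```
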
openjudge/graders/agent/memory/memory_accuracy.py (1 addition & 1 deletion)
@@ -165,7 +165,7 @@ class MemoryAccuracyGrader(LLMGrader):
>>> result = asyncio.run(grader.aevaluate(
... observation="You see a closed cabinet.",
... memory="The cabinet is closed."
-... )
+... ))
>>> print(f"Score: {result.score}") # Expected: 1.0
"""

@@ -165,7 +165,7 @@ class MemoryDetailPreservationGrader(LLMGrader):
>>> result = asyncio.run(grader.aevaluate(
... observation="Cabinet 1 at coordinates (3.5, 2.1) contains 5 red apples.",
... memory="Cabinet 1 at (3.5, 2.1) has 5 red apples."
-... )
+... ))
>>> print(f"Score: {result.score}") # Expected: 1.0
"""

@@ -167,7 +167,7 @@ class MemoryRetrievalEffectivenessGrader(LLMGrader):
>>> result = asyncio.run(grader.aevaluate(
... observation="You see a closed cabinet.",
... memory="The cabinet is closed."
-... )
+... ))
>>> print(f"Score: {result.score}") # Expected: 1.0
"""

@@ -23,10 +23,11 @@ class ObservationInformationGainGrader(BaseGrader):
Attributes:
similarity_threshold: Threshold for considering observations as redundant
Example:
+>>> import asyncio
>>> grader = ObservationInformationGainGrader(similarity_threshold=0.5)
->>> result = await grader.aevaluate(
+>>> result = asyncio.run( grader.aevaluate(
... messages=[...], # List of message dicts
-... )
+... ))
>>> print(f"Info gain score: {result.score}")
"""

openjudge/graders/agent/plan/plan_feasibility.py (1 addition & 1 deletion)
@@ -168,7 +168,7 @@ class PlanFeasibilityGrader(LLMGrader):
... plan="I will first open the drawer to get the key, then use it to unlock the door.",
... observation="The drawer is closed. You don't have any items.",
... memory="The key is inside the drawer."
-... )
+... ))
>>> print(f"Score: {result.score}") # Expected: 1.0
"""

openjudge/graders/agent/reflection/reflection_accuracy.py (1 addition & 1 deletion)
@@ -165,7 +165,7 @@ class ReflectionAccuracyGrader(LLMGrader):
>>> result = asyncio.run(grader.aevaluate(
... observation="You see a closed cabinet.",
... reflection="I observed a closed cabinet."
-... )
+... ))
>>> print(f"Score: {result.score}") # Expected: 1.0
"""

@@ -289,7 +289,7 @@ class ReflectionOutcomeUnderstandingGrader(LLMGrader):
>>> result = asyncio.run(grader.aevaluate(
... observation="The drawer is now open.",
... reflection="I successfully opened the drawer."
-... )
+... ))
>>> print(f"Score: {result.score}") # Expected: 1.0
"""

@@ -206,7 +206,7 @@ class ReflectionProgressAwarenessGrader(LLMGrader):
... observation="Cabinet 1 now has apples. Task complete.",
... reflection="Good progress! I've successfully found the apples.",
... context="Task: Find apples in cabinets"
-... )
+... ))
>>> print(f"Score: {result.score}") # Expected: 1.0
"""

openjudge/graders/agent/tool/tool_call_sequence_match.py (3 additions & 2 deletions)
@@ -31,11 +31,12 @@ class ToolCallSequenceMatchGrader(BaseGrader):
strict_mode: If True, matches both tool_call name and arguments; if False, only matches tool_call name
use_jaccard_similarity: If True, use Jaccard similarity for loose mode (ignores step order)
Example:
+>>> import asyncio
>>> grader = ToolCallSequenceMatchGrader(strict_mode=True)
->>> result = await grader.aevaluate(
+>>> result = asyncio.run(grader.aevaluate(
... messages=[...], # Model's messages with tool calls
... reference_tool_calls=[...] # Ground truth reference tool calls
-... )
+... ))
>>> print(f"Sequence match score: {result.score}")
"""

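For loose mode with `use_jaccard_similarity=True`, the docstring says step order is ignored. The usual Jaccard formulation over the sets of tool-call names would look roughly like this; it is illustrative only, not necessarily the grader's exact computation (strict mode also matches arguments):

```python
def jaccard_name_similarity(predicted: list[str], reference: list[str]) -> float:
    """|A ∩ B| / |A ∪ B| over tool-call names; order and repeats are ignored."""
    a, b = set(predicted), set(reference)
    if not a and not b:
        return 1.0
    return len(a & b) / len(a | b)


print(jaccard_name_similarity(["search", "open_url", "search"], ["open_url", "search"]))  # 1.0
print(jaccard_name_similarity(["search", "calculator"], ["search", "open_url"]))          # ~0.33
```
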
openjudge/graders/common/correctness.py (32 additions & 17 deletions)
@@ -18,8 +18,10 @@
from openjudge.models.schema.prompt_template import LanguageEnum, PromptTemplate

# English Prompt
-CORRECTNESS_PROMPT_EN = """
-You are a professional data annotator responsible for evaluating whether the model response matches the provided correct response (reference response). Your task is to score according to the following criteria:
+CORRECTNESS_PROMPT_EN = textwrap.dedent(
+"""
+You are a professional data annotator responsible for evaluating whether the model response matches the provided
+correct response (reference response). Your task is to score according to the following criteria:

<Scoring Criteria>
A response that perfectly matches the reference response should:
@@ -51,7 +53,9 @@
</Guidance>

<Reminder>
-The goal is to evaluate correctness against reference response, not general quality. A well-written response that contradicts the reference response should score low. A simple response that accurately reflects and properly uses the reference response should score high. Consider both accuracy and appropriate application of the reference response.
+The goal is to evaluate correctness against reference response, not general quality. A well-written response that
+contradicts the reference response should score low. A simple response that accurately reflects and properly uses the
+reference response should score high. Consider both accuracy and appropriate application of the reference response.
</Reminder>

<query>
@@ -75,22 +79,30 @@
# Output Instructions
Provide your evaluation in the following structured JSON format:
{{
"score": <integer between 1 and 5, where 5 means perfect match with reference response and 1 means complete deviation from reference response>,
"reason": "<brief explanation for the assigned score, specifically mentioning how the response aligns with or deviates from the reference response>"
"score": <integer between 1 and 5, where 5 means perfect match with reference response and 1 means complete
deviation from reference response>,
"reason": "<brief explanation for the assigned score, specifically mentioning how the response aligns with or
deviates from the reference response>"
}}

Scoring Scale:
-- 5: The answer is completely consistent with the reference answer in terms of facts, key details, logic, and conclusions. Different wording is acceptable as long as the meaning is equivalent.
-- 4: The core conclusion of the answer is consistent with the reference answer, but there are non-critical omissions, vague statements, or minor errors that do not affect user understanding and use.
-- 3: The answer contains some correct information, but omits key points, contains verifiable errors, or significantly misinterprets the reference content.
-- 2: The core conclusion or key facts of the answer contradict the reference answer, containing only a few superficially related words, and are generally misleading.
+- 5: The answer is completely consistent with the reference answer in terms of facts, key details, logic, and
+conclusions. Different wording is acceptable as long as the meaning is equivalent.
+- 4: The core conclusion of the answer is consistent with the reference answer, but there are non-critical omissions,
+vague statements, or minor errors that do not affect user understanding and use.
+- 3: The answer contains some correct information, but omits key points, contains verifiable errors, or significantly
+misinterprets the reference content.
+- 2: The core conclusion or key facts of the answer contradict the reference answer, containing only a few superficially
+related words, and are generally misleading.
- 1: The answer is completely unrelated to or directly contradicts the reference answer.

JSON:
"""
+).strip()

# Chinese Prompt
-CORRECTNESS_PROMPT_ZH = """
+CORRECTNESS_PROMPT_ZH = textwrap.dedent(
+"""
你是一名专业的数据标注员,负责评估模型输出是否与提供的参考回答(reference response)一致。你的任务是根据以下标准进行评分:

<评分标准>
@@ -123,7 +135,8 @@
</指导>

<提醒>
-目标是评估与参考回答的正确性,而不是一般质量。一个写得很好但与参考回答矛盾的回答应该得分低。一个简单但准确反映并正确使用参考回答的回答应该得分高。同时考虑准确性和参考回答的适当应用。
+目标是评估与参考回答的正确性,而不是一般质量。一个写得很好但与参考回答矛盾的回答应该得分低。一个简单但准确反映并正确使用参考回答的回答应该得分高
+。同时考虑准确性和参考回答的适当应用。
</提醒>

<查询>
@@ -160,20 +173,21 @@

JSON:
"""
+).strip()

# Build default template from prompts
DEFAULT_CORRECTNESS_TEMPLATE = PromptTemplate(
messages={
LanguageEnum.EN: [
ChatMessage(
role="user",
-content=textwrap.dedent(CORRECTNESS_PROMPT_EN),
+content=CORRECTNESS_PROMPT_EN,
),
],
LanguageEnum.ZH: [
ChatMessage(
role="user",
-content=textwrap.dedent(CORRECTNESS_PROMPT_ZH),
+content=CORRECTNESS_PROMPT_ZH,
),
],
},
@@ -225,6 +239,7 @@ class CorrectnessGrader(LLMGrader):
- metadata: Threshold and evaluation details

Example:
+>>> import asyncio
>>> from openjudge.model.openai_llm import OpenAIChatModel
>>> from openjudge.llm_judge import CorrectnessGrader
>>>
@@ -233,19 +248,19 @@ class CorrectnessGrader(LLMGrader):
>>> grader = CorrectnessGrader(model=model, threshold=0.7)
>>>
>>> # Good match
->>> result = await grader.aevaluate(
+>>> result = asyncio.run(grader.aevaluate(
... query="When was the product launched?",
... response="The product launched in Q1 2023 in Europe, capturing 50% market share.",
... reference_response="Product launched Q1 2023 in Europe with 50% market share."
-... )
+... ))
>>> print(result.score) # 5 - accurate to reference response
>>>
>>> # Poor match
->>> result = await grader.aevaluate(
+>>> result = asyncio.run(grader.aevaluate(
... query="When and where was the product launched?",
... response="The product was launched in early 2023 in European markets.",
... reference_response="The product was launched in Q1 2023 in Europe."
-... )
+... ))
>>> print(result.score) # 2 - deviates from reference response
"""

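The prompt refactor moves normalisation onto the constants themselves: each template is passed through `textwrap.dedent(...).strip()` once at import time, so the `PromptTemplate` messages can use the constant directly instead of re-dedenting at every call site. A small sketch of the pattern, with a toy prompt standing in for the real templates:

```python
import textwrap

# Dedent and strip once, when the constant is defined ...
TOY_PROMPT = textwrap.dedent(
    """
    You are a professional data annotator.
    Score the response from 1 to 5 and return JSON.
    """
).strip()

# ... instead of keeping a raw literal and dedenting wherever it is used.
RAW_TOY_PROMPT = """
You are a professional data annotator.
Score the response from 1 to 5 and return JSON.
"""

# Both routes produce the same text once the surrounding newlines are stripped.
assert TOY_PROMPT == textwrap.dedent(RAW_TOY_PROMPT).strip()
print(TOY_PROMPT)
```
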
openjudge/graders/common/hallucination.py (15 additions & 10 deletions)
@@ -20,7 +20,8 @@
# pylint: disable=line-too-long

# English Prompt
-HALLUCINATION_PROMPT_EN = """
+HALLUCINATION_PROMPT_EN = textwrap.dedent(
+"""
You are a professional data annotator responsible for evaluating whether the model response contains hallucinations. Your task is to score according to the following criteria:

<Scoring Criteria>
@@ -80,9 +81,11 @@

JSON:
"""
+).strip()

# Chinese Prompt
-HALLUCINATION_PROMPT_ZH = """
+HALLUCINATION_PROMPT_ZH = textwrap.dedent(
+"""
你是一名专业的数据标注员,负责评估模型输出是否包含幻觉(虚构信息)。你的任务是根据以下标准进行评分:

<评分标准>
@@ -141,6 +144,7 @@

JSON:
"""
+).strip()


# Build default template from prompts
@@ -149,13 +153,13 @@
LanguageEnum.EN: [
ChatMessage(
role="user",
-content=textwrap.dedent(HALLUCINATION_PROMPT_EN),
+content=HALLUCINATION_PROMPT_EN,
),
],
LanguageEnum.ZH: [
ChatMessage(
role="user",
-content=textwrap.dedent(HALLUCINATION_PROMPT_ZH),
+content=HALLUCINATION_PROMPT_ZH,
),
],
},
@@ -205,6 +209,7 @@ class HallucinationGrader(LLMGrader):
- metadata: Threshold and evaluation details

Example:
+>>> import asyncio
>>> from openjudge.model.openai_llm import OpenAIChatModel
>>> from openjudge.llm_judge import HallucinationGrader
>>>
@@ -219,28 +224,28 @@ class HallucinationGrader(LLMGrader):
>>> grader = HallucinationGrader(model=model, threshold=0.7)
>>>
>>> # With context: Good output (grounded in context)
->>> result = await grader.aevaluate(
+>>> result = asyncio.run(grader.aevaluate(
... query="When was the company founded?",
... response="The company was founded in 2020 in San Francisco.",
... context="The company was founded in 2020 in San Francisco."
-... )
+... ))
>>> print(result.score) # 5 - no hallucinations
>>> print(result.reason) # "Output is fully supported by context"
>>>
>>> # With context: Bad output (contains hallucination)
->>> result = await grader.aevaluate(
+>>> result = asyncio.run(grader.aevaluate(
... query="When was the company founded?",
... response="The company was founded in 2020 with 100 employees.",
... context="The company was founded in 2020 in San Francisco."
-... )
+... ))
>>> print(result.score) # 3 - contains unsupported claim about employees
>>> print(result.reason) # "Output contains hallucination: '100 employees' not mentioned"
>>>
>>> # Without context: Factual verification
->>> result = await grader.aevaluate(
+>>> result = asyncio.run(grader.aevaluate(
... query="What is the capital of France?",
... response="The capital of France is Paris."
-... )
+... ))
>>> print(result.score) # 5 - factually correct
"""
