feat: update example code and template processing in agent and common graders (#49)

jc200808 · web-flow · commit d722e38d12fc · 2026-01-09T11:07:51.000+08:00
* feat: update example code and template processing in agent and common graders

* feat: fix grader template long text format issue
diff --git a/openjudge/graders/agent/action/action_alignment.py b/openjudge/graders/agent/action/action_alignment.py
@@ -165,7 +165,7 @@ class ActionAlignmentGrader(LLMGrader):
         >>> result = asyncio.run(grader.aevaluate(
         ...     plan="I will open drawer 1 to find the key.",
         ...     action="open drawer 1"
-        ... )
+        ... ))
         >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
diff --git a/openjudge/graders/agent/action/action_loop.py b/openjudge/graders/agent/action/action_loop.py
@@ -21,10 +21,11 @@ class ActionLoopDetectionGrader(BaseGrader):
     all pairs of actions for similarity and penalizing based on the proportion
     of similar action pairs found.
     Example:
+        >>> import asyncio
         >>> grader = ActionLoopDetectionGrader(similarity_threshold=1.0)
-        >>> result = await grader.aevaluate(
+        >>> result = asyncio.run(grader.aevaluate(
         ...     messages=[...],
-        ... )
+        ... ))
         >>> print(f"Loop detection score: {result.score}")
     """
 
diff --git a/openjudge/graders/agent/memory/memory_accuracy.py b/openjudge/graders/agent/memory/memory_accuracy.py
@@ -165,7 +165,7 @@ class MemoryAccuracyGrader(LLMGrader):
         >>> result = asyncio.run(grader.aevaluate(
         ...     observation="You see a closed cabinet.",
         ...     memory="The cabinet is closed."
-        ... )
+        ... ))
         >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
diff --git a/openjudge/graders/agent/memory/memory_detail_preservation.py b/openjudge/graders/agent/memory/memory_detail_preservation.py
@@ -165,7 +165,7 @@ class MemoryDetailPreservationGrader(LLMGrader):
         >>> result = asyncio.run(grader.aevaluate(
         ...     observation="Cabinet 1 at coordinates (3.5, 2.1) contains 5 red apples.",
         ...     memory="Cabinet 1 at (3.5, 2.1) has 5 red apples."
-        ... )
+        ... ))
         >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
diff --git a/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py b/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py
@@ -167,7 +167,7 @@ class MemoryRetrievalEffectivenessGrader(LLMGrader):
         >>> result = asyncio.run(grader.aevaluate(
         ...     observation="You see a closed cabinet.",
         ...     memory="The cabinet is closed."
-        ... )
+        ... ))
         >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
diff --git a/openjudge/graders/agent/observation/observation_information_gain.py b/openjudge/graders/agent/observation/observation_information_gain.py
@@ -23,10 +23,11 @@ class ObservationInformationGainGrader(BaseGrader):
     Attributes:
         similarity_threshold: Threshold for considering observations as redundant
     Example:
+        >>> import asyncio
         >>> grader = ObservationInformationGainGrader(similarity_threshold=0.5)
-        >>> result = await grader.aevaluate(
+        >>> result = asyncio.run(grader.aevaluate(
         ...     messages=[...],  # List of message dicts
-        ... )
+        ... ))
         >>> print(f"Info gain score: {result.score}")
     """
 
diff --git a/openjudge/graders/agent/plan/plan_feasibility.py b/openjudge/graders/agent/plan/plan_feasibility.py
@@ -168,7 +168,7 @@ class PlanFeasibilityGrader(LLMGrader):
         ...     plan="I will first open the drawer to get the key, then use it to unlock the door.",
         ...     observation="The drawer is closed. You don't have any items.",
         ...     memory="The key is inside the drawer."
-        ... )
+        ... ))
         >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
diff --git a/openjudge/graders/agent/reflection/reflection_accuracy.py b/openjudge/graders/agent/reflection/reflection_accuracy.py
@@ -165,7 +165,7 @@ class ReflectionAccuracyGrader(LLMGrader):
         >>> result = asyncio.run(grader.aevaluate(
         ...     observation="You see a closed cabinet.",
         ...     reflection="I observed a closed cabinet."
-        ... )
+        ... ))
         >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
diff --git a/openjudge/graders/agent/reflection/reflection_outcome_understanding.py b/openjudge/graders/agent/reflection/reflection_outcome_understanding.py
@@ -289,7 +289,7 @@ class ReflectionOutcomeUnderstandingGrader(LLMGrader):
         >>> result = asyncio.run(grader.aevaluate(
         ...     observation="The drawer is now open.",
         ...     reflection="I successfully opened the drawer."
-        ... )
+        ... ))
         >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
diff --git a/openjudge/graders/agent/reflection/reflection_progress_awareness.py b/openjudge/graders/agent/reflection/reflection_progress_awareness.py
@@ -206,7 +206,7 @@ class ReflectionProgressAwarenessGrader(LLMGrader):
         ...     observation="Cabinet 1 now has apples. Task complete.",
         ...     reflection="Good progress! I've successfully found the apples.",
         ...     context="Task: Find apples in cabinets"
-        ... )
+        ... ))
         >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
diff --git a/openjudge/graders/agent/tool/tool_call_sequence_match.py b/openjudge/graders/agent/tool/tool_call_sequence_match.py
@@ -31,11 +31,12 @@ class ToolCallSequenceMatchGrader(BaseGrader):
         strict_mode: If True, matches both tool_call name and arguments; if False, only matches tool_call name
         use_jaccard_similarity: If True, use Jaccard similarity for loose mode (ignores step order)
     Example:
+        >>> import asyncio
         >>> grader = ToolCallSequenceMatchGrader(strict_mode=True)
-        >>> result = await grader.aevaluate(
+        >>> result = asyncio.run(grader.aevaluate(
         ...     messages=[...],  # Model's messages with tool calls
         ...     reference_tool_calls=[...]  # Ground truth reference tool calls
-        ... )
+        ... ))
         >>> print(f"Sequence match score: {result.score}")
     """
 
diff --git a/openjudge/graders/common/correctness.py b/openjudge/graders/common/correctness.py
@@ -18,8 +18,10 @@
 from openjudge.models.schema.prompt_template import LanguageEnum, PromptTemplate
 
 # English Prompt
-CORRECTNESS_PROMPT_EN = """
-You are a professional data annotator responsible for evaluating whether the model response matches the provided correct response (reference response). Your task is to score according to the following criteria:
+CORRECTNESS_PROMPT_EN = textwrap.dedent(
+    """
+You are a professional data annotator responsible for evaluating whether the model response matches the provided
+correct response (reference response). Your task is to score according to the following criteria:
 
 <Scoring Criteria>
 A response that perfectly matches the reference response should:
@@ -51,7 +53,9 @@
 </Guidance>
 
 <Reminder>
-The goal is to evaluate correctness against reference response, not general quality. A well-written response that contradicts the reference response should score low. A simple response that accurately reflects and properly uses the reference response should score high. Consider both accuracy and appropriate application of the reference response.
+The goal is to evaluate correctness against reference response, not general quality. A well-written response that
+contradicts the reference response should score low. A simple response that accurately reflects and properly uses the
+reference response should score high. Consider both accuracy and appropriate application of the reference response.
 </Reminder>
 
 <query>
@@ -75,22 +79,30 @@
 # Output Instructions
 Provide your evaluation in the following structured JSON format:
 {{
-    "score": <integer between 1 and 5, where 5 means perfect match with reference response and 1 means complete deviation from reference response>,
-    "reason": "<brief explanation for the assigned score, specifically mentioning how the response aligns with or deviates from the reference response>"
+    "score": <integer between 1 and 5, where 5 means perfect match with reference response and 1 means complete
+    deviation from reference response>,
+    "reason": "<brief explanation for the assigned score, specifically mentioning how the response aligns with or
+    deviates from the reference response>"
 }}
 
 Scoring Scale:
-- 5: The answer is completely consistent with the reference answer in terms of facts, key details, logic, and conclusions. Different wording is acceptable as long as the meaning is equivalent.
-- 4: The core conclusion of the answer is consistent with the reference answer, but there are non-critical omissions, vague statements, or minor errors that do not affect user understanding and use.
-- 3: The answer contains some correct information, but omits key points, contains verifiable errors, or significantly misinterprets the reference content.
-- 2: The core conclusion or key facts of the answer contradict the reference answer, containing only a few superficially related words, and are generally misleading.
+- 5: The answer is completely consistent with the reference answer in terms of facts, key details, logic, and
+conclusions. Different wording is acceptable as long as the meaning is equivalent.
+- 4: The core conclusion of the answer is consistent with the reference answer, but there are non-critical omissions,
+vague statements, or minor errors that do not affect user understanding and use.
+- 3: The answer contains some correct information, but omits key points, contains verifiable errors, or significantly
+misinterprets the reference content.
+- 2: The core conclusion or key facts of the answer contradict the reference answer, containing only a few superficially
+related words, and are generally misleading.
 - 1: The answer is completely unrelated to or directly contradicts the reference answer.
 
 JSON:
 """
+).strip()
 
 # Chinese Prompt
-CORRECTNESS_PROMPT_ZH = """
+CORRECTNESS_PROMPT_ZH = textwrap.dedent(
+    """
 你是一名专业的数据标注员，负责评估模型输出是否与提供的参考回答（reference response）一致。你的任务是根据以下标准进行评分：
 
 <评分标准>
@@ -123,7 +135,8 @@
 </指导>
 
 <提醒>
-目标是评估与参考回答的正确性，而不是一般质量。一个写得很好但与参考回答矛盾的回答应该得分低。一个简单但准确反映并正确使用参考回答的回答应该得分高。同时考虑准确性和参考回答的适当应用。
+目标是评估与参考回答的正确性，而不是一般质量。一个写得很好但与参考回答矛盾的回答应该得分低。
+一个简单但准确反映并正确使用参考回答的回答应该得分高。同时考虑准确性和参考回答的适当应用。
 </提醒>
 
 <查询>
@@ -160,20 +173,21 @@
 
 JSON:
 """
+).strip()
 
 # Build default template from prompts
 DEFAULT_CORRECTNESS_TEMPLATE = PromptTemplate(
     messages={
         LanguageEnum.EN: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(CORRECTNESS_PROMPT_EN),
+                content=CORRECTNESS_PROMPT_EN,
             ),
         ],
         LanguageEnum.ZH: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(CORRECTNESS_PROMPT_ZH),
+                content=CORRECTNESS_PROMPT_ZH,
             ),
         ],
     },
@@ -225,6 +239,7 @@ class CorrectnessGrader(LLMGrader):
             - metadata: Threshold and evaluation details
 
     Example:
+        >>> import asyncio
         >>> from openjudge.model.openai_llm import OpenAIChatModel
         >>> from openjudge.llm_judge import CorrectnessGrader
         >>>
@@ -233,19 +248,19 @@ class CorrectnessGrader(LLMGrader):
         >>> grader = CorrectnessGrader(model=model, threshold=0.7)
         >>>
         >>> # Good match
-        >>> result = await grader.aevaluate(
+        >>> result = asyncio.run(grader.aevaluate(
         ...     query="When was the product launched?",
         ...     response="The product launched in Q1 2023 in Europe, capturing 50% market share.",
         ...     reference_response="Product launched Q1 2023 in Europe with 50% market share."
-        ... )
+        ... ))
         >>> print(result.score)  # 5 - accurate to reference response
         >>>
         >>> # Poor match
-        >>> result = await grader.aevaluate(
+        >>> result = asyncio.run(grader.aevaluate(
         ...     query="When and where was the product launched?",
         ...     response="The product was launched in early 2023 in European markets.",
         ...     reference_response="The product was launched in Q1 2023 in Europe."
-        ... )
+        ... ))
         >>> print(result.score)  # 2 - deviates from reference response
     """
 
diff --git a/openjudge/graders/common/hallucination.py b/openjudge/graders/common/hallucination.py
@@ -20,7 +20,8 @@
 # pylint: disable=line-too-long
 
 # English Prompt
-HALLUCINATION_PROMPT_EN = """
+HALLUCINATION_PROMPT_EN = textwrap.dedent(
+    """
 You are a professional data annotator responsible for evaluating whether the model response contains hallucinations. Your task is to score according to the following criteria:
 
 <Scoring Criteria>
@@ -80,9 +81,11 @@
 
 JSON:
 """
+).strip()
 
 # Chinese Prompt
-HALLUCINATION_PROMPT_ZH = """
+HALLUCINATION_PROMPT_ZH = textwrap.dedent(
+    """
 你是一名专业的数据标注员，负责评估模型输出是否包含幻觉（虚构信息）。你的任务是根据以下标准进行评分：
 
 <评分标准>
@@ -141,6 +144,7 @@
 
 JSON:
 """
+).strip()
 
 
 # Build default template from prompts
@@ -149,13 +153,13 @@
         LanguageEnum.EN: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(HALLUCINATION_PROMPT_EN),
+                content=HALLUCINATION_PROMPT_EN,
             ),
         ],
         LanguageEnum.ZH: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(HALLUCINATION_PROMPT_ZH),
+                content=HALLUCINATION_PROMPT_ZH,
             ),
         ],
     },
@@ -205,6 +209,7 @@ class HallucinationGrader(LLMGrader):
             - metadata: Threshold and evaluation details
 
     Example:
+        >>> import asyncio
         >>> from openjudge.model.openai_llm import OpenAIChatModel
         >>> from openjudge.llm_judge import HallucinationGrader
         >>>
@@ -219,28 +224,28 @@ class HallucinationGrader(LLMGrader):
         >>> grader = HallucinationGrader(model=model, threshold=0.7)
         >>>
         >>> # With context: Good output (grounded in context)
-        >>> result = await grader.aevaluate(
+        >>> result = asyncio.run(grader.aevaluate(
         ...     query="When was the company founded?",
         ...     response="The company was founded in 2020 in San Francisco.",
         ...     context="The company was founded in 2020 in San Francisco."
-        ... )
+        ... ))
         >>> print(result.score)  # 5 - no hallucinations
         >>> print(result.reason)  # "Output is fully supported by context"
         >>>
         >>> # With context: Bad output (contains hallucination)
-        >>> result = await grader.aevaluate(
+        >>> result = asyncio.run(grader.aevaluate(
         ...     query="When was the company founded?",
         ...     response="The company was founded in 2020 with 100 employees.",
         ...     context="The company was founded in 2020 in San Francisco."
-        ... )
+        ... ))
         >>> print(result.score)  # 3 - contains unsupported claim about employees
         >>> print(result.reason)  # "Output contains hallucination: '100 employees' not mentioned"
         >>>
         >>> # Without context: Factual verification
-        >>> result = await grader.aevaluate(
+        >>> result = asyncio.run(grader.aevaluate(
         ...     query="What is the capital of France?",
         ...     response="The capital of France is Paris."
-        ... )
+        ... ))
         >>> print(result.score)  # 5 - factually correct
     """
 
diff --git a/openjudge/graders/common/harmfulness.py b/openjudge/graders/common/harmfulness.py
diff --git a/openjudge/graders/common/instruction_following.py b/openjudge/graders/common/instruction_following.py
diff --git a/openjudge/graders/common/relevance.py b/openjudge/graders/common/relevance.py