agentscope-ai · ployts · Jan 8, 2026 · Jan 7, 2026 · Jan 7, 2026 · Jan 7, 2026
diff --git a/openjudge/graders/agent/action/action_alignment.py b/openjudge/graders/agent/action/action_alignment.py
@@ -6,7 +6,7 @@
 """
 
 import textwrap
-from typing import Optional
+from typing import Optional, Any, Dict, List
 
 from loguru import logger
 
@@ -19,7 +19,7 @@
 # pylint: disable=line-too-long
 
 # English Prompt
-ACTION_ALIGNMENT_PROMPT_EN = """
+ACTION_ALIGNMENT_PROMPT_EN = textwrap.dedent("""
 You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent executes an action that aligns with its stated plan or reasoning.
 
 <Evaluation Type: Action Alignment>
@@ -62,10 +62,10 @@
 }}
 
 JSON:
-"""
+""").strip()
 
 # Chinese Prompt
-ACTION_ALIGNMENT_PROMPT_ZH = """
+ACTION_ALIGNMENT_PROMPT_ZH = textwrap.dedent("""
 你是一名分析智能体行为的专家。你的任务是评估智能体是否执行了与其声明的计划或推理一致的动作。
 
 <评估类型：动作对齐>
@@ -108,21 +108,21 @@
 }}
 
 JSON:
-"""
+""").strip()
 
 # Build default template from prompts
 DEFAULT_ACTION_ALIGNMENT_TEMPLATE = PromptTemplate(
     messages={
         LanguageEnum.EN: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(ACTION_ALIGNMENT_PROMPT_EN),
+                content=ACTION_ALIGNMENT_PROMPT_EN,
             ),
         ],
         LanguageEnum.ZH: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(ACTION_ALIGNMENT_PROMPT_ZH),
+                content=ACTION_ALIGNMENT_PROMPT_ZH,
             ),
         ],
     },
@@ -144,25 +144,24 @@ class ActionAlignmentGrader(LLMGrader):
         language: Language for evaluation prompts (default: LanguageEnum.EN)
 
     Example:
+        >>> import asyncio
         >>> from openjudge.model.openai_llm import OpenAIChatModel
-        >>> from openjudge.schema.template import LanguageEnum
+        >>> from openjudge.models.schema.prompt_template import LanguageEnum
         >>>
         >>> api = OpenAIChatModel(
         ...     api_key="your-key",  # pragma: allowlist secret
         ...     model="qwen3-max",
         ...     generate_kwargs={"temperature": 0.1}
         ... )
-        >>>
         >>> grader = ActionAlignmentGrader(
         ...     model=api,
         ...     language=LanguageEnum.EN
         ... )
-        >>>
-        >>> result = await grader.aevaluate(
+        >>> result = asyncio.run(grader.aevaluate(
         ...     plan="I will open drawer 1 to find the key.",
         ...     action="open drawer 1"
-        ... )
+        ... ))
-        >>> print(f"Score: {result.score}")  # 1.0 (good alignment)
+        >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
     def __init__(
@@ -190,7 +189,7 @@ def __init__(
         )
         self.template = template if template is not None else DEFAULT_ACTION_ALIGNMENT_TEMPLATE
 
-    def _format_history(self, history: Optional[list] = None) -> str:
+    def _format_history(self, history: Optional[List[Dict[str, Any]]] = None) -> str:
         """Format history steps for evaluation.
 
         Args:
@@ -203,8 +202,8 @@ def _format_history(self, history: Optional[list] = None) -> str:
             return ""
 
         lines = ["<History Steps>"]
-        for i, hist_step in enumerate(history):
-            lines.append(f"Step {i + 1}:")
+        for i, hist_step in enumerate(history, start=1):
+            lines.append(f"Step {i}:")
             for key, value in hist_step.items():
                 if value:
                     lines.append(f"{key.capitalize()}: {value}")
@@ -217,7 +216,7 @@ async def aevaluate(
         self,
         plan: str,
         action: str,
-        history: Optional[list] = None,
+        history: Optional[List[Dict[str, Any]]] = None,
         context: Optional[str] = None,
     ) -> GraderScore:
         """
@@ -240,9 +239,7 @@ async def aevaluate(
             ... )
         """
         # Format context section
-        context_str = ""
-        if context:
-            context_str = f"<context>\n{context}\n</context>"
+        context_str = f"<context>\n{context}\n</context>" if context else ""
 
         # Format history
         history_str = self._format_history(history)

diff --git a/openjudge/graders/agent/memory/memory_accuracy.py b/openjudge/graders/agent/memory/memory_accuracy.py
@@ -6,7 +6,7 @@
 """
 
 import textwrap
-from typing import Any, Optional
+from typing import Optional, Any, Dict, List
 
 from loguru import logger
 
@@ -19,7 +19,7 @@
 # pylint: disable=line-too-long
 
 # English Prompt
-MEMORY_ACCURACY_PROMPT_EN = """
+MEMORY_ACCURACY_PROMPT_EN = textwrap.dedent("""
 You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent stores accurate and factual information in its memory module.
 
 <Evaluation Type: Memory Accuracy>
@@ -62,10 +62,10 @@
 }}
 
 JSON:
-"""
+""").strip()
 
 # Chinese Prompt
-MEMORY_ACCURACY_PROMPT_ZH = """
+MEMORY_ACCURACY_PROMPT_ZH = textwrap.dedent("""
 你是一名分析智能体行为的专家。你的任务是评估智能体是否在其记忆模块中存储了准确且真实的信息。
 
 <评估类型：记忆准确性>
@@ -108,21 +108,21 @@
 }}
 
 JSON:
-"""
+""").strip()
 
 # Build default template from prompts
 DEFAULT_MEMORY_ACCURACY_TEMPLATE = PromptTemplate(
     messages={
         LanguageEnum.EN: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(MEMORY_ACCURACY_PROMPT_EN),
+                content=MEMORY_ACCURACY_PROMPT_EN,
             ),
         ],
         LanguageEnum.ZH: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(MEMORY_ACCURACY_PROMPT_ZH),
+                content=MEMORY_ACCURACY_PROMPT_ZH,
             ),
         ],
     },
@@ -144,25 +144,24 @@ class MemoryAccuracyGrader(LLMGrader):
         language: Language for evaluation prompts (default: LanguageEnum.EN)
 
     Example:
+        >>> import asyncio
         >>> from openjudge.model.openai_llm import OpenAIChatModel
-        >>> from openjudge.schema.template import LanguageEnum
+        >>> from openjudge.models.schema.prompt_template import LanguageEnum
         >>>
         >>> api = OpenAIChatModel(
         ...     api_key="your-key",  # pragma: allowlist secret
         ...     model="qwen3-max",
         ...     generate_kwargs={"temperature": 0.1}
         ... )
-        >>>
         >>> grader = MemoryAccuracyGrader(
         ...     model=api,
         ...     language=LanguageEnum.EN
         ... )
-        >>>
-        >>> result = await grader.aevaluate(
+        >>> result = asyncio.run(grader.aevaluate(
         ...     observation="You see a closed cabinet.",
         ...     memory="The cabinet is closed."
-        ... )
+        ... ))
-        >>> print(f"Score: {result.score}")  # 1.0 (good accuracy)
+        >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
     def __init__(
@@ -180,7 +179,7 @@ def __init__(
             language=language,
         )
 
-    def _format_history(self, history: Optional[list] = None) -> str:
+    def _format_history(self, history: Optional[List[Dict[str, Any]]] = None) -> str:
         """Format history steps for evaluation.
 
         Args:
@@ -193,8 +192,8 @@ def _format_history(self, history: Optional[list] = None) -> str:
             return ""
 
         lines = ["<History Steps>"]
-        for i, hist_step in enumerate(history):
-            lines.append(f"Step {i + 1}:")
+        for i, hist_step in enumerate(history, start=1):
+            lines.append(f"Step {i}:")
             for key, value in hist_step.items():
                 if value:
                     lines.append(f"{key.capitalize()}: {value}")
@@ -207,7 +206,7 @@ async def aevaluate(
         self,
         observation: str,
         memory: str,
-        history: Optional[list] = None,
+        history: Optional[List[Dict[str, Any]]] = None,
         context: Optional[str] = None,
         **kwargs: Any,
     ) -> GraderScore:
@@ -232,9 +231,7 @@ async def aevaluate(
             ... )
         """
         # Format context section
-        context_str = ""
-        if context:
-            context_str = f"<context>\n{context}\n</context>"
+        context_str = f"<context>\n{context}\n</context>" if context else ""
 
         # Format history
         history_str = self._format_history(history)

diff --git a/openjudge/graders/agent/memory/memory_detail_preservation.py b/openjudge/graders/agent/memory/memory_detail_preservation.py
@@ -6,7 +6,7 @@
 """
 
 import textwrap
-from typing import Any, Optional
+from typing import Optional, Any, Dict, List
 
 from loguru import logger
 
@@ -19,7 +19,7 @@
 # pylint: disable=line-too-long
 
 # English Prompt
-MEMORY_DETAIL_PRESERVATION_PROMPT_EN = """
+MEMORY_DETAIL_PRESERVATION_PROMPT_EN = textwrap.dedent("""
 You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent preserves important details when storing information in memory.
 
 <Evaluation Type: Memory Detail Preservation>
@@ -62,10 +62,10 @@
 }}
 
 JSON:
-"""
+""").strip()
 
 # Chinese Prompt
-MEMORY_DETAIL_PRESERVATION_PROMPT_ZH = """
+MEMORY_DETAIL_PRESERVATION_PROMPT_ZH = textwrap.dedent("""
 你是一名分析智能体行为的专家。你的任务是评估智能体在将信息存储到记忆中时是否保留了重要细节。
 
 <评估类型：记忆细节保留>
@@ -108,21 +108,21 @@
 }}
 
 JSON:
-"""
+""").strip()
 
 # Build default template from prompts
 DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE = PromptTemplate(
     messages={
         LanguageEnum.EN: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(MEMORY_DETAIL_PRESERVATION_PROMPT_EN),
+                content=MEMORY_DETAIL_PRESERVATION_PROMPT_EN,
             ),
         ],
         LanguageEnum.ZH: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(MEMORY_DETAIL_PRESERVATION_PROMPT_ZH),
+                content=MEMORY_DETAIL_PRESERVATION_PROMPT_ZH,
             ),
         ],
     },
@@ -144,25 +144,24 @@ class MemoryDetailPreservationGrader(LLMGrader):
         language: Language for evaluation prompts (default: LanguageEnum.EN)
 
     Example:
+        >>> import asyncio
         >>> from openjudge.model.openai_llm import OpenAIChatModel
-        >>> from openjudge.schema.template import LanguageEnum
+        >>> from openjudge.models.schema.prompt_template import LanguageEnum
         >>>
         >>> api = OpenAIChatModel(
         ...     api_key="your-key",  # pragma: allowlist secret
         ...     model="qwen3-max",
         ...     generate_kwargs={"temperature": 0.1}
         ... )
-        >>>
         >>> grader = MemoryDetailPreservationGrader(
         ...     model=api,
         ...     language=LanguageEnum.EN
         ... )
-        >>>
-        >>> result = await grader.aevaluate(
+        >>> result = asyncio.run(grader.aevaluate(
         ...     observation="Cabinet 1 at coordinates (3.5, 2.1) contains 5 red apples.",
         ...     memory="Cabinet 1 at (3.5, 2.1) has 5 red apples."
-        ... )
+        ... ))
-        >>> print(f"Score: {result.score}")  # 1.0 (good detail preservation)
+        >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
     def __init__(
@@ -181,7 +180,7 @@ def __init__(
         )
         self.template = template if template is not None else DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE
 
-    def _format_history(self, history: Optional[list] = None) -> str:
+    def _format_history(self, history: Optional[List[Dict[str, Any]]] = None) -> str:
         """Format history steps for evaluation.
 
         Args:
@@ -194,8 +193,8 @@ def _format_history(self, history: Optional[list] = None) -> str:
             return ""
 
         lines = ["<History Steps>"]
-        for i, hist_step in enumerate(history):
-            lines.append(f"Step {i + 1}:")
+        for i, hist_step in enumerate(history, start=1):
+            lines.append(f"Step {i}:")
             for key, value in hist_step.items():
                 if value:
                     lines.append(f"{key.capitalize()}: {value}")
@@ -208,7 +207,7 @@ async def aevaluate(
         self,
         observation: str,
         memory: str,
-        history: Optional[list] = None,
+        history: Optional[List[Dict[str, Any]]] = None,
         context: Optional[str] = None,
         **kwargs: Any,
     ) -> GraderScore:
@@ -233,9 +232,7 @@ async def aevaluate(
             ... )
         """
         # Format context section
-        context_str = ""
-        if context:
-            context_str = f"<context>\n{context}\n</context>"
+        context_str = f"<context>\n{context}\n</context>" if context else ""
 
         # Format history
         history_str = self._format_history(history)