Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 17 additions & 20 deletions openjudge/graders/agent/action/action_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""

import textwrap
from typing import Optional
from typing import Optional, Any, Dict, List

from loguru import logger

Expand All @@ -19,7 +19,7 @@
# pylint: disable=line-too-long

# English Prompt
ACTION_ALIGNMENT_PROMPT_EN = """
ACTION_ALIGNMENT_PROMPT_EN = textwrap.dedent("""
You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent executes an action that aligns with its stated plan or reasoning.

<Evaluation Type: Action Alignment>
Expand Down Expand Up @@ -62,10 +62,10 @@
}}

JSON:
"""
""").strip()

# Chinese Prompt
ACTION_ALIGNMENT_PROMPT_ZH = """
ACTION_ALIGNMENT_PROMPT_ZH = textwrap.dedent("""
你是一名分析智能体行为的专家。你的任务是评估智能体是否执行了与其声明的计划或推理一致的动作。

<评估类型:动作对齐>
Expand Down Expand Up @@ -108,21 +108,21 @@
}}

JSON:
"""
""").strip()

# Build default template from prompts
DEFAULT_ACTION_ALIGNMENT_TEMPLATE = PromptTemplate(
messages={
LanguageEnum.EN: [
ChatMessage(
role="user",
content=textwrap.dedent(ACTION_ALIGNMENT_PROMPT_EN),
content=ACTION_ALIGNMENT_PROMPT_EN,
),
],
LanguageEnum.ZH: [
ChatMessage(
role="user",
content=textwrap.dedent(ACTION_ALIGNMENT_PROMPT_ZH),
content=ACTION_ALIGNMENT_PROMPT_ZH,
),
],
},
Expand All @@ -144,25 +144,24 @@ class ActionAlignmentGrader(LLMGrader):
language: Language for evaluation prompts (default: LanguageEnum.EN)

Example:
>>> import asyncio
>>> from openjudge.model.openai_llm import OpenAIChatModel
>>> from openjudge.schema.template import LanguageEnum
>>> from openjudge.models.schema.prompt_template import LanguageEnum
>>>
>>> api = OpenAIChatModel(
... api_key="your-key", # pragma: allowlist secret
... api_key="your-key",
... model="qwen3-max",
... generate_kwargs={"temperature": 0.1}
... )
>>>
>>> grader = ActionAlignmentGrader(
... model=api,
... language=LanguageEnum.EN
... )
>>>
>>> result = await grader.aevaluate(
>>> result = asyncio.run(grader.aevaluate(
... plan="I will open drawer 1 to find the key.",
... action="open drawer 1"
... ))
>>> print(f"Score: {result.score}") # 1.0 (good alignment)
>>> print(f"Score: {result.score}") # Expected: 1.0
"""

def __init__(
Expand Down Expand Up @@ -190,7 +189,7 @@ def __init__(
)
self.template = template if template is not None else DEFAULT_ACTION_ALIGNMENT_TEMPLATE

def _format_history(self, history: Optional[list] = None) -> str:
def _format_history(self, history: Optional[List[Dict[str, Any]]] = None) -> str:
"""Format history steps for evaluation.

Args:
Expand All @@ -203,8 +202,8 @@ def _format_history(self, history: Optional[list] = None) -> str:
return ""

lines = ["<History Steps>"]
for i, hist_step in enumerate(history):
lines.append(f"Step {i + 1}:")
for i, hist_step in enumerate(history, start=1):
lines.append(f"Step {i}:")
for key, value in hist_step.items():
if value:
lines.append(f"{key.capitalize()}: {value}")
Expand All @@ -217,7 +216,7 @@ async def aevaluate(
self,
plan: str,
action: str,
history: Optional[list] = None,
history: Optional[List[Dict[str, Any]]] = None,
context: Optional[str] = None,
) -> GraderScore:
"""
Expand All @@ -240,9 +239,7 @@ async def aevaluate(
... )
"""
# Format context section
context_str = ""
if context:
context_str = f"<context>\n{context}\n</context>"
context_str = f"<context>\n{context}\n</context>" if context else ""

# Format history
history_str = self._format_history(history)
Expand Down
37 changes: 17 additions & 20 deletions openjudge/graders/agent/memory/memory_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""

import textwrap
from typing import Any, Optional
from typing import Optional, Any, Dict, List

from loguru import logger

Expand All @@ -19,7 +19,7 @@
# pylint: disable=line-too-long

# English Prompt
MEMORY_ACCURACY_PROMPT_EN = """
MEMORY_ACCURACY_PROMPT_EN = textwrap.dedent("""
You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent stores accurate and factual information in its memory module.

<Evaluation Type: Memory Accuracy>
Expand Down Expand Up @@ -62,10 +62,10 @@
}}

JSON:
"""
""").strip()

# Chinese Prompt
MEMORY_ACCURACY_PROMPT_ZH = """
MEMORY_ACCURACY_PROMPT_ZH = textwrap.dedent("""
你是一名分析智能体行为的专家。你的任务是评估智能体是否在其记忆模块中存储了准确且真实的信息。

<评估类型:记忆准确性>
Expand Down Expand Up @@ -108,21 +108,21 @@
}}

JSON:
"""
""").strip()

# Build default template from prompts
DEFAULT_MEMORY_ACCURACY_TEMPLATE = PromptTemplate(
messages={
LanguageEnum.EN: [
ChatMessage(
role="user",
content=textwrap.dedent(MEMORY_ACCURACY_PROMPT_EN),
content=MEMORY_ACCURACY_PROMPT_EN,
),
],
LanguageEnum.ZH: [
ChatMessage(
role="user",
content=textwrap.dedent(MEMORY_ACCURACY_PROMPT_ZH),
content=MEMORY_ACCURACY_PROMPT_ZH,
),
],
},
Expand All @@ -144,25 +144,24 @@ class MemoryAccuracyGrader(LLMGrader):
language: Language for evaluation prompts (default: LanguageEnum.EN)

Example:
>>> import asyncio
>>> from openjudge.model.openai_llm import OpenAIChatModel
>>> from openjudge.schema.template import LanguageEnum
>>> from openjudge.models.schema.prompt_template import LanguageEnum
>>>
>>> api = OpenAIChatModel(
... api_key="your-key", # pragma: allowlist secret
... api_key="your-key",
... model="qwen3-max",
... generate_kwargs={"temperature": 0.1}
... )
>>>
>>> grader = MemoryAccuracyGrader(
... model=api,
... language=LanguageEnum.EN
... )
>>>
>>> result = await grader.aevaluate(
>>> result = asyncio.run(grader.aevaluate(
... observation="You see a closed cabinet.",
... memory="The cabinet is closed."
... ))
>>> print(f"Score: {result.score}") # 1.0 (good accuracy)
>>> print(f"Score: {result.score}") # Expected: 1.0
"""

def __init__(
Expand All @@ -180,7 +179,7 @@ def __init__(
language=language,
)

def _format_history(self, history: Optional[list] = None) -> str:
def _format_history(self, history: Optional[List[Dict[str, Any]]] = None) -> str:
"""Format history steps for evaluation.

Args:
Expand All @@ -193,8 +192,8 @@ def _format_history(self, history: Optional[list] = None) -> str:
return ""

lines = ["<History Steps>"]
for i, hist_step in enumerate(history):
lines.append(f"Step {i + 1}:")
for i, hist_step in enumerate(history, start=1):
lines.append(f"Step {i}:")
for key, value in hist_step.items():
if value:
lines.append(f"{key.capitalize()}: {value}")
Expand All @@ -207,7 +206,7 @@ async def aevaluate(
self,
observation: str,
memory: str,
history: Optional[list] = None,
history: Optional[List[Dict[str, Any]]] = None,
context: Optional[str] = None,
**kwargs: Any,
) -> GraderScore:
Expand All @@ -232,9 +231,7 @@ async def aevaluate(
... )
"""
# Format context section
context_str = ""
if context:
context_str = f"<context>\n{context}\n</context>"
context_str = f"<context>\n{context}\n</context>" if context else ""

# Format history
history_str = self._format_history(history)
Expand Down
37 changes: 17 additions & 20 deletions openjudge/graders/agent/memory/memory_detail_preservation.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""

import textwrap
from typing import Any, Optional
from typing import Optional, Any, Dict, List

from loguru import logger

Expand All @@ -19,7 +19,7 @@
# pylint: disable=line-too-long

# English Prompt
MEMORY_DETAIL_PRESERVATION_PROMPT_EN = """
MEMORY_DETAIL_PRESERVATION_PROMPT_EN = textwrap.dedent("""
You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent preserves important details when storing information in memory.

<Evaluation Type: Memory Detail Preservation>
Expand Down Expand Up @@ -62,10 +62,10 @@
}}

JSON:
"""
""").strip()

# Chinese Prompt
MEMORY_DETAIL_PRESERVATION_PROMPT_ZH = """
MEMORY_DETAIL_PRESERVATION_PROMPT_ZH = textwrap.dedent("""
你是一名分析智能体行为的专家。你的任务是评估智能体在将信息存储到记忆中时是否保留了重要细节。

<评估类型:记忆细节保留>
Expand Down Expand Up @@ -108,21 +108,21 @@
}}

JSON:
"""
""").strip()

# Build default template from prompts
DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE = PromptTemplate(
messages={
LanguageEnum.EN: [
ChatMessage(
role="user",
content=textwrap.dedent(MEMORY_DETAIL_PRESERVATION_PROMPT_EN),
content=MEMORY_DETAIL_PRESERVATION_PROMPT_EN,
),
],
LanguageEnum.ZH: [
ChatMessage(
role="user",
content=textwrap.dedent(MEMORY_DETAIL_PRESERVATION_PROMPT_ZH),
content=MEMORY_DETAIL_PRESERVATION_PROMPT_ZH,
),
],
},
Expand All @@ -144,25 +144,24 @@ class MemoryDetailPreservationGrader(LLMGrader):
language: Language for evaluation prompts (default: LanguageEnum.EN)

Example:
>>> import asyncio
>>> from openjudge.model.openai_llm import OpenAIChatModel
>>> from openjudge.schema.template import LanguageEnum
>>> from openjudge.models.schema.prompt_template import LanguageEnum
>>>
>>> api = OpenAIChatModel(
... api_key="your-key", # pragma: allowlist secret
... api_key="your-key",
... model="qwen3-max",
... generate_kwargs={"temperature": 0.1}
... )
>>>
>>> grader = MemoryDetailPreservationGrader(
... model=api,
... language=LanguageEnum.EN
... )
>>>
>>> result = await grader.aevaluate(
>>> result = asyncio.run(grader.aevaluate(
... observation="Cabinet 1 at coordinates (3.5, 2.1) contains 5 red apples.",
... memory="Cabinet 1 at (3.5, 2.1) has 5 red apples."
... ))
>>> print(f"Score: {result.score}") # 1.0 (good detail preservation)
>>> print(f"Score: {result.score}") # Expected: 1.0
"""

def __init__(
Expand All @@ -181,7 +180,7 @@ def __init__(
)
self.template = template if template is not None else DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE

def _format_history(self, history: Optional[list] = None) -> str:
def _format_history(self, history: Optional[List[Dict[str, Any]]] = None) -> str:
"""Format history steps for evaluation.

Args:
Expand All @@ -194,8 +193,8 @@ def _format_history(self, history: Optional[list] = None) -> str:
return ""

lines = ["<History Steps>"]
for i, hist_step in enumerate(history):
lines.append(f"Step {i + 1}:")
for i, hist_step in enumerate(history, start=1):
lines.append(f"Step {i}:")
for key, value in hist_step.items():
if value:
lines.append(f"{key.capitalize()}: {value}")
Expand All @@ -208,7 +207,7 @@ async def aevaluate(
self,
observation: str,
memory: str,
history: Optional[list] = None,
history: Optional[List[Dict[str, Any]]] = None,
context: Optional[str] = None,
**kwargs: Any,
) -> GraderScore:
Expand All @@ -233,9 +232,7 @@ async def aevaluate(
... )
"""
# Format context section
context_str = ""
if context:
context_str = f"<context>\n{context}\n</context>"
context_str = f"<context>\n{context}\n</context>" if context else ""

# Format history
history_str = self._format_history(history)
Expand Down
Loading
Loading