
Commit a784b0f

feat: update agent graders (#35)
* feat: update agent graders
* feat: fix the example code
* feat: reformat the code
1 parent 10844f0 · commit a784b0f

13 files changed: +185 −152 lines

openjudge/graders/agent/action/action_alignment.py

Lines changed: 16 additions & 15 deletions
@@ -6,7 +6,7 @@
 """
 
 import textwrap
-from typing import Optional
+from typing import Any, Dict, List, Optional
 
 from loguru import logger
 
@@ -20,7 +20,8 @@
 # pylint: disable=line-too-long
 
 # English Prompt
-ACTION_ALIGNMENT_PROMPT_EN = """
+ACTION_ALIGNMENT_PROMPT_EN = textwrap.dedent(
+    """
 You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent executes an action that aligns with its stated plan or reasoning.
 
 <Evaluation Type: Action Alignment>
@@ -64,9 +65,11 @@
 
 JSON:
 """
+).strip()
 
 # Chinese Prompt
-ACTION_ALIGNMENT_PROMPT_ZH = """
+ACTION_ALIGNMENT_PROMPT_ZH = textwrap.dedent(
+    """
 你是一名分析智能体行为的专家。你的任务是评估智能体是否执行了与其声明的计划或推理一致的动作。
 
 <评估类型:动作对齐>
@@ -110,20 +113,21 @@
 
 JSON:
 """
+).strip()
 
 # Build default template from prompts
 DEFAULT_ACTION_ALIGNMENT_TEMPLATE = PromptTemplate(
     messages={
         LanguageEnum.EN: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(ACTION_ALIGNMENT_PROMPT_EN),
+                content=ACTION_ALIGNMENT_PROMPT_EN,
             ),
         ],
         LanguageEnum.ZH: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(ACTION_ALIGNMENT_PROMPT_ZH),
+                content=ACTION_ALIGNMENT_PROMPT_ZH,
             ),
         ],
     },
@@ -145,25 +149,24 @@ class ActionAlignmentGrader(LLMGrader):
         language: Language for evaluation prompts (default: LanguageEnum.EN)
 
     Example:
+        >>> import asyncio
         >>> from openjudge.model.openai_llm import OpenAIChatModel
-        >>> from openjudge.schema.template import LanguageEnum
+        >>> from openjudge.models.schema.prompt_template import LanguageEnum
         >>>
         >>> api = OpenAIChatModel(
-        ...     api_key="your-key",  # pragma: allowlist secret
+        ...     api_key="your-key",
         ...     model="qwen3-max",
         ...     generate_kwargs={"temperature": 0.1}
         ... )
-        >>>
         >>> grader = ActionAlignmentGrader(
         ...     model=api,
         ...     language=LanguageEnum.EN
         ... )
-        >>>
-        >>> result = await grader.aevaluate(
+        >>> result = asyncio.run(grader.aevaluate(
         ...     plan="I will open drawer 1 to find the key.",
         ...     action="open drawer 1"
-        ... )
+        ... ))
-        >>> print(f"Score: {result.score}")  # 1.0 (good alignment)
+        >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
     def __init__(
@@ -194,7 +197,7 @@ async def aevaluate(
         self,
         plan: str,
         action: str,
-        history: Optional[list] = None,
+        history: Optional[List[Dict[str, Any]]] = None,
         context: Optional[str] = None,
     ) -> GraderScore:
         """
@@ -217,9 +220,7 @@ async def aevaluate(
         ...     )
         """
         # Format context section
-        context_str = ""
-        if context:
-            context_str = f"<context>\n{context}\n</context>"
+        context_str = f"<context>\n{context}\n</context>" if context else ""
 
         # Format history
         history_str = format_history(history)
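
The recurring change across these files: textwrap.dedent(...) moves from the PromptTemplate construction site to the constant's definition, with a trailing .strip(), so each prompt is normalized once at import time. A minimal sketch of the pattern with a placeholder prompt (the text here is illustrative, not the module's actual prompt):

import textwrap

# Before: the raw constant keeps its indentation, so every use site
# has to call textwrap.dedent() on it.
PROMPT_RAW = """
    Evaluate the agent's action.

    JSON:
"""

# After: dedent and strip once at module load; use sites consume the
# constant directly.
PROMPT = textwrap.dedent(
    """
    Evaluate the agent's action.

    JSON:
    """
).strip()

# Both routes yield the same text; only where the dedent happens changes.
assert textwrap.dedent(PROMPT_RAW).strip() == PROMPT

Dedenting at definition also means any other consumer of the constant gets the cleaned text without having to remember to dedent it.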

openjudge/graders/agent/memory/memory_accuracy.py

Lines changed: 16 additions & 15 deletions
@@ -6,7 +6,7 @@
 """
 
 import textwrap
-from typing import Any, Optional
+from typing import Any, Dict, List, Optional
 
 from loguru import logger
 
@@ -20,7 +20,8 @@
 # pylint: disable=line-too-long
 
 # English Prompt
-MEMORY_ACCURACY_PROMPT_EN = """
+MEMORY_ACCURACY_PROMPT_EN = textwrap.dedent(
+    """
 You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent stores accurate and factual information in its memory module.
 
 <Evaluation Type: Memory Accuracy>
@@ -64,9 +65,11 @@
 
 JSON:
 """
+).strip()
 
 # Chinese Prompt
-MEMORY_ACCURACY_PROMPT_ZH = """
+MEMORY_ACCURACY_PROMPT_ZH = textwrap.dedent(
+    """
 你是一名分析智能体行为的专家。你的任务是评估智能体是否在其记忆模块中存储了准确且真实的信息。
 
 <评估类型:记忆准确性>
@@ -110,20 +113,21 @@
 
 JSON:
 """
+).strip()
 
 # Build default template from prompts
 DEFAULT_MEMORY_ACCURACY_TEMPLATE = PromptTemplate(
     messages={
         LanguageEnum.EN: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(MEMORY_ACCURACY_PROMPT_EN),
+                content=MEMORY_ACCURACY_PROMPT_EN,
             ),
         ],
         LanguageEnum.ZH: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(MEMORY_ACCURACY_PROMPT_ZH),
+                content=MEMORY_ACCURACY_PROMPT_ZH,
            ),
         ],
     },
@@ -145,25 +149,24 @@ class MemoryAccuracyGrader(LLMGrader):
         language: Language for evaluation prompts (default: LanguageEnum.EN)
 
     Example:
+        >>> import asyncio
         >>> from openjudge.model.openai_llm import OpenAIChatModel
-        >>> from openjudge.schema.template import LanguageEnum
+        >>> from openjudge.models.schema.prompt_template import LanguageEnum
         >>>
         >>> api = OpenAIChatModel(
-        ...     api_key="your-key",  # pragma: allowlist secret
+        ...     api_key="your-key",
         ...     model="qwen3-max",
         ...     generate_kwargs={"temperature": 0.1}
         ... )
-        >>>
         >>> grader = MemoryAccuracyGrader(
         ...     model=api,
         ...     language=LanguageEnum.EN
         ... )
-        >>>
-        >>> result = await grader.aevaluate(
+        >>> result = asyncio.run(grader.aevaluate(
         ...     observation="You see a closed cabinet.",
         ...     memory="The cabinet is closed."
-        ... )
+        ... ))
-        >>> print(f"Score: {result.score}")  # 1.0 (good accuracy)
+        >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
     def __init__(
@@ -185,7 +188,7 @@ async def aevaluate(
         self,
         observation: str,
         memory: str,
-        history: Optional[list] = None,
+        history: Optional[List[Dict[str, Any]]] = None,
         context: Optional[str] = None,
         **kwargs: Any,
     ) -> GraderScore:
@@ -210,9 +213,7 @@ async def aevaluate(
         ...     )
         """
         # Format context section
-        context_str = ""
-        if context:
-            context_str = f"<context>\n{context}\n</context>"
+        context_str = f"<context>\n{context}\n</context>" if context else ""
 
         # Format history
         history_str = format_history(history)
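
All three graders also tighten the history annotation from Optional[list] to Optional[List[Dict[str, Any]]]: a list of per-step dictionaries. The diff never shows format_history itself, so the sketch below is only a hypothetical stand-in illustrating the annotated shape; the real helper and the action/observation keys are assumptions:

from typing import Any, Dict, List, Optional

def format_history(history: Optional[List[Dict[str, Any]]]) -> str:
    """Hypothetical stand-in for openjudge's format_history helper."""
    if not history:
        return ""
    lines = []
    for i, step in enumerate(history, start=1):
        # Render each step dict as one numbered line of key=value pairs.
        fields = ", ".join(f"{key}={value!r}" for key, value in step.items())
        lines.append(f"Step {i}: {fields}")
    return "\n".join(lines)

# A history value matching the new annotation.
history = [
    {"action": "go to cabinet 1", "observation": "You see a closed cabinet."},
    {"action": "open cabinet 1", "observation": "The cabinet is empty."},
]
print(format_history(history))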

openjudge/graders/agent/memory/memory_detail_preservation.py

Lines changed: 16 additions & 15 deletions
@@ -6,7 +6,7 @@
 """
 
 import textwrap
-from typing import Any, Optional
+from typing import Any, Dict, List, Optional
 
 from loguru import logger
 
@@ -20,7 +20,8 @@
 # pylint: disable=line-too-long
 
 # English Prompt
-MEMORY_DETAIL_PRESERVATION_PROMPT_EN = """
+MEMORY_DETAIL_PRESERVATION_PROMPT_EN = textwrap.dedent(
+    """
 You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent preserves important details when storing information in memory.
 
 <Evaluation Type: Memory Detail Preservation>
@@ -64,9 +65,11 @@
 
 JSON:
 """
+).strip()
 
 # Chinese Prompt
-MEMORY_DETAIL_PRESERVATION_PROMPT_ZH = """
+MEMORY_DETAIL_PRESERVATION_PROMPT_ZH = textwrap.dedent(
+    """
 你是一名分析智能体行为的专家。你的任务是评估智能体在将信息存储到记忆中时是否保留了重要细节。
 
 <评估类型:记忆细节保留>
@@ -110,20 +113,21 @@
 
 JSON:
 """
+).strip()
 
 # Build default template from prompts
 DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE = PromptTemplate(
     messages={
         LanguageEnum.EN: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(MEMORY_DETAIL_PRESERVATION_PROMPT_EN),
+                content=MEMORY_DETAIL_PRESERVATION_PROMPT_EN,
             ),
         ],
         LanguageEnum.ZH: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(MEMORY_DETAIL_PRESERVATION_PROMPT_ZH),
+                content=MEMORY_DETAIL_PRESERVATION_PROMPT_ZH,
             ),
         ],
     },
@@ -145,25 +149,24 @@ class MemoryDetailPreservationGrader(LLMGrader):
         language: Language for evaluation prompts (default: LanguageEnum.EN)
 
     Example:
+        >>> import asyncio
         >>> from openjudge.model.openai_llm import OpenAIChatModel
-        >>> from openjudge.schema.template import LanguageEnum
+        >>> from openjudge.models.schema.prompt_template import LanguageEnum
        >>>
         >>> api = OpenAIChatModel(
-        ...     api_key="your-key",  # pragma: allowlist secret
+        ...     api_key="your-key",
         ...     model="qwen3-max",
         ...     generate_kwargs={"temperature": 0.1}
         ... )
-        >>>
         >>> grader = MemoryDetailPreservationGrader(
         ...     model=api,
         ...     language=LanguageEnum.EN
         ... )
-        >>>
-        >>> result = await grader.aevaluate(
+        >>> result = asyncio.run(grader.aevaluate(
         ...     observation="Cabinet 1 at coordinates (3.5, 2.1) contains 5 red apples.",
         ...     memory="Cabinet 1 at (3.5, 2.1) has 5 red apples."
-        ... )
+        ... ))
-        >>> print(f"Score: {result.score}")  # 1.0 (good detail preservation)
+        >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
     def __init__(
@@ -185,7 +188,7 @@ async def aevaluate(
         self,
         observation: str,
         memory: str,
-        history: Optional[list] = None,
+        history: Optional[List[Dict[str, Any]]] = None,
         context: Optional[str] = None,
         **kwargs: Any,
     ) -> GraderScore:
@@ -210,9 +213,7 @@ async def aevaluate(
         ...     )
         """
         # Format context section
-        context_str = ""
-        if context:
-            context_str = f"<context>\n{context}\n</context>"
+        context_str = f"<context>\n{context}\n</context>" if context else ""
 
         # Format history
         history_str = format_history(history)
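
The final hunk in each file folds the three-line context_str assignment into a conditional expression. Behavior is unchanged; a minimal standalone check of the equivalence, including the falsy cases (function names here are illustrative):

# Old form: assign a default, then overwrite when context is truthy.
def old_style(context):
    context_str = ""
    if context:
        context_str = f"<context>\n{context}\n</context>"
    return context_str

# New form: single conditional expression, same result.
def new_style(context):
    return f"<context>\n{context}\n</context>" if context else ""

# Truthy, empty-string, and None inputs all agree.
for case in ("goal: find the key", "", None):
    assert old_style(case) == new_style(case)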
