diff --git a/openjudge/graders/agent/action/action_alignment.py b/openjudge/graders/agent/action/action_alignment.py index d8c025a8d..2e7fae83f 100644 --- a/openjudge/graders/agent/action/action_alignment.py +++ b/openjudge/graders/agent/action/action_alignment.py @@ -6,7 +6,7 @@ """ import textwrap -from typing import Optional +from typing import Any, Dict, List, Optional from loguru import logger @@ -20,7 +20,8 @@ # pylint: disable=line-too-long # English Prompt -ACTION_ALIGNMENT_PROMPT_EN = """ +ACTION_ALIGNMENT_PROMPT_EN = textwrap.dedent( + """ You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent executes an action that aligns with its stated plan or reasoning. @@ -64,9 +65,11 @@ JSON: """ +).strip() # Chinese Prompt -ACTION_ALIGNMENT_PROMPT_ZH = """ +ACTION_ALIGNMENT_PROMPT_ZH = textwrap.dedent( + """ 你是一名分析智能体行为的专家。你的任务是评估智能体是否执行了与其声明的计划或推理一致的动作。 <评估类型:动作对齐> @@ -110,6 +113,7 @@ JSON: """ +).strip() # Build default template from prompts DEFAULT_ACTION_ALIGNMENT_TEMPLATE = PromptTemplate( @@ -117,13 +121,13 @@ LanguageEnum.EN: [ ChatMessage( role="user", - content=textwrap.dedent(ACTION_ALIGNMENT_PROMPT_EN), + content=ACTION_ALIGNMENT_PROMPT_EN, ), ], LanguageEnum.ZH: [ ChatMessage( role="user", - content=textwrap.dedent(ACTION_ALIGNMENT_PROMPT_ZH), + content=ACTION_ALIGNMENT_PROMPT_ZH, ), ], }, @@ -145,25 +149,24 @@ class ActionAlignmentGrader(LLMGrader): language: Language for evaluation prompts (default: LanguageEnum.EN) Example: + >>> import asyncio >>> from openjudge.model.openai_llm import OpenAIChatModel - >>> from openjudge.schema.template import LanguageEnum + >>> from openjudge.models.schema.prompt_template import LanguageEnum >>> >>> api = OpenAIChatModel( - ... api_key="your-key", # pragma: allowlist secret + ... api_key="your-key", ... model="qwen3-max", ... generate_kwargs={"temperature": 0.1} ... ) - >>> >>> grader = ActionAlignmentGrader( ... model=api, ... language=LanguageEnum.EN ... ) - >>> - >>> result = await grader.aevaluate( + >>> result = asyncio.run(grader.aevaluate( ... plan="I will open drawer 1 to find the key.", ... action="open drawer 1" ... ) - >>> print(f"Score: {result.score}") # 1.0 (good alignment) + >>> print(f"Score: {result.score}") # Expected: 1.0 """ def __init__( @@ -194,7 +197,7 @@ async def aevaluate( self, plan: str, action: str, - history: Optional[list] = None, + history: Optional[List[Dict[str, Any]]] = None, context: Optional[str] = None, ) -> GraderScore: """ @@ -217,9 +220,7 @@ async def aevaluate( ... ) """ # Format context section - context_str = "" - if context: - context_str = f"\n{context}\n" + context_str = f"\n{context}\n" if context else "" # Format history history_str = format_history(history) diff --git a/openjudge/graders/agent/memory/memory_accuracy.py b/openjudge/graders/agent/memory/memory_accuracy.py index da23a4240..ef860ded5 100644 --- a/openjudge/graders/agent/memory/memory_accuracy.py +++ b/openjudge/graders/agent/memory/memory_accuracy.py @@ -6,7 +6,7 @@ """ import textwrap -from typing import Any, Optional +from typing import Any, Dict, List, Optional from loguru import logger @@ -20,7 +20,8 @@ # pylint: disable=line-too-long # English Prompt -MEMORY_ACCURACY_PROMPT_EN = """ +MEMORY_ACCURACY_PROMPT_EN = textwrap.dedent( + """ You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent stores accurate and factual information in its memory module. 
@@ -64,9 +65,11 @@ JSON: """ +).strip() # Chinese Prompt -MEMORY_ACCURACY_PROMPT_ZH = """ +MEMORY_ACCURACY_PROMPT_ZH = textwrap.dedent( + """ 你是一名分析智能体行为的专家。你的任务是评估智能体是否在其记忆模块中存储了准确且真实的信息。 <评估类型:记忆准确性> @@ -110,6 +113,7 @@ JSON: """ +).strip() # Build default template from prompts DEFAULT_MEMORY_ACCURACY_TEMPLATE = PromptTemplate( @@ -117,13 +121,13 @@ LanguageEnum.EN: [ ChatMessage( role="user", - content=textwrap.dedent(MEMORY_ACCURACY_PROMPT_EN), + content=MEMORY_ACCURACY_PROMPT_EN, ), ], LanguageEnum.ZH: [ ChatMessage( role="user", - content=textwrap.dedent(MEMORY_ACCURACY_PROMPT_ZH), + content=MEMORY_ACCURACY_PROMPT_ZH, ), ], }, @@ -145,25 +149,24 @@ class MemoryAccuracyGrader(LLMGrader): language: Language for evaluation prompts (default: LanguageEnum.EN) Example: + >>> import asyncio >>> from openjudge.model.openai_llm import OpenAIChatModel - >>> from openjudge.schema.template import LanguageEnum + >>> from openjudge.models.schema.prompt_template import LanguageEnum >>> >>> api = OpenAIChatModel( - ... api_key="your-key", # pragma: allowlist secret + ... api_key="your-key", ... model="qwen3-max", ... generate_kwargs={"temperature": 0.1} ... ) - >>> >>> grader = MemoryAccuracyGrader( ... model=api, ... language=LanguageEnum.EN ... ) - >>> - >>> result = await grader.aevaluate( + >>> result = asyncio.run(grader.aevaluate( ... observation="You see a closed cabinet.", ... memory="The cabinet is closed." ... ) - >>> print(f"Score: {result.score}") # 1.0 (good accuracy) + >>> print(f"Score: {result.score}") # Expected: 1.0 """ def __init__( @@ -185,7 +188,7 @@ async def aevaluate( self, observation: str, memory: str, - history: Optional[list] = None, + history: Optional[List[Dict[str, Any]]] = None, context: Optional[str] = None, **kwargs: Any, ) -> GraderScore: @@ -210,9 +213,7 @@ async def aevaluate( ... ) """ # Format context section - context_str = "" - if context: - context_str = f"\n{context}\n" + context_str = f"\n{context}\n" if context else "" # Format history history_str = format_history(history) diff --git a/openjudge/graders/agent/memory/memory_detail_preservation.py b/openjudge/graders/agent/memory/memory_detail_preservation.py index c3c79c83a..155e09e3f 100644 --- a/openjudge/graders/agent/memory/memory_detail_preservation.py +++ b/openjudge/graders/agent/memory/memory_detail_preservation.py @@ -6,7 +6,7 @@ """ import textwrap -from typing import Any, Optional +from typing import Any, Dict, List, Optional from loguru import logger @@ -20,7 +20,8 @@ # pylint: disable=line-too-long # English Prompt -MEMORY_DETAIL_PRESERVATION_PROMPT_EN = """ +MEMORY_DETAIL_PRESERVATION_PROMPT_EN = textwrap.dedent( + """ You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent preserves important details when storing information in memory. 
@@ -64,9 +65,11 @@ JSON: """ +).strip() # Chinese Prompt -MEMORY_DETAIL_PRESERVATION_PROMPT_ZH = """ +MEMORY_DETAIL_PRESERVATION_PROMPT_ZH = textwrap.dedent( + """ 你是一名分析智能体行为的专家。你的任务是评估智能体在将信息存储到记忆中时是否保留了重要细节。 <评估类型:记忆细节保留> @@ -110,6 +113,7 @@ JSON: """ +).strip() # Build default template from prompts DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE = PromptTemplate( @@ -117,13 +121,13 @@ LanguageEnum.EN: [ ChatMessage( role="user", - content=textwrap.dedent(MEMORY_DETAIL_PRESERVATION_PROMPT_EN), + content=MEMORY_DETAIL_PRESERVATION_PROMPT_EN, ), ], LanguageEnum.ZH: [ ChatMessage( role="user", - content=textwrap.dedent(MEMORY_DETAIL_PRESERVATION_PROMPT_ZH), + content=MEMORY_DETAIL_PRESERVATION_PROMPT_ZH, ), ], }, @@ -145,25 +149,24 @@ class MemoryDetailPreservationGrader(LLMGrader): language: Language for evaluation prompts (default: LanguageEnum.EN) Example: + >>> import asyncio >>> from openjudge.model.openai_llm import OpenAIChatModel - >>> from openjudge.schema.template import LanguageEnum + >>> from openjudge.models.schema.prompt_template import LanguageEnum >>> >>> api = OpenAIChatModel( - ... api_key="your-key", # pragma: allowlist secret + ... api_key="your-key", ... model="qwen3-max", ... generate_kwargs={"temperature": 0.1} ... ) - >>> >>> grader = MemoryDetailPreservationGrader( ... model=api, ... language=LanguageEnum.EN ... ) - >>> - >>> result = await grader.aevaluate( + >>> result = asyncio.run(grader.aevaluate( ... observation="Cabinet 1 at coordinates (3.5, 2.1) contains 5 red apples.", ... memory="Cabinet 1 at (3.5, 2.1) has 5 red apples." ... ) - >>> print(f"Score: {result.score}") # 1.0 (good detail preservation) + >>> print(f"Score: {result.score}") # Expected: 1.0 """ def __init__( @@ -185,7 +188,7 @@ async def aevaluate( self, observation: str, memory: str, - history: Optional[list] = None, + history: Optional[List[Dict[str, Any]]] = None, context: Optional[str] = None, **kwargs: Any, ) -> GraderScore: @@ -210,9 +213,7 @@ async def aevaluate( ... ) """ # Format context section - context_str = "" - if context: - context_str = f"\n{context}\n" + context_str = f"\n{context}\n" if context else "" # Format history history_str = format_history(history) diff --git a/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py b/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py index 2204706c8..a50bd07c0 100644 --- a/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py +++ b/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py @@ -6,7 +6,7 @@ """ import textwrap -from typing import Any, Optional +from typing import Any, Dict, List, Optional from loguru import logger @@ -20,7 +20,8 @@ # pylint: disable=line-too-long # English Prompt -MEMORY_RETRIEVAL_EFFECTIVENESS_PROMPT_EN = """ +MEMORY_RETRIEVAL_EFFECTIVENESS_PROMPT_EN = textwrap.dedent( + """ You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent effectively retrieves relevant information from memory when needed. 
@@ -65,9 +66,11 @@ JSON: """ +).strip() # Chinese Prompt -MEMORY_RETRIEVAL_EFFECTIVENESS_PROMPT_ZH = """ +MEMORY_RETRIEVAL_EFFECTIVENESS_PROMPT_ZH = textwrap.dedent( + """ 你是一名分析智能体行为的专家。你的任务是评估智能体在需要时是否有效地从记忆中检索相关信息。 <评估类型:记忆检索有效性> @@ -112,6 +115,7 @@ JSON: """ +).strip() # Build default template from prompts DEFAULT_MEMORY_RETRIEVAL_EFFECTIVENESS_TEMPLATE = PromptTemplate( @@ -119,13 +123,13 @@ LanguageEnum.EN: [ ChatMessage( role="user", - content=textwrap.dedent(MEMORY_RETRIEVAL_EFFECTIVENESS_PROMPT_EN), + content=MEMORY_RETRIEVAL_EFFECTIVENESS_PROMPT_EN, ), ], LanguageEnum.ZH: [ ChatMessage( role="user", - content=textwrap.dedent(MEMORY_RETRIEVAL_EFFECTIVENESS_PROMPT_ZH), + content=MEMORY_RETRIEVAL_EFFECTIVENESS_PROMPT_ZH, ), ], }, @@ -147,26 +151,24 @@ class MemoryRetrievalEffectivenessGrader(LLMGrader): language: Language for evaluation prompts (default: LanguageEnum.EN) Example: + >>> import asyncio >>> from openjudge.model.openai_llm import OpenAIChatModel - >>> from openjudge.schema.template import LanguageEnum + >>> from openjudge.models.schema.prompt_template import LanguageEnum >>> >>> api = OpenAIChatModel( - ... api_key="your-key", # pragma: allowlist secret + ... api_key="your-key", ... model="qwen3-max", ... generate_kwargs={"temperature": 0.1} ... ) - >>> >>> grader = MemoryRetrievalEffectivenessGrader( ... model=api, ... language=LanguageEnum.EN ... ) - >>> - >>> result = await grader.aevaluate( - ... plan="I will use the key from drawer 1.", - ... observation="You are standing in the room.", - ... memory="The key was found in drawer 1 in step 3." + >>> result = asyncio.run(grader.aevaluate( + ... observation="You see a closed cabinet.", + ... memory="The cabinet is closed." ... ) - >>> print(f"Score: {result.score}") # 1.0 (effective retrieval) + >>> print(f"Score: {result.score}") # Expected: 1.0 """ def __init__( @@ -189,7 +191,7 @@ async def aevaluate( plan: str, observation: str, memory: str, - history: Optional[list] = None, + history: Optional[List[Dict[str, Any]]] = None, context: Optional[str] = None, **kwargs: Any, ) -> GraderScore: @@ -216,9 +218,7 @@ async def aevaluate( ... ) """ # Format context section - context_str = "" - if context: - context_str = f"\n{context}\n" + context_str = f"\n{context}\n" if context else "" # Format history history_str = format_history(history) diff --git a/openjudge/graders/agent/plan/plan_feasibility.py b/openjudge/graders/agent/plan/plan_feasibility.py index c69686e31..cb0a73378 100644 --- a/openjudge/graders/agent/plan/plan_feasibility.py +++ b/openjudge/graders/agent/plan/plan_feasibility.py @@ -6,7 +6,7 @@ """ import textwrap -from typing import Any, Optional +from typing import Any, Dict, List, Optional from loguru import logger @@ -20,7 +20,8 @@ # pylint: disable=line-too-long # English Prompt -PLAN_FEASIBILITY_PROMPT_EN = """ +PLAN_FEASIBILITY_PROMPT_EN = textwrap.dedent( + """ You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent creates a plan that is logically sound and feasible. 
@@ -65,9 +66,11 @@ JSON: """ +).strip() # Chinese Prompt -PLAN_FEASIBILITY_PROMPT_ZH = """ +PLAN_FEASIBILITY_PROMPT_ZH = textwrap.dedent( + """ 你是一名分析智能体行为的专家。你的任务是评估智能体是否创建了逻辑上合理且可行的计划。 <评估类型:计划可行性> @@ -112,6 +115,7 @@ JSON: """ +).strip() # Build default template from prompts DEFAULT_PLAN_FEASIBILITY_TEMPLATE = PromptTemplate( @@ -119,13 +123,13 @@ LanguageEnum.EN: [ ChatMessage( role="user", - content=textwrap.dedent(PLAN_FEASIBILITY_PROMPT_EN), + content=PLAN_FEASIBILITY_PROMPT_EN, ), ], LanguageEnum.ZH: [ ChatMessage( role="user", - content=textwrap.dedent(PLAN_FEASIBILITY_PROMPT_ZH), + content=PLAN_FEASIBILITY_PROMPT_ZH, ), ], }, @@ -147,26 +151,25 @@ class PlanFeasibilityGrader(LLMGrader): language: Language for evaluation prompts (default: LanguageEnum.EN) Example: + >>> import asyncio >>> from openjudge.model.openai_llm import OpenAIChatModel - >>> from openjudge.schema.template import LanguageEnum + >>> from openjudge.models.schema.prompt_template import LanguageEnum >>> >>> api = OpenAIChatModel( - ... api_key="your-key", # pragma: allowlist secret + ... api_key="your-key", ... model="qwen3-max", ... generate_kwargs={"temperature": 0.1} ... ) - >>> >>> grader = PlanFeasibilityGrader( ... model=api, ... language=LanguageEnum.EN ... ) - >>> - >>> result = await grader.aevaluate( + >>> result = asyncio.run(grader.aevaluate( ... plan="I will first open the drawer to get the key, then use it to unlock the door.", ... observation="The drawer is closed. You don't have any items.", ... memory="The key is inside the drawer." ... ) - >>> print(f"Score: {result.score}") # 1.0 (feasible plan) + >>> print(f"Score: {result.score}") # Expected: 1.0 """ def __init__( @@ -189,7 +192,7 @@ async def aevaluate( plan: str, observation: str, memory: str, - history: Optional[list] = None, + history: Optional[List[Dict[str, Any]]] = None, context: Optional[str] = None, **kwargs: Any, ) -> GraderScore: @@ -216,9 +219,7 @@ async def aevaluate( ... ) """ # Format context section - context_str = "" - if context: - context_str = f"\n{context}\n" + context_str = f"\n{context}\n" if context else "" # Format history history_str = format_history(history) diff --git a/openjudge/graders/agent/reflection/reflection_accuracy.py b/openjudge/graders/agent/reflection/reflection_accuracy.py index 8b54d4620..4818005ec 100644 --- a/openjudge/graders/agent/reflection/reflection_accuracy.py +++ b/openjudge/graders/agent/reflection/reflection_accuracy.py @@ -6,7 +6,7 @@ """ import textwrap -from typing import Any, Optional +from typing import Any, Dict, List, Optional from loguru import logger @@ -20,7 +20,8 @@ # pylint: disable=line-too-long # English Prompt -REFLECTION_ACCURACY_PROMPT_EN = """ +REFLECTION_ACCURACY_PROMPT_EN = textwrap.dedent( + """ You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent provides accurate reflections based on actual observations. 
@@ -64,9 +65,11 @@ JSON: """ +).strip() # Chinese Prompt -REFLECTION_ACCURACY_PROMPT_ZH = """ +REFLECTION_ACCURACY_PROMPT_ZH = textwrap.dedent( + """ 你是一名分析智能体行为的专家。你的任务是评估智能体是否基于实际观察提供了准确的反思。 <评估类型:反思准确性> @@ -110,6 +113,7 @@ JSON: """ +).strip() # Build default template from prompts DEFAULT_REFLECTION_ACCURACY_TEMPLATE = PromptTemplate( @@ -117,13 +121,13 @@ LanguageEnum.EN: [ ChatMessage( role="user", - content=textwrap.dedent(REFLECTION_ACCURACY_PROMPT_EN), + content=REFLECTION_ACCURACY_PROMPT_EN, ), ], LanguageEnum.ZH: [ ChatMessage( role="user", - content=textwrap.dedent(REFLECTION_ACCURACY_PROMPT_ZH), + content=REFLECTION_ACCURACY_PROMPT_ZH, ), ], }, @@ -145,25 +149,24 @@ class ReflectionAccuracyGrader(LLMGrader): language: Language for evaluation prompts (default: LanguageEnum.EN) Example: + >>> import asyncio >>> from openjudge.model.openai_llm import OpenAIChatModel - >>> from openjudge.schema.template import LanguageEnum + >>> from openjudge.models.schema.prompt_template import LanguageEnum >>> >>> api = OpenAIChatModel( - ... api_key="your-key", # pragma: allowlist secret + ... api_key="your-key", ... model="qwen3-max", ... generate_kwargs={"temperature": 0.1} ... ) - >>> >>> grader = ReflectionAccuracyGrader( ... model=api, ... language=LanguageEnum.EN ... ) - >>> - >>> result = await grader.aevaluate( + >>> result = asyncio.run(grader.aevaluate( ... observation="You see a closed cabinet.", ... reflection="I observed a closed cabinet." ... ) - >>> print(f"Score: {result.score}") # 1.0 (accurate reflection) + >>> print(f"Score: {result.score}") # Expected: 1.0 """ def __init__( @@ -185,7 +188,7 @@ async def aevaluate( self, observation: str, reflection: str, - history: Optional[list] = None, + history: Optional[List[Dict[str, Any]]] = None, context: Optional[str] = None, **kwargs: Any, ) -> GraderScore: @@ -210,9 +213,7 @@ async def aevaluate( ... ) """ # Format context section - context_str = "" - if context: - context_str = f"\n{context}\n" + context_str = f"\n{context}\n" if context else "" # Format history history_str = format_history(history) diff --git a/openjudge/graders/agent/reflection/reflection_outcome_understanding.py b/openjudge/graders/agent/reflection/reflection_outcome_understanding.py index e7ce341f9..466792bad 100644 --- a/openjudge/graders/agent/reflection/reflection_outcome_understanding.py +++ b/openjudge/graders/agent/reflection/reflection_outcome_understanding.py @@ -7,7 +7,7 @@ """ import textwrap -from typing import Any, Optional +from typing import Any, Dict, List, Optional from loguru import logger @@ -21,7 +21,8 @@ # pylint: disable=line-too-long # English Prompt -REFLECTION_OUTCOME_UNDERSTANDING_PROMPT_EN = """ +REFLECTION_OUTCOME_UNDERSTANDING_PROMPT_EN = textwrap.dedent( + """ You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent correctly understands and interprets the outcome or result of an action in its reflection. 
@@ -126,9 +127,11 @@ JSON: """ +).strip() # Chinese Prompt -REFLECTION_OUTCOME_UNDERSTANDING_PROMPT_ZH = """ +REFLECTION_OUTCOME_UNDERSTANDING_PROMPT_ZH = textwrap.dedent( + """ 你是一名分析智能体行为的专家。你的任务是评估智能体是否在其反思中正确理解和解释了动作的结果或输出。 <评估类型:反思结果理解> @@ -233,6 +236,7 @@ JSON: """ +).strip() # Build default template from prompts DEFAULT_REFLECTION_OUTCOME_UNDERSTANDING_TEMPLATE = PromptTemplate( @@ -240,13 +244,13 @@ LanguageEnum.EN: [ ChatMessage( role="user", - content=textwrap.dedent(REFLECTION_OUTCOME_UNDERSTANDING_PROMPT_EN), + content=REFLECTION_OUTCOME_UNDERSTANDING_PROMPT_EN, ), ], LanguageEnum.ZH: [ ChatMessage( role="user", - content=textwrap.dedent(REFLECTION_OUTCOME_UNDERSTANDING_PROMPT_ZH), + content=REFLECTION_OUTCOME_UNDERSTANDING_PROMPT_ZH, ), ], }, @@ -269,25 +273,24 @@ class ReflectionOutcomeUnderstandingGrader(LLMGrader): language: Language for evaluation prompts (default: LanguageEnum.EN) Example: + >>> import asyncio >>> from openjudge.model.openai_llm import OpenAIChatModel - >>> from openjudge.schema.template import LanguageEnum + >>> from openjudge.models.schema.prompt_template import LanguageEnum >>> >>> api = OpenAIChatModel( - ... api_key="your-key", # pragma: allowlist secret + ... api_key="your-key", ... model="qwen3-max", ... generate_kwargs={"temperature": 0.1} ... ) - >>> >>> grader = ReflectionOutcomeUnderstandingGrader( ... model=api, ... language=LanguageEnum.EN ... ) - >>> - >>> result = await grader.aevaluate( + >>> result = asyncio.run(grader.aevaluate( ... observation="The drawer is now open.", ... reflection="I successfully opened the drawer." ... ) - >>> print(f"Score: {result.score}") # 1.0 (correct understanding) + >>> print(f"Score: {result.score}") # Expected: 1.0 """ def __init__( @@ -309,7 +312,7 @@ async def aevaluate( self, observation: str, reflection: str, - history: Optional[list] = None, + history: Optional[List[Dict[str, Any]]] = None, context: Optional[str] = None, **kwargs: Any, ) -> GraderScore: @@ -334,9 +337,7 @@ async def aevaluate( ... ) """ # Format context section - context_str = "" - if context: - context_str = f"\n{context}\n" + context_str = f"\n{context}\n" if context else "" # Format history history_str = format_history(history) diff --git a/openjudge/graders/agent/reflection/reflection_progress_awareness.py b/openjudge/graders/agent/reflection/reflection_progress_awareness.py index 8d1d0f344..df9856954 100644 --- a/openjudge/graders/agent/reflection/reflection_progress_awareness.py +++ b/openjudge/graders/agent/reflection/reflection_progress_awareness.py @@ -7,7 +7,7 @@ """ import textwrap -from typing import Any, Optional +from typing import Any, Dict, List, Optional from loguru import logger @@ -21,7 +21,8 @@ # pylint: disable=line-too-long # English Prompt -REFLECTION_PROGRESS_AWARENESS_PROMPT_EN = """ +REFLECTION_PROGRESS_AWARENESS_PROMPT_EN = textwrap.dedent( + """ You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent demonstrates accurate awareness of progress toward completing the task in its reflection. 
@@ -84,9 +85,11 @@ JSON: """ +).strip() # Chinese Prompt -REFLECTION_PROGRESS_AWARENESS_PROMPT_ZH = """ +REFLECTION_PROGRESS_AWARENESS_PROMPT_ZH = textwrap.dedent( + """ 你是一名分析智能体行为的专家。你的任务是评估智能体是否在其反思中展示了对完成任务进度的准确意识。 <评估类型:反思进度意识> @@ -149,6 +152,7 @@ JSON: """ +).strip() # Build default template from prompts DEFAULT_REFLECTION_PROGRESS_AWARENESS_TEMPLATE = PromptTemplate( @@ -156,13 +160,13 @@ LanguageEnum.EN: [ ChatMessage( role="user", - content=textwrap.dedent(REFLECTION_PROGRESS_AWARENESS_PROMPT_EN), + content=REFLECTION_PROGRESS_AWARENESS_PROMPT_EN, ), ], LanguageEnum.ZH: [ ChatMessage( role="user", - content=textwrap.dedent(REFLECTION_PROGRESS_AWARENESS_PROMPT_ZH), + content=REFLECTION_PROGRESS_AWARENESS_PROMPT_ZH, ), ], }, @@ -185,26 +189,25 @@ class ReflectionProgressAwarenessGrader(LLMGrader): language: Language for evaluation prompts (default: LanguageEnum.EN) Example: + >>> import asyncio >>> from openjudge.model.openai_llm import OpenAIChatModel - >>> from openjudge.schema.template import LanguageEnum + >>> from openjudge.models.schema.prompt_template import LanguageEnum >>> >>> api = OpenAIChatModel( - ... api_key="your-key", # pragma: allowlist secret + ... api_key="your-key", ... model="qwen3-max", ... generate_kwargs={"temperature": 0.1} ... ) - >>> >>> grader = ReflectionProgressAwarenessGrader( ... model=api, ... language=LanguageEnum.EN ... ) - >>> - >>> result = await grader.aevaluate( + >>> result = asyncio.run(grader.aevaluate( ... observation="Cabinet 1 now has apples. Task complete.", ... reflection="Good progress! I've successfully found the apples.", ... context="Task: Find apples in cabinets" ... ) - >>> print(f"Score: {result.score}") # 1.0 (accurate awareness) + >>> print(f"Score: {result.score}") # Expected: 1.0 """ def __init__( @@ -226,7 +229,7 @@ async def aevaluate( self, observation: str, reflection: str, - history: Optional[list] = None, + history: Optional[List[Dict[str, Any]]] = None, context: Optional[str] = None, **kwargs: Any, ) -> GraderScore: @@ -251,9 +254,7 @@ async def aevaluate( ... 
) """ # Format context section - context_str = "" - if context: - context_str = f"\n{context}\n" + context_str = f"\n{context}\n" if context else "" # Format history history_str = format_history(history) diff --git a/openjudge/graders/agent/tool/tool_call_accuracy.py b/openjudge/graders/agent/tool/tool_call_accuracy.py index 7551dfeb1..f97d10e36 100644 --- a/openjudge/graders/agent/tool/tool_call_accuracy.py +++ b/openjudge/graders/agent/tool/tool_call_accuracy.py @@ -21,7 +21,8 @@ # pylint: disable=line-too-long # English Prompt -TOOL_CALL_ACCURACY_PROMPT_EN = """# Instruction +TOOL_CALL_ACCURACY_PROMPT_EN = textwrap.dedent( + """# Instruction ## Goal Your are an expert in evaluating the accuracy of a tool call considering relevance and \ potential usefulness including syntactic and semantic correctness of a proposed tool call \ @@ -60,9 +61,11 @@ }} ``` """ +).strip() # Chinese Prompt -TOOL_CALL_ACCURACY_PROMPT_ZH = """# 指令 +TOOL_CALL_ACCURACY_PROMPT_ZH = textwrap.dedent( + """# 指令 ## 目标 你是评估工具调用准确性的专家,需要考虑相关性和潜在有用性,包括基于提供的定义和数据,对智能系统提出的工具调用的语法和语义正确性进行评估。你的目标是使用提供的信息回答以下问题。 @@ -96,6 +99,7 @@ }} ``` """ +).strip() # Build default template from prompts DEFAULT_TOOL_CALL_ACCURACY_TEMPLATE = PromptTemplate( @@ -103,13 +107,13 @@ LanguageEnum.EN: [ ChatMessage( role="user", - content=textwrap.dedent(TOOL_CALL_ACCURACY_PROMPT_EN), + content=TOOL_CALL_ACCURACY_PROMPT_EN, ), ], LanguageEnum.ZH: [ ChatMessage( role="user", - content=textwrap.dedent(TOOL_CALL_ACCURACY_PROMPT_ZH), + content=TOOL_CALL_ACCURACY_PROMPT_ZH, ), ], }, @@ -163,9 +167,12 @@ class ToolCallAccuracyGrader(LLMGrader): ... ) >>> >>> conversation = [ - ... {"role": "user", "content": "What's the weather like in New York?"} + ... { + ... "role": "user", + ... "content": "What's the weather like in New York?" + ... } ... ] - >>> tool_definitions = [ + ... tool_definitions = [ ... { ... "name": "get_weather", ... "description": "Get weather information for a location", @@ -174,7 +181,7 @@ class ToolCallAccuracyGrader(LLMGrader): ... } ... } ... ] - >>> tool_calls = [ + ... tool_calls = [ ... { ... "name": "get_weather", ... "arguments": {"location": "New York"} diff --git a/openjudge/graders/agent/tool/tool_call_success.py b/openjudge/graders/agent/tool/tool_call_success.py index 1fc9111fd..e4b135d06 100644 --- a/openjudge/graders/agent/tool/tool_call_success.py +++ b/openjudge/graders/agent/tool/tool_call_success.py @@ -21,7 +21,8 @@ # pylint: disable=line-too-long # English Prompt -TOOL_CALL_SUCCESS_PROMPT_EN = """You are an expert evaluator with strong software \ +TOOL_CALL_SUCCESS_PROMPT_EN = textwrap.dedent( + """You are an expert evaluator with strong software \ development background. You are required to extract the tool result for every tool call \ then decide for each tool result whether it indicates that the tool call succeeded or failed. 
@@ -79,9 +80,11 @@ }} ``` """ +).strip() # Chinese Prompt -TOOL_CALL_SUCCESS_PROMPT_ZH = """你是一位具有强大软件开发背景的专家评估员。你需要为每个工具调用提取工具结果,然后判断每个工具结果是否表明工具调用成功或失败。 +TOOL_CALL_SUCCESS_PROMPT_ZH = textwrap.dedent( + """你是一位具有强大软件开发背景的专家评估员。你需要为每个工具调用提取工具结果,然后判断每个工具结果是否表明工具调用成功或失败。 角色 ==== @@ -130,6 +133,7 @@ }} ``` """ +).strip() # Build default template from prompts DEFAULT_TOOL_CALL_SUCCESS_TEMPLATE = PromptTemplate( @@ -137,13 +141,13 @@ LanguageEnum.EN: [ ChatMessage( role="user", - content=textwrap.dedent(TOOL_CALL_SUCCESS_PROMPT_EN), + content=TOOL_CALL_SUCCESS_PROMPT_EN, ), ], LanguageEnum.ZH: [ ChatMessage( role="user", - content=textwrap.dedent(TOOL_CALL_SUCCESS_PROMPT_ZH), + content=TOOL_CALL_SUCCESS_PROMPT_ZH, ), ], }, diff --git a/openjudge/graders/agent/tool/tool_parameter_check.py b/openjudge/graders/agent/tool/tool_parameter_check.py index 442719b34..b969a3e4a 100644 --- a/openjudge/graders/agent/tool/tool_parameter_check.py +++ b/openjudge/graders/agent/tool/tool_parameter_check.py @@ -20,7 +20,8 @@ # pylint: disable=line-too-long # English Prompt -TOOL_PARAMETER_CHECK_PROMPT_EN = """ +TOOL_PARAMETER_CHECK_PROMPT_EN = textwrap.dedent( + """ You are an expert in analyzing tool calls. Your task is to evaluate whether the generated tool call extracts completely correct parameters from the user query. @@ -70,9 +71,11 @@ JSON: """ +).strip() # Chinese Prompt -TOOL_PARAMETER_CHECK_PROMPT_ZH = """ +TOOL_PARAMETER_CHECK_PROMPT_ZH = textwrap.dedent( + """ 你是一名分析工具调用的专家。你的任务是评估生成的工具调用是否从用户查询中提取了完全正确的参数。 <评估类型:工具参数提取正确性> @@ -122,6 +125,7 @@ JSON: """ +).strip() # Build default template from prompts DEFAULT_TOOL_PARAMETER_CHECK_TEMPLATE = PromptTemplate( @@ -129,13 +133,13 @@ LanguageEnum.EN: [ ChatMessage( role="user", - content=textwrap.dedent(TOOL_PARAMETER_CHECK_PROMPT_EN), + content=TOOL_PARAMETER_CHECK_PROMPT_EN, ), ], LanguageEnum.ZH: [ ChatMessage( role="user", - content=textwrap.dedent(TOOL_PARAMETER_CHECK_PROMPT_ZH), + content=TOOL_PARAMETER_CHECK_PROMPT_ZH, ), ], }, @@ -156,6 +160,7 @@ class ToolParameterCheckGrader(LLMGrader): language: Language for evaluation prompts (default: LanguageEnum.EN) Example: + >>> import asyncio >>> from openjudge.model.openai_llm import OpenAIChatModel >>> from openjudge.schema.template import LanguageEnum >>> @@ -170,11 +175,11 @@ class ToolParameterCheckGrader(LLMGrader): ... language=LanguageEnum.EN ... ) >>> - >>> result = await grader.aevaluate( + >>> result = asyncio.run(grader.aevaluate( ... query="Search for Python files in the src directory", ... tool_definition="search_files(pattern: str, directory: str)", ... tool_calls='search_files(pattern="*.py", directory="src")' - ... ) + ... )) >>> print(f"Score: {result.score}") # 1.0 (correct parameters) """ diff --git a/openjudge/graders/agent/tool/tool_selection.py b/openjudge/graders/agent/tool/tool_selection.py index dec310b17..4508e7112 100644 --- a/openjudge/graders/agent/tool/tool_selection.py +++ b/openjudge/graders/agent/tool/tool_selection.py @@ -20,7 +20,8 @@ # pylint: disable=line-too-long # English Prompt -TOOL_SELECTION_PROMPT_EN = """ +TOOL_SELECTION_PROMPT_EN = textwrap.dedent( + """ You are an expert in analyzing tool selection decisions. Your task is to evaluate the of tool selection made by an agent to address the user query. 
@@ -74,9 +75,11 @@ JSON: """ +).strip() # Chinese Prompt -TOOL_SELECTION_PROMPT_ZH = """ +TOOL_SELECTION_PROMPT_ZH = textwrap.dedent( + """ 你是一名分析工具选择决策的专家。你的任务是评估智能体为解决用户查询而做出的工具选择的质量。 <评估维度:工具选择质量> @@ -130,6 +133,7 @@ JSON: """ +).strip() # Build default template from prompts DEFAULT_TOOL_SELECTION_TEMPLATE = PromptTemplate( @@ -137,13 +141,13 @@ LanguageEnum.EN: [ ChatMessage( role="user", - content=textwrap.dedent(TOOL_SELECTION_PROMPT_EN), + content=TOOL_SELECTION_PROMPT_EN, ), ], LanguageEnum.ZH: [ ChatMessage( role="user", - content=textwrap.dedent(TOOL_SELECTION_PROMPT_ZH), + content=TOOL_SELECTION_PROMPT_ZH, ), ], }, @@ -163,6 +167,7 @@ class ToolSelectionGrader(LLMGrader): language: Language for evaluation prompts (default: LanguageEnum.EN) Example: + >>> import asyncio >>> from openjudge.model.openai_llm import OpenAIChatModel >>> from openjudge.schema.template import LanguageEnum >>> @@ -177,7 +182,7 @@ class ToolSelectionGrader(LLMGrader): ... language=LanguageEnum.EN ... ) >>> - >>> result = await grader.aevaluate( + >>> result = asyncio.run(grader.aevaluate( ... query="Find all Python files modified in the last week", ... tool_definitions=[ ... {"name": "search_files", "description": "Search for files"}, @@ -187,7 +192,7 @@ class ToolSelectionGrader(LLMGrader): ... {"name": "search_files", "arguments": {"pattern": "*.py"}}, ... {"name": "git_log", "arguments": {"days": 7}} ... ] - ... ) + ... )) >>> print(f"Score: {result.score}") # Score from 1 to 5 """ diff --git a/openjudge/graders/agent/trajectory/trajectory_comprehensive.py b/openjudge/graders/agent/trajectory/trajectory_comprehensive.py index b18d864e5..85a3546d7 100644 --- a/openjudge/graders/agent/trajectory/trajectory_comprehensive.py +++ b/openjudge/graders/agent/trajectory/trajectory_comprehensive.py @@ -23,7 +23,8 @@ # pylint: disable=line-too-long,too-many-statements # Chinese Prompt -TRAJECTORY_COMPREHENSIVE_PROMPT_ZH = """# 任务描述 +TRAJECTORY_COMPREHENSIVE_PROMPT_ZH = textwrap.dedent( + """# 任务描述 你是一位专业的评估专家,负责评估智能体轨迹中每个工具调用步骤对问题解决的贡献度。 @@ -111,9 +112,11 @@ JSON: """ +).strip() # English Prompt -TRAJECTORY_COMPREHENSIVE_PROMPT_EN = """# Task Description +TRAJECTORY_COMPREHENSIVE_PROMPT_EN = textwrap.dedent( + """# Task Description You are a professional evaluation expert responsible for assessing the contribution of each tool call step in an agent trajectory. @@ -202,6 +205,7 @@ JSON: """ +).strip() # Build default template from prompts DEFAULT_TRAJECTORY_COMPREHENSIVE_TEMPLATE = PromptTemplate( @@ -209,13 +213,13 @@ LanguageEnum.EN: [ ChatMessage( role="user", - content=textwrap.dedent(TRAJECTORY_COMPREHENSIVE_PROMPT_EN), + content=TRAJECTORY_COMPREHENSIVE_PROMPT_EN, ), ], LanguageEnum.ZH: [ ChatMessage( role="user", - content=textwrap.dedent(TRAJECTORY_COMPREHENSIVE_PROMPT_ZH), + content=TRAJECTORY_COMPREHENSIVE_PROMPT_ZH, ), ], }, @@ -291,17 +295,18 @@ class TrajectoryComprehensiveGrader(LLMGrader): resolution_threshold: Threshold for determining if the trajectory is resolved (default: 0.8, on normalized 0-1 scale) Example: + >>> import asyncio >>> from openjudge.models.openai_chat_model import OpenAIChatModel >>> api = OpenAIChatModel(api_key="...", model="qwen3-32b") >>> grader = TrajectoryComprehensiveGrader(model=api, resolution_threshold=0.75) - >>> result = await grader.aevaluate( + >>> result = asyncio.run(grader.aevaluate( ... messages=[ ... {"role": "system", "content": "..."}, ... {"role": "user", "content": "帮我找投资建议"}, ... {"role": "assistant", "content": "...", "tool_calls": [...]}, ... ... ... 
] - ... ) + ... )) >>> print(f"Score: {result.score}") # computed from step averages """
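For reference, a minimal end-to-end usage sketch assembled from the docstring examples above. This is a sketch only: the `OpenAIChatModel` import path, the `qwen3-max` model name, and the example inputs are copied from those docstrings, while the `ActionAlignmentGrader` import path is assumed from the file path in this diff and may differ in a given install.

```python
# Minimal usage sketch based on the docstring examples in this diff.
# Assumptions: import paths and the "qwen3-max" model name are taken from the
# docstrings above; the grader import path is inferred from the file path.
import asyncio

from openjudge.model.openai_llm import OpenAIChatModel
from openjudge.models.schema.prompt_template import LanguageEnum
from openjudge.graders.agent.action.action_alignment import ActionAlignmentGrader

api = OpenAIChatModel(
    api_key="your-key",
    model="qwen3-max",
    generate_kwargs={"temperature": 0.1},
)
grader = ActionAlignmentGrader(model=api, language=LanguageEnum.EN)

# aevaluate() is a coroutine, so both the aevaluate(...) call and the
# asyncio.run(...) wrapper need their own closing parenthesis.
result = asyncio.run(
    grader.aevaluate(
        plan="I will open drawer 1 to find the key.",
        action="open drawer 1",
    )
)
print(f"Score: {result.score}")  # 1.0 indicates good alignment
```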