diff --git a/openjudge/graders/agent/action/action_alignment.py b/openjudge/graders/agent/action/action_alignment.py index c083a11a..d8c025a8 100644 --- a/openjudge/graders/agent/action/action_alignment.py +++ b/openjudge/graders/agent/action/action_alignment.py @@ -10,6 +10,7 @@ from loguru import logger +from openjudge.graders.agent.utils import format_history from openjudge.graders.base_grader import GraderMode, GraderScore from openjudge.graders.llm_grader import LLMGrader from openjudge.models.base_chat_model import BaseChatModel @@ -189,29 +190,6 @@ def __init__( language=language, ) - def _format_history(self, history: Optional[list] = None) -> str: - """Format history steps for evaluation. - - Args: - history: Optional list of previous step dictionaries - - Returns: - Formatted history string, or empty string if no history - """ - if not history: - return "" - - lines = [""] - for i, hist_step in enumerate(history): - lines.append(f"Step {i + 1}:") - for key, value in hist_step.items(): - if value: - lines.append(f"{key.capitalize()}: {value}") - lines.append("") - lines.append("") - - return "\n".join(lines) - async def aevaluate( self, plan: str, @@ -244,7 +222,7 @@ async def aevaluate( context_str = f"\n{context}\n" # Format history - history_str = self._format_history(history) + history_str = format_history(history) try: result = await super().aevaluate( diff --git a/openjudge/graders/agent/memory/memory_accuracy.py b/openjudge/graders/agent/memory/memory_accuracy.py index dd7067b5..da23a424 100644 --- a/openjudge/graders/agent/memory/memory_accuracy.py +++ b/openjudge/graders/agent/memory/memory_accuracy.py @@ -10,6 +10,7 @@ from loguru import logger +from openjudge.graders.agent.utils import format_history from openjudge.graders.base_grader import GraderMode, GraderScore from openjudge.graders.llm_grader import LLMGrader from openjudge.models.base_chat_model import BaseChatModel @@ -180,29 +181,6 @@ def __init__( language=language, ) - def _format_history(self, history: Optional[list] = None) -> str: - """Format history steps for evaluation. - - Args: - history: Optional list of previous step dictionaries - - Returns: - Formatted history string, or empty string if no history - """ - if not history: - return "" - - lines = [""] - for i, hist_step in enumerate(history): - lines.append(f"Step {i + 1}:") - for key, value in hist_step.items(): - if value: - lines.append(f"{key.capitalize()}: {value}") - lines.append("") - lines.append("") - - return "\n".join(lines) - async def aevaluate( self, observation: str, @@ -237,7 +215,7 @@ async def aevaluate( context_str = f"\n{context}\n" # Format history - history_str = self._format_history(history) + history_str = format_history(history) try: result = await super().aevaluate( diff --git a/openjudge/graders/agent/memory/memory_detail_preservation.py b/openjudge/graders/agent/memory/memory_detail_preservation.py index 28b3eb6c..c3c79c83 100644 --- a/openjudge/graders/agent/memory/memory_detail_preservation.py +++ b/openjudge/graders/agent/memory/memory_detail_preservation.py @@ -10,6 +10,7 @@ from loguru import logger +from openjudge.graders.agent.utils import format_history from openjudge.graders.base_grader import GraderMode, GraderScore from openjudge.graders.llm_grader import LLMGrader from openjudge.models.base_chat_model import BaseChatModel @@ -180,29 +181,6 @@ def __init__( language=language, ) - def _format_history(self, history: Optional[list] = None) -> str: - """Format history steps for evaluation. - - Args: - history: Optional list of previous step dictionaries - - Returns: - Formatted history string, or empty string if no history - """ - if not history: - return "" - - lines = [""] - for i, hist_step in enumerate(history): - lines.append(f"Step {i + 1}:") - for key, value in hist_step.items(): - if value: - lines.append(f"{key.capitalize()}: {value}") - lines.append("") - lines.append("") - - return "\n".join(lines) - async def aevaluate( self, observation: str, @@ -237,7 +215,7 @@ async def aevaluate( context_str = f"\n{context}\n" # Format history - history_str = self._format_history(history) + history_str = format_history(history) try: result = await super().aevaluate( diff --git a/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py b/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py index 2cb9e5b1..2204706c 100644 --- a/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py +++ b/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py @@ -10,6 +10,7 @@ from loguru import logger +from openjudge.graders.agent.utils import format_history from openjudge.graders.base_grader import GraderMode, GraderScore from openjudge.graders.llm_grader import LLMGrader from openjudge.models.base_chat_model import BaseChatModel @@ -183,29 +184,6 @@ def __init__( language=language, ) - def _format_history(self, history: Optional[list] = None) -> str: - """Format history steps for evaluation. - - Args: - history: Optional list of previous step dictionaries - - Returns: - Formatted history string, or empty string if no history - """ - if not history: - return "" - - lines = [""] - for i, hist_step in enumerate(history): - lines.append(f"Step {i + 1}:") - for key, value in hist_step.items(): - if value: - lines.append(f"{key.capitalize()}: {value}") - lines.append("") - lines.append("") - - return "\n".join(lines) - async def aevaluate( self, plan: str, @@ -243,7 +221,7 @@ async def aevaluate( context_str = f"\n{context}\n" # Format history - history_str = self._format_history(history) + history_str = format_history(history) try: result = await super().aevaluate( diff --git a/openjudge/graders/agent/plan/plan_feasibility.py b/openjudge/graders/agent/plan/plan_feasibility.py index ebf4dd68..c69686e3 100644 --- a/openjudge/graders/agent/plan/plan_feasibility.py +++ b/openjudge/graders/agent/plan/plan_feasibility.py @@ -10,6 +10,7 @@ from loguru import logger +from openjudge.graders.agent.utils import format_history from openjudge.graders.base_grader import GraderMode, GraderScore from openjudge.graders.llm_grader import LLMGrader from openjudge.models.base_chat_model import BaseChatModel @@ -183,29 +184,6 @@ def __init__( language=language, ) - def _format_history(self, history: Optional[list] = None) -> str: - """Format history steps for evaluation. - - Args: - history: Optional list of previous step dictionaries - - Returns: - Formatted history string, or empty string if no history - """ - if not history: - return "" - - lines = [""] - for i, hist_step in enumerate(history): - lines.append(f"Step {i + 1}:") - for key, value in hist_step.items(): - if value: - lines.append(f"{key.capitalize()}: {value}") - lines.append("") - lines.append("") - - return "\n".join(lines) - async def aevaluate( self, plan: str, @@ -243,7 +221,7 @@ async def aevaluate( context_str = f"\n{context}\n" # Format history - history_str = self._format_history(history) + history_str = format_history(history) try: result = await super().aevaluate( diff --git a/openjudge/graders/agent/reflection/reflection_accuracy.py b/openjudge/graders/agent/reflection/reflection_accuracy.py index 296cfab7..8b54d462 100644 --- a/openjudge/graders/agent/reflection/reflection_accuracy.py +++ b/openjudge/graders/agent/reflection/reflection_accuracy.py @@ -10,6 +10,7 @@ from loguru import logger +from openjudge.graders.agent.utils import format_history from openjudge.graders.base_grader import GraderMode, GraderScore from openjudge.graders.llm_grader import LLMGrader from openjudge.models.base_chat_model import BaseChatModel @@ -180,29 +181,6 @@ def __init__( language=language, ) - def _format_history(self, history: Optional[list] = None) -> str: - """Format history steps for evaluation. - - Args: - history: Optional list of previous step dictionaries - - Returns: - Formatted history string, or empty string if no history - """ - if not history: - return "" - - lines = [""] - for i, hist_step in enumerate(history): - lines.append(f"Step {i + 1}:") - for key, value in hist_step.items(): - if value: - lines.append(f"{key.capitalize()}: {value}") - lines.append("") - lines.append("") - - return "\n".join(lines) - async def aevaluate( self, observation: str, @@ -237,7 +215,7 @@ async def aevaluate( context_str = f"\n{context}\n" # Format history - history_str = self._format_history(history) + history_str = format_history(history) try: result = await super().aevaluate( diff --git a/openjudge/graders/agent/reflection/reflection_outcome_understanding.py b/openjudge/graders/agent/reflection/reflection_outcome_understanding.py index 6a15b2ef..e7ce341f 100644 --- a/openjudge/graders/agent/reflection/reflection_outcome_understanding.py +++ b/openjudge/graders/agent/reflection/reflection_outcome_understanding.py @@ -11,6 +11,7 @@ from loguru import logger +from openjudge.graders.agent.utils import format_history from openjudge.graders.base_grader import GraderMode, GraderScore from openjudge.graders.llm_grader import LLMGrader from openjudge.models.base_chat_model import BaseChatModel @@ -304,29 +305,6 @@ def __init__( language=language, ) - def _format_history(self, history: Optional[list] = None) -> str: - """Format history steps for evaluation. - - Args: - history: Optional list of previous step dictionaries - - Returns: - Formatted history string, or empty string if no history - """ - if not history: - return "" - - lines = [""] - for i, hist_step in enumerate(history): - lines.append(f"Step {i + 1}:") - for key, value in hist_step.items(): - if value: - lines.append(f"{key.capitalize()}: {value}") - lines.append("") - lines.append("") - - return "\n".join(lines) - async def aevaluate( self, observation: str, @@ -361,7 +339,7 @@ async def aevaluate( context_str = f"\n{context}\n" # Format history - history_str = self._format_history(history) + history_str = format_history(history) try: result = await super().aevaluate( diff --git a/openjudge/graders/agent/reflection/reflection_progress_awareness.py b/openjudge/graders/agent/reflection/reflection_progress_awareness.py index 896b2cb3..8d1d0f34 100644 --- a/openjudge/graders/agent/reflection/reflection_progress_awareness.py +++ b/openjudge/graders/agent/reflection/reflection_progress_awareness.py @@ -11,6 +11,7 @@ from loguru import logger +from openjudge.graders.agent.utils import format_history from openjudge.graders.base_grader import GraderMode, GraderScore from openjudge.graders.llm_grader import LLMGrader from openjudge.models.base_chat_model import BaseChatModel @@ -221,29 +222,6 @@ def __init__( language=language, ) - def _format_history(self, history: Optional[list] = None) -> str: - """Format history steps for evaluation. - - Args: - history: Optional list of previous step dictionaries - - Returns: - Formatted history string, or empty string if no history - """ - if not history: - return "" - - lines = [""] - for i, hist_step in enumerate(history): - lines.append(f"Step {i + 1}:") - for key, value in hist_step.items(): - if value: - lines.append(f"{key.capitalize()}: {value}") - lines.append("") - lines.append("") - - return "\n".join(lines) - async def aevaluate( self, observation: str, @@ -278,7 +256,7 @@ async def aevaluate( context_str = f"\n{context}\n" # Format history - history_str = self._format_history(history) + history_str = format_history(history) try: result = await super().aevaluate( diff --git a/openjudge/graders/agent/utils.py b/openjudge/graders/agent/utils.py index 32b76038..17af90a4 100644 --- a/openjudge/graders/agent/utils.py +++ b/openjudge/graders/agent/utils.py @@ -4,7 +4,7 @@ This module provides utility functions for analyzing agent action behaviors, including action-observation pair extraction and similarity calculations. """ -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Optional, Tuple from loguru import logger from sklearn.feature_extraction.text import TfidfVectorizer @@ -147,8 +147,33 @@ def calculate_semantic_similarity(text1: str, text2: str) -> float: return 0.0 +def format_history(history: Optional[list] = None) -> str: + """Format history steps for evaluation. + + Args: + history: Optional list of previous step dictionaries + + Returns: + Formatted history string, or empty string if no history + """ + if not history: + return "" + + lines = [""] + for i, hist_step in enumerate(history): + lines.append(f"Step {i + 1}:") + for key, value in hist_step.items(): + if value: + lines.append(f"{key.capitalize()}: {value}") + lines.append("") + lines.append("") + + return "\n".join(lines) + + __all__ = [ "extract_action_observation_pairs", "calculate_text_similarity", "calculate_semantic_similarity", + "format_history", ]