diff --git a/openjudge/graders/agent/action/action_alignment.py b/openjudge/graders/agent/action/action_alignment.py
index c083a11a..d8c025a8 100644
--- a/openjudge/graders/agent/action/action_alignment.py
+++ b/openjudge/graders/agent/action/action_alignment.py
@@ -10,6 +10,7 @@
from loguru import logger
+from openjudge.graders.agent.utils import format_history
from openjudge.graders.base_grader import GraderMode, GraderScore
from openjudge.graders.llm_grader import LLMGrader
from openjudge.models.base_chat_model import BaseChatModel
@@ -189,29 +190,6 @@ def __init__(
language=language,
)
- def _format_history(self, history: Optional[list] = None) -> str:
- """Format history steps for evaluation.
-
- Args:
- history: Optional list of previous step dictionaries
-
- Returns:
- Formatted history string, or empty string if no history
- """
- if not history:
- return ""
-
- lines = [""]
- for i, hist_step in enumerate(history):
- lines.append(f"Step {i + 1}:")
- for key, value in hist_step.items():
- if value:
- lines.append(f"{key.capitalize()}: {value}")
- lines.append("")
- lines.append("")
-
- return "\n".join(lines)
-
async def aevaluate(
self,
plan: str,
@@ -244,7 +222,7 @@ async def aevaluate(
context_str = f"\n{context}\n"
# Format history
- history_str = self._format_history(history)
+ history_str = format_history(history)
try:
result = await super().aevaluate(
diff --git a/openjudge/graders/agent/memory/memory_accuracy.py b/openjudge/graders/agent/memory/memory_accuracy.py
index dd7067b5..da23a424 100644
--- a/openjudge/graders/agent/memory/memory_accuracy.py
+++ b/openjudge/graders/agent/memory/memory_accuracy.py
@@ -10,6 +10,7 @@
from loguru import logger
+from openjudge.graders.agent.utils import format_history
from openjudge.graders.base_grader import GraderMode, GraderScore
from openjudge.graders.llm_grader import LLMGrader
from openjudge.models.base_chat_model import BaseChatModel
@@ -180,29 +181,6 @@ def __init__(
language=language,
)
- def _format_history(self, history: Optional[list] = None) -> str:
- """Format history steps for evaluation.
-
- Args:
- history: Optional list of previous step dictionaries
-
- Returns:
- Formatted history string, or empty string if no history
- """
- if not history:
- return ""
-
- lines = [""]
- for i, hist_step in enumerate(history):
- lines.append(f"Step {i + 1}:")
- for key, value in hist_step.items():
- if value:
- lines.append(f"{key.capitalize()}: {value}")
- lines.append("")
- lines.append("")
-
- return "\n".join(lines)
-
async def aevaluate(
self,
observation: str,
@@ -237,7 +215,7 @@ async def aevaluate(
context_str = f"\n{context}\n"
# Format history
- history_str = self._format_history(history)
+ history_str = format_history(history)
try:
result = await super().aevaluate(
diff --git a/openjudge/graders/agent/memory/memory_detail_preservation.py b/openjudge/graders/agent/memory/memory_detail_preservation.py
index 28b3eb6c..c3c79c83 100644
--- a/openjudge/graders/agent/memory/memory_detail_preservation.py
+++ b/openjudge/graders/agent/memory/memory_detail_preservation.py
@@ -10,6 +10,7 @@
from loguru import logger
+from openjudge.graders.agent.utils import format_history
from openjudge.graders.base_grader import GraderMode, GraderScore
from openjudge.graders.llm_grader import LLMGrader
from openjudge.models.base_chat_model import BaseChatModel
@@ -180,29 +181,6 @@ def __init__(
language=language,
)
- def _format_history(self, history: Optional[list] = None) -> str:
- """Format history steps for evaluation.
-
- Args:
- history: Optional list of previous step dictionaries
-
- Returns:
- Formatted history string, or empty string if no history
- """
- if not history:
- return ""
-
- lines = [""]
- for i, hist_step in enumerate(history):
- lines.append(f"Step {i + 1}:")
- for key, value in hist_step.items():
- if value:
- lines.append(f"{key.capitalize()}: {value}")
- lines.append("")
- lines.append("")
-
- return "\n".join(lines)
-
async def aevaluate(
self,
observation: str,
@@ -237,7 +215,7 @@ async def aevaluate(
context_str = f"\n{context}\n"
# Format history
- history_str = self._format_history(history)
+ history_str = format_history(history)
try:
result = await super().aevaluate(
diff --git a/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py b/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py
index 2cb9e5b1..2204706c 100644
--- a/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py
+++ b/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py
@@ -10,6 +10,7 @@
from loguru import logger
+from openjudge.graders.agent.utils import format_history
from openjudge.graders.base_grader import GraderMode, GraderScore
from openjudge.graders.llm_grader import LLMGrader
from openjudge.models.base_chat_model import BaseChatModel
@@ -183,29 +184,6 @@ def __init__(
language=language,
)
- def _format_history(self, history: Optional[list] = None) -> str:
- """Format history steps for evaluation.
-
- Args:
- history: Optional list of previous step dictionaries
-
- Returns:
- Formatted history string, or empty string if no history
- """
- if not history:
- return ""
-
- lines = [""]
- for i, hist_step in enumerate(history):
- lines.append(f"Step {i + 1}:")
- for key, value in hist_step.items():
- if value:
- lines.append(f"{key.capitalize()}: {value}")
- lines.append("")
- lines.append("")
-
- return "\n".join(lines)
-
async def aevaluate(
self,
plan: str,
@@ -243,7 +221,7 @@ async def aevaluate(
context_str = f"\n{context}\n"
# Format history
- history_str = self._format_history(history)
+ history_str = format_history(history)
try:
result = await super().aevaluate(
diff --git a/openjudge/graders/agent/plan/plan_feasibility.py b/openjudge/graders/agent/plan/plan_feasibility.py
index ebf4dd68..c69686e3 100644
--- a/openjudge/graders/agent/plan/plan_feasibility.py
+++ b/openjudge/graders/agent/plan/plan_feasibility.py
@@ -10,6 +10,7 @@
from loguru import logger
+from openjudge.graders.agent.utils import format_history
from openjudge.graders.base_grader import GraderMode, GraderScore
from openjudge.graders.llm_grader import LLMGrader
from openjudge.models.base_chat_model import BaseChatModel
@@ -183,29 +184,6 @@ def __init__(
language=language,
)
- def _format_history(self, history: Optional[list] = None) -> str:
- """Format history steps for evaluation.
-
- Args:
- history: Optional list of previous step dictionaries
-
- Returns:
- Formatted history string, or empty string if no history
- """
- if not history:
- return ""
-
- lines = [""]
- for i, hist_step in enumerate(history):
- lines.append(f"Step {i + 1}:")
- for key, value in hist_step.items():
- if value:
- lines.append(f"{key.capitalize()}: {value}")
- lines.append("")
- lines.append("")
-
- return "\n".join(lines)
-
async def aevaluate(
self,
plan: str,
@@ -243,7 +221,7 @@ async def aevaluate(
context_str = f"\n{context}\n"
# Format history
- history_str = self._format_history(history)
+ history_str = format_history(history)
try:
result = await super().aevaluate(
diff --git a/openjudge/graders/agent/reflection/reflection_accuracy.py b/openjudge/graders/agent/reflection/reflection_accuracy.py
index 296cfab7..8b54d462 100644
--- a/openjudge/graders/agent/reflection/reflection_accuracy.py
+++ b/openjudge/graders/agent/reflection/reflection_accuracy.py
@@ -10,6 +10,7 @@
from loguru import logger
+from openjudge.graders.agent.utils import format_history
from openjudge.graders.base_grader import GraderMode, GraderScore
from openjudge.graders.llm_grader import LLMGrader
from openjudge.models.base_chat_model import BaseChatModel
@@ -180,29 +181,6 @@ def __init__(
language=language,
)
- def _format_history(self, history: Optional[list] = None) -> str:
- """Format history steps for evaluation.
-
- Args:
- history: Optional list of previous step dictionaries
-
- Returns:
- Formatted history string, or empty string if no history
- """
- if not history:
- return ""
-
- lines = [""]
- for i, hist_step in enumerate(history):
- lines.append(f"Step {i + 1}:")
- for key, value in hist_step.items():
- if value:
- lines.append(f"{key.capitalize()}: {value}")
- lines.append("")
- lines.append("")
-
- return "\n".join(lines)
-
async def aevaluate(
self,
observation: str,
@@ -237,7 +215,7 @@ async def aevaluate(
context_str = f"\n{context}\n"
# Format history
- history_str = self._format_history(history)
+ history_str = format_history(history)
try:
result = await super().aevaluate(
diff --git a/openjudge/graders/agent/reflection/reflection_outcome_understanding.py b/openjudge/graders/agent/reflection/reflection_outcome_understanding.py
index 6a15b2ef..e7ce341f 100644
--- a/openjudge/graders/agent/reflection/reflection_outcome_understanding.py
+++ b/openjudge/graders/agent/reflection/reflection_outcome_understanding.py
@@ -11,6 +11,7 @@
from loguru import logger
+from openjudge.graders.agent.utils import format_history
from openjudge.graders.base_grader import GraderMode, GraderScore
from openjudge.graders.llm_grader import LLMGrader
from openjudge.models.base_chat_model import BaseChatModel
@@ -304,29 +305,6 @@ def __init__(
language=language,
)
- def _format_history(self, history: Optional[list] = None) -> str:
- """Format history steps for evaluation.
-
- Args:
- history: Optional list of previous step dictionaries
-
- Returns:
- Formatted history string, or empty string if no history
- """
- if not history:
- return ""
-
- lines = [""]
- for i, hist_step in enumerate(history):
- lines.append(f"Step {i + 1}:")
- for key, value in hist_step.items():
- if value:
- lines.append(f"{key.capitalize()}: {value}")
- lines.append("")
- lines.append("")
-
- return "\n".join(lines)
-
async def aevaluate(
self,
observation: str,
@@ -361,7 +339,7 @@ async def aevaluate(
context_str = f"\n{context}\n"
# Format history
- history_str = self._format_history(history)
+ history_str = format_history(history)
try:
result = await super().aevaluate(
diff --git a/openjudge/graders/agent/reflection/reflection_progress_awareness.py b/openjudge/graders/agent/reflection/reflection_progress_awareness.py
index 896b2cb3..8d1d0f34 100644
--- a/openjudge/graders/agent/reflection/reflection_progress_awareness.py
+++ b/openjudge/graders/agent/reflection/reflection_progress_awareness.py
@@ -11,6 +11,7 @@
from loguru import logger
+from openjudge.graders.agent.utils import format_history
from openjudge.graders.base_grader import GraderMode, GraderScore
from openjudge.graders.llm_grader import LLMGrader
from openjudge.models.base_chat_model import BaseChatModel
@@ -221,29 +222,6 @@ def __init__(
language=language,
)
- def _format_history(self, history: Optional[list] = None) -> str:
- """Format history steps for evaluation.
-
- Args:
- history: Optional list of previous step dictionaries
-
- Returns:
- Formatted history string, or empty string if no history
- """
- if not history:
- return ""
-
- lines = [""]
- for i, hist_step in enumerate(history):
- lines.append(f"Step {i + 1}:")
- for key, value in hist_step.items():
- if value:
- lines.append(f"{key.capitalize()}: {value}")
- lines.append("")
- lines.append("")
-
- return "\n".join(lines)
-
async def aevaluate(
self,
observation: str,
@@ -278,7 +256,7 @@ async def aevaluate(
context_str = f"\n{context}\n"
# Format history
- history_str = self._format_history(history)
+ history_str = format_history(history)
try:
result = await super().aevaluate(
diff --git a/openjudge/graders/agent/utils.py b/openjudge/graders/agent/utils.py
index 32b76038..17af90a4 100644
--- a/openjudge/graders/agent/utils.py
+++ b/openjudge/graders/agent/utils.py
@@ -4,7 +4,7 @@
This module provides utility functions for analyzing agent action behaviors,
including action-observation pair extraction and similarity calculations.
"""
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Optional, Tuple
from loguru import logger
from sklearn.feature_extraction.text import TfidfVectorizer
@@ -147,8 +147,33 @@ def calculate_semantic_similarity(text1: str, text2: str) -> float:
return 0.0
+def format_history(history: Optional[list] = None) -> str:
+    """Render previous agent steps as a numbered block of text.
+
+    Every step is written as a ``Step N:`` heading (numbered from 1),
+    followed by one ``Key: value`` line per entry whose value is truthy,
+    and closed with an empty line.
+
+    Args:
+        history: Optional list of previous step dictionaries
+
+    Returns:
+        The formatted history text, or an empty string when ``history``
+        is ``None`` or empty
+    """
+    if not history:
+        return ""
+
+    # The leading empty entry makes the joined text begin with a newline.
+    rendered = [""]
+    for step_number, step in enumerate(history, start=1):
+        rendered.append(f"Step {step_number}:")
+        # Entries with falsy values (e.g. empty strings) are left out.
+        rendered.extend(
+            f"{name.capitalize()}: {content}"
+            for name, content in step.items()
+            if content
+        )
+        rendered.append("")
+    # The trailing empty entry makes the joined text end with a newline.
+    rendered.append("")
+
+    return "\n".join(rendered)
+
+
+# Names exported when this module is imported with ``from ... import *``.
__all__ = [
"extract_action_observation_pairs",
"calculate_text_similarity",
"calculate_semantic_similarity",
+ "format_history",
]