feat: add a common utility function for formatting history in agent graders. (#42)

weizhang25 · web-flow · commit 10844f00a345 · 2026-01-08T10:33:16.000+08:00
diff --git a/openjudge/graders/agent/action/action_alignment.py b/openjudge/graders/agent/action/action_alignment.py
@@ -10,6 +10,7 @@
 
 from loguru import logger
 
+from openjudge.graders.agent.utils import format_history
 from openjudge.graders.base_grader import GraderMode, GraderScore
 from openjudge.graders.llm_grader import LLMGrader
 from openjudge.models.base_chat_model import BaseChatModel
@@ -189,29 +190,6 @@ def __init__(
             language=language,
         )
 
-    def _format_history(self, history: Optional[list] = None) -> str:
-        """Format history steps for evaluation.
-
-        Args:
-            history: Optional list of previous step dictionaries
-
-        Returns:
-            Formatted history string, or empty string if no history
-        """
-        if not history:
-            return ""
-
-        lines = ["<History Steps>"]
-        for i, hist_step in enumerate(history):
-            lines.append(f"Step {i + 1}:")
-            for key, value in hist_step.items():
-                if value:
-                    lines.append(f"{key.capitalize()}: {value}")
-            lines.append("")
-        lines.append("</History Steps>")
-
-        return "\n".join(lines)
-
     async def aevaluate(
         self,
         plan: str,
@@ -244,7 +222,7 @@ async def aevaluate(
             context_str = f"<context>\n{context}\n</context>"
 
         # Format history
-        history_str = self._format_history(history)
+        history_str = format_history(history)
 
         try:
             result = await super().aevaluate(
diff --git a/openjudge/graders/agent/memory/memory_accuracy.py b/openjudge/graders/agent/memory/memory_accuracy.py
@@ -10,6 +10,7 @@
 
 from loguru import logger
 
+from openjudge.graders.agent.utils import format_history
 from openjudge.graders.base_grader import GraderMode, GraderScore
 from openjudge.graders.llm_grader import LLMGrader
 from openjudge.models.base_chat_model import BaseChatModel
@@ -180,29 +181,6 @@ def __init__(
             language=language,
         )
 
-    def _format_history(self, history: Optional[list] = None) -> str:
-        """Format history steps for evaluation.
-
-        Args:
-            history: Optional list of previous step dictionaries
-
-        Returns:
-            Formatted history string, or empty string if no history
-        """
-        if not history:
-            return ""
-
-        lines = ["<History Steps>"]
-        for i, hist_step in enumerate(history):
-            lines.append(f"Step {i + 1}:")
-            for key, value in hist_step.items():
-                if value:
-                    lines.append(f"{key.capitalize()}: {value}")
-            lines.append("")
-        lines.append("</History Steps>")
-
-        return "\n".join(lines)
-
     async def aevaluate(
         self,
         observation: str,
@@ -237,7 +215,7 @@ async def aevaluate(
             context_str = f"<context>\n{context}\n</context>"
 
         # Format history
-        history_str = self._format_history(history)
+        history_str = format_history(history)
 
         try:
             result = await super().aevaluate(
diff --git a/openjudge/graders/agent/memory/memory_detail_preservation.py b/openjudge/graders/agent/memory/memory_detail_preservation.py
@@ -10,6 +10,7 @@
 
 from loguru import logger
 
+from openjudge.graders.agent.utils import format_history
 from openjudge.graders.base_grader import GraderMode, GraderScore
 from openjudge.graders.llm_grader import LLMGrader
 from openjudge.models.base_chat_model import BaseChatModel
@@ -180,29 +181,6 @@ def __init__(
             language=language,
         )
 
-    def _format_history(self, history: Optional[list] = None) -> str:
-        """Format history steps for evaluation.
-
-        Args:
-            history: Optional list of previous step dictionaries
-
-        Returns:
-            Formatted history string, or empty string if no history
-        """
-        if not history:
-            return ""
-
-        lines = ["<History Steps>"]
-        for i, hist_step in enumerate(history):
-            lines.append(f"Step {i + 1}:")
-            for key, value in hist_step.items():
-                if value:
-                    lines.append(f"{key.capitalize()}: {value}")
-            lines.append("")
-        lines.append("</History Steps>")
-
-        return "\n".join(lines)
-
     async def aevaluate(
         self,
         observation: str,
@@ -237,7 +215,7 @@ async def aevaluate(
             context_str = f"<context>\n{context}\n</context>"
 
         # Format history
-        history_str = self._format_history(history)
+        history_str = format_history(history)
 
         try:
             result = await super().aevaluate(
diff --git a/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py b/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py
@@ -10,6 +10,7 @@
 
 from loguru import logger
 
+from openjudge.graders.agent.utils import format_history
 from openjudge.graders.base_grader import GraderMode, GraderScore
 from openjudge.graders.llm_grader import LLMGrader
 from openjudge.models.base_chat_model import BaseChatModel
@@ -183,29 +184,6 @@ def __init__(
             language=language,
         )
 
-    def _format_history(self, history: Optional[list] = None) -> str:
-        """Format history steps for evaluation.
-
-        Args:
-            history: Optional list of previous step dictionaries
-
-        Returns:
-            Formatted history string, or empty string if no history
-        """
-        if not history:
-            return ""
-
-        lines = ["<History Steps>"]
-        for i, hist_step in enumerate(history):
-            lines.append(f"Step {i + 1}:")
-            for key, value in hist_step.items():
-                if value:
-                    lines.append(f"{key.capitalize()}: {value}")
-            lines.append("")
-        lines.append("</History Steps>")
-
-        return "\n".join(lines)
-
     async def aevaluate(
         self,
         plan: str,
@@ -243,7 +221,7 @@ async def aevaluate(
             context_str = f"<context>\n{context}\n</context>"
 
         # Format history
-        history_str = self._format_history(history)
+        history_str = format_history(history)
 
         try:
             result = await super().aevaluate(
diff --git a/openjudge/graders/agent/plan/plan_feasibility.py b/openjudge/graders/agent/plan/plan_feasibility.py
@@ -10,6 +10,7 @@
 
 from loguru import logger
 
+from openjudge.graders.agent.utils import format_history
 from openjudge.graders.base_grader import GraderMode, GraderScore
 from openjudge.graders.llm_grader import LLMGrader
 from openjudge.models.base_chat_model import BaseChatModel
@@ -183,29 +184,6 @@ def __init__(
             language=language,
         )
 
-    def _format_history(self, history: Optional[list] = None) -> str:
-        """Format history steps for evaluation.
-
-        Args:
-            history: Optional list of previous step dictionaries
-
-        Returns:
-            Formatted history string, or empty string if no history
-        """
-        if not history:
-            return ""
-
-        lines = ["<History Steps>"]
-        for i, hist_step in enumerate(history):
-            lines.append(f"Step {i + 1}:")
-            for key, value in hist_step.items():
-                if value:
-                    lines.append(f"{key.capitalize()}: {value}")
-            lines.append("")
-        lines.append("</History Steps>")
-
-        return "\n".join(lines)
-
     async def aevaluate(
         self,
         plan: str,
@@ -243,7 +221,7 @@ async def aevaluate(
             context_str = f"<context>\n{context}\n</context>"
 
         # Format history
-        history_str = self._format_history(history)
+        history_str = format_history(history)
 
         try:
             result = await super().aevaluate(
diff --git a/openjudge/graders/agent/reflection/reflection_accuracy.py b/openjudge/graders/agent/reflection/reflection_accuracy.py
@@ -10,6 +10,7 @@
 
 from loguru import logger
 
+from openjudge.graders.agent.utils import format_history
 from openjudge.graders.base_grader import GraderMode, GraderScore
 from openjudge.graders.llm_grader import LLMGrader
 from openjudge.models.base_chat_model import BaseChatModel
@@ -180,29 +181,6 @@ def __init__(
             language=language,
         )
 
-    def _format_history(self, history: Optional[list] = None) -> str:
-        """Format history steps for evaluation.
-
-        Args:
-            history: Optional list of previous step dictionaries
-
-        Returns:
-            Formatted history string, or empty string if no history
-        """
-        if not history:
-            return ""
-
-        lines = ["<History Steps>"]
-        for i, hist_step in enumerate(history):
-            lines.append(f"Step {i + 1}:")
-            for key, value in hist_step.items():
-                if value:
-                    lines.append(f"{key.capitalize()}: {value}")
-            lines.append("")
-        lines.append("</History Steps>")
-
-        return "\n".join(lines)
-
     async def aevaluate(
         self,
         observation: str,
@@ -237,7 +215,7 @@ async def aevaluate(
             context_str = f"<context>\n{context}\n</context>"
 
         # Format history
-        history_str = self._format_history(history)
+        history_str = format_history(history)
 
         try:
             result = await super().aevaluate(
diff --git a/openjudge/graders/agent/reflection/reflection_outcome_understanding.py b/openjudge/graders/agent/reflection/reflection_outcome_understanding.py
@@ -11,6 +11,7 @@
 
 from loguru import logger
 
+from openjudge.graders.agent.utils import format_history
 from openjudge.graders.base_grader import GraderMode, GraderScore
 from openjudge.graders.llm_grader import LLMGrader
 from openjudge.models.base_chat_model import BaseChatModel
@@ -304,29 +305,6 @@ def __init__(
             language=language,
         )
 
-    def _format_history(self, history: Optional[list] = None) -> str:
-        """Format history steps for evaluation.
-
-        Args:
-            history: Optional list of previous step dictionaries
-
-        Returns:
-            Formatted history string, or empty string if no history
-        """
-        if not history:
-            return ""
-
-        lines = ["<History Steps>"]
-        for i, hist_step in enumerate(history):
-            lines.append(f"Step {i + 1}:")
-            for key, value in hist_step.items():
-                if value:
-                    lines.append(f"{key.capitalize()}: {value}")
-            lines.append("")
-        lines.append("</History Steps>")
-
-        return "\n".join(lines)
-
     async def aevaluate(
         self,
         observation: str,
@@ -361,7 +339,7 @@ async def aevaluate(
             context_str = f"<context>\n{context}\n</context>"
 
         # Format history
-        history_str = self._format_history(history)
+        history_str = format_history(history)
 
         try:
             result = await super().aevaluate(
diff --git a/openjudge/graders/agent/reflection/reflection_progress_awareness.py b/openjudge/graders/agent/reflection/reflection_progress_awareness.py
@@ -11,6 +11,7 @@
 
 from loguru import logger
 
+from openjudge.graders.agent.utils import format_history
 from openjudge.graders.base_grader import GraderMode, GraderScore
 from openjudge.graders.llm_grader import LLMGrader
 from openjudge.models.base_chat_model import BaseChatModel
@@ -221,29 +222,6 @@ def __init__(
             language=language,
         )
 
-    def _format_history(self, history: Optional[list] = None) -> str:
-        """Format history steps for evaluation.
-
-        Args:
-            history: Optional list of previous step dictionaries
-
-        Returns:
-            Formatted history string, or empty string if no history
-        """
-        if not history:
-            return ""
-
-        lines = ["<History Steps>"]
-        for i, hist_step in enumerate(history):
-            lines.append(f"Step {i + 1}:")
-            for key, value in hist_step.items():
-                if value:
-                    lines.append(f"{key.capitalize()}: {value}")
-            lines.append("")
-        lines.append("</History Steps>")
-
-        return "\n".join(lines)
-
     async def aevaluate(
         self,
         observation: str,
@@ -278,7 +256,7 @@ async def aevaluate(
             context_str = f"<context>\n{context}\n</context>"
 
         # Format history
-        history_str = self._format_history(history)
+        history_str = format_history(history)
 
         try:
             result = await super().aevaluate(
diff --git a/openjudge/graders/agent/utils.py b/openjudge/graders/agent/utils.py