Skip to content

Commit fa8af37

Browse files
authored
Expose templates of LLM grader instances and default templates of LLM grader classes. (#124)
* Expose templates of grader instances and default templates of grader classes. * Resolve code review feedback. * Update function argument type annotation.
1 parent 020f69b commit fa8af37

File tree

4 files changed

+80
-17
lines changed

4 files changed

+80
-17
lines changed

openjudge/graders/agent/action/action_alignment.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -170,10 +170,12 @@ class ActionAlignmentGrader(LLMGrader):
170170
>>> print(f"Score: {result.score}") # Expected: 1.0
171171
"""
172172

173+
DEFAULT_TEMPLATE = DEFAULT_ACTION_ALIGNMENT_TEMPLATE
174+
173175
def __init__(
174176
self,
175177
model: BaseChatModel | dict,
176-
template: Optional[PromptTemplate] = DEFAULT_ACTION_ALIGNMENT_TEMPLATE,
178+
template: Optional[PromptTemplate] = DEFAULT_TEMPLATE,
177179
language: LanguageEnum = LanguageEnum.EN,
178180
strategy: BaseEvaluationStrategy | None = None,
179181
):
@@ -183,7 +185,7 @@ def __init__(
183185
Args:
184186
model: The chat model to use for evaluation, either as a BaseChatModel instance or config dict
185187
template: The prompt template for action alignment evaluation.
186-
Defaults to DEFAULT_ACTION_ALIGNMENT_TEMPLATE.
188+
Defaults to DEFAULT_TEMPLATE.
187189
language: The language for the evaluation prompt. Defaults to LanguageEnum.EN.
188190
strategy: The evaluation strategy to use. Defaults to DirectStrategy.
189191
"""
@@ -192,7 +194,7 @@ def __init__(
192194
mode=GraderMode.POINTWISE,
193195
description="Evaluate action alignment with plan",
194196
model=model,
195-
template=template or DEFAULT_ACTION_ALIGNMENT_TEMPLATE,
197+
template=template or self.DEFAULT_TEMPLATE,
196198
language=language,
197199
strategy=strategy,
198200
)

openjudge/graders/llm_grader.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ class LLMGrader(BaseGrader):
5353
callback (Callable): Function to process model response metadata.
5454
"""
5555

56+
# The default template value is just a placeholder.
57+
# Extended classes must set proper value to DEFAULT_TEMPLATE
58+
DEFAULT_TEMPLATE = PromptTemplate(messages={})
59+
5660
def __init__(
5761
self,
5862
model: BaseChatModel | dict,
@@ -108,6 +112,9 @@ def __init__(
108112
else:
109113
self.language = language
110114

115+
if not template:
116+
raise ValueError("Missing template argument value")
117+
111118
if isinstance(template, str):
112119
self.template = PromptTemplate(
113120
messages={
@@ -343,6 +350,15 @@ async def _aevaluate(self, **kwargs: Any) -> GraderScore | GraderRank:
343350
raise ValueError(f"Unsupported grader mode: {self.mode}")
344351
return result
345352

353+
def get_template(self, language: LanguageEnum = LanguageEnum.EN) -> Dict[str, Any]:
354+
"""Return the template of the specified language in this grader instance"""
355+
return self.template.get_prompt(language)
356+
357+
@classmethod
358+
def get_default_template(cls, language: LanguageEnum = LanguageEnum.EN) -> Dict[str, Any]:
359+
"""Return the default template of the specified language in this grader class"""
360+
return cls.DEFAULT_TEMPLATE.get_prompt(language)
361+
346362
@staticmethod
347363
def get_metadata() -> Dict[str, Any]:
348364
"""Return the docstring of the aevaluate method to explain how LLMGrader works with LLM."""

tests/graders/agent/action/test_action_alignment.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,28 @@ def test_initialization(self):
6161
assert grader.name == "action_alignment"
6262
assert grader.model == mock_model
6363

64+
language_template = grader.get_template(LanguageEnum.ZH)
65+
assert len(language_template) == 1
66+
assert "zh" in language_template
67+
template = language_template["zh"]
68+
assert len(template) == 1
69+
assert len(template[0]) == 2
70+
assert template[0]["role"] == "user"
71+
assert template[0]["content"].startswith(
72+
"你是一名分析智能体行为的专家。你的任务是评估智能体是否执行了与其声明的计划或推理一致的动作。"
73+
)
74+
75+
language_template = grader.get_default_template(LanguageEnum.EN)
76+
assert len(language_template) == 1
77+
assert "en" in language_template
78+
template = language_template["en"]
79+
assert len(template) == 1
80+
assert len(template[0]) == 2
81+
assert template[0]["role"] == "user"
82+
assert template[0]["content"].startswith(
83+
"You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent executes an action that aligns with its stated plan or reasoning."
84+
)
85+
6486
@pytest.mark.asyncio
6587
async def test_successful_evaluation_aligned(self):
6688
"""Test successful evaluation with good alignment"""
@@ -156,12 +178,8 @@ async def test_error_handling(self):
156178
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
157179
RUN_QUALITY_TESTS = bool(OPENAI_API_KEY and OPENAI_BASE_URL)
158180

159-
pytestmark = pytest.mark.skipif(
160-
not RUN_QUALITY_TESTS,
161-
reason="Requires API keys and base URL to run quality tests",
162-
)
163-
164181

182+
@pytest.mark.skipif(not RUN_QUALITY_TESTS, reason="Requires API keys and base URL to run quality tests")
165183
@pytest.mark.quality
166184
class TestActionAlignmentGraderQuality:
167185
"""Quality tests for ActionAlignmentGrader - testing evaluation quality"""

tests/graders/test_llm_grader.py

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
from openjudge.graders.llm_grader import LLMGrader
4343
from openjudge.graders.schema import GraderError
4444
from openjudge.models.openai_chat_model import OpenAIChatModel
45+
from openjudge.models.schema.prompt_template import LanguageEnum
4546
from openjudge.runner.grading_runner import GraderConfig, GradingRunner
4647

4748
# ==================== UNIT TESTS ====================
@@ -60,12 +61,18 @@ def test_initialization_failure_without_template(self):
6061
model=AsyncMock(),
6162
name="foo",
6263
)
64+
assert "Missing template argument value" in str(error_obj.value)
65+
66+
def test_initialization_failure_with_invalid_template_type(self):
67+
"""Test initialization failure without template"""
68+
with pytest.raises(ValueError) as error_obj:
69+
LLMGrader(model=AsyncMock(), name="foo", template=AsyncMock())
6370
assert "Template must be a str, list, dict or PromptTemplate object" in str(error_obj.value)
6471

6572
def test_initialization_with_string_template(self):
6673
"""Test successful initialization with string template"""
6774
mock_model = AsyncMock()
68-
template_str = """You're a LLM query answer relevance grader, you'll received Query/Response:
75+
template_str = """You're a LLM query answer relevance grader, you'll receive Query/Response:
6976
Query: {query}
7077
Response: {response}
7178
Please read query/response, if the Response answers the Query, return 1, return 0 if no.
@@ -98,7 +105,7 @@ def test_initialization_with_dict_template(self):
98105
},
99106
{
100107
"role": "user",
101-
"content": """You'll received Query/Response:
108+
"content": """You'll receive Query/Response:
102109
Query: {query}
103110
Response: {response}
104111
Please read query/response, if the Response answers the Query, return 1, return 0 if no.
@@ -139,7 +146,7 @@ def test_initialization_with_model_dict(self):
139146
"api_key": "test-key",
140147
}
141148

142-
template_str = """You're a LLM query answer relevance grader, you'll received Query/Response:
149+
template_str = """You're a LLM query answer relevance grader, you'll receive Query/Response:
143150
Query: {query}
144151
Response: {response}
145152
Please read query/response, if the Response answers the Query, return 1, return 0 if no.
@@ -158,8 +165,29 @@ def test_initialization_with_model_dict(self):
158165
)
159166

160167
assert grader.name == "test_llm_grader"
161-
assert isinstance(grader.model, OpenAIChatModel)
162168
# Note: We can't easily check the model config since it's private
169+
assert isinstance(grader.model, OpenAIChatModel)
170+
171+
language_template = grader.get_template()
172+
assert len(language_template) == 1
173+
assert LanguageEnum.EN in language_template
174+
templates = language_template[LanguageEnum.EN]
175+
assert len(templates) == 2
176+
for t in templates:
177+
assert len(t) == 2
178+
assert "role" in t
179+
assert "content" in t
180+
181+
if t["role"] == "system":
182+
assert (
183+
"You are a professional evaluation assistant. Please evaluate according to the user's requirements."
184+
in t["content"]
185+
)
186+
elif t["role"] == "user":
187+
assert "You're a LLM query answer relevance grader, you'll receive Query/Response" in t["content"]
188+
189+
default_template = grader.get_default_template()
190+
assert len(default_template) == 0
163191

164192
@pytest.mark.asyncio
165193
async def test_pointwise_evaluation_success(self):
@@ -217,7 +245,7 @@ async def test_listwise_evaluation_success(self):
217245
mock_model.achat = AsyncMock(return_value=mock_response)
218246

219247
# Create grader with template that follows the specification in docs
220-
template = """You're a LLM query answer ranking grader, you'll received Query and multiple Responses:
248+
template = """You're a LLM query answer ranking grader, you'll receive Query and multiple Responses:
221249
Query: {query}
222250
Responses:
223251
1. {response_1}
@@ -308,9 +336,8 @@ def test_serialization_methods(self):
308336
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
309337
RUN_QUALITY_TESTS = bool(OPENAI_API_KEY and OPENAI_BASE_URL)
310338

311-
pytestmark = pytest.mark.skipif(not RUN_QUALITY_TESTS, reason="Requires API keys and base URL to run quality tests")
312-
313339

340+
@pytest.mark.skipif(not RUN_QUALITY_TESTS, reason="Requires API keys and base URL to run quality tests")
314341
@pytest.mark.quality
315342
class TestLLMGraderQuality:
316343
"""Quality tests for LLMGrader - testing evaluation quality using golden dataset"""
@@ -361,7 +388,7 @@ def model(self):
361388
async def test_discriminative_power_with_runner(self, dataset, model):
362389
"""Test the grader's ability to distinguish between accurate and inaccurate responses (using Runner)"""
363390
# Create grader with real model following the specification in docs
364-
template = """You're a LLM query answer accuracy grader, you'll received Query/Response and Context:
391+
template = """You're a LLM query answer accuracy grader, you'll receive Query/Response and Context:
365392
Query: {query}
366393
Response: {response}
367394
Context: {context}
@@ -420,7 +447,7 @@ async def test_discriminative_power_with_runner(self, dataset, model):
420447
async def test_consistency_with_runner(self, dataset, model):
421448
"""Test grader evaluation consistency (using Runner)"""
422449
# Create grader with real model following the specification in docs
423-
template = """You're a LLM query answer accuracy grader, you'll received Query/Response and Context:
450+
template = """You're a LLM query answer accuracy grader, you'll receive Query/Response and Context:
424451
Query: {query}
425452
Response: {response}
426453
Context: {context}

0 commit comments

Comments
 (0)