Commit 90b8f49

Author: Keshav Ramji [email protected]

Commit message: PR update: jinja, pydantic

1 parent 6c7ad29 · commit 90b8f49

File tree

5 files changed: +127 -78 lines changed


cli/eval/commands.py

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,6 @@
+"""Use the eval command for LLM-as-a-judge evaluation, given a (set of) test file(s) consisting of prompts, instructions, and optionally, targets.
+Instantiate a generator model to produce candidate responses, and a judge model to determine whether the instructions have been followed."""
+
 import typer
 
 eval_app = typer.Typer(name="eval")

cli/eval/runner.py

Lines changed: 18 additions & 35 deletions
@@ -15,7 +15,7 @@
 
 
 class InputEvalResult:
-    """Store results of a single input evaluation (within a unit test)"""
+    """Store results of a single input evaluation (within a unit test)."""
 
     def __init__(
         self,
@@ -42,7 +42,7 @@ def to_dict(self):
 
 
 class TestEvalResult:
-    """Store results of a single test evaluation"""
+    """Store results of a single test evaluation."""
 
     def __init__(self, test_eval: TestBasedEval, input_results: list[InputEvalResult]):
         self.test_eval = test_eval
@@ -77,7 +77,7 @@ def pass_rate(self) -> float:
 def create_session(
     backend: str, model: str | None, max_tokens: int | None
 ) -> mellea.MelleaSession:
-    """Create a mellea session with the specified backend and model."""
+    """Create a mellea session with the specified backend and model."""
 
     model_id = None
     if model:
@@ -164,7 +164,15 @@ def run_evaluations(
     output_format: str,
     continue_on_error: bool,
 ):
-    """Run all 'unit test' evaluations"""
+    """Run all 'unit test' evaluations.
+
+    Each test file should be a JSON file containing:
+        "id": an id that is unique to this test file
+        "source": the origin of the evaluation prompts, else "N/A"
+        "name": the instruction-following attribute that the user intends to evaluate through this test
+        "instructions": a set of requirements (in string form) which the generation should follow; the judge evaluates whether these are satisfied
+        "examples": a list of entries, each containing an input_id, an input (prompt), and a list of targets. An input may have multiple (or no) targets; inputs and targets are in messages format.
+    """
     all_test_evals: List[TestBasedEval] = []
 
     for test_file in test_files:
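
For reference, a minimal test file matching the schema described in this docstring might look like the sketch below. All field values are illustrative (not taken from the repository); the only assumptions are the JSON keys listed above and the TestBasedEval.from_json_file loader shown later in this diff.

    import json
    import tempfile

    from mellea.stdlib.test_based_eval import TestBasedEval

    # Illustrative test file: one instruction-following test with a single example.
    test_file_contents = {
        "id": "formatting-001",  # hypothetical, unique to this test file
        "source": "N/A",  # origin of the evaluation prompts
        "name": "bullet-point formatting",  # attribute being evaluated
        "instructions": "Answer using exactly three bullet points.",
        "examples": [
            {
                "input_id": "ex-1",
                "input": [{"role": "user", "content": "List three benefits of unit tests."}],
                "targets": [
                    {
                        "role": "assistant",
                        "content": "- catch regressions\n- document behavior\n- enable safe refactoring",
                    }
                ],
            }
        ],
    }

    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump(test_file_contents, f)
        path = f.name

    # Parse and validate the file with the classmethod touched in this diff.
    test_evals = TestBasedEval.from_json_file(path)
    print(test_evals[0].test_id, test_evals[0].input_ids)  # formatting-001 ['ex-1']
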
@@ -230,7 +238,7 @@ def execute_test_eval(
 ) -> TestEvalResult:
     """Execute a single test evaluation
     For each input in the test, generate a response using generation_session
-    Then, after all inputs are processed, validate using judge_session
+    Then, after all inputs are processed, validate using judge_session.
     """
 
     input_results = []
@@ -245,10 +253,12 @@
         )
 
         # query the judge
-        judge_prompt = create_judge_requirement(
-            test_eval, input_text, model_output, targets_for_input
+        test_eval.set_judge_context(
+            input_text=input_text,
+            prediction=model_output,
+            targets_for_input=targets_for_input,
         )
-        judge_output_thunk = judge_session.act(judge_prompt)
+        judge_output_thunk = judge_session.act(test_eval)
         judge_output = str(judge_output_thunk)
         score, justification = parse_judge_output(judge_output)
         passed = score == 1 if score is not None else False
@@ -270,33 +280,6 @@ def execute_test_eval(
     return test_result
 
 
-def create_judge_requirement(
-    test_eval: TestBasedEval,
-    input_text: str,
-    model_output: str,
-    targets_for_input: list[str],
-):
-    """Create judge requirement description"""
-
-    if len(targets_for_input) == 0:  # no reference
-        target_text = "N/A"
-    elif len(targets_for_input) == 1:
-        target_text = targets_for_input[0]
-    else:  # enumerate when there are multiple targets
-        target_text = "\n".join(
-            [f"{i}. {target}" for i, target in enumerate(targets_for_input, 1)]
-        )
-
-    judge_prompt = test_eval.judge_prompt.format(
-        input=input_text,
-        prediction=model_output,
-        target=target_text,
-        guidelines=test_eval.instructions,
-    )
-
-    return judge_prompt
-
-
 def parse_judge_output(judge_output: str):
     try:
         json_match = re.search(r'\{[^}]*"score"[^}]*\}', judge_output, re.DOTALL)
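
For context on what parse_judge_output consumes: the judge is asked, via the new Jinja template at the end of this diff, to answer with a JSON object of the form {"score": ..., "justification": ...}. Below is a rough, self-contained re-implementation sketch of the parsing step using the same regex shown above; the real function may differ beyond the lines visible in this hunk.

    import json
    import re


    def parse_judge_output_sketch(judge_output: str) -> tuple[int | None, str]:
        """Hypothetical re-implementation for illustration: extract the first
        {...} object containing a "score" key from free-form judge text."""
        json_match = re.search(r'\{[^}]*"score"[^}]*\}', judge_output, re.DOTALL)
        if not json_match:
            return None, ""
        try:
            rating = json.loads(json_match.group(0))
        except json.JSONDecodeError:
            return None, ""
        return rating.get("score"), rating.get("justification", "")


    print(parse_judge_output_sketch('Rating: {"score": 1, "justification": "follows all guidelines"}'))
    # (1, 'follows all guidelines')
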

mellea/__init__.py

Lines changed: 1 addition & 2 deletions
@@ -3,6 +3,5 @@
 import mellea.backends.model_ids as model_ids
 from mellea.stdlib.genslot import generative
 from mellea.stdlib.session import MelleaSession, start_session
-from mellea.stdlib.test_based_eval import TestBasedEval
 
-__all__ = ["MelleaSession", "TestBasedEval", "generative", "model_ids", "start_session"]
+__all__ = ["MelleaSession", "generative", "model_ids", "start_session"]

mellea/stdlib/test_based_eval.py

Lines changed: 78 additions & 41 deletions
@@ -4,7 +4,42 @@
 from pathlib import Path
 from typing import Any
 
-from mellea.stdlib.base import Component
+from pydantic import BaseModel, Field, field_validator
+
+from mellea.stdlib.base import CBlock, Component, TemplateRepresentation
+
+
+class Message(BaseModel):
+    """Schema for a message in the test data."""
+
+    role: str
+    content: str
+
+
+class Example(BaseModel):
+    """Schema for an example in the test data."""
+
+    input: list[Message]
+    targets: list[Message] = Field(default_factory=list)
+    input_id: str = ""
+
+
+class TestData(BaseModel):
+    """Schema for test data loaded from json."""
+
+    source: str
+    name: str
+    instructions: str
+    examples: list[Example] = Field(default_factory=list)
+    id: str
+
+    @field_validator("examples")
+    @classmethod
+    def validate_examples(cls, v):
+        """Ensure examples list is not empty."""
+        if not v:
+            raise ValueError("examples list cannot be empty")
+        return v
 
 
 class TestBasedEval(Component):
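
These pydantic models replace the ad-hoc dict.get(...) parsing further down and fail fast on malformed test files. A small sketch of how the validation behaves, assuming the classes are importable from this module; all values are made up.

    from pydantic import ValidationError

    from mellea.stdlib.test_based_eval import Example, Message, TestData

    # A well-formed record: targets may be omitted and default to an empty list.
    record = TestData(
        id="t-1",
        source="N/A",
        name="politeness",
        instructions="Respond politely.",
        examples=[Example(input=[Message(role="user", content="Hi there")], input_id="i-1")],
    )
    print(record.examples[0].targets)  # []

    # The field validator rejects a test with no examples.
    try:
        TestData(id="t-2", source="N/A", name="politeness", instructions="Respond politely.", examples=[])
    except ValidationError as err:
        print(err)  # mentions "examples list cannot be empty"
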
@@ -29,32 +64,37 @@ def __init__(
         self.test_id = test_id
         self.input_ids = input_ids or []
 
-        self.judge_prompt = """**Input to the model**
-
-{input}
-
-**Model output to be rated**
-
-{prediction}
-
-**Ground truth text**
+    def parts(self) -> list[Component | CBlock]:
+        """The set of constituent parts of the Component."""
+        return []
 
-{target}
+    def format_for_llm(self) -> TemplateRepresentation:
+        """Formats the test for judge evaluation."""
+        return TemplateRepresentation(
+            obj=self,
+            args=self._judge_context if hasattr(self, "_judge_context") else {},
+            template_order=["*"],
+        )
 
-**Rating Guidelines**
-The model output should adhere to the following guidelines:
-{guidelines}
-
-**Scoring Criteria**
-* Score 0: The model output violates any of the guidelines.
-* Score 1: The model output is well aligned with the ground truth - if it exists, the input to the model, and adheres to all guidelines.
-
-**Return Your Rating**
-Return your rating in the following format:
-{{\"score\": your_score, \"justification\": \"your_justification\"}}
+    def set_judge_context(
+        self, input_text: str, prediction: str, targets_for_input: list[str]
+    ):
+        """Set context for judge evaluation."""
+        if len(targets_for_input) == 0:  # no reference
+            target_text = "N/A"
+        elif len(targets_for_input) == 1:
+            target_text = targets_for_input[0]
+        else:  # enumerate when there are multiple targets
+            target_text = "\n".join(
+                [f"{i}. {target}" for i, target in enumerate(targets_for_input, 1)]
+            )
 
-Your rating:
-"""
+        self._judge_context: dict[str, Any] = {
+            "input": input_text,
+            "prediction": prediction,
+            "target": target_text,
+            "guidelines": self.instructions,
+        }
 
     @classmethod
     def from_json_file(cls, filepath: str) -> list["TestBasedEval"]:
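
As a quick illustration of what set_judge_context stages for the template, the sketch below constructs a TestBasedEval with the same keyword arguments used by from_json_file in the next hunk and inspects _judge_context; every value is invented for the example.

    from mellea.stdlib.test_based_eval import TestBasedEval

    # Hypothetical test with two reference targets for a single input.
    test = TestBasedEval(
        source="N/A",
        name="conciseness",
        instructions="Keep the answer under two sentences.",
        inputs=["Summarize the plot of Hamlet."],
        targets=[["A prince avenges his father.", "Hamlet seeks revenge on Claudius."]],
        test_id="t-42",
        input_ids=["i-0"],
    )

    test.set_judge_context(
        input_text="Summarize the plot of Hamlet.",
        prediction="Hamlet is a long play set in Denmark.",
        targets_for_input=["A prince avenges his father.", "Hamlet seeks revenge on Claudius."],
    )

    # Multiple targets are enumerated as "1. ...\n2. ..." before being handed to the judge.
    print(test._judge_context["target"])
    print(test._judge_context["guidelines"])  # the test's instructions
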
@@ -68,38 +108,35 @@ def from_json_file(cls, filepath: str) -> list["TestBasedEval"]:
             data = [data]
 
         test_evals = []
-        for test_data in data:
-            examples = test_data.get("examples", [])
+        for test_data_dict in data:
+            try:
+                test_data = TestData(**test_data_dict)
+            except Exception as e:
+                raise ValueError(f"Invalid test data in {filepath}: {e}")
 
             inputs = []
             targets = []
             input_ids = []
 
-            for example in examples:
-                input_messages = example.get("input", [])
-                user_messages = [
-                    msg for msg in input_messages if msg.get("role") == "user"
-                ]
+            for example in test_data.examples:
+                user_messages = [msg for msg in example.input if msg.role == "user"]
                 if user_messages:
-                    inputs.append(user_messages[-1].get("content", ""))
+                    inputs.append(user_messages[-1].content)
 
-                target_messages = example.get("targets", [])
                 targets_for_input = [
-                    msg.get("content", "")
-                    for msg in target_messages
-                    if msg.get("role") == "assistant"
+                    msg.content for msg in example.targets if msg.role == "assistant"
                 ]
                 targets.append(targets_for_input)
 
-                input_ids.append(example.get("input_id", ""))
+                input_ids.append(example.input_id)
 
             test_eval = cls(
-                source=test_data.get("source", "unknown"),
-                name=test_data.get("name", ""),
-                instructions=test_data.get("instructions", ""),
+                source=test_data.source,
+                name=test_data.name,
+                instructions=test_data.instructions,
                 inputs=inputs,
                 targets=targets,
-                test_id=test_data.get("id", ""),
+                test_id=test_data.id,
                 input_ids=input_ids,
             )
             test_evals.append(test_eval)

New file (Jinja judge-prompt template)

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+**Input to the model**
+
+{{ input }}
+
+**Model output to be rated**
+
+{{ prediction }}
+
+{% if target and target != "N/A" %}
+**Ground truth text**
+
+{{ target }}
+{% endif %}
+
+**Rating Guidelines**
+The model output should adhere to the following guidelines:
+{{ guidelines }}
+
+**Scoring Criteria**
+* Score 0: The model output violates any of the guidelines.
+* Score 1: The model output is well aligned with the ground truth{% if target and target != "N/A" %} - if it exists{% endif %}, the input to the model, and adheres to all guidelines.
+
+**Return Your Rating**
+Return your rating in the following format:
+{"score": your_score, "justification": "your_justification"}
+
+Your rating:
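
The {{ ... }} placeholders correspond to the keys staged in _judge_context (input, prediction, target, guidelines). As a rough preview of the rendered prompt, the sketch below pushes sample values through jinja2 directly; this sidesteps mellea's TemplateRepresentation plumbing and only approximates the text the judge would see. The template string is abbreviated and the values are invented.

    from jinja2 import Template

    # Abbreviated copy of the judge-prompt template above.
    judge_template = Template(
        "**Input to the model**\n\n{{ input }}\n\n"
        "**Model output to be rated**\n\n{{ prediction }}\n\n"
        "{% if target and target != \"N/A\" %}**Ground truth text**\n\n{{ target }}\n\n{% endif %}"
        "**Rating Guidelines**\n"
        "The model output should adhere to the following guidelines:\n{{ guidelines }}\n"
    )

    print(
        judge_template.render(
            input="Summarize the plot of Hamlet.",
            prediction="Hamlet is a long play set in Denmark.",
            target="N/A",  # with no reference, the ground-truth block is skipped
            guidelines="Keep the answer under two sentences.",
        )
    )
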
