Commit 1caeda0

Improve serialization of LLMJudge and custom evaluators (#1367)
1 parent ea34fc9 commit 1caeda0

File tree: 7 files changed, +105 -21 lines
Lines changed: 2 additions & 0 deletions

@@ -1,3 +1,5 @@
 # `pydantic_evals.evaluators`

 ::: pydantic_evals.evaluators
+
+::: pydantic_evals.evaluators.llm_as_a_judge

pydantic_ai_slim/pydantic_ai/models/openai.py

Lines changed: 1 addition & 1 deletion

@@ -150,7 +150,7 @@ class OpenAIModel(Model):
     """

     client: AsyncOpenAI = field(repr=False)
-    system_prompt_role: OpenAISystemPromptRole | None = field(default=None)
+    system_prompt_role: OpenAISystemPromptRole | None = field(default=None, repr=False)

     _model_name: OpenAIModelName = field(repr=False)
     _system: str = field(default='openai', repr=False)

pydantic_evals/pydantic_evals/evaluators/common.py

Lines changed: 17 additions & 2 deletions

@@ -155,10 +155,14 @@ def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> bool:

 @dataclass
 class LLMJudge(Evaluator[object, object, object]):
-    """Judge whether the output of a language model meets the criteria of a provided rubric."""
+    """Judge whether the output of a language model meets the criteria of a provided rubric.
+
+    If you do not specify a model, it uses the default model for judging. This starts as 'openai:gpt-4o', but can be
+    overridden by calling [`set_default_judge_model`][pydantic_evals.evaluators.llm_as_a_judge.set_default_judge_model].
+    """

     rubric: str
-    model: models.Model | models.KnownModelName = 'openai:gpt-4o'
+    model: models.Model | models.KnownModelName | None = None
     include_input: bool = False

     async def evaluate(
@@ -175,6 +179,17 @@ async def evaluate(
         grading_output = await judge_output(ctx.output, self.rubric, self.model)
         return EvaluationReason(value=grading_output.pass_, reason=grading_output.reason)

+    def build_serialization_arguments(self):
+        result = super().build_serialization_arguments()
+        # always serialize the model as a string when present; use its name if it's a KnownModelName
+        if (model := result.get('model')) and isinstance(model, models.Model):
+            result['model'] = f'{model.system}:{model.model_name}'
+
+        # Note: this may lead to confusion if you try to serialize-then-deserialize with a custom model.
+        # I expect that is rare enough to be worth not solving yet, but common enough that we probably will want to
+        # solve it eventually. I'm imagining some kind of model registry, but don't want to work out the details yet.
+        return result
+

 @dataclass
 class HasMatchingSpan(Evaluator[object, object, object]):
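
A rough usage sketch of the behaviour introduced above (not part of the diff): with `model` now defaulting to `None`, an `LLMJudge` falls back to the module-level default judge model unless one is passed explicitly. The import paths follow the docstring reference above; the rubric strings and the alternative model name are illustrative assumptions.

from pydantic_evals.evaluators import LLMJudge
from pydantic_evals.evaluators.llm_as_a_judge import set_default_judge_model

# No model given: the judge defers to the default judge model,
# which starts as 'openai:gpt-4o'.
judge = LLMJudge(rubric='The output answers the question politely.')

# Change the default used whenever no explicit model was provided.
set_default_judge_model('openai:gpt-4o-mini')

# An explicit model takes precedence over the default.
pinned = LLMJudge(rubric='The output is valid JSON.', model='openai:gpt-4o')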

pydantic_evals/pydantic_evals/evaluators/evaluator.py

Lines changed: 23 additions & 9 deletions

@@ -223,6 +223,28 @@ def serialize(self, info: SerializationInfo) -> Any:
         Returns:
             A JSON-serializable representation of this evaluator as an EvaluatorSpec.
         """
+        raw_arguments = self.build_serialization_arguments()
+
+        arguments: None | tuple[Any,] | dict[str, Any]
+        if len(raw_arguments) == 0:
+            arguments = None
+        elif len(raw_arguments) == 1:
+            arguments = (next(iter(raw_arguments.values())),)
+        else:
+            arguments = raw_arguments
+        return to_jsonable_python(
+            EvaluatorSpec(name=self.name(), arguments=arguments), context=info.context, serialize_unknown=True
+        )
+
+    def build_serialization_arguments(self) -> dict[str, Any]:
+        """Build the arguments for serialization.
+
+        Evaluators are serialized for inclusion as the "source" in an `EvaluationResult`.
+        If you want to modify how the evaluator is serialized for that or other purposes, you can override this method.
+
+        Returns:
+            A dictionary of arguments to be used during serialization.
+        """
         raw_arguments: dict[str, Any] = {}
         for field in fields(self):
             value = getattr(self, field.name)
@@ -234,12 +256,4 @@ def serialize(self, info: SerializationInfo) -> Any:
             if value == field.default_factory():
                 continue
             raw_arguments[field.name] = value
-
-        arguments: None | tuple[Any,] | dict[str, Any]
-        if len(raw_arguments) == 0:
-            arguments = None
-        elif len(raw_arguments) == 1:
-            arguments = (next(iter(raw_arguments.values())),)
-        else:
-            arguments = raw_arguments
-        return to_jsonable_python(EvaluatorSpec(name=self.name(), arguments=arguments), context=info.context)
+        return raw_arguments
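
The new `build_serialization_arguments` hook is what `LLMJudge` overrides above, and any custom evaluator can do the same. A minimal sketch, assuming `Evaluator` and `EvaluatorContext` are importable from `pydantic_evals.evaluators`; the evaluator itself is hypothetical:

from dataclasses import dataclass
from typing import Any

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class ContainsSecret(Evaluator[object, object, object]):
    """Hypothetical evaluator, used only to illustrate the hook."""

    secret: str

    def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> bool:
        return self.secret in str(ctx.output)

    def build_serialization_arguments(self) -> dict[str, Any]:
        # Start from the default arguments (the non-default dataclass fields),
        # then redact the secret so it never appears in serialized results.
        result = super().build_serialization_arguments()
        if 'secret' in result:
            result['secret'] = '***'
        return result

Because `serialize` now delegates to this hook, the redacted arguments are what end up as the "source" of an `EvaluationResult`.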

pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py

Lines changed: 27 additions & 7 deletions

@@ -8,7 +8,10 @@

 from pydantic_ai import Agent, models

-__all__ = ('GradingOutput', 'judge_input_output', 'judge_output')
+__all__ = ('GradingOutput', 'judge_input_output', 'judge_output', 'set_default_judge_model')
+
+
+_default_model: models.Model | models.KnownModelName = 'openai:gpt-4o'


 class GradingOutput(BaseModel, populate_by_name=True):
@@ -41,11 +44,15 @@ class GradingOutput(BaseModel, populate_by_name=True):


 async def judge_output(
-    output: Any, rubric: str, model: models.Model | models.KnownModelName = 'openai:gpt-4o'
+    output: Any, rubric: str, model: models.Model | models.KnownModelName | None = None
 ) -> GradingOutput:
-    """Judge the output of a model based on a rubric."""
+    """Judge the output of a model based on a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
     user_prompt = f'<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
-    return (await _judge_output_agent.run(user_prompt, model=model)).data
+    return (await _judge_output_agent.run(user_prompt, model=model or _default_model)).data


 _judge_input_output_agent = Agent(
@@ -72,11 +79,24 @@ async def judge_output(


 async def judge_input_output(
-    inputs: Any, output: Any, rubric: str, model: models.Model | models.KnownModelName = 'openai:gpt-4o'
+    inputs: Any, output: Any, rubric: str, model: models.Model | models.KnownModelName | None = None
 ) -> GradingOutput:
-    """Judge the output of a model based on the inputs and a rubric."""
+    """Judge the output of a model based on the inputs and a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
     user_prompt = f'<Input>\n{_stringify(inputs)}\n</Input>\n<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
-    return (await _judge_input_output_agent.run(user_prompt, model=model)).data
+    return (await _judge_input_output_agent.run(user_prompt, model=model or _default_model)).data
+
+
+def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:  # pragma: no cover
+    """Set the default model used for judging.
+
+    This model is used if `None` is passed to the `model` argument of `judge_output` and `judge_input_output`.
+    """
+    global _default_model
+    _default_model = model


 def _stringify(value: Any) -> str:
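
A short usage sketch of the judge functions with the new default-model handling (running it requires credentials for the chosen models; the output strings, rubrics, and the alternative model name are illustrative assumptions):

import asyncio

from pydantic_evals.evaluators.llm_as_a_judge import judge_output, set_default_judge_model


async def main() -> None:
    # No model passed: falls back to the module-level default ('openai:gpt-4o' unless overridden).
    grading = await judge_output('The capital of France is Paris.', 'Answers the question correctly.')
    print(grading.pass_, grading.reason)

    # Later calls that omit the model use the new default instead.
    set_default_judge_model('openai:gpt-4o-mini')
    grading = await judge_output('2 + 2 = 5', 'The arithmetic is correct.')
    print(grading.pass_, grading.reason)


asyncio.run(main())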

tests/evals/test_evaluator_common.py

Lines changed: 2 additions & 2 deletions

@@ -222,10 +222,10 @@ async def test_llm_judge_evaluator(mocker: MockerFixture):
     assert result.value is True
     assert result.reason == 'Test passed'

-    mock_judge_output.assert_called_once_with('Hello world', 'Content contains a greeting', 'openai:gpt-4o')
+    mock_judge_output.assert_called_once_with('Hello world', 'Content contains a greeting', None)

     # Test with input
-    evaluator = LLMJudge(rubric='Output contains input', include_input=True)
+    evaluator = LLMJudge(rubric='Output contains input', include_input=True, model='openai:gpt-4o')
     result = await evaluator.evaluate(ctx)
     assert isinstance(result, EvaluationReason)
     assert result.value is True

tests/evals/test_evaluators.py

Lines changed: 33 additions & 0 deletions

@@ -7,6 +7,11 @@
 from inline_snapshot import snapshot
 from pydantic import BaseModel, TypeAdapter

+from pydantic_ai.messages import ModelMessage, ModelResponse
+from pydantic_ai.models import Model, ModelRequestParameters
+from pydantic_ai.settings import ModelSettings
+from pydantic_ai.usage import Usage
+
 from ..conftest import try_import

 with try_import() as imports_successful:
@@ -108,6 +113,34 @@ async def test_evaluator_spec_serialization():
     assert adapter.dump_python(spec_single_arg, context={'use_short_form': True}) == snapshot({'MyEvaluator': 'value1'})


+async def test_llm_judge_serialization():
+    # Ensure models are serialized based on their system + name when used with LLMJudge
+
+    class MyModel(Model):
+        async def request(
+            self,
+            messages: list[ModelMessage],
+            model_settings: ModelSettings | None,
+            model_request_parameters: ModelRequestParameters,
+        ) -> tuple[ModelResponse, Usage]:
+            raise NotImplementedError
+
+        @property
+        def model_name(self) -> str:
+            return 'my-model'
+
+        @property
+        def system(self) -> str:
+            return 'my-system'
+
+    adapter = TypeAdapter(Evaluator)
+
+    assert adapter.dump_python(LLMJudge(rubric='my rubric', model=MyModel())) == {
+        'name': 'LLMJudge',
+        'arguments': {'model': 'my-system:my-model', 'rubric': 'my rubric'},
+    }
+
+
 async def test_evaluator_call(test_context: EvaluatorContext[TaskInput, TaskOutput, TaskMetadata]):
     """Test calling an Evaluator."""
